In [1]:
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("Language Detection.csv")

In [3]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


Now we will remove punctuations from all the text:

In [4]:
def remove(text):
    for pun in string.punctuation:
        text = text.replace(pun,"")

    text = text.lower()
    return text

Now we will check if the function is working or not!

In [5]:
remove("'Nature' c!an r#efer to the phenomena of the phy")

'nature can refer to the phenomena of the phy'

In [6]:
df['Text'] = df['Text'].apply(remove)

In [7]:
df['Text']

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [8]:
y = df['Language']
X = df['Text']

Now we will train our dataset:

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

Now we will use TfidfVectorizer to determine how relevant those words are to a given dataset:

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vec = TfidfVectorizer(ngram_range=(1,2),analyzer='char')

In [17]:
from sklearn import pipeline
from sklearn.linear_model import LogisticRegression

In [18]:
pipe =pipeline.Pipeline([('vec',vec),('clf',LogisticRegression())])

In [19]:
pipe.fit(X_train,y_train)

Now to see what all classes are peresent :

In [21]:
pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

**Model Prediction:**

In [22]:
pipe_predict = pipe.predict(X_test)

In [24]:
from sklearn.metrics import classification_report,accuracy_score

In [27]:
print(accuracy_score(y_test,pipe_predict)*100)

97.82398452611218


In [26]:
print(classification_report(y_test,pipe_predict))

              precision    recall  f1-score   support

      Arabic       0.99      1.00      1.00       106
      Danish       0.98      0.96      0.97        83
       Dutch       0.96      0.95      0.96       107
     English       0.98      0.98      0.98       279
      French       0.97      0.98      0.98       190
      German       0.96      0.97      0.96        98
       Greek       1.00      1.00      1.00        83
       Hindi       1.00      1.00      1.00        15
     Italian       0.93      0.96      0.95       147
     Kannada       1.00      1.00      1.00        71
   Malayalam       1.00      1.00      1.00        98
  Portugeese       0.98      0.93      0.95       147
     Russian       1.00      1.00      1.00       145
     Spanish       0.98      0.96      0.97       162
    Sweedish       0.97      0.98      0.98       146
       Tamil       1.00      1.00      1.00        97
     Turkish       0.98      0.99      0.98        94

    accuracy              

**Testing the model:**

In [35]:
pipe.predict(["he is a boy"])

array(['English'], dtype=object)

In [36]:
pipe.predict(['की इस सेवा के लिए'])

array(['Hindi'], dtype=object)

In [37]:
pipe.predict(['bonjour à tous'])

array(['French'], dtype=object)

So yes we can see the model is working perfectly!

**now for making a website!!**

In [38]:
import pickle

In [39]:
new_file = open('model.pkl','wb')
pickle.dump(pipe, new_file)
new_file.close()