Importing Libraries

In [10]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

Cleaning the Data Set imported

In [11]:
df=pd.read_csv("Language Detection.csv")
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [12]:
def remove_pun(text):
    for pun in string.punctuation:
        text=text.replace(pun,"")
    text=text.lower()
    return text    

In [13]:
df['Text']=df['Text'].apply(remove_pun)

In [14]:
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


Applying train test split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X=df.iloc[:,0]
Y=df.iloc[:,1]

In [17]:
X

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [18]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

In [19]:
X_train

1528    ആർക്കും വിക്കിപീഡിയയിൽ എഴുതാവുന്നതുകൊണ്ട് നല്ല...
7374    questa definizione di mitchell è rilevante poi...
3271    cest ainsi que fontenelle après descartes dira...
9547    aber nein in einem fix bedeutet in einer schwi...
4182                      je serais heureux de vous aider
                              ...                        
1220                                     nice to meet you
3378    la nature au sens le plus strict est refoulée ...
8606    en är översvämmad översvämmad betyder drunknin...
4643        zou je toevallig weten waar de bibliotheek is
8432    på svenskspråkiga wikipedia kallas dessa möten...
Name: Text, Length: 8269, dtype: object

Vectorization

In [20]:
from sklearn import feature_extraction

In [21]:
vec=feature_extraction.text.TfidfVectorizer(ngram_range=(1,2),analyzer='char')

In [23]:
from sklearn import pipeline
from sklearn import linear_model

In [24]:
model_pipe=pipeline.Pipeline([('vec',vec),('clf',linear_model.LogisticRegression())])

In [25]:
model_pipe.fit(X_train,Y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

Testing

In [26]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [28]:
model_pipe.predict(X_test)

array(['Portugeese', 'English', 'Italian', ..., 'Malayalam', 'Portugeese',
       'German'], dtype=object)

In [29]:
predict_val=model_pipe.predict(X_test)

In [31]:
from sklearn import metrics

In [32]:
metrics.accuracy_score(Y_test,predict_val)

0.9782398452611218

In [33]:
metrics.confusion_matrix(Y_test,predict_val)

array([[ 97,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  94,   0,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   4,   0,   0],
       [  0,   1,  97,   3,   0,   1,   0,   0,   0,   0,   0,   1,   0,
          1,   1,   0,   0],
       [  0,   0,   0, 247,   3,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   0,   0,   1, 194,   0,   0,   0,   1,   0,   0,   1,   0,
          2,   0,   0,   1],
       [  0,   0,   1,   0,   0,  87,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,  80,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  14,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   2,   0,   0,   2,   0,   0,   0, 142,   0,   0,   0,   0,
          5,   0,   0,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  66,   0,   0,   0,
         

In [36]:
model_pipe.predict(["Soubhagya is a good boy"])

array(['English'], dtype=object)

Pickling the model

In [37]:
import pickle

In [38]:
new_file=open("lang_model.pckl",'wb')
pickle.dump(model_pipe,new_file)
new_file.close()