In [70]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [71]:
df = pd.read_csv('Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [72]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [73]:
# remove punctuations & lowercase sentences
def remove_pun(text):
    for pun in string.punctuation:
        text = text.replace(pun, "")
    text = text.lower()
    return(text)
        

In [74]:
df['Text'] = df['Text'].apply(remove_pun)

In [75]:
from sklearn.model_selection import train_test_split

In [76]:
# x being the text, column 0, and y being the language, column 1
x = df.iloc[:,0]
y = df.iloc[:,1]

In [77]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = .2)

In [78]:
print(x_train)

1663    ഇന്ന് എനിക്ക് എനർജി ഡ്രിങ്ക് ഉപയോഗിച്ച് ചെയ്യാ...
1006    98 because of such challenges the effective us...
6930    kan du dække mig det er tid til at rejse og fy...
1813                       നിന്നെക്കുറിച്ച് എന്തുപറയുന്നു
749     everything2363 with many later being merged in...
                              ...                        
9902    die drei waren seit ewigkeiten freunde aber te...
5794    θα είμαι μαζί σου σε ένα λεπτό θα είμαι μαζί σ...
3669    par conséquent wikipedia adopte un code de con...
435     it is not rare for articles strongly related t...
7780    basti pensare a quella povera madre narciso er...
Name: Text, Length: 8269, dtype: object


In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [81]:
vec = TfidfVectorizer(ngram_range=(1, 2), analyzer='char')


In [84]:
from sklearn import pipeline
from sklearn import linear_model

In [86]:
model_pipe = pipeline.Pipeline([('vec', vec), ('clf', linear_model.LogisticRegression())])

In [87]:
model_pipe.fit(x_train, y_train)

In [88]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [90]:
predict_val = model_pipe.predict(x_test)

In [91]:
from sklearn import metrics

In [92]:
metrics.accuracy_score(y_test,predict_val)

0.9724371373307543

In [93]:
metrics.confusion_matrix(y_test, predict_val)

array([[ 99,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  69,   2,   2,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   7,   0,   0],
       [  0,   0,  92,   1,   2,   3,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   2,   0, 268,   2,   0,   0,   0,   1,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   2, 204,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   1],
       [  0,   1,   3,   1,   0,  92,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   1],
       [  0,   0,   0,   0,   0,   0,  69,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  14,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   1,   0,   0,   0, 117,   0,   0,   1,   0,
          1,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  80,   0,   0,   0,
         

In [95]:
model_pipe.predict(['My name is John'])

array(['English'], dtype=object)

In [97]:
model_pipe.predict(['Como voce esta meu irmao'])

array(['Portugeese'], dtype=object)

In [98]:
import pickle 

In [None]:
# open a new file in current directory
new_file = open('model.pkl', 'wb')
pickle.dump()