In [1]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [3]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
# remove punctuations & lowercase sentences
def remove_pun(text):
    for pun in string.punctuation:
        text = text.replace(pun, "")
    text = text.lower()
    return(text)
        

In [5]:
df['Text'] = df['Text'].apply(remove_pun)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# x being the text, column 0, and y being the language, column 1
x = df.iloc[:,0]
y = df.iloc[:,1]

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = .2)

In [9]:
print(x_train)

6498                                          береги себя
4521    toch bedankt het maakt niet uit maar toch bedankt
2248    தீவிரமாக கவலைப்படாதீர்கள் அதை இதயத்திற்கு எடுத...
1459    വിക്കിപീഡിയയിലെ വിവരണം മാത്രമേ ഏതു പ്രശ്നത്തിന...
9696    das ist beleuchtet was bedeutet dass es erstau...
                              ...                        
8339    en wikipediaartikel som stämmer in på följande...
7334    nel suo pamphlet sulla specializzazione framme...
5506               ¿quieres hacer algo este fin de semana
1375    if thats really what you want then ill take yo...
6015                                          русск» англ
Name: Text, Length: 8269, dtype: object


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [11]:
vec = TfidfVectorizer(ngram_range=(1, 2), analyzer='char')


In [12]:
from sklearn import pipeline
from sklearn import linear_model

In [13]:
model_pipe = pipeline.Pipeline([('vec', vec), ('clf', linear_model.LogisticRegression())])

In [14]:
model_pipe.fit(x_train, y_train)

In [15]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [16]:
predict_val = model_pipe.predict(x_test)

In [17]:
from sklearn import metrics

In [18]:
metrics.accuracy_score(y_test,predict_val)

0.9777562862669246

In [19]:
metrics.confusion_matrix(y_test, predict_val)

array([[115,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  80,   1,   3,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0, 108,   1,   0,   0,   0,   0,   0,   0,   0,   2,   0,
          0,   0,   0,   0],
       [  0,   1,   1, 255,   0,   1,   0,   0,   2,   0,   0,   1,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   2, 208,   1,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   2],
       [  0,   1,   1,   0,   0,  95,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,  71,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  11,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   2,   1,   0,   0,   0, 135,   0,   0,   2,   0,
          2,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  74,   0,   0,   0,
         

In [20]:
model_pipe.predict(['My name is John'])

array(['English'], dtype=object)

In [24]:
model_pipe.predict(['Como voce esta meu irmao'])

array(['Portugeese'], dtype=object)

In [22]:
import pickle 

In [23]:
# open a new file in current directory
new_file = open('model.pkl', 'wb')
pickle.dump()

TypeError: dump() missing required argument 'obj' (pos 1)