In [1]:
import re
import codecs
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [8]:
# Load the labelled corpus; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='Language Detection.csv')

In [9]:
# Peek at the first rows to confirm the Text / Language column layout.
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [10]:
# Number of distinct language labels, i.e. the classes the classifier has to separate.
df['Language'].nunique()

17

In [11]:
# Samples per language. The classes are imbalanced (Hindi has far fewer rows than any other label),
# and the spellings printed here (e.g. 'Portugeese', 'Sweedish') are the dataset's own labels,
# so they are also the strings the model will emit as predictions.
df['Language'].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [12]:
# (rows, columns) of the loaded frame.
df.shape

(10337, 2)

In [13]:
# Number of text samples; should agree with the row count reported by df.shape.
len(df['Text'])

10337

In [22]:
# Text pre-processing
# Stop words are not removed here because this is a language-detection model.

In [93]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [100]:
def clean_text(cln_text):
    """Normalise one sentence before it is vectorised.

    The text is lower-cased, every ASCII punctuation character and every
    character for which ``str.isdigit()`` is true (which includes non-ASCII
    digits) is dropped, the remainder is tokenised with NLTK's
    ``word_tokenize``, each token is stemmed with ``PorterStemmer`` and the
    stems are joined with single spaces. No stop-word removal is performed.

    Parameters
    ----------
    cln_text : str
        Raw sentence. A non-string value (e.g. NaN from a missing cell)
        raises ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        Space-separated stemmed tokens.

    NOTE(review): the Porter algorithm is designed for English; its effect on
    tokens from the other languages in the corpus should be confirmed.
    """
    lowered = cln_text.lower()
    # string.punctuation is exactly the 32 ASCII characters that used to be
    # spelled out by hand; punctuation and digits are now removed in one pass.
    kept = ''.join(
        ch for ch in lowered
        if ch not in string.punctuation and not ch.isdigit()
    )
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(kept))

In [101]:
# Store a normalised copy of every sentence next to the raw text.
df['cln_text'] = df['Text'].map(clean_text)

In [102]:
# Compare raw and cleaned text side by side for the first (English) rows.
df.head()

Unnamed: 0,Text,Language,cln_text
0,"Nature, in the broadest sense, is the natural...",English,natur in the broadest sens is the natur physic...
1,"""Nature"" can refer to the phenomena of the phy...",English,natur can refer to the phenomena of the physic...
2,"The study of nature is a large, if not the onl...",English,the studi of natur is a larg if not the onli p...
3,"Although humans are part of nature, human acti...",English,although human are part of natur human activ i...
4,[1] The word nature is borrowed from the Old F...,English,the word natur is borrow from the old french n...


In [103]:
# Same comparison for the last rows, which hold non-Latin (Kannada) text.
df.tail()

Unnamed: 0,Text,Language,cln_text
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada,ಹೇಗೆ ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ್...
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336,ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...,Kannada,ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...


In [21]:
# split the data set

In [104]:
# Features are the cleaned sentences; the target is the language label.
X, y = df['cln_text'], df['Language']

In [105]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [106]:
# Sizes of the training and held-out partitions.
X_train.shape,X_test.shape

((8269,), (2068,))

# Vectorize the text

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
# TF-IDF weights over character 1- to 3-grams rather than word tokens.
tf_vctr = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

In [109]:
from sklearn.linear_model import LogisticRegression

In [110]:
# Logistic-regression classifier left at scikit-learn's default hyperparameters.
lr=LogisticRegression()

# Make the pipeline and train the model

In [111]:
# Chain the vectoriser and the classifier so they are fitted and applied as one estimator.
nlp_pipe = pipeline.Pipeline(steps=[('tf_vctr', tf_vctr), ('clf', lr)])

In [112]:
# Fit both pipeline steps on the training split only.
nlp_pipe.fit(X_train,y_train)

In [113]:
# Predicted language label for every held-out sample.
y_pred = nlp_pipe.predict(X_test)

In [114]:
# Accuracy on the data the pipeline was fitted on (an optimistic figure).
nlp_pipe.score(X_train,y_train)

0.994074253234974

In [115]:
# Mean accuracy on the held-out split. Pipeline.score expects (features, true labels);
# passing (y_test, y_pred) would vectorise the label strings as if they were text
# and compare the result against the predictions, which is meaningless.
nlp_pipe.score(X_test,y_test)

0.9796905222437138

In [116]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix

In [117]:
# Share of held-out samples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9796905222437138

In [118]:
# precision_score(y_test, y_pred, average='macro')  # y_true goes first; multiclass targets need an explicit `average`

In [119]:
# The pipeline was fitted on clean_text output, so the query is normalised the same way before predicting.
nlp_pipe.predict([clean_text('Nature in the broadest sense is the natural ph')])

array(['English'], dtype=object)

In [121]:
# The pipeline was fitted on clean_text output, so the query is normalised the same way before predicting.
nlp_pipe.predict([clean_text('ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ')])

array(['Kannada'], dtype=object)

In [123]:
# The pipeline was fitted on clean_text output, so the query is normalised the same way before predicting.
nlp_pipe.predict([clean_text('Спасибо')])

array(['Russian'], dtype=object)