In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
data = pd.read_csv('Language Detection.csv')
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [3]:
data['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [4]:
data['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [5]:
X = data['Text']
y = data['Language']

In [6]:
X

0         Nature, in the broadest sense, is the natural...
1        "Nature" can refer to the phenomena of the phy...
2        The study of nature is a large, if not the onl...
3        Although humans are part of nature, human acti...
4        [1] The word nature is borrowed from the Old F...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [7]:
y

0        English
1        English
2        English
3        English
4        English
          ...   
10332    Kannada
10333    Kannada
10334    Kannada
10335    Kannada
10336    Kannada
Name: Language, Length: 10337, dtype: object

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [9]:
y

array([3, 3, 3, ..., 9, 9, 9])

In [10]:
data_list = []
for text in X:
  text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]','',text)
  text = re.sub(r'[[]]', ' ', text)
  text = text.lower()
  data_list.append(text)

  text = re.sub(r'[[]]', ' ', text)


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(data_list).toarray()
print(X.shape)
len(data_list)

(10337, 38665)


10337

In [12]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = .2 , random_state = 41)

# Methods for ML

In [13]:
from sklearn.preprocessing import normalize
def normalizeData(train, test):
    train_result = normalize(train, norm='l2', axis=1, copy=True, return_norm=False)
    test_result = normalize(test, norm='l2', axis=1, copy=True, return_norm=False)
    return train_result, test_result

In [14]:
def toNumpyArray(data):
    data_type = type(data)
    if data_type == np.ndarray:
        return data
    elif data_type == list:
        return np.array(data_type)
    elif data_type == scipy.sparse.csr.csr_matrix:
        return data.toarray()
    print(data_type)
    return None

In [15]:
from sklearn.metrics import accuracy_score
def plot_Accuracy(y_test, y_predict,name=None):
    ACC = accuracy_score(y_test, y_predict)
    if name is None:
        print(" ACC:"+str(ACC))
    else:
        print(name+" ACC:"+str(ACC))

In [16]:
def prediction(text):
  x = cv.transform([text]).toarray()
  lang = model.predict(x)
  lang = le.inverse_transform(lang)
  print('The Language is in', lang[0])

# MultinomialNB

In [17]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [18]:
y_pred = model.predict(X_test)
y_pred

array([14, 11, 16, ...,  3,  8,  3])

In [19]:
from sklearn.metrics import accuracy_score , confusion_matrix
ac = accuracy_score(y_test , y_pred)

In [20]:
print(f'Accuracy = {ac:.2f}')

Accuracy = 0.98


# KNN 

In [21]:
from sklearn.neighbors import KNeighborsClassifier
def applyNearestNeighbour(X_train, y_train, X_test):
    trainArray = toNumpyArray(X_train)
    testArray = toNumpyArray(X_test)
    
    clf = KNeighborsClassifier()
    clf.fit(trainArray, y_train)
    y_predict = clf.predict(testArray)
    return y_predict

In [22]:
y_predict_knn = applyNearestNeighbour(X_train, y_train, X_test)
y_predict_knn

array([ 3, 11,  9, ...,  9,  9,  3])

In [23]:
plot_Accuracy(y_test, y_predict_knn,name = 'KNN')

KNN ACC:0.5149903288201161


# NaiveBayes

In [24]:
from sklearn.naive_bayes import MultinomialNB
def applyNaiveBayes(X_train, y_train, X_test):
    trainArray = toNumpyArray(X_train)
    testArray = toNumpyArray(X_test)
    
    clf = MultinomialNB()
    clf.fit(trainArray, y_train)
    y_predict = clf.predict(testArray)
    return y_predict

In [25]:
y_predict_NB = applyNaiveBayes(X_train, y_train, X_test)

In [26]:
y_predict_NB

array([14, 11, 16, ...,  3,  8,  3])

In [27]:
plot_Accuracy(y_test, y_predict_knn,name = 'NaiveBayes')

NaiveBayes ACC:0.5149903288201161


# RandomForest 

In [28]:
from sklearn.ensemble import RandomForestClassifier 
def applyRandomForest(X_train,y_train,X_test):
    trainArray = toNumpyArray(X_train)
    testArray = toNumpyArray(X_test)
    clf=RandomForestClassifier(n_estimators=100)
    clf.fit(X_train,y_train)

    y_pred=clf.predict(X_test)

    return y_pred

In [29]:
y_predict_RF = applyRandomForest(X_train, y_train, X_test)
y_predict_RF

array([14, 11,  9, ...,  9, 13,  3])

In [30]:
plot_Accuracy(y_test, y_predict_knn,name = 'RandomForest')

RandomForest ACC:0.5149903288201161


# Prediction

In [31]:
prediction("это портал знаний на базе сообщества для профессионалов в области аналитики и данных.")

The Language is in Russian


In [32]:
prediction("Model accuracy is 98%.")

The Language is in English


In [34]:
prediction("मेरा नाम ऐश्वर्या हे.")

The Language is in Hindi


In [35]:
prediction("എന്റെ പേര് ഐശ്വര്യ.")

The Language is in Malayalam
