In [21]:
import pandas as pd
import numpy as np
import string
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import pipeline

Data Preprocessing

In [23]:
language_data = pd.read_csv('Project14')
language_data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [24]:
def remove_pun(text): # Cleaning Data 
    for pun in string.punctuation:
        text = text.replace(pun, '')
    text = text.lower()
    return text

In [25]:
language_data['Text'] = language_data['Text'].apply(remove_pun)

In [26]:
language_data.describe()

Unnamed: 0,Text,Language
count,10337,10337
unique,10251,17
top,என்னை மன்னிக்கவும்,English
freq,4,1385


In [27]:
language_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [28]:
language_data.shape

(10337, 2)

In [29]:
language_data.isnull().sum()

Text        0
Language    0
dtype: int64

In [30]:
language_data['Language'].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [31]:
rows_to_drop = language_data[language_data['Language'].isin(['English', 'Malayalam', 'Dutch','Arabic','Turkish', 'German', 'Tamil', 'Danish', 'Kannada', 'Greek', 'Hindi'])].index
language_data = language_data.drop(rows_to_drop , axis= 0)

In [32]:
language_data['Language'].value_counts()

Language
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Name: count, dtype: int64

Split data into predictors and label

In [33]:
X = language_data.drop(columns= ['Language'], axis = 1)
y = language_data['Language']

Train and test data

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)


Feature extraction (Tokenization and Vectorization)


In [35]:
feature = TfidfVectorizer()

Modeling

In [36]:
model = MultinomialNB()

In [37]:
model_pipe = pipeline.Pipeline([('feature', feature), ('model', model)])


In [40]:
model_pipe.fit(X_train["Text"].values.tolist(), y_train.tolist())

In [46]:
model_pipe.predict(['меня зовут'])

array(['Russian'], dtype='<U10')

Model Accuracy

In [47]:
y_pred = model_pipe.predict(X_train['Text'].values.tolist())
accuracy_score(y_pred, y_train)

0.9981515711645101

In [49]:
new_file = open('model.project', 'wb')
pickle.dump(model_pipe, new_file)

In [51]:
new_file.close()