## <center> Import Libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## <center> Import Data

In [16]:
# Read the labelled corpus from disk and preview the first rows
# (a 'Text' column plus its 'language' label).
csv_path = 'data/languages.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


## <center> Data Info

In [17]:
# Print the frame's structure: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      22000 non-null  object
 1   language  22000 non-null  object
dtypes: object(2)
memory usage: 343.9+ KB


In [18]:
# Column-wise summary of the frame; for these text columns that is
# count, number of unique values, most frequent entry and its frequency.
summary_stats = data.describe()
summary_stats

Unnamed: 0,Text,language
count,22000,22000
unique,21859,22
top,haec commentatio automatice praeparata res ast...,Estonian
freq,48,1000


## <center> Null Values / Duplicates

In [19]:
# Tally missing entries in each column before any cleaning is done.
missing_per_column = data.isnull().sum()
missing_per_column

Text        0
language    0
dtype: int64

In [20]:
# Count the rows flagged as duplicates of another row in the frame.
n_duplicates = data.duplicated().sum()
print(f"Duplicated data: {n_duplicates}")

Duplicated data: 141


In [21]:
# Remove duplicate rows by rebinding `data` to the de-duplicated frame
# instead of mutating it with inplace=True: the resulting rows are the same,
# but the object that earlier cells displayed is no longer altered in place.
data = data.drop_duplicates()
# Sanity check: this should report 0 once the duplicates are gone.
print(f"Duplicated data after cleaning: {data.duplicated().sum()}")

Duplicated data after cleaning: 0


## <center> Language counts

In [22]:
# Rows per language after de-duplication, most frequent first.
language_counts = data["language"].value_counts()
language_counts

Chinese       1000
Thai          1000
English       1000
Japanese      1000
Turkish       1000
Romanian      1000
Urdu          1000
Persian       1000
Korean        1000
Estonian       999
Russian        999
Arabic         998
Portugese      997
Spanish        996
Dutch          996
Pushto         993
Swedish        992
Hindi          990
French         990
Tamil          981
Indonesian     975
Latin          953
Name: language, dtype: int64

## <center> Dividing into X and y

In [23]:
# Features are the raw text, targets are the language labels.
X, y = data['Text'], data['language']

# Both should report the same number of rows.
print(f'X shape: {X.shape}', f'y shape: {y.shape}', sep='\n')

X shape: (21859,)
y shape: (21859,)


## <center> Set up a vectorizer

In [24]:
# Build bag-of-words count features for every document.
# The raw text is read from data['Text'] rather than from X itself so that
# re-running this cell still works: X is rebound to the vectorized matrix
# here, and feeding that matrix back into fit_transform would fail.
# NOTE(review): the vocabulary is learned from all rows, including those that
# later end up in the test split. Consider splitting the raw text first and
# fitting the vectorizer on the training portion only.
cv = CountVectorizer()
X = cv.fit_transform(data['Text'])

## <center> Dividing into train and test datasets

In [25]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# partition repeatable between runs.
split_options = {"test_size": 0.1, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (19673, 277720)
X_test shape: (2186, 277720)
y_train shape: (19673,)
y_test shape: (2186,)


## <center> Modeling

In [26]:
# Fit a multinomial Naive Bayes classifier on the training counts, then
# report its score on the held-out rows.
model = MultinomialNB().fit(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Model score: {test_score}')

Model score: 0.9547118023787741


## <center> Model Test

In [28]:
# Three hand-written greetings used as a quick sanity check of the model.
text_en = "Hello, my name is Andrii"
text_ro = "Bună, numele meu este Andrei"
text_ru = "Привет! Меня зовут Андрей"

# Encode each sample with the already-fitted vectorizer (transform only, no
# refit) and convert it to a dense array.
text_en_vectorized, text_ro_vectorized, text_ru_vectorized = (
    cv.transform([sample]).toarray() for sample in (text_en, text_ro, text_ru)
)

# Print the predicted label next to each input sentence.
for sample, features in (
    (text_en, text_en_vectorized),
    (text_ro, text_ro_vectorized),
    (text_ru, text_ru_vectorized),
):
    print(f"Text: {sample}. Language: {model.predict(features)}")

Text: Hello, my name is Andrii. Language: ['English']
Text: Bună, numele meu este Andrei. Language: ['Romanian']
Text: Привет! Меня зовут Андрей. Language: ['Russian']
