<a href="https://colab.research.google.com/github/TReV-89/TReV-89/blob/main/LanguuageID.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report,accuracy_score

In [2]:
# Read the SALT training split (JSON Lines: one record per line).
df = pd.read_json("/content/drive/MyDrive/salt-train-v1.2.jsonl", lines=True)
# Discard the speech column; only the `text` column is read below.
df.pop("tts-speech")

# Every `text` entry is a mapping (language code -> sentence). Flatten all of
# them into two parallel lists so each pair becomes one row of the frame.
data = {"language": [], "text": []}
for translations in df['text']:
    data["language"].extend(translations.keys())
    data["text"].extend(translations.values())

df1 = pd.DataFrame(data, columns=['language', 'text'])
df1.head()

Unnamed: 0,language,text
0,eng,It was not a ghost refugee camp.
1,lug,Enkambi y'abanoonyiboobubudamu teyaliiwo mu bu...
2,ach,Pe obedo kem goba goba
3,teo,Mam arai ekabi lo erai ekwam.
4,lgg,Eri aa'ni ndra kembe emunyale eyini aa'zu inzo...


In [3]:
# Inputs are the raw sentences; targets are the language codes.
X, Y = df1['text'], df1['language']

# Hold out 33% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)


In [4]:
# Character 1- to 3-gram counts feeding a multinomial Naive Bayes classifier.
count = CountVectorizer(ngram_range=(1, 3), analyzer='char')
pipeline = Pipeline(steps=[
    ('vectorizer', count),
    ('model', MultinomialNB()),
])

# Fit on the training split, then predict the held-out split.
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)

# Per-language precision/recall/F1 followed by the overall accuracy.
print(classification_report(Y_test, Y_pred))

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

         ach       1.00      1.00      1.00      7826
         eng       1.00      1.00      1.00      7783
         lgg       1.00      1.00      1.00      8089
         lug       0.99      0.99      0.99      7799
         nyn       1.00      0.99      0.99      7967
         teo       1.00      1.00      1.00      7952

    accuracy                           1.00     47416
   macro avg       1.00      1.00      1.00     47416
weighted avg       1.00      1.00      1.00     47416

Accuracy: 0.9971528597941623


In [7]:
# Same character 1- to 3-gram features, this time with a Bernoulli Naive Bayes
# classifier as the final step.
count = CountVectorizer(ngram_range=(1, 3), analyzer='char')
pipeline = Pipeline(steps=[
    ('vectorizer', count),
    ('model', BernoulliNB()),
])

# Fit on the training split, then predict the held-out split.
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)

# Per-language precision/recall/F1 followed by the overall accuracy.
print(classification_report(Y_test, Y_pred))

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

         ach       1.00      1.00      1.00      7826
         eng       1.00      1.00      1.00      7783
         lgg       1.00      1.00      1.00      8089
         lug       0.99      0.99      0.99      7799
         nyn       1.00      0.99      0.99      7967
         teo       1.00      1.00      1.00      7952

    accuracy                           1.00     47416
   macro avg       1.00      1.00      1.00     47416
weighted avg       1.00      1.00      1.00     47416

Accuracy: 0.9967099713176987


In [None]:
# Same character 1- to 3-gram features, this time with a Complement Naive
# Bayes classifier as the final step.
count = CountVectorizer(ngram_range=(1, 3), analyzer='char')
pipeline = Pipeline(steps=[
    ('vectorizer', count),
    ('model', ComplementNB()),
])

# Fit on the training split, then predict the held-out split.
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)

# Per-language precision/recall/F1 followed by the overall accuracy.
print(classification_report(Y_test, Y_pred))

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

         ach       1.00      0.99      0.99      7826
         eng       0.98      1.00      0.99      7783
         lgg       1.00      1.00      1.00      8089
         lug       0.99      0.99      0.99      7799
         nyn       1.00      0.99      0.99      7967
         teo       1.00      1.00      1.00      7952

    accuracy                           0.99     47416
   macro avg       0.99      0.99      0.99     47416
weighted avg       0.99      0.99      0.99     47416

Accuracy: 0.993377762780496


Multinomial Naive Bayes achieved the highest validation accuracy of the three Naive Bayes variants (99.72%, versus 99.67% for Bernoulli and 99.34% for Complement).

In [10]:
# Read the SALT test split (JSON Lines: one record per line).
df2 = pd.read_json("/content/drive/MyDrive/salt-test-v1.2.jsonl", lines = True)
# Discard the speech column; only the `text` column is read below.
df2.pop("tts-speech")

# Every `text` entry is a mapping (language code -> sentence). Flatten all of
# them into two parallel lists so each pair becomes one row of the frame.
data1 = {
    "language": [],
    "text": []
}
for row in df2['text']:
    data1["language"].extend(row.keys())
    data1["text"].extend(row.values())

# Build the frame from `data1`, the pairs collected in this cell. Passing the
# training dict `data` here would make `df3` a copy of the training pairs and
# the "test" evaluation that follows would silently score training sentences.
df3 = pd.DataFrame(data1,columns = ['language','text'])
df3.head()

Unnamed: 0,language,text
0,eng,The fashion industry is starting to thrive again.
1,lug,Ekisaawe ky'emisono kitandise okusituka nate.
2,ach,Yub me cital ruk mapatpat manyen tye ka dongo ...
3,teo,Ageutu ikampunin luka enape apolo bobo.
4,lgg,Okalamvu suta o'diru 'diyini 'diyi 'ye e'do tutu


In [11]:
# Score the multinomial model on the `df3` frame: refit on the training split,
# predict `df3['text']`, and compare against `df3['language']`.
X, Y = df3['text'], df3['language']

count = CountVectorizer(ngram_range=(1, 3), analyzer='char')
pipeline = Pipeline(steps=[
    ('vectorizer', count),
    ('model', MultinomialNB()),
])
Y_pred = pipeline.fit(X_train, Y_train).predict(X)

# Per-language precision/recall/F1 followed by the overall accuracy.
print(classification_report(Y, Y_pred))

accuracy = accuracy_score(Y, Y_pred)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

         ach       1.00      1.00      1.00       500
         eng       1.00      1.00      1.00       500
         lgg       1.00      1.00      1.00       500
         lug       0.99      0.99      0.99       500
         nyn       0.99      0.99      0.99       500
         teo       1.00      1.00      1.00       500

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Accuracy: 0.996


The Multinomial Naive Bayes model reaches 99.6% accuracy on the SALT test set.

In [13]:
# Spot-check the fitted pipeline on hand-written sentences; add more strings
# to the list to classify several sentences at once.
new_samples = ['I am going to the katale']

Y_pred = pipeline.predict(new_samples)

# Print one line per sentence, pairing each text with its own prediction.
# Interpolating the whole list/array into a single f-string produced nested
# quotes ('[...]') and did not show which prediction belonged to which text.
for sample, language in zip(new_samples, Y_pred):
    print(f"Text: '{sample}' => Predicted Language: {language}")


Text: '['I am going to the katale']' => Predicted Language: ['eng']
