# **Language detection:**

## **Dependencies:**

In [3]:
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 

from sklearn import metrics
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

## **Get, explore and remove missing text data:** 

In [4]:
# Get text:
df = pd.read_csv("lang_data.csv")
print(df.shape)
df = df.dropna()
print(df.shape,'\n') # A lot of missing values
df.head(10)

(2839, 2)
(2761, 2) 



Unnamed: 0,text,language
0,Ship shape and Bristol fashion,English
1,Know the ropes,English
2,Graveyard shift,English
3,Milk of human kindness,English
4,Touch with a barge-pole - Wouldn't,English
5,Sy kan altyd my battery natpiepie.,Afrikaans
6,When the shit hits the fan,English
8,Egg on,English
9,Drag race,English
10,As queer as a nine bob note,English


In [6]:
# Show language:
Label_list = list(df)
Language_list = df['language'].unique()
# print(Label_list,'\n')
print(Language_list,'\n')
data_g = df.groupby('language')
# data.first()
print("Number of instances in dataset: ",len(df))

# Get english text:
print("Number of English instances: ",len(data_g.get_group(Language_list[0])))
data_english = data_g.get_group(Language_list[0]) # A look at english
data_english.reset_index(inplace=True)
data_english = data_english.drop("index", axis=1)
# data_english.head(5)

# Get afrikaans text:
print("Number of Afrikaans instances: ",len(data_g.get_group(Language_list[1])))
data_afrikaans = data_g.get_group(Language_list[1]) # A look at english
data_afrikaans.reset_index(inplace=True)
data_afrikaans = data_afrikaans.drop("index", axis=1)
# data_afrikaans.head(5)

# Get netherland text:
print("Number of Afrikaans instances: ",len(data_g.get_group(Language_list[2])),'\n')
data_netherland = data_g.get_group(Language_list[2]) # A look at english
data_netherland.reset_index(inplace=True)
data_netherland = data_netherland.drop("index", axis=1)
# data_netherlands.head(5)

data = df
data.head()

['English' 'Afrikaans' 'Nederlands'] 

Number of instances in dataset:  2761
Number of English instances:  2055
Number of Afrikaans instances:  639
Number of Afrikaans instances:  67 



Unnamed: 0,text,language
0,Ship shape and Bristol fashion,English
1,Know the ropes,English
2,Graveyard shift,English
3,Milk of human kindness,English
4,Touch with a barge-pole - Wouldn't,English


#### **Own function (instead of using "encoder.fit_transform") for vectorise (mode=0) and de-vectorize (mode=1) labels:**

In [7]:
def Out_Vec_Devec(y1,mode):
    if mode == 0:
        y = []
        for i in y1:
            if i == 'English':
                y.append(0)
            if i == 'Afrikaans':
                y.append(1)
            if i == 'Nederlands':
                y.append(2)
    elif mode == 1:
        y = []
        for i in y1:
            if i == 0:
                y.append('English')
            if i == 1:
                y.append('Afrikaans')
            if i == 2:
                y.append('Nederlands')
    return y

## **For imbalanced data:**

#### **Vectorize text, train and test:**

In [8]:
df_temp = data.copy(deep = True)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_vectorizer.fit(df_temp['text'])
list_corpus = df_temp["text"].tolist()
list_labels = df_temp["language"].tolist()

X = tfidf_vectorizer.transform(list_corpus)
y = Out_Vec_Devec(list_labels,0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [9]:
# Naive Bayes on Character Level TF IDF Vectors
classifier = naive_bayes.MultinomialNB()
classifier.fit(X_train,y_train)
# predict the labels on validation dataset
y_pred = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_pred,y_test)
print(accuracy)

0.9439421338155516


In [10]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, make_scorer

final_metrics = pd.DataFrame({'Labels': ['English','Afrikaans','Nederland'], 
                              'Recall': recall_score(y_pred,y_test, average = None),
                              'Precision': precision_score(y_pred,y_test, average = None),
                              'F1-score': f1_score(y_pred,y_test, average = None)})
final_metrics

Unnamed: 0,Labels,Recall,Precision,F1-score
0,English,0.935335,1.0,0.966587
1,Afrikaans,0.975,0.886364,0.928571
2,Nederland,0.0,0.0,0.0


#### **Performance for each language:**

In [11]:
y_test_eng = []
y_test_afr = []
y_test_ned = []
y_pred_eng = []
y_pred_afr = []
y_pred_ned = []
for i in range(len(y_test)):
    if y_test[i] == 0:
        y_test_eng.append(y_test[i])
        y_pred_eng.append(y_pred[i])
    if y_test[i] == 1:
        y_test_afr.append(y_test[i])
        y_pred_afr.append(y_pred[i])
    if y_test[i] == 2:
        y_test_ned.append(y_test[i])
        y_pred_ned.append(y_pred[i])

accuracy_eng = metrics.accuracy_score(y_pred_eng,y_test_eng)
print("Accuracy for Eng: ",accuracy_eng)

accuracy_afr = metrics.accuracy_score(y_pred_afr,y_test_afr)
print("Accuracy for Afr: ",accuracy_afr)

accuracy_ned = metrics.accuracy_score(y_pred_ned,y_test_ned)
print("Accuracy for Ned: ",accuracy_ned)

Accuracy for Eng:  1.0
Accuracy for Afr:  0.8863636363636364
Accuracy for Ned:  0.0


## **For balanced data:**

#### **Vectorize text, train and test:**

In [12]:
df_temp = data.copy(deep = True)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_vectorizer.fit(df_temp['text'])
list_corpus = df_temp["text"].tolist()
list_labels = df_temp["language"].tolist()
X_Vec = tfidf_vectorizer.transform(list_corpus)

# Oversampling:
smote_over_sample = SMOTE(sampling_strategy='minority')
X1, y1 = smote_over_sample.fit_resample(X_Vec, list_labels)
X, y2 = smote_over_sample.fit_resample(X1,y1)

y = Out_Vec_Devec(y2,0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [13]:
# Naive Bayes on Character Level TF IDF Vectors
classifier = naive_bayes.MultinomialNB()
classifier.fit(X_train,y_train)
# predict the labels on validation dataset
y_pred = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_pred,y_test)
print(accuracy)

0.9975669099756691


In [14]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, make_scorer

final_metrics = pd.DataFrame({'Labels': ['English','Afrikaans','Nederland'], 
                              'Recall': recall_score(y_pred,y_test, average = None),
                              'Precision': precision_score(y_pred,y_test, average = None),
                              'F1-score': f1_score(y_pred,y_test, average = None)})
final_metrics

Unnamed: 0,Labels,Recall,Precision,F1-score
0,English,1.0,0.992771,0.996372
1,Afrikaans,0.997436,1.0,0.998716
2,Nederland,0.99536,1.0,0.997674


#### **Investigate performance for each language:**

In [18]:
y_test_eng = []
y_test_afr = []
y_test_ned = []
y_pred_eng = []
y_pred_afr = []
y_pred_ned = []
for i in range(len(y_test)):
    if y_test[i] == 0:
        y_test_eng.append(y_test[i])
        y_pred_eng.append(y_pred[i])
    if y_test[i] == 1:
        y_test_afr.append(y_test[i])
        y_pred_afr.append(y_pred[i])
    if y_test[i] == 2:
        y_test_ned.append(y_test[i])
        y_pred_ned.append(y_pred[i])

accuracy_eng = metrics.accuracy_score(y_pred_eng,y_test_eng)
print("Accuracy for Eng: ",accuracy_eng)

accuracy_afr = metrics.accuracy_score(y_pred_afr,y_test_afr)
print("Accuracy for Afr: ",accuracy_afr)

accuracy_ned = metrics.accuracy_score(y_pred_ned,y_test_ned)
print("Accuracy for Ned: ",accuracy_ned)

Accuracy for Eng:  0.9927710843373494
Accuracy for Afr:  1.0
Accuracy for Ned:  1.0


## **Example of a text classified as a language, and save model and vectorizer:**

In [20]:
# Examples:
text = ["Ship shape and Bristol fashion"]            # English text
X = tfidf_vectorizer.transform(text)
pred = classifier.predict(X)
print(Out_Vec_Devec(pred,1),'\n')

text = ["Die man met min woorde, se rekenaar is so kompak, dat daar nie plek is vir kaf nie."]
X = tfidf_vectorizer.transform(text)
pred = classifier.predict(X)
print(Out_Vec_Devec(pred,1),'\n')

# Example:
text = ["Je moet geen oude schoenen weggooien, voordat je nieuwe hebt."]
X = tfidf_vectorizer.transform(text)
pred = classifier.predict(X)
print(Out_Vec_Devec(pred,1))

# save the model to disk
filename = 'Language_classifier.sav'
pickle.dump(classifier,open(filename, 'wb'))
filename = 'Vectorizer.sav'
pickle.dump(tfidf_vectorizer,open(filename, 'wb'))

['English'] 

['Afrikaans'] 

['Nederlands']
