In [1]:
%matplotlib inline
import pandas as pd

import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, numpy, string, xgboost
import tensorflow as tf

import langid
import statsmodels.api as sm

from warnings import filterwarnings
filterwarnings('ignore')


In [2]:
# load the provided data
train_set = pd.read_csv('data/train_data_1.csv')


In [3]:
train_set.describe()


Unnamed: 0,Sinhala,Tamil,Class
count,20318,20318,20318
unique,20292,20310,3
top,රත්නායක මුදියන්සේලාගේ කරුණාවතී,ஹேரத் முதியன்சேலாகே பத்மசிறி பண்டார ஹேரத்,Sinhala
freq,4,2,16635


In [4]:
targets = train_set['Class']

train_set.drop('Class', axis=1, inplace=True)

In [5]:
#sinhala
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_set['Sinhala'], targets)

#tamil
train_x_t, valid_x_t, train_y_t, valid_y_t = model_selection.train_test_split(train_set['Tamil'], targets)

In [6]:
#sinhala
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

#tamil
encoder = preprocessing.LabelEncoder()
train_y_t = encoder.fit_transform(train_y_t)
valid_y_t = encoder.fit_transform(valid_y_t)

In [7]:
#Count Vectors as features

# create a count vectorizer object for sinhala
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_set)

# transform the training and validation data using count vectorizer object for sinhala
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)



# create a count vectorizer object for tamil
count_vect_t = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect_t.fit(train_set)

# transform the training and validation data using count vectorizer object for tamil
xtrain_count_t =  count_vect_t.transform(train_x_t)
xvalid_count_t =  count_vect_t.transform(valid_x_t)



In [9]:
#TF-IDF Vectors as features

# word level tf-idf for sinhala
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_set['Sinhala'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf for sinhala
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_set['Sinhala'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf for sinhala
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_set['Sinhala'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 




# word level tf-idf for tamil
tfidf_vect_t = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect_t.fit(train_set['Tamil'])
xtrain_tfidf_t =  tfidf_vect_t.transform(train_x_t)
xvalid_tfidf_t =  tfidf_vect_t.transform(valid_x_t)

# ngram level tf-idf for tamil
tfidf_vect_ngram_t = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_t.fit(train_set['Tamil'])
xtrain_tfidf_ngram_t =  tfidf_vect_ngram_t.transform(train_x_t)
xvalid_tfidf_ngram_t =  tfidf_vect_ngram_t.transform(valid_x_t)

# characters level tf-idf for tamil
tfidf_vect_ngram_chars_t = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars_t.fit(train_set['Tamil'])
xtrain_tfidf_ngram_chars_t =  tfidf_vect_ngram_chars_t.transform(train_x_t) 
xvalid_tfidf_ngram_chars_t =  tfidf_vect_ngram_chars_t.transform(valid_x_t) 

In [10]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, test_valid_y, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    print(test_valid_y)
    
    accuracy = metrics.accuracy_score(predictions, test_valid_y)
    precision = metrics.precision_score(test_valid_y, predictions, average='weighted')
    recall = metrics.recall_score(test_valid_y, predictions, average='weighted')
    f1_score = metrics.f1_score(test_valid_y, predictions, average='weighted')
    
    
    return accuracy, precision, recall, f1_score, classifier


In [None]:
#For Sinhala Scripts

# Linear Classifier on Count Vectors
#accuracy, precision, recall, f1_score, predictions_cont = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count, test_count)
#print ("LC, Count Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Linear Classifier on Word Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_tf = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf, test_tfidf)
#print ("LC, WordLevel TF-IDF: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_ngram = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, test_tfidf_ngram)
#print ("LC, N-Gram Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Linear Classifier on Character Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_char = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, test_tfidf_ngram_chars)
#print ("LC, CharLevel Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)


In [None]:
#For Sinhala Scripts

# SVM on Ngram Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_ngram = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, test_tfidf_ngram)
#print ("SVM, N-Gram Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

In [None]:
#For Sinhala Scripts

# Extereme Gradient Boosting on Count Vectors
#accuracy, precision, recall, f1_score, predictions_cont = train_model(xgboost.XGBClassifier(), xtrain_count, train_y, xvalid_count, test_count)
#print ("Xgb, Count Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_tf = train_model(xgboost.XGBClassifier(), xtrain_tfidf, train_y, xvalid_tfidf, test_tfidf)
#print ("Xgb, WordLevel TF-IDF: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Extereme Gradient Boosting on Ngram Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_ngram = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, test_tfidf_ngram)
#print ("Xgb, N-Gram Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
#accuracy, precision, recall, f1_score, predictions_char = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, test_tfidf_ngram_chars)
#print ("Xgb, CharLevel Vectors: ", accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)


In [11]:
# Sinhala  Script Classifier
# Naive Bayes on Count Vectors
accuracy, precision, recall, f1_score, classifier_cont = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count, valid_y)
print ("NB, Count Vectors for Sinhala Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)


# Naive Bayes on Word Level TF IDF Vectors
accuracy, precision, recall, f1_score, classifier_tf = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print ("NB, WordLevel TF-IDF for Sinhala Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, precision, recall, f1_score, classifier_ngram = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, valid_y)
print ("NB, N-Gram Vectors for Sinhala Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Naive Bayes on Character Level TF IDF Vectors
accuracy, precision, recall, f1_score, classifier_char = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, valid_y)
print ("NB, CharLevel Vectors for Sinhala Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)



#Tamil Script Classifier
# Naive Bayes on Count Vectors
accuracy, precision, recall, f1_score, classifier_cont_t = train_model(naive_bayes.MultinomialNB(), xtrain_count_t, train_y_t, xvalid_count_t, valid_y_t)
print ("NB, Count Vectors for Tamil Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)


# Naive Bayes on Word Level TF IDF Vectors
accuracy, precision, recall, f1_score, classifier_tf_t = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_t, train_y_t, xvalid_tfidf_t, valid_y_t)
print ("NB, WordLevel TF-IDF for Tamil Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, precision, recall, f1_score, classifier_ngram_t = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_t, train_y_t, xvalid_tfidf_ngram_t, valid_y_t)
print ("NB, N-Gram Vectors for Tamil Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)

# Naive Bayes on Character Level TF IDF Vectors
accuracy, precision, recall, f1_score, classifier_char_t = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars_t, train_y_t, xvalid_tfidf_ngram_chars_t, valid_y_t)
print ("NB, CharLevel Vectors for Tamil Classifier: ",  accuracy, "      Precision: ", precision, "       Recall: ", recall, "     F1_Score: ", f1_score)


[1 1 1 ... 1 1 1]
NB, Count Vectors for Sinhala Classifier:  0.8047244094488188       Precision:  0.6475813751627503        Recall:  0.8047244094488188      F1_Score:  0.717651262178615
[1 1 1 ... 1 1 1]
NB, WordLevel TF-IDF for Sinhala Classifier:  0.8862204724409449       Precision:  0.8855814783159618        Recall:  0.8862204724409449      F1_Score:  0.8581450563461351
[1 1 1 ... 1 1 1]
NB, N-Gram Vectors for Sinhala Classifier:  0.9289370078740158       Precision:  0.925582696663703        Recall:  0.9289370078740158      F1_Score:  0.921433901052541
[1 1 1 ... 1 1 1]
NB, CharLevel Vectors for Sinhala Classifier:  0.9576771653543307       Precision:  0.9567072904921449        Recall:  0.9576771653543307      F1_Score:  0.9570272461563771
[2 1 1 ... 1 0 1]
NB, Count Vectors for Tamil Classifier:  0.825       Precision:  0.6806249999999999        Recall:  0.825      F1_Score:  0.7458904109589041
[2 1 1 ... 1 0 1]
NB, WordLevel TF-IDF for Tamil Classifier:  0.8854330708661418       P

In [19]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(classifier_char, open(filename, 'wb'))

filename2 = 'tfidf_vect_ngram_chars.sav'
pickle.dump(tfidf_vect_ngram_chars, open(filename2, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
loaded_model2 = pickle.load(open(filename, 'rb'))
name="පරමේශ්වරම්"
test_tfidf_ngram_chars =  loaded_model2.transform([name])
predictions_char = loaded_model.predict(test_tfidf_ngram_chars)
print(predictions_char)

AttributeError: 'MultinomialNB' object has no attribute 'transform'

In [None]:
# Since the best accuracy,presicion, recall and F1-score, was given by the char level vector classifier for both Tamil and Sinhala, 
# it was choosen as the best model (with naive bayes classifier)
test_set = pd.read_csv('data/test_data_names.csv')

ethnic_type={0:'Muslim', 1:'Sinhala', 2:'Tamil'}
predictions = []

name="මුතෙන්දාස්"

languageID= langid.classify(name)
#print (name)
if(languageID[0]=='si'):
    test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform([name])
    predictions_char = classifier_char.predict(test_tfidf_ngram_chars)
    predictions.append(ethnic_type[predictions_char[0]])
    print(predictions_char)
else:
    test_tfidf_ngram_chars_t =  tfidf_vect_ngram_chars_t.transform([name])
    predictions_char_t = classifier_char_t.predict(test_tfidf_ngram_chars_t)
    predictions.append(ethnic_type[predictions_char_t[0]])
    print(predictions_char_t)


