In [40]:
#Importing libraries
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from nltk import word_tokenize, TreebankWordTokenizer
from nltk import PorterStemmer

In [41]:
train_data = pd.read_csv('train_set.csv')
test_data = pd.read_csv('test_set.csv')

In [42]:
train_data.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [43]:
test_data.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [44]:
#checking for any null data
train_data.isnull().sum()

lang_id    0
text       0
dtype: int64

In [45]:
#checking distribution of data for each language 
train_data['lang_id'].value_counts()

xho    3000
ven    3000
sot    3000
tsn    3000
ssw    3000
nbl    3000
zul    3000
nso    3000
tso    3000
afr    3000
eng    3000
Name: lang_id, dtype: int64

In [46]:
#cleaning the data 
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
train_data['clean'] = train_data['text'].replace(to_replace = pattern_url, value = ' ', regex = True)

In [47]:
# all strings lower case
train_data['clean'] = train_data['clean'].str.lower()

In [48]:
# tokenize 
train_data['tokens'] = train_data['clean'].apply(word_tokenize)

In [49]:
train_data

Unnamed: 0,lang_id,text,clean,tokens
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik..."
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe..."
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,..."
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo..."
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew..."
...,...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,popo ya dipolateforomo tse ke go tlisa boetele...,"[popo, ya, dipolateforomo, tse, ke, go, tlisa,..."
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,modise mosadi na o ntse o sa utlwe hore thaban...,"[modise, mosadi, na, o, ntse, o, sa, utlwe, ho..."
32997,eng,closing date for the submission of completed t...,closing date for the submission of completed t...,"[closing, date, for, the, submission, of, comp..."
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,nawuphina umntu ofunyenwe enetyala phantsi kwa...,"[nawuphina, umntu, ofunyenwe, enetyala, phants..."


In [50]:
stem=PorterStemmer()
train_data['stemmed'] = train_data['tokens'].apply(lambda x: [stem.stem(word) for word in x])
train_data

# stemming and leminization seems to only funtion for english words 

Unnamed: 0,lang_id,text,clean,tokens,stemmed
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik..."
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe..."
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,...","[the, provinc, of, kwazulu-nat, depart, of, tr..."
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo..."
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew..."
...,...,...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,popo ya dipolateforomo tse ke go tlisa boetele...,"[popo, ya, dipolateforomo, tse, ke, go, tlisa,...","[popo, ya, dipolateforomo, tse, ke, go, tlisa,..."
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,modise mosadi na o ntse o sa utlwe hore thaban...,"[modise, mosadi, na, o, ntse, o, sa, utlwe, ho...","[modis, mosadi, na, o, ntse, o, sa, utlw, hore..."
32997,eng,closing date for the submission of completed t...,closing date for the submission of completed t...,"[closing, date, for, the, submission, of, comp...","[close, date, for, the, submiss, of, complet, ..."
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,nawuphina umntu ofunyenwe enetyala phantsi kwa...,"[nawuphina, umntu, ofunyenwe, enetyala, phants...","[nawuphina, umntu, ofunyenw, enetyala, phantsi..."


In [52]:
tfidf_vect=TfidfVectorizer(lowercase=True, ngram_range=(1,3) ,min_df=.01, max_features=250)
X_tfidf = tfidf_vect.fit_transform(train_data['clean'])

In [53]:
X_tfidf.shape

(33000, 250)

In [54]:
X = X_tfidf.toarray()
y = np.array(train_data['lang_id'])

In [55]:
# train data split for model testing 
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [58]:
rfor = RandomForestClassifier(n_estimators=60, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=None, bootstrap=False)
rfor.fit(X_train, y_train)
y_pred = rfor.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9192424242424242


In [59]:
s_vc = SVC(kernel='linear')
s_vc.fit(X_train, y_train)
y_pred = s_vc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9151515151515152


In [60]:
n_n = MLPClassifier()
n_n.fit(X_train, y_train)
y_pred = n_n.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9186363636363636


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

param_rf = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [20, 40, 60, 80, 100]}

# grid = GridSearchCV(RandomForestClassifier, param_rf, refit=True)
# grid.fit(X_train,y_train)

rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = param_rf, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

In [61]:
# fix test data 
test_data.isnull().sum()

index    0
text     0
dtype: int64

In [66]:
test_data['clean'] = test_data['text'].replace(to_replace = pattern_url, value = ' ', regex = True)
test_data['clean'] = test_data['clean'].str.lower()
test_data['tokens'] = test_data['clean'].apply(word_tokenize)

In [67]:
test_data

Unnamed: 0,index,text,clean,tokens
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...","mmasepala, fa maemo a a kgethegileng a letlele...","[mmasepala, ,, fa, maemo, a, a, kgethegileng, ..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...,"[uzakwaziswa, ngokufaneleko, nakungafuneka, em..."
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu.,"[tshivhumbeo, tshi, fana, na, ngano, dza, vhat..."
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...,"[kube, inja, nelikati, betingevakala, kutsi, t..."
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta.,"[winste, op, buitelandse, valuta, .]"
...,...,...,...,...
5677,5678,You mark your ballot in private.,you mark your ballot in private.,"[you, mark, your, ballot, in, private, .]"
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...,ge o ka kgetha ka bowena go se šomiše mofani k...,"[ge, o, ka, kgetha, ka, bowena, go, se, šomiše..."
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...","e ka kopo etsa kgetho ya hao ka hloko, hobane ...","[e, ka, kopo, etsa, kgetho, ya, hao, ka, hloko..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ...","tb ke bokudi ba pmb, mme morero o tla lefella ...","[tb, ke, bokudi, ba, pmb, ,, mme, morero, o, t..."


In [68]:
X_tfidf_test = tfidf_vect.fit_transform(test_data['clean'])

In [69]:
X_tfidf_test.shape

(5682, 250)

In [70]:
test_predictions = rfor.predict(X_tfidf_test.toarray())

In [71]:
submission = pd.DataFrame(
    {'index': test_data['index'],
     'lang_id': test_predictions
    })

In [73]:
submission.head()

Unnamed: 0,index,lang_id
0,1,xho
1,2,xho
2,3,eng
3,4,zul
4,5,xho


In [74]:
# save DataFrame to csv file for submission
submission.to_csv("submission.csv", index=False)