In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

  from pandas import MultiIndex, Int64Index
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
# Read the training split and the test split; both CSV files are looked up
# relative to the notebook's working directory.
train, test = pd.read_csv('train_set.csv'), pd.read_csv('test_set.csv')

In [None]:
# Show the loaded training DataFrame as this cell's output.
train

In [None]:
# Three-letter language code (the values found in the `lang_id` column)
# -> human-readable language name. Used to label plots.
lang_map = dict(
    afr='Afrikaans',
    eng='English',
    nbl='isiNdebele',
    nso='Sepedi',
    sot='Sesotho',
    ssw='siSwati',
    tsn='Setswana',
    tso='Xitsonga',
    ven='Tshivenda',
    xho='isiXhosa',
    zul='isiZulu',
)

In [None]:
# Class-balance check: number of training rows per language, shown with
# readable language names instead of the raw `lang_id` codes.
lang_counts = train['lang_id'].map(lang_map).value_counts()
ax = lang_counts.plot.barh(title='Training samples per language')
# Label both axes so the figure can be read on its own; the trailing
# semicolon suppresses the text repr that would otherwise be echoed.
ax.set(xlabel='Number of samples', ylabel='Language');

In [None]:
# Generate wordcloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

In [3]:
from sklearn.preprocessing import LabelEncoder

# Features are the raw text strings; the target is the language code turned
# into integer class ids. `le` is kept so predictions can be decoded back to
# the original `lang_id` strings with le.inverse_transform.
X = train['text']
le = LabelEncoder()
y = le.fit_transform(train['lang_id'])

In [13]:
# GridSearchCV is used below but is not imported anywhere else in the
# notebook; import it here so the cell runs on a fresh kernel.
from sklearn.model_selection import GridSearchCV

# TF-IDF over word uni- and bigrams feeding a linear SVM.
# NOTE(review): stop_words='english' drops English function words, which are
# presumably strong cues in a language-identification task — confirm intended.
ls = LinearSVC()
pipe2 = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english',
                             min_df=1,
                             max_df=0.9,
                             ngram_range=(1, 2))),
    ('model', ls),
])

# Candidate regularisation strengths for the SVM step of the pipeline.
parameters_svm = {
    "model__C": (0.01, 0.1, 1, 10),
}

# Search over C using the search's default scoring and CV, on all cores.
tuned2 = GridSearchCV(pipe2, parameters_svm, n_jobs=-1)
tuned2.fit(X, y)
print(tuned2.best_score_)
print(tuned2.best_params_)

# Evaluate the whole search procedure with 2-fold CV on weighted F1. Note this
# is a different metric from the best_score_ printed above, which comes from
# the search's default scoring.
scores = cross_val_score(tuned2, X, y, scoring='f1_weighted', cv=2)
print(scores)
print(scores.mean())

0.9977575757575756
{'model__C': 10}
[0.99648373 0.99630408]
0.9963939047875563


In [4]:
# Multinomial Naive Bayes on TF-IDF word uni-/bigram features.
nb = MultinomialNB()
pipe1 = Pipeline(steps=[
    ('vect', TfidfVectorizer(
        stop_words='english',
        min_df=1,
        max_df=0.9,
        ngram_range=(1, 2),
    )),
    ('model', nb),
])

# Fit on the full training set; this fitted pipeline is reused by later
# ensemble cells.
pipe1.fit(X, y)

# 2-fold cross-validated weighted F1: per-fold scores, then their mean.
scores = cross_val_score(pipe1, X, y, cv=2, scoring='f1_weighted')
print(scores, scores.mean(), sep='\n')

[0.99842284 0.9982418 ]
0.9983323191540634


In [8]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier

# Bagging ensemble of 10 LinearSVC models. The base estimator is passed
# positionally on purpose: the keyword was called `base_estimator` in older
# scikit-learn releases and `estimator` in newer ones (the old keyword was
# removed), while the first positional slot works with both.
clf = BaggingClassifier(LinearSVC(), n_estimators=10, random_state=0)
pipe_clf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english',
                             min_df=1,
                             max_df=0.9,
                             ngram_range=(1, 2))),
    ('model', clf),
])

# Fit on the full training set; the fitted pipeline is used later for
# prediction and inside the ensemble cells.
pipe_clf.fit(X, y)

# 2-fold cross-validated weighted F1: per-fold scores, then their mean.
scores = cross_val_score(pipe_clf, X, y, scoring='f1_weighted', cv=2)
print(scores)
print(scores.mean())

[0.99551184 0.99533363]
0.9954227328564211


In [11]:
# ComplementNB is used below but is not imported anywhere else in the
# notebook; import it here so the cell runs on a fresh kernel.
from sklearn.naive_bayes import ComplementNB

# Complement Naive Bayes on TF-IDF word uni-/bigram features.
cnb = ComplementNB()
pipe3 = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english',
                             min_df=1,
                             max_df=0.9,
                             ngram_range=(1, 2))),
    ('model', cnb),
])

# Fit on the full training set; the fitted pipeline is used later for
# prediction and inside the ensemble cells.
pipe3.fit(X, y)

# 2-fold cross-validated weighted F1: per-fold scores, then their mean.
scores = cross_val_score(pipe3, X, y, scoring='f1_weighted', cv=2)
print(scores)
print(scores.mean())

[0.99757165 0.99787825]
0.9977249503002095


In [21]:
# BernoulliNB is used below but is not imported anywhere else in the
# notebook; import it here so the cell runs on a fresh kernel.
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes on TF-IDF word uni-/bigram features.
bnb = BernoulliNB()
pipe5 = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english',
                             min_df=1,
                             max_df=0.9,
                             ngram_range=(1, 2))),
    ('model', bnb),
])

# Fit on the full training set.
pipe5.fit(X, y)

# 2-fold cross-validated weighted F1: per-fold scores, then their mean.
scores = cross_val_score(pipe5, X, y, scoring='f1_weighted', cv=2)
print(scores)
print(scores.mean())

[0.99626377 0.99667582]
0.9964697994498408


In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners for the stack. Labels describe what each pipeline actually
# wraps: pipe1 -> MultinomialNB, pipe3 -> ComplementNB, pipe_clf -> bagged
# LinearSVC.
models = [
    ("multinomial_nb", pipe1),
    ("complement_nb", pipe3),
    ("bagged_linear_svc", pipe_clf),
]
# Logistic regression combines the base learners' outputs.
meta_learner_reg = LogisticRegression()

s_clf = StackingClassifier(estimators=models, final_estimator=meta_learner_reg)
# Fit on the full training set; the fitted stack is used later for prediction.
s_clf.fit(X, y)

# 2-fold cross-validated weighted F1: per-fold scores, then their mean.
scores = cross_val_score(s_clf, X, y, scoring='f1_weighted', cv=2)
print(scores)
print(scores.mean())


In [56]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble. Labels describe what each pipeline actually wraps:
# pipe1 -> MultinomialNB, pipe3 -> ComplementNB, pipe_clf -> bagged LinearSVC.
models = [
    ("multinomial_nb", pipe1),
    ("complement_nb", pipe3),
    ("bagged_linear_svc", pipe_clf),
]
v_clf = VotingClassifier(estimators=models, voting='soft')
# Fit on the full training set; the fitted ensemble is used later for
# prediction.
v_clf.fit(X, y)

In [51]:
# Predict the test texts with the bagged LinearSVC pipeline and decode the
# integer class ids back to the original `lang_id` strings.
y_pred = le.inverse_transform(pipe_clf.predict(test['text']))

# Two-column submission: 'index' is the test row label + 1, 'lang_id' the
# predicted language code.
submission = pd.DataFrame({'index': test.index + 1, 'lang_id': y_pred})
submission.to_csv('bagging.csv', index=False)
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [52]:
# Predict the test texts with the ComplementNB pipeline and decode the
# integer class ids back to the original `lang_id` strings.
y_pred = le.inverse_transform(pipe3.predict(test['text']))

# Two-column submission: 'index' is the test row label + 1, 'lang_id' the
# predicted language code.
submission = pd.DataFrame({'index': test.index + 1, 'lang_id': y_pred})
submission.to_csv('complement.csv', index=False)
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [33]:
# Weighted soft-voting ensemble over three previously built estimators.
# NOTE(review): `pipe` and `tuned` are not defined in any cell visible in this
# notebook (only `pipe2` / `tuned2` are), so this cell presumably raises
# NameError on a fresh kernel — confirm, then fix or remove.
# NOTE(review): `pipe2` ends in a LinearSVC, which to my knowledge exposes no
# predict_proba; soft voting would then fail at predict time — confirm.
# NOTE(review): the labels "ET", "XGB" and "Random Forest" do not obviously
# match the estimators they are attached to — verify.
from sklearn.ensemble import VotingClassifier
models = [("ET", pipe),("XGB", tuned), ("Random Forest", pipe2)]
# The first estimator's vote counts twice as much as each of the other two.
v_clf3 = VotingClassifier(estimators=models, voting='soft', weights=[2,1,1])
v_clf3.fit(X, y)

In [60]:
# Predict the test texts with the stacking ensemble and decode the integer
# class ids back to the original `lang_id` strings.
y_pred = le.inverse_transform(s_clf.predict(test['text']))

# Two-column submission: 'index' is the test row label + 1, 'lang_id' the
# predicted language code.
submission = pd.DataFrame({'index': test.index + 1, 'lang_id': y_pred})
submission.to_csv('stack2.csv', index=False)
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [None]:
# Display the collected results object.
# NOTE(review): `results` is not assigned in any cell visible in this
# notebook, so this presumably raises NameError on a fresh kernel — confirm.
results

In [None]:
# Tabulate the per-classifier results with the classifier name as the row
# index, then display the table.
result_columns = ['Classifier', 'F1 score', 'Accuracy', 'Train time']
df = pd.DataFrame(results, columns=result_columns).set_index('Classifier')
df

In [57]:
# Predict the test texts with the soft-voting ensemble and decode the integer
# class ids back to the original `lang_id` strings.
y_pred = le.inverse_transform(v_clf.predict(test['text']))

# Two-column submission: 'index' is the test row label + 1, 'lang_id' the
# predicted language code.
submission = pd.DataFrame({'index': test.index + 1, 'lang_id': y_pred})
submission.to_csv('soft voting.csv', index=False)
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,eng
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
