# CLASSIFICATION EXAM

In [1]:

import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss

In [2]:
# Read the two input files from the working directory.
# Per the previews below, the training file carries `lang_id` and `text` columns,
# while the test file carries `index` and `text` (no label).
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [3]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [5]:
train.lang_id.value_counts()

xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

In [6]:
train.lang_id.nunique()

11

In [7]:
len(train)

33000

In [8]:
# One-hot encode the language label. drop_first=True leaves out the first
# category, so 'afr' has no column of its own (a row of all zeros means 'afr').
train_dummy = pd.get_dummies(train['lang_id'],drop_first=True)

In [9]:
train_dummy.head()

Unnamed: 0,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul
0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0


In [10]:

# Append the one-hot label columns to the original frame, side by side (axis=1).
train_real = pd.concat([train,train_dummy],axis=1)

In [11]:
train_real.head()

Unnamed: 0,lang_id,text,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,0,0,0,0,0,0,0,0,1,0
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,0,0,0,0,0,0,0,0,1,0
2,eng,the province of kwazulu-natal department of tr...,1,0,0,0,0,0,0,0,0,0
3,nso,o netefatša gore o ba file dilo ka moka tše le...,0,0,1,0,0,0,0,0,0,0
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,0,0,0,0,0,0,0,1,0,0


In [12]:

def convert_to_number(lang):
    """Map a language-code string to its integer class label.

    The string is split on whitespace and its tokens are scanned in order;
    the label of the first token that is a known language code is returned.

    Args:
        lang: string holding a language code such as 'afr' or 'xho'.

    Returns:
        An int from 1 to 11, or None when no token is a known code
        (this includes an empty or whitespace-only string).
    """
    # Position in this tuple (1-based) is the class label.
    codes = ('afr', 'eng', 'xho', 'zul', 'tso', 'sot',
             'tsn', 'nso', 'ssw', 'ven', 'nbl')
    labels = {code: position for position, code in enumerate(codes, start=1)}

    for token in lang.split():
        label = labels.get(token)
        if label is not None:
            return label
    return None

In [13]:
# Integer class label for every row, derived element-wise from its language code.
target_labels = train_real['lang_id'].map(convert_to_number)
train_real['target'] = target_labels

In [14]:
train_real.head()

Unnamed: 0,lang_id,text,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul,target
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,0,0,0,0,0,0,0,0,1,0,3
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,0,0,0,0,0,0,0,0,1,0,3
2,eng,the province of kwazulu-natal department of tr...,1,0,0,0,0,0,0,0,0,0,2
3,nso,o netefatša gore o ba file dilo ka moka tše le...,0,0,1,0,0,0,0,0,0,0,8
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,0,0,0,0,0,0,0,1,0,0,10


In [15]:
# dropna() returns a new frame and leaves its input untouched, so the result
# has to be assigned back for rows with missing values to actually be removed.
train_real = train_real.dropna()
train_real

Unnamed: 0,lang_id,text,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul,target
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,0,0,0,0,0,0,0,0,1,0,3
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,0,0,0,0,0,0,0,0,1,0,3
2,eng,the province of kwazulu-natal department of tr...,1,0,0,0,0,0,0,0,0,0,2
3,nso,o netefatša gore o ba file dilo ka moka tše le...,0,0,1,0,0,0,0,0,0,0,8
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,0,0,0,0,0,0,0,1,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,0,0,0,0,0,1,0,0,0,0,7
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,0,0,0,1,0,0,0,0,0,0,6
32997,eng,closing date for the submission of completed t...,1,0,0,0,0,0,0,0,0,0,2
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,0,0,0,0,0,0,0,0,1,0,3


In [16]:
train_real.target.value_counts()

3     3000
2     3000
8     3000
10    3000
7     3000
11    3000
4     3000
9     3000
5     3000
6     3000
1     3000
Name: target, dtype: int64

In [17]:
# Model input is the raw text; the label is the integer language code.
y = train_real['target']
X = train_real['text']
# Frame without the two label columns. It is not referenced again in the cells shown here.
X_tra = train_real.drop(columns=['lang_id', 'target'])

In [18]:
# Bag-of-words counts over single words and adjacent word pairs (ngram_range=(1, 2)).
# fit_transform learns the vocabulary from the training text and encodes it in one pass;
# `vect` keeps that vocabulary for encoding other text later.
vect = CountVectorizer(ngram_range=(1, 2))
X_vector = vect.fit_transform(X)

In [19]:
train_real['target'].dtypes

dtype('int64')

In [20]:
# Hold out 20% of the vectorized training rows for validation.
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state = 42)

In [21]:
# Seed the forest: without random_state every run grows different trees, so the
# metrics reported below could not be reproduced after a kernel restart.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
# Predictions for the held-out validation split (used for the metrics below).
y_pred = clf.predict(X_test)

In [22]:
# Per-class precision/recall table, then overall accuracy and macro-averaged F1,
# all computed on the held-out validation split.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(report)
print("The accuracy score is: ", accuracy)
print("The F1 score is: ", macro_f1)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       583
           2       0.99      1.00      1.00       615
           3       0.98      0.97      0.98       609
           4       0.90      0.98      0.94       590
           5       1.00      1.00      1.00       561
           6       1.00      1.00      1.00       618
           7       0.99      1.00      0.99       598
           8       1.00      0.99      0.99       625
           9       0.99      0.96      0.98       584
          10       1.00      1.00      1.00       634
          11       0.99      0.95      0.97       583

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600

The accuracy score is:  0.9854545454545455
The F1 score is:  0.9854142002272599


In [33]:
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [39]:
# Keep the test-set row identifiers; they become the `index` column of the submission.
index = test['index']

In [43]:
test_vect = vect.fit_transform(test['text'])

In [45]:
clf=RandomForestClassifier()
clf.fit(X_train,y_train)

RandomForestClassifier()

In [49]:
subm = pd.DataFrame(list(zip(index, y_pred)), columns = ['index','lang_id'])

subm

Unnamed: 0,index,lang_id
0,1,6
1,2,8
2,3,2
3,4,8
4,5,8
...,...,...
5677,5678,4
5678,5679,5
5679,5680,4
5680,5681,6


In [53]:
# Translate the integer class labels in subm['lang_id'] back to language codes.
# A label outside 1..11 adds nothing to the list.
label_names = {
    1: 'afr',
    2: 'eng',
    3: 'xho',
    4: 'zul',
    5: 'tso',
    6: 'sot',
    7: 'tsn',
    8: 'nso',
    9: 'ssw',
    10: 'ven',
    11: 'nbl',
}

lang = []
for pred in list(subm['lang_id']):
    if pred in label_names:
        lang.append(label_names[pred])

In [54]:
lang

['sot',
 'nso',
 'eng',
 'nso',
 'nso',
 'ven',
 'eng',
 'xho',
 'xho',
 'nbl',
 'sot',
 'afr',
 'nso',
 'eng',
 'nso',
 'tsn',
 'ssw',
 'sot',
 'tsn',
 'ven',
 'sot',
 'zul',
 'tso',
 'nso',
 'ssw',
 'ven',
 'eng',
 'xho',
 'eng',
 'tsn',
 'tsn',
 'nbl',
 'nso',
 'ven',
 'eng',
 'sot',
 'tsn',
 'eng',
 'zul',
 'tsn',
 'eng',
 'nbl',
 'ssw',
 'nbl',
 'nbl',
 'eng',
 'xho',
 'zul',
 'eng',
 'xho',
 'tsn',
 'ssw',
 'xho',
 'tso',
 'afr',
 'sot',
 'tsn',
 'zul',
 'xho',
 'tso',
 'nbl',
 'ssw',
 'eng',
 'ssw',
 'sot',
 'xho',
 'zul',
 'nso',
 'ven',
 'xho',
 'nbl',
 'zul',
 'zul',
 'eng',
 'tso',
 'ven',
 'nso',
 'nso',
 'zul',
 'zul',
 'ven',
 'sot',
 'eng',
 'tsn',
 'tsn',
 'zul',
 'ven',
 'afr',
 'tso',
 'tso',
 'nso',
 'xho',
 'ven',
 'eng',
 'xho',
 'ssw',
 'eng',
 'sot',
 'afr',
 'sot',
 'xho',
 'ven',
 'zul',
 'tsn',
 'nbl',
 'tsn',
 'eng',
 'xho',
 'nbl',
 'xho',
 'sot',
 'zul',
 'nso',
 'tsn',
 'nbl',
 'xho',
 'zul',
 'nbl',
 'tso',
 'zul',
 'sot',
 'zul',
 'tso',
 'ven',
 'zul',


In [56]:
# Preview only: drop() returns a new frame and the result is not assigned,
# so `subm` itself still has its lang_id column after this cell.
subm.drop(columns=['lang_id'])

Unnamed: 0,index
0,1
1,2
2,3
3,4
4,5
...,...
5677,5678
5678,5679
5679,5680
5680,5681


In [57]:
# Overwrite the numeric labels with the language-code strings built above.
# `lang` must have exactly one entry per row of `subm` for this assignment to succeed.
subm['lang_id'] = lang
subm.head()

Unnamed: 0,index,lang_id
0,1,sot
1,2,nso
2,3,eng
3,4,nso
4,5,nso


In [59]:
# Write the submission file; the DataFrame's own row index is left out.
submission_path = 'classification_hackactton_odutayo.csv'
subm.to_csv(submission_path, index=False, index_label=False)