# CLASSIFICATION EXAM

In [2]:

import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss

In [3]:
# Load the labelled training data and the test set.
# Both paths are relative, so the CSV files must be in the working directory.
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [4]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [5]:
train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [6]:
train.lang_id.value_counts()

xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

In [7]:
train.lang_id.nunique()

11

In [8]:
len(train)

33000

In [9]:
# One-hot encode the language code. drop_first=True omits the first category,
# so the result has no 'afr' column and 'afr' rows are all zeros.
train_dummy = pd.get_dummies(train['lang_id'],drop_first=True)

In [10]:
train_dummy.head()

Unnamed: 0,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul
0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0


In [10]:

# Append the dummy columns to the original frame column-wise (axis=1);
# rows are matched on the shared index.
train_real = pd.concat([train,train_dummy],axis=1)

In [11]:
train_real.head()

Unnamed: 0,lang_id,text,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,0,0,0,0,0,0,0,0,1,0
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,0,0,0,0,0,0,0,0,1,0
2,eng,the province of kwazulu-natal department of tr...,1,0,0,0,0,0,0,0,0,0
3,nso,o netefatša gore o ba file dilo ka moka tše le...,0,0,1,0,0,0,0,0,0,0
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,0,0,0,0,0,0,0,1,0,0


In [12]:

def convert_to_number(lang):
    """Map a language code to its integer class label.

    The input is split on whitespace and the tokens are examined in order;
    the label of the first token that is a known language code is returned.

    Parameters
    ----------
    lang : str
        Text containing a language code such as 'afr' or 'zul'.

    Returns
    -------
    int or None
        A label from 1 to 11, or None when no token is a known code
        (including the empty string).
    """
    labels = {
        'afr': 1,
        'eng': 2,
        'xho': 3,
        'zul': 4,
        'tso': 5,
        'sot': 6,
        'tsn': 7,
        'nso': 8,
        'ssw': 9,
        'ven': 10,
        'nbl': 11,
    }
    # First recognised token wins; fall back to None when nothing matches.
    return next(
        (labels[token] for token in lang.split() if token in labels),
        None,
    )

In [56]:
# Encode each row's language code as an integer label for the classifier.
# convert_to_number returns None for codes it does not recognise.
train_real['target'] = train_real['lang_id'].apply(convert_to_number)

In [57]:
train_real.head()

Unnamed: 0,lang_id,text,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul,target
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,0,0,0,0,0,0,0,0,1,0,3
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,0,0,0,0,0,0,0,0,1,0,3
2,eng,the province of kwazulu-natal department of tr...,1,0,0,0,0,0,0,0,0,0,2
3,nso,o netefatša gore o ba file dilo ka moka tše le...,0,0,1,0,0,0,0,0,0,0,8
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,0,0,0,0,0,0,0,1,0,0,10


In [58]:
# NOTE(review): dropna() returns a new frame; the result is only displayed
# here and never assigned, so train_real itself is left unchanged.
train_real.dropna()

Unnamed: 0,lang_id,text,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul,target
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,0,0,0,0,0,0,0,0,1,0,3
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,0,0,0,0,0,0,0,0,1,0,3
2,eng,the province of kwazulu-natal department of tr...,1,0,0,0,0,0,0,0,0,0,2
3,nso,o netefatša gore o ba file dilo ka moka tše le...,0,0,1,0,0,0,0,0,0,0,8
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,0,0,0,0,0,0,0,1,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,0,0,0,0,0,1,0,0,0,0,7
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,0,0,0,1,0,0,0,0,0,0,6
32997,eng,closing date for the submission of completed t...,1,0,0,0,0,0,0,0,0,0,2
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,0,0,0,0,0,0,0,0,1,0,3


In [59]:
train_real.target.value_counts()

3     3000
2     3000
8     3000
10    3000
7     3000
11    3000
4     3000
9     3000
5     3000
6     3000
1     3000
Name: target, dtype: int64

In [60]:
# X_tra: every column except the label columns.
# NOTE(review): X_tra is not referenced by the cells shown here — confirm it is
# still needed.
X_tra = train_real.drop(['lang_id','target'], axis=1)
# Model input is the raw text; the target is the integer language label.
X = train_real.text
y = train_real.target

In [61]:
# Bag-of-words counts over word unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary also contains terms from held-out rows.
# Consider fitting on the training split only.
vect = CountVectorizer(ngram_range=(1, 2))
X_vector = vect.fit_transform(X)

In [62]:
train_real['target'].dtypes

dtype('int64')

In [63]:
# Hold out 20% of the rows for evaluation. random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state = 42)

In [64]:
# Fit a random forest on the training split and predict the held-out rows.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and therefore the reported metrics, are reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [65]:
# Per-class precision/recall/F1, then overall accuracy and the
# macro-averaged F1 score on the held-out split.
print(classification_report(y_test,y_pred)) 
print("The accuracy score is: ",accuracy_score(y_test,y_pred))
print("The F1 score is: ",f1_score(y_test,y_pred, average='macro'))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       583
           2       1.00      1.00      1.00       615
           3       0.98      0.97      0.97       609
           4       0.91      0.97      0.94       590
           5       1.00      1.00      1.00       561
           6       1.00      1.00      1.00       618
           7       0.99      1.00      0.99       598
           8       1.00      0.99      1.00       625
           9       1.00      0.96      0.98       584
          10       1.00      1.00      1.00       634
          11       0.99      0.95      0.97       583

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600

The accuracy score is:  0.9860606060606061
The F1 score is:  0.985990993927685


In [1]:
# Show the raw predictions.
# NOTE(review): the saved output for this cell is a NameError recorded at
# execution count 1, i.e. it was run on a fresh kernel before y_pred existed.
# Re-run the notebook top to bottom to refresh it.
y_pred

NameError: name 'y_pred' is not defined