In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [27]:
x_train = pd.read_csv('data/x_train.csv')
y_train = pd.read_csv('data/y_train.csv')
x_val = pd.read_csv('data/x_val.csv')
y_val = pd.read_csv('data/y_val.csv')
x_test = pd.read_csv('data/x_test.csv')
y_test = pd.read_csv('data/y_test.csv')

#for splitting after feature extraction
# x = x_train.append(x_val).append(x_test)
# y = y_train.append(y_val).append(y_test)
# x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
# x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.11)

In [10]:
NB = GaussianNB()
SVM = SVC()
RF = RandomForestClassifier()
AdaBoost = AdaBoostClassifier()
KNN = KNeighborsClassifier()

In [13]:
def classifier_chain(model, x_train, y_train, x_test, y_test, num_chian = 10):
    if 'utterance' in x_train.columns:
        x_train = x_train.drop('utterance', axis=1)
        x_test = x_test.drop('utterance', axis=1)
        
    chains = [ClassifierChain(model, order='random', random_state=i)
              for i in range(num_chian)]
    for chain in chains:
        chain.fit(x_train, y_train)

    y_pred = np.array([chain.predict(x_test) for chain in
                              chains]).mean(axis=0)
    
    acc = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(np.logical_or((y_pred>0.5), y_test), axis = 1))
    precision = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum((y_pred>0.5), axis = 1))
    recall = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(y_test, axis = 1))
    F1 = 2*(precision*recall)/(precision+recall)
    
    print('Accuracy:', acc)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1)
    return acc, precision, recall, F1

In [30]:
classifier_chain(NB, x_train, y_train, x_test, y_test)

Accuracy: 0.3782589262589262
Precision: 0.490134987049213
Recall: 0.6117106227106228
F1 score: 0.5442155878907348


(0.3782589262589262, 0.490134987049213, 0.6117106227106228, 0.5442155878907348)

In [33]:
%%time
classifier_chain(SVM, x_train, y_train, x_test, y_test)

Accuracy: 0.3639206349206349
Precision: 0.7408759124087592
Recall: 0.3808437118437118
F1 score: 0.50308103094012
Wall time: 14min 34s


(0.3639206349206349, 0.7408759124087592, 0.3808437118437118, 0.50308103094012)

In [25]:
%%time
classifier_chain(RF, x_train, y_train, x_test, y_test)

Accuracy: 0.5594468864468863
Precision: 0.8584188521474724
Recall: 0.5836593406593406
F1 score: 0.6948640978735902
Wall time: 27.1 s


(0.5594468864468863,
 0.8584188521474724,
 0.5836593406593406,
 0.6948640978735902)

In [32]:
%%time
classifier_chain(AdaBoost, x_train, y_train, x_test, y_test)

Accuracy: 0.5746105006105006
Precision: 0.8217866093805943
Recall: 0.6121001221001221
F1 score: 0.701611463302493
Wall time: 2min 43s


(0.5746105006105006, 0.8217866093805943, 0.6121001221001221, 0.701611463302493)

In [31]:
KNN.fit(x_train.drop('utterance', axis=1), y_train)
y_pred = KNN.predict(x_test.drop('utterance', axis=1))

acc = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(np.logical_or((y_pred>0.5), y_test), axis = 1))
precision = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum((y_pred>0.5), axis = 1))
recall = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(y_test, axis = 1))
F1 = 2*(precision*recall)/(precision+recall)

print('Accuracy:', acc)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', F1)

Accuracy: 0.36642979242979246
Precision: 0.6123232323232323
Recall: 0.40529914529914524
F1 score: 0.4877528013628434


### Hypeparameter tuning

In [None]:
RF = RandomForestClassifier(max_depth=depth, min_samples_split= 5, min_samples_leaf=2)

### Table 6

In [3]:
Con = ['InitSim','DlgSim', 'QuestMark', 'Dup', 'What', 'Where', 'When', 'Why', 'Who', 'How']
Str = ['AbsPos', 'NormPos', 'Len', 'LenUni', 'LenStem', 'Starter']
Sen = ['Thank', 'ExMark', 'Feedback', 'SenScr_Neg', 'SenScr_Neu', 'SenScr_Pos', 'Lex_Pos', 'Lex_Neg']

In [21]:
table6 = pd.DataFrame(columns=['Group(s)','Acc', 'Precision', 'Recall', 'F1'])

In [22]:
table6.loc[0] = ['Content'] + list(classifier_chain(RF, x_train[Con], y_train, x_test[Con], y_test))
table6.loc[1] = ['Structural'] + list(classifier_chain(RF, x_train[Str], y_train, x_test[Str], y_test))
table6.loc[2] = ['Sentiment'] + list(classifier_chain(RF, x_train[Sen], y_train, x_test[Sen], y_test))
table6.loc[3] = ['Con+Str'] + list(classifier_chain(RF, x_train[Con+Str], y_train, x_test[Con+Str], y_test))
table6.loc[4] = ['Con+Sen'] + list(classifier_chain(RF, x_train[Con+Sen], y_train, x_test[Con+Sen], y_test))
table6.loc[5] = ['Str+Sent'] + list(classifier_chain(RF, x_train[Str+Sen], y_train, x_test[Str+Sen], y_test))

Accuracy: 0.4052405372405373
Precision: 0.656399845320959
Recall: 0.45146764346764345
F1 score: 0.5349796692086624
Accuracy: 0.49395937395937395
Precision: 0.7441099962839094
Recall: 0.5320598290598291
F1 score: 0.6204676361437829
Accuracy: 0.28700122100122105
Precision: 0.517053317053317
Recall: 0.3157264957264957
F1 score: 0.39205424865448124
Accuracy: 0.5098632478632479
Precision: 0.8032879818594104
Recall: 0.5371965811965812
F1 score: 0.6438321924236169
Accuracy: 0.49606471306471306
Precision: 0.7743855606758834
Recall: 0.5179157509157509
F1 score: 0.6207011871895421
Accuracy: 0.5554884004884004
Precision: 0.849905303030303
Recall: 0.5815054945054945
F1 score: 0.6905419525579883


In [23]:
table6

Unnamed: 0,Group(s),Acc,Precision,Recall,F1
0,Content,0.405241,0.6564,0.451468,0.53498
1,Structural,0.493959,0.74411,0.53206,0.620468
2,Sentiment,0.287001,0.517053,0.315726,0.392054
3,Con+Str,0.509863,0.803288,0.537197,0.643832
4,Con+Sen,0.496065,0.774386,0.517916,0.620701
5,Str+Sent,0.555488,0.849905,0.581505,0.690542


### Table 7

In [26]:
print(sorted(zip(map(lambda x: round(x, 4), RF.feature_importances_), names), reverse=True))

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.