In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

from tqdm import tqdm_notebook
import random

In [6]:
x = pd.read_csv('data/x.csv').drop('utterance', axis=1)
y = pd.read_csv('data/y.csv')

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1)

In [7]:
def classifier_chain(model,x_test, y_test,num_chain = 10):
    chains = [ClassifierChain(model, order='random', random_state=i)
              for i in range(num_chain)]
    for chain in chains:
        chain.fit(x_train, y_train)

    y_pred = np.array([chain.predict(x_test) for chain in
                              chains]).mean(axis=0)
    
    acc = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(np.logical_or((y_pred>0.5), y_test), axis = 1))
    precision = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum((y_pred>0.5), axis = 1))
    recall = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(y_test, axis = 1))
    F1 = 2*(precision*recall)/(precision+recall)
    return acc, precision, recall, F1


GaussianNB : No hyperparameters needed

In [7]:
NB = GaussianNB()
acc, precision, recall, F1 = classifier_chain(NB,x_test,y_test)
print('Accuracy:', acc)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', F1)

Accuracy: 0.281470629641361
Precision: 0.41213262003830636
Recall: 0.5532731496146127
F1 score: 0.47238564325028687


In [None]:
clf2 = MultiOutputClassifier(GaussianNB()).fit(x_train, y_train)

y_pred = clf2.predict(x_test)
acc = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(np.logical_or((y_pred>0.5), y_test), axis = 1))
precision = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum((y_pred>0.5), axis = 1))
recall = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(y_test, axis = 1))
F1 = 2*(precision*recall)/(precision+recall)

print('Accuracy:', acc)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', F1)

SVM

In [14]:
cols = ["c","kernel", "gamma","acc","pre","rec","f1"]
svm_val_result = []
c_values= [0.2,0.4,0.6, 1,1.5, 2]
kernels = ["rbf","poly","sigmoid"] 
gamma = ["scale","auto"]
for (c, kernel, gamma) in tqdm_notebook(list(itertools.product(c_values, kernels, gamma))):
    svm_clf = SVC(C=c, kernel=kernel, gamma=gamma)
    acc, precision, recall, F1 = classifier_chain(svm_clf,x_val, y_val)
    svm_val_result = svm_val_result.append([c,kernel, gamma, acc, precision, recall, F1])

HBox(children=(IntProgress(value=0, max=36), HTML(value='')))

KeyboardInterrupt: 

RandomForest

In [31]:
max_depth = [int(x) for x in np.linspace(5, 50, num = 3)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
rf_val_result = []

for (depth, split, leaf) in tqdm_notebook(list(itertools.product(max_depth, min_samples_split,min_samples_leaf))):
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=depth, min_samples_split= split, min_samples_leaf=leaf,n_jobs=3)
    acc, precision, recall, F1 = classifier_chain(rf_clf,x_val, y_val)
    rf_val_result.append([depth, split, leaf, acc, precision, recall, F1])

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))

In [33]:
rf_val_df = pd.DataFrame(rf_val_result)
rf_val_df

Unnamed: 0,0,1,2,3,4,5,6
0,5,2,1,0.489521,0.85798,0.489521,0.623375
1,5,2,2,0.486993,0.852555,0.486993,0.619893
2,5,2,4,0.486327,0.852941,0.486327,0.619455
3,5,5,1,0.487658,0.855882,0.487658,0.621311
4,5,5,2,0.483333,0.854815,0.483333,0.617511
5,5,5,4,0.486327,0.854626,0.486327,0.619899
6,5,10,1,0.488323,0.852555,0.488323,0.62097
7,5,10,2,0.487658,0.858407,0.487658,0.621975
8,5,10,4,0.485329,0.852725,0.485329,0.618588
9,27,2,1,0.591248,0.821865,0.59621,0.691083


KNN

In [10]:
k_s = list(range(1,10))
knn_val_result = []
for k in tqdm_notebook(k_s):
    rf_clf = KNeighborsClassifier(n_neighbors=k)
    acc, precision, recall, F1 = classifier_chain(rf_clf,x_val, y_val)
    knn_val_result.append([k, acc, precision, recall, F1])

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




KeyboardInterrupt: 

In [None]:
knn_val_df = pd.DataFrame(knn_val_result)
knn_val_df

Adaboost

In [None]:
n_estimators = [50, 100]
learning_rates = [0.01,0.05,0.1,0.3,1]
ada_val_result = []
params_zips = list(itertools.product(n_estimators, learning_rates))
# select 5 parameters
selected_params = random.sample(params_zips,k=5)

for (n_estimator, learning_rate) in tqdm_notebook(selected_params):
    rf_clf = AdaBoostClassifier(n_estimators=n_estimator, learning_rate=learning_rate)
    acc, precision, recall, F1 = classifier_chain(rf_clf,x_val, y_val)
    ada_val_result.append([n_estimator, learning_rate, acc, precision, recall, F1])

In [None]:
ada_val_df = pd.DataFrame(ada_val_result)
ada_val_df

In [16]:
n_estimators = [50, 100]
learning_rates = [0.01,0.05,0.1,0.3,1]
ada_val_result = []
params_zips = list(itertools.product(n_estimators, learning_rates))
# select 5 parameters
selected_params = random.sample(params_zips,k=5)

for (n_estimator, learning_rate) in tqdm_notebook(selected_params):
    rf_clf = AdaBoostClassifier(n_estimators=n_estimator, learning_rate=learning_rate)
    acc, precision, recall, F1 = classifier_chain(rf_clf,x_val, y_val)
    ada_val_result.append([n_estimator, learning_rate, acc, precision, recall, F1])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

KeyboardInterrupt: 

In [27]:
ada_val_df = pd.DataFrame(ada_val_result)
ada_val_df

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]