In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

from tqdm import tqdm_notebook
import random

In [2]:
# Read the feature (x_*) and label (y_*) tables for each of the three splits.
x_train, y_train = pd.read_csv('data/x_train.csv'), pd.read_csv('data/y_train.csv')
x_val, y_val = pd.read_csv('data/x_val.csv'), pd.read_csv('data/y_val.csv')
x_test, y_test = pd.read_csv('data/x_test.csv'), pd.read_csv('data/y_test.csv')

In [8]:
def _drop_utterance(features):
    """Return `features` without its 'utterance' column, if it has one."""
    columns = getattr(features, 'columns', None)
    if columns is not None and 'utterance' in columns:
        return features.drop('utterance', axis=1)
    return features


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, yielding 0 where the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(numerator, denominator,
                     out=np.zeros_like(numerator), where=denominator != 0)


def classifier_chain(model, x_train, y_train, x_test, y_test,num_chain = 10):
    """Fit an ensemble of randomly ordered classifier chains and score it.

    Parameters
    ----------
    model : estimator
        Base estimator handed to every ``ClassifierChain``.
    x_train, y_train : training features and label matrix.
    x_test, y_test : evaluation features and label matrix.
        The labels are treated as a binary indicator matrix (non-zero means
        the label is present) -- NOTE(review): confirm y_* only hold 0/1.
    num_chain : int, default 10
        Number of chains; chain ``i`` uses ``random_state=i``.

    Returns
    -------
    (acc, precision, recall, F1)
        Example-based metrics averaged over the evaluation rows:
        acc is |pred AND true| / |pred OR true|, precision is
        |pred AND true| / |pred|, recall is |pred AND true| / |true|, and
        F1 is the harmonic mean of the averaged precision and recall.
        A per-row ratio with a zero denominator counts as 0 instead of
        turning the whole average into NaN; F1 is 0 when precision and
        recall are both 0.
    """
    # Each frame is checked on its own: the column may be present in only one
    # of them, and dropping a missing column would raise KeyError.
    x_train = _drop_utterance(x_train)
    x_test = _drop_utterance(x_test)

    chains = [ClassifierChain(model, order='random', random_state=i)
              for i in range(num_chain)]
    for chain in chains:
        chain.fit(x_train, y_train)

    # Average the chains' predictions; a label is predicted when strictly more
    # than half of the ensemble mass votes for it.
    y_pred = np.array([chain.predict(x_test) for chain in chains]).mean(axis=0)
    predicted = y_pred > 0.5
    actual = np.asarray(y_test).astype(bool)

    hits = np.logical_and(predicted, actual).sum(axis=1)
    union = np.logical_or(predicted, actual).sum(axis=1)

    acc = np.mean(_safe_ratio(hits, union))
    precision = np.mean(_safe_ratio(hits, predicted.sum(axis=1)))
    recall = np.mean(_safe_ratio(hits, actual.sum(axis=1)))

    pr_sum = precision + recall
    F1 = 2*(precision*recall)/pr_sum if pr_sum > 0 else 0.0
    return acc, precision, recall, F1


GaussianNB: no hyperparameter tuning is needed, so the default settings are used.

In [9]:
# Gaussian naive Bayes with default settings, trained on the training split
# and scored on the test split.
NB = GaussianNB()
acc, precision, recall, F1 = classifier_chain(NB, x_train, y_train, x_test, y_test)

for label, value in (('Accuracy:', acc),
                     ('Precision:', precision),
                     ('Recall:', recall),
                     ('F1 score:', F1)):
    print(label, value)

Accuracy: 0.3782589262589262
Precision: 0.490134987049213
Recall: 0.6117106227106228
F1 score: 0.5442155878907348


In [None]:
# clf2 = MultiOutputClassifier(GaussianNB()).fit(x_train, y_train)

# y_pred = clf2.predict(x_test)
# acc = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(np.logical_or((y_pred>0.5), y_test), axis = 1))
# precision = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum((y_pred>0.5), axis = 1))
# recall = np.mean(np.sum(np.logical_and((y_pred>0.5), y_test), axis = 1)/np.sum(y_test, axis = 1))
# F1 = 2*(precision*recall)/(precision+recall)

# print('Accuracy:', acc)
# print('Precision:', precision)
# print('Recall:', recall)
# print('F1 score:', F1)

SVM

In [None]:
# Random search over SVC hyperparameters: 5 configurations are sampled from the
# full grid, each is trained on the training split and scored on the
# validation split.
cols = ["c", "kernel", "gamma", "acc", "pre", "rec", "f1"]
svm_val_result = []
c_values = [0.2, 0.4, 0.6, 1, 1.5, 2]
kernels = ["rbf", "poly", "sigmoid"]
gammas = ["scale", "auto"]
param_zips = list(itertools.product(c_values, kernels, gammas))
# random.sample (numpy has no `np.sample`) draws without replacement.
# No seed is set in this cell, so the sampled configurations vary between runs.
selected_params = random.sample(param_zips, k=5)

for (c, kernel, gamma) in tqdm_notebook(selected_params):
    svm_clf = SVC(C=c, kernel=kernel, gamma=gamma)
    acc, precision, recall, F1 = classifier_chain(svm_clf, x_train, y_train, x_val, y_val)
    # list.append returns None, so its result must not be assigned back.
    svm_val_result.append([c, kernel, gamma, acc, precision, recall, F1])

In [None]:
# Tabulate the SVM validation results; `cols` is the column-name list defined
# in the SVM search cell.
svm_val_df = pd.DataFrame(svm_val_result, columns=cols)
svm_val_df

Random Forest

In [None]:
# Candidate values for each random-forest hyperparameter.
max_depth = list(map(int, np.linspace(5, 50, num=3)))
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Build the full grid, then keep a single randomly drawn configuration.
param_zips = list(itertools.product(max_depth, min_samples_split, min_samples_leaf))
selected_params = random.sample(param_zips, k=1)
rf_val_result = []

for depth, split, leaf in tqdm_notebook(selected_params):
    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=depth,
        min_samples_split=split,
        min_samples_leaf=leaf,
        n_jobs=3,
    )
    # Train on the training split, score on the validation split.
    scores = classifier_chain(rf_clf, x_train, y_train, x_val, y_val)
    acc, precision, recall, F1 = scores
    rf_val_result.append([depth, split, leaf, *scores])

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

In [None]:
# One row per evaluated configuration: depth, split, leaf, then the four metrics.
rf_val_df = pd.DataFrame(data=rf_val_result)
rf_val_df

KNN

In [None]:
# Sweep odd neighbour counts 1, 3, 5, 7, 9; each model is trained on the
# training split and scored on the validation split.
k_s = list(range(1, 11, 2))
knn_val_result = []
for k in tqdm_notebook(k_s):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    # classifier_chain needs both the training and the evaluation split.
    acc, precision, recall, F1 = classifier_chain(knn_clf, x_train, y_train, x_val, y_val)
    knn_val_result.append([k, acc, precision, recall, F1])

In [None]:
# One row per k: k followed by the four metrics.
knn_val_df = pd.DataFrame(data=knn_val_result)
knn_val_df

AdaBoost

In [None]:
# Random search over AdaBoost hyperparameters: 5 of the 10 grid points are
# sampled, each is trained on the training split and scored on the
# validation split.
n_estimators = [50, 100]
learning_rates = [0.01, 0.05, 0.1, 0.3, 1]
ada_val_result = []
params_zips = list(itertools.product(n_estimators, learning_rates))
# select 5 parameters
selected_params = random.sample(params_zips, k=5)

for (n_estimator, learning_rate) in tqdm_notebook(selected_params):
    ada_clf = AdaBoostClassifier(n_estimators=n_estimator, learning_rate=learning_rate)
    # classifier_chain needs both the training and the evaluation split.
    acc, precision, recall, F1 = classifier_chain(ada_clf, x_train, y_train, x_val, y_val)
    ada_val_result.append([n_estimator, learning_rate, acc, precision, recall, F1])

In [None]:
# One row per configuration: n_estimators, learning rate, then the four metrics.
ada_val_df = pd.DataFrame(data=ada_val_result)
ada_val_df