# CV for multilabel classifer
## IML task 2, subtask 1

### Import libreries

In [1]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import time

In [2]:
## Start timer
start_time = time.time()

### Read data

In [3]:
## We read the already normalized and imputed data 
train_feat_path = "../data/train_features_imp.csv" 
train_lab_path = "../data/train_labels.csv"
train_feat = pd.read_csv(train_feat_path)
train_lab = pd.read_csv(train_lab_path)

## Order data to make sure that rows in X and Y match
train_feat.sort_values(by=['pid'], inplace = True, ignore_index = True)
train_lab.sort_values(by=['pid'], inplace = True, ignore_index = True)

## Join data 
train_Y_X = pd.merge(train_lab, train_feat,
                     on='pid', 
                     how='left')

In [4]:
## Sample rows
train_Y_X = train_Y_X.sample(frac=1, random_state=123).reset_index(drop=True) 

In [5]:
## Select relevant label columns and exclude the pid column, and sample rows
Y_train = train_Y_X.iloc[:,1:10].values 
X_train = train_Y_X.iloc[:, 11:287].values

### Define clasifiers for CV search

// Comented out for now
parameters = {'classifier': [SVC(class_weight = 'balanced')],
              'classifier__kernel':('linear', 'rbf'), 
              'classifier__C':[1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}

clf = GridSearchCV(BinaryRelevance(require_dense = [False, True]), 
                   parameters, 
                   scoring='f1_micro',
                   refit = True, 
                   cv = 5)
clf.fit(X_train, Y_train)                   

### Check out HGBC classification

In [6]:
parameters = {'classifier': [HistGradientBoostingClassifier()]}

clf_2 = GridSearchCV(BinaryRelevance(), 
                     parameters, 
                     scoring='f1_micro',
                     refit = True, 
                     cv = 5)
clf_2.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=BinaryRelevance(require_dense=[True, True]),
             param_grid={'classifier': [HistGradientBoostingClassifier()]},
             scoring='f1_micro')

In [7]:
print("--- %s minutes ---" % ((time.time() - start_time)/60))

--- 3.1024815519650777 minutes ---


In [10]:
clf_2.cv_results_

{'mean_fit_time': array([29.86903682]),
 'std_fit_time': array([2.3831427]),
 'mean_score_time': array([0.06750159]),
 'std_score_time': array([0.01126375]),
 'param_classifier': masked_array(data=[HistGradientBoostingClassifier()],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier': HistGradientBoostingClassifier()}],
 'split0_test_score': array([0.52329749]),
 'split1_test_score': array([0.51466138]),
 'split2_test_score': array([0.52488414]),
 'split3_test_score': array([0.51771735]),
 'split4_test_score': array([0.51457649]),
 'mean_test_score': array([0.51902737]),
 'std_test_score': array([0.00431565]),
 'rank_test_score': array([1], dtype=int32)}

In [8]:
start_time = time.time()
probs = clf_2.predict_proba(X_train)
print("--- %s minutes ---" % ((time.time() - start_time)/60))

--- 0.005346028010050455 minutes ---


In [9]:
pd.DataFrame(probs.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.020026,0.026184,0.193926,0.228685,0.226696,0.104676,0.065354,0.143798,0.010756
1,0.753678,0.036173,0.047086,0.036587,0.034599,0.168151,0.006931,0.902905,0.005964
2,0.205001,0.031768,0.203492,0.183848,0.176394,0.084961,0.016710,0.056815,0.011960
3,0.583477,0.015332,0.190372,0.178453,0.178158,0.291343,0.008091,0.108516,0.013026
4,0.007158,0.031569,0.298229,0.429707,0.555056,0.065348,0.385772,0.044772,0.011028
...,...,...,...,...,...,...,...,...,...
18990,0.069637,0.017585,0.163667,0.130233,0.146484,0.047072,0.016720,0.035916,0.010320
18991,0.292982,0.070242,0.470243,0.430541,0.392089,0.133853,0.008342,0.297786,0.038633
18992,0.761684,0.017030,0.063067,0.066066,0.025187,0.103107,0.015392,0.611873,0.006373
18993,0.252504,0.042099,0.701177,0.704101,0.877943,0.439551,0.007122,0.062641,0.022764
