# CV for multilabel classifier
## IML task 2, subtask 1

### Import libraries

In [1]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import time

In [2]:
## Start timer: wall-clock reference (seconds since the epoch) that the
## elapsed-time cell further down subtracts from the current time
start_time = time.time()

### Read data

In [3]:
## Load the pre-processed (already normalized and imputed) features and the labels
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"

## Sort both tables by pid and renumber the index, so that row i of the
## features and row i of the labels are meant to describe the same pid
train_feat = pd.read_csv(train_feat_path).sort_values(by=['pid'], ignore_index=True)
train_lab = pd.read_csv(train_lab_path).sort_values(by=['pid'], ignore_index=True)

## Label columns predicted in subtask 1
subtask1_labels_ids = [
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
]

## Features: positional columns 1..271 of the feature table; column 0 is
## skipped (presumably the pid column -- TODO confirm against the CSV header)
X_train = train_feat.iloc[:, 1:272].values

## Targets: one column per subtask-1 label, in the order listed above
Y_train = train_lab[subtask1_labels_ids].to_numpy()

### Define classifiers for CV search

In [None]:
## Hyperparameter grid for the BinaryRelevance wrapper: a class-balanced SVC
## as base classifier, 2 kernels x 8 values of C = 16 candidates
parameters = dict(
    classifier=[SVC(class_weight='balanced')],
    classifier__kernel=('linear', 'rbf'),
    classifier__C=[1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
)

## 5-fold grid search scored with micro-averaged F1; refit=True retrains the
## best candidate on the full training set once the search is done
clf = GridSearchCV(
    estimator=BinaryRelevance(require_dense=[False, True]),
    param_grid=parameters,
    scoring='f1_micro',
    refit=True,
    cv=5,
)
clf.fit(X_train, Y_train)

### Check out HGBC classification

In [4]:
## 5-fold cross-validation of a default HistGradientBoostingClassifier,
## run separately for every subtask-1 label (column i of Y_train).
## NOTE: Y_train[:, i] is a single 1-D target, so the micro-averaged F1
## requested via scoring='f1_micro' coincides with plain accuracy here.
for i, label in enumerate(subtask1_labels_ids):
    scores = cross_val_score(HistGradientBoostingClassifier(),
                             X_train,
                             Y_train[:, i],
                             cv=5,
                             scoring='f1_micro',
                             verbose=True)

    # Name the label in the report: the ten result lines are otherwise
    # indistinguishable and get interleaved with the verbose joblib log.
    print("{label}: cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(label=label, score=scores.mean(), err=scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   19.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.877, standard deviation is 0.004


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.938, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.791, standard deviation is 0.003


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.794, standard deviation is 0.004


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.790, standard deviation is 0.003


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.839, standard deviation is 0.003


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.927, standard deviation is 0.003


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.836, standard deviation is 0.005


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.968, standard deviation is 0.001
Cross-validation score is 0.964, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.2s finished


In [5]:
## Report the wall-clock time elapsed since start_time was set, in minutes
elapsed_minutes = (time.time() - start_time) / 60
print("--- %s minutes ---" % elapsed_minutes)

--- 2.4161749323209127 minutes ---


In [30]:
## Display the per-fold scores currently bound to `scores`. The only assignment
## visible in this notebook is inside the cross-validation loop, so this is
## presumably the result for the last entry of subtask1_labels_ids.
## NOTE(review): the execution count of this cell is out of sequence -- confirm
## on a fresh Restart & Run All.
scores

array([0.96104238, 0.96762306, 0.96393788, 0.96341142, 0.96630692])

In [33]:
## Training-set score per label: fit a default HistGradientBoostingClassifier
## on all training rows and score its predictions on those same rows.
## These are in-sample numbers, so they say how well the model fits the
## training data, not how well it generalizes.
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(HistGradientBoostingClassifier())

for i, label in enumerate(subtask1_labels_ids):
    # The same pipeline object is refit from scratch for every label
    pipeline = pipeline.fit(X_train, Y_train[:, i])
    train_score = f1_score(Y_train[:, i], pipeline.predict(X_train), average='micro')
    # Name the label so each printed score can be attributed to its target
    print("Training score for {}:".format(label), train_score)

Training score: 0.9344037904711766
Training score: 0.9548828639115556
Training score: 0.8572255856804423
Training score: 0.8339036588575941
Training score: 0.8334824953935246
Training score: 0.8744406422742828
Training score: 0.9462490128981311
Training score: 0.8846012108449592
Training score: 0.9760989734140564
Training score: 0.9879441958410108


In [32]:
## Arithmetic mean of ten hand-entered values (sum divided by 10).
## NOTE(review): these numbers do not appear in any output visible in this
## notebook -- presumably per-label scores pasted from an earlier run; confirm
## their origin or compute the mean from the scores directly.
(0.9053385869780678+
0.7288187593878132+
0.6549839172655249+
0.673966505511219+
0.6635499774355397+
0.7195839524573667+
0.783277708342977+
0.7655774629568377+
0.6596926699545205+
0.8797387169467601)/10

0.7434528257236628

In [34]:
## Arithmetic mean of the ten per-label training scores (sum divided by 10).
## The values are typed in by hand and match the "Training score:" lines
## printed by the training loop; they go stale if that loop is re-run.
(0.9344037904711766+
0.9548828639115556+
0.8572255856804423+
0.8339036588575941+
0.8334824953935246+
0.8744406422742828+
0.9462490128981311+
0.8846012108449592+
0.9760989734140564+
0.9879441958410108)/10

0.9083232429586735