# Task 2, Subtask 1

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import time

## Start timer

In [2]:
# Reference timestamp (seconds since the epoch); the Runtime section
# subtracts it from the current time to report the total elapsed time.
start_time = time.time()

## Read data

In [3]:
## Load the feature tables (already normalized and imputed) and the training labels
test_feat_path = "../data/test_features_imp.csv"
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"

# Read the three CSV files in the same order as their paths are listed.
test_feat, train_feat, train_lab = (
    pd.read_csv(csv_path)
    for csv_path in (test_feat_path, train_feat_path, train_lab_path)
)

In [4]:
## Sort every table by pid so that rows of X and Y line up positionally
# Each frame is sorted in place and its index is reset to 0..n-1.
for frame in (test_feat, train_feat, train_lab):
    frame.sort_values(by=['pid'], inplace=True, ignore_index=True)

In [5]:
## Build the feature matrices and the label matrix (the pid column is excluded)
# The slice starts at column 1, skipping column 0 (pid).
# FIX: X_test must be taken from test_feat. It was previously sliced from
# train_feat, so the probabilities written to the output file were
# predictions for the training rows instead of the test rows.
# NOTE(review): the slice 1:170 selects 169 columns, while the results log
# reports 170 features - confirm whether the last column should be included.
X_test = test_feat.iloc[:, 1:170].values
X_train = train_feat.iloc[:, 1:170].values

# Labels predicted in this subtask; every other column of the label table
# (including pid) is dropped.
label_columns = ["LABEL_BaseExcess", "LABEL_Fibrinogen",
                 "LABEL_AST", "LABEL_Alkalinephos",
                 "LABEL_Bilirubin_total", "LABEL_Lactate",
                 "LABEL_TroponinI", "LABEL_SaO2",
                 "LABEL_Bilirubin_direct", "LABEL_EtCO2"]
train_lab = train_lab[label_columns]
Y_train = train_lab.values

In [6]:
## Convert to C-contiguous double arrays so the data doesn't have to be copied later
X_test, X_train, Y_train = (
    np.ascontiguousarray(matrix, dtype=np.double)
    for matrix in (X_test, X_train, Y_train)
)

## Train binary relevance

In [7]:
# Set up the Binary Relevance multi-label classifier with an SVM as its
# base classifier. The SVM is configured first, then handed to the wrapper.
base_svm = SVC(
    C=1,                      # Regularization term, by default 1
    kernel='rbf',             # Kernel to be used, by default rbf
    gamma='scale',            # Gamma of the 'rbf' kernel; 'scale' = 1 / (n_features * X.var())
    probability=True,         # Enable probability estimates (predict_proba is called later)
    class_weight='balanced',  # 'balanced' = n_samples / (n_classes * np.bincount(y))
    random_state=123,         # Seeds the data shuffling used for the probability estimates
    cache_size=1000,          # Kernel cache size in MB; default 200, raised to improve runtime
)

classifier = BinaryRelevance(classifier=base_svm, require_dense=[False, True])

In [8]:
# Train the multi-label classifier on the training features and labels.
# With the settings recorded in the results log this is the slow step of the
# notebook (total runtime was about 102 minutes for run_2).
classifier.fit(X_train, Y_train)

BinaryRelevance(classifier=SVC(C=1, cache_size=1000, class_weight='balanced',
                               probability=True, random_state=123),
                require_dense=[False, True])

## Performance of the fit

In [9]:
# Predict labels for the same rows the classifier was trained on; any score
# computed from Y_pred measures fit on seen data, not generalization.
Y_pred = classifier.predict(X_train)

In [10]:
# Micro-averaged F1 between the training labels and the predictions on the
# training data (all label columns pooled together).
f1_score(Y_train, Y_pred, average = 'micro')

0.6282168702674115

In [None]:
# Display the classifier's repr (wrapper and base SVC parameters).
classifier

## Write probabilities to output

In [11]:
# Probability estimates for every row of X_test, one column per label.
probabilities = classifier.predict_proba(X_test)

In [12]:
# Densify the prediction result with .toarray() and wrap it in a DataFrame
# so it can be written out as CSV.
probabilities = pd.DataFrame(probabilities.toarray())

In [13]:
# Write the probabilities as bare CSV values: no index column and no header
# row, so row order is the only link back to the input rows.
probabilities.to_csv('../output/subtask_1_output', index = False, header = False)

## Runtime

In [14]:
# Seconds elapsed since the timer was started at the top of the notebook.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 6159.6947519779205 seconds ---


In [16]:
# Elapsed wall-clock time in minutes, computed from the timer instead of a
# literal pasted from an earlier run's output (which goes stale on re-run).
(time.time() - start_time) / 60

102.66157919963202

## Results Log

|   | C | kernel | gamma | weight | features | n_features | F1 score | runtime (min) |
|---|---|---|---|---|---|---|---|---|
| run_1 |  1 |  rbf | scale  |  balanced |  median for NA's and mean  | 35 | 0.598165656150447 | 33 |
| run_2 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd  | 170 | 0.6282168702674115 | 102 |