# Task 2, Subtask 1

## Import libraries

In [14]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
from sklearn.metrics import f1_score

## Read data

In [2]:
## Load the (already normalized and imputed) feature tables and the training
## labels from CSV.
test_feat_path = "../data/test_features_imp.csv"
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"

# Read the three files in the same order as the path definitions above.
test_feat, train_feat, train_lab = (
    pd.read_csv(csv_path)
    for csv_path in (test_feat_path, train_feat_path, train_lab_path)
)

In [3]:
## Sort every table in place by its `pid` column and rebuild the index, so
## that row i of the feature matrix corresponds to row i of the label matrix.
for frame in (test_feat, train_feat, train_lab):
    frame.sort_values(by=['pid'], inplace=True, ignore_index=True)

In [4]:
## Build the model inputs: feature columns 1..33 (column 0, the pid column,
## is excluded) and the ten lab-test label columns.
# NOTE: X_test must be sliced from test_feat. It was previously sliced from
# train_feat, which made X_test identical to X_train, so the exported
# probabilities were computed on training rows instead of the test set.
X_test = test_feat.iloc[:, 1:34].values
X_train = train_feat.iloc[:, 1:34].values

# Keep only the label columns used in this subtask (this also drops pid).
train_lab = train_lab[["LABEL_BaseExcess", "LABEL_Fibrinogen",
                       "LABEL_AST", "LABEL_Alkalinephos",
                       "LABEL_Bilirubin_total", "LABEL_Lactate",
                       "LABEL_TroponinI", "LABEL_SaO2",
                       "LABEL_Bilirubin_direct", "LABEL_EtCO2"]]
# Label matrix as a plain array, one column per label in the order above.
Y_train = train_lab.values

## Train binary relevance

In [5]:
# Binary Relevance multi-label classifier: the base estimator below is used
# for each label column. The base estimator is an SVM.
base_svm = SVC(
    C=1,                      # regularization term (library default is 1)
    kernel='rbf',             # kernel to use (library default is 'rbf')
    gamma='scale',            # rbf kernel coefficient; 'scale' = 1 / (n_features * X.var())
    probability=True,         # enable probability estimates, needed for predict_proba
    class_weight='balanced',  # class weights = n_samples / (n_classes * np.bincount(y))
    random_state=123,         # seeds the shuffling used for the probability estimates
    cache_size=1000,          # kernel cache in MB (default 200), raised to improve runtime
)

classifier = BinaryRelevance(
    classifier=base_svm,
    require_dense=[False, True],
)

In [6]:
# Train: fit the multi-label classifier on the training feature matrix and
# the matching label matrix (one column per label).
classifier.fit(X_train, Y_train)

BinaryRelevance(classifier=SVC(C=1, cache_size=1000, class_weight='balanced',
                               probability=True, random_state=123),
                require_dense=[False, True])

## Performance of the fit

In [7]:
# In multi-label classification, the mean accuracy returned by score()
# is the subset accuracy. This is a harsh metric, since a sample only
# counts as correct when its entire label set is predicted correctly.
# NOTE(review): this is evaluated on X_train / Y_train, the same data the
# classifier was fitted on, so it is not an estimate of performance on
# unseen data.
classifier.score(X_train, Y_train) 

0.3192419057646749

In [12]:
# Predicted label matrix for the training features (used for the F1 score).
Y_pred = classifier.predict(X_train)

In [26]:
# Micro-averaged F1 across all labels, comparing the training labels with
# the predictions made on the training features.
f1_score(Y_train, Y_pred, average = 'micro')

0.598165656150447

## Write probabilities to output

In [8]:
# Per-label probability estimates for the rows of X_test.
probabilities = classifier.predict_proba(X_test)

In [9]:
# Convert the predict_proba result to a dense array and wrap it in a
# DataFrame so it can be written out as CSV.
probabilities = pd.DataFrame(probabilities.toarray())

In [10]:
# Write the probabilities as CSV without an index column or header row.
# NOTE(review): the output path has no file extension and the file carries
# no pid or label names -- confirm this is the format expected downstream.
probabilities.to_csv('../output/subtask_1_output', index = False, header = False)

## Results Log

|   | C | kernel | gamma | weight | features | F1 score |
|---|---|---|---|---|---|---|
| run_1 |  1 |  rbf | scale  |  balanced |  median for NA's and mean  | 0.598165656150447 |
