# Task 2, Subtask 1

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import sklearn.metrics as metrics
import time

## Start timer

In [2]:
## Wall-clock reference point taken before any data is loaded; the Runtime
## cell at the end of the notebook reports the minutes elapsed since this value.
start_time = time.time()

## Read data

In [3]:
## Read the already normalized and imputed data.
test_feat_path = "../data/test_features_imp.csv"
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"

## Sort every table by pid right after loading (a new frame is returned
## instead of mutating in place) so that row k of the training features and
## row k of the training labels are meant to describe the same patient.
test_feat = pd.read_csv(test_feat_path).sort_values(by=['pid'], ignore_index=True)
train_feat = pd.read_csv(train_feat_path).sort_values(by=['pid'], ignore_index=True)
train_lab = pd.read_csv(train_lab_path).sort_values(by=['pid'], ignore_index=True)

## Sorting only aligns X and Y when both tables contain exactly the same pids,
## so check it explicitly rather than silently training on mismatched rows.
assert np.array_equal(train_feat['pid'].to_numpy(), train_lab['pid'].to_numpy()), \
    "train features and train labels do not contain the same pids in the same order"

## Names of the label columns modelled in Subtask 1 (one classifier per label).
labels_ids = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
              'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
              'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
              'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

## Select the feature columns by position: column 0 (the pid column) is
## skipped and the slice stops before position 272.
## NOTE(review): the stop index is hard-coded - confirm that it covers every
## feature column of the *_imp.csv files (pid + 272 features would need 1:273).
X_test = test_feat.iloc[:, 1:272].values
X_train = train_feat.iloc[:, 1:272].values

## Label matrix: one column per entry of labels_ids, in that order.
Y_train = train_lab[labels_ids].to_numpy()

## Histogram-based Gradient Boosting Classification Tree
### Subtask 1

In [4]:
## Start the predictions table with the patient ids taken from the first
## column of the test features.
output = pd.DataFrame({'pid': test_feat.iloc[:, 0].values})
print("ROC AUC scores for each label and training score on probabilities:")

## Fit one independent binary classifier per label column of Y_train.
for col_idx, label in enumerate(labels_ids):
    y_col = Y_train[:, col_idx]

    clf = HistGradientBoostingClassifier(scoring = 'roc_auc', random_state = 123)
    clf.fit(X_train, y_col)

    ## Last entry of the score history recorded during fitting, followed by
    ## the ROC AUC of column 1 of predict_proba on the full training matrix.
    last_val_score = clf.validation_score_[-1]
    train_auc = metrics.roc_auc_score(y_col, clf.predict_proba(X_train)[:, 1], average='micro')
    print(last_val_score, " ", train_auc)

    ## Store column 1 of the predicted probabilities for the test set.
    output[label] = clf.predict_proba(X_test)[:, 1]

ROC AUC scores for each label and training score on probabilities:
0.9373635209479475   0.9753031880702936
0.7549797077922078   0.9026598465473147
0.7412113646304126   0.852573851830949
0.7613488278569883   0.8731191490363147
0.73456405403889   0.8339599224844418
0.8192191828254848   0.8619365002020938
0.8664142813173283   0.9300488203798855
0.8372654935154936   0.9118233959642607
0.7361366421568627   0.9384280088989346
0.9490929577464788   0.9949929011335982


### Subtask 2

In [5]:
## Keep the sepsis target under its own name: rebinding Y_train to a 1-D
## vector would make the Subtask 1 cell (which indexes Y_train[:, i]) fail
## if it were re-run after this cell.
Y_train_sepsis = train_lab['LABEL_Sepsis'].to_numpy()

print("ROC AUC scores for each label and training score on probabilities:")

clf = HistGradientBoostingClassifier(scoring = 'roc_auc', random_state = 123).fit(X_train, Y_train_sepsis)

## First number: last entry of the score history recorded during fitting
## (scored with 'roc_auc' as configured above). Second number: ROC AUC of
## column 1 of predict_proba evaluated on the full training matrix.
print(clf.validation_score_[-1], " ", metrics.roc_auc_score(Y_train_sepsis, clf.predict_proba(X_train)[:, 1], average='micro'))

## Append the sepsis probabilities for the test set to the predictions table.
output['LABEL_Sepsis'] = clf.predict_proba(X_test)[:, 1]

ROC AUC scores for each label and training score on probabilities:
0.7436315112770785   0.952782749680539


## Write probabilities to output

In [6]:
## Show the assembled predictions table: pid plus one probability column per label.
output

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis
0,0,0.914906,0.513746,0.842769,0.819782,0.790402,0.491066,0.075179,0.297925,0.040125,0.002910,0.128961
1,3,0.010101,0.060852,0.246728,0.205171,0.136316,0.055908,0.042676,0.059464,0.021740,0.012837,0.019549
2,5,0.033072,0.046898,0.128559,0.179086,0.133181,0.068252,0.067896,0.058745,0.022675,0.022711,0.030466
3,7,0.844867,0.815086,0.897217,0.960052,0.910483,0.468831,0.061377,0.496812,0.340772,0.011140,0.075632
4,9,0.086911,0.073689,0.252036,0.277729,0.287920,0.096186,0.073415,0.069117,0.014736,0.000683,0.031630
...,...,...,...,...,...,...,...,...,...,...,...,...
12659,31647,0.026496,0.062178,0.141386,0.170821,0.126407,0.065149,0.067896,0.050096,0.015539,0.001867,0.015392
12660,31649,0.798826,0.070331,0.405186,0.380808,0.375513,0.684463,0.042917,0.241261,0.022399,0.005445,0.043661
12661,31651,0.599231,0.045610,0.213162,0.183221,0.167425,0.132979,0.069538,0.302960,0.013578,0.001275,0.063752
12662,31652,0.010771,0.045973,0.231386,0.202682,0.264693,0.062295,0.105402,0.031817,0.016451,0.049580,0.024161


In [7]:
## Write the predictions table to disk with a header row and without the
## DataFrame index.
output_path = '../output/subtask_1_output'
output.to_csv(output_path, index = False, header = True)

## Runtime

In [8]:
## Minutes elapsed since start_time was recorded at the top of the notebook.
elapsed_minutes = (time.time() - start_time) / 60
print("--- %s minutes ---" % elapsed_minutes)

--- 0.5912911335627238 minutes ---


## Results Log

|   | C | kernel | gamma | weight | features | n_features | F1 score | AUC | runtime (min) |
|---|---|---|---|---|---|---|---|---|---|
| run_1 |  1 |  rbf | scale  |  balanced |  median for NA's and mean  | 35 | 0.598165656150447 | ? | 33 |
| run_2 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd  | 170 | 0.628216870267411 |?| 102 |
| run_3 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.649372121402984 | 0.8236937992110356 | 141 |
| run_4 |  HGBC |  HGBC | HGBC |  HGBC |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.871097657278231 | 0.8222653647930391 | 0.5 |