# Task 2, Subtask 1

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LassoCV
import sklearn.metrics as metrics
import time

## Start timer

In [2]:
## Record the wall-clock start time; the runtime cell at the end subtracts it from time.time()
start_time = time.time()

## Read data

In [3]:
## We read the already normalized and imputed data. For specifics about the imputation and normalization
## see imputate.R file.
test_feat_path = "../data/test_features_imp.csv"
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"

## Order data by pid to make sure that rows in X and Y match. The sorted frames are re-assigned
## (instead of sorting in place) so that re-running this cell always starts from the files on disk.
test_feat = pd.read_csv(test_feat_path).sort_values(by=['pid'], ignore_index = True)
train_feat = pd.read_csv(train_feat_path).sort_values(by=['pid'], ignore_index = True)
train_lab = pd.read_csv(train_lab_path).sort_values(by=['pid'], ignore_index = True)

## Sorting only aligns X and Y when both tables contain exactly the same pids, so fail loudly
## instead of silently training on mismatched rows.
if not np.array_equal(train_feat['pid'].to_numpy(), train_lab['pid'].to_numpy()):
    raise ValueError("pid mismatch between train features and train labels: rows of X and Y would not match")

## Exclude the pid column (column 0) and make into array. The column names are taken once from
## the training frame and applied to both frames, so train and test features are guaranteed to
## be the same columns in the same order (a missing test column raises a KeyError).
## NOTE(review): the upper bound 272 is hard-coded; if the imputed files ever hold more than
## 272 columns the trailing features are silently ignored - confirm this is intended.
feature_cols = train_feat.columns[1:272]
X_test = test_feat[feature_cols].to_numpy()
X_train = train_feat[feature_cols].to_numpy()

## Subtask 1
### Histogram-based Gradient Boosting Classification Tree

In [4]:
## Binary targets of subtask 1; one classifier is fitted per label
labels_subtask_1 = [
    'LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
    'LABEL_Bilirubin_direct', 'LABEL_EtCO2',
]

## Label matrix with one column per entry of labels_subtask_1
Y_train = train_lab[labels_subtask_1].to_numpy()

## Results frame: starts with the test pids and gains one column per predicted label
output = pd.DataFrame({'pid': test_feat.iloc[:, 0].values})

## Fit one HGBC per label, report its scores and store the predicted probabilities for X_test
print("ROC AUC validation and training score (training score on probability estimates), for each label:")
for i, label in enumerate(labels_subtask_1):
    y_label = Y_train[:, i]

    ## Fit model
    clf = HistGradientBoostingClassifier(scoring = 'roc_auc', random_state = 123)
    clf.fit(X_train, y_label)

    ## Last recorded validation score, followed by the training ROC AUC. The training score is
    ## computed on the positive-class probability estimates, not on the hard labels.
    last_validation_score = clf.validation_score_[-1]
    train_proba = clf.predict_proba(X_train)[:, 1]
    train_auc = metrics.roc_auc_score(y_label, train_proba, average='micro')
    print(last_validation_score, " ", train_auc)

    ## Write the positive-class probabilities for the test set to the results df
    output[label] = clf.predict_proba(X_test)[:, 1]

ROC AUC validation and training score (t.s on probabilities), for each label:
0.9373635209479475   0.9753031880702936
0.7549797077922078   0.9026598465473147
0.7412113646304126   0.852573851830949
0.7613488278569883   0.8731191490363147
0.73456405403889   0.8339599224844418
0.8192191828254848   0.8619365002020938
0.8664142813173283   0.9300488203798855
0.8372654935154936   0.9118233959642607
0.7361366421568627   0.9384280088989346
0.9490929577464788   0.9949929011335982


## Subtask 2
### Histogram-based Gradient Boosting Classification Tree

In [5]:
## Single binary target of this subtask
Y_train = train_lab['LABEL_Sepsis'].to_numpy()

## Fit a HGBC, report its scores and store the predicted probabilities for X_test
print("ROC AUC validation and training score (training score on probability estimates), for each label:")

## Fit model
clf = HistGradientBoostingClassifier(scoring = 'roc_auc', random_state = 123)
clf.fit(X_train, Y_train)

## Last recorded validation score, followed by the training ROC AUC. The training score is
## computed on the positive-class probability estimates, not on the hard labels.
sepsis_validation_score = clf.validation_score_[-1]
sepsis_train_proba = clf.predict_proba(X_train)[:, 1]
sepsis_train_auc = metrics.roc_auc_score(Y_train, sepsis_train_proba, average='micro')
print(sepsis_validation_score, " ", sepsis_train_auc)

## Write the positive-class probabilities for the test set to the results df
output['LABEL_Sepsis'] = clf.predict_proba(X_test)[:, 1]

ROC AUC validation and training score (t.s on probabilities), for each label:
0.7436315112770785   0.952782749680539


## Subtask 3
### Lasso Regression

In [6]:
## Define the labels to predict for this task
labels_subtask_3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

## Write to an array the labels of interest
Y_train = train_lab[labels_subtask_3].to_numpy()

## Fit one Lasso regression per label, using only the features derived from that same measurement
print("Training scores for each label:")
for i, label in enumerate(labels_subtask_3):
    ## Regex matching column names that end with the label's suffix. Splitting only once keeps
    ## everything after the "LABEL_" prefix, so a suffix that itself contains "_" is not truncated.
    sufix = label.split("_", maxsplit = 1)[1] + "$"

    ## Select the matching columns once, from the training frame, and apply the same column list
    ## to both frames: this guarantees identical features in identical order for fit and predict.
    feature_cols = train_feat.filter(regex = sufix, axis = 1).columns
    if len(feature_cols) == 0:
        raise ValueError("No feature column matches the pattern '%s' for label %s" % (sufix, label))
    X_in_loop_train = train_feat[feature_cols].to_numpy()
    X_in_loop_test = test_feat[feature_cols].to_numpy()

    ## Fit model
    reg = LassoCV(random_state = 123,
                  verbose = False,
                  max_iter = 10000).fit(X_in_loop_train, Y_train[:, i])

    ## Print the training score (computed on the data the model was fitted on, so optimistic)
    print(reg.score(X_in_loop_train, Y_train[:, i]))

    ## Write to output
    output[label] = reg.predict(X_in_loop_test)

Training scores for each label:
0.37770345083252754
0.5859785441608802
0.38386074780919743
0.6144724385873669


## Write to output

In [7]:
## Visualize output: display the assembled predictions (pid plus one column per predicted label)
output

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.914906,0.513746,0.842769,0.819782,0.790402,0.491066,0.075179,0.297925,0.040125,0.002910,0.128961,15.092701,81.921415,98.225746,84.676908
1,3,0.010101,0.060852,0.246728,0.205171,0.136316,0.055908,0.042676,0.059464,0.021740,0.012837,0.019549,17.708045,85.290360,96.737910,95.107560
2,5,0.033072,0.046898,0.128559,0.179086,0.133181,0.068252,0.067896,0.058745,0.022675,0.022711,0.030466,19.016770,72.983975,95.905009,71.068876
3,7,0.844867,0.815086,0.897217,0.960052,0.910483,0.468831,0.061377,0.496812,0.340772,0.011140,0.075632,17.501360,86.498119,98.069292,94.273838
4,9,0.086911,0.073689,0.252036,0.277729,0.287920,0.096186,0.073415,0.069117,0.014736,0.000683,0.031630,20.007447,87.643147,95.843105,91.415635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,31647,0.026496,0.062178,0.141386,0.170821,0.126407,0.065149,0.067896,0.050096,0.015539,0.001867,0.015392,17.229005,69.264997,96.635856,74.800623
12660,31649,0.798826,0.070331,0.405186,0.380808,0.375513,0.684463,0.042917,0.241261,0.022399,0.005445,0.043661,16.076435,82.614546,97.025402,89.095196
12661,31651,0.599231,0.045610,0.213162,0.183221,0.167425,0.132979,0.069538,0.302960,0.013578,0.001275,0.063752,17.701808,79.185616,98.225746,82.267047
12662,31652,0.010771,0.045973,0.231386,0.202682,0.264693,0.062295,0.105402,0.031817,0.016451,0.049580,0.024161,19.294190,94.665188,97.395372,116.851730


In [10]:
## Write results to .zip, rounding every float to three decimals and omitting the index.
## The archive member is named explicitly so the file inside the zip is a .csv; with a bare
## compression='zip' some pandas versions name the member after the .zip path itself.
output.to_csv('../output/submission_HGBC_Lasso.zip',
              index=False,
              float_format='%.3f',
              compression={'method': 'zip', 'archive_name': 'submission_HGBC_Lasso.csv'})

## Runtime

In [9]:
## Wall-clock time elapsed since the timer cell was run, reported in minutes
elapsed_minutes = (time.time() - start_time) / 60
print("--- %s minutes ---" % elapsed_minutes)

--- 0.6583995858828227 minutes ---


## Results Log

### Subtask 1. Binary Relevance and HGBC

|   | C | kernel | gamma | weight | features | n_features | F1 score | AUC | runtime (min) |
|---|---|---|---|---|---|---|---|---|---|
| run_1 |  1 |  rbf | scale  |  balanced |  median for NA's and mean  | 35 | 0.598165656150447 | ? | 33 |
| run_2 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd  | 170 | 0.628216870267411 |?| 102 |
| run_3 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.649372121402984 | 0.8236937992110356 | 141 |
| run_4 |  HGBC |  HGBC | HGBC |  HGBC |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.871097657278231* | 0.8222653647930391 | 0.5 |

*I think the reason for this high score is that f1_micro is more severe when all labels are taken into account at once than when they are scored one by one and then averaged. Hence I don't believe the HGBC is superior in terms of performance; otherwise we would also have observed a big increase in the AUC.

### Subtask 3. Lasso

Training scores for normalized and unnormalized imputed data restricted to the labels:

|normalized|unnormalized|
|---|---|
|0.37770345083252754 | 0.37759566055685045|
|0.5859785441608802  | 0.5856645886174903 |
|0.38386074780919743 | 0.3842306307116389 |
|0.6144724385873669  | 0.6142282361433877 | 

The submission scores were only slightly different for normalized and unnormalized data: 0.754641671097 and 0.754664968318, respectively. We therefore decided to use the normalized data, because this way we don't need two imputation scripts.