# Task 2, Subtask 1

## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
import sklearn.metrics as metrics
import time

## Start timer

In [2]:
# Wall-clock reference point; the "Runtime" cell subtracts it from time.time()
# to report the elapsed minutes.
start_time = time.time()

## Read data

In [3]:
## We read the already normalized and imputed data 
test_feat_path = "../data/test_features_imp.csv" 
train_feat_path = "../data/train_features_imp.csv" 
train_lab_path = "../data/train_labels.csv"
test_feat = pd.read_csv(train_feat_path)
train_feat = pd.read_csv(train_feat_path)
train_lab = pd.read_csv(train_lab_path)

## Order data to make sure that rows in X and Y match
test_feat.sort_values(by=['pid'], inplace = True, ignore_index = True)
train_feat.sort_values(by=['pid'], inplace = True, ignore_index = True)
train_lab.sort_values(by=['pid'], inplace = True, ignore_index = True)

## Get labels
labels_ids = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
              'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 
              'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
              'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

## Select relevant label columns and exclude the pid column, and sample rows
X_test = train_feat.iloc[:, 1:272].values
X_train = train_feat.iloc[:, 1:272].values
Y_train = train_lab[labels_ids].to_numpy()

## Histogram-based Gradient Boosting Classification Tree

In [38]:
output = pd.DataFrame({'pid': test_feat.iloc[:, 0].values})
print("ROC AUC scores for each label and trraining score on probabilities:")
for i, label in enumerate(labels_ids):
    clf = HistGradientBoostingClassifier(scoring = 'roc_auc').fit(X_train, Y_train[:, i])
    print(clf.validation_score_[np.size(clf.validation_score_) - 1], " ", metrics.roc_auc_score(Y_train[:, i], clf.predict_proba(X_test)[:, 1], average='micro'))
    output[label] = clf.predict_proba(X_test)[:, 1]

ROC AUC scores for each label and trraining score on probabilities:
0.9622021063485248
0.9179870295944466
0.8672136760979517
0.8375341926119442
0.8658035412929626
0.8947057383260937
0.9427961702849914
0.8824270572341473
0.9375263368455897
0.9886837965418951


## Write probabilities to output

In [5]:
# Display the per-label probability table built in the training cell
# (pandas truncates long frames when rendering).
output

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2
0,1,0.708554,0.076010,0.038123,0.055323,0.075393,0.249192,0.017614,0.115010,0.006282,0.000859
1,2,0.051212,0.033267,0.140638,0.120264,0.126126,0.131657,0.598975,0.106778,0.021915,0.024606
2,4,0.045770,0.032806,0.111365,0.208309,0.145649,0.083064,0.134275,0.271008,0.032889,0.044032
3,6,0.897079,0.052133,0.085278,0.097104,0.081738,0.308022,0.010966,0.877357,0.016876,0.000575
4,8,0.019660,0.027716,0.182168,0.195130,0.154833,0.038612,0.228829,0.039309,0.017819,0.008643
...,...,...,...,...,...,...,...,...,...,...,...
18990,31653,0.357821,0.093695,0.224009,0.175923,0.130908,0.394871,0.009812,0.134667,0.007570,0.000389
18991,31654,0.014742,0.029490,0.243918,0.201067,0.224094,0.049392,0.075589,0.047048,0.015648,0.028271
18992,31656,0.884902,0.034847,0.099396,0.106368,0.093803,0.346195,0.011927,0.226279,0.008908,0.001246
18993,31657,0.059808,0.030205,0.129815,0.148697,0.136649,0.047161,0.045837,0.059225,0.016379,0.011634


In [6]:
#output.to_csv('../output/subtask_1_output', index = False, header = False)

## Runtime

In [7]:
# Elapsed wall-clock time since start_time was recorded, converted from seconds to minutes.
elapsed_minutes = (time.time() - start_time) / 60
print("--- %s minutes ---" % elapsed_minutes)

--- 0.5424755851427714 minutes ---


## Results Log

|   | C | kernel | gamma | weight | features | n_features | F1 score | AUC | runtime (min) |
|---|---|---|---|---|---|---|---|---|---|
| run_1 |  1 |  rbf | scale  |  balanced |  median for NA's and mean  | 35 | 0.598165656150447 | ? | 33 |
| run_2 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd  | 170 | 0.628216870267411 |?| 102 |
| run_3 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.649372121402984 | 0.8236937992110356 | 141 |
| run_4 |  na |  na | na |  na |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.871097657278231 | 0.8222653647930391 | 0.5 |

### Alternative imputation

In [34]:
import numpy as np
import pandas as pd
# Load the feature tables and the training labels for the alternative approach.
# NOTE(review): these are presumably the raw, not-yet-imputed files (no "_imp" suffix) — confirm.
train_data = pd.read_csv('../data/train_features.csv')
labels = pd.read_csv('../data/train_labels.csv')
test_data = pd.read_csv('../data/test_features.csv')

def calculate_time_features(data, n_samples):
    """Summarise every block of ``n_samples`` consecutive rows into one feature vector.

    Column 0 of ``data`` is used as the id: the first and last row of each block
    must carry the same id. Column 1 is skipped, and every column from index 2
    onwards is reduced over the block with the NaN-ignoring median, mean,
    variance, minimum and maximum, in that order.

    Parameters
    ----------
    data : 2-D numpy array
        Rows are grouped in consecutive blocks of ``n_samples``. Trailing rows
        that do not fill a complete block are ignored.
    n_samples : int
        Number of consecutive rows that make up one block.

    Returns
    -------
    numpy array
        One row per block, laid out as all medians, then all means, variances,
        minima and maxima. An empty 1-D array when there is no complete block.

    Raises
    ------
    AssertionError
        If the first and last row of a block have different ids.
    """
    reducers = (np.nanmedian, np.nanmean, np.nanvar, np.nanmin, np.nanmax)
    n_blocks = data.shape[0] // n_samples
    rows = []
    for start in range(0, n_blocks * n_samples, n_samples):
        stop = start + n_samples
        first_id, last_id = data[start, 0], data[stop - 1, 0]
        assert first_id == last_id, 'Ids are {}, {}'.format(first_id, last_id)
        block = data[start:stop, 2:]
        # One row per statistic, then flattened statistic-major.
        stats = np.array([reduce_fn(block, axis=0) for reduce_fn in reducers],
                         dtype=float)
        rows.append(stats.ravel())
    return np.array(rows)

# Collapse every block of 12 consecutive rows into one feature vector.
# NOTE(review): assumes each patient contributes exactly 12 rows — confirm against the data.
x_train = calculate_time_features(train_data.to_numpy(), 12)
x_test = calculate_time_features(test_data.to_numpy(), 12)

# Label columns predicted in subtask 1.
subtask1_labels_ids = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
         'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
# NOTE(review): labels are used in file order; presumably that matches the patient
# order of train_data — verify, nothing here sorts or joins on pid.
y_train = labels[subtask1_labels_ids].to_numpy()

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  feature_values[i] = feature(patient_data, axis=0)
  feature_values[i] = feature(patient_data, axis=0)
  feature_values[i] = feature(patient_data, axis=0)


In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

# TODO (yarden):
# feature selection.
# parameters tuning (subsample, learning rate).

# 5-fold cross-validated ROC AUC for each subtask-1 label. Imputation and
# scaling are pipeline steps, so they are fitted together with the classifier.
cv_report = ("Cross-validation score is {score:.3f},"
             " standard deviation is {err:.3f}")
for i, label in enumerate(subtask1_labels_ids):
    pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             StandardScaler(),
                             HistGradientBoostingClassifier())
    scores = cross_val_score(pipeline,
                             x_train,
                             y_train[:, i],
                             cv=5,
                             scoring='roc_auc',
                             verbose=True)
    print(cv_report.format(score=scores.mean(), err=scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.927, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.799, standard deviation is 0.008


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.746, standard deviation is 0.004


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.745, standard deviation is 0.006


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.745, standard deviation is 0.006


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.807, standard deviation is 0.008


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.893, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.831, standard deviation is 0.006


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.755, standard deviation is 0.027
Cross-validation score is 0.932, standard deviation is 0.007


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.8s finished


In [36]:
# Fit one model per label on the whole training set and store the predicted
# probability of the positive class for the test set.
# Every 12th row of test_data supplies the pid, matching the 12-row blocks that
# were collapsed into x_test.
df = pd.DataFrame({'pid': test_data.iloc[0::12, 0].values})
for i, label in enumerate(subtask1_labels_ids):
    # Build a fresh pipeline here instead of relying on the `pipeline` variable
    # left over from the cross-validation cell's loop, so this cell does not
    # depend on hidden kernel state.
    pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             StandardScaler(),
                             HistGradientBoostingClassifier())
    pipeline.fit(x_train, y_train[:, i])
    print("Training score:", metrics.roc_auc_score(y_train[:, i], pipeline.predict_proba(x_train)[:, 1]))
    predictions = pipeline.predict_proba(x_test)[:, 1]
    df[label] = predictions

Training score: 0.9712413240711782
Training score: 0.9468267567896724
Training score: 0.8539277319915479
Training score: 0.8484115708714369
Training score: 0.8555810291669353
Training score: 0.8862815926341792
Training score: 0.9636602786650003
Training score: 0.918923976388801
Training score: 0.9692089486212778
Training score: 0.9941067227563865


In [37]:
# Mean of the ten "Training score" values printed by the previous cell
# (hard-coded copies of that output).
(0.9712413240711782+
0.9468267567896724+
0.8539277319915479+
0.8484115708714369+
0.8555810291669353+
0.8862815926341792+
0.9636602786650003+
0.918923976388801+
0.9692089486212778+
0.9941067227563865)/10

0.9208169931956416

In [40]:
# Mean of the ten per-label scores shown in the output of the
# "Histogram-based Gradient Boosting Classification Tree" cell (hard-coded copies).
(0.9622021063485248+
0.9179870295944466+
0.8672136760979517+
0.8375341926119442+
0.8658035412929626+
0.8947057383260937+
0.9427961702849914+
0.8824270572341473+
0.9375263368455897+
0.9886837965418951)/10

0.9096879645178546

In [41]:
# Difference between the two averages computed in the two cells above
# (alternative-imputation mean minus first-model mean).
0.9208169931956416 - 0.9096879645178546

0.011129028677786978