# Task 2, Subtask 1

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
import sklearn.metrics as metrics

## Read data

In [None]:
## Load the already normalized and imputed data. For specifics about the
## imputation and normalization see the imputate.R file.
test_feat_path = "../data/test_features_imp.csv"
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"
test_feat = pd.read_csv(test_feat_path)
train_feat = pd.read_csv(train_feat_path)
train_lab = pd.read_csv(train_lab_path)

## Sort every frame by pid and renumber its index from 0, so that row k of the
## features and row k of the labels refer to the same pid
test_feat = test_feat.sort_values(by='pid', ignore_index=True)
train_feat = train_feat.sort_values(by='pid', ignore_index=True)
train_lab = train_lab.sort_values(by='pid', ignore_index=True)

## Feature matrices: columns at positions 1..271, skipping column 0.
## NOTE(review): this assumes pid is column 0 and that no feature column sits
## at position 272 or later -- confirm against the CSV header.
X_test = test_feat.iloc[:, 1:272].to_numpy()
X_train = train_feat.iloc[:, 1:272].to_numpy()
Y_train = train_lab

## Submission frame, seeded with the first column of the test features
output = pd.DataFrame({'pid': test_feat.iloc[:, 0].to_numpy()})

## Subtask 1
### Histogram-based Gradient Boosting Classification Tree

In [None]:
## Define the names of the labels to predict
def prob_classsifier(X_train, Y_train, X_test, output):
    """Fit one HistGradientBoostingClassifier per subtask-1 label and predict on X_test.

    Parameters
    ----------
    X_train : array-like
        Feature matrix used to fit every classifier.
    Y_train : pandas.DataFrame
        Frame containing (at least) the ten subtask-1 label columns.
    X_test : array-like
        Feature matrix to predict on.
    output : pandas.DataFrame
        Receives one new column per label; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``output`` object, where each label column holds column 1 of
        ``predict_proba(X_test)`` for that label's classifier.
    """
    labels_subtask_1 = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
                        'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
                        'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
                        'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

    ## Write to an array the labels of interest
    Y_train = Y_train[labels_subtask_1].to_numpy()

    ## For every label in Y_train fit a HGBC and use it to predict the probabilities of X_test
    print("ROC AUC validation and training score (training score on probability estimates), for each label:")
    for i, label in enumerate(labels_subtask_1):
        ## Fit model
        clf = HistGradientBoostingClassifier(scoring='roc_auc',
                                             random_state=123).fit(X_train, Y_train[:, i])

        ## validation_score_ is empty when the estimator did not use early
        ## stopping (sklearn only enables it automatically for larger training
        ## sets), so report NaN instead of failing with an IndexError.
        if np.size(clf.validation_score_):
            validation_score = clf.validation_score_[-1]
        else:
            validation_score = float('nan')

        ## The training score is computed on the probability estimates, not on the predicted labels
        training_score = metrics.roc_auc_score(Y_train[:, i],
                                               clf.predict_proba(X_train)[:, 1],
                                               average='micro')
        print(validation_score, " ", training_score)

        ## Write to results df
        output[label] = clf.predict_proba(X_test)[:, 1]
    return output

## Subtask 1: add one predicted-probability column per medical-test label to the submission frame
output = prob_classsifier(X_train, Y_train, X_test, output)

## Subtask 2
### Histogram-based Gradient Boosting Classification Tree

In [None]:
def classifier(X_train, Y_train, X_test, output):
    """Fit a HistGradientBoostingClassifier for 'LABEL_Sepsis' and predict on X_test.

    Parameters
    ----------
    X_train : array-like
        Feature matrix used to fit the classifier.
    Y_train : pandas.DataFrame
        Frame containing the 'LABEL_Sepsis' column.
    X_test : array-like
        Feature matrix to predict on.
    output : pandas.DataFrame
        Receives the 'LABEL_Sepsis' column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``output`` object, where 'LABEL_Sepsis' holds column 1 of
        ``predict_proba(X_test)``.
    """
    ## Write to an array the labels of interest
    Y_train = Y_train['LABEL_Sepsis'].to_numpy()

    ## Fit a HGBC and use it to predict the probabilities of X_test
    print("ROC AUC validation and training score (training score on probability estimates), for each label:")

    ## Fit model
    clf = HistGradientBoostingClassifier(scoring='roc_auc',
                                         random_state=123).fit(X_train, Y_train)

    ## validation_score_ is empty when the estimator did not use early stopping
    ## (sklearn only enables it automatically for larger training sets), so
    ## report NaN instead of failing with an IndexError.
    if np.size(clf.validation_score_):
        validation_score = clf.validation_score_[-1]
    else:
        validation_score = float('nan')

    ## The training score is computed on the probability estimates, not on the predicted labels
    training_score = metrics.roc_auc_score(Y_train,
                                           clf.predict_proba(X_train)[:, 1],
                                           average='micro')
    print(validation_score, " ", training_score)

    ## Write to results df
    output['LABEL_Sepsis'] = clf.predict_proba(X_test)[:, 1]
    return output

## Subtask 2: add the predicted 'LABEL_Sepsis' probability to the submission frame
output = classifier(X_train, Y_train, X_test, output)

## Subtask 3
### Lasso Regression

In [None]:
def regressor(train_feat, Y_train, test_feat, output):
    """Fit one LassoCV model per subtask-3 label and predict on test_feat.

    Each model only sees the feature columns whose name ends with the label's
    suffix (the part after 'LABEL_'). The training score of every model is
    printed, and its predictions are stored in ``output`` under the label name.

    Parameters
    ----------
    train_feat : pandas.DataFrame
        Training features; columns are selected by name.
    Y_train : pandas.DataFrame
        Frame containing the four subtask-3 label columns.
    test_feat : pandas.DataFrame
        Test features, filtered with the same column pattern as train_feat.
    output : pandas.DataFrame
        Receives one column per label; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``output`` object with the four prediction columns added.
    """
    ## Labels to predict for this task
    labels_subtask_3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

    ## Targets as a 2-D array, one column per label
    targets = Y_train[labels_subtask_3].to_numpy()

    print("Training scores for each label:")
    for column, label in enumerate(labels_subtask_3):
        ## Regex matching column names that end with the label's suffix
        pattern = label.split("_", maxsplit=2)[1] + "$"
        target = targets[:, column]

        ## Keep only the columns that end with the suffix
        train_subset = train_feat.filter(regex=pattern, axis=1).to_numpy()
        test_subset = test_feat.filter(regex=pattern, axis=1).to_numpy()

        ## Fit model
        model = LassoCV(random_state=123, verbose=False, max_iter=10000)
        model.fit(train_subset, target)

        ## Print the training score
        print(model.score(train_subset, target))

        ## Write to output
        output[label] = model.predict(test_subset)
    return output

## Subtask 3: add the Lasso predictions for the four vital-sign labels to the submission frame
output = regressor(train_feat, Y_train, test_feat, output)


## Visualize output

In [None]:
## Write the results to a zip-compressed CSV (no index column, floats with 3 decimals)
output.to_csv('submission.zip', index=False, float_format='%.3f', compression='zip')
## Preview the first rows of the submission
output.head()

## Compute the score of our submission

In [None]:
## Regression targets; get_score evaluates them with R^2 (task 3)
VITALS = [
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]

## Binary targets; get_score evaluates them with ROC AUC (task 1)
TESTS = [
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
]


def get_score(df_true, df_submission):
    """Score a submission against the true labels.

    Both frames are sorted by 'pid' before the columns are compared. The three
    task scores are printed and returned together with their mean.

    Parameters
    ----------
    df_true : pandas.DataFrame
        True labels; needs 'pid', 'LABEL_Sepsis' and every column in TESTS and VITALS.
    df_submission : pandas.DataFrame
        Predictions with the same columns as df_true.

    Returns
    -------
    list
        ``[task1, task2, task3, score]`` where task1 is the mean ROC AUC over
        TESTS, task2 the ROC AUC of 'LABEL_Sepsis', task3 the mean of
        ``0.5 + 0.5 * max(0, R^2)`` over VITALS, and score the mean of the three.
    """
    predicted = df_submission.sort_values('pid')
    truth = df_true.sort_values('pid')

    ## Task 1: average ROC AUC over the medical-test labels
    test_aucs = []
    for entry in TESTS:
        test_aucs.append(metrics.roc_auc_score(truth[entry], predicted[entry]))
    task1 = np.mean(test_aucs)

    ## Task 2: ROC AUC of the sepsis label
    task2 = metrics.roc_auc_score(truth['LABEL_Sepsis'], predicted['LABEL_Sepsis'])

    ## Task 3: R^2 per vital sign, clipped at 0 and mapped into [0.5, 1]
    vital_scores = []
    for entry in VITALS:
        r2 = metrics.r2_score(truth[entry], predicted[entry])
        vital_scores.append(0.5 + 0.5 * np.maximum(0, r2))
    task3 = np.mean(vital_scores)

    score = np.mean([task1, task2, task3])
    print("Score task 1: ", task1)
    print("Score task 2: ", task2)
    print("Score task 3: ", task3)
    return [task1, task2, task3, score]


def crossvalidation_analysis(X_cross, y_cross, train_feat):
    """Run a 2-fold cross-validation of the classifiers and regressors.

    For each fold the three subtask models are fitted on the training part,
    predictions are made for the held-out part, and the result is scored with
    ``get_score``.

    Parameters
    ----------
    X_cross : numpy.ndarray
        Feature matrix, indexed by position with the fold indices.
    y_cross : pandas.DataFrame
        Labels, including a 'pid' column; rows must be in the same order as X_cross.
    train_feat : pandas.DataFrame
        Named feature columns (used by ``regressor``); rows must be in the
        same order as X_cross.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns 'Task1', 'Task2', 'Task3' and 'Average'.
    """
    kf = KFold(n_splits=2)
    fold_scores = []
    for train_index, test_index in kf.split(X_cross):
        X_train, X_test = X_cross[train_index], X_cross[test_index]

        ## KFold yields positions, so select rows with iloc. drop=True keeps
        ## reset_index from inserting the old index as an extra first column.
        Y_train = y_cross.iloc[train_index].reset_index(drop=True)
        Y_test = y_cross.iloc[test_index].reset_index(drop=True)
        X_train_labels = train_feat.iloc[train_index].reset_index(drop=True)
        X_test_labels = train_feat.iloc[test_index].reset_index(drop=True)

        ## Seed the fold's prediction frame with the pids of the held-out rows
        output = pd.DataFrame({'pid': Y_test['pid'].to_numpy()})
        output = prob_classsifier(X_train, Y_train, X_test, output)
        output = classifier(X_train, Y_train, X_test, output)
        output = regressor(X_train_labels, Y_train, X_test_labels, output)

        ## Score once per fold; get_score prints the task scores itself
        fold_score = get_score(Y_test, output)
        print("Fold score", fold_score)
        fold_scores.append(fold_score)

    scores = pd.DataFrame(fold_scores, columns=['Task1', "Task2", "Task3", "Average"])
    ## Per-column mean over the folds
    print("FINAL SCORE: ", scores.mean(axis=0))

    return scores


## Cross-validate the whole pipeline on the training data and keep the per-fold scores
scores = crossvalidation_analysis(X_train, Y_train, train_feat)

## Results Log

### Subtask 1. Binary Relevance and HGBC

|   | C | kernel | gamma | weight | features | n_features | F1 score | AUC | runtime (min) |
|---|---|---|---|---|---|---|---|---|---|
| run_1 |  1 |  rbf | scale  |  balanced |  median for NA's and mean  | 35 | 0.598165656150447 | ? | 33 |
| run_2 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd  | 170 | 0.628216870267411 |?| 102 |
| run_3 |  1 |  rbf | scale  |  balanced |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.649372121402984 | 0.8236937992110356 | 141 |
| run_4 |  HGBC |  HGBC | HGBC |  HGBC |  median for NA's and mean, max, min, median, sd, range, skw, kurt  | 272 | 0.871097657278231* | 0.8222653647930391 | 0.5 |

*I think this score is so high because f1_micro is more severe when all labels are taken into account jointly than when each label is scored one by one and the results are averaged. Hence I don't believe the HGBC is superior in terms of performance; otherwise we would also have observed a big increase in the AUC.

### Subtask 3. Lasso

Training scores for normalized and unnormalized imputed data, restricted to the labels:

|normalized|unnormalized|
|---|---|
|0.37770345083252754 | 0.37759566055685045|
|0.5859785441608802  | 0.5856645886174903 |
|0.38386074780919743 | 0.3842306307116389 |
|0.6144724385873669  | 0.6142282361433877 | 

The submission scores were only slightly different for normalized and unnormalized data: 0.754641671097 and 0.754664968318, respectively. We therefore decided to use the normalized data, because this way we don't need two imputation scripts.