### This notebook loads the extracted data, looks through the different fields, and trains some baseline models.

## Imports

In [None]:
import pandas as pd

## Load data, print keys

In [None]:
# Location of the extracted hourly dataset (HDF5).
DATA_PATH = "/mnt/nfs/project/delirium/all_hourly_data.h5"

# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)

with pd.HDFStore(DATA_PATH, mode="r") as hdf:
    # List the tables available in the store.
    hdf_keys = list(hdf.keys())
    print(hdf_keys)

    # Loaded tables are collected in `dfs`, keyed by a short name.
    dfs = {}
    print('Reading Patient Data ...')
    # Read through the store handle that is already open.
    dfs['statics'] = pd.read_hdf(hdf, key='patients')

## Patient Statics

In [None]:
# Per-column count of non-missing values, followed by the table itself
# (the bare trailing expression is what the notebook renders).
statics_df = dfs['statics']
print(statics_df.count())
statics_df

## Labs

In [None]:
# Site codes in id order; a code's position in the tuple is its integer id.
# NOTE: a later cell rebinds HOSPITAL_ID to the inverse (id -> code) mapping.
_SITE_CODES = ("THPM", "SBK", "UHNTG", "SMH", "UHNTW", "THPC", "PMH", "MSH")
HOSPITAL_ID = {code: idx for idx, code in enumerate(_SITE_CODES)}

# Print the columns of each site's vitals/labs table.
# `labs` is rebound on every iteration, so after the loop it only holds the
# table of the last site listed in HOSPITAL_ID.
labs = pd.DataFrame()
for site in HOSPITAL_ID:
    site_key = f'vitals_labs_{site}'
    labs = pd.read_hdf(DATA_PATH, key=site_key)
    print(site, labs.columns)

## Vitals

## Outcomes

In [None]:
# Per-column count of non-missing values, followed by the table itself
# (the bare trailing expression is what the notebook renders).
# NOTE(review): neither `SITE` nor an 'outcomes_*' entry of `dfs` is set by
# the cells visible in this notebook -- confirm they are defined before
# running this cell.
site_outcomes = dfs[f'outcomes_{SITE}']
print(site_outcomes.count())
site_outcomes

## Train MLP classifier baselines (leave-one-hospital-out validation)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    auc,
    RocCurveDisplay,
    confusion_matrix
)
    

import numpy as np
import matplotlib.pyplot as plt


# Integer hospital id -> site code, used to label the printed results.
# NOTE: this rebinds HOSPITAL_ID, which the "Labs" cell defines as the
# inverse (code -> id) mapping.
HOSPITAL_ID = dict(enumerate(
    ("THPM", "SBK", "UHNTG", "SMH", "UHNTW", "THPC", "PMH", "MSH")
))

# Code ranges of the `icd10_*` indicator columns used as model inputs.
_ICD10_RANGES = (
    "A00_B99", "C00_D49", "D50_D89", "E00_E89", "F01_F99",
    "G00_G99", "H00_H59", "H60_H95", "I00_I99", "J00_J99",
    "K00_K95", "L00_L99", "M00_M99", "N00_N99", "O00_O99",
    "Q00_Q99", "R00_R99", "S00_T88", "U07_U08", "Z00_Z99",
)

# Model input columns: demographics first, then one column per ICD-10 range.
features = ["age", "sex"]
features.extend(f"icd10_{code_range}" for code_range in _ICD10_RANGES)

num_features = len(features)
# Three hidden layers whose widths scale with the number of input features.
hidden_layer_sz = (num_features, 2 * num_features, num_features)
# Columns that `normalize` should standardize.
normalize_features = ["age"]
# Name of the statics column to predict.
predict = "los"


def split_dataset(df, t, v, split_col):
    """Partition rows into train and validation sets by a column's value.

    Parameters
    ----------
    df: pandas.Dataframe
        Full dataset.
    t: list
        Values of ``split_col`` whose rows go to the training set.
    v: list
        Values of ``split_col`` whose rows go to the validation set.
    split_col: str
        Name of the column that decides membership.

    Returns
    -------
    tuple
        (train, val) Dataframes; rows matching neither list are dropped.
    """
    split_values = df[split_col]
    in_train = split_values.isin(t)
    in_val = split_values.isin(v)
    return df.loc[in_train], df.loc[in_val]


def normalize(df, features, mean=None, std=None):
    """Standardize selected columns to zero mean and unit variance.

    Parameters
    ----------
    df: pandas.Dataframe
        Input data; it is copied, the caller's frame is left untouched.
    features: list
        Names of the columns to standardize.
    mean: pandas.Series, optional
        Per-column means to subtract. Computed from ``df`` when either
        ``mean`` or ``std`` is missing.
    std: pandas.Series, optional
        Per-column standard deviations to divide by. Computed from ``df``
        when either ``mean`` or ``std`` is missing.

    Returns
    -------
    tuple
        (normalized copy of ``df``, mean used, std used). Pass the returned
        statistics back in to apply the same transform to another split.

    Notes
    -----
    A column with zero standard deviation is divided by zero and therefore
    comes out as NaN/inf.
    """
    df = df.copy()
    df_features = df.loc[:, features]
    if mean is None or std is None:
        mean, std = df_features.mean(), df_features.std()
    # Replace the columns rather than writing through `.loc[:, features]`:
    # `.loc` sets values in place, which is deprecated when float results
    # land in an integer column such as an integer-typed age.
    df[features] = (df_features - mean) / std
    return df, mean, std


def discretize_los(los):
    """Map a length of stay onto one of three ordinal classes.

    Parameters
    ----------
    los: float
        Length of stay (days).

    Returns
    -------
    int
        0 for a stay of at most 3 days,
        1 for a stay longer than 3 and at most 7 days,
        2 otherwise.

    """
    if los <= 3:
        return 0
    # Anything that fails both comparisons (including NaN) lands in class 2.
    return 1 if los <= 7 else 2
    

# Build the modelling table `dataset` from the patient statics. Every branch
# must leave a 'label' column behind, because the training loop reads it.
dataset_size = 100000  # upper bound on the number of sampled rows
if predict == 'del_present':
    dataset = dfs['statics'].copy()
    # Take out only delirium cohort.
    dataset = dataset.loc[dataset['gemini_cohort'] == True]
    # Copy after filtering so the new column is added to an independent frame.
    dataset = dataset.loc[dataset['del_present'] != 3].copy()
    # NOTE(review): assumes `del_present` itself is the class label once the
    # value 3 has been excluded -- confirm the coding of this column.
    dataset['label'] = dataset[predict]
else:
    # Draw the random subset directly instead of copying and shuffling the
    # whole table only to keep the first rows. No random_state is set, so
    # the sample differs between runs.
    dataset = dfs['statics'].sample(n=min(dataset_size, len(dfs['statics'])))
    if predict == 'los':
        dataset['label'] = dataset[predict].apply(discretize_los)
    else:
        dataset['label'] = dataset[predict]

# Report the number of rows actually used (may be below the cap).
dataset_size = len(dataset)
print(dataset['label'].value_counts())


# Leave-one-hospital-out evaluation: train on every hospital but one and
# validate on the held-out hospital.
hospitals = sorted(dataset['hospital_id'].unique())
model_columns = [*features, 'label']
rocs = []
for hospital in hospitals:
    train_hospitals = [h for h in hospitals if h != hospital]
    val_hospitals = [hospital]
    # Fall back to the raw id when it has no entry in the lookup table.
    site_name = HOSPITAL_ID.get(hospital, hospital)

    train_dataset, val_dataset = split_dataset(dataset, train_hospitals, val_hospitals, 'hospital_id')
    # Normalize some of the features (not the ones that are one-hot encoded).
    # train_dataset, mean, std = normalize(train_dataset, normalize_features)

    # Plain column selection; an index join would duplicate rows whenever the
    # index is not unique.
    train_dataset = train_dataset[model_columns]
    val_dataset = val_dataset[model_columns]
    print(f"Train set size: {len(train_dataset)}, Val set size: {len(val_dataset)}")
    # Rows without a label are dropped so that a missing label does not turn
    # into a bogus class -1; missing feature values are encoded as -1.
    train_dataset = train_dataset.dropna(subset=['label']).fillna(-1)
    val_dataset = val_dataset.dropna(subset=['label']).fillna(-1)
    print(f"Train set size: {len(train_dataset)}, Val set size: {len(val_dataset)}")

    train_inputs = train_dataset[features].to_numpy()
    # A Series converts to a 1-D array already; squeezing would collapse a
    # single row into a 0-d array.
    train_labels = train_dataset['label'].to_numpy().astype(np.int32)
    # val_inputs = normalize(val_dataset[features], normalize_features, mean, std)[0].to_numpy()
    val_inputs = val_dataset[features].to_numpy()
    val_labels = val_dataset['label'].to_numpy().astype(np.int32)

    train_classes = np.unique(train_labels)
    val_classes = np.unique(val_labels)

    # Can't apply ROC metric.
    if len(val_classes) < 2:
        continue
    # The columns of predict_proba follow the classes seen during training,
    # so the scores are only meaningful when both splits share one class set.
    if not np.array_equal(train_classes, val_classes):
        print(f"Skipping {site_name}: validation classes {val_classes} "
              f"differ from training classes {train_classes}")
        continue

    clf = MLPClassifier(max_iter=500,
                        hidden_layer_sizes=hidden_layer_sz,
                        verbose=True,
                        learning_rate="adaptive").fit(train_inputs, train_labels)
    preds = clf.predict(val_inputs)
    pred_probs = clf.predict_proba(val_inputs)
    acc = accuracy_score(val_labels, preds)
    print(confusion_matrix(val_labels, preds))

    if len(train_classes) == 2:
        # Column 1 of predict_proba belongs to the larger class label, which
        # is also the class roc_auc_score treats as positive.
        positive_class = train_classes[1]
        positive_scores = pred_probs[:, 1]
        roc = roc_auc_score(val_labels, positive_scores)
        fpr, tpr, _ = roc_curve(val_labels, positive_scores, pos_label=positive_class)
        f1 = f1_score(val_labels, preds, pos_label=positive_class)
        prec = precision_score(val_labels, preds, pos_label=positive_class)
        rec = recall_score(val_labels, preds, pos_label=positive_class)
        roc_auc = auc(fpr, tpr)
        display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                  estimator_name='risk predictor')
        display.plot()
        plt.show()
    else:
        # NOTE: micro-averaged precision, recall and F1 all equal the
        # accuracy for single-label multiclass predictions.
        f1 = f1_score(val_labels, preds, average='micro')
        prec = precision_score(val_labels, preds, average='micro')
        rec = recall_score(val_labels, preds, average='micro')
        roc = roc_auc_score(val_labels, pred_probs, average='weighted',
                            multi_class='ovr', labels=train_classes)
    rocs.append(roc)

    print(f"{site_name} Acc: {acc}, f1-score: {f1}, Precision: {prec}, Recall: {rec}, ROC: {roc}")
    # NOTE(review): this stops after the first hospital that could be
    # evaluated, so the mean below covers a single fold. Remove the `break`
    # to run the full leave-one-hospital-out evaluation.
    break

if rocs:
    print(f"Mean ROC AUC: {np.mean(rocs)}, using dataset size: {dataset_size}")
else:
    print(f"No hospital could be evaluated, using dataset size: {dataset_size}")