### This notebook loads the extracted data, inspects its fields, and trains some baseline models.

## Imports

In [None]:
import pandas as pd

## Load data, print keys

In [None]:
DATA_PATH = "/mnt/nfs/project/delirium/all_hourly_data.h5"
# DATA_PATH = "/mnt/nfs/project/delirium/_extract/all_hourly_data.h5"

# Site whose per-site tables (vitals/labs and interventions) are loaded below.
SITE = "SMH"
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Loaded tables, keyed by a short name used by the cells that follow.
dfs = {}

with pd.HDFStore(DATA_PATH, "r") as hdf:
    hdf_keys = list(hdf.keys())
    print(hdf_keys)

    print('Reading Patient Data ...')
    # Read through the already-open store instead of re-opening DATA_PATH
    # once per table; the store is closed when the `with` block exits.
    dfs['statics'] = pd.read_hdf(hdf, key='patients')
    dfs[f'labs_{SITE}'] = pd.read_hdf(hdf, key=f'vitals_labs_{SITE}')
    dfs[f'outcomes_{SITE}'] = pd.read_hdf(hdf, key=f'interventions_{SITE}')

## Patient Statics

In [None]:
# Print the per-column count of non-missing entries in the patient statics
# table (the object read from the 'patients' key; assumed to be a DataFrame).
print(dfs['statics'].count())
# Bare trailing expression: the notebook renders the table as the cell output.
dfs['statics']

## Labs

In [None]:
# Print the per-column count of non-missing entries in the table read from
# the 'vitals_labs_<SITE>' key (assumed to be a DataFrame).
print(dfs[f'labs_{SITE}'].count())
# Bare trailing expression: the notebook renders the table as the cell output.
dfs[f'labs_{SITE}']

## Outcomes

In [None]:
# Print the per-column count of non-missing entries in the table read from
# the 'interventions_<SITE>' key (assumed to be a DataFrame).
print(dfs[f'outcomes_{SITE}'].count())
# Bare trailing expression: the notebook renders the table as the cell output.
dfs[f'outcomes_{SITE}']

## Train logistic regression models (delirium)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import numpy as np


# Integer code for each hospital site, numbered 0-7 in listing order.
HOSPITAL_ID = dict(
    zip(
        ("THPM", "SBK", "UHNTG", "SMH", "UHNTW", "THPC", "PMH", "MSH"),
        range(8),
    )
)

# Columns that get standardised; the remaining model inputs are left as-is.
normalize_features = ["sex", "age"]

# Model inputs: the two demographic columns followed by the ICD-10 code-range
# columns (presumably one indicator column per range -- confirm against the
# extraction code).
features = [
    *normalize_features,
    "icd10_A00_B99",
    "icd10_C00_D49",
    "icd10_D50_D89",
    "icd10_E00_E89",
    "icd10_F01_F99",
    "icd10_G00_G99",
    "icd10_H00_H59",
    "icd10_H60_H95",
    "icd10_I00_I99",
    "icd10_J00_J99",
    "icd10_K00_K95",
    "icd10_L00_L99",
    "icd10_M00_M99",
    "icd10_N00_N99",
    "icd10_O00_O99",
    "icd10_Q00_Q99",
    "icd10_R00_R99",
    "icd10_S00_T88",
    "icd10_U07_U08",
    "icd10_Z00_Z99",
]

# Name of the label column the models are trained to predict.
predict = "del_present"


def split_dataset(df, t, v, split_col):
    """Split ``df`` into training and validation rows by group membership.

    Args:
        df: DataFrame to split.
        t: Collection of ``split_col`` values that select training rows.
        v: Collection of ``split_col`` values that select validation rows.
        split_col: Name of the column holding the group identifier.

    Returns:
        A ``(train, val)`` tuple of DataFrames. Rows whose group is in
        neither collection are absent from both.
    """
    groups = df[split_col]
    in_train = groups.isin(t)
    in_val = groups.isin(v)
    return df.loc[in_train], df.loc[in_val]


def normalize(df, features, mean=None, std=None):
    """Standardise the given columns: ``(x - mean) / std``.

    The input frame is left untouched; a normalised copy is returned. This
    avoids mutating the caller's data and the SettingWithCopy warnings that
    in-place assignment raises when ``df`` is a slice of another frame.

    Args:
        df: DataFrame containing the columns to standardise.
        features: List of column names to standardise.
        mean: Per-column means to subtract. Computed from ``df`` when None.
        std: Per-column standard deviations to divide by. Computed from
            ``df`` when None.

    Returns:
        ``(normalized_df, mean, std)``, where ``mean`` and ``std`` are the
        statistics actually used, so they can be re-applied to another split.

    Note:
        A column with zero standard deviation is not special-cased and will
        produce NaN/inf values after the division.
    """
    # Resolve each statistic independently so a supplied value is never
    # silently replaced just because the other one was omitted.
    if mean is None:
        mean = df[features].mean()
    if std is None:
        std = df[features].std()

    normalized = df.copy()
    normalized[features] = (df[features] - mean) / std
    return normalized, mean, std


# Start from the patient statics table and keep only the delirium cohort.
dataset = (
    dfs['statics']
    # Rows flagged as belonging to the cohort.
    .loc[lambda frame: frame['gemini_cohort'] == True]
    # Exclude rows whose label is coded 3.
    # NOTE(review): the meaning of code 3 is not visible here -- confirm.
    .loc[lambda frame: frame['del_present'] != 3]
)
print(dataset.count())



hospitals = dataset['hospital_id'].unique()
# Columns the model needs: the inputs plus the label.
model_columns = features + [predict]

# Leave-one-hospital-out evaluation: each hospital in turn is held out for
# validation while a model is trained on all the other hospitals.
for hos in hospitals:
    train_hospitals = [h for h in hospitals if h != hos]
    val_hospitals = [hos]

    train_dataset, val_dataset = split_dataset(dataset, train_hospitals, val_hospitals, 'hospital_id')

    # Select columns directly (an index join would duplicate rows if the
    # index were not unique) and drop incomplete rows *before* normalising,
    # so the statistics describe exactly the rows the model is trained on.
    # .copy() gives normalize() an independent frame to work with.
    train_dataset = train_dataset[model_columns].dropna().copy()
    val_dataset = val_dataset[model_columns].dropna().copy()

    if train_dataset.empty or val_dataset.empty:
        print(f"{hos} skipped: no complete rows in the training or validation split")
        continue

    # Normalize some of the features (not the ones that are one-hot encoded).
    # Validation data reuses the training mean/std to avoid leaking statistics.
    train_dataset, mean, std = normalize(train_dataset, normalize_features)
    val_dataset = normalize(val_dataset, normalize_features, mean, std)[0]

    train_inputs = train_dataset[features].to_numpy()
    # Selecting a single column yields a 1-D array even for a one-row split
    # (squeezing a (1, 1) array would collapse it to a scalar).
    train_labels = train_dataset[predict].to_numpy().astype(np.int32)
    val_inputs = val_dataset[features].to_numpy()
    val_labels = val_dataset[predict].to_numpy().astype(np.int32)

    clf = LogisticRegression(max_iter=2000, penalty="l1", solver="liblinear").fit(train_inputs, train_labels)
    preds = clf.predict(val_inputs)
    acc = accuracy_score(val_labels, preds)
    f1 = f1_score(val_labels, preds, average='weighted')
    prec = precision_score(val_labels, preds, average='weighted')
    rec = recall_score(val_labels, preds, average='weighted')

    # ROC AUC must be computed from continuous scores, not thresholded class
    # predictions. Column 1 of predict_proba is the probability of the larger
    # class label, which roc_auc_score treats as the positive class.
    # NOTE(review): assumes the label is binary after filtering -- confirm.
    if np.unique(val_labels).size < 2:
        # AUC is undefined when the held-out hospital contains a single class.
        roc = float("nan")
    else:
        roc = roc_auc_score(val_labels, clf.predict_proba(val_inputs)[:, 1])

    print(f"{hos} Acc: {acc}, f1-score: {f1}, Precision: {prec}, Recall: {rec}, ROC: {roc}")