In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import product

from sklearn.model_selection import (train_test_split,
                                     GridSearchCV,
                                     TunedThresholdClassifierCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.frozen import FrozenEstimator
from sklearn.calibration import CalibratedClassifierCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, f1_score, RocCurveDisplay

In [None]:
# Walk up the directory tree until the project root is the working directory,
# so that the relative paths used further down (e.g. 'data/clean/...') resolve.
project_root_name = 'chest-pain-dissertation'
while not os.getcwd().endswith(project_root_name):
    previous_dir = os.getcwd()
    os.chdir('../')
    if os.getcwd() == previous_dir:
        # At the filesystem root '../' is a no-op; without this check the loop
        # would spin forever when the notebook is started outside the project.
        raise FileNotFoundError(
            f"Could not find a '{project_root_name}' directory above the starting location."
        )

print(f"Working directory: {os.getcwd()}")

# Functions

In [None]:
def split_data(X, y, train_size, validation_size, seed):
    """Split features and target into stratified train, validation and test frames.

    The test set receives whatever fraction is left over after ``train_size``
    and ``validation_size``. Two consecutive stratified splits are made: first
    the test set is carved off, then the remainder is divided into training and
    validation sets.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target. It is used for stratification and is joined back onto each
        feature frame on the index (so it needs a name).
    train_size, validation_size : float
        Fractions of the full data set, each > 0 and summing to less than 1.
    seed : int
        ``random_state`` used for both splits.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_data, validation_data, testing_data)``, each holding the
        feature columns plus the target column.

    Raises
    ------
    ValueError
        If the fractions are not positive, or leave no room for a test set.
    """
    if train_size <= 0 or validation_size <= 0 or train_size + validation_size >= 1:
        raise ValueError(
            "train_size and validation_size must both be positive and sum to less than 1 "
            f"(got train_size={train_size}, validation_size={validation_size})"
        )

    # round() rather than int(): int() truncates, and 100 * 0.29 is
    # 28.999999999999996 in floating point, which would silently become 28.
    train_set = round(100 * train_size)
    val_set = round(100 * validation_size)
    test_set = round(100 * round(1 - (train_size + validation_size), 2))
    if test_set < 1:
        raise ValueError(
            "train_size and validation_size leave less than 1% of the data for the test set"
        )

    msg = (f"Splitting data into {train_set}% training set, {val_set}% validation "
           f"set and {test_set}% testing set...")
    print(msg)

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_set/100, stratify=y, random_state=seed
    )

    # The validation fraction is re-expressed relative to what is left after
    # the test set has been removed.
    val_size = validation_size/(train_size+validation_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size, stratify=y_train_val, random_state=seed
    )

    # Re-attach the target to each feature frame (aligned on the index)
    training_data = X_train.join(y_train)
    validation_data = X_val.join(y_val)
    testing_data = X_test.join(y_test)

    return training_data, validation_data, testing_data

In [None]:
def create_preprocessing_pipeline(X_train, num_cols, disc_cols, cat_cols, removed_features=None):
    """Build the (unfitted) ColumnTransformer that preprocesses each feature group.

    Numeric columns are median-imputed and standard-scaled, binary/discrete
    columns are median-imputed only, and categorical columns are one-hot
    encoded (no imputation step is applied to them).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Only its column names are read, to validate the feature lists.
    num_cols, disc_cols, cat_cols : list of str
        Column names belonging to each feature group. The lists are copied;
        the caller's objects are never modified.
    removed_features : list of str, optional
        Features to leave out of the pipeline. Each one is dropped from the
        first group (numeric, discrete, categorical) that contains it.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        One transformer per non-empty feature group.

    Raises
    ------
    ValueError
        If a removed feature is in none of the groups, if a listed feature is
        not a column of ``X_train``, or if every group ends up empty.
    """
    # Work on copies: removing features from the caller's lists would make the
    # result depend on how many times the calling cell has been run.
    num_cols = list(num_cols)
    disc_cols = list(disc_cols)
    cat_cols = list(cat_cols)

    if removed_features is not None:
        for feature in removed_features:
            if feature in num_cols:
                num_cols.remove(feature)
            elif feature in disc_cols:
                disc_cols.remove(feature)
            elif feature in cat_cols:
                cat_cols.remove(feature)
            else:
                print(f"Feature {feature} is not valid.")
                raise ValueError((f"Feature {feature} is not valid. "
                                  f"Feature must be in {X_train.columns.values.tolist()}"))

    invalid_features = list(
        set(num_cols+disc_cols+cat_cols) - set(X_train.columns.values.tolist())
    )
    if len(invalid_features) != 0:
        msg = f"The following features are not in the dataframe: {invalid_features}"
        print(msg)
        raise ValueError(msg)

    impute_and_scale = Pipeline([
        ("numeric_impute", SimpleImputer(strategy="median")),
        ("numeric_transformation", StandardScaler())
    ])
    binary_and_discrete_impute = Pipeline([
        ("numeric_impute", SimpleImputer(strategy="median"))
    ])
    # Encoding only: this pipeline has no imputation step
    one_hot_encode = Pipeline([
        ("categorical_transformation", OneHotEncoder(handle_unknown='infrequent_if_exist'))
    ])

    # Only register a transformer for groups that still contain columns
    transformers = []
    if num_cols:
        transformers.append(
            ("numeric_preprocessing", impute_and_scale, num_cols)
        )
    if disc_cols:
        transformers.append(
            ("binary_and_discrete_preprocessing", binary_and_discrete_impute, disc_cols)
        )
    if cat_cols:
        transformers.append(
            ("categorical_preprocessing", one_hot_encode, cat_cols)
        )

    if not transformers:
        raise ValueError("No transformaers to create pipeline")

    return ColumnTransformer(transformers=transformers)

In [None]:
def tune_hyperparameters(X, y, model, params, model_desc, model_name, preprocsessing_pipe, k_fold):
    """Grid-search a preprocessing + model pipeline and return the best fit.

    Parameters
    ----------
    X, y
        Training features and target passed to ``GridSearchCV.fit``.
    model
        Unfitted estimator placed as the final pipeline step.
    params : dict
        Parameter grid; keys must be prefixed with ``f"{model_desc}__"``.
    model_desc : str
        Name of the model step inside the pipeline.
    model_name : str
        Human-readable name, used only in the error message.
    preprocsessing_pipe
        Transformer placed as the ``"pre_processing"`` pipeline step.
    k_fold
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    dict
        ``'model'`` (best refitted pipeline), ``'params'`` (best parameter
        combination) and ``'scores'`` (best mean cross-validated ROC AUC).

    Raises
    ------
    Exception
        Whatever the search raises; it is printed and then re-raised.
    """
    import warnings

    # set up the pipeline
    pipe = Pipeline([
        ("pre_processing", preprocsessing_pipe),
        (model_desc, model)
    ])

    # set up grid search object; parameter combinations whose fit fails are
    # scored 0.0 instead of aborting the whole search
    pipe_cv = GridSearchCV(pipe,
                           param_grid=params,
                           scoring='roc_auc',
                           cv=k_fold,
                           n_jobs=-1,
                           verbose=0,
                           error_score=0.0)

    # Silence warnings only for the duration of the search. The previous
    # filters are restored on exit, so the rest of the notebook is unaffected.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        try:
            pipe_cv.fit(X, y)
        except Exception as e:
            msg = ("The following error occured "
                   f"while tuning {model_name}: {e}")
            print(msg)
            raise

    return {
        'model': pipe_cv.best_estimator_,
        'params': pipe_cv.best_params_,
        'scores': pipe_cv.best_score_
    }

In [None]:
def tune_threshold(model, model_name, X_val, y_val):
    """Wrap an already-fitted model in a threshold tuner fitted on validation data.

    The wrapper is a ``TunedThresholdClassifierCV`` scored on F1, created with
    ``cv="prefit"`` and ``refit=False`` and with the per-threshold results
    stored. ``model_name`` is accepted for symmetry with the other helpers but
    is not used.

    Returns the fitted ``TunedThresholdClassifierCV`` instance.
    """
    threshold_settings = {
        "scoring": "f1",
        "cv": "prefit",
        "refit": False,
        "store_cv_results": True,
    }
    threshold_tuner = TunedThresholdClassifierCV(model, **threshold_settings)
    threshold_tuner.fit(X_val, y_val)
    return threshold_tuner

In [None]:
def get_validation_auc(model: Pipeline, X_val, y_val) -> float:
    """Return the ROC AUC of ``model`` on the validation data.

    The score is computed from column 1 of ``model.predict_proba(X_val)``
    (taken to be the positive class of a binary classifier).
    """
    positive_class_scores = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_class_scores)

In [None]:
def train_logistic_regression(X_train, y_train, X_val, y_val, preprocessor, params, seed):
    """Fit a logistic regression pipeline, optionally grid-searched, then tune its threshold.

    Parameters
    ----------
    X_train, y_train
        Data used to fit (or grid-search) the pipeline.
    X_val, y_val
        Data used to tune the decision threshold and to compute the reported AUC.
    preprocessor
        Transformer used as the ``"pre_processing"`` pipeline step.
    params : dict, None or 'None'
        Grid for ``tune_hyperparameters`` (keys prefixed ``lreg_model__``).
        ``None`` or the string ``'None'`` fits the default model with no search.
    seed : int
        ``random_state`` of the estimator.

    Returns
    -------
    dict
        ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid
        parameters, or ``None`` when no search was run) and ``'scores'``
        (ROC AUC on the validation data; this replaces the cross-validated
        score returned by the search).
    """
    msg = f"Training Logistic Regression model tuning the parameters: {params}"
    print(msg)

    model = LogisticRegression(random_state=seed)

    # Accept a real None as well as the 'None' string sentinel used by the grids
    no_grid = params is None or (isinstance(params, str) and params == 'None')
    if no_grid:
        pipe = Pipeline([
            ("pre_processing", preprocessor),
            ('lreg_model', model)
        ])
        # Same keys as the tuned branch, so callers can rely on 'params' existing
        trained_model = {'model': pipe.fit(X_train, y_train), 'params': None}
    else:
        trained_model = tune_hyperparameters(X_train, y_train, model, params, "lreg_model", "Logistic Regression", preprocessor, 5)

    tuned_model = tune_threshold(trained_model['model'], "Logistic Regression", X_val, y_val)
    trained_model['model'] = tuned_model

    trained_model['scores'] = get_validation_auc(tuned_model, X_val, y_val)

    return trained_model

In [None]:
def train_random_forest(X_train, y_train, X_val, y_val, preprocessor, params, seed):
    """Fit a random forest pipeline, optionally grid-searched, then tune its threshold.

    Parameters
    ----------
    X_train, y_train
        Data used to fit (or grid-search) the pipeline.
    X_val, y_val
        Data used to tune the decision threshold and to compute the reported AUC.
    preprocessor
        Transformer used as the ``"pre_processing"`` pipeline step.
    params : dict, None or 'None'
        Grid for ``tune_hyperparameters`` (keys prefixed ``rfc_model__``).
        ``None`` or the string ``'None'`` fits the default model with no search.
    seed : int
        ``random_state`` of the estimator.

    Returns
    -------
    dict
        ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid
        parameters, or ``None`` when no search was run) and ``'scores'``
        (ROC AUC on the validation data; this replaces the cross-validated
        score returned by the search).
    """
    msg = f"Training Random Forest model tuning the parameters: {params}"
    print(msg)

    model = RandomForestClassifier(criterion="gini", random_state=seed)

    # Accept a real None as well as the 'None' string sentinel used by the grids
    no_grid = params is None or (isinstance(params, str) and params == 'None')
    if no_grid:
        pipe = Pipeline([
            ("pre_processing", preprocessor),
            ('rfc_model', model)
        ])
        # Same keys as the tuned branch, so callers can rely on 'params' existing
        trained_model = {'model': pipe.fit(X_train, y_train), 'params': None}
    else:
        trained_model = tune_hyperparameters(X_train, y_train, model, params, "rfc_model", "Random Forest", preprocessor, 5)

    tuned_model = tune_threshold(trained_model['model'], "Random Forest", X_val, y_val)
    trained_model['model'] = tuned_model

    trained_model['scores'] = get_validation_auc(tuned_model, X_val, y_val)

    return trained_model

In [None]:
def train_lightgbm(X_train, y_train, X_val, y_val, preprocessor, params, seed):
    """Fit a LightGBM pipeline, optionally grid-searched, then tune its threshold.

    Parameters
    ----------
    X_train, y_train
        Data used to fit (or grid-search) the pipeline.
    X_val, y_val
        Data used to tune the decision threshold and to compute the reported AUC.
    preprocessor
        Transformer used as the ``"pre_processing"`` pipeline step.
    params : dict, None or 'None'
        Grid for ``tune_hyperparameters`` (keys prefixed ``lgbm_model__``).
        ``None`` or the string ``'None'`` fits the default model with no search.
    seed : int
        ``random_state`` of the estimator.

    Returns
    -------
    dict
        ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid
        parameters, or ``None`` when no search was run) and ``'scores'``
        (ROC AUC on the validation data; this replaces the cross-validated
        score returned by the search).
    """
    msg = f"Training LightGBM model tuning the parameters: {params}"
    print(msg)

    model = LGBMClassifier(objective='binary', random_state=seed)

    # Accept a real None as well as the 'None' string sentinel used by the grids
    no_grid = params is None or (isinstance(params, str) and params == 'None')
    if no_grid:
        # The step is named 'lgbm_model' in both branches, so the untuned and
        # tuned pipelines expose the estimator under the same name.
        pipe = Pipeline([
            ("pre_processing", preprocessor),
            ('lgbm_model', model)
        ])
        # Same keys as the tuned branch, so callers can rely on 'params' existing
        trained_model = {'model': pipe.fit(X_train, y_train), 'params': None}
    else:
        trained_model = tune_hyperparameters(X_train, y_train, model, params, "lgbm_model", "LightGBM", preprocessor, 5)

    tuned_model = tune_threshold(trained_model['model'], "LightGBM", X_val, y_val)
    trained_model['model'] = tuned_model

    trained_model['scores'] = get_validation_auc(tuned_model, X_val, y_val)

    return trained_model

In [None]:
def train_xgboost(X_train, y_train, X_val, y_val, preprocessor, params, seed):
    """Fit an XGBoost pipeline, optionally grid-searched, then tune its threshold.

    The classifier's ``scale_pos_weight`` is set to the negative/positive
    ratio of ``y_train``.

    Parameters
    ----------
    X_train, y_train
        Data used to fit (or grid-search) the pipeline. ``y_train.sum()`` is
        taken as the number of positives (assumes 0/1 labels).
    X_val, y_val
        Data used to tune the decision threshold and to compute the reported AUC.
    preprocessor
        Transformer used as the ``"pre_processing"`` pipeline step.
    params : dict, None or 'None'
        Grid for ``tune_hyperparameters`` (keys prefixed ``xgb_model__``).
        ``None`` or the string ``'None'`` fits the default model with no search.
    seed : int
        ``random_state`` of the estimator.

    Returns
    -------
    dict
        ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid
        parameters, or ``None`` when no search was run) and ``'scores'``
        (ROC AUC on the validation data; this replaces the cross-validated
        score returned by the search).

    Raises
    ------
    ValueError
        If ``y_train`` contains no positive samples.
    """
    msg = f"Training XGBoost model tuning the parameters: {params}"
    print(msg)

    n_pos = y_train.sum()
    n_neg = len(y_train)-n_pos
    if n_pos == 0:
        # The class-weight ratio below would otherwise divide by zero
        raise ValueError("y_train contains no positive samples; cannot compute scale_pos_weight")

    # random_state is the documented scikit-learn-API name for the seed and
    # matches the other train_* helpers.
    model = XGBClassifier(objective='binary:logistic', scale_pos_weight=n_neg/n_pos, random_state=seed)

    # Accept a real None as well as the 'None' string sentinel used by the grids
    no_grid = params is None or (isinstance(params, str) and params == 'None')
    if no_grid:
        pipe = Pipeline([
            ("pre_processing", preprocessor),
            ('xgb_model', model)
        ])
        # Same keys as the tuned branch, so callers can rely on 'params' existing
        trained_model = {'model': pipe.fit(X_train, y_train), 'params': None}
    else:
        trained_model = tune_hyperparameters(X_train, y_train, model, params, "xgb_model", "XGBoost", preprocessor, 5)

    tuned_model = tune_threshold(trained_model['model'], "XGBoost", X_val, y_val)
    trained_model['model'] = tuned_model

    trained_model['scores'] = get_validation_auc(tuned_model, X_val, y_val)

    return trained_model

# Experiments

In [None]:
# Load the processed dataset, then separate the target from the features.
# The 'nhs_number' column is dropped along with the target and is not used as a feature.
df = pd.read_csv('data/clean/processed_dataset.csv')
y = df['subsequent_mi_30days_diagnosis']
X = df.drop(columns=['nhs_number', 'subsequent_mi_30days_diagnosis']).copy()

In [None]:
# Random seed shared by the data split and every model constructor below
seed = 42

In [None]:
# 60% training / 20% validation; the remaining test split is discarded (`_`) in this notebook
train_set, val_set, _ = split_data(X, y, train_size=0.6, validation_size=0.2, seed=seed)

In [None]:
# split_data returns frames with the target joined on, so pull it back out
# for the training and validation sets.
y_train = train_set['subsequent_mi_30days_diagnosis']
X_train = train_set.drop(columns=['subsequent_mi_30days_diagnosis']).copy()

y_val = val_set['subsequent_mi_30days_diagnosis']
X_val = val_set.drop(columns=['subsequent_mi_30days_diagnosis']).copy()

In [None]:
# Feature groups handed to create_preprocessing_pipeline:
#   num_cols  -> median imputation + standard scaling
#   disc_cols -> median imputation only
#   cat_cols  -> one-hot encoding
num_cols = [
    'acute_morbidity_indicator',
    'ae_duration_hrs',
    'max_tnt_24hr_int',
    'min_egfr_24hr_int',
    'first_tnt_24hr_int',
    'first_egfr_24hr_int',
    'mood_and_anxiety_disorders_indicator',
    'tnt_egfr_interaction',
    'ip_duration_days',
    'total_duration_days',
    'age',
    'tnt_change',
    'egfr_change',
]
disc_cols = [
    'ihd_mi',
    'cc_heart_failure',
    'cc_myocardial_infarction',
    'imd_decile_19',
    'qof_diabetes',
    'qof_ht',
    'ht',
    'qof_chd',
    'ihd_nonmi',
    'af',
    'arrhythmia_other',
    'stroke',
    'hf',
    'vasc_dis',
    'cardio_other',
    'qof_depression',
    'qof_mental',
    'N_tnt_24hr',
    'N_egfr_24hr',
    'mi_diagnosis_ae_discharge',
    'meds_total',
    'meds_antip',
    'meds_angio',
    'meds_betab',
    'meds_total_discharge',
    'transfered_dv',
    'mi_diagnosis_code',
    'chd_diagnosis_code',
    'meds_total_more_than_10',
    'tnt_rule_in',
    'age_threshold',
    'ae_target',
    'egfr_rule_in',
]
cat_cols = [
    'ethnicity',
    'sex',
    'smoking',
    'ae_provider',
    'ip_provider',
    'site_ae',
    'site_ip',
    'derived_trust_catchment',
    'departure_season',
    'diagnosis_description',
]

In [None]:
# Logistic regression search grids. 'raw' is the no-search baseline; hp1-hp4
# each add one more hyperparameter to the previous grid, hp5 is a narrower grid.
# Not every solver/penalty pairing is accepted by LogisticRegression; failing
# fits are scored 0.0 by the grid search (error_score) rather than aborting it.
lreg_grid = {'raw': 'None'}
lreg_grid['hp1'] = {
    'lreg_model__solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
}
lreg_grid['hp2'] = {
    **lreg_grid['hp1'],
    'lreg_model__penalty': [None, 'l1', 'l2', 'elasticnet'],
}
lreg_grid['hp3'] = {
    **lreg_grid['hp2'],
    'lreg_model__C': np.logspace(-4, 4, 20),
}
lreg_grid['hp4'] = {
    **lreg_grid['hp3'],
    'lreg_model__max_iter': [500, 1000, 1500, 2000],
}
lreg_grid['hp5'] = {
    'lreg_model__solver': ['saga', 'liblinear'],
    'lreg_model__penalty': [None, 'l1', 'l2'],
    'lreg_model__C': [0.01, 0.1, 1, 10, 100],
    'lreg_model__max_iter': [750, 1000, 1250, 1500],
}

In [None]:
# Random forest search grids. 'raw' is the no-search baseline; hp1-hp4 each add
# one more hyperparameter to the previous grid, hp5 is a narrower grid.
#
# max_features uses the float 1.0 ("all features"). The integer 1 is read by
# scikit-learn as "consider exactly one feature per split", which does not fit
# the 0.25 / 0.5 / 0.75 / 1 fraction sequence.
rfc_grid = {'raw': 'None'}
rfc_grid['hp1'] = {
    'rfc_model__n_estimators': [100, 200, 300, 400],
}
rfc_grid['hp2'] = {
    **rfc_grid['hp1'],
    'rfc_model__max_features': [0.25, 0.5, 0.75, 1.0, 'sqrt'],
}
rfc_grid['hp3'] = {
    **rfc_grid['hp2'],
    'rfc_model__max_depth': range(1, 15, 2),
}
rfc_grid['hp4'] = {
    **rfc_grid['hp3'],
    'rfc_model__min_samples_split': [10, 20, 50, 100],
}
rfc_grid['hp5'] = {
    'rfc_model__n_estimators': [100, 200, 300, 400],
    'rfc_model__max_depth': range(5, 15, 2),
    'rfc_model__min_samples_split': range(16, 25, 2),
}

In [None]:
# LightGBM search grids. 'raw' is the no-search baseline; hp1-hp4 each add one
# more hyperparameter to the previous grid, hp5 is a separate grid.
lgbm_grid = {'raw': 'None'}
lgbm_grid['hp1'] = {
    'lgbm_model__num_leaves': range(11, 72, 20),
}
lgbm_grid['hp2'] = {
    **lgbm_grid['hp1'],
    'lgbm_model__min_data_in_leaf': [20, 50, 100, 300],
}
lgbm_grid['hp3'] = {
    **lgbm_grid['hp2'],
    'lgbm_model__max_depth': [-1, 1, 5, 10],
}
lgbm_grid['hp4'] = {
    **lgbm_grid['hp3'],
    'lgbm_model__learning_rate': [0.01, 0.1, 0.2],
}
lgbm_grid['hp5'] = {
    'lgbm_model__num_leaves': [20, 30, 40],
    'lgbm_model__max_depth': [2, 5, 10, 15, 20],
    'lgbm_model__learning_rate': [0.001, 0.01, 0.1],
    'lgbm_model__n_estimators': [100, 300, 500],
}

In [None]:
# XGBoost search grids. 'raw' is the no-search baseline; hp1-hp4 each add one
# more hyperparameter to the previous grid, hp5 is a separate grid.
#
# Every key must be '<pipeline step>__<parameter>', i.e. 'xgb_model__gamma'.
# A doubled prefix ('xgb_model__xgb_model__gamma') is not a valid parameter
# path and makes the hp3 / hp4 searches fail.
xgb_grid = {'raw': 'None'}
xgb_grid['hp1'] = {
    'xgb_model__n_estimators': [100, 400, 700, 1000],
}
xgb_grid['hp2'] = {
    **xgb_grid['hp1'],
    'xgb_model__max_depth': range(2, 11, 2),
}
xgb_grid['hp3'] = {
    **xgb_grid['hp2'],
    'xgb_model__gamma': [0, 1, 10, 50, 100],
}
xgb_grid['hp4'] = {
    **xgb_grid['hp3'],
    'xgb_model__subsample': [0.6, 0.8, 1],
}
xgb_grid['hp5'] = {
    'xgb_model__n_estimators': [500, 750, 1000],
    'xgb_model__eta': [0.01, 0.1, 0.3, 0.5],
    'xgb_model__gamma': [0, 1, 10, 50, 100],
    'xgb_model__max_depth': [2, 4, 6, 8, 10],
}

In [None]:
# Train every model family on every tuning group and draw one ROC panel per family.
param_dict = {'Logistic Regression': lreg_grid,
              'Random Forest': rfc_grid,
              'LightGBM': lgbm_grid,
              'XGBoost': xgb_grid}

# Explicit dispatch table: a model name without a trainer now fails with a
# KeyError instead of silently being trained as XGBoost.
trainers = {'Logistic Regression': train_logistic_regression,
            'Random Forest': train_random_forest,
            'LightGBM': train_lightgbm,
            'XGBoost': train_xgboost}

# model_dict[model name][tuning group] -> threshold-tuned fitted model
model_dict = {}

preprocessor = create_preprocessing_pipeline(X_train, num_cols, disc_cols, cat_cols)

fig, axs = plt.subplots(nrows=2, ncols=2, squeeze=False, figsize=(10, 8))

axs_raveled = axs.ravel()


for i, (name, params) in enumerate(param_dict.items()):
    train_fn = trainers[name]
    indv_model_dict = {}
    for idx, (key, grid) in enumerate(params.items()):
        trained_model_dict = train_fn(X_train, y_train, X_val, y_val, preprocessor, grid, seed)

        model = trained_model_dict['model']

        indv_model_dict[key] = model

        y_scores = model.predict_proba(X_val)[:, 1]

        # The returned display object is deliberately not bound to a name:
        # assigning it to `display` would shadow IPython's display() for the
        # rest of the notebook session.
        RocCurveDisplay.from_predictions(
            y_val,
            y_scores,
            name=key,
            ax=axs_raveled[i],
            # draw the chance diagonal once per panel, with the last curve
            plot_chance_level=(idx==len(params)-1),
            chance_level_kw={'linestyle': ':'}
        )
    model_dict[name] = indv_model_dict
    axs_raveled[i].set_title(name)
_ = plt.tight_layout(pad=2.0)

In [None]:
# One horizontal bar chart per model family: validation F1 score of each tuning
# group, using the threshold-tuned models' own predict().
fig, ax = plt.subplots(2, 2, figsize=(10, 8))

axs = ax.ravel()

for i, (name, models) in enumerate(model_dict.items()):
    hp_list = list(models)
    f1_list = [f1_score(y_val, fitted.predict(X_val)) for fitted in models.values()]

    df_plot = pd.DataFrame({'hp': hp_list, 'F1 Score': f1_list})

    panel = axs[i]
    panel.barh(df_plot['hp'], df_plot['F1 Score'])
    panel.set_title(name)
    panel.set_ylabel('HP Tuning Group')
    panel.set_xlabel('F1 Score')

fig.suptitle('F1 scores for each model')

_ = plt.tight_layout(pad=2.0)