In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product

from sklearn.model_selection import (train_test_split,
                                     GridSearchCV,
                                     TunedThresholdClassifierCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.frozen import FrozenEstimator
from sklearn.calibration import CalibratedClassifierCV


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imb_Pipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, RocCurveDisplay

In [None]:
# Climb towards the filesystem root until the working directory is the project
# folder, so the relative 'data/...' and 'results/...' paths used below resolve.
start_dir = os.getcwd()
while not os.getcwd().endswith('chest-pain-dissertation'):
    previous_dir = os.getcwd()
    os.chdir('../')
    if os.getcwd() == previous_dir:
        # '..' of the filesystem root is the root itself: without this guard
        # the loop would spin forever when the project folder is not an ancestor.
        os.chdir(start_dir)
        raise FileNotFoundError(
            "Could not find a 'chest-pain-dissertation' directory at or above "
            f"{start_dir}"
        )

print(f"Working directory: {os.getcwd()}")

# Functions

In [None]:
def split_data(X, y, train_size, validation_size, seed):
    """Split features and target into stratified train/validation/test frames.

    The test share is whatever remains after ``train_size`` and
    ``validation_size`` (rounded to two decimals to absorb float noise).

    Parameters
    ----------
    X : DataFrame of features.
    y : named Series holding the target; it is used for stratification and is
        joined back onto each feature frame by index.
    train_size, validation_size : float
        Fractions of the full dataset, e.g. ``0.6`` and ``0.2``.
    seed : random state passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple of three DataFrames ``(training, validation, testing)``, each made of
    the feature columns plus the target column.

    Raises
    ------
    ValueError
        If any of the three resulting fractions is not strictly positive.
    """
    test_fraction = round(1 - (train_size + validation_size), 2)
    if min(train_size, validation_size, test_fraction) <= 0:
        raise ValueError(
            "train_size, validation_size and the remaining test share must all "
            f"be positive; got train_size={train_size}, "
            f"validation_size={validation_size}, test share={test_fraction}"
        )

    # round() rather than int(): 100*0.29 is 28.999999999999996 in floating
    # point, so truncating would report (and previously also use) 28%.
    train_set = round(100*train_size)
    val_set = round(100*validation_size)
    test_set = round(100*test_fraction)

    msg = (f"Splitting data into {train_set}% training set, {val_set}% validation "
           f"set and {test_set}% testing set...")
    print(msg)

    # First carve off the test set, then split the remainder into train/validation.
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_fraction, stratify=y, random_state=seed
    )
    # Validation share expressed relative to the train+validation remainder
    val_size = validation_size/(train_size+validation_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size, stratify=y_train_val, random_state=seed
    )

    # Re-attach the target to each feature frame (index-aligned join)
    training_data = X_train.join(y_train)
    validation_data = X_val.join(y_val)
    testing_data = X_test.join(y_test)

    return training_data, validation_data, testing_data

In [None]:
def create_preprocessing_pipeline(X_train, num_cols, disc_cols, cat_cols, removed_features=None):
    """Build the ColumnTransformer that preprocesses each feature group.

    * ``num_cols``  -> median imputation followed by standard scaling
    * ``disc_cols`` -> median imputation only
    * ``cat_cols``  -> one-hot encoding (``handle_unknown='infrequent_if_exist'``)

    Parameters
    ----------
    X_train : DataFrame whose columns are used to validate the feature names.
    num_cols, disc_cols, cat_cols : sequences of column names. They are copied
        internally, so the caller's lists are never modified.
    removed_features : optional iterable of column names to exclude; each must
        appear in one of the three groups.

    Returns
    -------
    ColumnTransformer (unfitted) containing one transformer per non-empty group.

    Raises
    ------
    ValueError
        If a removed feature is in none of the groups, if a listed column is
        missing from ``X_train``, or if every group ends up empty.
    """
    # Copy first: removing entries from the caller's lists would silently shrink
    # the notebook-level column lists and make a second call fail.
    num_cols, disc_cols, cat_cols = list(num_cols), list(disc_cols), list(cat_cols)

    for feature in (removed_features or []):
        for group in (num_cols, disc_cols, cat_cols):
            if feature in group:
                group.remove(feature)
                break
        else:
            print(f"Feature {feature} is not valid.")
            raise ValueError(f"Feature {feature} is not valid. "
                             f"Feature must be in {X_train.columns.values.tolist()}")

    available = set(X_train.columns.values.tolist())
    # List comprehension (not a set difference) keeps the report order stable
    invalid_features = [col for col in num_cols+disc_cols+cat_cols if col not in available]
    if len(invalid_features) != 0:
        msg = f"The following features are not in the dataframe: {invalid_features}"
        print(msg)
        raise ValueError(msg)

    impute_and_scale = Pipeline([
        ("numeric_impute", SimpleImputer(strategy="median")),
        ("numeric_transformation", StandardScaler())
    ])
    binary_and_discrete_impute = Pipeline([
        ("numeric_impute", SimpleImputer(strategy="median"))
    ])
    # Encoding only: this branch has no imputation step
    one_hot_encode = Pipeline([
        ("categorical_transformation", OneHotEncoder(handle_unknown='infrequent_if_exist'))
    ])

    # Only register a transformer for groups that still have columns
    candidates = [
        ("numeric_preprocessing", impute_and_scale, num_cols),
        ("binary_and_discrete_preprocessing", binary_and_discrete_impute, disc_cols),
        ("categorical_preprocessing", one_hot_encode, cat_cols),
    ]
    transformers = [entry for entry in candidates if len(entry[2]) > 0]

    if len(transformers) == 0:
        raise ValueError("No transformaers to create pipeline")

    return ColumnTransformer(transformers=transformers)

In [None]:
def resampling_technique(name, preprocessor, model, model_name, seed):
    """Assemble preprocessing, optional resampling and the model into a pipeline.

    ``name`` selects the resampling scheme: 'Under', 'Over', 'Under and over'
    or 'Over and under'. Any other value (e.g. 'No resampling') yields a plain
    scikit-learn Pipeline with no resampling step. The final step is named
    ``model_name`` and ``seed`` is the random state of every sampler.
    """
    # Sampler step names, in execution order, for each recognised scheme
    sampler_sequences = {
        'Under': ('under_sampling',),
        'Over': ('over_sampling',),
        'Under and over': ('under_sampling', 'over_sampling'),
        'Over and under': ('over_sampling', 'under_sampling'),
    }

    def make_sampler(step_name):
        # A fresh sampler instance is created for every pipeline
        if step_name == 'under_sampling':
            return RandomUnderSampler(random_state=seed)
        return SMOTE(random_state=seed)

    first_step = ('pre_processing', preprocessor)
    last_step = (model_name, model)

    if name not in sampler_sequences:
        # Fallback: no resampling
        return Pipeline([first_step, last_step])

    sampler_steps = [(step, make_sampler(step)) for step in sampler_sequences[name]]
    return imb_Pipeline(steps=[first_step, *sampler_steps, last_step])

In [None]:
def tune_hyperparameters(X, y, model, params, model_desc, model_name, preprocsessing_pipe, resampling_tech, k_fold, seed):
    """Grid-search a model inside its preprocessing/resampling pipeline.

    Parameters
    ----------
    X, y : training features and target passed to ``GridSearchCV.fit``.
    model : unfitted estimator placed as the last pipeline step.
    params : parameter grid; keys must be prefixed with ``model_desc + '__'``.
    model_desc : name of the model step inside the pipeline.
    model_name : human-readable name, used only in the error message.
    preprocsessing_pipe : preprocessing transformer used as the first step.
    resampling_tech : resampling scheme name understood by ``resampling_technique``.
    k_fold : ``cv`` argument of the grid search.
    seed : random state for the resampling steps.

    Returns
    -------
    dict with keys ``'model'`` (best estimator refit on all of ``X``),
    ``'params'`` (best parameter set) and ``'scores'`` (best mean CV ROC AUC).

    Raises
    ------
    Exception
        Whatever ``GridSearchCV.fit`` raises, re-raised after being printed.
    """
    import warnings

    # set up the pipeline
    pipe = resampling_technique(resampling_tech, preprocsessing_pipe, model, model_desc, seed)

    # set up grid search object
    # NOTE: error_score=0.0 means a parameter combination whose fit fails is
    # scored 0.0 instead of aborting the search.
    pipe_cv = GridSearchCV(pipe,
                           param_grid=params,
                           scoring='roc_auc',
                           cv=k_fold,
                           n_jobs=-1,
                           verbose=0,
                           error_score=0.0)

    # Silence warnings only for the duration of the search; a bare
    # warnings.filterwarnings('ignore') would mute every later warning in the
    # whole kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            pipe_cv.fit(X, y)
        except Exception as e:
            msg = ("The following error occured "
                   f"while tuning {model_name}: {e}")
            print(msg)
            raise  # bare raise keeps the original traceback

    result = {
        'model': pipe_cv.best_estimator_,
        'params': pipe_cv.best_params_,
        'scores': pipe_cv.best_score_
    }

    return result

In [None]:
def calibrate_probabilities(model, X_val, y_val):
    """Calibrate an already-fitted model's probabilities with isotonic regression.

    The model is wrapped in ``FrozenEstimator`` before being handed to
    ``CalibratedClassifierCV``, and the calibrator is fitted on the validation
    data. Returns the fitted calibrated classifier.
    """
    calibrator = CalibratedClassifierCV(FrozenEstimator(model), method='isotonic')
    # fit() returns the fitted calibrator itself
    return calibrator.fit(X_val, y_val)

In [None]:
def tune_threshold(model, X_val, y_val):
    """Pick the decision threshold that maximises F1 on the validation data.

    ``model`` is treated as already fitted (``cv="prefit"``, ``refit=False``);
    the scores obtained for the candidate thresholds are kept on the returned
    object (``store_cv_results=True``). Returns the fitted
    ``TunedThresholdClassifierCV`` wrapper.
    """
    threshold_settings = {
        "scoring": "f1",
        "cv": "prefit",
        "refit": False,
        "store_cv_results": True,
    }
    threshold_search = TunedThresholdClassifierCV(model, **threshold_settings)
    # fit() returns the fitted wrapper itself
    return threshold_search.fit(X_val, y_val)

In [None]:
def get_validation_auc(model: Pipeline, X_val, y_val) -> float:
    """Return the ROC AUC of ``model`` on the validation data.

    The score is computed from the second column of ``predict_proba``.
    """
    class_probabilities = model.predict_proba(X_val)
    return roc_auc_score(y_val, class_probabilities[:, 1])

In [None]:
def train_logistic_regression(X_train, y_train, X_val, y_val, preprocessor, seed, resampling_name):
    """Tune, threshold-tune and score a logistic regression pipeline.

    Hyperparameters are grid-searched with 5-fold CV on the training data; the
    decision threshold is then tuned on the validation data, and the validation
    ROC AUC replaces the CV score.

    Parameters
    ----------
    X_train, y_train : data used for the hyperparameter search.
    X_val, y_val : data used for threshold tuning and for the reported AUC.
    preprocessor : preprocessing transformer placed first in the pipeline.
    seed : random state for the model and the resampling steps.
    resampling_name : resampling scheme name forwarded to the pipeline builder.

    Returns
    -------
    dict with keys ``'model'`` (threshold-tuned classifier), ``'params'`` (best
    grid-search parameters) and ``'scores'`` (validation ROC AUC). When the
    unpenalised model wins, ``'params'`` has no ``lreg_model__C`` entry.
    """
    msg = f"Training Logistic Regression model using {resampling_name} resampling"
    print(msg)

    model = LogisticRegression(random_state=seed)

    max_iter_grid = [750, 1000, 1250, 1500]
    # Two sub-grids instead of one full product:
    #  * 'liblinear' does not support penalty=None, so that pairing always
    #    failed and was only hidden by the grid search's error_score;
    #  * C has no effect without a penalty, so it is not swept there.
    params = [
        {
            "lreg_model__solver": ['saga'],
            "lreg_model__penalty": [None],
            "lreg_model__max_iter": max_iter_grid,
        },
        {
            "lreg_model__solver": ['saga', 'liblinear'],
            "lreg_model__penalty": ['l1', 'l2'],
            "lreg_model__C": [0.01, 0.1, 1, 10, 100],
            "lreg_model__max_iter": max_iter_grid,
        },
    ]
    trained_model = tune_hyperparameters(X_train, y_train, model, params, "lreg_model", "Logistic Regression", preprocessor, resampling_name, 5, seed)

    # Swap the best pipeline for its threshold-tuned wrapper
    tuned_model = tune_threshold(trained_model['model'], X_val, y_val)
    trained_model['model'] = tuned_model

    # Report validation AUC rather than the cross-validation score
    trained_model['scores'] = get_validation_auc(tuned_model, X_val, y_val)

    return trained_model

In [None]:
def train_random_forest(X_train, y_train, X_val, y_val, preprocessor, seed, resampling_name):
    """Tune, threshold-tune and score a random forest pipeline.

    Grid-searches the forest with 5-fold CV on the training data, tunes the
    decision threshold on the validation data and returns a dict with keys
    ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid-search
    parameters) and ``'scores'`` (validation ROC AUC).
    """
    print(f"Training Random Forest model using the {resampling_name} resampling")

    forest = RandomForestClassifier(
        criterion="gini",
        max_features="sqrt",
        random_state=seed,
    )

    search_space = {
        "rfc_model__n_estimators": [100, 200, 300, 400],
        "rfc_model__max_depth": range(5, 15, 2),          # 5, 7, 9, 11, 13
        "rfc_model__min_samples_split": range(16, 25, 2)  # 16, 18, 20, 22, 24
    }
    outcome = tune_hyperparameters(
        X_train, y_train, forest, search_space,
        "rfc_model", "Random Forest",
        preprocessor, resampling_name, 5, seed,
    )

    # Replace the best pipeline with its threshold-tuned wrapper and the CV
    # score with the validation AUC.
    outcome['model'] = tune_threshold(outcome['model'], X_val, y_val)
    outcome['scores'] = get_validation_auc(outcome['model'], X_val, y_val)

    return outcome

In [None]:
def train_xgboost(X_train, y_train, X_val, y_val, preprocessor, seed, resampling_name):
    """Tune, threshold-tune and score an XGBoost pipeline.

    Grid-searches the booster with 5-fold CV on the training data, tunes the
    decision threshold on the validation data and returns a dict with keys
    ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid-search
    parameters) and ``'scores'`` (validation ROC AUC).
    """
    print(f"Training XGBoost model using the {resampling_name} resampling")

    booster = XGBClassifier(objective='binary:logistic', seed=seed)

    search_space = {
        "xgb_model__n_estimators": [500, 750, 1000],
        "xgb_model__eta": [0.01, 0.1, 0.3, 0.5],
        "xgb_model__gamma": [0, 1, 10, 50, 100],
        "xgb_model__max_depth": [2, 4, 6, 8, 10]
    }
    outcome = tune_hyperparameters(
        X_train, y_train, booster, search_space,
        "xgb_model", "XGBoost",
        preprocessor, resampling_name, 5, seed,
    )

    # Replace the best pipeline with its threshold-tuned wrapper and the CV
    # score with the validation AUC.
    outcome['model'] = tune_threshold(outcome['model'], X_val, y_val)
    outcome['scores'] = get_validation_auc(outcome['model'], X_val, y_val)

    return outcome

In [None]:
def train_lightgbm(X_train, y_train, X_val, y_val, preprocessor, seed, resampling_name):
    """Tune, threshold-tune and score a LightGBM pipeline.

    Grid-searches the booster with 5-fold CV on the training data, tunes the
    decision threshold on the validation data and returns a dict with keys
    ``'model'`` (threshold-tuned classifier), ``'params'`` (best grid-search
    parameters) and ``'scores'`` (validation ROC AUC).
    """
    print(f"Training LightGBM model using the {resampling_name} resampling")

    booster = LGBMClassifier(objective='binary', random_state=seed)

    search_space = {
        "lgbm_model__num_leaves": [20, 30, 40],
        "lgbm_model__max_depth": [2, 5, 10, 15, 20],
        "lgbm_model__learning_rate": [0.001, 0.01, 0.1],
        "lgbm_model__n_estimators": [100, 300, 500]
    }
    outcome = tune_hyperparameters(
        X_train, y_train, booster, search_space,
        "lgbm_model", "LightGBM",
        preprocessor, resampling_name, 5, seed,
    )

    # Replace the best pipeline with its threshold-tuned wrapper and the CV
    # score with the validation AUC.
    outcome['model'] = tune_threshold(outcome['model'], X_val, y_val)
    outcome['scores'] = get_validation_auc(outcome['model'], X_val, y_val)

    return outcome

# Experiments

In [None]:
# Load the processed dataset, then separate the outcome from the predictors.
# 'nhs_number' is excluded from the predictors along with the outcome column.
df = pd.read_csv('data/clean/processed_dataset.csv')
y = df['subsequent_mi_30days_diagnosis']
X = df.drop(columns=['nhs_number', 'subsequent_mi_30days_diagnosis']).copy()

In [None]:
# Random seed passed to the data split and to every training function below
seed = 42

In [None]:
# 60% train / 20% validation; the remaining test frame is discarded here.
train_set, val_set, _ = split_data(
    X,
    y,
    train_size=0.6,
    validation_size=0.2,
    seed=seed,
)

In [None]:
# Separate the outcome column from the predictors for each partition.
target_col = 'subsequent_mi_30days_diagnosis'

X_train, y_train = train_set.drop(columns=[target_col]).copy(), train_set[target_col]
X_val, y_val = val_set.drop(columns=[target_col]).copy(), val_set[target_col]

In [None]:
# Feature groups handed to create_preprocessing_pipeline; the group a column is
# listed in decides how it is preprocessed.

# Numeric features: median-imputed, then standard-scaled.
# NOTE(review): the two '*_indicator' columns sit in this scaled group --
# confirm they are not binary flags that belong in disc_cols.
num_cols = ['acute_morbidity_indicator', 'ae_duration_hrs', 'max_tnt_24hr_int',
            'min_egfr_24hr_int', 'first_tnt_24hr_int', 'first_egfr_24hr_int',
            'mood_and_anxiety_disorders_indicator', 'tnt_egfr_interaction',
            'ip_duration_days', 'total_duration_days', 'age', 'tnt_change', 'egfr_change']
# Binary / discrete features: median-imputed only (no scaling).
disc_cols = ['ihd_mi', 'cc_heart_failure', 'cc_myocardial_infarction',
             'imd_decile_19', 'qof_diabetes', 'qof_ht', 'ht', 'qof_chd',
             'ihd_nonmi', 'af', 'arrhythmia_other', 'stroke', 'hf', 'vasc_dis',
             'cardio_other', 'qof_depression', 'qof_mental', 'N_tnt_24hr', 'N_egfr_24hr',
             'mi_diagnosis_ae_discharge', 'meds_total', 'meds_antip', 'meds_angio',
             'meds_betab', 'meds_total_discharge', 'transfered_dv', 'mi_diagnosis_code',
             'chd_diagnosis_code', 'meds_total_more_than_10',
             'tnt_rule_in', 'age_threshold', 'ae_target', 'egfr_rule_in']
# Categorical features: one-hot encoded.
cat_cols = ['ethnicity', 'sex', 'smoking', 'ae_provider', 'ip_provider',
            'site_ae', 'site_ip', 'derived_trust_catchment',
            'departure_season', 'diagnosis_description']

In [None]:
# Compare the resampling schemes for every model family: one ROC panel per
# model, one curve per scheme, all evaluated on the validation set.
sampling_names = ['No resampling', 'Under', 'Over', 'Under and over', 'Over and under']

# Panel title paired with the function that trains that model family
model_trainers = [
    ('Logistic Regression', train_logistic_regression),
    ('Random Forest', train_random_forest),
    ('XGBoost', train_xgboost),
    ('LightGBM', train_lightgbm),
]

preprocessor = create_preprocessing_pipeline(X_train, num_cols, disc_cols, cat_cols)

fig, axs = plt.subplots(nrows=2, ncols=2, squeeze=False, figsize=(10, 10))

for ax, (model_name, train_model) in zip(axs.ravel(), model_trainers):
    for idx, name in enumerate(sampling_names):
        model_dict = train_model(X_train, y_train, X_val, y_val, preprocessor, seed, name)
        model = model_dict['model']

        y_scores = model.predict_proba(X_val)[:, 1]

        # The result is deliberately not bound to `display`: that name would
        # shadow IPython's display() helper for the rest of the notebook.
        RocCurveDisplay.from_predictions(
            y_val,
            y_scores,
            name=name,
            ax=ax,
            # draw the chance diagonal once per panel, with the last curve
            plot_chance_level=(idx==len(sampling_names)-1),
            chance_level_kw={'linestyle': ':'}
        )
    ax.set_title(model_name)

fig.tight_layout(pad=2.0)

# Create the output folder first so a missing directory cannot discard the
# results of the whole training run at the very last step.
results_path = 'results/experimentation_results/imbalanced_class_techniques_results.png'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
fig.savefig(results_path)
plt.show()
plt.close(fig)