# Exploratory Data Analysis and Cleaning

In [None]:
## Standard Imports and Setting Adjustments
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# Remove the row cap so long Series/DataFrames are shown in full
pd.set_option("display.max_rows", None)

## Set Random Seeds Globally
## Seeds both Python's `random` module and NumPy's global generator with 42
import random
random.seed(42)
np.random.seed(42)

In [None]:
## Load the Raw Dataset
mi_data = pd.read_csv(
    filepath_or_buffer='C:/Users/fillm/Desktop/DSMS 691/Dataset/Myocardial infarction complications Database.csv'
)

# Recode ZSN_A so it can be treated as ordinal rather than categorical:
# code 3 is folded into code 2, which then stands for either left or right
# ventricular systolic dysfunction
mi_data.loc[mi_data['ZSN_A'] == 3, 'ZSN_A'] = 2

In [None]:
# Review variance of all features
# Displays the variance of each column of mi_data
mi_data.var()

In [None]:
## Load Variable Metadata
## MI_var_info.csv is a hand-made CSV with descriptive information about each variable in the dataset
var_info = pd.read_csv('C:/Users/fillm/Desktop/DSMS 691/Dataset/MI_var_info.csv')

# Flag columns for the measurement type of every variable
# (targets are never counted among the binary predictors)
var_info['binary_vars'] = var_info['Type'].eq('Binary') & var_info['Role'].ne('Target')
var_info['categorical_vars'] = var_info['Type'].eq('Categorical')
var_info['ordinal_vars'] = var_info['Type'].eq('Ordinal')
var_info['num_vars'] = var_info['Type'].isin(['Continuous', 'Integer'])
var_info['cont_vars'] = var_info['Type'].eq('Continuous')


# Flag columns for the period at which each predictor may be used.
# Rows labelled 1..111 are the candidate predictors; each list below holds the
# 0-based row labels that are excluded for that period.
row_labels = var_info.index
predictor_rows = (row_labels >= 1) & (row_labels < 112)
excluded_at_admission = [92, 93, 94, 99, 100, 101, 102, 103, 104]
excluded_at_eod1 = [93, 94, 100, 101, 103, 104]
excluded_at_eod2 = [94, 101, 104]
var_info['pred_admission'] = predictor_rows & ~row_labels.isin(excluded_at_admission)  # usable at admission
var_info['pred_eod1'] = predictor_rows & ~row_labels.isin(excluded_at_eod1)  # usable at end of day 1
var_info['pred_eod2'] = predictor_rows & ~row_labels.isin(excluded_at_eod2)  # usable at end of day 2
var_info['pred_eod3'] = predictor_rows.copy()  # usable at end of day 3

# Shorter column names for convenience
var_info = var_info.rename(columns={'Variable Name': 'varname', 'Role': 'role', 'Type': 'type'})

# Remove variables that are not part of the analysis
# Note: this has to happen after the period flags above, which rely on the row labels
not_analysed = ['ID', 'IBS_NASL', 'S_AD_KBRIG', 'D_AD_KBRIG', 'KFK_BLOOD']
var_info = var_info[~var_info['varname'].isin(not_analysed)]


# Arrays of column names for each data type
binary_vars = np.array(var_info.loc[var_info['binary_vars'], 'varname'])
categorical_vars = np.array(var_info.loc[var_info['categorical_vars'], 'varname'])
ordinal_vars = np.array(var_info.loc[var_info['ordinal_vars'], 'varname'])
num_vars = np.array(var_info.loc[var_info['num_vars'], 'varname'])
continuous_vars = np.array(var_info.loc[var_info['cont_vars'], 'varname'])
targets = np.array(var_info.loc[var_info['role'].eq('Target'), 'varname'])


In [None]:
## Review Tabulations of Variables
## Frequency tables for AGE and SEX, ordered from the least to the most frequent value
mi_data['AGE'].value_counts(sort = True, ascending = True)
mi_data['SEX'].value_counts(sort = True, ascending = True)

In [None]:
## Review Numbers of Missing Values per Variable
mi_data.info(verbose = True, show_counts= True)

## Review Number of Missing Values per Row
# Count the missing features of every record once and reuse the result below
missing_per_row = mi_data.isnull().sum(axis=1)
too_sparse = missing_per_row > 40
too_sparse.sum() # author's note: 7 records have more than 40 variables missing

## Drop Rows with over 40 missing features
# Select the rows by index *label* (not by position) so the drop stays correct
# even if mi_data no longer carries a default 0..n-1 index
to_drop = mi_data.index[too_sparse]
to_drop
mi_data = mi_data.drop(index=to_drop)

In [None]:
## Calculate Percentage of missing values per variable
## Drop variables that are over 60% missing values
# Share of missing values per column, computed in one pass
missing_share = mi_data.isnull().mean()
mostly_missing = missing_share[missing_share > .6]
for column_name, share in mostly_missing.items():
    print('Dropping... ', column_name, round(share, 4))
# Drop all flagged columns in a single operation instead of copying the frame per column
mi_data = mi_data.drop(columns=mostly_missing.index)
        


In [None]:
## Keep Only Target Variables Planning on Analyzing
# All other complication columns are removed in one call
mi_data = mi_data.drop(
    columns=[
        'FIBR_PREDS',
        'PREDS_TAH',
        'JELUD_TAH',
        'FIBR_JELUD',
        'A_V_BLOK',
        'DRESSLER',
        'REC_IM',
        'P_IM_STEN',
        'LET_IS',
    ]
)


### Note: the three remaining targets to be analyzed are OTEK_LANC, RAZRIV, and ZSN
Chronic heart failure : Target Variable Name = 'ZSN'

Myocardial rupture : Target Variable Name = 'RAZRIV'

Pulmonary edema : Target Variable Name = 'OTEK_LANC'

In [None]:
## Review Imbalance of Target Variables
# var_info may still list targets whose columns were dropped from mi_data
# earlier, so only tabulate the targets that are still present
remaining_targets = [
    name for name in var_info[var_info['role'] == 'Target']['varname']
    if name in mi_data.columns
]
for target_name in remaining_targets:
    mi_data[target_name].value_counts(normalize = True)

In [None]:
#Review Scatterplot Matrix for Continuous Vars
import matplotlib as mpl
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Pairwise scatter plots (histograms on the diagonal) of the continuous columns only
scatter_matrix(frame=mi_data[continuous_vars], figsize=(24, 12))


In [None]:
## Separate Features and Targets
from sklearn.model_selection import train_test_split

X_all = mi_data.drop(columns=['ZSN', 'RAZRIV', 'OTEK_LANC'])
# Each label frame keeps ID next to its target column
y_ZSN = mi_data[['ZSN', 'ID']]
y_RAZRIV = mi_data[['RAZRIV', 'ID']]
y_OTEK_LANC = mi_data[['OTEK_LANC', 'ID']]


## Split Data into Training, Validation and Test Sets
def _split_70_15_15(features, labels, target):
    """Split into 70% train, 15% validation and 15% test, stratified by `target`.

    Stratifying keeps the target proportions equal across the three sets, which
    helps with the severe class imbalance. Returns, in order: X_train, X_temp,
    X_val, X_test, y_train, y_temp, y_val, y_test (the *_temp frames are the
    30% hold-out before it is halved).
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        features, labels, test_size=0.3, stratify=labels[target], random_state=42)
    X_valid, X_final, y_valid, y_final = train_test_split(
        X_hold, y_hold, test_size=0.5, stratify=y_hold[target], random_state=42)
    return X_fit, X_hold, X_valid, X_final, y_fit, y_hold, y_valid, y_final


#ZSN Data Splits
(X_train_ZSN, X_temp_ZSN, X_val_ZSN, X_test_ZSN,
 y_train_ZSN, y_temp_ZSN, y_val_ZSN, y_test_ZSN) = _split_70_15_15(X_all, y_ZSN, 'ZSN')

#RAZRIV Data Splits
(X_train_RAZRIV, X_temp_RAZRIV, X_val_RAZRIV, X_test_RAZRIV,
 y_train_RAZRIV, y_temp_RAZRIV, y_val_RAZRIV, y_test_RAZRIV) = _split_70_15_15(X_all, y_RAZRIV, 'RAZRIV')

#OTEK_LANC Data Splits
(X_train_OTEK_LANC, X_temp_OTEK_LANC, X_val_OTEK_LANC, X_test_OTEK_LANC,
 y_train_OTEK_LANC, y_temp_OTEK_LANC, y_val_OTEK_LANC, y_test_OTEK_LANC) = _split_70_15_15(X_all, y_OTEK_LANC, 'OTEK_LANC')




In [None]:
### Make Column Transformer ###

# Imports
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Numeric vars: fill gaps with the column mean, then standardise
num_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

# Ordinal vars: fill gaps with the most frequent value, then ordinal-encode
# (categories not seen during fit are encoded as -1)
ordinal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# Categorical vars: fill gaps with the most frequent value, then one-hot encode
# (the first category of every feature is dropped)
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop='first', handle_unknown="ignore"),
)

# Binary vars: only fill gaps with the most frequent value
binary_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"))

## Create Full Pipeline
# Columns not named in any list pass through untouched, and output names carry no step prefix
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_vars),
        ("cat", categorical_pipeline, categorical_vars),
        ("ord", ordinal_pipeline, ordinal_vars),
        ("bin", binary_pipeline, binary_vars),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

## Define Feature Lists to Make Model-Specific Versions of Pipelines for use in Model Deployment
## Note : These variable lists below were added after feature selection, and are lists of selected features
## Naming: <type>_vars_<target>_deployment feeds the full deployment pipeline for that target,
## <type>_vars_<target>_admissions_deployment feeds the corresponding admission-time pipeline
num_vars_ZSN_deployment = ['AGE', 'NA_R_2_n', 'NA_R_3_n']
num_vars_RAZRIV_deployment = ['AGE']
num_vars_OTEK_LANC_deployment = ['AGE', 'ROE', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n']
binary_vars_ZSN_deployment = ['SEX','nr_04','endocr_01','zab_leg_01','K_SH_POST','MP_TP_POST','ritm_ecg_p_02','n_r_ecg_p_06']           
binary_vars_RAZRIV_deployment = ['SEX','SIM_GIPERT','endocr_02','ritm_ecg_p_01','n_p_ecg_p_03','ASP_S_n']
binary_vars_OTEK_LANC_deployment = ['SEX','nr_04','endocr_01','zab_leg_02','O_L_POST','ritm_ecg_p_01','ritm_ecg_p_02','ritm_ecg_p_07','n_r_ecg_p_06','NA_KB','NITR_S']
ordinal_vars_ZSN_deployment = ['ZSN_A','R_AB_2_n','R_AB_3_n','NOT_NA_1_n'] 
ordinal_vars_RAZRIV_deployment = ['INF_ANAM','GB','DLIT_AG','TIME_B_S','NOT_NA_1_n']
ordinal_vars_OTEK_LANC_deployment = ['INF_ANAM','STENOK_AN','FK_STENOK','GB','DLIT_AG','ZSN_A','R_AB_1_n','R_AB_2_n','R_AB_3_n']
categorical_vars_ZSN_deployment = ['ant_im', 'lat_im'] # keep 'ant_im_1.0','ant_im_4.0','lat_im_2.0','lat_im_3.0' after encoded
categorical_vars_RAZRIV_deployment = ['ant_im', 'lat_im'] # keep 'ant_im_1.0','ant_im_4.0','lat_im_2.0','lat_im_3.0' after encoded
#categorical_vars_OTEK_LANC_deployment = [] none
# Admission-time lists: same layout as above, restricted to fewer numeric/ordinal features
num_vars_ZSN_admissions_deployment = ['AGE']
num_vars_RAZRIV_admissions_deployment = ['AGE']
num_vars_OTEK_LANC_admissions_deployment = ['AGE', 'ROE']
binary_vars_ZSN_admissions_deployment = ['SEX', 'nr_04', 'endocr_01', 'zab_leg_01', 'K_SH_POST', 'MP_TP_POST', 'ritm_ecg_p_02', 'n_r_ecg_p_06']
binary_vars_RAZRIV_admissions_deployment = ['SEX', 'SIM_GIPERT', 'endocr_02', 'ritm_ecg_p_01', 'n_p_ecg_p_03', 'ASP_S_n']
binary_vars_OTEK_LANC_admissions_deployment = ['SEX', 'nr_04', 'endocr_01', 'zab_leg_02', 'O_L_POST', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_07', 'n_r_ecg_p_06', 'NA_KB', 'NITR_S']
ordinal_vars_ZSN_admissions_deployment = ['ZSN_A']
ordinal_vars_RAZRIV_admissions_deployment = ['INF_ANAM', 'GB', 'DLIT_AG', 'TIME_B_S']
ordinal_vars_OTEK_LANC_admissions_deployment = ['INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'GB', 'DLIT_AG', 'ZSN_A']
categorical_vars_ZSN_admissions_deployment = ['ant_im', 'lat_im']
categorical_vars_RAZRIV_admissions_deployment = ['ant_im', 'lat_im']
#categorical_vars_OTEK_LANC_admissions_deployment = [] none

## Create Model-Specific Versions of Pipelines for use in Model Deployment Later
def _deployment_transformer(num_cols, ord_cols, bin_cols, cat_cols=None):
    """Build an unfitted ColumnTransformer restricted to the given feature lists.

    Steps are added in the order num, cat, ord, bin; the "cat" step is left
    out when `cat_cols` is None (OTEK_LANC has no categorical features).
    Unlisted columns pass through and output names carry no step prefix.
    """
    steps = [("num", num_pipeline, num_cols)]
    if cat_cols is not None:
        steps.append(("cat", categorical_pipeline, cat_cols))
    steps.append(("ord", ordinal_pipeline, ord_cols))
    steps.append(("bin", binary_pipeline, bin_cols))
    return ColumnTransformer(steps, remainder='passthrough', verbose_feature_names_out=False)


# Pipelines for the models that use every selected feature
full_pipeline_ZSN_deployment = _deployment_transformer(
    num_vars_ZSN_deployment,
    ordinal_vars_ZSN_deployment,
    binary_vars_ZSN_deployment,
    cat_cols=categorical_vars_ZSN_deployment,
)

full_pipeline_RAZRIV_deployment = _deployment_transformer(
    num_vars_RAZRIV_deployment,
    ordinal_vars_RAZRIV_deployment,
    binary_vars_RAZRIV_deployment,
    cat_cols=categorical_vars_RAZRIV_deployment,
)

full_pipeline_OTEK_LANC_deployment = _deployment_transformer(
    num_vars_OTEK_LANC_deployment,
    ordinal_vars_OTEK_LANC_deployment,
    binary_vars_OTEK_LANC_deployment,
)

# Pipelines for the admission-time models
full_pipeline_ZSN_admissions_deployment = _deployment_transformer(
    num_vars_ZSN_admissions_deployment,
    ordinal_vars_ZSN_admissions_deployment,
    binary_vars_ZSN_admissions_deployment,
    cat_cols=categorical_vars_ZSN_admissions_deployment,
)

full_pipeline_RAZRIV_admissions_deployment = _deployment_transformer(
    num_vars_RAZRIV_admissions_deployment,
    ordinal_vars_RAZRIV_admissions_deployment,
    binary_vars_RAZRIV_admissions_deployment,
    cat_cols=categorical_vars_RAZRIV_admissions_deployment,
)

full_pipeline_OTEK_LANC_admissions_deployment = _deployment_transformer(
    num_vars_OTEK_LANC_admissions_deployment,
    ordinal_vars_OTEK_LANC_admissions_deployment,
    binary_vars_OTEK_LANC_admissions_deployment,
)

## Create Subsets of features actually used in deployed models
# Column order within each subset is: numeric, categorical (if any), ordinal, binary
X_ZSN_deployment = X_train_ZSN[[
    *num_vars_ZSN_deployment,
    *categorical_vars_ZSN_deployment,
    *ordinal_vars_ZSN_deployment,
    *binary_vars_ZSN_deployment,
]]
X_ZSN_admissions_deployment = X_train_ZSN[[
    *num_vars_ZSN_admissions_deployment,
    *categorical_vars_ZSN_admissions_deployment,
    *ordinal_vars_ZSN_admissions_deployment,
    *binary_vars_ZSN_admissions_deployment,
]]
X_RAZRIV_deployment = X_train_RAZRIV[[
    *num_vars_RAZRIV_deployment,
    *categorical_vars_RAZRIV_deployment,
    *ordinal_vars_RAZRIV_deployment,
    *binary_vars_RAZRIV_deployment,
]]
X_RAZRIV_admissions_deployment = X_train_RAZRIV[[
    *num_vars_RAZRIV_admissions_deployment,
    *categorical_vars_RAZRIV_admissions_deployment,
    *ordinal_vars_RAZRIV_admissions_deployment,
    *binary_vars_RAZRIV_admissions_deployment,
]]
X_OTEK_LANC_deployment = X_train_OTEK_LANC[[
    *num_vars_OTEK_LANC_deployment,
    *ordinal_vars_OTEK_LANC_deployment,
    *binary_vars_OTEK_LANC_deployment,
]]
X_OTEK_LANC_admissions_deployment = X_train_OTEK_LANC[[
    *num_vars_OTEK_LANC_admissions_deployment,
    *ordinal_vars_OTEK_LANC_admissions_deployment,
    *binary_vars_OTEK_LANC_admissions_deployment,
]]


## Save out fit pipeline objects to import later for use in model deployment
import joblib

# (pipeline, training subset it is fitted on, file it is written to)
deployment_jobs = [
    (full_pipeline_ZSN_deployment, X_ZSN_deployment, 'full_pipeline_ZSN_deployment.pkl'),
    (full_pipeline_RAZRIV_deployment, X_RAZRIV_deployment, 'full_pipeline_RAZRIV_deployment.pkl'),
    (full_pipeline_OTEK_LANC_deployment, X_OTEK_LANC_deployment, 'full_pipeline_OTEK_LANC_deployment.pkl'),
    (full_pipeline_ZSN_admissions_deployment, X_ZSN_admissions_deployment, 'full_pipeline_ZSN_admissions_deployment.pkl'),
    (full_pipeline_RAZRIV_admissions_deployment, X_RAZRIV_admissions_deployment, 'full_pipeline_RAZRIV_admissions_deployment.pkl'),
    (full_pipeline_OTEK_LANC_admissions_deployment, X_OTEK_LANC_admissions_deployment, 'full_pipeline_OTEK_LANC_admissions_deployment.pkl'),
]

# Fit every pipeline first ...
for pipeline, training_subset, _ in deployment_jobs:
    pipeline.fit(training_subset)

# ... then persist each fitted pipeline
for pipeline, _, pickle_name in deployment_jobs:
    joblib.dump(pipeline, pickle_name)

## Transform All Features Datasets and Restore feature Names
def _fit_and_transform_splits(train, val, test):
    """Fit `full_pipeline` on `train` and transform all three splits.

    Returns (train, val, test) as DataFrames whose columns are the feature
    names reported by the fitted pipeline. The returned frames carry a fresh
    default index.
    """
    train_values = full_pipeline.fit_transform(train)
    val_values = full_pipeline.transform(val)
    test_values = full_pipeline.transform(test)
    names = full_pipeline.get_feature_names_out()
    return tuple(
        pd.DataFrame(values, columns=names)
        for values in (train_values, val_values, test_values)
    )


# The pipeline is re-fitted on each target's own training split
X_train_ZSN, X_val_ZSN, X_test_ZSN = _fit_and_transform_splits(
    X_train_ZSN, X_val_ZSN, X_test_ZSN)

X_train_RAZRIV, X_val_RAZRIV, X_test_RAZRIV = _fit_and_transform_splits(
    X_train_RAZRIV, X_val_RAZRIV, X_test_RAZRIV)

X_train_OTEK_LANC, X_val_OTEK_LANC, X_test_OTEK_LANC = _fit_and_transform_splits(
    X_train_OTEK_LANC, X_val_OTEK_LANC, X_test_OTEK_LANC)

# Feature names of the pipeline as last fitted (on the OTEK_LANC training split)
features = full_pipeline.get_feature_names_out()



In [None]:
## Ensure All Variables are now Non-Missing and confirm all features are of float datatype
## Prints the non-null count and dtype of every column of the transformed ZSN training features
X_train_ZSN.info(verbose = True, show_counts= True)

In [None]:
##Create Datasets that are combined Training and Validation Data to be Used for Feature Selection
def _feature_selection_frame(train_features, val_features, labels):
    """Stack training and validation rows, then attach the labels by matching on ID."""
    stacked = pd.concat([train_features, val_features], ignore_index=True)
    return stacked.merge(labels, on='ID', how='left')


# Training and validation rows may both be used for feature selection; test rows are left out
ZSN_FS = _feature_selection_frame(X_train_ZSN, X_val_ZSN, y_ZSN)
RAZRIV_FS = _feature_selection_frame(X_train_RAZRIV, X_val_RAZRIV, y_RAZRIV)
OTEK_LANC_FS = _feature_selection_frame(X_train_OTEK_LANC, X_val_OTEK_LANC, y_OTEK_LANC)


In [None]:
## Drop ID variable that is simply a unique row identifier from all Dataframes
frame_list = [
    # transformed feature splits
    X_train_ZSN, X_val_ZSN, X_test_ZSN,
    X_train_RAZRIV, X_val_RAZRIV, X_test_RAZRIV,
    X_train_OTEK_LANC, X_val_OTEK_LANC, X_test_OTEK_LANC,
    # label splits
    y_train_ZSN, y_val_ZSN, y_test_ZSN,
    y_train_RAZRIV, y_val_RAZRIV, y_test_RAZRIV,
    y_train_OTEK_LANC, y_val_OTEK_LANC, y_test_OTEK_LANC,
    # feature-selection frames
    ZSN_FS, RAZRIV_FS, OTEK_LANC_FS,
]
# The frames are modified in place so every existing variable name keeps pointing at the trimmed frame
for frame in frame_list:
    frame.drop(columns='ID', inplace=True)



In [None]:
### Numeric Variable Feature Selection ###

## Review Correlation of Targets and Numeric variables and use for feature selection
corr_level = .075


def _strong_correlations(frame, target, level):
    """Correlation matrix of the numeric vars plus `target`, with weak entries blanked.

    Every coefficient whose absolute value is below `level` is replaced by NaN
    so that the useful features stand out when the matrix is displayed.
    """
    corr = frame[np.append(num_vars, target)].corr()
    return corr.mask(corr.abs() < level)


def _selected_numeric_vars(corr_matrix, target):
    """Names of the features whose correlation with `target` survived the masking."""
    kept = corr_matrix[target].dropna()
    # The target correlates perfectly with itself and is not a feature
    return pd.Series(kept.drop(target).index)


corr_matrix_ZSN = _strong_correlations(ZSN_FS, 'ZSN', corr_level)
corr_matrix_RAZRIV = _strong_correlations(RAZRIV_FS, 'RAZRIV', corr_level)
corr_matrix_OTEK_LANC = _strong_correlations(OTEK_LANC_FS, 'OTEK_LANC', corr_level)

#View Correlation Matrices
corr_matrix_ZSN
corr_matrix_RAZRIV
corr_matrix_OTEK_LANC

# Only predictors with an absolute correlation of at least 'corr_level' are kept
# Series that store Numeric Vars to use in each Respective Model
num_vars_ZSN = _selected_numeric_vars(corr_matrix_ZSN, 'ZSN')
num_vars_RAZRIV = _selected_numeric_vars(corr_matrix_RAZRIV, 'RAZRIV')
num_vars_OTEK_LANC = _selected_numeric_vars(corr_matrix_OTEK_LANC, 'OTEK_LANC')


In [None]:
# Review Summary Statistics for Numeric Variables
# Each call displays the descriptive statistics of one column of mi_data
mi_data['AGE'].describe()
mi_data['NA_R_1_n'].describe()
mi_data['NA_R_2_n'].describe()
mi_data['NA_R_3_n'].describe()
mi_data['ALT_BLOOD'].describe()
mi_data['NA_BLOOD'].describe()
mi_data['S_AD_ORIT'].describe()
mi_data['D_AD_ORIT'].describe()
mi_data['NOT_NA_2_n'].describe()
mi_data['NOT_NA_3_n'].describe()
mi_data['ROE'].describe()

In [None]:
# Update Binary Variable list to include the new Binary Features Created from One-Hot-Encoding
# NOTE: running this cell more than once appends the names again
new_binary = np.array([
    'ant_im_1.0', 'ant_im_2.0', 'ant_im_3.0', 'ant_im_4.0',
    'lat_im_1.0', 'lat_im_2.0', 'lat_im_3.0', 'lat_im_4.0',
    'inf_im_1.0', 'inf_im_2.0', 'inf_im_3.0', 'inf_im_4.0',
    'post_im_1.0', 'post_im_2.0', 'post_im_3.0', 'post_im_4.0',
])
binary_vars = np.append(binary_vars, new_binary)

In [None]:
### Binary Feature Selection ###

## Assess Chi-Square Tests for Binary Variables and use for feature selection
from scipy.stats import chi2_contingency
alpha_level = .03 #Only keep predictors with p-value less than or equal to specified level


def _chi_square_p_values(frame, target, candidates):
    """Chi-square test of independence between `target` and every candidate feature.

    Returns a float Series of p-values indexed by feature name. Each test is
    run exactly once per feature.
    """
    p_values = [
        chi2_contingency(pd.crosstab(frame[target], frame[name]))[1]
        for name in candidates
    ]
    return pd.Series(p_values, index=list(candidates), dtype=float)


def _significant_names(p_values, alpha):
    """Series of the feature names whose p-value is less than or equal to `alpha`."""
    return pd.Series(p_values.index[(p_values <= alpha).to_numpy()], dtype=object)


# ZSN Binary Features
p_values_binary_ZSN = _chi_square_p_values(ZSN_FS, 'ZSN', binary_vars)
binary_vars_ZSN = _significant_names(p_values_binary_ZSN, alpha_level) #Save out list of significant binary features for ZSN models

# RAZRIV Binary Features
p_values_binary_RAZRIV = _chi_square_p_values(RAZRIV_FS, 'RAZRIV', binary_vars)
binary_vars_RAZRIV = _significant_names(p_values_binary_RAZRIV, alpha_level) #Save out list of significant binary features for RAZRIV models

# OTEK_LANC Binary Features
p_values_binary_OTEK_LANC = _chi_square_p_values(OTEK_LANC_FS, 'OTEK_LANC', binary_vars)
binary_vars_OTEK_LANC = _significant_names(p_values_binary_OTEK_LANC, alpha_level) #Save out list of significant binary features for OTEK_LANC models


In [None]:
### Feature Selection for Ordinal Variables ###
# Use Kendall's Rank Coefficient 
from scipy.stats import kendalltau


def _kendall_tau_p_values(frame, target, candidates):
    """Kendall's tau test between `target` and every candidate feature.

    Missing values are omitted from each test. Returns a float Series of
    p-values indexed by feature name.
    """
    p_values = [
        kendalltau(frame[name], frame[target], nan_policy = 'omit')[1]
        for name in candidates
    ]
    return pd.Series(p_values, index=list(candidates), dtype=float)


def _significant_ordinal_names(p_values, alpha):
    """Series of the feature names whose p-value is less than or equal to `alpha`."""
    return pd.Series(p_values.index[(p_values <= alpha).to_numpy()], dtype=object)


# The threshold `alpha_level` is the one defined for the binary feature selection

# ZSN Ordinal Features
p_values_ordinal_ZSN = _kendall_tau_p_values(ZSN_FS, 'ZSN', ordinal_vars)
ordinal_vars_ZSN = _significant_ordinal_names(p_values_ordinal_ZSN, alpha_level) #Save out list of significant ordinal features for ZSN models

# RAZRIV Ordinal Features
p_values_ordinal_RAZRIV = _kendall_tau_p_values(RAZRIV_FS, 'RAZRIV', ordinal_vars)
ordinal_vars_RAZRIV = _significant_ordinal_names(p_values_ordinal_RAZRIV, alpha_level) #Save out list of significant ordinal features for RAZRIV models

# OTEK_LANC Ordinal Features
p_values_ordinal_OTEK_LANC = _kendall_tau_p_values(OTEK_LANC_FS, 'OTEK_LANC', ordinal_vars)
ordinal_vars_OTEK_LANC = _significant_ordinal_names(p_values_ordinal_OTEK_LANC, alpha_level) #Save out list of significant ordinal features for OTEK_LANC models


# Data Preparation

In [None]:
## Limit Features Datasets to Only Variables Chosen in Feature Selection 
# For every target: stack the selected numeric, binary and ordinal names into
# one Series, then keep only those columns in the train/validation/test frames

# ZSN
vars_ZSN = pd.concat([num_vars_ZSN, binary_vars_ZSN, ordinal_vars_ZSN])
X_train_ZSN, X_val_ZSN, X_test_ZSN = (
    split[vars_ZSN] for split in (X_train_ZSN, X_val_ZSN, X_test_ZSN)
)

# RAZRIV
vars_RAZRIV = pd.concat([num_vars_RAZRIV, binary_vars_RAZRIV, ordinal_vars_RAZRIV])
X_train_RAZRIV, X_val_RAZRIV, X_test_RAZRIV = (
    split[vars_RAZRIV] for split in (X_train_RAZRIV, X_val_RAZRIV, X_test_RAZRIV)
)

# OTEK_LANC
vars_OTEK_LANC = pd.concat([num_vars_OTEK_LANC, binary_vars_OTEK_LANC, ordinal_vars_OTEK_LANC])
X_train_OTEK_LANC, X_val_OTEK_LANC, X_test_OTEK_LANC = (
    split[vars_OTEK_LANC] for split in (X_train_OTEK_LANC, X_val_OTEK_LANC, X_test_OTEK_LANC)
)


In [None]:
## Get Lists of Features that can be used at specified periods
## According to Data Manual, only a subset of features are available upon admission of a patient to the hospital

#List of Subset of Features Usable Upon Admission to Hospital
admission_vars = var_info[var_info['pred_admission'] == True].loc[:, 'varname']
# Add the one-hot-encoded column names that replace the raw categorical variables
admission_vars = pd.concat([admission_vars, pd.Series(['ant_im_1.0','ant_im_2.0','ant_im_3.0','ant_im_4.0','lat_im_1.0','lat_im_2.0','lat_im_3.0','lat_im_4.0','inf_im_1.0','inf_im_2.0','inf_im_3.0','inf_im_4.0','post_im_1.0','post_im_2.0','post_im_3.0','post_im_4.0'])])
# Drop the original (pre-one-hot-encoding) variable names by NAME rather than
# by hard-coded row labels, so the result does not depend on var_info's row order
# NOTE(review): assumes the rows previously dropped via labels 44-47 were exactly these four variables - confirm
admission_vars = admission_vars[~admission_vars.isin(['ant_im', 'lat_im', 'inf_im', 'post_im'])]


In [None]:
## Create Separate Training Data that only includes Features Usable Upon Admission to Hospital for use in "Admission" models
def _admission_columns(selected_vars):
    """List the selected features that are also usable at admission.

    The inner merge keeps the names present in both Series, in the order of
    `admission_vars`. A plain list is returned (instead of `.squeeze()`) so
    that a single overlapping feature still selects a one-column DataFrame
    rather than a 1-D Series.
    """
    overlap = pd.merge(admission_vars.to_frame(), selected_vars.to_frame(), how = 'inner')
    return overlap.iloc[:, 0].tolist()


# Compute each target's admission column list once and reuse it for all three splits
admission_cols_ZSN = _admission_columns(vars_ZSN)
X_train_ZSN_admission = X_train_ZSN[admission_cols_ZSN]
X_test_ZSN_admission = X_test_ZSN[admission_cols_ZSN]
X_val_ZSN_admission = X_val_ZSN[admission_cols_ZSN]

admission_cols_RAZRIV = _admission_columns(vars_RAZRIV)
X_train_RAZRIV_admission = X_train_RAZRIV[admission_cols_RAZRIV]
X_test_RAZRIV_admission = X_test_RAZRIV[admission_cols_RAZRIV]
X_val_RAZRIV_admission = X_val_RAZRIV[admission_cols_RAZRIV]

admission_cols_OTEK_LANC = _admission_columns(vars_OTEK_LANC)
X_train_OTEK_LANC_admission = X_train_OTEK_LANC[admission_cols_OTEK_LANC]
X_test_OTEK_LANC_admission = X_test_OTEK_LANC[admission_cols_OTEK_LANC]
X_val_OTEK_LANC_admission = X_val_OTEK_LANC[admission_cols_OTEK_LANC]


In [None]:
## Confirm that admission model and full (EOD3) models actually have differing numbers of features
## For each target: first the number of selected features also usable at admission,
## then the total number of selected features
len(pd.merge(admission_vars.to_frame(), vars_ZSN.to_frame(), how = 'inner'))
len(vars_ZSN)

len(pd.merge(admission_vars.to_frame(), vars_RAZRIV.to_frame(), how = 'inner'))
len(vars_RAZRIV)

len(pd.merge(admission_vars.to_frame(), vars_OTEK_LANC.to_frame(), how = 'inner'))
len(vars_OTEK_LANC)


# Model Training

In [None]:
## Imports Needed for Model Creation and Tuning
import tensorflow as tf
tf.random.set_seed(42) # set tensorflow seed
import sklearn
from scikeras.wrappers import KerasClassifier #Need to use so that the Keras Sequential Model is compatible with GridSearchCV
from scikeras import wrappers
from tensorflow import keras
# Layers and container used by the make_model_* builder functions
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import GridSearchCV
# Provides compute_class_weight for the class-imbalance weights
from sklearn.utils import class_weight

In [None]:
## Specify Class Weights To Help Address Class Imbalances, Use 'balanced' option
## Use to increase model accuracy with positive predictions
def _balanced_class_weights(labels):
    """Map position 0, 1, ... of the sorted unique labels to its 'balanced' class weight."""
    weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(labels),
        y=labels.values.reshape(-1),
    )
    return dict(enumerate(weights))


class_weights_ZSN = _balanced_class_weights(y_train_ZSN)
class_weights_RAZRIV = _balanced_class_weights(y_train_RAZRIV)
class_weights_OTEK_LANC = _balanced_class_weights(y_train_OTEK_LANC)


In [None]:
## Create Parameter Grids to use for all models
def _param_grid(neurons, dropout_rate, learn_rt):
    """Build a grid-search dictionary; every model is tried with 50 and 100 epochs.

    The `model__` prefix marks the entries that are handed to the model-building function.
    """
    return {
        'model__neurons' : list(neurons),
        'model__dropout_rate' : list(dropout_rate),
        'model__learn_rt' : list(learn_rt),
        'epochs' : [50, 100],
    }


parameter_grid_ZSN_admission = _param_grid([6, 12, 16], [.3, .4], [.003, .005])

parameter_grid_ZSN = _param_grid([14, 16, 18], [.0, .3], [.01, .015])

parameter_grid_RAZRIV_admission = _param_grid([14, 16, 18], [.0, .3], [.01, .015])

parameter_grid_RAZRIV = _param_grid([14, 16, 18], [.0, .3], [.01, .015])

parameter_grid_OTEK_LANC_admission = _param_grid([14, 16, 18], [.3, .4], [.01, .015])

parameter_grid_OTEK_LANC = _param_grid([14, 16, 18], [.0, .3], [.01, .015])



In [None]:
## ZSN Model Usable at Admission

#Define a Function that Creates the Model
def make_model_ZSN_admission(neurons, dropout_rate, learn_rt):
    """Build and compile the ZSN admission-time network.

    Architecture: one ReLU hidden layer with `neurons` units, dropout at
    `dropout_rate`, and a single sigmoid output unit. The input width is read
    from X_train_ZSN_admission when the function is called. The model is
    compiled with binary cross-entropy, Adam at learning rate `learn_rt`,
    and accuracy as the metric.
    """
    n_features = X_train_ZSN_admission.shape[1]
    network = Sequential([
        Dense(neurons, activation='relu', input_dim=n_features),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=learn_rt),
        metrics=['accuracy'],
    )
    return network
 
#Execute Grid Search on Model
# `model` is scikeras' current name for the deprecated `build_fn` argument
model_ZSN_admission = KerasClassifier(model = make_model_ZSN_admission)
grid_ZSN_admission = GridSearchCV(estimator = model_ZSN_admission, param_grid = parameter_grid_ZSN_admission, cv = 3, verbose = 2)
# validation_data and class_weight are forwarded to the Keras fit call of every candidate
results_ZSN_admission = grid_ZSN_admission.fit(X_train_ZSN_admission, y_train_ZSN, validation_data=(X_val_ZSN_admission, y_val_ZSN), class_weight = class_weights_ZSN)

#Get Best Hyperparameters Choices and Fit Model Using These Best Hyperparameters Found
best_params_ZSN_admission = results_ZSN_admission.best_params_
best_model_ZSN_admission = KerasClassifier(model = make_model_ZSN_admission(best_params_ZSN_admission['model__neurons'], best_params_ZSN_admission['model__dropout_rate'], best_params_ZSN_admission['model__learn_rt']))
# Train the final model with the same class weights the grid search used;
# otherwise the chosen hyperparameters were tuned for a differently-weighted loss
best_model_ZSN_admission.fit(X_train_ZSN_admission, y_train_ZSN, epochs = best_params_ZSN_admission['epochs'], class_weight = class_weights_ZSN)


In [None]:
## ZSN Model Usable at EOD3 (End of Day 3 of Hospital Period)

#Define a Function that Creates the Model
def make_model_ZSN(neurons, dropout_rate, learn_rt):
    """Build and compile the network used for the ZSN EOD3 (full) model.

    The network is one hidden ReLU layer, a dropout layer and a single
    sigmoid output unit, compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Parameters
    ----------
    neurons : number of units in the hidden layer.
    dropout_rate : rate passed to the Dropout layer after the hidden layer.
    learn_rt : learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Sequential model.
    """
    model_ZSN = Sequential()
    # Input width is read from the training matrix so it always matches the feature set in use
    model_ZSN.add(Dense(neurons, activation='relu', input_dim = X_train_ZSN.shape[1]))
    model_ZSN.add(Dropout(dropout_rate))
    model_ZSN.add(Dense(1, activation='sigmoid'))
    model_ZSN.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=learn_rt), metrics=['accuracy'])
    return model_ZSN

#Execute Grid Search on Model
# 3-fold cross-validated search over parameter_grid_ZSN, trained with class weights
model_ZSN = KerasClassifier(build_fn = make_model_ZSN)
grid_ZSN = GridSearchCV(estimator = model_ZSN, param_grid = parameter_grid_ZSN, cv = 3, verbose = 2)
results_ZSN = grid_ZSN.fit(X_train_ZSN, y_train_ZSN, validation_data=(X_val_ZSN, y_val_ZSN), class_weight = class_weights_ZSN)

#Get Best Hyperparameters Choices and Fit Model Using These Best Hyperparameters Found
best_params_ZSN = results_ZSN.best_params_
best_model_ZSN = KerasClassifier(build_fn = make_model_ZSN(best_params_ZSN['model__neurons'], best_params_ZSN['model__dropout_rate'], best_params_ZSN['model__learn_rt']))
# Pass the same class weights used during the search: the hyperparameters were selected for
# class-weighted training, so the final model must be trained the same way
best_model_ZSN.fit(X_train_ZSN, y_train_ZSN, epochs = best_params_ZSN['epochs'], class_weight = class_weights_ZSN)


In [None]:
## RAZRIV Model Usable at Admission

#Define a Function that Creates the Model
def make_model_RAZRIV_admission(neurons, dropout_rate, learn_rt):
    """Build and compile the network used for the RAZRIV admission model.

    The network is one hidden ReLU layer, a dropout layer and a single
    sigmoid output unit, compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Parameters
    ----------
    neurons : number of units in the hidden layer.
    dropout_rate : rate passed to the Dropout layer after the hidden layer.
    learn_rt : learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Sequential model.
    """
    model_RAZRIV_admission = Sequential()
    # Input width is read from the training matrix so it always matches the feature set in use
    model_RAZRIV_admission.add(Dense(neurons, activation='relu', input_dim = X_train_RAZRIV_admission.shape[1]))
    model_RAZRIV_admission.add(Dropout(dropout_rate))
    model_RAZRIV_admission.add(Dense(1, activation='sigmoid'))
    model_RAZRIV_admission.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=learn_rt), metrics=['accuracy'])
    return model_RAZRIV_admission

#Execute Grid Search on Model
# 3-fold cross-validated search over parameter_grid_RAZRIV_admission, trained with class weights
model_RAZRIV_admission = KerasClassifier(build_fn = make_model_RAZRIV_admission)
grid_RAZRIV_admission = GridSearchCV(estimator = model_RAZRIV_admission, param_grid = parameter_grid_RAZRIV_admission, cv = 3, verbose = 2)
results_RAZRIV_admission = grid_RAZRIV_admission.fit(X_train_RAZRIV_admission, y_train_RAZRIV, validation_data=(X_val_RAZRIV_admission, y_val_RAZRIV), class_weight = class_weights_RAZRIV)

#Get Best Hyperparameters Choices and Fit Model Using These Best Hyperparameters Found
best_params_RAZRIV_admission = results_RAZRIV_admission.best_params_
best_model_RAZRIV_admission = KerasClassifier(build_fn = make_model_RAZRIV_admission(best_params_RAZRIV_admission['model__neurons'], best_params_RAZRIV_admission['model__dropout_rate'], best_params_RAZRIV_admission['model__learn_rt']))
# Pass the same class weights used during the search: the hyperparameters were selected for
# class-weighted training, so the final model must be trained the same way
best_model_RAZRIV_admission.fit(X_train_RAZRIV_admission, y_train_RAZRIV, epochs = best_params_RAZRIV_admission['epochs'], class_weight = class_weights_RAZRIV)


In [None]:
## RAZRIV Model Usable at EOD3 (End of Day 3 of Hospital Period)

#Define a Function that Creates the Model
def make_model_RAZRIV(neurons, dropout_rate, learn_rt):
    """Build and compile the network used for the RAZRIV EOD3 (full) model.

    The network is one hidden ReLU layer, a dropout layer and a single
    sigmoid output unit, compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Parameters
    ----------
    neurons : number of units in the hidden layer.
    dropout_rate : rate passed to the Dropout layer after the hidden layer.
    learn_rt : learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Sequential model.
    """
    model_RAZRIV = Sequential()
    # Input width is read from the training matrix so it always matches the feature set in use
    model_RAZRIV.add(Dense(neurons, activation='relu', input_dim = X_train_RAZRIV.shape[1]))
    model_RAZRIV.add(Dropout(dropout_rate))
    model_RAZRIV.add(Dense(1, activation='sigmoid'))
    model_RAZRIV.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=learn_rt), metrics=['accuracy'])
    return model_RAZRIV

#Execute Grid Search on Model
# 3-fold cross-validated search over parameter_grid_RAZRIV, trained with class weights
model_RAZRIV = KerasClassifier(build_fn = make_model_RAZRIV)
grid_RAZRIV = GridSearchCV(estimator = model_RAZRIV, param_grid = parameter_grid_RAZRIV, cv = 3, verbose = 2)
results_RAZRIV = grid_RAZRIV.fit(X_train_RAZRIV, y_train_RAZRIV, validation_data=(X_val_RAZRIV, y_val_RAZRIV), class_weight = class_weights_RAZRIV)

#Get Best Hyperparameters Choices and Fit Model Using These Best Hyperparameters Found
best_params_RAZRIV = results_RAZRIV.best_params_
best_model_RAZRIV = KerasClassifier(build_fn = make_model_RAZRIV(best_params_RAZRIV['model__neurons'], best_params_RAZRIV['model__dropout_rate'], best_params_RAZRIV['model__learn_rt']))
# Pass the same class weights used during the search: the hyperparameters were selected for
# class-weighted training, so the final model must be trained the same way
best_model_RAZRIV.fit(X_train_RAZRIV, y_train_RAZRIV, epochs = best_params_RAZRIV['epochs'], class_weight = class_weights_RAZRIV)


In [None]:
## OTEK_LANC Model Usable at Admission

#Define a Function that Creates the Model
def make_model_OTEK_LANC_admission(neurons, dropout_rate, learn_rt):
    """Build and compile the network used for the OTEK_LANC admission model.

    The network is one hidden ReLU layer, a dropout layer and a single
    sigmoid output unit, compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Parameters
    ----------
    neurons : number of units in the hidden layer.
    dropout_rate : rate passed to the Dropout layer after the hidden layer.
    learn_rt : learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Sequential model.
    """
    model_OTEK_LANC_admission = Sequential()
    # Input width is read from the training matrix so it always matches the feature set in use
    model_OTEK_LANC_admission.add(Dense(neurons, activation='relu', input_dim = X_train_OTEK_LANC_admission.shape[1]))
    model_OTEK_LANC_admission.add(Dropout(dropout_rate))
    model_OTEK_LANC_admission.add(Dense(1, activation='sigmoid'))
    model_OTEK_LANC_admission.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=learn_rt), metrics=['accuracy'])
    return model_OTEK_LANC_admission

#Execute Grid Search on Model
# 3-fold cross-validated search over parameter_grid_OTEK_LANC_admission, trained with class weights
model_OTEK_LANC_admission = KerasClassifier(build_fn = make_model_OTEK_LANC_admission)
grid_OTEK_LANC_admission = GridSearchCV(estimator = model_OTEK_LANC_admission, param_grid = parameter_grid_OTEK_LANC_admission, cv = 3, verbose = 2)
results_OTEK_LANC_admission = grid_OTEK_LANC_admission.fit(X_train_OTEK_LANC_admission, y_train_OTEK_LANC, validation_data=(X_val_OTEK_LANC_admission, y_val_OTEK_LANC), class_weight = class_weights_OTEK_LANC)

#Get Best Hyperparameters Choices and Fit Model Using These Best Hyperparameters Found
best_params_OTEK_LANC_admission = results_OTEK_LANC_admission.best_params_
best_model_OTEK_LANC_admission = KerasClassifier(build_fn = make_model_OTEK_LANC_admission(best_params_OTEK_LANC_admission['model__neurons'], best_params_OTEK_LANC_admission['model__dropout_rate'], best_params_OTEK_LANC_admission['model__learn_rt']))
# Pass the same class weights used during the search: the hyperparameters were selected for
# class-weighted training, so the final model must be trained the same way
best_model_OTEK_LANC_admission.fit(X_train_OTEK_LANC_admission, y_train_OTEK_LANC, epochs = best_params_OTEK_LANC_admission['epochs'], class_weight = class_weights_OTEK_LANC)


In [None]:
## OTEK_LANC Model Usable at EOD3 (End of Day 3 of Hospital Period)

#Define a Function that Creates the Model
def make_model_OTEK_LANC(neurons, dropout_rate, learn_rt):
    """Build and compile the network used for the OTEK_LANC EOD3 (full) model.

    The network is one hidden ReLU layer, a dropout layer and a single
    sigmoid output unit, compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Parameters
    ----------
    neurons : number of units in the hidden layer.
    dropout_rate : rate passed to the Dropout layer after the hidden layer.
    learn_rt : learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Sequential model.
    """
    model_OTEK_LANC = Sequential()
    # Input width is read from the training matrix so it always matches the feature set in use
    model_OTEK_LANC.add(Dense(neurons, activation='relu', input_dim = X_train_OTEK_LANC.shape[1]))
    model_OTEK_LANC.add(Dropout(dropout_rate))
    model_OTEK_LANC.add(Dense(1, activation='sigmoid'))
    model_OTEK_LANC.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=learn_rt), metrics=['accuracy'])
    return model_OTEK_LANC

#Execute Grid Search on Model
# 3-fold cross-validated search over parameter_grid_OTEK_LANC, trained with class weights
model_OTEK_LANC = KerasClassifier(build_fn = make_model_OTEK_LANC)
grid_OTEK_LANC = GridSearchCV(estimator = model_OTEK_LANC, param_grid = parameter_grid_OTEK_LANC, cv = 3, verbose = 2)
results_OTEK_LANC = grid_OTEK_LANC.fit(X_train_OTEK_LANC, y_train_OTEK_LANC, validation_data=(X_val_OTEK_LANC, y_val_OTEK_LANC), class_weight = class_weights_OTEK_LANC)

#Get Best Hyperparameters Choices and Fit Model Using These Best Hyperparameters Found
best_params_OTEK_LANC = results_OTEK_LANC.best_params_
best_model_OTEK_LANC = KerasClassifier(build_fn = make_model_OTEK_LANC(best_params_OTEK_LANC['model__neurons'], best_params_OTEK_LANC['model__dropout_rate'], best_params_OTEK_LANC['model__learn_rt']))
# Pass the same class weights used during the search: the hyperparameters were selected for
# class-weighted training, so the final model must be trained the same way
best_model_OTEK_LANC.fit(X_train_OTEK_LANC, y_train_OTEK_LANC, epochs = best_params_OTEK_LANC['epochs'], class_weight = class_weights_OTEK_LANC)


In [None]:
## View the Best Parameters Found by Each Grid Search
## Use these to review and, if needed, re-tune the parameter grids used by the models above.
## Each line is a bare expression on purpose: the notebook is configured with
## ast_node_interactivity = "all", so every one of them is displayed, not just the last.
results_ZSN_admission.best_params_
results_ZSN.best_params_
results_RAZRIV_admission.best_params_
results_RAZRIV.best_params_
results_OTEK_LANC_admission.best_params_
results_OTEK_LANC.best_params_



In [None]:
## Save and Pickle Best Versions of All Models
import joblib
import pickle
# Persist each fitted best model to its own .joblib file in the current working directory.
# ZSN models
joblib.dump(value=best_model_ZSN_admission, filename="best_model_ZSN_admission.joblib")
joblib.dump(value=best_model_ZSN, filename="best_model_ZSN.joblib")
# RAZRIV models
joblib.dump(value=best_model_RAZRIV_admission, filename="best_model_RAZRIV_admission.joblib")
joblib.dump(value=best_model_RAZRIV, filename="best_model_RAZRIV.joblib")
# OTEK_LANC models
joblib.dump(value=best_model_OTEK_LANC_admission, filename="best_model_OTEK_LANC_admission.joblib")
joblib.dump(value=best_model_OTEK_LANC, filename="best_model_OTEK_LANC.joblib")


# Model Evaluation

In [None]:
## Imports to Evaluate Models
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
## Get Positive-Outcome Prediction Probabilities From All Models
# predict_proba is evaluated on each model's test set and column 1 is kept,
# i.e. only the positive outcome prediction probability.

# ZSN
y_proba_ZSN_admission = best_model_ZSN_admission.predict_proba(X_test_ZSN_admission)[:, 1]
y_proba_ZSN = best_model_ZSN.predict_proba(X_test_ZSN)[:, 1]

# RAZRIV
y_proba_RAZRIV_admission = best_model_RAZRIV_admission.predict_proba(X_test_RAZRIV_admission)[:, 1]
y_proba_RAZRIV = best_model_RAZRIV.predict_proba(X_test_RAZRIV)[:, 1]

# OTEK_LANC
y_proba_OTEK_LANC_admission = best_model_OTEK_LANC_admission.predict_proba(X_test_OTEK_LANC_admission)[:, 1]
y_proba_OTEK_LANC = best_model_OTEK_LANC.predict_proba(X_test_OTEK_LANC)[:, 1]


In [None]:
## ROC Curves for ZSN Models with AUC values
# One figure with two panels: admission model on the left, full model on the right.
fig, (ZSN1, ZSN2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.suptitle('ZSN ROC Curves')

# Left panel: admission model (AUC is shown in the panel title, rounded to 3 decimals)
fpr_ZSN_admission, tpr_ZSN_admission, thresh_vals = roc_curve(y_test_ZSN, y_proba_ZSN_admission)
ZSN1.plot(fpr_ZSN_admission, tpr_ZSN_admission)
ZSN1.set(
    title='Admission Model: AUC = ' + str(round(roc_auc_score(y_test_ZSN, y_proba_ZSN_admission), 3)),
    xlabel='False-Positive Rate',
    ylabel='True-Positive Rate',
)

# Right panel: full model
fpr_ZSN, tpr_ZSN, thresh_vals = roc_curve(y_test_ZSN, y_proba_ZSN)
ZSN2.plot(fpr_ZSN, tpr_ZSN)
ZSN2.set(
    title='Full Model: AUC = ' + str(round(roc_auc_score(y_test_ZSN, y_proba_ZSN), 3)),
    xlabel='False-Positive Rate',
    ylabel='True-Positive Rate',
)

## ROC Curves for RAZRIV Models with AUC values
# One figure with two panels: admission model on the left, full model on the right.
fig, (RAZRIV1, RAZRIV2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.suptitle('RAZRIV ROC Curves')

# Left panel: admission model (AUC is shown in the panel title, rounded to 3 decimals)
fpr_RAZRIV_admission, tpr_RAZRIV_admission, thresh_vals = roc_curve(y_test_RAZRIV, y_proba_RAZRIV_admission)
RAZRIV1.plot(fpr_RAZRIV_admission, tpr_RAZRIV_admission)
RAZRIV1.set(
    title='Admission Model: AUC = ' + str(round(roc_auc_score(y_test_RAZRIV, y_proba_RAZRIV_admission), 3)),
    xlabel='False-Positive Rate',
    ylabel='True-Positive Rate',
)

# Right panel: full model
fpr_RAZRIV, tpr_RAZRIV, thresh_vals = roc_curve(y_test_RAZRIV, y_proba_RAZRIV)
RAZRIV2.plot(fpr_RAZRIV, tpr_RAZRIV)
RAZRIV2.set(
    title='Full Model: AUC = ' + str(round(roc_auc_score(y_test_RAZRIV, y_proba_RAZRIV), 3)),
    xlabel='False-Positive Rate',
    ylabel='True-Positive Rate',
)

## ROC Curves for OTEK_LANC Models with AUC values
# One figure with two panels: admission model on the left, full model on the right.
fig, (OTEK_LANC1, OTEK_LANC2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.suptitle('OTEK_LANC ROC Curves')

# Left panel: admission model (AUC is shown in the panel title, rounded to 3 decimals)
fpr_OTEK_LANC_admission, tpr_OTEK_LANC_admission, thresh_vals = roc_curve(y_test_OTEK_LANC, y_proba_OTEK_LANC_admission)
OTEK_LANC1.plot(fpr_OTEK_LANC_admission, tpr_OTEK_LANC_admission)
OTEK_LANC1.set(
    title='Admission Model: AUC = ' + str(round(roc_auc_score(y_test_OTEK_LANC, y_proba_OTEK_LANC_admission), 3)),
    xlabel='False-Positive Rate',
    ylabel='True-Positive Rate',
)

# Right panel: full model
fpr_OTEK_LANC, tpr_OTEK_LANC, thresh_vals = roc_curve(y_test_OTEK_LANC, y_proba_OTEK_LANC)
OTEK_LANC2.plot(fpr_OTEK_LANC, tpr_OTEK_LANC)
OTEK_LANC2.set(
    title='Full Model: AUC = ' + str(round(roc_auc_score(y_test_OTEK_LANC, y_proba_OTEK_LANC), 3)),
    xlabel='False-Positive Rate',
    ylabel='True-Positive Rate',
)


In [None]:
## View Precision-Recall Tradeoff Curve for ZSN Models
# Two panels (admission model, full model); precision is on the x-axis and recall on the y-axis.
fig, (ZSN3, ZSN4) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.suptitle('ZSN Precision-Recall Curves')

# Left panel: admission model
precision_ZSN_admission, recall_ZSN_admission, thresh_vals = precision_recall_curve(y_test_ZSN, y_proba_ZSN_admission)
ZSN3.plot(precision_ZSN_admission, recall_ZSN_admission)
ZSN3.set(title='Admission Model', xlabel='Precision', ylabel='Recall')

# Right panel: full model
precision_ZSN, recall_ZSN, thresh_vals = precision_recall_curve(y_test_ZSN, y_proba_ZSN)
ZSN4.plot(precision_ZSN, recall_ZSN)
ZSN4.set(title='Full Model', xlabel='Precision', ylabel='Recall')

## View Precision-Recall Tradeoff Curve for RAZRIV Models
# Two panels (admission model, full model); precision is on the x-axis and recall on the y-axis.
fig, (RAZRIV3, RAZRIV4) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.suptitle('RAZRIV Precision-Recall Curves')

# Left panel: admission model
precision_RAZRIV_admission, recall_RAZRIV_admission, thresh_vals = precision_recall_curve(y_test_RAZRIV, y_proba_RAZRIV_admission)
RAZRIV3.plot(precision_RAZRIV_admission, recall_RAZRIV_admission)
RAZRIV3.set(title='Admission Model', xlabel='Precision', ylabel='Recall')

# Right panel: full model
precision_RAZRIV, recall_RAZRIV, thresh_vals = precision_recall_curve(y_test_RAZRIV, y_proba_RAZRIV)
RAZRIV4.plot(precision_RAZRIV, recall_RAZRIV)
RAZRIV4.set(title='Full Model', xlabel='Precision', ylabel='Recall')

## View Precision-Recall Tradeoff Curve for OTEK_LANC Models
# Two panels (admission model, full model); precision is on the x-axis and recall on the y-axis.
fig, (OTEK_LANC3, OTEK_LANC4) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.suptitle('OTEK_LANC Precision-Recall Curves')

# Left panel: admission model
precision_OTEK_LANC_admission, recall_OTEK_LANC_admission, thresh_vals = precision_recall_curve(y_test_OTEK_LANC, y_proba_OTEK_LANC_admission)
OTEK_LANC3.plot(precision_OTEK_LANC_admission, recall_OTEK_LANC_admission)
OTEK_LANC3.set(title='Admission Model', xlabel='Precision', ylabel='Recall')

# Right panel: full model
precision_OTEK_LANC, recall_OTEK_LANC, thresh_vals = precision_recall_curve(y_test_OTEK_LANC, y_proba_OTEK_LANC)
OTEK_LANC4.plot(precision_OTEK_LANC, recall_OTEK_LANC)
OTEK_LANC4.set(title='Full Model', xlabel='Precision', ylabel='Recall')


In [None]:
#Probability cut-off at or above which a case counts as a positive prediction
#(chosen after reviewing the ROC curves and precision-recall tradeoff curves)
threshold = 0.5

#Turn each model's positive-class probabilities into 0/1 labels using the cut-off
y_pred_ZSN_admission = (y_proba_ZSN_admission >= threshold).astype(int)
y_pred_ZSN = (y_proba_ZSN >= threshold).astype(int)
y_pred_RAZRIV_admission = (y_proba_RAZRIV_admission >= threshold).astype(int)
y_pred_RAZRIV = (y_proba_RAZRIV >= threshold).astype(int)
y_pred_OTEK_LANC_admission = (y_proba_OTEK_LANC_admission >= threshold).astype(int)
y_pred_OTEK_LANC = (y_proba_OTEK_LANC >= threshold).astype(int)



In [None]:
## Create and View Confusion Matrices for All Models

#ZSN Admission Model Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order even when a class is
# missing from both the test labels and the predictions; assumes targets are coded 0/1,
# as the thresholded predictions are.
cm_ZSN_admission = confusion_matrix(y_test_ZSN, y_pred_ZSN_admission, labels=[0, 1]) #Create Confusion Matrix
# Rows are actual classes, columns are predicted classes
cm_ZSN_admission_df = pd.DataFrame(cm_ZSN_admission, index=['Actual Negative','Actual Positive'], columns=['Predicted Negative','Predicted Positive']) #Convert to df to visualize
precision_score_ZSN_admission = precision_score(y_test_ZSN, y_pred_ZSN_admission)
recall_score_ZSN_admission = recall_score(y_test_ZSN, y_pred_ZSN_admission)
print("******** ZSN Admission Model ********")
cm_ZSN_admission_df
print("Precision Score : ", precision_score_ZSN_admission)
print("Recall Score : ", recall_score_ZSN_admission)

#ZSN EOD3 Model Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order even when a class is
# missing from both the test labels and the predictions; assumes targets are coded 0/1,
# as the thresholded predictions are.
cm_ZSN = confusion_matrix(y_test_ZSN, y_pred_ZSN, labels=[0, 1]) #Create Confusion Matrix
# Rows are actual classes, columns are predicted classes
cm_ZSN_df = pd.DataFrame(cm_ZSN, index=['Actual Negative','Actual Positive'], columns=['Predicted Negative','Predicted Positive']) #Convert to df to visualize
precision_score_ZSN = precision_score(y_test_ZSN, y_pred_ZSN)
recall_score_ZSN = recall_score(y_test_ZSN, y_pred_ZSN)
print("******** ZSN Full Model ********")
cm_ZSN_df
print("Precision Score : ", precision_score_ZSN)
print("Recall Score : ", recall_score_ZSN)



#RAZRIV Admission Model Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order even when a class is
# missing from both the test labels and the predictions; assumes targets are coded 0/1,
# as the thresholded predictions are.
cm_RAZRIV_admission = confusion_matrix(y_test_RAZRIV, y_pred_RAZRIV_admission, labels=[0, 1]) #Create Confusion Matrix
# Rows are actual classes, columns are predicted classes
cm_RAZRIV_admission_df = pd.DataFrame(cm_RAZRIV_admission, index=['Actual Negative','Actual Positive'], columns=['Predicted Negative','Predicted Positive']) #Convert to df to visualize
precision_score_RAZRIV_admission = precision_score(y_test_RAZRIV, y_pred_RAZRIV_admission)
recall_score_RAZRIV_admission = recall_score(y_test_RAZRIV, y_pred_RAZRIV_admission)
print("******** RAZRIV Admission Model ********")
cm_RAZRIV_admission_df
print("Precision Score : ", precision_score_RAZRIV_admission)
print("Recall Score : ", recall_score_RAZRIV_admission)

#RAZRIV EOD3 Model Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order even when a class is
# missing from both the test labels and the predictions; assumes targets are coded 0/1,
# as the thresholded predictions are.
cm_RAZRIV = confusion_matrix(y_test_RAZRIV, y_pred_RAZRIV, labels=[0, 1]) #Create Confusion Matrix
# Rows are actual classes, columns are predicted classes
cm_RAZRIV_df = pd.DataFrame(cm_RAZRIV, index=['Actual Negative','Actual Positive'], columns=['Predicted Negative','Predicted Positive']) #Convert to df to visualize
precision_score_RAZRIV = precision_score(y_test_RAZRIV, y_pred_RAZRIV)
recall_score_RAZRIV = recall_score(y_test_RAZRIV, y_pred_RAZRIV)
print("******** RAZRIV Full Model ********")
cm_RAZRIV_df
print("Precision Score : ", precision_score_RAZRIV)
print("Recall Score : ", recall_score_RAZRIV)



#OTEK_LANC Admission Model Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order even when a class is
# missing from both the test labels and the predictions; assumes targets are coded 0/1,
# as the thresholded predictions are.
cm_OTEK_LANC_admission = confusion_matrix(y_test_OTEK_LANC, y_pred_OTEK_LANC_admission, labels=[0, 1]) #Create Confusion Matrix
# Rows are actual classes, columns are predicted classes
cm_OTEK_LANC_admission_df = pd.DataFrame(cm_OTEK_LANC_admission, index=['Actual Negative','Actual Positive'], columns=['Predicted Negative','Predicted Positive']) #Convert to df to visualize
precision_score_OTEK_LANC_admission = precision_score(y_test_OTEK_LANC, y_pred_OTEK_LANC_admission)
recall_score_OTEK_LANC_admission = recall_score(y_test_OTEK_LANC, y_pred_OTEK_LANC_admission)
print("******** OTEK_LANC Admission Model ********")
cm_OTEK_LANC_admission_df
print("Precision Score : ", precision_score_OTEK_LANC_admission)
print("Recall Score : ", recall_score_OTEK_LANC_admission)

#OTEK_LANC EOD3 Model Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order even when a class is
# missing from both the test labels and the predictions; assumes targets are coded 0/1,
# as the thresholded predictions are.
cm_OTEK_LANC = confusion_matrix(y_test_OTEK_LANC, y_pred_OTEK_LANC, labels=[0, 1]) #Create Confusion Matrix
# Rows are actual classes, columns are predicted classes
cm_OTEK_LANC_df = pd.DataFrame(cm_OTEK_LANC, index=['Actual Negative','Actual Positive'], columns=['Predicted Negative','Predicted Positive']) #Convert to df to visualize
precision_score_OTEK_LANC = precision_score(y_test_OTEK_LANC, y_pred_OTEK_LANC)
recall_score_OTEK_LANC = recall_score(y_test_OTEK_LANC, y_pred_OTEK_LANC)
print("******** OTEK_LANC Full Model ********")
cm_OTEK_LANC_df
print("Precision Score : ", precision_score_OTEK_LANC)
print("Recall Score : ", recall_score_OTEK_LANC)

