In [156]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

In [173]:
# Columns removed from both inpatient extracts immediately after loading.
unused_cols = ['Veteran flag', 'Event date', 'Marital status',
               'Marital status encoded', 'State', 'Ruca category']

# First extract: skip the first CSV column, then drop the unused columns.
path = '/home/daisy/FDA_Dataset/inpatient_all_final_1.csv'
df1 = pd.read_csv(path).iloc[:, 1:].drop(columns=unused_cols)

# Second extract, loaded exactly the same way.
path = '/home/daisy/FDA_Dataset/inpatient_all_final_2.csv'
df2 = pd.read_csv(path).iloc[:, 1:].drop(columns=unused_cols)

## Train test split

In [179]:
# Readmission task: both outcome columns are removed from the features.
Y_admission1 = df1.loc[:, ['Readmission']]
X_admission1 = df1.drop(['Readmission', 'Died'], axis=1)

# Mortality task: only 'Died' is removed, so 'Readmission' stays among the features.
Y_mortality1 = df1.loc[:, ['Died']]
X_mortality1 = df1.drop(['Died'], axis=1)

In [180]:
# Both tasks use the same hold-out fraction (20%) and the same random seed.
split_options = dict(test_size=0.20, random_state=42)

X_train_ad1, X_test_ad1, y_train_ad1, y_test_ad1 = train_test_split(
    X_admission1, Y_admission1, **split_options
)
X_train_mor1, X_test_mor1, y_train_mor1, y_test_mor1 = train_test_split(
    X_mortality1, Y_mortality1, **split_options
)

In [184]:
# Shapes of the four readmission-task splits, in (train X, test X, train y, test y) order.
tuple(part.shape for part in (X_train_ad1, X_test_ad1, y_train_ad1, y_test_ad1))

((67628, 74), (16908, 74), (67628, 1), (16908, 1))

In [185]:
# Shapes of the four mortality-task splits, in (train X, test X, train y, test y) order.
tuple(part.shape for part in (X_train_mor1, X_test_mor1, y_train_mor1, y_test_mor1))

((67628, 75), (16908, 75), (67628, 1), (16908, 1))

Replace missing values in the continuous variables with the column mean

In [181]:
# Columns of df1 that contain at least one missing value.
missing_cols = df1.columns[df1.isna().any()].tolist()

# Impute every split with the means of the corresponding TRAINING split.
# Using the test split's own means would apply a different transform to the
# test data than to the training data and leak test-set statistics into the
# evaluation.
for _train_part, _test_part in ((X_train_ad1, X_test_ad1),
                                (X_train_mor1, X_test_mor1)):
    # Only index columns that exist in this feature matrix: the target
    # column(s) were dropped from X and must not be looked up here.
    _impute_cols = [c for c in missing_cols if c in _train_part.columns]
    if not _impute_cols:
        continue
    _train_means = _train_part[_impute_cols].mean()
    _train_part[_impute_cols] = _train_part[_impute_cols].fillna(_train_means)
    _test_part[_impute_cols] = _test_part[_impute_cols].fillna(_train_means)

## Remove skewness and kurtosis
Log transformation on numeric features

`np.log1p()` is used instead of `np.log()` because `np.log1p(0) = 0`, whereas `np.log(0) = -inf`.

In [182]:
# Column-name groups used by the transformation cells below.
# 'Internalpatientid' is deliberately absent from every list here.

# Outcome columns.
targets = ['Readmission', 'Died']

# Columns listed as categorical; they are not part of numeric_cols, so the
# skewness check and scaling that iterate over numeric_cols skip them.
cat_cols = ['AO', 'CVD', 'Ruca category encoded', 'Ethnicity', 
            'Gender', 'Races', 'Ethnicity_0',
            'Ethnicity_1', 'Ethnicity_2', 'Races_0', 
            'Races_1', 'Races_2', 'Races_3','DOMICILIARY', 
            'MEDICINE', 'NHCU', 'NON-COUNT', 'OTHERS', 'PSYCHIATRY']

# Columns listed as numeric; check_skewness iterates over this list.
# NOTE(review): 'mean age at specailty' is spelled this way here — presumably it
# matches the CSV header; confirm before "fixing" the spelling.
numeric_cols = ['num_stays', 'stay_length', 'num_unique_units',
       'num_transfers', 'num_cvd_readmission', 'unique_admitting_specialty', 
       'unique_discharging_specialty','Age 20-40', 'Age 40-60', 'Age 60-80', 'Age 80-100',
       'Age 100-120', 'age_mean', 'age_std', 'age_min', 'age_max', 'stay_min',
       'stay_max', 'stay_mean', 'stay_std', 'freq', 'total_procedure',
       'num_surgery_pro', 'num_immunization', 'Num med per admission mean',
       'Num med per admission min', 'Num med per admission max',
       'Total medications', 'mean age at specailty', 'period mean', 
       'specialty medical count', 'specialty support count',
       'period std','specialty count', 'Age 20-40 hypotension',
       'Age 40-60 hypotension', 'Age 60-80 hypotension',
       'Age 80-100 hypotension', 'Age 100-120 hypotension',
       'Age 20-40 hypertension', 'Age 40-60 hypertension',
       'Age 60-80 hypertension', 'Age 80-100 hypertension',
       'Age 100-120 hypertension', 'Age 20-40 healthy', 'Age 40-60 healthy',
       'Age 60-80 healthy', 'Age 80-100 healthy', 'Age 100-120 healthy',
       'lab_count', 'lab_freq', 'lab_age_mean', 'lab_age_std']

In [183]:
def check_skewness(df, cols=None, threshold=1.96):
    """Report skewness, kurtosis and standard deviation before/after a log1p transform.

    A column is marked for transformation when BOTH the absolute skewness and
    the absolute kurtosis exceed ``threshold``. For marked columns the "after"
    statistics are computed on ``np.log1p`` of the non-negative values only;
    for all other columns the "after" statistics equal the "before" ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to inspect. It is not modified.
    cols : sequence of str, optional
        Columns to inspect. Defaults to the module-level ``numeric_cols``.
    threshold : float, optional
        Cut-off applied to |skewness| and |kurtosis| (default 1.96).

    Returns
    -------
    pandas.DataFrame
        One row per inspected column with the columns ``numeric_col``,
        ``transform`` ('Yes'/'No'), ``method`` ('log' or ' '),
        ``sknewness_before``, ``skewness_after``, ``kurtosis_before``,
        ``kurtosis_after``, ``std_before`` and ``std_after``.
    """
    if cols is None:
        cols = numeric_cols

    records = []
    for col in cols:
        series = df[col]
        # Compute each statistic once and reuse it.
        skew_before = series.skew()
        kurt_before = series.kurtosis()
        std_before = series.std()

        # abs() must wrap the kurtosis itself, not the comparison result.
        needs_log = abs(skew_before) > threshold and abs(kurt_before) > threshold

        if needs_log:
            # Negative values are excluded from the "after" statistics.
            logged = np.log1p(series[series >= 0])
            skew_after = logged.skew()
            kurt_after = logged.kurtosis()
            std_after = logged.std()
        else:
            skew_after, kurt_after, std_after = skew_before, kurt_before, std_before

        records.append({
            'numeric_col': col,
            'transform': 'Yes' if needs_log else 'No',
            'method': 'log' if needs_log else ' ',
            # The 'sknewness_before' spelling is kept so existing consumers of
            # this report keep working.
            'sknewness_before': skew_before,
            'skewness_after': skew_after,
            'kurtosis_before': kurt_before,
            'kurtosis_after': kurt_after,
            'std_before': std_before,
            'std_after': std_after,
        })

    report_columns = ['numeric_col', 'transform', 'method',
                      'sknewness_before', 'skewness_after',
                      'kurtosis_before', 'kurtosis_after',
                      'std_before', 'std_after']
    return pd.DataFrame(records, columns=report_columns)


In [None]:
# Build the skewness/kurtosis report that remove_skewness consumes.
# NOTE(review): `df` is not defined in the cells visible here (only df1 and df2
# are) and this cell has no execution count — confirm which frame is intended.
statusdf = check_skewness(df)

In [161]:
def remove_skewness(df, statusdf):
    """Add a log1p-transformed copy of every column flagged in ``statusdf``.

    For each row of ``statusdf`` whose ``transform`` value is ``'Yes'``, a new
    column named ``<numeric_col>_log`` holding ``np.log1p`` of the source
    column is added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified IN PLACE (new ``*_log`` columns are
        added); the original columns are left untouched.
    statusdf : pandas.DataFrame
        Report with at least the columns ``numeric_col`` and ``transform``.
        Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Select by boolean mask rather than by positional integer labels, so a
    # filtered or re-indexed report works as well.
    flagged = statusdf.loc[statusdf['transform'] == 'Yes', 'numeric_col']
    for name in flagged:
        colname = str(name)
        # log1p is applied to every row; values <= -1 yield -inf/NaN instead of
        # being dropped.
        df[colname + "_log"] = np.log1p(df[colname])
    return df

In [162]:
# Add the "<col>_log" columns for every column flagged in statusdf.
# NOTE(review): `df` is not defined in the cells visible here — confirm its origin.
df_log = remove_skewness(df,statusdf)
# Hard-coded names of the log-transformed columns to keep.
# NOTE(review): presumably this list mirrors the statusdf rows with
# transform == 'Yes'; it is not derived from statusdf, so verify it after any
# change to the data or to the threshold.
log_numeric_cols = ['num_stays_log', 'stay_length_log',
       'num_transfers_log', 'num_cvd_readmission_log',
       'unique_admitting_specialty_log', 'Age 20-40_log', 'Age 40-60_log',
       'Age 60-80_log', 'Age 80-100_log', 'Age 100-120_log', 'stay_min_log',
       'stay_max_log', 'stay_mean_log', 'stay_std_log', 'freq_log',
       'total_procedure_log', 'num_surgery_pro_log',
       'Num med per admission mean_log', 'Num med per admission min_log',
       'Num med per admission max_log', 'Total medications_log',
       'period mean_log', 'specialty medical count_log',
       'specialty support count_log', 'period std_log', 'specialty count_log',
       'Age 20-40 hypotension_log', 'Age 40-60 hypotension_log',
       'Age 60-80 hypotension_log', 'Age 80-100 hypotension_log',
       'Age 100-120 hypotension_log', 'Age 20-40 hypertension_log',
       'Age 40-60 hypertension_log', 'Age 60-80 hypertension_log',
       'Age 80-100 hypertension_log', 'Age 100-120 hypertension_log',
       'Age 20-40 healthy_log', 'Age 40-60 healthy_log',
       'Age 60-80 healthy_log', 'Age 80-100 healthy_log',
       'Age 100-120 healthy_log', 'lab_count_log', 'lab_freq_log']
# Keep the patient id, the log-transformed numeric columns and the categorical columns.
# NOTE(review): the saved output below also shows 'Readmission' and 'Died', which
# log_cols does not contain — the output looks stale; re-run to confirm.
log_cols = ['Internalpatientid'] + log_numeric_cols + cat_cols
df_log = df_log[log_cols]
df_log

Unnamed: 0,Internalpatientid,num_stays_log,stay_length_log,num_transfers_log,num_cvd_readmission_log,unique_admitting_specialty_log,Age 20-40_log,Age 40-60_log,Age 60-80_log,Age 80-100_log,...,Races_2,Races_3,DOMICILIARY,MEDICINE,NHCU,NON-COUNT,OTHERS,PSYCHIATRY,Readmission,Died
0,1,1.609438,2.826722,0.000000,0.000000,1.609438,0.0,0.000000,1.609438,0.000000,...,0,0,0,3,0,0,0,0,1,0
1,2,3.135494,4.546057,1.098612,2.397895,2.302585,0.0,1.386294,2.995732,0.000000,...,1,0,0,9,2,9,0,1,1,0
2,3,1.098612,6.012492,0.000000,0.000000,1.098612,0.0,0.000000,0.693147,0.693147,...,0,0,0,1,1,0,0,0,1,1
3,4,0.693147,2.098018,0.000000,0.000000,0.693147,0.0,0.000000,0.000000,0.693147,...,0,0,0,1,0,0,0,0,0,0
4,5,1.098612,2.246015,0.000000,0.000000,1.098612,0.0,0.000000,1.098612,0.000000,...,0,0,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84531,169055,0.693147,2.010895,0.000000,0.000000,0.693147,0.0,0.693147,0.000000,0.000000,...,0,0,0,1,0,0,0,0,0,0
84532,169057,3.401197,4.598750,1.098612,2.639057,2.484907,0.0,0.000000,2.397895,2.995732,...,1,0,0,25,0,2,0,0,1,0
84533,169060,2.197225,3.836868,0.000000,1.791759,1.791759,0.0,1.386294,1.791759,0.000000,...,0,0,0,7,0,1,0,0,1,0
84534,169062,2.564949,5.190899,0.693147,0.000000,2.079442,0.0,0.000000,2.564949,0.000000,...,0,0,1,7,3,1,0,0,1,1


### Standardize the data that's log transformed

In [140]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is less prone to outliers.

#std_scaler = StandardScaler()

def rob_scale_numeric_data(df,cols):
    """Return a copy of ``df`` with ``cols`` robust-scaled and renamed.

    Each listed column is scaled independently with ``RobustScaler`` and then
    renamed to ``<col>_rob_scaled``. All other columns are carried over
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; scaling is applied to a copy.
    cols : sequence of str
        Names of the columns to scale. May be empty.

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled, renamed columns in their original positions.
    """
    cols = list(cols)
    scaled_df = df.copy()
    if not cols:
        # Nothing to scale; RobustScaler cannot be fitted on zero features.
        return scaled_df

    rob_scaler = RobustScaler()
    # RobustScaler fits every feature independently, so scaling the whole
    # block at once gives the same values as fitting one column at a time.
    scaled_values = rob_scaler.fit_transform(df[cols].to_numpy())
    for position, col in enumerate(cols):
        # Column-wise assignment replaces the column (and its dtype) outright.
        scaled_df[col] = scaled_values[:, position]

    # Rename once at the end instead of copying the frame for every column.
    return scaled_df.rename(columns={col: col + "_rob_scaled" for col in cols})

# Robust-scale the log-transformed numeric columns; each scaled column is
# returned under the name "<col>_rob_scaled".
df_log_norm= rob_scale_numeric_data(df_log,log_numeric_cols)
df_log_norm

Unnamed: 0,Internalpatientid,num_stays_log_rob_scaled,stay_length_log_rob_scaled,num_transfers_log_rob_scaled,num_cvd_readmission_log_rob_scaled,unique_admitting_specialty_log_rob_scaled,Age 20-40_log_rob_scaled,Age 40-60_log_rob_scaled,Age 60-80_log_rob_scaled,Age 80-100_log_rob_scaled,...,Races,Ethnicity_0,Ethnicity_1,Ethnicity_2,Races_0,Races_1,Races_2,Races_3,Readmission,Died
0,1,0.000000,-0.163158,0.000000,0.000000,0.243529,0.0,0.0,0.285097,0.000000,...,0,0,1,0,1,0,0,0,1,0
1,2,1.389076,0.746636,1.584963,3.459432,1.000000,0.0,2.0,1.058803,0.000000,...,2,0,1,0,0,0,1,0,1,0
2,3,-0.464974,1.522606,0.000000,0.000000,-0.313964,0.0,0.0,-0.226294,1.000000,...,1,0,0,1,0,1,0,0,1,1
3,4,-0.834044,-0.548755,0.000000,0.000000,-0.756471,0.0,0.0,-0.613147,1.000000,...,1,1,0,0,0,1,0,0,0,0
4,5,-0.464974,-0.470442,0.000000,0.000000,-0.313964,0.0,0.0,0.000000,0.000000,...,0,0,1,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84531,169055,-0.834044,-0.594857,0.000000,0.000000,-0.756471,0.0,1.0,-0.613147,0.000000,...,0,0,1,0,1,0,0,0,0,0
84532,169057,1.630930,0.774518,1.584963,3.807355,1.198978,0.0,0.0,0.725144,4.321928,...,2,0,1,0,0,0,1,0,1,0
84533,169060,0.535026,0.371365,0.000000,2.584963,0.442507,0.0,2.0,0.386853,0.000000,...,1,0,0,1,0,1,0,0,1,0
84534,169062,0.869744,1.087857,1.000000,0.000000,0.756471,0.0,0.0,0.818378,0.000000,...,0,0,0,1,1,0,0,0,1,1


In [164]:
# Show every row of the scaled, log-transformed frame that still has a missing value.
df_log_norm.loc[df_log_norm.isna().any(axis=1)]

Unnamed: 0,Internalpatientid,num_stays_log_rob_scaled,stay_length_log_rob_scaled,num_transfers_log_rob_scaled,num_cvd_readmission_log_rob_scaled,unique_admitting_specialty_log_rob_scaled,Age 20-40_log_rob_scaled,Age 40-60_log_rob_scaled,Age 60-80_log_rob_scaled,Age 80-100_log_rob_scaled,...,Races,Ethnicity_0,Ethnicity_1,Ethnicity_2,Races_0,Races_1,Races_2,Races_3,Readmission,Died


## Standardize data without log transform

In [168]:
# Robust-scale the untransformed numeric columns; each scaled column is
# returned under the name "<col>_rob_scaled".
# NOTE(review): `df` is not defined in the cells visible here — confirm its origin.
df_norm  = rob_scale_numeric_data(df,numeric_cols)
df_norm

Unnamed: 0,Internalpatientid,num_stays_rob_scaled,stay_length_rob_scaled,num_unique_units_rob_scaled,num_transfers_rob_scaled,num_cvd_readmission_rob_scaled,Readmission,Died,AO,CVD,...,Age 100-120 hypertension_rob_scaled,Age 20-40 healthy_rob_scaled,Age 40-60 healthy_rob_scaled,Age 60-80 healthy_rob_scaled,Age 80-100 healthy_rob_scaled,Age 100-120 healthy_rob_scaled,lab_count_rob_scaled,lab_freq_rob_scaled,lab_age_mean_rob_scaled,lab_age_std_rob_scaled
0,1,0.000000,-0.122410,0.0,0.0,0.0,1,0,0,0,...,0.0,0.0,0.000000,0.755556,0.111111,0.0,0.864130,0.011898,0.102119,0.671449
1,2,3.000000,1.430191,1.5,2.0,10.0,1,0,0,1,...,0.0,0.0,2.555556,6.155556,0.000000,0.0,2.516304,0.790157,-0.365358,0.489013
2,3,-0.333333,7.736116,0.0,0.0,0.0,1,1,0,1,...,0.0,0.0,0.000000,-0.400000,8.888889,0.0,4.875000,10.449432,0.679267,-0.632874
3,4,-0.500000,-0.297798,-0.5,0.0,0.0,0,0,0,1,...,0.0,0.0,0.000000,-0.333333,1.555556,0.0,-0.179348,-0.291509,0.735354,-0.031575
4,5,-0.333333,-0.271710,0.0,0.0,0.0,1,1,0,1,...,0.0,0.0,0.000000,-0.177778,0.000000,0.0,-0.331522,0.591671,0.364668,-0.799369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84531,169055,-0.500000,-0.311443,-0.5,0.0,0.0,0,0,0,1,...,0.0,0.0,0.000000,0.155556,0.000000,0.0,0.250000,0.127637,-0.356720,0.070954
84532,169057,4.166667,1.532534,0.5,2.0,13.0,1,0,0,1,...,0.0,0.0,0.000000,2.022222,26.888889,0.0,2.559783,0.809627,0.633148,0.455486
84533,169060,0.666667,0.469372,0.0,0.0,5.0,1,0,0,1,...,0.0,0.0,0.444444,1.266667,0.000000,0.0,1.456522,0.768524,-0.391984,0.945633
84534,169062,1.333333,3.143330,1.0,1.0,0.0,1,1,1,0,...,0.0,0.0,0.000000,1.133333,0.000000,0.0,-0.190217,0.627907,0.151643,-0.552866


In [169]:
# Show every row of the scaled frame that still has a missing value.
df_norm.loc[df_norm.isna().any(axis=1)]

Unnamed: 0,Internalpatientid,num_stays_rob_scaled,stay_length_rob_scaled,num_unique_units_rob_scaled,num_transfers_rob_scaled,num_cvd_readmission_rob_scaled,Readmission,Died,AO,CVD,...,Age 100-120 hypertension_rob_scaled,Age 20-40 healthy_rob_scaled,Age 40-60 healthy_rob_scaled,Age 60-80 healthy_rob_scaled,Age 80-100 healthy_rob_scaled,Age 100-120 healthy_rob_scaled,lab_count_rob_scaled,lab_freq_rob_scaled,lab_age_mean_rob_scaled,lab_age_std_rob_scaled
