In [19]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd

In [20]:
df = pd.read_csv("data/Training_Set.csv")

In [21]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object', 'bool']).columns

In [22]:
df_encoded = df.copy()

for col in cat_cols:
    if df_encoded[col].dtype == 'bool':
        df_encoded[col] = df_encoded[col].astype(int)
    else:
        df_encoded = pd.get_dummies(df_encoded, columns=[col], drop_first=True)

df_encoded.head()

Unnamed: 0,Patient_ID,Admission Year,Age At Admission,Length of Stay (Days),First Potassium Days From Admit,First Potassium Result,Last Potassium Days From Admit,Last Potassium Result,Min Potassium Days From Admit,Min Potassium Result,...,Hx_Pvd,Hx_Valve_Procedure,Hx_Dm,Hx_Ckd,Hx_Ihd,Hx_Aortic_Valve_Problem,Hx_Prior_Admit,Gender_MALE,Race Simplified_Other,Race Simplified_White
0,1616,2020,89.03944,6.008976,0.150694,3.0,5.616078,4.0,0.580556,2.8,...,0,0,0,0,0,0,0,False,False,True
1,5717,2020,69.42983,2.596738,0.1125,3.9,2.358333,4.1,0.034722,4.0,...,0,0,1,0,0,0,1,True,False,True
2,5922,2019,67.465759,2.046528,,,,,,,...,0,0,1,1,0,0,1,True,False,True
3,2054,2019,61.347314,1.644444,0.033941,4.2,0.055349,3.9,0.063194,3.9,...,0,0,1,1,0,0,1,False,False,True
4,5810,2019,83.347254,2.253531,0.278472,4.7,1.738194,3.8,2.359028,3.9,...,0,0,1,0,0,0,1,True,False,True


In [25]:
true_samples = df_encoded[df_encoded['1Yr_Death'] == True]
false_samples = df_encoded[df_encoded['1Yr_Death'] == False]

In [26]:
descriptive_stats = pd.DataFrame({
    'True_Mean': true_samples.mean(),
    'False_Mean': false_samples.mean(),
    'True_Median': true_samples.median(),
    'False_Median': false_samples.median(),
    'True_Std': true_samples.std(),
    'False_Std': false_samples.std()
})


In [28]:
descriptive_stats

Unnamed: 0,True_Mean,False_Mean,True_Median,False_Median,True_Std,False_Std
Patient_ID,3733.264628,3797.092063,3736.500000,3808.500000,2161.821761,2125.310761
Admission Year,2019.863032,2019.936735,2020.000000,2020.000000,0.792423,0.802489
Age At Admission,75.972619,72.648924,76.767803,72.291325,11.338535,11.726955
Length of Stay (Days),6.050152,4.784932,4.704701,3.811554,4.886666,3.904016
First Potassium Days From Admit,0.545432,0.550355,0.443750,0.471181,0.536989,0.474740
...,...,...,...,...,...,...
Hx_Aortic_Valve_Problem,0.232713,0.172109,0.000000,0.000000,0.422701,0.377518
Hx_Prior_Admit,0.930851,0.865986,1.000000,1.000000,0.253792,0.340706
Gender_MALE,0.483378,0.492063,0.000000,0.000000,0.499890,0.499994
Race Simplified_Other,0.013963,0.014512,0.000000,0.000000,0.117375,0.119604


In [32]:
features_to_analyze = df_encoded.columns
p_values = []
for feature in features_to_analyze :
    stat, p = stats.mannwhitneyu(true_samples[feature].dropna(), false_samples[feature].dropna())
    p_values.append(p)

descriptive_stats['P_value'] = p_values
descriptive_stats['Significant_Difference'] = descriptive_stats['P_value'].apply(lambda p: 1 if p < 0.05 else 0)

descriptive_stats[descriptive_stats['Significant_Difference']!=1].T

Unnamed: 0,Patient_ID,First Potassium Days From Admit,First Respiritory Rate Result,First Respiritory Rate Days From Admit,Last Respiritory Rate Result,First Hemoglobin Days From Admit,First Creatinine Days From Admit,Max Sodium Result,First Bnp Days From Admit,Last Bnp Days From Admit,Min Bnp Days From Admit,First Weight Days From Admit,First Troponin I Days From Admit,Hx_Cabg,Hx_Dm,Gender_MALE,Race Simplified_Other,Race Simplified_White
True_Mean,3733.264628,0.545432,20.171582,0.065665,18.51574,0.674436,0.550161,140.306137,1.337489,2.081111,1.704154,0.456696,0.45468,0.093085,0.527926,0.483378,0.013963,0.674202
False_Mean,3797.092063,0.550355,20.127426,0.064064,18.452011,0.632572,0.548635,140.381368,1.082933,1.73648,1.533831,0.41712,0.370196,0.079365,0.53356,0.492063,0.014512,0.647392
True_Median,3736.5,0.44375,20.0,0.022222,18.0,0.517361,0.448611,141.0,0.456246,0.645833,0.558511,0.127778,0.226389,0.0,1.0,0.0,0.0,1.0
False_Median,3808.5,0.471181,20.0,0.022917,18.0,0.50446,0.471528,141.0,0.316826,0.625307,0.538826,0.134056,0.2006,0.0,1.0,0.0,0.0,1.0
True_Std,2161.821761,0.536989,4.460128,0.16917,3.116685,0.812546,0.529154,4.147205,2.120831,3.06366,2.516515,1.028525,0.83316,0.290648,0.499386,0.49989,0.117375,0.468828
False_Std,2125.310761,0.47474,4.624475,0.130574,2.003421,0.690936,0.463593,3.674679,1.8621,2.659164,2.31351,0.923541,0.815116,0.270338,0.498929,0.499994,0.119604,0.477836
P_value,0.315221,0.091802,0.80079,0.255256,0.973608,0.423276,0.308494,0.460479,0.069793,0.144489,0.65738,0.919371,0.153144,0.095588,0.705333,0.560687,0.877125,0.05909
Significant_Difference,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
list(descriptive_stats[descriptive_stats['Significant_Difference']!=1].T.columns)

['Patient_ID',
 'First Potassium Days From Admit',
 'First Respiritory Rate Result',
 'First Respiritory Rate Days From Admit',
 'Last Respiritory Rate Result',
 'First Hemoglobin Days From Admit',
 'First Creatinine Days From Admit',
 'Max Sodium Result',
 'First Bnp Days From Admit',
 'Last Bnp Days From Admit',
 'Min Bnp Days From Admit',
 'First Weight Days From Admit',
 'First Troponin I Days From Admit',
 'Hx_Cabg',
 'Hx_Dm',
 'Gender_MALE',
 'Race Simplified_Other',
 'Race Simplified_White']

In [35]:
p_value_df_sorted = descriptive_stats.sort_values(by='P_value', ascending=True).head(20)

p_value_df_sorted

Unnamed: 0,True_Mean,False_Mean,True_Median,False_Median,True_Std,False_Std,P_value,Significant_Difference
1Yr_Death,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
Number of ED Vists in Last 6 mo,2.173537,1.530839,2.0,1.0,1.989612,1.642202,9.333596e-41,1
Number of Outpatient Visits Last Year,13.891622,9.2322,9.0,5.0,16.497653,12.561215,1.116706e-36,1
Number of Admission Last Year,1.90758,1.22381,1.0,1.0,2.244555,1.675007,8.755059e-33,1
Last Hemoglobin Result,10.363867,11.085959,10.1,10.9,2.015669,2.158263,2.266896e-26,1
Last Respiritory Rate Days From Admit,5.918085,4.617543,4.568812,3.627778,4.964045,3.908536,1.066573e-25,1
Length of Stay (Days),6.050152,4.784932,4.704701,3.811554,4.886666,3.904016,5.851757000000001e-25,1
Min Weight Result,82.083747,90.471401,77.1991,85.7289,26.604243,28.962743,2.629341e-24,1
Max Potassium Result,4.609384,4.414895,4.5,4.3,0.659809,0.633435,1.5041480000000002e-23,1
Last Potassium Days From Admit,5.46013,4.221416,4.07601,3.207103,4.93546,3.90113,1.1571600000000001e-22,1
