In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from scipy.stats import gaussian_kde
from scipy.stats import norm
from scipy import integrate
from sklearn.neighbors import KernelDensity
import scipy.stats

np.random.seed(20200619)

lab_ranges = {'ALBUMIN':        [3.5, 5.5],                
              'BICARBONATE':    [21,29],                     
              'BUN':            [10,20],                                          
              'CALCIUM':        [8.5,10.5],                                       
              'FREECALCIUM':    [1.05,1.37],                  
              'CHLORIDE':       [98,106],                       
              'MAGNESIUM':      [1.8,3],                   
              'PHOSPHATE':      [3,4.5],                     
              'HEMOGLOBIN':     [12,18],                   
              'SODIUM':         [136,145],                
              'CREATININE':     [0.5,1.5],                   
              'PLATELET':       [150,400],                  
              'POTASSIUM':      [3.3,5.5],                 
              'LACTATE':        [0.5,2.0],                      
              'WBC':            [4.5,11],                     
              'GLUCOSE':        [75.0, 115.0]            
             } 

lab_unit = {  'ALBUMIN':        'g/dL',          
              'BICARBONATE':    'mEq/L',           
              'BUN':            'mg/dL',                           
              'CALCIUM':        'mg/dL',                                
              'FREECALCIUM':    'mmol/L',        
              'CHLORIDE':       'mEq/L',             
              'MAGNESIUM':      'mg/dL',         
              'PHOSPHATE':      'mg/dL',           
              'HEMOGLOBIN':     'g/dL',          
              'SODIUM':         'mEq/L',        
              'CREATININE':     'mg/dL',           
              'PLATELET':       'K/uL',            
              'POTASSIUM':      'mEq/L',         
              'LACTATE':        'mmol/L',            
              'WBC':            'K/uL',            
              'GLUCOSE':        'mg/dL'         
             } 


data = pd.read_csv("S:/NUS/Year Two/UROPS/firstlab_xy.csv")

mort_data = data[data['mort_icu']==1]
surv_data = data[data['mort_icu']==0]


In [9]:
def ResumeNorm(val1, val2):
    norm_val1 = norm.ppf(0.025)
    norm_val2 = norm.ppf(0.975)
    mean = (val2 - val1)/2 + val1
    scale = (val1 - val2) / (norm_val1 - norm_val2)
    return norm(mean, scale)

def OverlapParam(dist1, dist2,lab):
    """
        Overlapping coefficient: integration of min value of two distribution over R. 
        For chloride, glucose, sodium, use large interval with focus point interval instead.
        func = lambda x: min(dist1.pdf(x), dist2.pdf(x))
        return integrate.quad(func, -np.inf , np.inf)
    """
   
    func = lambda x: min(dist1.pdf(x), dist2.pdf(x))
    if lab in ['CHLORIDE', 'GLUCOSE', 'SODIUM']:
        return integrate.quad(func, 0, 1000)
    else:
        return integrate.quad(func, -np.inf, np.inf)

def cohen_d(x,y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    return (np.mean(x) - np.mean(y)) / np.sqrt(((nx-1)*np.std(x, ddof=1) ** 2 + (ny-1)*np.std(y, ddof=1) ** 2) / dof)

In [4]:
c = ['labName','Best group Mean [IQR]','Worst group Mean [IQR]','P-Value']
df=pd.DataFrame(columns = c)
    
for i, u in iter(lab_unit.items()):
    
   

    # best vs worst
    
    n_surv = surv_data[i.lower() + '_min'].dropna().count()
    quarter = n_surv /4
    quarter = int(quarter)
    cond = data.mort_icu == 0
    all_patients = surv_data.sort_values(by=['los'])[i.lower()+'_min'].dropna()
    best_group = surv_data.sort_values(by=['los'])[i.lower()+'_min'].dropna().head(quarter)
    worst_group = surv_data.sort_values(by=['los'])[i.lower()+'_min'].dropna().tail(quarter)
    
    [min_q1, min_q2] = best_group.quantile(q = [0.25,0.75])
    min_bgm = best_group.mean()
    [min_p1, min_p2] = worst_group.quantile(q = [0.25,0.75])
    min_wgm = worst_group.mean()


    t_stat, p_val_min = mannwhitneyu(best_group, worst_group)
    
    df=df.append(pd.DataFrame([['{}_min:'.format(i), 
                                '{} , [{}-{}]'.format(format(min_bgm, '.2f'), min_q1, min_q2), 
                                '{} , [{}-{}]'.format(format(min_wgm, '.2f'), min_p1, min_p2), 
                                round(p_val_min,2)]], columns = c))
    
    
    n_surv = surv_data[i.lower() + '_max'].dropna().count()
    n_all = data[i.lower() + '_max'].dropna().count()
    quarter = n_surv /4
    quarter = int(quarter)
    cond = data.mort_icu == 0
    all_patients = surv_data.sort_values(by=['los'])[i.lower()+'_max'].dropna()
    best_group = surv_data.sort_values(by=['los'])[i.lower()+'_max'].dropna().head(quarter)
    worst_group = surv_data.sort_values(by=['los'])[i.lower()+'_max'].dropna().tail(quarter)
    
    [max_q1, max_q2] = best_group.quantile(q = [0.25,0.75])
    max_bgm = best_group.mean()
    [max_p1, max_p2] = worst_group.quantile(q = [0.25,0.75])
    max_wgm = worst_group.mean()

    t_stat, p_val_max = mannwhitneyu(best_group, worst_group)
    
    df=df.append(pd.DataFrame([['{}_max:'.format(i), 
                                '{} , [{}-{}]'.format(format(max_bgm, '.2f'), max_q1, max_q2), 
                                '{} , [{}-{}]'.format(format(max_wgm, '.2f'), max_p1, max_p2), 
                                round(p_val_max,2)]], columns = c))
print(df)
                                

    

    
        

            labName    Best group Mean [IQR]  Worst group Mean [IQR]  P-Value
0      ALBUMIN_min:         3.48 , [3.0-4.0]        3.02 , [2.5-3.5]     0.00
0      ALBUMIN_max:         3.55 , [3.1-4.0]        3.14 , [2.6-3.7]     0.00
0  BICARBONATE_min:      23.41 , [21.0-26.0]     21.89 , [19.0-25.0]     0.00
0  BICARBONATE_max:      25.74 , [24.0-28.0]     25.29 , [23.0-28.0]     0.00
0          BUN_min:      18.75 , [11.0-21.0]     23.70 , [12.0-28.0]     0.00
0          BUN_max:      22.81 , [13.0-25.0]     29.54 , [15.0-35.0]     0.00
0      CALCIUM_min:         8.34 , [7.9-8.8]        7.96 , [7.4-8.6]     0.00
0      CALCIUM_max:         8.71 , [8.2-9.1]        8.60 , [8.1-9.1]     0.00
0  FREECALCIUM_min:       1.07 , [1.02-1.12]      1.03 , [0.97-1.11]     0.00
0  FREECALCIUM_max:       1.21 , [1.14-1.27]      1.19 , [1.12-1.24]     0.00
0     CHLORIDE_min:   102.22 , [100.0-105.0]   102.00 , [99.0-106.0]     0.00
0     CHLORIDE_max:  106.53 , [103.25-110.0]  107.78 , [104.0-11

In [15]:

c = ['labName','OVL','SMD','P-Value']
#all vs normal 
df=pd.DataFrame(columns = c)
for i, u in iter(lab_unit.items()):
    all_patients = data.sort_values(by=['los'])[i.lower()+'_min'].dropna()
    n_all = data[i.lower() + '_min'].dropna().count()

    norm_dist = ResumeNorm(lab_ranges[i][0], lab_ranges[i][1])

    norm_sample = norm_dist.rvs(size=n_all, random_state=20200704)

    t_stat, p_val_min = mannwhitneyu(all_patients, norm_sample)

    #standardized mean difference
    smd_min = cohen_d(all_patients, norm_sample)

    #overlapping parameter
    all_patients_dist = gaussian_kde(all_patients)
    ovl_min = OverlapParam(all_patients_dist, norm_dist,i)

    df=df.append(pd.DataFrame([['{}_min:'.format(i), 
                            round(max(ovl_min),2), 
                            round(smd_min,2), 
                            round(p_val_min,2)]], columns = c))



    all_patients = data.sort_values(by=['los'])[i.lower()+'_max'].dropna()

    norm_dist = ResumeNorm(lab_ranges[i][0], lab_ranges[i][1])

    norm_sample = norm_dist.rvs(size=n_all, random_state=20200704)

    t_stat, p_val_max = mannwhitneyu(all_patients, norm_sample)
    #standardized mean difference
    smd_max = cohen_d(all_patients, norm_sample)

    #overlapping parameter
    all_patients_dist = gaussian_kde(all_patients)
    ovl_max = OverlapParam(all_patients_dist, norm_dist,i)

    df=df.append(pd.DataFrame([['{}_max:'.format(i), 
                            round(max(ovl_max),2), 
                            round(smd_max,2),
                            round(p_val_max,2)]], columns = c))
    
print(df)

            labName   OVL   SMD  P-Value
0      ALBUMIN_min:  0.31 -2.01      0.0
0      ALBUMIN_max:  0.35 -1.88      0.0
0  BICARBONATE_min:  0.60 -0.74      0.0
0  BICARBONATE_max:  0.74  0.12      0.0
0          BUN_min:  0.47  0.51      0.0
0          BUN_max:  0.44  0.77      0.0
0      CALCIUM_min:  0.29 -1.89      0.0
0      CALCIUM_max:  0.46 -1.15      0.0
0  FREECALCIUM_min:  0.36 -1.57      0.0
0  FREECALCIUM_max:  0.59  0.01      0.0
0     CHLORIDE_min:  0.60  0.01      0.0
0     CHLORIDE_max:  0.37  1.19      0.0
0    MAGNESIUM_min:  0.36 -1.67      0.0
0    MAGNESIUM_max:  0.63 -0.48      0.0
0    PHOSPHATE_min:  0.47 -0.60      0.0
0    PHOSPHATE_max:  0.56  0.20      0.0
0   HEMOGLOBIN_min:  0.21 -2.48      0.0
0   HEMOGLOBIN_max:  0.46 -1.40      0.0
0       SODIUM_min:  0.49 -1.09      0.0
0       SODIUM_max:  0.81 -0.03      0.0
0   CREATININE_min:  0.66  0.19      0.0
0   CREATININE_max:  0.52  0.15      0.0
0     PLATELET_min:  0.53 -0.87      0.0
0     PLATELET_m

In [17]:
# best vs normal
c = ['labName','OVL','SMD','P-Value']
#all vs normal
df=pd.DataFrame(columns = c)
    
for i, u in iter(lab_unit.items()):
 
    n_surv = surv_data[i.lower() + '_min'].dropna().count()
    quarter = n_surv /4
    quarter = int(quarter)
    best_group = surv_data.sort_values(by=['los'])[i.lower()+'_min'].dropna().head(quarter)
  
    norm_dist = ResumeNorm(lab_ranges[i][0], lab_ranges[i][1])

    norm_sample = norm_dist.rvs(size = quarter,random_state=20200620)

    t_stat, p_val_min = mannwhitneyu(best_group, norm_sample)
    
    #standardized mean difference
    smd_min = cohen_d(best_group, norm_sample)
        
    #overlapping parameter
    best_group_dist = gaussian_kde(best_group)
    ovl_min = OverlapParam(best_group_dist, norm_dist,i)
    
    df=df.append(pd.DataFrame([['{}_min:'.format(i), 
                                round(max(ovl_min),2), 
                                round(smd_min,2), 
                                round(p_val_min,2)]], columns = c))

    
    n_surv = surv_data[i.lower() + '_max'].dropna().count()
    quarter = n_surv /4
    quarter = int(quarter)
    cond = data.mort_icu == 0
    all_patients = surv_data.sort_values(by=['los'])[i.lower()+'_max'].dropna()
    best_group = surv_data.sort_values(by=['los'])[i.lower()+'_max'].dropna().head(quarter)
    
    [max_q1, max_q2] = best_group.quantile(q = [0.25,0.75])
    max_bgm = best_group.mean()
    [max_p1, max_p2] = worst_group.quantile(q = [0.25,0.75])
    max_wgm = worst_group.mean()

    norm_dist = ResumeNorm(lab_ranges[i][0], lab_ranges[i][1])

    norm_sample = norm_dist.rvs(size = quarter, random_state=20100704)

    t_stat, p_val_max = mannwhitneyu(best_group, norm_sample)
        #standardized mean difference
    smd_max = cohen_d(best_group, norm_sample)
        
    #overlapping parameter
    best_group_dist = gaussian_kde(best_group)
    ovl_max = OverlapParam(best_group_dist, norm_dist,i)
    
    df=df.append(pd.DataFrame([['{}_max:'.format(i), 
                                round(max(ovl_max),2), 
                                round(smd_max,2), 
                                round(p_val_max,2)]], columns = c))

print(df)
    
    

            labName   OVL   SMD  P-Value
0      ALBUMIN_min:  0.40 -1.66     0.00
0      ALBUMIN_max:  0.43 -1.60     0.00
0  BICARBONATE_min:  0.71 -0.51     0.00
0  BICARBONATE_max:  0.78  0.26     0.00
0          BUN_min:  0.50  0.35     0.01
0          BUN_max:  0.49  0.61     0.00
0      CALCIUM_min:  0.34 -1.76     0.00
0      CALCIUM_max:  0.49 -1.24     0.00
0  FREECALCIUM_min:  0.37 -1.60     0.00
0  FREECALCIUM_max:  0.87  0.00     0.00
0     CHLORIDE_min:  0.64  0.05     0.00
0     CHLORIDE_max:  0.40  1.15     0.00
0    MAGNESIUM_min:  0.36 -1.70     0.00
0    MAGNESIUM_max:  0.56 -0.87     0.00
0    PHOSPHATE_min:  0.52 -0.64     0.00
0    PHOSPHATE_max:  0.61 -0.00     0.00
0   HEMOGLOBIN_min:  0.26 -2.23     0.00
0   HEMOGLOBIN_max:  0.49 -1.39     0.00
0       SODIUM_min:  0.53 -1.07     0.00
0       SODIUM_max:  0.86 -0.14     0.00
0   CREATININE_min:  0.68  0.08     0.00
0   CREATININE_max:  0.74  0.26     0.00
0     PLATELET_min:  0.58 -0.76     0.00
0     PLATELET_m

In [18]:
# worst vs normal 
c = ['labName','OVL','SMD','P-Value']
#all vs normal
df=pd.DataFrame(columns = c)
    
for i, u in iter(lab_unit.items()):

    worst_group = mort_data.sort_values(by=['los'])[i.lower()+'_min'].dropna()
    n_mort = mort_data[i.lower() + '_min'].dropna().count()

    norm_dist = ResumeNorm(lab_ranges[i][0], lab_ranges[i][1])

    norm_sample = norm_dist.rvs(size=n_mort, random_state=20200704)

    t_stat, p_val_min = mannwhitneyu(worst_group, norm_sample)
    
    #standardized mean difference
    smd_min = cohen_d(worst_group, norm_sample)
        
    #overlapping parameter
    worst_group_dist = gaussian_kde(worst_group)
    ovl_min = OverlapParam(worst_group_dist, norm_dist,i)
    
    df=df.append(pd.DataFrame([['{}_min:'.format(i), 
                                round(max(ovl_min),2), 
                                round(smd_min,2), 
                                round(p_val_min,2)]], columns = c))
    
    
    worst_group = surv_data.sort_values(by=['los'])[i.lower()+'_max'].dropna()
    n_mort = data[i.lower() + '_max'].dropna().count()
    
    norm_dist = ResumeNorm(lab_ranges[i][0], lab_ranges[i][1])

    norm_sample = norm_dist.rvs(size=n_mort, random_state=20200704)

    t_stat, p_val_max = mannwhitneyu(worst_group, norm_sample)
        #standardized mean difference
    smd_max = cohen_d(worst_group, norm_sample)
        
    #overlapping parameter
    worst_group_dist = gaussian_kde(worst_group)
    ovl_max = OverlapParam(worst_group_dist, norm_dist,i)
    
    df=df.append(pd.DataFrame([['{}_max:'.format(i), 
                                round(max(ovl_max),2), 
                                round(smd_max,2), 
                                round(p_val_max,2)]], columns = c))
print(df)
    
    



            labName   OVL   SMD  P-Value
0      ALBUMIN_min:  0.19 -2.60      0.0
0      ALBUMIN_max:  0.36 -1.86      0.0
0  BICARBONATE_min:  0.38 -1.21      0.0
0  BICARBONATE_max:  0.76  0.17      0.0
0          BUN_min:  0.32  0.98      0.0
0          BUN_max:  0.45  0.75      0.0
0      CALCIUM_min:  0.24 -1.93      0.0
0      CALCIUM_max:  0.46 -1.21      0.0
0  FREECALCIUM_min:  0.36 -1.50      0.0
0  FREECALCIUM_max:  0.57  0.01      0.0
0     CHLORIDE_min:  0.51 -0.13      0.0
0     CHLORIDE_max:  0.37  1.24      0.0
0    MAGNESIUM_min:  0.39 -1.51      0.0
0    MAGNESIUM_max:  0.62 -0.54      0.0
0    PHOSPHATE_min:  0.41 -0.09      0.0
0    PHOSPHATE_max:  0.58  0.13      0.0
0   HEMOGLOBIN_min:  0.21 -2.48      0.0
0   HEMOGLOBIN_max:  0.46 -1.40      0.0
0       SODIUM_min:  0.49 -0.95      0.0
0       SODIUM_max:  0.83 -0.05      0.0
0   CREATININE_min:  0.57  0.55      0.0
0   CREATININE_max:  0.72  0.38      0.0
0     PLATELET_min:  0.48 -0.94      0.0
0     PLATELET_m