In [28]:
# Import tutto quanto
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import copy

In [29]:
# LOADING DATA

df=pd.read_csv("ICU_Challenge_Dataset.csv", sep=",")
df_description = df.describe()
columns = df.columns
columns

Index(['recordid', 'SAPS-I', 'SOFA', 'In-hospital_death', 'Age', 'Gender',
       'Height', 'Weight', 'CCU', 'CSRU',
       ...
       'Platelets_last', 'TroponinI_last', 'TroponinT_last', 'WBC_last',
       'Weight_last', 'pH_last', 'MechVentStartTime', 'MechVentDuration',
       'MechVentLast8Hour', 'UrineOutputSum'],
      dtype='object', length=121)

In [30]:
# MISSING DATA - da esplorare
# print(df[df.columns[df.isna().any()]])
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
print(missing_value_df)

                         column_name  percent_missing
recordid                    recordid         0.000000
SAPS-I                        SAPS-I         0.000000
SOFA                            SOFA         0.000000
In-hospital_death  In-hospital_death         0.000000
Age                              Age         0.000000
...                              ...              ...
TroponinT_first      TroponinT_first        77.766667
Cholesterol_last    Cholesterol_last        92.133333
Cholesterol_first  Cholesterol_first        92.133333
TroponinI_last        TroponinI_last        95.250000
TroponinI_first      TroponinI_first        95.250000

[121 rows x 2 columns]


In [31]:
df_col_keep = missing_value_df[missing_value_df['percent_missing']<50]
df_col = df_col_keep['column_name']

df=df[df_col]
df

Unnamed: 0,recordid,SAPS-I,SOFA,In-hospital_death,Age,CCU,CSRU,SICU,Gender,BUN_first,...,Weight_first,FiO2_last,FiO2_first,MechVentDuration,MechVentStartTime,UrineOutputSum,MechVentLast8Hour,Lactate_first,Lactate_last,Height
0,132539,6,1,0,54.0,0,0,1,0.0,13.0,...,,,,,,,,,,
1,132540,16,8,0,76.0,0,1,0,1.0,16.0,...,80.6,0.40,1.0,360.0,71.0,5.0,0.0,,,175.3
2,132541,21,11,0,44.0,0,0,0,0.0,8.0,...,56.7,0.40,1.0,2160.0,617.0,14.0,1.0,1.3,0.9,
3,132543,7,1,0,68.0,0,0,0,1.0,23.0,...,84.6,,,,,,,,,180.3
4,132545,17,2,0,88.0,0,0,0,0.0,45.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,151939,19,6,1,52.0,0,0,1,1.0,11.0,...,,0.40,0.6,2440.0,238.0,17.0,1.0,,,
5996,151383,13,2,0,90.0,1,0,0,0.0,20.0,...,72.9,1.00,1.0,,,,,,,157.5
5997,149623,12,9,0,72.0,1,0,0,0.0,62.0,...,,0.35,0.5,1200.0,188.0,10.0,0.0,1.4,1.4,
5998,145366,18,11,1,52.0,0,0,0,1.0,16.0,...,95.9,0.50,1.0,2760.0,100.0,27.0,1.0,9.9,8.2,182.9


In [32]:
print(df['In-hospital_death'].value_counts())

0    5162
1     838
Name: In-hospital_death, dtype: int64


In [33]:
print(np.unique(df.dtypes))

int_var = list(df.columns[df.dtypes == np.int64])
float_var = list(df.columns[df.dtypes == np.float64])

cat_var = []
num_var = []
for cat in list(df.columns):
    if len(df[cat].unique()) <=3: #conta anche i Nan come valori
        cat_var.append(cat)
    else:
        num_var.append(cat)

categorical_df = df[cat_var]
numerical_df = df[num_var]

[dtype('int64') dtype('float64')]


In [34]:
#Normality test
#Hp0: normal distribution (p-value<0.05 => reject it)
import scipy.stats as sci
for x in list(numerical_df.columns):
    y=sci.normaltest(numerical_df[x], nan_policy='omit')
    print(x, '\n', y, '\n')

recordid 
 NormaltestResult(statistic=842.8952779362795, pvalue=9.281453324358711e-184) 

SAPS-I 
 NormaltestResult(statistic=178.46362453579428, pvalue=1.7665109011997907e-39) 

SOFA 
 NormaltestResult(statistic=145.41575069788357, pvalue=2.650763275105637e-32) 

Age 
 NormaltestResult(statistic=359.68535645125036, pvalue=7.858094294744039e-79) 

BUN_first 
 NormaltestResult(statistic=3229.820965379643, pvalue=0.0) 

Creatinine_last 
 NormaltestResult(statistic=4722.975745445371, pvalue=0.0) 

BUN_last 
 NormaltestResult(statistic=2725.806709376353, pvalue=0.0) 

Creatinine_first 
 NormaltestResult(statistic=5735.2033386169605, pvalue=0.0) 

HCT_first 
 NormaltestResult(statistic=44.06438668636649, pvalue=2.701095919471259e-10) 

HCT_last 
 NormaltestResult(statistic=375.8779347073141, pvalue=2.3941069297250362e-82) 

HR_first 
 NormaltestResult(statistic=296.5261930061873, pvalue=4.075259972210311e-65) 

HR_median 
 NormaltestResult(statistic=65.8838788512379, pvalue=4.93739058159336

In [35]:
#For non normal (all in this case) -> Wilcoxon-Mann-Whitney test
#Hp0: are they the same?

print('BUN\n', sci.mannwhitneyu(numerical_df['BUN_first'], numerical_df['BUN_last'], nan_policy='omit'))
#same
print('\n\nCreatinine\n', sci.mannwhitneyu(numerical_df['Creatinine_first'], numerical_df['Creatinine_last'], nan_policy='omit'))
#same
print('\n\nHCT\n', sci.mannwhitneyu(numerical_df['HCT_first'], numerical_df['HCT_last'], nan_policy='omit'))
#different
print('\n\nHR\n', sci.mannwhitneyu(numerical_df['HR_first'], numerical_df['HR_last'], nan_policy='omit'))
#different
print('\n\nTemp\n', sci.mannwhitneyu(numerical_df['Temp_first'], numerical_df['Temp_last'], nan_policy='omit'))
#different
print('\n\nGCS\n', sci.mannwhitneyu(numerical_df['GCS_first'], numerical_df['GCS_last'], nan_policy='omit'))
#different
print('\n\nPlatelets\n', sci.mannwhitneyu(numerical_df['Platelets_first'], numerical_df['Platelets_last'], nan_policy='omit'))
#different
print('\n\nHCO3\n', sci.mannwhitneyu(numerical_df['HCO3_first'], numerical_df['HCO3_last'], nan_policy='omit'))
#different
print('\n\nNa\n', sci.mannwhitneyu(numerical_df['Na_first'], numerical_df['Na_last'], nan_policy='omit'))
#same
print('\n\nK\n', sci.mannwhitneyu(numerical_df['K_first'], numerical_df['K_last'], nan_policy='omit'))
#different
print('\n\nWBC\n', sci.mannwhitneyu(numerical_df['WBC_first'], numerical_df['WBC_last'], nan_policy='omit'))
#different
print('\n\nMg\n', sci.mannwhitneyu(numerical_df['Mg_first'], numerical_df['Mg_last'], nan_policy='omit'))
#different
print('\n\nGlucose\n', sci.mannwhitneyu(numerical_df['Glucose_first'], numerical_df['Glucose_last'], nan_policy='omit'))
#different
print('\n\nNISysABP\n', sci.mannwhitneyu(numerical_df['NISysABP_first'], numerical_df['NISysABP_last'], nan_policy='omit'))
#different
print('\n\nNIMAP\n', sci.mannwhitneyu(numerical_df['NIMAP_first'], numerical_df['NIMAP_last'], nan_policy='omit'))
#different
print('\n\nNIDiasABP\n', sci.mannwhitneyu(numerical_df['NIDiasABP_first'], numerical_df['NIDiasABP_last'], nan_policy='omit'))
#different
print('\n\npH\n', sci.mannwhitneyu(numerical_df['pH_first'], numerical_df['pH_last'], nan_policy='omit'))
#different
print('\n\nPaO2\n', sci.mannwhitneyu(numerical_df['PaO2_first'], numerical_df['PaO2_last'], nan_policy='omit'))
#different
print('\n\nPaCO2\n', sci.mannwhitneyu(numerical_df['PaCO2_first'], numerical_df['PaCO2_last'], nan_policy='omit'))
#different
print('\n\nMAP\n', sci.mannwhitneyu(numerical_df['MAP_first'], numerical_df['MAP_last'], nan_policy='omit'))
#different
print('\n\nSysABP\n', sci.mannwhitneyu(numerical_df['SysABP_first'], numerical_df['SysABP_last'], nan_policy='omit'))
#different
print('\n\nDiasABP\n', sci.mannwhitneyu(numerical_df['DiasABP_first'], numerical_df['DiasABP_last'], nan_policy='omit'))
#different
print('\n\nWeight\n', sci.mannwhitneyu(numerical_df['Weight_first'], numerical_df['Weight_last'], nan_policy='omit'))
#same
print('\n\nFiO2\n', sci.mannwhitneyu(numerical_df['FiO2_first'], numerical_df['FiO2_last'], nan_policy='omit'))
#different
print('\n\nLactate\n', sci.mannwhitneyu(numerical_df['Lactate_first'], numerical_df['Lactate_last'], nan_policy='omit'))
#different

BUN
 MannwhitneyuResult(statistic=17614605.0, pvalue=0.29981721794226535)


Creatinine
 MannwhitneyuResult(statistic=17557180.0, pvalue=0.4663447229665394)


HCT
 MannwhitneyuResult(statistic=19566419.0, pvalue=3.558726270824122e-31)


HR
 MannwhitneyuResult(statistic=19187529.5, pvalue=2.0600174229886776e-22)


Temp
 MannwhitneyuResult(statistic=12305599.0, pvalue=2.892439692928371e-166)


GCS
 MannwhitneyuResult(statistic=12970276.0, pvalue=4.004574406370745e-140)


Platelets
 MannwhitneyuResult(statistic=19870854.0, pvalue=1.479965468395165e-41)


HCO3
 MannwhitneyuResult(statistic=14709169.0, pvalue=4.7381289527075e-46)


Na
 MannwhitneyuResult(statistic=17335528.5, pvalue=0.9434850748661205)


K
 MannwhitneyuResult(statistic=18566664.0, pvalue=4.796755746113763e-14)


WBC
 MannwhitneyuResult(statistic=17789766.5, pvalue=0.0008012400962805928)


Mg
 MannwhitneyuResult(statistic=13532952.0, pvalue=4.08955520274117e-86)


Glucose
 MannwhitneyuResult(statistic=19876694.5, pvalue=1.883

In [36]:
df

Unnamed: 0,recordid,SAPS-I,SOFA,In-hospital_death,Age,CCU,CSRU,SICU,Gender,BUN_first,...,Weight_first,FiO2_last,FiO2_first,MechVentDuration,MechVentStartTime,UrineOutputSum,MechVentLast8Hour,Lactate_first,Lactate_last,Height
0,132539,6,1,0,54.0,0,0,1,0.0,13.0,...,,,,,,,,,,
1,132540,16,8,0,76.0,0,1,0,1.0,16.0,...,80.6,0.40,1.0,360.0,71.0,5.0,0.0,,,175.3
2,132541,21,11,0,44.0,0,0,0,0.0,8.0,...,56.7,0.40,1.0,2160.0,617.0,14.0,1.0,1.3,0.9,
3,132543,7,1,0,68.0,0,0,0,1.0,23.0,...,84.6,,,,,,,,,180.3
4,132545,17,2,0,88.0,0,0,0,0.0,45.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,151939,19,6,1,52.0,0,0,1,1.0,11.0,...,,0.40,0.6,2440.0,238.0,17.0,1.0,,,
5996,151383,13,2,0,90.0,1,0,0,0.0,20.0,...,72.9,1.00,1.0,,,,,,,157.5
5997,149623,12,9,0,72.0,1,0,0,0.0,62.0,...,,0.35,0.5,1200.0,188.0,10.0,0.0,1.4,1.4,
5998,145366,18,11,1,52.0,0,0,0,1.0,16.0,...,95.9,0.50,1.0,2760.0,100.0,27.0,1.0,9.9,8.2,182.9


In [46]:
df.drop(['BUN_first', 'Creatinine_first', 'Na_first', 'Weight_first'], axis=1)
df

Unnamed: 0,recordid,SAPS-I,SOFA,In-hospital_death,Age,CCU,CSRU,SICU,Gender,BUN_first,...,Weight_first,FiO2_last,FiO2_first,MechVentDuration,MechVentStartTime,UrineOutputSum,MechVentLast8Hour,Lactate_first,Lactate_last,Height
0,132539,6,1,0,54.0,0,0,1,0.0,13.0,...,,,,,,,,,,
1,132540,16,8,0,76.0,0,1,0,1.0,16.0,...,80.6,0.40,1.0,360.0,71.0,5.0,0.0,,,175.3
2,132541,21,11,0,44.0,0,0,0,0.0,8.0,...,56.7,0.40,1.0,2160.0,617.0,14.0,1.0,1.3,0.9,
3,132543,7,1,0,68.0,0,0,0,1.0,23.0,...,84.6,,,,,,,,,180.3
4,132545,17,2,0,88.0,0,0,0,0.0,45.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,151939,19,6,1,52.0,0,0,1,1.0,11.0,...,,0.40,0.6,2440.0,238.0,17.0,1.0,,,
5996,151383,13,2,0,90.0,1,0,0,0.0,20.0,...,72.9,1.00,1.0,,,,,,,157.5
5997,149623,12,9,0,72.0,1,0,0,0.0,62.0,...,,0.35,0.5,1200.0,188.0,10.0,0.0,1.4,1.4,
5998,145366,18,11,1,52.0,0,0,0,1.0,16.0,...,95.9,0.50,1.0,2760.0,100.0,27.0,1.0,9.9,8.2,182.9
