In [1]:
import sys
import pandas as pd
import datetime as datetime
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import OneHotEncoder
import matplotlib
import matplotlib.pyplot as plt
#import psycopg2
from scipy.stats import ks_2samp
import scipy.stats as scats
import visuals as vs
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split




# MIMIC-III Critical Care Database

MIMIC-III (Medical Information Mart for Intensive Care III) is a large, freely-available database comprising deidentified health-related data associated with over forty thousand patients who stayed in critical care units of the Beth Israel Deaconess Medical Center between 2001 and 2012.

The database includes information such as demographics, vital sign measurements made at the bedside (~1 data point per hour), laboratory test results, procedures, medications, caregiver notes, imaging reports, and mortality (both in and out of hospital).

MIMIC supports a diverse range of analytic studies spanning epidemiology, clinical decision-rule improvement, and electronic tool development. It is notable for three factors:

it is freely available to researchers worldwide
it encompasses a diverse and very large population of ICU patients
it contains high temporal resolution data including lab results, electronic documentation, and bedside monitor trends and waveforms.

Citations: 
MIMIC-III, a freely accessible critical care database. Johnson AEW, Pollard TJ, Shen L, Lehman L, Feng M, Ghassemi M, Moody B, Szolovits P, Celi LA, and Mark RG. Scientific Data (2016). DOI: 10.1038/sdata.2016.35. Available at: http://www.nature.com/articles/sdata201635

Pollard, T. J. & Johnson, A. E. W. The MIMIC-III Clinical Database http://dx.doi.org/10.13026/C2XW26 (2016).



# IMPORTING DATA
The mimic III database was downloaded and reconstructed locally using posgresql. The database was managed graphically using Portico. 
A query was run on the mimic III database to generate demographic data and data concerning hospital and ICU stays for patients diagnosed with sepsis according to the Angus criteria (Angus et al, 2001. Epidemiology of severe sepsis in the United States; http://www.ncbi.nlm.nih.gov/pubmed/11445675 )

The query was exported from Porticoto the file PTNT_DEMOG_ANGUS_FIRST24.csv. The data was read into a pandas dataframe lab_events.. The query was exported from Porticoto the file ADMISSIONS_ICUSTAY_SEPSIS.csv. The data was read into a pandas dataframe ptnt_demog 


In [22]:
ptnt_demog = pd.DataFrame.from_csv('PTNT_DEMOG_ANGUS_FIRST24.csv')
ptnt_demog.head()

Unnamed: 0_level_0,gender,dob,admittime,marital_status,ethnicity,insurance,hospital_expire_flag,icustay_id,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3,M,2025-04-11 00:00:00,2101-10-20 19:08:00,MARRIED,WHITE,Medicare,0,211552,MICU,1,0,1,1,1
21,M,2047-04-04 00:00:00,2134-09-11 12:17:00,MARRIED,WHITE,Medicare,0,217847,CCU,1,0,1,0,1
21,M,2047-04-04 00:00:00,2135-01-30 20:50:00,MARRIED,WHITE,Medicare,1,216859,MICU,1,1,1,0,1
31,M,2036-05-17 00:00:00,2108-08-22 23:27:00,MARRIED,WHITE,Medicare,1,254478,MICU,1,0,0,1,1
36,M,2061-08-17 00:00:00,2134-05-10 11:30:00,MARRIED,WHITE,Medicare,0,241249,MICU,1,0,0,1,1


Date and time data imported in string format is converted to pandas.datetime objects

In [4]:
ptnt_demog.columns


Index([u'gender', u'dob', u'admittime', u'marital_status', u'ethnicity',
       u'insurance', u'hospital_expire_flag', u'icustay_id', u'first_careunit',
       u'infection', u'explicit_sepsis', u'organ_dysfunction', u'mech_vent',
       u'angus'],
      dtype='object')

In [23]:
#transforms date time data imported in string format to pandas.datetime objects. 
dates_and_times = ['dob', 'admittime']
for thing in dates_and_times:
    ptnt_demog[thing] = pd.to_datetime(ptnt_demog[thing])
    
ptnt_demog.head()


Unnamed: 0_level_0,gender,dob,admittime,marital_status,ethnicity,insurance,hospital_expire_flag,icustay_id,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3,M,2025-04-11,2101-10-20 19:08:00,MARRIED,WHITE,Medicare,0,211552,MICU,1,0,1,1,1
21,M,2047-04-04,2134-09-11 12:17:00,MARRIED,WHITE,Medicare,0,217847,CCU,1,0,1,0,1
21,M,2047-04-04,2135-01-30 20:50:00,MARRIED,WHITE,Medicare,1,216859,MICU,1,1,1,0,1
31,M,2036-05-17,2108-08-22 23:27:00,MARRIED,WHITE,Medicare,1,254478,MICU,1,0,0,1,1
36,M,2061-08-17,2134-05-10 11:30:00,MARRIED,WHITE,Medicare,0,241249,MICU,1,0,0,1,1


In [24]:
ptnt_demog['subject_id'] = ptnt_demog.index
ptnt_demog.index = ptnt_demog['icustay_id']
ptnt_demog.drop(['icustay_id'], axis=1, inplace = True)
ptnt_demog.head()

Unnamed: 0_level_0,gender,dob,admittime,marital_status,ethnicity,insurance,hospital_expire_flag,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus,subject_id
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
211552,M,2025-04-11,2101-10-20 19:08:00,MARRIED,WHITE,Medicare,0,MICU,1,0,1,1,1,3
217847,M,2047-04-04,2134-09-11 12:17:00,MARRIED,WHITE,Medicare,0,CCU,1,0,1,0,1,21
216859,M,2047-04-04,2135-01-30 20:50:00,MARRIED,WHITE,Medicare,1,MICU,1,1,1,0,1,21
254478,M,2036-05-17,2108-08-22 23:27:00,MARRIED,WHITE,Medicare,1,MICU,1,0,0,1,1,31
241249,M,2061-08-17,2134-05-10 11:30:00,MARRIED,WHITE,Medicare,0,MICU,1,0,0,1,1,36


In [25]:
#for patient age, only the date information is needed so datetime data is converted to date
ptnt_demog['dob'] = ptnt_demog['dob'].apply(lambda x: x.date())
ptnt_demog['admittime'] = ptnt_demog['admittime'].apply(lambda x: x.date())
ptnt_demog[['dob', 'admittime']].head()

Unnamed: 0_level_0,dob,admittime
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1
211552,2025-04-11,2101-10-20
217847,2047-04-04,2134-09-11
216859,2047-04-04,2135-01-30
254478,2036-05-17,2108-08-22
241249,2061-08-17,2134-05-10


In [26]:
#dates_and_times = ['dob', 'dod', 'admittime', 'dischtime', 'deathtime', 'intime', 'outtime']
#checking for null values in the data
for item in dates_and_times:
    isanan = ptnt_demog[item].isnull().values.any()
    print "{}   {}".format(item, isanan)

dob   False
admittime   False


# Calculating Patient Age, Hospital and ICU Stays
The age of patient at the time of admission was calculated. Also calculated was the duration of 
hospital stay and duration of ICU stay. 
    

In [27]:
ptnt_demog.head()

Unnamed: 0_level_0,gender,dob,admittime,marital_status,ethnicity,insurance,hospital_expire_flag,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus,subject_id
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
211552,M,2025-04-11,2101-10-20,MARRIED,WHITE,Medicare,0,MICU,1,0,1,1,1,3
217847,M,2047-04-04,2134-09-11,MARRIED,WHITE,Medicare,0,CCU,1,0,1,0,1,21
216859,M,2047-04-04,2135-01-30,MARRIED,WHITE,Medicare,1,MICU,1,1,1,0,1,21
254478,M,2036-05-17,2108-08-22,MARRIED,WHITE,Medicare,1,MICU,1,0,0,1,1,31
241249,M,2061-08-17,2134-05-10,MARRIED,WHITE,Medicare,0,MICU,1,0,0,1,1,36


In [28]:
'''the code calculates age of patient at time of admission, hospital stay and ICU stay. 
    using relativedelta.years to get the age rounded to years,
    relativedelta.days to get hospital stay in days and relativedelta.hours
    to get icu stay in hours.
    '''
# age at time of admission is calculated as admittime - dob
for index, row in ptnt_demog.iterrows():
    age_val = relativedelta(row['admittime'], row['dob']).years
    ptnt_demog.set_value(index, 'age', age_val)
'''        
    #adm_stay in days
    adm_stay_val = relativedelta(row['dischtime'],row['admittime'])
    adm_stay_val = adm_stay_val.weeks*7 + adm_stay_val.days
    #icu_stay in hours
    icu_stay_val = relativedelta(row['outtime'],row['intime'])
    icu_stay_val = icu_stay_val.weeks*7*24 + icu_stay_val.days*24 + icu_stay_val.hours
    ptnt_demog.set_value(index, 'adm_stay_days', adm_stay_val)
    ptnt_demog.set_value(index, 'icu_stay_hours', icu_stay_val)
'''    
ptnt_demog.head()

Unnamed: 0_level_0,gender,dob,admittime,marital_status,ethnicity,insurance,hospital_expire_flag,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus,subject_id,age
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
211552,M,2025-04-11,2101-10-20,MARRIED,WHITE,Medicare,0,MICU,1,0,1,1,1,3,76.0
217847,M,2047-04-04,2134-09-11,MARRIED,WHITE,Medicare,0,CCU,1,0,1,0,1,21,87.0
216859,M,2047-04-04,2135-01-30,MARRIED,WHITE,Medicare,1,MICU,1,1,1,0,1,21,87.0
254478,M,2036-05-17,2108-08-22,MARRIED,WHITE,Medicare,1,MICU,1,0,0,1,1,31,72.0
241249,M,2061-08-17,2134-05-10,MARRIED,WHITE,Medicare,0,MICU,1,0,0,1,1,36,72.0


In [37]:
#relativedelta(ptnt_demog.iloc[0]['dod'], ptnt_demog.iloc[0]['dob']).years
cols = list(ptnt_demog.columns)
cols.insert(0, cols.pop(cols.index('subject_id')))
ptnt_demog = ptnt_demog[cols]
ptnt_demog.head()

Unnamed: 0_level_0,subject_id,gender,marital_status,ethnicity,insurance,hospital_expire_flag,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus,age
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
211552,3,M,MARRIED,WHITE,Medicare,0,MICU,1,0,1,1,1,76.0
217847,21,M,MARRIED,WHITE,Medicare,0,CCU,1,0,1,0,1,87.0
216859,21,M,MARRIED,WHITE,Medicare,1,MICU,1,1,1,0,1,87.0
254478,31,M,MARRIED,WHITE,Medicare,1,MICU,1,0,0,1,1,72.0
241249,36,M,MARRIED,WHITE,Medicare,0,MICU,1,0,0,1,1,72.0


# Reality Check on Ages and Durations of Stay
Checking durations of stay and ages on low end for values < 0. Checking ages on the high end at 110yrs. 
Because I'm not sure what an unreasonable durations of stay would be for ICU or hospital stay will look for 
outliers in the data using statistical analysis later in preprocessing. 


In [38]:
ptnt_demog[ptnt_demog['age'] <  0]

Unnamed: 0_level_0,subject_id,gender,marital_status,ethnicity,insurance,hospital_expire_flag,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus,age
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


## There are approximately 93 patients whos calculated age is > 110. All are > 300yrs. 

In [39]:
age_replace_vals = list(ptnt_demog[ptnt_demog['age'] > 110]['age'].unique())


[]

In [41]:
ptnt_demog['age'].replace(age_replace_vals, np.nan, inplace = True)
ptnt_demog['age'].head()

AttributeError: 'NoneType' object has no attribute 'any'

In [42]:
#checking again for age values > 110
ptnt_demog[ptnt_demog['age'] > 110]

Unnamed: 0_level_0,subject_id,gender,marital_status,ethnicity,insurance,hospital_expire_flag,first_careunit,infection,explicit_sepsis,organ_dysfunction,mech_vent,angus,age
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


## Dropping columns that will not be used in analysis. 
These columns were used to calculate columns like age, admissions stay and ICU stay and are no longer useful. 
Except admittime which may be used to determine whether survival rates have changed over time. 

In [43]:
# and want to include the subject id as index so i'm creating ptnt_demog2 as a copy, then dropping columns from 
# ptnt_demog in place as i don't need the subject_id for the current analysis. 
#ptnt_demog2 = ptnt_demog.drop(['dob', 'expire_flag', 'dod', 'dischtime', 'deathtime', 'intime', 'outtime'],axis = 1)
ptnt_demog.drop(['dob','admittime' ],axis = 1, inplace = True)
ptnt_demog.head()

ValueError: labels ['dob' 'admittime'] not contained in axis

In [None]:
'''
Sample code that was used to manually calculate dates and times from strings
def to_timedelta(time_string): 
    age = time_string.split()
    days = age[0]
    hms = age[2].split(":")
    print hms
    delta = timedelta(days = int(days), hours = int(hms[0]), minutes = int(hms[1]), seconds = int(hms[2]))
    return delta
'''
'''this was code to manually convert the dates in the file to datetime objects
age = ptnt_demog.age.iloc[0]
print age
dummy = age.split()
print dummy
days = dummy[0]
print days
hms = dummy[2].split(":")
print hms

datetime_object = datetime.strptime(dob, '%Y %m %d %H:%M:%S')




and these are scraps of where i figured out i can just use to_datetime
ptnt_demog['dod2'] = pd.to_datetime(ptnt_demog['dod']) 
ptnt_demog.head()
onedate = to_timedelta(ptnt_demog.age.iloc[0])
otherdate =  to_timedelta(ptnt_demog.age.iloc[1])
print onedate - otherdate
print onedate

'''
'''example code for timedelta
from datetime import datetime, timedelta
# we specify the input and the format...
t = datetime.strptime("05:20:25","%H:%M:%S")
# ...and use datetime's hour, min and sec properties to build a timedelta
delta = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)


#durations like icu stay etc are imported as strings. the code below 
#converts durations in strings to timedelta for use as variable
from datetime import timedelta
from datetime import datetime

delta = timedelta(days = int(duration[0]), hours = int(duration[1]), minutes = int(duration[2]), seconds = int(duration[3]))
print delta
'''

In [44]:
ptnt_demog.columns

Index([u'subject_id', u'gender', u'marital_status', u'ethnicity', u'insurance',
       u'hospital_expire_flag', u'first_careunit', u'infection',
       u'explicit_sepsis', u'organ_dysfunction', u'mech_vent', u'angus',
       u'age'],
      dtype='object')

In [None]:
ptnt_demog.drop('admittime', axis = 1, inplace = True)
ptnt_demog2.drop('admittime', axis = 1, inplace = True)

In [45]:
ptnt_demog.to_csv('PTNT_DEMOG_ANGUS_rev.csv')

In [None]:
cat_vars = list(ptnt_demog.columns[ptnt_demog.dtypes == object])
num_vars = list(ptnt_demog.columns[ptnt_demog.dtypes == 'float64'])

In [None]:
cat_vars

In [None]:
for item in cat_vars:
    
    # plot
    plt.subplots(figsize=(13,6))
    ptnt_demog[item][ptnt_demog.hospital_expire_flag==1].value_counts().plot.bar(
        alpha=0.5,label= 'Non_Survival')
    ptnt_demog[item][ptnt_demog.hospital_expire_flag==0].value_counts().plot.bar(
        alpha=0.1,label= 'Survival')
    
    # fake plots for KS test, median, etc
    '''
    plt.plot([], label=' ',color='lightgray')
    plt.plot([], label='KS test: p={}'.format(format(p_val,'.3f')),
             color='lightgray')
    plt.plot([], label='Median (non-surv): {}'.format(format(med_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Median (surv): {}'.format(format(med_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (non-surv): {}'.format(format(var_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (surv): {}'.format(format(var_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (non-surv): {}'.format(format(skew_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (surv): {}'.format(format(skew_surv,'.2f')),
             color='lightgray')
    '''
    # add title, labels etc.
    plt.title(item)
    #plt.xlabel(l + ' ' + u)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
    #plt.xlim(0, data[l].dropna().quantile(0.99))
    
    # Add lab range if available
    #if l in lab_ranges:
    #    plt.axvline(lab_ranges[l][0],color='k',linestyle='--')
    #    plt.axvline(lab_ranges[l][1],color='k',linestyle='--')

In [None]:
for item in num_vars:
    
    # plot
    plt.subplots(figsize=(13,6))
    ptnt_demog2[item][ptnt_demog.hospital_expire_flag==1].plot.hist(
        alpha=0.5,label= 'Non_Survival')
    ptnt_demog2[item][ptnt_demog.hospital_expire_flag==0].plot.hist(
        alpha=0.5,label= 'Survival')
    
    # fake plots for KS test, median, etc
    '''
    plt.plot([], label=' ',color='lightgray')
    plt.plot([], label='KS test: p={}'.format(format(p_val,'.3f')),
             color='lightgray')
    plt.plot([], label='Median (non-surv): {}'.format(format(med_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Median (surv): {}'.format(format(med_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (non-surv): {}'.format(format(var_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (surv): {}'.format(format(var_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (non-surv): {}'.format(format(skew_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (surv): {}'.format(format(skew_surv,'.2f')),
             color='lightgray')
    '''
    # add title, labels etc.
    plt.title(item)
    #plt.xlabel(l + ' ' + u)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
    #plt.xlim(0, data[l].dropna().quantile(0.99))
    
    # Add lab range if available
    #if l in lab_ranges:
    #    plt.axvline(lab_ranges[l][0],color='k',linestyle='--')
    #    plt.axvline(lab_ranges[l][1],color='k',linestyle='--')

#code for converting sql dates to date-time objects
from datetime import datetime
dob = ptnt_demog.iloc[0].dob.split()
ymd = dob[0].split('-')
print ymd
ymd.append(dob[1])
print ymd
dob = " ".join(ymd)

datetime_object = datetime.strptime(dob, '%Y %m %d %H:%M:%S')

In [None]:
from sklearn import preprocessing as prp


In [None]:
ptnt_demog.dtypes

In [None]:
ptnt_demog2 = ptnt_demog.copy()

In [None]:
cat_vars = list(ptnt_demog2.columns[ptnt_demog2.dtypes == object])
cat_vars

In [None]:
monkey = pd.get_dummies(ptnt_demog2[cat_vars])

In [None]:
ptnt_demog2.drop(cat_vars, axis = 1, inplace = True)
ptnt_demog2.head()


In [None]:
ptnt_demog2 = ptnt_demog2.join(monkey)


In [None]:
cols = list(ptnt_demog2.columns)
cols.insert(0, cols.pop(cols.index('hospital_expire_flag')))
cols

In [None]:
ptnt_demog2 = ptnt_demog2[cols]
ptnt_demog2.head()

In [None]:
ptnt_demog2[ptnt_demog2.columns[:10]].groupby('hospital_expire_flag').hist(alpha = 0.5, figsize = (10, 10))

In [None]:
gr = ptnt_demog2.groupby('hospital_expire_flag')
for k, v in gr:
    v.plot(kind='hist', alpha=.5, legend = None)

## There appears to be very little observable difference between survivors and non-survivors in any of these variables. For this reason this analysis will be paused while a query is run to extract clinical data on septic patients. An initial exploration will then be performed on that data. 

In [None]:
ptnt_demog2.drop('admittime', axis=1, inplace = True)

In [None]:
cols=list(ptnt_demog2.columns[1:])
cols

In [None]:
monkey = cols[2]
monkey

In [None]:
ptnt_demog2.groupby('hospital_expire_flag')[monkey].hist(alpha = 0.3, figsize = (10, 10))

In [None]:
for item in cols:
    
    # plot
    plt.subplots(figsize=(13,6))
    ptnt_demog2[item][ptnt_demog2.hospital_expire_flag==1].plot.hist(
        alpha=0.5,label= 'Non_Survival')
    ptnt_demog2[item][ptnt_demog2.hospital_expire_flag==0].plot.hist(
        alpha=0.5,label= 'Survival')
    
    # fake plots for KS test, median, etc
    '''
    plt.plot([], label=' ',color='lightgray')
    plt.plot([], label='KS test: p={}'.format(format(p_val,'.3f')),
             color='lightgray')
    plt.plot([], label='Median (non-surv): {}'.format(format(med_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Median (surv): {}'.format(format(med_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (non-surv): {}'.format(format(var_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (surv): {}'.format(format(var_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (non-surv): {}'.format(format(skew_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (surv): {}'.format(format(skew_surv,'.2f')),
             color='lightgray')
    '''
    # add title, labels etc.
    plt.title(item)
    #plt.xlabel(l + ' ' + u)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
    #plt.xlim(0, data[l].dropna().quantile(0.99))
    
    # Add lab range if available
    #if l in lab_ranges:
    #    plt.axvline(lab_ranges[l][0],color='k',linestyle='--')
    #    plt.axvline(lab_ranges[l][1],color='k',linestyle='--')

In [None]:
display(ptnt_demog2.shape)
display(ptnt_demog2.dropna().shape)

In [None]:
ptnt_demog2.dropna(inplace = True)
ptnt_demog2.shape

In [None]:
ptnt_demog2_scaled = ptnt_demog2.copy()
for feature_name in ptnt_demog2_scaled.columns[1:]:
#    max_value = data2[feature_name].max()
#    min_value = data2[feature_name].min()
#    data2_norm[feature_name] = (data2[feature_name] - min_value) / (max_value - min_value)
    ptnt_demog2_scaled[feature_name] = preprocessing.scale(ptnt_demog2_scaled[feature_name], with_mean = True, 
                                                  with_std = True)
    
display(ptnt_demog2_scaled.head())

In [None]:
pca = PCA(n_components = 8).fit(ptnt_demog2_scaled[ptnt_demog2_scaled.columns[1:]])

# Generate PCA results plot
pca_results = vs.pca_results(ptnt_demog2_scaled[ptnt_demog2_scaled.columns[1:]], pca)



In [None]:
features = ptnt_demog2_scaled[ptnt_demog2_scaled.columns[1:]]
targets = ptnt_demog2_scaled[ptnt_demog2_scaled.columns[0]]
display(features.columns)
display(targets.name)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, 
                                                    test_size = 0.30, random_state = 42)

In [None]:
clf_SVC = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', 
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False).fit(X_train, y_train)

scores = cross_val_score(clf_SVC, features, targets, cv=5)
display(scores)  

In [None]:
y_predsSVC = clf_SVC.predict(X_test)
metrics.confusion_matrix(y_test, y_predsSVC)



In [None]:
#X_train, X_test, y_train, y_test = train_test_split(reduced_data, targets,
#                                                   test_size = 0.30, 
#                                                  random_state = 42)

clf_MLP = MLPClassifier(solver='sgd', alpha=1e-6, learning_rate = 'adaptive',
                    hidden_layer_sizes=(5, 2), random_state=1)

clf_MLP.fit(X_train, y_train)                         

display(clf_MLP.score(X_test, y_test))
y_predsMLP = clf_MLP.predict(X_test)
metrics.confusion_matrix(y_test, y_predsMLP)




In [None]:
### create classifier
clf_GNB = GaussianNB()
### fit the classifier on the training features and labels
clf_GNB.fit(X_train, y_train)
    ### return the fit classifier

nb_score = clf_GNB.score(X_test, y_test) 

print nb_score
y_predsGNB = clf_GNB.predict(X_test)
metrics.confusion_matrix(y_test, y_predsGNB)


