In [1]:
import sys
import pandas as pd
import datetime as datetime
import numpy as np
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import OneHotEncoder
import matplotlib
import matplotlib.pyplot as plt
#import psycopg2
from scipy.stats import ks_2samp
import scipy.stats as scats
import visuals as vs
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import yaml



# MIMIC-III Critical Care Database

MIMIC-III (Medical Information Mart for Intensive Care III) is a large, freely-available database comprising deidentified health-related data associated with over forty thousand patients who stayed in critical care units of the Beth Israel Deaconess Medical Center between 2001 and 2012.

The database includes information such as demographics, vital sign measurements made at the bedside (~1 data point per hour), laboratory test results, procedures, medications, caregiver notes, imaging reports, and mortality (both in and out of hospital).

MIMIC supports a diverse range of analytic studies spanning epidemiology, clinical decision-rule improvement, and electronic tool development. It is notable for three factors:

it is freely available to researchers worldwide
it encompasses a diverse and very large population of ICU patients
it contains high temporal resolution data including lab results, electronic documentation, and bedside monitor trends and waveforms.

Citations: 
MIMIC-III, a freely accessible critical care database. Johnson AEW, Pollard TJ, Shen L, Lehman L, Feng M, Ghassemi M, Moody B, Szolovits P, Celi LA, and Mark RG. Scientific Data (2016). DOI: 10.1038/sdata.2016.35. Available at: http://www.nature.com/articles/sdata201635

Pollard, T. J. & Johnson, A. E. W. The MIMIC-III Clinical Database http://dx.doi.org/10.13026/C2XW26 (2016).



# IMPORTING DATA
The mimic III database was downloaded and reconstructed locally using posgresql. The database was managed graphically using Portico. 
A query was run on the mimic III database to generate demographic data and data concerning hospital and ICU stays for patients diagnosed with sepsis according to the Angus criteria (Angus et al, 2001. Epidemiology of severe sepsis in the United States; http://www.ncbi.nlm.nih.gov/pubmed/11445675 )

The query was exported from Porticoto the file PTNT_DEMOG_FIRST24.csv. The data was read into a pandas dataframe lab_events.

In [2]:
# patient demographic data includes diagnoses and icd9 codes for each patient and each icustay

ptnt_demog = pd.DataFrame.from_csv('PTNT_DEMOG_FIRST24.csv')
ptnt_demog2 = ptnt_demog[~ptnt_demog.index.duplicated(keep='first')]



#ptnt_demog.loc[:,'icustay_id'] = ptnt_demog.index
#ptnt_demog.index = np.arange(ptnt_demog.shape[0])
#ptnt_demog.head()

print "patient demographics with unique icustays"
display(ptnt_demog2.head())

# create patient demographic table with unique icustays as rows

dates_and_times = ['dob', 'admittime', 'dischtime', 'intime', 'outtime', 'deathtime']
for thing in dates_and_times:
    ptnt_demog2.loc[:,thing] = pd.to_datetime(ptnt_demog2.loc[:,thing])

print "calculating ages, duration of stays"
# len(pd.date_range()) APPEARS TO TAKE A VERY LONG TIME
for index, row in ptnt_demog2.iterrows():
    if (pd.notnull(row['intime']) & pd.notnull(row['dob'])):
        #age_val = relativedelta(row['intime'], row['dob']).years
        # calculating year end frequency
        age_val = len(pd.date_range(end = row['intime'], start = row['dob'], freq = 'A'))
    else: 
        age_val = np.nan
    if (pd.notnull(row['intime']) & pd.notnull(row['outtime'])):
        #delta_time = relativedelta(row['outtime'], row['intime'])
        #icu_stay_val = delta_time.days*24 + delta_time.hours
        icu_stay_val = len(pd.date_range(end = row['outtime'], start = row['intime'], freq = 'H'))
    else: 
        icu_stay_val = np.nan
    if (pd.notnull(row['admittime']) & pd.notnull(row['dischtime'])):
        #delta_time = relativedelta(row['dischtime'], row['admittime'])
        #hosp_stay_val = delta_time.days*24 + delta_time.hours
        hosp_stay_val = len(pd.date_range(end = row['dischtime'], start = row['admittime'], freq = 'H'))
    else:
        hosp_stay_val = np.nan
    
    ptnt_demog2.set_value(index, 'age', age_val)
    ptnt_demog2.set_value(index, 'icu_stay', icu_stay_val)
    ptnt_demog2.set_value(index, 'hosp_stay', hosp_stay_val)

cols = list(ptnt_demog2.columns)
cols.pop(cols.index('icd9_code'))
cols.pop(cols.index('icd9_code.1'))
cols.pop(cols.index('short_title'))
cols.pop(cols.index('intime'))
cols.pop(cols.index('outtime'))
cols.pop(cols.index('admittime'))
cols.pop(cols.index('dischtime'))
cols.pop(cols.index('seq_num'))
cols.pop(cols.index('dob'))

#cols.insert(0, cols.pop(cols.index('icustay_id')))
cols.insert(0, cols.pop(cols.index('hadm_id')))
cols.insert(1, cols.pop(cols.index('age')))
cols.insert(2, cols.pop(cols.index('icu_stay')))
cols.insert(3, cols.pop(cols.index('hosp_stay')))
cols.insert(len(cols), cols.pop(cols.index('hospital_expire_flag')))
display(cols)


ptnt_demog2 = ptnt_demog2[cols]
print "patient demographics with calculated ages, duration of stays"
display(ptnt_demog2.head())

patient demographics with unique icustays


Unnamed: 0_level_0,hadm_id,subject_id,first_careunit,gender,marital_status,ethnicity,insurance,admission_type,admittime,dischtime,intime,outtime,deathtime,dob,hospital_expire_flag,icd9_code,icd9_code.1,short_title,seq_num
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
211552,145834,3,MICU,M,MARRIED,WHITE,Medicare,EMERGENCY,2101-10-20 19:08:00,2101-10-31 13:58:00,2101-10-20 19:10:11,2101-10-26 20:43:09,,2025-04-11 00:00:00,0,0389,0389,Septicemia NOS,1
294638,185777,4,MICU,F,SINGLE,WHITE,Private,EMERGENCY,2191-03-16 00:28:00,2191-03-23 18:41:00,2191-03-16 00:29:31,2191-03-17 16:46:31,,2143-05-12 00:00:00,0,2763,2763,Alkalosis,4
228232,107064,6,SICU,F,MARRIED,WHITE,Medicare,ELECTIVE,2175-05-30 07:15:00,2175-06-15 16:00:00,2175-05-30 21:30:54,2175-06-03 13:39:54,,2109-06-21 00:00:00,0,40391,40391,Hyp kid NOS w cr kid V,1
220597,150750,9,MICU,M,,UNKNOWN/NOT SPECIFIED,Medicaid,EMERGENCY,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-09 13:07:02,2149-11-14 20:52:14,2149-11-14 10:15:00,2108-01-26 00:00:00,1,431,431,Intracerebral hemorrhage,1
232669,112213,12,SICU,M,MARRIED,WHITE,Medicare,ELECTIVE,2104-08-07 10:15:00,2104-08-20 02:57:00,2104-08-08 02:08:17,2104-08-15 17:22:25,2104-08-20 02:57:00,2032-03-24 00:00:00,1,E8782,E8782,Abn react-anastom/graft,9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


calculating ages, duration of stays


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


['hadm_id',
 'age',
 'icu_stay',
 'hosp_stay',
 'subject_id',
 'first_careunit',
 'gender',
 'marital_status',
 'ethnicity',
 'insurance',
 'admission_type',
 'deathtime',
 'hospital_expire_flag']

patient demographics with calculated ages, duration of stays


Unnamed: 0_level_0,hadm_id,age,icu_stay,hosp_stay,subject_id,first_careunit,gender,marital_status,ethnicity,insurance,admission_type,deathtime,hospital_expire_flag
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
211552,145834,76.0,146.0,259.0,3,MICU,M,MARRIED,WHITE,Medicare,EMERGENCY,NaT,0
294638,185777,48.0,41.0,187.0,4,MICU,F,SINGLE,WHITE,Private,EMERGENCY,NaT,0
228232,107064,66.0,89.0,393.0,6,SICU,F,MARRIED,WHITE,Medicare,ELECTIVE,NaT,0
220597,150750,41.0,128.0,118.0,9,MICU,M,,UNKNOWN/NOT SPECIFIED,Medicaid,EMERGENCY,2149-11-14 10:15:00,1
232669,112213,72.0,184.0,305.0,12,SICU,M,MARRIED,WHITE,Medicare,ELECTIVE,2104-08-20 02:57:00,1


In [4]:
age_replace_vals = list(ptnt_demog2[ptnt_demog2['age'] > 110]['age'].unique())
display(age_replace_vals)

[300.0,
 305.0,
 302.0,
 301.0,
 303.0,
 310.0,
 306.0,
 307.0,
 304.0,
 311.0,
 308.0,
 309.0]

In [5]:
ptnt_demog2['age'].replace(age_replace_vals, np.nan, inplace = True)
ptnt_demog2['age'].dropna().describe()

count    41917.000000
mean        63.597204
std         15.815852
min         17.000000
25%         53.000000
50%         65.000000
75%         76.000000
max         89.000000
Name: age, dtype: float64

In [6]:
display(ptnt_demog2.icu_stay.dropna().describe())
display(ptnt_demog2.hosp_stay.dropna().describe())
display(ptnt_demog2[ptnt_demog2.icu_stay <= 0])
display(ptnt_demog2[ptnt_demog2.hosp_stay <=0])
icu_stay_low = list(ptnt_demog2[ptnt_demog2['icu_stay'] <=0]['icu_stay'].unique())
age_replace_vals = list(ptnt_demog2[ptnt_demog2['age'] > 110]['age'].unique())

#ptnt_demog2.loc[:,'subject_id'] = ptnt_demog2.index
#ptnt_demog2.index = ptnt_demog2.icustay_id
#ptnt_demog2.drop('icustay_id', axis = 1, inplace = True)
#ptnt_demog2.head()

count    44152.000000
mean       115.544505
std        156.791692
min         25.000000
25%         41.000000
50%         65.000000
75%        120.000000
max       4154.000000
Name: icu_stay, dtype: float64

count    44152.000000
mean       294.413345
std        327.554904
min          0.000000
25%        118.000000
50%        193.000000
75%        349.000000
max       7072.000000
Name: hosp_stay, dtype: float64

Unnamed: 0_level_0,hadm_id,age,icu_stay,hosp_stay,subject_id,first_careunit,gender,marital_status,ethnicity,insurance,admission_type,deathtime,hospital_expire_flag
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


Unnamed: 0_level_0,hadm_id,age,icu_stay,hosp_stay,subject_id,first_careunit,gender,marital_status,ethnicity,insurance,admission_type,deathtime,hospital_expire_flag
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
269115,146708,50.0,27.0,0.0,74937,MICU,F,MARRIED,WHITE,Private,EMERGENCY,2103-08-12,1


Date and time data imported in string format is converted to pandas.datetime objects

## code grabbed from benchmarking exercise

In [9]:
#phenotypes = add_hcup_ccs_2015_groups(diagnoses, yaml.load(open(args.phenotype_definitions, 'r')))
definitions = yaml.load(open('hcup_ccs_2015_definitions.yaml', 'r'))

diagnoses = ptnt_demog[['hadm_id', 'icd9_code', 'short_title']].copy()

# create mapping of hcup_ccs_2015_definitions to diagnoses icd9 codes
def_map = {}
for dx in definitions:
    for code in definitions[dx]['codes']:
        def_map[code] = (dx, definitions[dx]['use_in_benchmark'])

print "map created"
# map hcup_ccs_2015 definitions to icd9 diagnoses codes
diagnoses['HCUP_CCS_2015'] = diagnoses.icd9_code.apply(lambda c: def_map[c][0] if c in def_map else None)
diagnoses['USE_IN_BENCHMARK'] = diagnoses.icd9_code.apply(lambda c: int(def_map[c][1]) if c in def_map else None)
#diagnoses['subject_id'] = diagnoses.index
#diagnoses.set_index(np.arange(diagnoses.shape[0]), inplace = True)


# create dataframe from the def_map dict so that we can isolate the 
# definitions that are used in benchmarking

def_map_df = pd.DataFrame.from_dict(def_map, orient = 'index')
def_map_df.columns = ['Diagnoses', 'Benchmark']
diagnoses_bm = list(def_map_df[def_map_df.Benchmark == True].drop_duplicates('Diagnoses').Diagnoses)
icustays = list(ptnt_demog2.index)

# create dataframe with hcup_ccp diagnoses benchmark categories as columns and
# icustay_id information as indices. if the diagnosis is present for a given icustay the 
# value is 1, otherwise 0. 

diagnoses2 = pd.DataFrame(columns = diagnoses_bm, index = icustays)
diagnoses2.fillna(0, inplace = True)
print "created empty diagnosis dataframe"
for row in diagnoses.iterrows():
    if row[1]['USE_IN_BENCHMARK'] == 1:
        diagnoses2.loc[row[0]][row[1]['HCUP_CCS_2015']] = 1

print "filled diagnosis dataframe"
diagnoses2.head()

map created
created empty diagnosis dataframe
filled diagnosis dataframe


Unnamed: 0,Diabetes mellitus with complications,Complications of surgical procedures or medical care,Congestive heart failure; nonhypertensive,Diabetes mellitus without complication,Chronic kidney disease,Acute and unspecified renal failure,Acute cerebrovascular disease,Other liver diseases,Hypertension with complications and secondary hypertension,Cardiac dysrhythmias,...,Other lower respiratory disease,Conduction disorders,Chronic obstructive pulmonary disease and bronchiectasis,Pleurisy; pneumothorax; pulmonary collapse,Gastrointestinal hemorrhage,Respiratory failure; insufficiency; arrest (adult),Coronary atherosclerosis and other heart disease,Disorders of lipid metabolism,Shock,Essential hypertension
211552,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294638,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
228232,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
220597,0,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
232669,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
ptnt_demog3 = ptnt_demog2.merge(diagnoses2,left_index = True, right_index = True, 
                       how = 'left', sort = True)
ptnt_demog3.head()

Unnamed: 0_level_0,hadm_id,age,icu_stay,hosp_stay,subject_id,first_careunit,gender,marital_status,ethnicity,insurance,...,Other lower respiratory disease,Conduction disorders,Chronic obstructive pulmonary disease and bronchiectasis,Pleurisy; pneumothorax; pulmonary collapse,Gastrointestinal hemorrhage,Respiratory failure; insufficiency; arrest (adult),Coronary atherosclerosis and other heart disease,Disorders of lipid metabolism,Shock,Essential hypertension
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
211552,145834,76.0,146.0,259.0,3,MICU,M,MARRIED,WHITE,Medicare,...,0,0,0,0,0,0,0,0,0,0
294638,185777,48.0,41.0,187.0,4,MICU,F,SINGLE,WHITE,Private,...,0,0,0,0,0,0,0,0,0,0
228232,107064,66.0,89.0,393.0,6,SICU,F,MARRIED,WHITE,Medicare,...,0,0,0,0,0,0,0,0,0,0
220597,150750,41.0,128.0,118.0,9,MICU,M,,UNKNOWN/NOT SPECIFIED,Medicaid,...,0,0,0,0,0,0,0,0,0,1
232669,112213,72.0,184.0,305.0,12,SICU,M,MARRIED,WHITE,Medicare,...,0,0,0,0,0,0,0,0,0,1


In [11]:
ptnt_demog3['icustay_id'] = ptnt_demog3.index
ptnt_demog3.index = np.arange(ptnt_demog3.shape[0])
ptnt_demog3.to_csv('PTNT_DEMOG_FIRST24_PROCESSED.csv')

# code remnants

#for patient age, only the date information is needed so datetime data is converted to date
ptnt_demog['dob'] = ptnt_demog['dob'].apply(lambda x: x.date())
ptnt_demog['admittime'] = ptnt_demog['admittime'].apply(lambda x: x.date())
ptnt_demog[['dob', 'admittime']].head()

#dates_and_times = ['dob', 'dod', 'admittime', 'dischtime', 'deathtime', 'intime', 'outtime']
#checking for null values in the data
for item in dates_and_times:
    isanan = ptnt_demog[item].isnull().values.any()
    print "{}   {}".format(item, isanan)

'''the code calculates age of patient at time of admission, hospital stay and ICU stay. 
    using relativedelta.years to get the age rounded to years,
    relativedelta.days to get hospital stay in days and relativedelta.hours
    to get icu stay in hours.
    '''
#age at time of admission is calculated as admittime - dob
for index, row in ptnt_demog.iterrows():
    age_val = relativedelta(row['admittime'], row['dob']).years
    ptnt_demog.set_value(index, 'age', age_val)
'''        
    #adm_stay in days
    adm_stay_val = relativedelta(row['dischtime'],row['admittime'])
    adm_stay_val = adm_stay_val.weeks*7 + adm_stay_val.days
    #icu_stay in hours
    icu_stay_val = relativedelta(row['outtime'],row['intime'])
    icu_stay_val = icu_stay_val.weeks*7*24 + icu_stay_val.days*24 + icu_stay_val.hours
    ptnt_demog.set_value(index, 'adm_stay_days', adm_stay_val)
    ptnt_demog.set_value(index, 'icu_stay_hours', icu_stay_val)
'''    
ptnt_demog.head()

# Reality Check on Ages and Durations of Stay
Checking durations of stay and ages on low end for values < 0. Checking ages on the high end at 110yrs. 
Because I'm not sure what an unreasonable durations of stay would be for ICU or hospital stay will look for 
outliers in the data using statistical analysis later in preprocessing. 


In [None]:
ptnt_demog3[ptnt_demog3['age'] >120]['age'].shape

## There are approximately 93 patients whos calculated age is > 110. All are > 300yrs. 

In [None]:
age_replace_vals = list(ptnt_demog[ptnt_demog['age'] > 110]['age'].unique())


ptnt_demog['age'].replace(age_replace_vals, np.nan, inplace = True)
ptnt_demog['age'].head()

In [None]:
#checking again for age values > 110
ptnt_demog[ptnt_demog['age'] > 110]

## Dropping columns that will not be used in analysis. 
These columns were used to calculate columns like age, admissions stay and ICU stay and are no longer useful. 
Except admittime which may be used to determine whether survival rates have changed over time. 

In [None]:
# and want to include the subject id as index so i'm creating ptnt_demog2 as a copy, then dropping columns from 
# ptnt_demog in place as i don't need the subject_id for the current analysis. 
#ptnt_demog2 = ptnt_demog.drop(['dob', 'expire_flag', 'dod', 'dischtime', 'deathtime', 'intime', 'outtime'],axis = 1)
ptnt_demog.drop(['dob','admittime' ],axis = 1, inplace = True)
ptnt_demog.head()

In [None]:
'''
Sample code that was used to manually calculate dates and times from strings
def to_timedelta(time_string): 
    age = time_string.split()
    days = age[0]
    hms = age[2].split(":")
    print hms
    delta = timedelta(days = int(days), hours = int(hms[0]), minutes = int(hms[1]), seconds = int(hms[2]))
    return delta
'''
'''this was code to manually convert the dates in the file to datetime objects
age = ptnt_demog.age.iloc[0]
print age
dummy = age.split()
print dummy
days = dummy[0]
print days
hms = dummy[2].split(":")
print hms

datetime_object = datetime.strptime(dob, '%Y %m %d %H:%M:%S')




and these are scraps of where i figured out i can just use to_datetime
ptnt_demog['dod2'] = pd.to_datetime(ptnt_demog['dod']) 
ptnt_demog.head()
onedate = to_timedelta(ptnt_demog.age.iloc[0])
otherdate =  to_timedelta(ptnt_demog.age.iloc[1])
print onedate - otherdate
print onedate

'''
'''example code for timedelta
from datetime import datetime, timedelta
# we specify the input and the format...
t = datetime.strptime("05:20:25","%H:%M:%S")
# ...and use datetime's hour, min and sec properties to build a timedelta
delta = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)


#durations like icu stay etc are imported as strings. the code below 
#converts durations in strings to timedelta for use as variable
from datetime import timedelta
from datetime import datetime

delta = timedelta(days = int(duration[0]), hours = int(duration[1]), minutes = int(duration[2]), seconds = int(duration[3]))
print delta
'''

In [None]:
ptnt_demog.columns

In [None]:
ptnt_demog.drop('admittime', axis = 1, inplace = True)
ptnt_demog2.drop('admittime', axis = 1, inplace = True)

In [None]:
ptnt_demog.to_csv('PTNT_DEMOG_ANGUS_rev.csv')

In [None]:
cat_vars = list(ptnt_demog.columns[ptnt_demog.dtypes == object])
num_vars = list(ptnt_demog.columns[ptnt_demog.dtypes == 'float64'])

In [None]:
cat_vars

In [None]:
for item in cat_vars:
    
    # plot
    plt.subplots(figsize=(13,6))
    ptnt_demog[item][ptnt_demog.hospital_expire_flag==1].value_counts().plot.bar(
        alpha=0.5,label= 'Non_Survival')
    ptnt_demog[item][ptnt_demog.hospital_expire_flag==0].value_counts().plot.bar(
        alpha=0.1,label= 'Survival')
    
    # fake plots for KS test, median, etc
    '''
    plt.plot([], label=' ',color='lightgray')
    plt.plot([], label='KS test: p={}'.format(format(p_val,'.3f')),
             color='lightgray')
    plt.plot([], label='Median (non-surv): {}'.format(format(med_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Median (surv): {}'.format(format(med_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (non-surv): {}'.format(format(var_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (surv): {}'.format(format(var_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (non-surv): {}'.format(format(skew_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (surv): {}'.format(format(skew_surv,'.2f')),
             color='lightgray')
    '''
    # add title, labels etc.
    plt.title(item)
    #plt.xlabel(l + ' ' + u)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
    #plt.xlim(0, data[l].dropna().quantile(0.99))
    
    # Add lab range if available
    #if l in lab_ranges:
    #    plt.axvline(lab_ranges[l][0],color='k',linestyle='--')
    #    plt.axvline(lab_ranges[l][1],color='k',linestyle='--')

In [None]:
for item in num_vars:
    
    # plot
    plt.subplots(figsize=(13,6))
    ptnt_demog2[item][ptnt_demog.hospital_expire_flag==1].plot.hist(
        alpha=0.5,label= 'Non_Survival')
    ptnt_demog2[item][ptnt_demog.hospital_expire_flag==0].plot.hist(
        alpha=0.5,label= 'Survival')
    
    # fake plots for KS test, median, etc
    '''
    plt.plot([], label=' ',color='lightgray')
    plt.plot([], label='KS test: p={}'.format(format(p_val,'.3f')),
             color='lightgray')
    plt.plot([], label='Median (non-surv): {}'.format(format(med_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Median (surv): {}'.format(format(med_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (non-surv): {}'.format(format(var_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (surv): {}'.format(format(var_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (non-surv): {}'.format(format(skew_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (surv): {}'.format(format(skew_surv,'.2f')),
             color='lightgray')
    '''
    # add title, labels etc.
    plt.title(item)
    #plt.xlabel(l + ' ' + u)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
    #plt.xlim(0, data[l].dropna().quantile(0.99))
    
    # Add lab range if available
    #if l in lab_ranges:
    #    plt.axvline(lab_ranges[l][0],color='k',linestyle='--')
    #    plt.axvline(lab_ranges[l][1],color='k',linestyle='--')

#code for converting sql dates to date-time objects
from datetime import datetime
dob = ptnt_demog.iloc[0].dob.split()
ymd = dob[0].split('-')
print ymd
ymd.append(dob[1])
print ymd
dob = " ".join(ymd)

datetime_object = datetime.strptime(dob, '%Y %m %d %H:%M:%S')

In [None]:
from sklearn import preprocessing as prp


In [None]:
ptnt_demog.dtypes

In [None]:
ptnt_demog2 = ptnt_demog.copy()

In [None]:
cat_vars = list(ptnt_demog2.columns[ptnt_demog2.dtypes == object])
cat_vars

In [None]:
monkey = pd.get_dummies(ptnt_demog2[cat_vars])

In [None]:
ptnt_demog2.drop(cat_vars, axis = 1, inplace = True)
ptnt_demog2.head()


In [None]:
ptnt_demog2 = ptnt_demog2.join(monkey)


In [None]:
cols = list(ptnt_demog2.columns)
cols.insert(0, cols.pop(cols.index('hospital_expire_flag')))
cols

In [None]:
ptnt_demog2 = ptnt_demog2[cols]
ptnt_demog2.head()

In [None]:
ptnt_demog2[ptnt_demog2.columns[:10]].groupby('hospital_expire_flag').hist(alpha = 0.5, figsize = (10, 10))

In [None]:
gr = ptnt_demog2.groupby('hospital_expire_flag')
for k, v in gr:
    v.plot(kind='hist', alpha=.5, legend = None)

## There appears to be very little observable difference between survivors and non-survivors in any of these variables. For this reason this analysis will be paused while a query is run to extract clinical data on septic patients. An initial exploration will then be performed on that data. 

In [None]:
ptnt_demog2.drop('admittime', axis=1, inplace = True)

In [None]:
cols=list(ptnt_demog2.columns[1:])
cols

In [None]:
monkey = cols[2]
monkey

In [None]:
ptnt_demog2.groupby('hospital_expire_flag')[monkey].hist(alpha = 0.3, figsize = (10, 10))

In [None]:
for item in cols:
    
    # plot
    plt.subplots(figsize=(13,6))
    ptnt_demog2[item][ptnt_demog2.hospital_expire_flag==1].plot.hist(
        alpha=0.5,label= 'Non_Survival')
    ptnt_demog2[item][ptnt_demog2.hospital_expire_flag==0].plot.hist(
        alpha=0.5,label= 'Survival')
    
    # fake plots for KS test, median, etc
    '''
    plt.plot([], label=' ',color='lightgray')
    plt.plot([], label='KS test: p={}'.format(format(p_val,'.3f')),
             color='lightgray')
    plt.plot([], label='Median (non-surv): {}'.format(format(med_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Median (surv): {}'.format(format(med_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (non-surv): {}'.format(format(var_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Variance (surv): {}'.format(format(var_surv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (non-surv): {}'.format(format(skew_nonsurv,'.2f')),
             color='lightgray')
    plt.plot([], label='Skew (surv): {}'.format(format(skew_surv,'.2f')),
             color='lightgray')
    '''
    # add title, labels etc.
    plt.title(item)
    #plt.xlabel(l + ' ' + u)
    plt.legend(loc="upper left", bbox_to_anchor=(1,1),fontsize=12)
    #plt.xlim(0, data[l].dropna().quantile(0.99))
    
    # Add lab range if available
    #if l in lab_ranges:
    #    plt.axvline(lab_ranges[l][0],color='k',linestyle='--')
    #    plt.axvline(lab_ranges[l][1],color='k',linestyle='--')

In [None]:
display(ptnt_demog2.shape)
display(ptnt_demog2.dropna().shape)

In [None]:
ptnt_demog2.dropna(inplace = True)
ptnt_demog2.shape

In [None]:
ptnt_demog2_scaled = ptnt_demog2.copy()
for feature_name in ptnt_demog2_scaled.columns[1:]:
#    max_value = data2[feature_name].max()
#    min_value = data2[feature_name].min()
#    data2_norm[feature_name] = (data2[feature_name] - min_value) / (max_value - min_value)
    ptnt_demog2_scaled[feature_name] = preprocessing.scale(ptnt_demog2_scaled[feature_name], with_mean = True, 
                                                  with_std = True)
    
display(ptnt_demog2_scaled.head())

In [None]:
pca = PCA(n_components = 8).fit(ptnt_demog2_scaled[ptnt_demog2_scaled.columns[1:]])

# Generate PCA results plot
pca_results = vs.pca_results(ptnt_demog2_scaled[ptnt_demog2_scaled.columns[1:]], pca)



In [None]:
features = ptnt_demog2_scaled[ptnt_demog2_scaled.columns[1:]]
targets = ptnt_demog2_scaled[ptnt_demog2_scaled.columns[0]]
display(features.columns)
display(targets.name)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, 
                                                    test_size = 0.30, random_state = 42)

In [None]:
clf_SVC = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', 
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False).fit(X_train, y_train)

scores = cross_val_score(clf_SVC, features, targets, cv=5)
display(scores)  

In [None]:
y_predsSVC = clf_SVC.predict(X_test)
metrics.confusion_matrix(y_test, y_predsSVC)



In [None]:
#X_train, X_test, y_train, y_test = train_test_split(reduced_data, targets,
#                                                   test_size = 0.30, 
#                                                  random_state = 42)

clf_MLP = MLPClassifier(solver='sgd', alpha=1e-6, learning_rate = 'adaptive',
                    hidden_layer_sizes=(5, 2), random_state=1)

clf_MLP.fit(X_train, y_train)                         

display(clf_MLP.score(X_test, y_test))
y_predsMLP = clf_MLP.predict(X_test)
metrics.confusion_matrix(y_test, y_predsMLP)




In [None]:
### create classifier
clf_GNB = GaussianNB()
### fit the classifier on the training features and labels
clf_GNB.fit(X_train, y_train)
    ### return the fit classifier

nb_score = clf_GNB.score(X_test, y_test) 

print nb_score
y_predsGNB = clf_GNB.predict(X_test)
metrics.confusion_matrix(y_test, y_predsGNB)


