<center><h1>Length of Stay Project 24h</h1>
<h4>TCSS 555<br>
Spring 2018<br>
Thuan Lam, Tood Robbins, Inno Irving Estrera</h4></center>


<h2>Libraries</h2>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from dateutil.parser import parse
from datetime import datetime

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Global Setup

In [2]:
runAt = 24              # prediction time: only events up to this many hours after the ICU INTIME are used
option = 3              # LOS labelling scheme, 1-5 (see the Options section)
numberOfStayRow = 5000  # number of ICU stay rows to keep. Set numberOfStayRow = -1 if you want to get all rows

## User Defined Functions and Variables

In [3]:
# Timestamp layout expected by event_time_filter for EVENTTIME and INTIME.
DatetimeFormat = '%Y-%m-%d %H:%M:%S'

def event_time_filter(row):
    """Write the hours elapsed between ``row.INTIME`` and ``row.EVENTTIME`` to ``row.hours``.

    Both attributes must be strings laid out as ``DatetimeFormat``. The result
    is negative when the event precedes INTIME. ``row`` is modified in place
    and returned, so the function can be used with ``DataFrame.apply(axis=1)``.
    """
    elapsed = (
        datetime.strptime(row.EVENTTIME, DatetimeFormat)
        - datetime.strptime(row.INTIME, DatetimeFormat)
    )
    row.hours = elapsed.total_seconds() / 3600.0
    return row

## Data

In [4]:
import os

# Show the working directory: the relative Data paths used by the loading
# cells are resolved against it.
cwd = os.getcwd()
print('Current folder is {}'.format(cwd))

Current folder is D:\Users\Thuan Lam\Google Drive\TCC-Madison-UWT\UWT\TCSS 555 - Machine Learning\Projects\LengthOfStay


In [5]:
# Load datasets
# admissions = pd.read_csv("Data/ADMISSIONS.csv")
# patients = pd.read_csv("Data/PATIENTS.csv")

# Forward slashes work on every OS and avoid the unrecognized '\I' escape
# sequence that a backslash path inside a normal string literal produces.
stays = pd.read_csv('Data/ICUSTAYS.csv')[['ICUSTAY_ID','SUBJECT_ID','INTIME','LOS']] #keep certain columns only

# Shuffle with a fixed seed so the sampled stays, and everything derived from
# them, are reproducible across runs (7 is the seed the Model section uses).
stays = shuffle(stays, random_state=7)

# delete LOS = null or LOS = 0: nulls are replaced by 0, then only LOS > 0 is kept
stays = stays.fillna(0)
stays = stays.loc[stays['LOS'] > 0]

if numberOfStayRow != -1:
    stays = stays[:numberOfStayRow] # keep only the first numberOfStayRow shuffled rows

print('ICUStays loaded. Shape: {}'.format(stays.shape))

ICUStays loaded. Shape: (5000, 4)


Unnamed: 0,ICUSTAY_ID,SUBJECT_ID,INTIME,LOS
28757,202032,23534,2126-08-23 13:40:10,2.2934
43580,220258,32367,2146-02-07 18:43:41,1.9980
41078,206490,28555,2183-06-11 20:54:25,0.9031
638,223095,907,2155-08-12 15:40:46,24.1121
31108,294383,22553,2132-04-02 10:59:09,6.4981
51643,253786,89711,2154-01-28 11:31:01,3.3255
22119,266643,12849,2140-04-15 00:08:07,11.7485
60754,286379,98999,2199-09-07 11:12:44,1.4790
55003,298500,73578,2160-01-11 06:43:00,1.4315
16267,248831,14131,2118-03-28 20:57:56,1.0445


* <h3>Options

In [6]:
# Turn the fractional LOS (in days) into an integer class label. Every branch
# yields a Python int so the label dtype does not depend on the chosen option.
# Note: round() sends exact halves to the nearest even integer (round(2.5) == 2).
if option == 1:
    # Option 1: Accuracy +/- 0.1 days = 2 hours and 24 minutes
    # LOS is multiplied by 10 and truncated. So, LOS=36 means 3.6 days. LOS=113 means 11.3 days
    stays['LOS'] = stays['LOS'].apply(lambda x: int(x * 10))
elif option == 2:
    # Option 2: Accuracy +/- 0.5 days = 12 hours
    # LOS is rounded to the nearest half day and expressed in tenths of a day (0, 5, 10, 15, 20, 25...)
    # For example, LOS=35 means 3.5 days. LOS=110 means 11 days
    stays['LOS'] = stays['LOS'].apply(lambda x: int(round(x * 2, 0) * 5))
elif option == 3:
    # Option 3: Accuracy +/- 1 day = 24 hours
    stays['LOS'] = stays['LOS'].apply(lambda x: int(round(x,0)))
elif option == 4:
    # Option 4: Accuracy +/- 2 day = 48 hours
    # For example, 0 means 0-1 day, 2 means 2-3 days, 4 means 4-5 days, 6 means 6-7 days, ...
    # int(...) keeps the label an integer like the other options (it used to be a float).
    stays['LOS'] = stays['LOS'].apply(lambda x: int(round(x,0)//2*2))
elif option == 5:
    # Option 5: binary label: 1 = five days or more, 0 = less than five days
    stays['LOS'] = stays['LOS'].apply(lambda x: 1 if x >= 5 else 0)
else:
    # Fail early instead of silently passing a continuous LOS on as the class label.
    raise ValueError('option must be 1, 2, 3, 4 or 5, got {!r}'.format(option))

* <h3>Patients

In [7]:
# Forward slashes work on every OS and avoid the unrecognized '\P' escape sequence.
patients = pd.read_csv('Data/PATIENTS.csv', encoding='latin1' )[['SUBJECT_ID', 'GENDER', 'DOB']]
print('Patients loaded. Shape: {}'.format(patients.shape))
stays = pd.merge(patients, stays, on='SUBJECT_ID', how='inner').drop(['SUBJECT_ID'], axis=1)

# The DOB column is renamed first and then overwritten with the derived age below.
stays.rename(columns={'DOB': 'AGE'}, inplace=True)

# process gender: 0 = 'F', 1 = anything else
stays['GENDER'] = stays['GENDER'].apply(lambda x: 0 if x == 'F' else 1)


def _age_in_decades(dob, intime):
    """Return the patient's age at ICU admission in whole decades.

    Age is the difference between the 4-digit years that start ``intime`` and
    ``dob``; 0 is returned when either value is not a 'YYYY-...' string.
    """
    dob, intime = str(dob), str(intime)
    if dob.find('-') <= 0 or intime.find('-') <= 0:
        return 0
    return abs(int(intime[:4]) - int(dob[:4])) // 10


# Age is measured against the admission year (INTIME), not against today's
# date: using datetime.now() made the feature drift with the wall clock and
# unrelated to the stay.
# NOTE(review): if this is MIMIC-III, DOB is presumably shifted for patients
# older than 89, which would produce very large decade values - confirm whether
# they should be capped.
stays['AGE'] = [_age_in_decades(dob, intime) for dob, intime in zip(stays['AGE'], stays['INTIME'])]
print('ICUStays joined with Patients. Shape: {}'.format(stays.shape))

Patients loaded. Shape: (46520, 3)
ICUStays joined with Patients. Shape: (5000, 5)


* <h3>Input_Events_CV

In [None]:
# Load data (forward slashes avoid the unrecognized '\I' escape and work on every OS)
inputeventscv = pd.read_csv('Data/INPUTEVENTS_CV.csv', encoding='latin1' )[['ICUSTAY_ID', 'STORETIME', 'ITEMID']]

print('Input_Events_CV loaded. Shape: {}'.format(inputeventscv.shape))

inputeventscv.rename(columns={'STORETIME': 'EVENTTIME'}, inplace=True) #rename column so that we can re-use the event_time_filter function
# Keep only events of the selected stays; the merge also brings in INTIME, needed for the hour offset.
inputeventscv = pd.merge(inputeventscv, stays, on='ICUSTAY_ID', how='inner').drop(['LOS','GENDER','AGE'], axis=1)
inputeventscv.insert(0,'hours', 0)
inputeventscv = inputeventscv.apply(event_time_filter, axis=1)

# delete hours > runAt (runAt is defined in the Global Setup section)
# The filter must index inputeventscv itself: the original indexed
# inputeventsmv, which is not defined until a later cell.
inputeventscv = inputeventscv.loc[inputeventscv['hours'] <= runAt] #runAt = 24h

# Re-assign instead of dropping in place on a filtered slice.
inputeventscv = inputeventscv.drop(['hours', 'EVENTTIME', 'INTIME'], axis=1)
print('Input_Events_CV filterd. Shape: {}'.format(inputeventscv.shape))

* <h3>Procedure_Events

In [None]:
# Load data (forward slashes avoid the unrecognized '\P' escape and work on every OS)
procedureevents = pd.read_csv('Data/PROCEDUREEVENTS_MV.csv')[['ICUSTAY_ID', 'STARTTIME', 'ITEMID']]
print('Procedure_Events loaded. Shape: {}'.format(procedureevents.shape))
procedureevents.rename(columns={'STARTTIME': 'EVENTTIME'}, inplace=True) #rename column so that we can re-use the event_time_filter function
# Keep only events of the selected stays; the merge also brings in INTIME, needed for the hour offset.
procedureevents = pd.merge(procedureevents, stays, on='ICUSTAY_ID', how='inner').drop(['LOS','GENDER','AGE'], axis=1)

procedureevents.insert(0,'hours', 0)
procedureevents = procedureevents.apply(event_time_filter, axis=1)

# delete hours > runAt (runAt is defined in the Global Setup section)
procedureevents = procedureevents.loc[procedureevents['hours'] <= runAt] #runAt = 24h

# Re-assign instead of dropping in place on a filtered slice.
procedureevents = procedureevents.drop(['hours', 'EVENTTIME', 'INTIME'], axis=1)
print('Procedure_Events filterd. Shape: {}'.format(procedureevents.shape))

* <h3>Datetime_Events

In [None]:
# Load data (forward slashes avoid the unrecognized '\D' escape and work on every OS)
datetimeevents = pd.read_csv('Data/DATETIMEEVENTS.csv', encoding='latin1' )[['ICUSTAY_ID', 'STORETIME', 'ITEMID']]

print('Datetime_Events loaded. Shape: {}'.format(datetimeevents.shape))

datetimeevents.rename(columns={'STORETIME': 'EVENTTIME'}, inplace=True) #rename column so that we can re-use the event_time_filter function
# Keep only events of the selected stays; the merge also brings in INTIME, needed for the hour offset.
datetimeevents = pd.merge(datetimeevents, stays, on='ICUSTAY_ID', how='inner').drop(['LOS','GENDER','AGE'], axis=1)
datetimeevents.insert(0,'hours', 0)
datetimeevents = datetimeevents.apply(event_time_filter, axis=1)

# delete hours > runAt (runAt is defined in the Global Setup section)
datetimeevents = datetimeevents.loc[datetimeevents['hours'] <= runAt] #runAt = 24h

# Re-assign instead of dropping in place on a filtered slice.
datetimeevents = datetimeevents.drop(['hours', 'EVENTTIME', 'INTIME'], axis=1)
print('Datetime_Events filterd. Shape: {}'.format(datetimeevents.shape))

* <h3>Input_Events_MV

In [None]:
# Load data (forward slashes avoid the unrecognized '\I' escape and work on every OS)
inputeventsmv = pd.read_csv('Data/INPUTEVENTS_MV.csv', encoding='latin1' )[['ICUSTAY_ID', 'STARTTIME', 'ITEMID']]

print('Input_Events_MV loaded. Shape: {}'.format(inputeventsmv.shape))

inputeventsmv.rename(columns={'STARTTIME': 'EVENTTIME'}, inplace=True) #rename column so that we can re-use the event_time_filter function
# Keep only events of the selected stays; the merge also brings in INTIME, needed for the hour offset.
inputeventsmv = pd.merge(inputeventsmv, stays, on='ICUSTAY_ID', how='inner').drop(['LOS','GENDER','AGE'], axis=1)
inputeventsmv.insert(0,'hours', 0)
inputeventsmv = inputeventsmv.apply(event_time_filter, axis=1)

# delete hours > runAt (runAt is defined in the Global Setup section)
inputeventsmv = inputeventsmv.loc[inputeventsmv['hours'] <= runAt] #runAt = 24h

# Re-assign instead of dropping in place on a filtered slice.
inputeventsmv = inputeventsmv.drop(['hours', 'EVENTTIME', 'INTIME'], axis=1)
print('Input_Events_MV filterd. Shape: {}'.format(inputeventsmv.shape))

* <h3>Master = Union All Tables 

In [None]:
# Stack every per-table event frame (ICUSTAY_ID, ITEMID) into one long frame.
# When a new event table is added, remember to include it in this list.
event_tables = [inputeventscv, procedureevents, datetimeevents, inputeventsmv]
master = pd.concat(event_tables)
print('Master built. Shape: {}'.format(master.shape))

* <h3>Pivot All Columns

In [None]:
print('Number of Items/Columns need to be added: ', master.ITEMID.unique().size)

# One row per ICU stay, one column per ITEMID, each cell = number of events of
# that item for the stay. crosstab builds the count table directly, instead of
# first adding one dense indicator column per item to the long event frame and
# summing afterwards. Item columns come out sorted by ITEMID.
item_counts = pd.crosstab(master['ICUSTAY_ID'], master['ITEMID']).reset_index()
item_counts.columns.name = None

# Append the per-stay attributes; LOS (the target) stays the last column,
# which the Model section relies on.
master = pd.merge(item_counts, stays, on='ICUSTAY_ID', how='inner')
master = master.drop(['ICUSTAY_ID', 'INTIME'], axis=1)
print('Master built. Shape: {}'.format(master.shape))

## Model

In [None]:
# Separate the feature matrix from the target, which is the last column of master.
array = master.values        # numpy array holding every column
col = array.shape[1] - 1     # position of the last (target) column
X = array[:, :col]           # features: every column except the last
Y = array[:, col]            # target: the last column

In [None]:
# Hold out validation_size of the rows for the final validation; the fixed seed
# makes the split repeatable.
seed = 7
validation_size = 0.20
split = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
X_train, X_validation, Y_train, Y_validation = split

In [None]:
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms: (label, estimator) pairs; un-comment a line to include that model
models = []
# models.append(('LR', LogisticRegression()))
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn with 10-fold cross-validation on the training split
results = []
names = []
for name, model in models:
    # random_state only has an effect when shuffle=True; without it current
    # scikit-learn rejects the combination and older versions ignored the seed.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Compare Algorithms: one box per model, built from its per-fold CV scores
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Make predictions on validation dataset
# (KNeighborsClassifier() or LogisticRegression() can be swapped in for SVC()
# here in exactly the same way: fit on the training split, predict the validation split.)
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)

# Raw predictions first, then accuracy, confusion matrix and the per-class report.
print(predictions)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

## Conclusion
#### bla bla bla