In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import sys
import re
lib = __import__("ihm_lib") # provided by [1]

## Objects to read train and test for each specific task

In [2]:
# Root directory of the in-hospital-mortality data: the train files and their
# listfile live directly in it, the test files in a `test` sub-directory.
# Replace the placeholder with a real directory before running.
path = 'Path to the Train and Test Data for In Hospital Mortality'

# Build every location with os.path.join so the notebook works on any OS and
# regardless of whether `path` ends with a separator (the previous
# `path + 'test\\'` concatenation only worked on Windows with a trailing
# separator). The trailing '' keeps a separator at the end of the test
# directory, exactly as the old 'test\\' suffix did.
ihm_test_dir = os.path.join(path, 'test', '')

inhosmor_reader_train = lib.InHospitalMortalityReader(
    dataset_dir=path,
    listfile=os.path.join(path, 'listfile.csv'),
)
inhosmor_reader_test = lib.InHospitalMortalityReader(
    dataset_dir=ihm_test_dir,
    listfile=os.path.join(ihm_test_dir, 'listfile.csv'),
)

## Binary Representation of Disease Diagnoses for All Patients

In [None]:
# Read every example of both splits into memory once; later cells reuse these lists.
ihm_train_data = [inhosmor_reader_train.read_example(index)
                  for index in range(inhosmor_reader_train.get_number_of_examples())]
ihm_test_data = [inhosmor_reader_test.read_example(index)
                 for index in range(inhosmor_reader_test.get_number_of_examples())]

# Both tables are identical for train and test, so they are loaded a single
# time instead of once per split.
# NOTE(review): this assumes the `lib` helpers below treat the tables as
# read-only -- confirm against ihm_lib.
all_diagonosis = pd.read_csv(os.path.join(path, 'all_diagnoses.csv'))  # must be generated after following [1]
all_stays = pd.read_csv(os.path.join(path, 'all_stays.csv'))  # must be generated after following [1]

# Sorted ICD9 codes define the column order of the binary matrix; the array
# form is built once rather than once per example.
unique_diseases = sorted(all_diagonosis.ICD9_CODE.unique())
unique_diseases_array = np.array(unique_diseases)

# Same pipeline for each split; only the example list and the output file
# name differ. After the loop every variable holds its test-split value,
# as it did when the two passes were written out by hand.
for split_name, split_examples in (('train', ihm_train_data), ('test', ihm_test_data)):
    episodes = lib.count_episodes(split_examples)
    # Not consumed in this cell; kept so the name stays available downstream.
    patients_multiple_episodes = episodes[episodes['EPISODE #'] > 1].index
    # Placeholder column added before the table is handed to lib.return_icu_ids.
    episodes['icu_ids'] = np.ones(episodes.shape[0])

    # Map every subject in `episodes` to whatever lib.return_icu_ids yields for it.
    ids = [lib.return_icu_ids(subject, episodes, all_stays) for subject in episodes.index]
    epi = {episodes.index[i]: ids[i] for i in range(len(episodes.index))}

    # One row per example, one column per ICD9 code.
    disease_diagnostics = pd.DataFrame(np.array([
        lib.binary_representation(
            unique_diseases_array,
            np.array(lib.extract_diagnosis(i, split_examples, all_diagonosis, epi)),
        )
        for i in range(len(split_examples))
    ]))
    # Row label: first two '_'-separated fields of the example name, joined by '-'.
    disease_diagnostics.index = ['-'.join(example['name'].split('_')[:2])
                                 for example in split_examples]
    disease_diagnostics.columns = unique_diseases
    disease_diagnostics.to_csv(
        os.path.join(path, 'disease_diagnostics_' + split_name + '.csv'))

## Extract Training and Test Data for Modeling

In [11]:
# Convert every raw example of each split via lib.convert_pandas.
train = [lib.convert_pandas(index, ihm_train_data) for index in range(len(ihm_train_data))]
test = [lib.convert_pandas(index, ihm_test_data) for index in range(len(ihm_test_data))]

# Number of rows per training frame and a few quantiles of that distribution.
Size = [frame.shape[0] for frame in train]
Quantiles = np.quantile(np.array(Size), q=[0.15, 0.3, 0.45, 0.6, 0.75, 0.8, 0.9])

# Grid step. A value derived as round(48 * 60 / 95) used to be computed here
# and was overwritten on the very next line, so 10 has always been the
# effective setting; the dead assignment is removed to make that explicit.
interval_length = 10

# Grid points 0, 10, ..., 2870 over 48 * 60 = 2880 units, divided by 60 and
# rounded to two decimals (presumably minutes converted to hours -- confirm).
Hours = np.round(np.arange(0, 48 * 60, interval_length) / 60, 2)

# Columns holding categorical (string-valued) measurements. The spelling
# "Glascow" is kept because these strings are used as column keys.
cat_columns = [
    'Capillary refill rate',
    'Glascow coma scale eye opening',
    'Glascow coma scale motor response',
    'Glascow coma scale total',
    'Glascow coma scale verbal response',
]

## Extract Unique Values for all Categorical Variables in Entire Dataset

In [12]:
def flatten(l):
    """Concatenate a list of lists into a single flat list."""
    return [item for sublist in l for item in sublist]


# Report, for every categorical column, the set of distinct raw values seen
# across train and test. One loop replaces five copy-pasted stanzas; the
# printed text is unchanged because the headings are the column names.
for column in cat_columns:
    values_train = flatten([lib.extract_string(idx, train, column) for idx in range(len(train))])
    values_test = flatten([lib.extract_string(idx, test, column) for idx in range(len(test))])
    print('Unique Values of ' + column + ' in entire dataset')
    print(set(values_train + values_test))

Unique Values of Capillary refill rate in entire dataset
{'0.0', '1.0'}
Unique Values of Glascow coma scale eye opening in entire dataset
{'3 To speech', 'Spontaneously', '4 Spontaneously', 'To Pain', '1 No Response', 'None', 'To Speech', '2 To pain'}
Unique Values of Glascow coma scale motor response in entire dataset
{'2 Abnorm extensn', 'Flex-withdraws', '1 No Response', 'Abnormal extension', '6 Obeys Commands', '4 Flex-withdraws', 'No response', 'Localizes Pain', 'Abnormal Flexion', '3 Abnorm flexion', 'Obeys Commands', '5 Localizes Pain'}
Unique Values of Glascow coma scale total in entire dataset
{'12', '8', '3', '7', '15', '14', '13', '10', '4', '6', '9', '11', '5'}
Unique Values of Glascow coma scale verbal response in entire dataset
{'1.0 ET/Trach', 'Incomprehensible sounds', '5 Oriented', '2 Incomp sounds', 'Inappropriate Words', '1 No Response', 'No Response-ETT', 'Confused', '3 Inapprop words', '4 Confused', 'Oriented', 'No Response'}


## Encode Categorical Variables in the Entire Dataset

In [13]:
def _positional_codes(labels):
    """Return the string codes '0', '1', ... matching each label's position."""
    return list(np.array(np.arange(len(labels)), dtype=str))


# Raw labels per categorical column, in the order that defines their codes.
_eye_labels = ['3 To speech', '4 Spontaneously', 'None', '2 To pain', '1 No Response',
               'To Speech', 'To Pain', 'Spontaneously']
_motor_labels = ['Obeys Commands', 'Localizes Pain', '6 Obeys Commands', '3 Abnorm flexion',
                 'Flex-withdraws', '1 No Response', 'No response', '2 Abnorm extensn',
                 'Abnormal Flexion', 'Abnormal extension', '5 Localizes Pain', '4 Flex-withdraws']
_total_labels = ['5', '4', '14', '7', '13', '6', '10', '9', '12', '3', '15', '8', '11']
_verbal_labels = ['2 Incomp sounds', 'Oriented', 'Confused', '4 Confused', '1 No Response',
                  'No Response-ETT', 'Incomprehensible sounds', '3 Inapprop words',
                  '1.0 ET/Trach', 'No Response', 'Inappropriate Words', '5 Oriented']

# Single source of truth: (column, raw labels, replacement codes). Train and
# test previously carried separate copies of these tables, which could drift.
# NOTE(review): differently spelled labels get different codes (e.g.
# '3 To speech' -> '3' but 'To Speech' -> '5'); these look like the same
# clinical level -- confirm whether they should share a code.
# NOTE(review): the 'total' scores are mapped to list positions, not to their
# numeric value ('5' -> '0', '15' -> '10') -- confirm this is intended.
category_codings = [
    (cat_columns[0], ['1.0', '0.0'], ['1.0', '0.0']),
    (cat_columns[1], _eye_labels, ['3', '4', '0', '2', '1', '5', '6', '7']),
    (cat_columns[2], _motor_labels, _positional_codes(_motor_labels)),
    (cat_columns[3], _total_labels, _positional_codes(_total_labels)),
    (cat_columns[4], _verbal_labels, _positional_codes(_verbal_labels)),
]

# Apply every coding to every frame: all of train first, then all of test,
# column by column -- the same order as the hand-written version.
for split_frames in (train, test):
    for column, encoding, decoding in category_codings:
        for z in range(len(split_frames)):
            split_frames[z].loc[:, column] = lib.encode_cat(
                index=z, column=column, data=split_frames,
                encoding=encoding, decoding=decoding)

## Save the training and test data

In [14]:
# Run lib.fill_missing on every frame of each split. The comprehension is
# fully evaluated against the current list before the name is rebound.
train = [lib.fill_missing(position, train) for position in range(len(train))]
test = [lib.fill_missing(position, test) for position in range(len(test))]

# Serialise both splits into the current working directory.
for pickle_name, frames in (('train_ihm.pickle', train), ('test_ihm.pickle', test)):
    with open(pickle_name, 'wb') as out_file:
        pickle.dump(frames, out_file, protocol=pickle.HIGHEST_PROTOCOL)