In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('data/VTOUTP16.TXT', low_memory=False)
df.head()

Unnamed: 0,hnum2,ATYPE,asour,intage,TXTZIP,sex,dstat,PPAY,CHRGS,DX1,...,BTYPE,ERFLAG,cah,vtres,OBSFLAG,AFLAG,Uniq,ADMID_QTR,DISCD_QTR,CHRGS_HCIA
0,11,3,4,13,50,2,5,1,3409.85,L600,...,131,0,1,1,0,1,3,1,1,3409.85
1,11,3,4,11,50,2,5,7,1837.65,Z86010,...,131,0,1,1,0,1,54,1,1,1837.65
2,11,3,4,12,37,2,5,1,1102.65,Z1211,...,131,0,1,3,0,1,85,1,1,1102.65
3,11,3,4,10,50,1,5,7,1102.7,Z1211,...,131,0,1,1,0,1,87,1,1,1102.7
4,11,3,4,13,51,2,5,7,1837.65,Z1211,...,131,0,1,1,0,1,93,1,1,1837.65


In [3]:
df.describe()

Unnamed: 0,hnum2,ATYPE,intage,dstat,PPAY,CHRGS,hsa,pdays,DY,RECNO,BTYPE,ERFLAG,cah,vtres,OBSFLAG,AFLAG,Uniq,ADMID_QTR,DISCD_QTR,CHRGS_HCIA
count,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0,370633.0
mean,7.080063,1.754121,8.093732,4.942763,3.623676,3087.405855,15.229507,1.400739,2016.0,978655.2,165.769095,0.638057,0.285954,1.374878,0.02717,0.38646,1006005.0,2.483929,2.483376,3087.405855
std,4.341688,0.947148,4.159334,0.625275,2.893698,4915.324061,28.115395,6.177145,0.0,548918.6,153.572216,0.480563,0.451868,1.160495,0.162578,0.486939,564015.2,1.110703,1.110408,4915.324061
min,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,2016.0,3.0,131.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0
25%,5.0,1.0,4.0,5.0,1.0,682.48,2.0,1.0,2016.0,495842.0,131.0,0.0,0.0,1.0,0.0,0.0,511531.0,1.0,1.0,682.48
50%,5.0,1.0,9.0,5.0,2.0,1521.62,7.0,1.0,2016.0,985901.0,131.0,1.0,0.0,1.0,0.0,0.0,1014498.0,2.0,2.0,1521.62
75%,9.0,3.0,12.0,5.0,6.0,3440.18,10.0,1.0,2016.0,1450438.0,131.0,1.0,1.0,1.0,0.0,1.0,1490877.0,3.0,3.0,3440.18
max,16.0,5.0,14.0,13.0,12.0,227311.78,99.0,366.0,2016.0,1931900.0,857.0,1.0,1.0,6.0,1.0,1.0,1985438.0,4.0,4.0,227311.78


In [4]:
# check what columns are related to each other
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
f, ax = plt.subplots(figsize=(20, 6))
corr = df.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True), square=False, ax=ax,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x121c8d748>

In [5]:
# print first row to understand data better
print(df.loc[0])

hnum2              11
ATYPE               3
asour               4
intage             13
TXTZIP            050
sex                 2
dstat               5
PPAY                1
CHRGS         3409.85
DX1              L600
DX2            J45909
DX3                  
DX4                  
DX5                  
DX6                  
DX7                  
DX8                  
DX9                  
DX10                 
DX11                 
DX12                 
DX13                 
DX14                 
DX15                 
DX16                 
DX17                 
DX18                 
DX19                 
DX20                 
PX1                  
               ...   
PX12                 
PX13                 
PX14                 
PX15                 
PX16                 
PX17                 
PX18                 
PX19                 
PX20                 
ECODE1               
ECODE2               
ECODE3               
hsa                12
pdays               1
ccsdx     

In [6]:
# clean up data
## replace spaces with 0
df.replace([' '], [0], inplace=True)

In [7]:
## convert ZIP codes to ints
zipcodes_dict = {}
replacement = 0
for zipcode in np.unique(df["TXTZIP"]):
    zipcodes_dict[zipcode] = replacement
    replacement += 1
print(zipcodes_dict)


df['TXTZIP'] = df['TXTZIP'].map(zipcodes_dict)

{'005': 0, '034+036': 1, '037': 2, '050': 3, '051': 4, '052': 5, '05201': 6, '053': 7, '05301': 8, '054': 9, '05401': 10, '05403': 11, '05446': 12, '05452': 13, '05468': 14, '05478': 15, '056': 16, '05602': 17, '05641': 18, '057': 19, '05701': 20, '05753': 21, '058-059': 22, '120-123': 23, '128': 24, '129': 25, '136': 26, '999': 27, 'OTH-MA': 28, 'OTH-NH': 29, 'OTH-NY': 30, 'OTH-ST': 31}


In [8]:
## replace all ICD-9 codes with ints in the DX columns
icd9codes_dict = {}
replacement = 0
for i in range (1,21):
    current_column = f'DX{i}'
    for icd9code in df[current_column]:
        if icd9code not in icd9codes_dict.keys():
            icd9codes_dict[icd9code] = replacement
            replacement += 1
print(icd9codes_dict)

for i in range (1,21):
    current_column = f'DX{i}'
    df[current_column] = df[current_column].map(icd9codes_dict)
    df[current_column] = pd.to_numeric(df[current_column])

{'L600': 0, 'Z86010': 1, 'Z1211': 2, 'M2021': 3, 'K2270': 4, 'R1310': 5, 'H2512': 6, 'H2511': 7, 'K625': 8, 'J351': 9, 'H6591': 10, 'H25042': 11, 'H25041': 12, 'Z48815': 13, 'R1013': 14, 'H25011': 15, 'K219': 16, 'S83232A': 17, 'Z8711': 18, 'M5417': 19, 'K5900': 20, 'M1288': 21, 'H04301': 22, 'M5416': 23, 'J320': 24, 'R197': 25, 'K529': 26, 'K429': 27, 'C50911': 28, 'D242': 29, 'D509': 30, 'S83242A': 31, 'M722': 32, 'J449': 33, 'I129': 34, 'S0083XA': 35, 'S0003XA': 36, 'R079': 37, 'M545': 38, 'M25571': 39, 'S99921A': 40, 'S60221A': 41, 'S42332A': 42, 'K1379': 43, 'S80212A': 44, 'M869': 45, 'S39012A': 46, 'H10029': 47, 'J069': 48, 'K047': 49, 'R1030': 50, 'E8770': 51, 'J159': 52, 'K8010': 53, 'R6881': 54, 'M5442': 55, 'A6000': 56, 'S82831A': 57, 'M273': 58, 'N8320': 59, 'S60222A': 60, 'T7840XA': 61, 'R51': 62, 'R413': 63, 'J0110': 64, 'S0181XA': 65, 'C50412': 66, 'H2000': 67, 'S61012A': 68, 'S61211A': 69, 'R05': 70, 'I509': 71, 'R1012': 72, 'J189': 73, 'M109': 74, 'R062': 75, 'S060X0A':

In [9]:
## replace all ICD-9 codes with ints in the PX columns
for i in range (1,21):
    current_column = f'PX{i}'
    for icd9code in df[current_column]:
        if icd9code not in icd9codes_dict.keys():
            icd9codes_dict[icd9code] = replacement
            replacement += 1

for i in range (1,21):
    current_column = f'PX{i}'
    df[current_column] = df[current_column].map(icd9codes_dict)
    df[current_column] = pd.to_numeric(df[current_column])

In [10]:
## replace all ICD-9 codes with ints in the ECODE columns
for i in range (1,4):
    current_column = f'ECODE{i}'
    for icd9code in df[current_column]:
        if icd9code not in icd9codes_dict.keys():
            icd9codes_dict[icd9code] = replacement
            replacement += 1

for i in range (1,4):
    current_column = f'ECODE{i}'
    df[current_column] = df[current_column].map(icd9codes_dict)
    df[current_column] = pd.to_numeric(df[current_column])

In [11]:
# print first row to make sure the replacements took place
print(df.loc[0])

hnum2              11
ATYPE               3
asour               4
intage             13
TXTZIP              3
sex                 2
dstat               5
PPAY                1
CHRGS         3409.85
DX1                 0
DX2                77
DX3              2618
DX4              2618
DX5              2618
DX6              2618
DX7              2618
DX8              2618
DX9              2618
DX10             2618
DX11             2618
DX12             2618
DX13             2618
DX14             2618
DX15             2618
DX16             2618
DX17             2618
DX18             2618
DX19             2618
DX20             2618
PX1              2618
               ...   
PX12             2618
PX13             2618
PX14             2618
PX15             2618
PX16             2618
PX17             2618
PX18             2618
PX19             2618
PX20             2618
ECODE1           2618
ECODE2           2618
ECODE3           2618
hsa                12
pdays               1
ccsdx     

In [12]:
# target = number of days a patient stays in the hospital = pdays
y = df.pdays
X = df.drop("pdays", axis=1)

In [13]:
# split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [14]:
print(max(list(y_train)))

366


In [15]:
# One-hot encoding, is this really necessary?
y_train = np_utils.to_categorical(y_train)

In [16]:
print(y_train)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [17]:
print(X_train.shape)

(296506, 69)


In [18]:
# setup model
## parameters
model = Sequential()
dropoutrate = 0.2
batchsize = 16
inputdim = X_train.shape[1]
adam = keras.optimizers.Adam() # Adam optimizer

model.add(Dense(10, input_dim=inputdim, kernel_initializer='uniform',activation='relu'))
model.add(Dropout(dropoutrate))
model.add(Dense(40, kernel_initializer='uniform',activation='relu'))
model.add(Dropout(dropoutrate))
model.add(Dense(40, kernel_initializer='uniform',activation='relu'))
model.add(Dropout(dropoutrate))
model.add(Dense(367, activation='softmax'))

In [19]:
model.compile(loss=keras.losses.categorical_crossentropy,optimizer=adam,metrics=['accuracy'])

In [20]:
history = model.fit(X_train, y_train, epochs=10,validation_split=0.3, batch_size=batchsize,verbose=1)

Train on 207554 samples, validate on 88952 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# predict the number of days a person has to stay in the hospital
predictions = model.predict(X_test)

In [22]:
print(predictions)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [23]:
# compare prediction with real
n = 1
print("Predicted: {0} \tcertainty: {1}".format(predictions[n].argmax(axis=0), predictions[n][predictions[n].argmax(axis=0)]))
print("Real: {0}".format(list(y_test)[n]))

Predicted: 1 	certainty: 1.0
Real: 1


In [26]:
# make list of all predictions
y_predicted = []
for i in range(0, len(predictions)):
    y_predicted.append(predictions[i].argmax(axis=0))

In [27]:
print(y_predicted)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [28]:
from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_error(y_test,y_predicted)
print('MAE = ',MAE)

MAE =  0.42690247817934085


Logical that the MAE is small because the biggest part of the target values are 1 so of course predicting 1 all the time will cause only a small error. However, predicting one all the time means we need another model.