In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.utils import np_utils

In [92]:
# Load the raw records. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, so every column gets a single consistent dtype.
df = pd.read_csv('data/VTOUTP16.TXT', low_memory=False)
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,hnum2,ATYPE,asour,intage,TXTZIP,sex,dstat,PPAY,CHRGS,DX1,...,BTYPE,ERFLAG,cah,vtres,OBSFLAG,AFLAG,Uniq,ADMID_QTR,DISCD_QTR,CHRGS_HCIA
0,11,3,4,13,50,2,5,1,3409.85,L600,...,131,0,1,1,0,1,3,1,1,3409.85
1,11,3,4,11,50,2,5,7,1837.65,Z86010,...,131,0,1,1,0,1,54,1,1,1837.65
2,11,3,4,12,37,2,5,1,1102.65,Z1211,...,131,0,1,3,0,1,85,1,1,1102.65
3,11,3,4,10,50,1,5,7,1102.7,Z1211,...,131,0,1,1,0,1,87,1,1,1102.7
4,11,3,4,13,51,2,5,7,1837.65,Z1211,...,131,0,1,1,0,1,93,1,1,1837.65


In [93]:
# clean data
## keep only the diagnosis columns (DX1..DX20) and the ECODE1 column;
## every other column is dropped.
## Columns are chosen by name instead of by position, so re-running this cell
## is a no-op rather than dropping further columns.
keep_columns = [f'DX{i}' for i in range(1, 21)] + ['ECODE1']

# fail loudly if the input file does not have the expected layout
missing_columns = [name for name in keep_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f'expected columns not found: {missing_columns}')

# drop in place so `df` keeps being the same object for the following cells
unwanted_columns = [name for name in df.columns if name not in keep_columns]
df.drop(unwanted_columns, axis=1, inplace=True)
print(df.loc[0])

DX1         L600
DX2       J45909
DX3             
DX4             
DX5             
DX6             
DX7             
DX8             
DX9             
DX10            
DX11            
DX12            
DX13            
DX14            
DX15            
DX16            
DX17            
DX18            
DX19            
DX20            
ECODE1          
Name: 0, dtype: object


In [94]:
# clean up data
## replace blank cells with 0 so that empty code slots share one value
## NOTE: only cells equal to exactly one space (' ') are matched; cells padded
## with a different number of spaces are left untouched — TODO confirm that
## the raw file uses a single space for empty fields
df.replace([' '], [0], inplace=True)

In [95]:
## encode every diagnosis code found in the DX columns as an integer id
## ids are handed out in order of first appearance, scanning DX1 from top to
## bottom, then DX2, and so on
icd9codes_dict = {}
dx_columns = [f'DX{k}' for k in range(1, 21)]

for column_name in dx_columns:
    for code in df[column_name]:
        # the next free id always equals the current dictionary size
        icd9codes_dict.setdefault(code, len(icd9codes_dict))

# next unused id; later cells continue numbering from here
replacement = len(icd9codes_dict)

## swap the codes for their ids and make sure the columns are numeric
for column_name in dx_columns:
    df[column_name] = pd.to_numeric(df[column_name].map(icd9codes_dict))

In [96]:
## encode the codes in the ECODE columns with the same dictionary
## codes already seen in the DX columns keep their id, unseen ones get the
## next free id
ecode_columns = [f'ECODE{k}' for k in range(1, 2)]

for column_name in ecode_columns:
    for code in df[column_name]:
        if code in icd9codes_dict:
            continue
        icd9codes_dict[code] = replacement
        replacement += 1

## swap the codes for their ids and make sure the columns are numeric
for column_name in ecode_columns:
    encoded_column = df[column_name].map(icd9codes_dict)
    df[column_name] = pd.to_numeric(encoded_column)

In [97]:
# set features and targets
## pop() removes ECODE1 from df and hands it back as the target series;
## the columns that remain are the features
y = df.pop('ECODE1')
X = df.values

In [98]:
# print first row to make sure the replacements took place
# (every entry should now be an integer id instead of a code string)
print(X[0])

[   0   77 2618 2618 2618 2618 2618 2618 2618 2618 2618 2618 2618 2618
 2618 2618 2618 2618 2618 2618]


In [99]:
# split into training and test set
## 20% of the rows are held out for testing; the fixed random_state makes the
## shuffle reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [100]:
# sanity check: (number of training rows, number of feature columns)
print(X_train.shape)

(296506, 20)


In [101]:
# One-hot encoding
## turns the integer class ids in y_train into one row of 0/1 flags per sample
## NOTE: no num_classes is passed, so the number of columns is inferred from
## the largest id present in y_train and therefore depends on the split
## NOTE: y_test is left as integer ids
## NOTE: this rebinds y_train, so the cell must not be run twice in a row
y_train = np_utils.to_categorical(y_train)

In [102]:
# inspect the one-hot encoded training targets
print(y_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [115]:
# setup model
## parameters
model = Sequential()
dropoutrate = 0.2
batchsize = 1024
inputdim = X_train.shape[1]  # one input per feature column
# The softmax layer needs exactly one unit per column of the one-hot encoded
# targets. Reading the width from y_train keeps the model in sync with the
# data instead of relying on a hard-coded class count.
outputdim = y_train.shape[1]
adam = keras.optimizers.Adam() # Adam optimizer

## architecture: three small fully connected ReLU layers, each followed by
## dropout, then a softmax over all target classes
model.add(Dense(10, input_dim=inputdim, kernel_initializer='uniform',activation='relu'))
model.add(Dropout(dropoutrate))
model.add(Dense(40, kernel_initializer='uniform',activation='relu'))
model.add(Dropout(dropoutrate))
model.add(Dense(40, kernel_initializer='uniform',activation='relu'))
model.add(Dropout(dropoutrate))
model.add(Dense(outputdim, activation='softmax'))

In [116]:
# categorical cross-entropy matches the one-hot encoded targets and the
# softmax output layer; accuracy is tracked as an additional metric
model.compile(loss=keras.losses.categorical_crossentropy,optimizer=adam,metrics=['accuracy'])

In [117]:
# train model
## 30% of the training rows are set aside by Keras for validation after each
## of the 10 epochs; the returned history holds the per-epoch metrics
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.3,
    batch_size=batchsize,
    verbose=1,
)

Train on 207554 samples, validate on 88952 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [121]:
# predict the ECODE1 class for every sample in the test set
# (the result holds one row per sample with one score per output class)
predictions = model.predict(X_test)

In [123]:
# compare prediction with real
n = 0
# index of the most probable class for sample n, computed once and reused
predicted_class = predictions[n].argmax(axis=0)
print("Predicted: {0} \tcertainty: {1}".format(predicted_class, predictions[n][predicted_class]))
# positional lookup; avoids copying the whole test target into a list just to
# read a single element
print("Real: {0}".format(y_test.iloc[n]))

Predicted: 2618 	certainty: 0.9291730523109436
Real: 2618


In [124]:
# make list of all predictions
## argmax along axis 1 picks the most probable class of every row in a single
## vectorised call; list() keeps the result a plain Python list as before
y_predicted = list(predictions.argmax(axis=1))

In [132]:
# persist the trained network so it can be reloaded without retraining
# https://stackoverflow.com/questions/40396042/how-to-save-scikit-learn-keras-model-into-a-persistence-file-pickle-hd5-json-ya
model_path = 'model_diagnosis_ecode.h5'
model.save(model_path)

# persist the code -> integer id mapping that was used to encode the data
# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
# NOTE: numpy pickles the dict inside the .npy file; read it back with
# np.load(path, allow_pickle=True).item()
codes_path = 'icd_diagnosis.npy'
np.save(codes_path, icd9codes_dict)