In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('data/VTOUTP16.TXT', low_memory=False)
df.head()

Unnamed: 0,hnum2,ATYPE,asour,intage,TXTZIP,sex,dstat,PPAY,CHRGS,DX1,...,BTYPE,ERFLAG,cah,vtres,OBSFLAG,AFLAG,Uniq,ADMID_QTR,DISCD_QTR,CHRGS_HCIA
0,11,3,4,13,50,2,5,1,3409.85,L600,...,131,0,1,1,0,1,3,1,1,3409.85
1,11,3,4,11,50,2,5,7,1837.65,Z86010,...,131,0,1,1,0,1,54,1,1,1837.65
2,11,3,4,12,37,2,5,1,1102.65,Z1211,...,131,0,1,3,0,1,85,1,1,1102.65
3,11,3,4,10,50,1,5,7,1102.7,Z1211,...,131,0,1,1,0,1,87,1,1,1102.7
4,11,3,4,13,51,2,5,7,1837.65,Z1211,...,131,0,1,1,0,1,93,1,1,1837.65


In [3]:
# clean data
## only use the diagnosis columns + the pdays column
df.drop(df.columns[0:9],axis=1,inplace=True) # delete everything before DX columns
df.drop(df.columns[20:44],axis=1,inplace=True) # delete everything between DX columns and pdays
df.drop(df.columns[21:],axis=1,inplace=True) # delete everything after pdays
print(df.loc[0])

DX1        L600
DX2      J45909
DX3            
DX4            
DX5            
DX6            
DX7            
DX8            
DX9            
DX10           
DX11           
DX12           
DX13           
DX14           
DX15           
DX16           
DX17           
DX18           
DX19           
DX20           
pdays         1
Name: 0, dtype: object


In [4]:
# calculate distribution of pdays
unique, counts = np.unique(df.pdays, return_counts=True)

In [5]:
# print the extreme values
for i in range (0, len(counts)):
    if(counts[i] >= 200):
        if(unique[i] == 1):
            print(f"{unique[i]} day appears {counts[i]} times")
        else:
            print(f"{unique[i]} days appears {counts[i]} times")

1 day appears 363722 times
2 days appears 2167 times
3 days appears 453 times
28 days appears 632 times
29 days appears 956 times
30 days appears 764 times


In [6]:
# adjust distribution

## create new dataframe with a fixed number of randomly chosen rows where pdays = 1
### source: https://stackoverflow.com/questions/29204005/how-to-perform-under-sampling-in-scikit-learn
day1_indices = df[df.pdays == 1].index # get index of the rows where pdays = 1
random_indices = np.random.choice(day1_indices, 2000, replace=False) # 200 indicates how many samples there will be
day1_df = df.loc[random_indices] # dataframe with samples where pday = 1

## create new dataframe with a fixed number of randomly chosen rows where pdays = 2
day2_indices = df[df.pdays == 2].index # get index of the rows where pdays = 2
random_indices = np.random.choice(day2_indices, 100, replace=False) # 200 indicates how many samples there will be
day2_df = df.loc[random_indices] # dataframe with samples where pday = 2

## create new dataframe with a fixed number of randomly chosen rows where pdays = 3
day3_indices = df[df.pdays == 3].index # get index of the rows where pdays = 3
random_indices = np.random.choice(day3_indices, 100, replace=False) # 200 indicates how many samples there will be
day3_df = df.loc[random_indices] # dataframe with samples where pday = 3

## create new dataframe with a fixed number of randomly chosen rows where pdays = 28
day28_indices = df[df.pdays == 28].index # get index of the rows where pdays = 28
random_indices = np.random.choice(day28_indices, 100, replace=False) # 200 indicates how many samples there will be
day28_df = df.loc[random_indices] # dataframe with samples where pday = 28

## create new dataframe with a fixed number of randomly chosen rows where pdays = 29
day29_indices = df[df.pdays == 29].index # get index of the rows where pdays = 29
random_indices = np.random.choice(day29_indices, 100, replace=False) # 200 indicates how many samples there will be
day29_df = df.loc[random_indices] # dataframe with samples where pday = 29

## create new dataframe with a fixed number of randomly chosen rows where pdays = 30
day30_indices = df[df.pdays == 30].index # get index of the rows where pdays = 30
random_indices = np.random.choice(day30_indices, 100, replace=False) # 200 indicates how many samples there will be
day30_df = df.loc[random_indices] # dataframe with samples where pday = 30

## delete all rows where pdays exceeds the limit
df = df[df.pdays != 1]
df = df[df.pdays != 2]
df = df[df.pdays != 3]
df = df[df.pdays != 28]
df = df[df.pdays != 29]
df = df[df.pdays != 30]

## combine dataframe without excessive pdays  with dataframes with fewer rows of excessive pdays
df = df.append(day1_df, ignore_index=True)
df = df.append(day2_df, ignore_index=True)
df = df.append(day3_df, ignore_index=True)
df = df.append(day28_df, ignore_index=True)
df = df.append(day29_df, ignore_index=True)
df = df.append(day30_df, ignore_index=True)
df.tail()

Unnamed: 0,DX1,DX2,DX3,DX4,DX5,DX6,DX7,DX8,DX9,DX10,...,DX12,DX13,DX14,DX15,DX16,DX17,DX18,DX19,DX20,pdays
4434,N186,N2581,E7143,,,,,,,,...,,,,,,,,,,30
4435,N186,N2581,D509,,,,,,,,...,,,,,,,,,,30
4436,G629,R1311,R682,R05,,,,,,,...,,,,,,,,,,30
4437,N186,E875,,,,,,,,,...,,,,,,,,,,30
4438,N186,D509,,,,,,,,,...,,,,,,,,,,30


In [7]:
from sklearn.utils import class_weight
calculated_weights = class_weight.compute_class_weight("balanced", unique, df.pdays)

In [8]:
# clean up data
## replace spaces with 0
df.replace([' '], [0], inplace=True)

In [9]:
## replace all ICD-9 codes with ints in the DX columns
icd9codes_dict = {}
replacement = 0
for i in range (1,21):
    current_column = f'DX{i}'
    for icd9code in df[current_column]:
        if icd9code not in icd9codes_dict.keys():
            icd9codes_dict[icd9code] = replacement
            replacement += 1

for i in range (1,21):
    current_column = f'DX{i}'
    df[current_column] = df[current_column].map(icd9codes_dict)
    df[current_column] = pd.to_numeric(df[current_column])

In [10]:
# set features and targets
y = df.pdays
df.drop('pdays', axis=1, inplace=True)
X = df.values

In [11]:
# print first row to make sure the replacements took place
print(X[0])

[   0  426   64 2064  888 1873 1132  320 1318 1318 1318 1318 1318 1318
 1318 1318 1318 1318 1318 1318]


In [12]:
# split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
print(X_train.shape)

(3551, 20)


In [14]:
# setup model
## parameters
batchsize = 4
epochs = 100
inputdim = X_train.shape[1]

model = Sequential()
model.add(Dense(32, input_dim=inputdim, kernel_initializer='uniform', activation='relu'))
model.add(Dense(64, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, activation='linear'))

In [15]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [16]:
# train model
model.fit(X_train, y_train, epochs=epochs, validation_split=0.1, batch_size=batchsize, verbose=1, class_weight=calculated_weights)

Train on 3195 samples, validate on 356 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1214bd320>

In [17]:
# predict the number of days a person has to stay in the hospital
y_pred = model.predict(X_test)

In [18]:
print(y_pred)

[[33.340805 ]
 [32.923714 ]
 [ 2.528311 ]
 [ 1.3553249]
 [ 3.6543593]
 [ 1.3553249]
 [ 1.3553249]
 [ 8.56934  ]
 [33.340805 ]
 [55.2636   ]
 [ 1.3553249]
 [33.33724  ]
 [ 1.3553249]
 [ 7.4880447]
 [ 9.847231 ]
 [ 1.3553249]
 [10.486174 ]
 [ 1.3553249]
 [31.129065 ]
 [28.982819 ]
 [21.004221 ]
 [ 8.4828005]
 [ 1.3553249]
 [27.820894 ]
 [15.107878 ]
 [26.21409  ]
 [10.01246  ]
 [26.508991 ]
 [36.125706 ]
 [28.603626 ]
 [ 1.3553249]
 [10.437031 ]
 [12.326121 ]
 [ 1.3553249]
 [10.071585 ]
 [25.538807 ]
 [ 1.3553249]
 [ 1.3553249]
 [27.91136  ]
 [ 1.3553249]
 [ 3.8689244]
 [28.685799 ]
 [32.65271  ]
 [37.5342   ]
 [32.70186  ]
 [ 6.7016487]
 [33.291656 ]
 [30.98162  ]
 [ 1.3553249]
 [ 1.3553249]
 [ 1.3553249]
 [33.340805 ]
 [ 1.3553249]
 [32.849308 ]
 [ 1.3553249]
 [ 1.3553249]
 [23.953205 ]
 [33.340805 ]
 [34.549488 ]
 [29.82993  ]
 [31.293312 ]
 [24.837896 ]
 [ 6.8490944]
 [10.774422 ]
 [37.28924  ]
 [37.5342   ]
 [25.902496 ]
 [ 1.3553249]
 [32.70186  ]
 [25.525997 ]
 [56.518055 ]
 [ 1.3

In [24]:
# compare prediction with real value
n = 0
print("Predicted: {0}".format(y_pred[n]))
print("Real: {0}".format(list(y_test)[n]))

Predicted: [33.340805]
Real: 30


In [25]:
from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_error(y_test,y_pred)
print('MAE = ',MAE)

MAE =  20.732464229201412


In [26]:
# save model for later use
# https://stackoverflow.com/questions/40396042/how-to-save-scikit-learn-keras-model-into-a-persistence-file-pickle-hd5-json-ya
model.save('model_diagnosis_stay.h5')
# save ICD code for later use
#https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
np.save('icd_diagnosis.npy', icd9codes_dict) 