In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization

from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('data/VTOUTP16.TXT', low_memory=False)
df.head()

Unnamed: 0,hnum2,ATYPE,asour,intage,TXTZIP,sex,dstat,PPAY,CHRGS,DX1,...,BTYPE,ERFLAG,cah,vtres,OBSFLAG,AFLAG,Uniq,ADMID_QTR,DISCD_QTR,CHRGS_HCIA
0,11,3,4,13,50,2,5,1,3409.85,L600,...,131,0,1,1,0,1,3,1,1,3409.85
1,11,3,4,11,50,2,5,7,1837.65,Z86010,...,131,0,1,1,0,1,54,1,1,1837.65
2,11,3,4,12,37,2,5,1,1102.65,Z1211,...,131,0,1,3,0,1,85,1,1,1102.65
3,11,3,4,10,50,1,5,7,1102.7,Z1211,...,131,0,1,1,0,1,87,1,1,1102.7
4,11,3,4,13,51,2,5,7,1837.65,Z1211,...,131,0,1,1,0,1,93,1,1,1837.65


In [3]:
# clean data
## only use the diagnosis columns + the pdays column
df.drop(df.columns[0:9],axis=1,inplace=True) # delete everything before DX columns
df.drop(df.columns[40:44],axis=1,inplace=True) # delete everything between DX, PX columns and pdays
df.drop(df.columns[41:],axis=1,inplace=True) # delete everything after pdays
print(df.loc[0])

DX1        L600
DX2      J45909
DX3            
DX4            
DX5            
DX6            
DX7            
DX8            
DX9            
DX10           
DX11           
DX12           
DX13           
DX14           
DX15           
DX16           
DX17           
DX18           
DX19           
DX20           
PX1            
PX2            
PX3            
PX4            
PX5            
PX6            
PX7            
PX8            
PX9            
PX10           
PX11           
PX12           
PX13           
PX14           
PX15           
PX16           
PX17           
PX18           
PX19           
PX20           
pdays         1
Name: 0, dtype: object


In [4]:
# calculate distribution of pdays
unique, counts = np.unique(df.pdays, return_counts=True)

# show all distribution of y except the first
import matplotlib.pyplot as plt
plt.figure(num=None, figsize=(20, 6), dpi=80, facecolor='w', edgecolor='k')
axes = plt.gca()
axes.set_xlim([0,len(unique)])
axes.set_ylim([0,200])
plt.bar(unique, counts)

<Container object of 215 artists>

In [5]:
# print the extreme values
for i in range (0, len(counts)):
    if(counts[i] >= 200):
        if(unique[i] == 1):
            print(f"{unique[i]} day appears {counts[i]} times")
        else:
            print(f"{unique[i]} days appears {counts[i]} times")

1 day appears 363722 times
2 days appears 2167 times
3 days appears 453 times
28 days appears 632 times
29 days appears 956 times
30 days appears 764 times


In [6]:
# adjust distribution

## create new dataframe with a fixed number of randomly chosen rows where pdays = 1
### source: https://stackoverflow.com/questions/29204005/how-to-perform-under-sampling-in-scikit-learn
day1_indices = df[df.pdays == 1].index # get index of the rows where pdays = 1
random_indices = np.random.choice(day1_indices, 2000, replace=False) # 200 indicates how many samples there will be
day1_df = df.loc[random_indices] # dataframe with samples where pday = 1

## create new dataframe with a fixed number of randomly chosen rows where pdays = 2
day2_indices = df[df.pdays == 2].index # get index of the rows where pdays = 2
random_indices = np.random.choice(day2_indices, 200, replace=False) # 200 indicates how many samples there will be
day2_df = df.loc[random_indices] # dataframe with samples where pday = 2

## create new dataframe with a fixed number of randomly chosen rows where pdays = 3
day3_indices = df[df.pdays == 3].index # get index of the rows where pdays = 3
random_indices = np.random.choice(day3_indices, 200, replace=False) # 200 indicates how many samples there will be
day3_df = df.loc[random_indices] # dataframe with samples where pday = 3

## create new dataframe with a fixed number of randomly chosen rows where pdays = 28
day28_indices = df[df.pdays == 28].index # get index of the rows where pdays = 28
random_indices = np.random.choice(day28_indices, 200, replace=False) # 200 indicates how many samples there will be
day28_df = df.loc[random_indices] # dataframe with samples where pday = 28

## create new dataframe with a fixed number of randomly chosen rows where pdays = 29
day29_indices = df[df.pdays == 29].index # get index of the rows where pdays = 29
random_indices = np.random.choice(day29_indices, 200, replace=False) # 200 indicates how many samples there will be
day29_df = df.loc[random_indices] # dataframe with samples where pday = 29

## create new dataframe with a fixed number of randomly chosen rows where pdays = 30
day30_indices = df[df.pdays == 30].index # get index of the rows where pdays = 30
random_indices = np.random.choice(day30_indices, 200, replace=False) # 200 indicates how many samples there will be
day30_df = df.loc[random_indices] # dataframe with samples where pday = 30

## delete all rows where pdays exceeds the limit
df = df[df.pdays != 1]
df = df[df.pdays != 2]
df = df[df.pdays != 3]
df = df[df.pdays != 28]
df = df[df.pdays != 29]
df = df[df.pdays != 30]

## combine dataframe without excessive pdays  with dataframes with fewer rows of excessive pdays
df = df.append(day1_df, ignore_index=True)
df = df.append(day2_df, ignore_index=True)
df = df.append(day3_df, ignore_index=True)
df = df.append(day28_df, ignore_index=True)
df = df.append(day29_df, ignore_index=True)
df = df.append(day30_df, ignore_index=True)
df.tail()

Unnamed: 0,DX1,DX2,DX3,DX4,DX5,DX6,DX7,DX8,DX9,DX10,...,PX12,PX13,PX14,PX15,PX16,PX17,PX18,PX19,PX20,pdays
4934,N186,E7143,D509,,,,,,,,...,,,,,,,,,,30
4935,N186,N2581,,,,,,,,,...,,,,,,,,,,30
4936,N186,N2581,,,,,,,,,...,,,,,,,,,,30
4937,N186,D631,,,,,,,,,...,,,,,,,,,,30
4938,N186,,,,,,,,,,...,,,,,,,,,,30


In [7]:
# clean up data
## replace spaces with 0
df.replace([' '], [0], inplace=True)

In [8]:
## replace all ICD-9 codes with ints in the DX columns
icd9codes_dict = {}
replacement = 0
for i in range (1,21):
    current_column = f'DX{i}'
    for icd9code in df[current_column]:
        if icd9code not in icd9codes_dict.keys():
            icd9codes_dict[icd9code] = replacement
            replacement += 1

for i in range (1,21):
    current_column = f'DX{i}'
    df[current_column] = df[current_column].map(icd9codes_dict)
    df[current_column] = pd.to_numeric(df[current_column])

In [9]:
# replace all ICD-9 codes with ints in the PX columns
for i in range (1,21):
    current_column = f'PX{i}'
    for icd9code in df[current_column]:
        if icd9code not in icd9codes_dict.keys():
            icd9codes_dict[icd9code] = replacement
            replacement += 1

for i in range (1,21):
    current_column = f'PX{i}'
    df[current_column] = df[current_column].map(icd9codes_dict)
    df[current_column] = pd.to_numeric(df[current_column])

In [10]:
# set features and targets
y = df.pdays
df.drop('pdays', axis=1, inplace=True)
X = df.values

In [11]:
# print first row to make sure the replacements took place
print(X[0])

[   0  426   64 2122 1231  653  572  320  542  542  542  542  542  542
  542  542  542  542  542  542  542  542  542  542  542  542  542  542
  542  542  542  542  542  542  542  542  542  542  542  542]


In [12]:
# split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=521)

In [13]:
print(X_train.shape)

(3951, 40)


In [19]:
# setup model
## https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
## parameters
batchsize = 4
epochs = 100
inputdim = X_train.shape[1]

model = Sequential()
model.add(Dense(32, input_dim=inputdim, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

NameError: name 'dropoutrate' is not defined

In [20]:
# train model
model.fit(X_train, y_train, epochs=epochs, validation_split=0.1, batch_size=batchsize, verbose=1)

RuntimeError: The model needs to be compiled before being used.

In [16]:
# predict the number of days a person has to stay in the hospital
y_pred = model.predict(X_test)

In [17]:
print(y_pred)

[[ 2.77235546e+01]
 [ 3.07323570e+01]
 [ 1.08987169e+01]
 [ 3.45748596e+01]
 [ 2.52321386e+00]
 [ 4.55968261e-01]
 [ 2.59215088e+01]
 [ 1.63997517e+01]
 [ 2.17637014e+00]
 [ 6.69591308e-01]
 [ 6.39005065e-01]
 [-3.80648255e-01]
 [ 3.20741959e+01]
 [ 8.28198791e-01]
 [ 1.41370142e+00]
 [ 2.30777502e+00]
 [-1.02015936e+00]
 [ 3.26359291e+01]
 [ 7.13715911e-01]
 [ 2.90539818e+01]
 [ 2.61970654e+01]
 [ 2.65078392e+01]
 [ 2.43793106e+01]
 [ 1.32415521e+00]
 [ 2.45285110e+01]
 [ 9.26783676e+01]
 [ 2.43793106e+01]
 [ 4.13581123e+01]
 [ 3.60805588e+01]
 [ 5.25733604e+01]
 [ 2.48036742e-01]
 [ 8.62355590e-01]
 [-7.34243512e-01]
 [-1.46036780e+00]
 [ 1.32105005e+00]
 [ 3.80483031e-01]
 [ 2.43793106e+01]
 [ 1.15553033e+00]
 [ 2.97490368e+01]
 [ 5.32567382e-01]
 [-1.94869483e+00]
 [ 1.49690759e+00]
 [ 2.81114674e+01]
 [ 9.33034301e-01]
 [ 2.81372890e+01]
 [ 2.43793106e+01]
 [-9.48193645e+00]
 [ 6.33153319e-01]
 [ 1.01524866e+00]
 [ 2.84358292e+01]
 [ 2.56230526e+01]
 [ 4.70491600e+01]
 [ 4.1727146

In [18]:
# compare prediction with real value
print("Predicted: {0}".format(y_pred[1]))
print("Real: {0}".format(list(y_test)[1]))

Predicted: [30.732357]
Real: 90


In [None]:
from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_error(y_test,y_predicted)
print('MAE = ',MAE)