<center>
# Predicting Show or No-show on Medical Appointments 

## - Nilesh Pawar




In [29]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score as cross_validation
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Create Data Frame
df = pd.DataFrame(pd.read_csv('No-show-Issue-Comma-300k.csv'))

df.head()

Unnamed: 0,Age,Gender,AppointmentRegistration,ApointmentData,DayOfTheWeek,Status,Diabetes,Alcoolism,HiperTension,Handcap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime
0,19,M,2014-12-16T14:46:25Z,2015-01-14T00:00:00Z,Wednesday,Show-Up,0,0,0,0,0,0,0,0,-29
1,24,F,2015-08-18T07:01:26Z,2015-08-19T00:00:00Z,Wednesday,Show-Up,0,0,0,0,0,0,0,0,-1
2,4,F,2014-02-17T12:53:46Z,2014-02-18T00:00:00Z,Tuesday,Show-Up,0,0,0,0,0,0,0,0,-1
3,5,M,2014-07-23T17:02:11Z,2014-08-07T00:00:00Z,Thursday,Show-Up,0,0,0,0,0,0,0,1,-15
4,38,M,2015-10-21T15:20:09Z,2015-10-27T00:00:00Z,Tuesday,Show-Up,0,0,0,0,0,0,0,1,-6


In [3]:
#Creating Subset to make different trials. Full DF has 300,000 rows.
dfSubset = df.sample(frac = 0.01)

In [4]:
#Changing Status variable to dummy
StatusDummy = pd.get_dummies(dfSubset['Status'])
dfSubset['StatusDummy'] = StatusDummy['Show-Up']

In [5]:
#Changing Gender to Dummy
GenderDummy = pd.get_dummies(dfSubset['Gender'])
dfSubset['isFemale'] =  GenderDummy['F']

In [6]:
#Get absolutes values for awaiting time
dfSubset['AwaitingTime'] = abs(dfSubset['AwaitingTime'])

In [7]:
#Get time of registration
dfSubset['AppointmentRegTime'] = pd.to_datetime(dfSubset['AppointmentRegistration'])

In [8]:
#Get date time 
dfSubset['AppointmentDataTime'] = pd.to_datetime(dfSubset['ApointmentData'])

In [9]:
#Get how many days for appointment since registration
daysToAppointment = dfSubset['AppointmentDataTime'] - dfSubset['AppointmentRegTime']
dfSubset['daysToAppointment'] = daysToAppointment.apply(lambda x: x.total_seconds() / (3600 * 24))

In [10]:
#Calculate hour of appointment
def calculateHour(timestamp):
    timestamp = str(timestamp)
    hour = int(timestamp[11:13])
    minute = int(timestamp[14:16])
    second = int(timestamp[17:])
    return round(hour + minute/60 + second/3600)

dfSubset['HourOfTheDay'] = dfSubset['AppointmentRegTime'].apply(calculateHour)

In [11]:
def ageGroup(x):
        if x <= 16:
                return 0
        elif x <= 30:
                return 1
        elif x <= 50:
                return 2
        elif x <= 70:
                return 3
        else:
                return 4

dfSubset['AgeGrp'] = dfSubset['Age'].apply(ageGroup)  


In [12]:
#Creates Categories for days to appointment 
daystoApptCateg = pd.cut(dfSubset['daysToAppointment'],[0,7,30,90,180,365,100000], labels = [0,1,2,3,4,5])
dfSubset['daysToApptCateg'] = daystoApptCateg

In [13]:
#Creating dummy is weekend
def dow(x):
        if x == 'Sunday':
                return 1
        elif x == 'Monday':
                return 0
        elif x == 'Tuesday':
                return 0
        elif x == 'Wednesday':
                return 0
        elif x == 'Thursday':
                return 0
        elif x == 'Friday':
                return 0
        elif x == 'Saturday':
                return 1
            
WeekendDummy = dfSubset['DayOfTheWeek'].apply(dow)
dfSubset['WeekendDummy'] = WeekendDummy

In [14]:
#Renaming Variables with typos
dfSubset.rename(columns = {'ApointmentData':'AppointmentData',
                         'Alcoolism': 'Alchoholism',
                         'HiperTension': 'Hypertension',
                         'Handcap': 'Handicap'}, inplace = True)

In [15]:
#Decriprive analysis of the data
dfSubset.describe()

Unnamed: 0,Age,Diabetes,Alchoholism,Hypertension,Handicap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime,StatusDummy,isFemale,daysToAppointment,HourOfTheDay,AgeGrp,WeekendDummy
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,37.613333,0.077667,0.022667,0.221,0.022,0.05,0.095,0.000667,0.580333,13.856667,0.694,0.664667,13.363734,11.826333,1.782,0.003667
std,22.948595,0.267691,0.148863,0.41499,0.157663,0.217981,0.293264,0.025816,0.498962,14.750321,0.460907,0.472186,14.746681,3.298353,1.269257,0.060452
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.174468,6.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.354893,9.0,1.0,0.0
50%,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,1.0,1.0,7.587668,11.0,2.0,0.0
75%,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,1.0,1.0,20.318527,15.0,3.0,0.0
max,96.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,117.0,1.0,1.0,116.379965,21.0,4.0,1.0


In [16]:
#Decriprive analysis of the data when there is no show up
dfSubset[dfSubset['StatusDummy'] == 0].describe()

Unnamed: 0,Age,Diabetes,Alchoholism,Hypertension,Handicap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime,StatusDummy,isFemale,daysToAppointment,HourOfTheDay,AgeGrp,WeekendDummy
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,35.040305,0.065359,0.028322,0.1939,0.023965,0.051198,0.105664,0.0,0.587146,15.3878,0.0,0.666667,14.887649,12.0,1.657952,0.002179
std,22.131793,0.247294,0.165983,0.395567,0.159992,0.220522,0.307575,0.0,0.494824,14.168019,0.0,0.471661,14.172932,3.195553,1.234985,0.046651
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.174468,7.0,0.0,0.0
25%,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.671814,9.0,1.0,0.0
50%,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,0.0,1.0,10.478912,11.0,2.0,0.0
75%,51.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,0.0,1.0,21.627986,15.0,3.0,0.0
max,91.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,2.0,91.0,0.0,1.0,90.517708,21.0,4.0,1.0


In [17]:
#Decriprive analysis of the data when there is show up
dfSubset[dfSubset['StatusDummy'] == 1].describe()

Unnamed: 0,Age,Diabetes,Alchoholism,Hypertension,Handicap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime,StatusDummy,isFemale,daysToAppointment,HourOfTheDay,AgeGrp,WeekendDummy
count,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0,2082.0
mean,38.747839,0.083093,0.020173,0.232949,0.021134,0.049472,0.090298,0.000961,0.577329,13.181556,1.0,0.663785,12.691806,11.74976,1.836695,0.004323
std,23.214403,0.276089,0.140625,0.422812,0.156656,0.216903,0.286677,0.030986,0.500864,14.953482,0.0,0.472527,14.946723,3.340563,1.280557,0.065621
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.242546,6.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,2.689094,9.0,1.0,0.0
50%,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,1.0,1.0,6.668252,11.0,2.0,0.0
75%,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19.0,1.0,1.0,18.26537,15.0,3.0,0.0
max,96.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,117.0,1.0,1.0,116.379965,21.0,4.0,1.0


In [18]:
#Delete data that won't go in the model
del dfSubset['Gender']
del dfSubset['AppointmentRegistration']
del dfSubset['AppointmentData']
del dfSubset['DayOfTheWeek']
del dfSubset['Status']
del dfSubset['AppointmentRegTime']
del dfSubset['AppointmentDataTime']
del dfSubset['daysToAppointment']
del dfSubset['Age']

In [19]:
#Break in train and test data
import random
random.seed(1500)
from sklearn.model_selection import train_test_split
X = dfSubset
Y = dfSubset['StatusDummy']
X = dfSubset.drop('StatusDummy', axis = 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(1800, 14) (1200, 14) (1800,) (1200,)


In [20]:
X_train.head()

Unnamed: 0,Diabetes,Alchoholism,Hypertension,Handicap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime,isFemale,HourOfTheDay,AgeGrp,daysToApptCateg,WeekendDummy
6604,0,0,1,0,0,0,0,1,4,1,15,3,0,0
15731,0,0,1,0,0,0,0,1,29,1,12,2,1,0
555,0,0,0,0,0,0,0,1,7,0,8,0,0,0
13735,0,0,0,0,0,0,0,1,24,1,12,0,1,0
114058,0,0,0,0,0,0,0,1,26,0,10,3,1,0


In [21]:
X_test.head()

Unnamed: 0,Diabetes,Alchoholism,Hypertension,Handicap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime,isFemale,HourOfTheDay,AgeGrp,daysToApptCateg,WeekendDummy
208463,0,0,0,0,0,0,0,0,2,0,12,3,0,0
143278,0,0,0,0,0,0,0,1,91,1,12,3,3,0
206651,0,0,1,0,0,0,0,1,13,1,11,3,1,0
160434,0,0,0,0,0,0,0,1,6,1,13,2,0,0
93039,0,0,1,0,0,1,0,1,47,1,8,2,2,0


In [22]:
#Create k-nn
results_knn = []
for i in range(1,150):
    knn=KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,Y_train)
    Y_pred=knn.predict(X_test)
    scores_knn = accuracy_score(Y_test,Y_pred)
    results_knn.append((i,scores_knn))

df_accuracy_knn = pd.DataFrame(data=results_knn,columns=['Neighbors', 'Accuracy'])

df_accuracy_knn.ix[df_accuracy_knn['Accuracy'].idxmax()]

Neighbors    41.0000
Accuracy      0.6725
Name: 40, dtype: float64

In [23]:
# Create Naive Bayes classifier
clf_gb = GaussianNB()
clf_gb.fit(X_train, Y_train)
predicts_gb = clf_gb.predict(X_test)
print("GB Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(Y_test, predicts_gb))

GB Accuracy Rate, which is calculated by accuracy_score() is: 0.338333


In [24]:
#Decision Tree
clf_dt = tree.DecisionTreeClassifier()
clf_dt.fit(X_train, Y_train)
predicts_dt = clf_dt.predict(X_test)
print("Decision tree Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(Y_test, predicts_dt))

Decision tree Accuracy Rate, which is calculated by accuracy_score() is: 0.583333


In [25]:
#Random forest classifier
clf_rf = RandomForestClassifier(random_state = 1)
clf_rf.fit(X_train, Y_train)
accuracy_rf = clf_rf.score(X_test,Y_test)
print("Random Forest Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_rf)

Random Forest Accuracy Rate, which is calculated by accuracy_score() is: 0.590833


In [26]:
#Tried kernel = 'rbf' also, but 'linear' was better
#SVM
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train, Y_train)
predicts_svm = clf_svm.predict(X_test)
print("SVM Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(Y_test, predicts_svm))

SVM Accuracy Rate, which is calculated by accuracy_score() is: 0.668333


In [27]:
scores = cross_val_score(clf_svm, X_test, Y_test, cv=8)

In [28]:
#Printing SVM Model Output Summary
print('*******MODEL OUTPUT***********')
print('\nAccuracy: %0.2f' % (scores.mean()))

print('\nDifference of Show Up by Age Group Summaries bins = (0,16,30,50,70)')
for i in range(0,5):
    print('Category = ', i)
    print(dfSubset['StatusDummy'][dfSubset['AgeGrp'] == i].value_counts(normalize = True))

print('\nDifference of Show Up by Days to Appointment (Wait Time) Summaries (bins = (0,7,30,90,180,365)')
for i in range(0,6):
    print('Category = ', i)
    print(dfSubset['StatusDummy'][dfSubset['daysToApptCateg'] == i].value_counts(normalize = True))

print('\nDifference of Show Up if Is Weekend (Is Weekend = 1)')
for v in range(0,2):
    print('Category = ', v)
    print(dfSubset['StatusDummy'][dfSubset['WeekendDummy'] == v].value_counts(normalize = True))
    
print('\nDifference of Show Up if Is Female (Is Female = 1)')
for z in range(0,2):
    print('Category = ', z)
    print(dfSubset['StatusDummy'][dfSubset['isFemale'] == v].value_counts(normalize = True))

*******MODEL OUTPUT***********

Accuracy: 0.67

Difference of Show Up by Age Group Summaries bins = (0,16,30,50,70)
Category =  0
1    0.688693
0    0.311307
Name: StatusDummy, dtype: float64
Category =  1
1    0.606805
0    0.393195
Name: StatusDummy, dtype: float64
Category =  2
1    0.690566
0    0.309434
Name: StatusDummy, dtype: float64
Category =  3
1    0.752988
0    0.247012
Name: StatusDummy, dtype: float64
Category =  4
1    0.727273
0    0.272727
Name: StatusDummy, dtype: float64

Difference of Show Up by Days to Appointment (Wait Time) Summaries (bins = (0,7,30,90,180,365)
Category =  0
1    0.740431
0    0.259569
Name: StatusDummy, dtype: float64
Category =  1
1    0.660408
0    0.339592
Name: StatusDummy, dtype: float64
Category =  2
1    0.606154
0    0.393846
Name: StatusDummy, dtype: float64
Category =  3
1    0.923077
0    0.076923
Name: StatusDummy, dtype: float64
Category =  4
Series([], Name: StatusDummy, dtype: float64)
Category =  5
Series([], Name: StatusDummy, 