In [32]:
import pandas as pd
import numpy as np
import datetime
from time import strftime

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [33]:
week_key = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

In [34]:
df = pd.read_csv("/Users/P U N I/M L/PROJ/Patient dataset.csv")

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [36]:
df['PatientId'] = df['PatientId'].astype('int64')
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay']).dt.date.astype('datetime64[ns]')
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay']).dt.date.astype('datetime64[ns]')
df = df.rename(columns={'Hipertension': 'Hypertension', 'Handcap': 'Handicap', 'SMS_received': 'SMSReceived', 'No-show': 'NoShow'})

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null int64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null datetime64[ns]
AppointmentDay    110527 non-null datetime64[ns]
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hypertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handicap          110527 non-null int64
SMSReceived       110527 non-null int64
NoShow            110527 non-null object
dtypes: datetime64[ns](2), int64(9), object(3)
memory usage: 11.8+ MB


In [38]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow
0,29900000000000,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,559000000000000,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4260000000000,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,868000000000,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8840000000000,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [39]:
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hypertension',
       'Diabetes', 'Alcoholism', 'Handicap', 'SMSReceived', 'NoShow'],
      dtype='object')

In [40]:
df.drop(['PatientId', 'AppointmentID'], axis=1, inplace=True)


In [41]:
df['Scholarship'] = df['Scholarship'].astype('object')
df['Hypertension'] = df['Hypertension'].astype('object')
df['Diabetes'] = df['Diabetes'].astype('object')
df['Alcoholism'] = df['Alcoholism'].astype('object')
df['Handicap'] = df['Handicap'].astype('object')
df['SMSReceived'] = df['SMSReceived'].astype('object')
df['Neighbourhood']=df['Neighbourhood'].astype('object')

In [42]:
np.sort(df.Age.unique())

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 102, 115],
      dtype=int64)

In [43]:
df=df[df.Age>=0]

In [44]:
np.sort(df.Age.unique())

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 102, 115],
      dtype=int64)

In [45]:
df[(df.Age <= 0) & ((df.Hypertension.astype(int) == 1) | (df.Diabetes.astype(int) == 1) | (df.Alcoholism.astype(int) == 1))]

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow


In [46]:
print("Unique Values in `ScheduledDay` => {}".format(np.sort(df.ScheduledDay.dt.strftime('%d-%m-%y').unique())))

Unique Values in `ScheduledDay` => ['01-02-16' '01-03-16' '01-04-16' '01-06-16' '02-02-16' '02-03-16'
 '02-05-16' '02-06-16' '03-02-16' '03-03-16' '03-05-16' '03-06-16'
 '03-12-15' '04-01-16' '04-02-16' '04-03-16' '04-05-16' '04-06-16'
 '05-01-16' '05-02-16' '05-03-16' '05-04-16' '05-05-16' '06-04-16'
 '06-05-16' '06-06-16' '07-01-16' '07-03-16' '07-04-16' '07-05-16'
 '07-06-16' '07-12-15' '08-03-16' '08-04-16' '08-06-16' '08-12-15'
 '09-03-16' '09-04-16' '09-05-16' '10-03-16' '10-05-16' '10-11-15'
 '11-01-16' '11-02-16' '11-03-16' '11-04-16' '11-05-16' '12-02-16'
 '12-04-16' '12-05-16' '13-01-16' '13-04-16' '13-05-16' '14-01-16'
 '14-03-16' '14-04-16' '14-05-16' '14-12-15' '15-02-16' '15-03-16'
 '15-04-16' '15-12-15' '16-02-16' '16-03-16' '16-04-16' '16-05-16'
 '17-02-16' '17-03-16' '17-05-16' '18-02-16' '18-03-16' '18-04-16'
 '18-05-16' '19-01-16' '19-02-16' '19-03-16' '19-04-16' '19-05-16'
 '20-01-16' '20-04-16' '20-05-16' '21-01-16' '21-03-16' '22-01-16'
 '22-02-16' '22-03-16' '23-

In [47]:
print("Unique Values in `AppointmentDay` => {}".format(np.sort(df.AppointmentDay.dt.strftime('%d-%m-%y').unique())))

Unique Values in `AppointmentDay` => ['01-06-16' '02-05-16' '02-06-16' '03-05-16' '03-06-16' '04-05-16'
 '05-05-16' '06-05-16' '06-06-16' '07-06-16' '08-06-16' '09-05-16'
 '10-05-16' '11-05-16' '12-05-16' '13-05-16' '14-05-16' '16-05-16'
 '17-05-16' '18-05-16' '19-05-16' '20-05-16' '24-05-16' '25-05-16'
 '29-04-16' '30-05-16' '31-05-16']


In [48]:
df['ScheduledDay_DOW'] = df['ScheduledDay'].dt.weekday_name
df['AppointmentDay_DOW'] = df['AppointmentDay'].dt.weekday_name

In [49]:
df['AppointmentDay'] = np.where((df['AppointmentDay'] - df['ScheduledDay']).dt.days < 0, df['ScheduledDay'], df['AppointmentDay'])
df['Waiting_Time_days'] = df['AppointmentDay'] - df['ScheduledDay']
df['Waiting_Time_days'] = df['Waiting_Time_days'].dt.days

In [50]:
df.sample(n=10)

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow,ScheduledDay_DOW,AppointmentDay_DOW,Waiting_Time_days
86196,M,2016-06-06,2016-06-06,24,JARDIM CAMBURI,0,0,0,0,0,0,No,Monday,Monday,0
90746,F,2016-06-01,2016-06-01,30,JARDIM DA PENHA,0,0,0,0,0,0,No,Wednesday,Wednesday,0
96609,F,2016-05-10,2016-06-06,68,JARDIM DA PENHA,0,0,0,0,0,1,No,Tuesday,Monday,27
95925,F,2016-06-02,2016-06-08,8,FORTE SÃO JOÃO,0,0,0,0,0,1,Yes,Thursday,Wednesday,6
33518,F,2016-04-15,2016-05-03,50,SANTA MARTHA,1,0,1,0,0,1,No,Friday,Tuesday,18
11350,M,2016-05-20,2016-05-24,25,ILHA DE SANTA MARIA,0,0,0,0,0,1,No,Friday,Tuesday,4
39390,M,2016-05-25,2016-05-31,20,COMDUSA,0,0,0,0,0,1,No,Wednesday,Tuesday,6
36828,F,2016-04-14,2016-05-19,3,RESISTÊNCIA,0,0,0,0,0,0,No,Thursday,Thursday,35
65171,M,2016-04-20,2016-05-12,48,SÃO CRISTÓVÃO,0,1,0,0,0,1,No,Wednesday,Thursday,22
41552,M,2016-04-08,2016-05-12,51,DO QUADRO,0,1,0,0,0,1,No,Friday,Thursday,34


In [51]:
print("NoShow and Show Count of Patients\n")
print(df.groupby(['NoShow']).size())

print("\nNoShow and Show '%' of Patients\n")
show = df.groupby(['NoShow']).size()[0]/(df.groupby(['NoShow']).size()[0]+df.groupby(['NoShow']).size()[1])
print("Percent of Patients who `Showed Up` => {:.2f}%".format(show*100))
noshow = df.groupby(['NoShow']).size()[1]/(df.groupby(['NoShow']).size()[0]+df.groupby(['NoShow']).size()[1])
print("Percent of Patients who Did `Not Showed Up` => {:.2f}%".format(noshow*100))

NoShow and Show Count of Patients

NoShow
No     88207
Yes    22319
dtype: int64

NoShow and Show '%' of Patients

Percent of Patients who `Showed Up` => 79.81%
Percent of Patients who Did `Not Showed Up` => 20.19%


In [52]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
le = LabelEncoder()
df['Neighbourhood'] = le.fit_transform(df['Neighbourhood'])

le = LabelEncoder()
df['ScheduledDay_DOW'] = le.fit_transform(df['ScheduledDay_DOW'])

le = LabelEncoder()
df['AppointmentDay_DOW'] = le.fit_transform(df['AppointmentDay_DOW'])
print("LabelEncoder Completed")

le = LabelEncoder()
df['NoShow'] = le.fit_transform(df['NoShow'])


LabelEncoder Completed


In [53]:
df['ScheduledDay_Y'] = df['ScheduledDay'].dt.year
df['ScheduledDay_M'] = df['ScheduledDay'].dt.month
df['ScheduledDay_D'] = df['ScheduledDay'].dt.day
df.drop(['ScheduledDay'], axis=1, inplace=True)

df['AppointmentDay_Y'] = df['AppointmentDay'].dt.year
df['AppointmentDay_M'] = df['AppointmentDay'].dt.month
df['AppointmentDay_D'] = df['AppointmentDay'].dt.day
df.drop(['AppointmentDay'], axis=1, inplace=True)

In [54]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 110526 entries, 0 to 110526
Data columns (total 19 columns):
Gender                110526 non-null int32
Age                   110526 non-null int64
Neighbourhood         110526 non-null int32
Scholarship           110526 non-null object
Hypertension          110526 non-null object
Diabetes              110526 non-null object
Alcoholism            110526 non-null object
Handicap              110526 non-null object
SMSReceived           110526 non-null object
NoShow                110526 non-null int32
ScheduledDay_DOW      110526 non-null int32
AppointmentDay_DOW    110526 non-null int32
Waiting_Time_days     110526 non-null int64
ScheduledDay_Y        110526 non-null int64
ScheduledDay_M        110526 non-null int64
ScheduledDay_D        110526 non-null int64
AppointmentDay_Y      110526 non-null int64
AppointmentDay_M      110526 non-null int64
AppointmentDay_D      110526 non-null int64
dtypes: int32(5), int64(8), object(6)
memory us

In [55]:
X = df.drop(['NoShow'], axis=1)
y = df['NoShow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [56]:
print("Feature Importance:\n")
for name, importance in zip(X.columns, np.sort(dt_clf.feature_importances_)[::-1]):
    print("{} -- {:.2f}".format(name, importance))

Feature Importance:

Gender -- 0.23
Age -- 0.23
Neighbourhood -- 0.17
Scholarship -- 0.08
Hypertension -- 0.07
Diabetes -- 0.05
Alcoholism -- 0.05
Handicap -- 0.04
SMSReceived -- 0.02
ScheduledDay_DOW -- 0.02
AppointmentDay_DOW -- 0.01
Waiting_Time_days -- 0.01
ScheduledDay_Y -- 0.01
ScheduledDay_M -- 0.01
ScheduledDay_D -- 0.01
AppointmentDay_Y -- 0.01
AppointmentDay_M -- 0.00
AppointmentDay_D -- 0.00


In [57]:
dt_clf.score(X_test, y_test)

0.723419422054066

In [60]:
pred=dt_clf.predict(X)
accuracy = accuracy_score(y, pred)
print("Accuracy : {}".format(accuracy))
precision = precision_score(y, pred)
print("Precision : {}".format(precision))
f1score = f1_score(y, pred)
print("F1 Score : {}".format(f1score))

Accuracy : 0.904429726942077
Precision : 0.7625748235504333
F1 Score : 0.763717704954703
