In [4]:
import pandas as pd
import numpy as np
import datetime
from time import strftime

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
week_key = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

In [6]:
df = pd.read_csv('/Users/P U N I/M L/PROJ/Patient dataset.csv')

In [7]:
print("Shape of the DataFrame : {}".format(df.shape))

Shape of the DataFrame : (110527, 14)


In [8]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29900000000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,559000000000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4260000000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,868000000000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8840000000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [9]:
df['PatientId'] = df['PatientId'].astype('int64')

In [10]:
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay']).dt.date.astype('datetime64[ns]')
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay']).dt.date.astype('datetime64[ns]')

In [11]:
df = df.rename(columns={'Hipertension': 'Hypertension', 'Handcap': 'Handicap', 'SMS_received': 'SMSReceived', 'No-show': 'NoShow'})

In [12]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow
0,29900000000000,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,559000000000000,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4260000000000,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,868000000000,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8840000000000,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [13]:
print("DataFrame Features : {}".format(df.columns.ravel()))

DataFrame Features : ['PatientId' 'AppointmentID' 'Gender' 'ScheduledDay' 'AppointmentDay'
 'Age' 'Neighbourhood' 'Scholarship' 'Hypertension' 'Diabetes'
 'Alcoholism' 'Handicap' 'SMSReceived' 'NoShow']


In [14]:
df.drop(['PatientId', 'AppointmentID'], axis=1, inplace=True)

In [15]:
df['Scholarship'] = df['Scholarship'].astype('object')

df['Neighbourhood'] = df['Neighbourhood'].astype('object')

df['Hypertension'] = df['Hypertension'].astype('object')

df['Diabetes'] = df['Diabetes'].astype('object')

df['Alcoholism'] = df['Alcoholism'].astype('object')

df['Handicap'] = df['Handicap'].astype('object')

df['SMSReceived'] = df['SMSReceived'].astype('object')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 12 columns):
Gender            110527 non-null object
ScheduledDay      110527 non-null datetime64[ns]
AppointmentDay    110527 non-null datetime64[ns]
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null object
Hypertension      110527 non-null object
Diabetes          110527 non-null object
Alcoholism        110527 non-null object
Handicap          110527 non-null object
SMSReceived       110527 non-null object
NoShow            110527 non-null object
dtypes: datetime64[ns](2), int64(1), object(9)
memory usage: 10.1+ MB


In [17]:
df.sample(5)

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow
75690,M,2016-05-13,2016-05-13,68,NOVA PALESTINA,0,0,1,0,0,0,No
48615,F,2016-05-09,2016-05-25,62,JARDIM DA PENHA,0,0,0,0,0,1,No
38494,F,2016-04-29,2016-05-06,58,ENSEADA DO SUÁ,0,1,0,0,0,1,No
34279,F,2016-05-11,2016-05-11,2,NOVA PALESTINA,0,0,0,0,0,0,No
63248,F,2016-05-30,2016-05-31,23,JARDIM CAMBURI,0,0,0,0,0,0,No


In [18]:
np.sort(df.Age.unique())

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 102, 115],
      dtype=int64)

In [19]:
print("Patients with `Age` < -1 -> {}".format(df[df.Age == -1].shape[0]))
print("Patients with `Age` = 0 -> {}".format(df[df.Age == 0].shape[0]))

Patients with `Age` < -1 -> 1
Patients with `Age` = 0 -> 3539


In [20]:
df = df[df.Age >= 0]

In [21]:
df[(df.Age <= 0) & ((df.Hypertension.astype(int) == 1) | (df.Diabetes.astype(int) == 1) | (df.Alcoholism.astype(int) == 1))]

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow


In [22]:
print("Unique Values in `ScheduledDay` => {}".format(np.sort(df.ScheduledDay.dt.strftime('%d-%m-%y').unique())))

Unique Values in `ScheduledDay` => ['01-02-16' '01-03-16' '01-04-16' '01-06-16' '02-02-16' '02-03-16'
 '02-05-16' '02-06-16' '03-02-16' '03-03-16' '03-05-16' '03-06-16'
 '03-12-15' '04-01-16' '04-02-16' '04-03-16' '04-05-16' '04-06-16'
 '05-01-16' '05-02-16' '05-03-16' '05-04-16' '05-05-16' '06-04-16'
 '06-05-16' '06-06-16' '07-01-16' '07-03-16' '07-04-16' '07-05-16'
 '07-06-16' '07-12-15' '08-03-16' '08-04-16' '08-06-16' '08-12-15'
 '09-03-16' '09-04-16' '09-05-16' '10-03-16' '10-05-16' '10-11-15'
 '11-01-16' '11-02-16' '11-03-16' '11-04-16' '11-05-16' '12-02-16'
 '12-04-16' '12-05-16' '13-01-16' '13-04-16' '13-05-16' '14-01-16'
 '14-03-16' '14-04-16' '14-05-16' '14-12-15' '15-02-16' '15-03-16'
 '15-04-16' '15-12-15' '16-02-16' '16-03-16' '16-04-16' '16-05-16'
 '17-02-16' '17-03-16' '17-05-16' '18-02-16' '18-03-16' '18-04-16'
 '18-05-16' '19-01-16' '19-02-16' '19-03-16' '19-04-16' '19-05-16'
 '20-01-16' '20-04-16' '20-05-16' '21-01-16' '21-03-16' '22-01-16'
 '22-02-16' '22-03-16' '23-

In [23]:
print("Unique Values in `AppointmentDay` => {}".format(np.sort(df.AppointmentDay.dt.strftime('%Y-%m-%d').unique())))

Unique Values in `AppointmentDay` => ['2016-04-29' '2016-05-02' '2016-05-03' '2016-05-04' '2016-05-05'
 '2016-05-06' '2016-05-09' '2016-05-10' '2016-05-11' '2016-05-12'
 '2016-05-13' '2016-05-14' '2016-05-16' '2016-05-17' '2016-05-18'
 '2016-05-19' '2016-05-20' '2016-05-24' '2016-05-25' '2016-05-30'
 '2016-05-31' '2016-06-01' '2016-06-02' '2016-06-03' '2016-06-06'
 '2016-06-07' '2016-06-08']


In [24]:
print("Total Count for `Neighbourhood` => {}".format(df.Neighbourhood.unique().size))

Total Count for `Neighbourhood` => 81


In [25]:
print("Unique Values in `Neighbourhood` => {}".format(np.sort(df.Neighbourhood.unique())))

Unique Values in `Neighbourhood` => ['AEROPORTO' 'ANDORINHAS' 'ANTÔNIO HONÓRIO' 'ARIOVALDO FAVALESSA'
 'BARRO VERMELHO' 'BELA VISTA' 'BENTO FERREIRA' 'BOA VISTA' 'BONFIM'
 'CARATOÍRA' 'CENTRO' 'COMDUSA' 'CONQUISTA' 'CONSOLAÇÃO' 'CRUZAMENTO'
 'DA PENHA' 'DE LOURDES' 'DO CABRAL' 'DO MOSCOSO' 'DO QUADRO'
 'ENSEADA DO SUÁ' 'ESTRELINHA' 'FONTE GRANDE' 'FORTE SÃO JOÃO' 'FRADINHOS'
 'GOIABEIRAS' 'GRANDE VITÓRIA' 'GURIGICA' 'HORTO' 'ILHA DAS CAIEIRAS'
 'ILHA DE SANTA MARIA' 'ILHA DO BOI' 'ILHA DO FRADE' 'ILHA DO PRÍNCIPE'
 'ILHAS OCEÂNICAS DE TRINDADE' 'INHANGUETÁ' 'ITARARÉ' 'JABOUR'
 'JARDIM CAMBURI' 'JARDIM DA PENHA' 'JESUS DE NAZARETH' 'JOANA D´ARC'
 'JUCUTUQUARA' 'MARIA ORTIZ' 'MARUÍPE' 'MATA DA PRAIA' 'MONTE BELO'
 'MORADA DE CAMBURI' 'MÁRIO CYPRESTE' 'NAZARETH' 'NOVA PALESTINA'
 'PARQUE INDUSTRIAL' 'PARQUE MOSCOSO' 'PIEDADE' 'PONTAL DE CAMBURI'
 'PRAIA DO CANTO' 'PRAIA DO SUÁ' 'REDENÇÃO' 'REPÚBLICA' 'RESISTÊNCIA'
 'ROMÃO' 'SANTA CECÍLIA' 'SANTA CLARA' 'SANTA HELENA' 'SANTA LUÍZA'
 'SANTA

In [26]:
df['ScheduledDay_DOW'] = df['ScheduledDay'].dt.weekday_name
df['AppointmentDay_DOW'] = df['AppointmentDay'].dt.weekday_name

In [27]:
df['AppointmentDay'] = np.where((df['AppointmentDay'] - df['ScheduledDay']).dt.days < 0, df['ScheduledDay'], df['AppointmentDay'])

In [28]:
df['Waiting_Time_days'] = df['AppointmentDay'] - df['ScheduledDay']
df['Waiting_Time_days'] = df['Waiting_Time_days'].dt.days

In [29]:
print("There are [{}] records where the Waiting Time is less than Zero.".format(df[df.Waiting_Time_days < 0].shape[0]))

There are [0] records where the Waiting Time is less than Zero.


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110526 entries, 0 to 110526
Data columns (total 15 columns):
Gender                110526 non-null object
ScheduledDay          110526 non-null datetime64[ns]
AppointmentDay        110526 non-null datetime64[ns]
Age                   110526 non-null int64
Neighbourhood         110526 non-null object
Scholarship           110526 non-null object
Hypertension          110526 non-null object
Diabetes              110526 non-null object
Alcoholism            110526 non-null object
Handicap              110526 non-null object
SMSReceived           110526 non-null object
NoShow                110526 non-null object
ScheduledDay_DOW      110526 non-null object
AppointmentDay_DOW    110526 non-null object
Waiting_Time_days     110526 non-null int64
dtypes: datetime64[ns](2), int64(2), object(11)
memory usage: 13.5+ MB


In [31]:
df.sample(n=10)

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow,ScheduledDay_DOW,AppointmentDay_DOW,Waiting_Time_days
67141,M,2016-05-17,2016-05-17,80,JARDIM CAMBURI,0,0,0,0,0,0,No,Tuesday,Tuesday,0
62975,M,2016-05-12,2016-05-13,22,JARDIM CAMBURI,0,0,0,0,0,0,No,Thursday,Friday,1
46402,F,2016-04-28,2016-05-06,44,SÃO PEDRO,1,1,1,0,0,1,No,Thursday,Friday,8
40456,F,2016-04-14,2016-05-12,0,JOANA D´ARC,0,0,0,0,0,1,Yes,Thursday,Thursday,28
18996,M,2016-03-30,2016-05-09,5,ROMÃO,0,0,0,0,0,0,Yes,Wednesday,Monday,40
13163,F,2016-03-16,2016-05-04,67,ILHA DE SANTA MARIA,0,0,0,0,0,1,No,Wednesday,Wednesday,49
66178,F,2016-05-05,2016-05-30,2,JARDIM CAMBURI,0,0,0,0,0,1,No,Thursday,Monday,25
93098,F,2016-05-20,2016-06-08,31,CENTRO,0,0,0,0,0,1,No,Friday,Wednesday,19
84936,F,2016-06-01,2016-06-01,25,DO MOSCOSO,0,0,0,1,0,0,Yes,Wednesday,Wednesday,0
93190,F,2016-04-11,2016-06-02,37,JUCUTUQUARA,0,0,0,0,0,1,No,Monday,Thursday,52


In [32]:
print("NoShow and Show Count of Patients\n")
print(df.groupby(['NoShow']).size())

print("\nNoShow and Show '%' of Patients\n")
show = df.groupby(['NoShow']).size()[0]/(df.groupby(['NoShow']).size()[0]+df.groupby(['NoShow']).size()[1])
print("Percent of Patients who `Showed Up` => {:.2f}%".format(show*100))
noshow = df.groupby(['NoShow']).size()[1]/(df.groupby(['NoShow']).size()[0]+df.groupby(['NoShow']).size()[1])
print("Percent of Patients who Did `Not Showed Up` => {:.2f}%".format(noshow*100))

NoShow and Show Count of Patients

NoShow
No     88207
Yes    22319
dtype: int64

NoShow and Show '%' of Patients

Percent of Patients who `Showed Up` => 79.81%
Percent of Patients who Did `Not Showed Up` => 20.19%


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110526 entries, 0 to 110526
Data columns (total 15 columns):
Gender                110526 non-null object
ScheduledDay          110526 non-null datetime64[ns]
AppointmentDay        110526 non-null datetime64[ns]
Age                   110526 non-null int64
Neighbourhood         110526 non-null object
Scholarship           110526 non-null object
Hypertension          110526 non-null object
Diabetes              110526 non-null object
Alcoholism            110526 non-null object
Handicap              110526 non-null object
SMSReceived           110526 non-null object
NoShow                110526 non-null object
ScheduledDay_DOW      110526 non-null object
AppointmentDay_DOW    110526 non-null object
Waiting_Time_days     110526 non-null int64
dtypes: datetime64[ns](2), int64(2), object(11)
memory usage: 13.5+ MB


In [34]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

le = LabelEncoder()
df['Neighbourhood'] = le.fit_transform(df['Neighbourhood'])

le = LabelEncoder()
df['ScheduledDay_DOW'] = le.fit_transform(df['ScheduledDay_DOW'])

le = LabelEncoder()
df['AppointmentDay_DOW'] = le.fit_transform(df['AppointmentDay_DOW'])

le = LabelEncoder()
df['NoShow'] = le.fit_transform(df['NoShow'])

print("LabelEncoder Completed")

LabelEncoder Completed


In [35]:
df['ScheduledDay_Y'] = df['ScheduledDay'].dt.year
df['ScheduledDay_M'] = df['ScheduledDay'].dt.month
df['ScheduledDay_D'] = df['ScheduledDay'].dt.day
df.drop(['ScheduledDay'], axis=1, inplace=True)

In [36]:
df['AppointmentDay_Y'] = df['AppointmentDay'].dt.year
df['AppointmentDay_M'] = df['AppointmentDay'].dt.month
df['AppointmentDay_D'] = df['AppointmentDay'].dt.day
df.drop(['AppointmentDay'], axis=1, inplace=True)

In [37]:
df.sample(n=10)

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow,ScheduledDay_DOW,AppointmentDay_DOW,Waiting_Time_days,ScheduledDay_Y,ScheduledDay_M,ScheduledDay_D,AppointmentDay_Y,AppointmentDay_M,AppointmentDay_D
84211,0,38,7,0,0,0,0,0,1,1,5,1,12,2016,5,25,2016,6,6
84640,1,9,77,0,0,0,0,0,1,1,5,1,5,2016,6,1,2016,6,6
50090,0,58,39,0,0,0,0,0,0,0,4,4,0,2016,5,10,2016,5,10
87143,0,91,38,0,0,0,0,0,1,0,0,5,61,2016,4,1,2016,6,1
91684,0,8,27,0,0,0,0,0,0,0,0,1,3,2016,6,3,2016,6,6
79920,1,44,10,0,0,0,0,0,0,0,1,1,0,2016,5,2,2016,5,2
44048,0,13,27,1,0,0,0,0,0,1,1,4,1,2016,5,9,2016,5,10
4524,1,53,62,0,0,0,0,0,0,0,4,3,2,2016,5,3,2016,5,5
100490,0,42,40,0,0,0,0,0,0,0,1,1,0,2016,6,6,2016,6,6
41415,0,31,37,1,1,0,0,0,1,0,0,4,32,2016,4,1,2016,5,3


In [38]:
X = df.drop(['NoShow'], axis=1)
y = df['NoShow']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


In [61]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred= model.predict(X_test)

pred=model.predict(X)
LR_accuracy = accuracy_score(y, pred)
print("Accuracy : {}".format(LR_accuracy))
LR_precision = precision_score(y, pred)
print("Precision : {}".format(LR_precision))




Accuracy : 0.7945098890758735
Precision : 0.3530291697830965


In [57]:
model.score(X_train,y_train)

0.7933074056068709

In [58]:
model.score(X_test,y_test)

0.796951252947305