In [161]:
# load libraries
import pandas as pd
import numpy as np
import datetime
from time import strftime

In [162]:
# load the training data set
trainingData = pd.read_csv('./Data/KaggleV2-May-2016.csv')

print("======================")
print("Training Data Overview")
print("======================")
trainingData.info()

Training Data Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [163]:
# Raw data, first 5 rows
trainingData.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [164]:
# Data type conversions for readability
trainingData['ScheduledDay'] = pd.to_datetime(trainingData['ScheduledDay']).dt.date.astype('datetime64[ns]')
trainingData['AppointmentDay'] = pd.to_datetime(trainingData['AppointmentDay']).dt.date.astype('datetime64[ns]')
trainingData['PatientId'] = trainingData['PatientId'].astype('int64')

# Preprocessing to correct attribute names
trainingData = trainingData.rename(columns = {
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'SMS_received': 'SMSReceived',
    'No-show': 'NoShow'
})

# Deleting invalid records such as having an age of -1
trainingData = trainingData[trainingData.Age >= 0]

trainingData.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow
0,29872499824296,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997776694438,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962299951,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951213174,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186448183,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [165]:
# Dropping PatientId and AppointmentId as they are just numbers assigned to patients
# which do not have any effect on the output
if 'PatientId' in trainingData and 'AppointmentID' in trainingData:
    trainingData.drop(['PatientId', 'AppointmentID'], axis=1, inplace=True)
trainingData.head()

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow
0,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [166]:
# Print unique set of values for each attribute
genderSet = trainingData.Gender.unique()
ageGroup = trainingData.Age.unique()
appointments = np.sort(trainingData.AppointmentDay.dt.strftime('%Y-%m-%d').unique())
schedules = np.sort(trainingData.ScheduledDay.dt.strftime('%Y-%m-%d').unique())
neighbors = np.sort(trainingData.Neighbourhood.unique())

print("Gender Set (Total: {}): {}".format(genderSet.size, genderSet))
print("Age Group (Total: {}): {}".format(ageGroup.size, ageGroup))
print("Appointments (Total: {}): {}".format(appointments.size, appointments))
print("Schedules (Total: {}): {}".format(schedules.size, schedules))
print("Neighborhood (Total: {}): {}".format(neighbors.size, neighbors))

Gender Set (Total: 2): ['F' 'M']
Age Group (Total: 103): [ 62  56   8  76  23  39  21  19  30  29  22  28  54  15  50  40  46   4
  13  65  45  51  32  12  61  38  79  18  63  64  85  59  55  71  49  78
  31  58  27   6   2  11   7   0   3   1  69  68  60  67  36  10  35  20
  26  34  33  16  42   5  47  17  41  44  37  24  66  77  81  70  53  75
  73  52  74  43  89  57  14   9  48  83  72  25  80  87  88  84  82  90
  94  86  91  98  92  96  93  95  97 102 115 100  99]
Appointments (Total: 27): ['2016-04-29' '2016-05-02' '2016-05-03' '2016-05-04' '2016-05-05'
 '2016-05-06' '2016-05-09' '2016-05-10' '2016-05-11' '2016-05-12'
 '2016-05-13' '2016-05-14' '2016-05-16' '2016-05-17' '2016-05-18'
 '2016-05-19' '2016-05-20' '2016-05-24' '2016-05-25' '2016-05-30'
 '2016-05-31' '2016-06-01' '2016-06-02' '2016-06-03' '2016-06-06'
 '2016-06-07' '2016-06-08']
Schedules (Total: 111): ['2015-11-10' '2015-12-03' '2015-12-07' '2015-12-08' '2015-12-14'
 '2015-12-15' '2016-01-04' '2016-01-05' '2016-01-0

In [167]:
# print("NoShow and Show Count of Patients\n")
trainingData.sample(n=10)
noShowResults = trainingData.groupby(['NoShow']).size()
numPatients = noShowResults[0] + noShowResults[1]
noShow = noShowResults[0]
attended = noShowResults[1]

print("No Show: {:.2f}%\nAttended: {:.2f}%".format((noShow / numPatients) * 100, (attended / numPatients) * 100))

No Show: 79.81%
Attended: 20.19%


In [168]:
trainingData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110526 entries, 0 to 110526
Data columns (total 12 columns):
Gender            110526 non-null object
ScheduledDay      110526 non-null datetime64[ns]
AppointmentDay    110526 non-null datetime64[ns]
Age               110526 non-null int64
Neighbourhood     110526 non-null object
Scholarship       110526 non-null int64
Hypertension      110526 non-null int64
Diabetes          110526 non-null int64
Alcoholism        110526 non-null int64
Handicap          110526 non-null int64
SMSReceived       110526 non-null int64
NoShow            110526 non-null object
dtypes: datetime64[ns](2), int64(7), object(3)
memory usage: 11.0+ MB


In [171]:
# Dependent amd Independent Features
X = trainingData.drop(['NoShow'], axis=1)
Y = trainingData['NoShow']

In [None]:
# TODO Classifiers and create prediction