In [110]:
import pandas as pd
import numpy as np
from datetime import timezone, datetime
import pytz
from pytz import all_timezones

#### Read in dataframe & view first 5 rows

In [2]:
df = pd.read_csv('KaggleV2-May-2016.csv')
print(df.head())

      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0           0        0             0      No  
1         0           0        0      

# Transformation / Data Cleaning & Wrangling

#### 1. Convert PatientId column from scientific notation to integer

In [5]:
df['PatientId'] = df['PatientId'].astype('int64')
df['PatientId'].head()

0     29872499824296
1    558997776694438
2      4262962299951
3       867951213174
4      8841186448183
Name: PatientId, dtype: int64

#### 2. Split ScheduledDay column to separate date and time columns

In [74]:
# convert time zone from utc to eastern time zone
df['ScheduledDay'] = df['ScheduledDay'].dt.tz_convert('US/Eastern')
df['ScheduledDay'].head()

0   2016-04-29 14:38:08-04:00
1   2016-04-29 12:08:27-04:00
2   2016-04-29 12:19:04-04:00
3   2016-04-29 13:29:31-04:00
4   2016-04-29 12:07:23-04:00
Name: ScheduledDay, dtype: datetime64[ns, US/Eastern]

In [78]:
# create ScheduledDate column for the scheduled date
# create ScheduledTime column for the scheduled time
# create ScheduledDayofWeek column for the schedules date/time day of week

df['ScheduledDate'] = df['ScheduledDay'].dt.strftime("%m-%d-%Y")
df['ScheduledTime'] = df['ScheduledDay'].dt.strftime("%I:%M %p")
df['ScheduledDayofWeek'] = df['ScheduledDay'].dt.strftime("%A")
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,ScheduledDate,ScheduledTime,ScheduledDayofWeek
0,29872499824296,5642903,F,2016-04-29 14:38:08-04:00,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No,04-29-2016,02:38 PM,Friday
1,558997776694438,5642503,M,2016-04-29 12:08:27-04:00,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No,04-29-2016,12:08 PM,Friday
2,4262962299951,5642549,F,2016-04-29 12:19:04-04:00,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No,04-29-2016,12:19 PM,Friday
3,867951213174,5642828,F,2016-04-29 13:29:31-04:00,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,04-29-2016,01:29 PM,Friday
4,8841186448183,5642494,F,2016-04-29 12:07:23-04:00,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No,04-29-2016,12:07 PM,Friday


#### 3. Drop unnecessary columns 

In [80]:
df = df.drop(['AppointmentDay', 'Scholarship'], axis=1)

#### 4. Convert Values

In [83]:
# convert 'F' to 'Female' & 'M' to 'Male'
df['Gender'] = df['Gender'].map({'F': 'Female', 'M': 'Male'})
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,Age,Neighbourhood,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,ScheduledDate,ScheduledTime,ScheduledDayofWeek
0,29872499824296,5642903,Female,2016-04-29 14:38:08-04:00,62,JARDIM DA PENHA,1,0,0,0,0,No,04-29-2016,02:38 PM,Friday
1,558997776694438,5642503,Male,2016-04-29 12:08:27-04:00,56,JARDIM DA PENHA,0,0,0,0,0,No,04-29-2016,12:08 PM,Friday
2,4262962299951,5642549,Female,2016-04-29 12:19:04-04:00,62,MATA DA PRAIA,0,0,0,0,0,No,04-29-2016,12:19 PM,Friday
3,867951213174,5642828,Female,2016-04-29 13:29:31-04:00,8,PONTAL DE CAMBURI,0,0,0,0,0,No,04-29-2016,01:29 PM,Friday
4,8841186448183,5642494,Female,2016-04-29 12:07:23-04:00,56,JARDIM DA PENHA,1,1,0,0,0,No,04-29-2016,12:07 PM,Friday


In [86]:
# For columns with 0 or 1 values, convert to yes or no
df['Hipertension'] = df['Hipertension'].map({0: 'No', 1: 'Yes'})
df['Diabetes'] = df['Diabetes'].map({0: 'No', 1: 'Yes'})
df['Alcoholism'] = df['Alcoholism'].map({0: 'No', 1: 'Yes'})
df['Handcap'] = df['Handcap'].map({0: 'No', 1: 'Yes'})
df['SMS_received'] = df['SMS_received'].map({0: 'No', 1: 'Yes'})
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,Age,Neighbourhood,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,ScheduledDate,ScheduledTime,ScheduledDayofWeek
0,29872499824296,5642903,Female,2016-04-29 14:38:08-04:00,62,JARDIM DA PENHA,Yes,No,No,No,No,No,04-29-2016,02:38 PM,Friday
1,558997776694438,5642503,Male,2016-04-29 12:08:27-04:00,56,JARDIM DA PENHA,No,No,No,No,No,No,04-29-2016,12:08 PM,Friday
2,4262962299951,5642549,Female,2016-04-29 12:19:04-04:00,62,MATA DA PRAIA,No,No,No,No,No,No,04-29-2016,12:19 PM,Friday
3,867951213174,5642828,Female,2016-04-29 13:29:31-04:00,8,PONTAL DE CAMBURI,No,No,No,No,No,No,04-29-2016,01:29 PM,Friday
4,8841186448183,5642494,Female,2016-04-29 12:07:23-04:00,56,JARDIM DA PENHA,Yes,Yes,No,No,No,No,04-29-2016,12:07 PM,Friday


#### 5. Rename columns

In [114]:
df.rename(columns = {'Neighbourhood': 'Neighborhood', 'Hipertension': 'Hypertension', 'SMS_received': 'SMS'}, inplace = True)
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,Age,Neighorhood,Hypertension,Diabetes,Alcoholism,Handcap,SMS,No-show,ScheduledDate,ScheduledTime,ScheduledDayofWeek
0,29872499824296,5642903,Female,2016-04-29 14:38:08-04:00,62,JARDIM DA PENHA,Yes,No,No,No,No,No,2016-04-29,02:38 PM,Friday
1,558997776694438,5642503,Male,2016-04-29 12:08:27-04:00,56,JARDIM DA PENHA,No,No,No,No,No,No,2016-04-29,12:08 PM,Friday
2,4262962299951,5642549,Female,2016-04-29 12:19:04-04:00,62,MATA DA PRAIA,No,No,No,No,No,No,2016-04-29,12:19 PM,Friday
3,867951213174,5642828,Female,2016-04-29 13:29:31-04:00,8,PONTAL DE CAMBURI,No,No,No,No,No,No,2016-04-29,01:29 PM,Friday
4,8841186448183,5642494,Female,2016-04-29 12:07:23-04:00,56,JARDIM DA PENHA,Yes,Yes,No,No,No,No,2016-04-29,12:07 PM,Friday


In [101]:
#### 6. Check for duplicate appointmentId's
df.drop_duplicates()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,Age,Neighorhood,Hypertension,Diabetes,Alcoholism,Handcap,SMS,No-show,ScheduledDate,ScheduledTime,ScheduledDayofWeek
0,29872499824296,5642903,Female,2016-04-29 14:38:08-04:00,62,JARDIM DA PENHA,Yes,No,No,No,No,No,04-29-2016,02:38 PM,Friday
1,558997776694438,5642503,Male,2016-04-29 12:08:27-04:00,56,JARDIM DA PENHA,No,No,No,No,No,No,04-29-2016,12:08 PM,Friday
2,4262962299951,5642549,Female,2016-04-29 12:19:04-04:00,62,MATA DA PRAIA,No,No,No,No,No,No,04-29-2016,12:19 PM,Friday
3,867951213174,5642828,Female,2016-04-29 13:29:31-04:00,8,PONTAL DE CAMBURI,No,No,No,No,No,No,04-29-2016,01:29 PM,Friday
4,8841186448183,5642494,Female,2016-04-29 12:07:23-04:00,56,JARDIM DA PENHA,Yes,Yes,No,No,No,No,04-29-2016,12:07 PM,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2572134369293,5651768,Female,2016-05-03 05:15:35-04:00,56,MARIA ORTIZ,No,No,No,No,Yes,No,05-03-2016,05:15 AM,Tuesday
110523,3596266328735,5650093,Female,2016-05-03 03:27:33-04:00,51,MARIA ORTIZ,No,No,No,No,Yes,No,05-03-2016,03:27 AM,Tuesday
110524,15576631729893,5630692,Female,2016-04-27 12:03:52-04:00,21,MARIA ORTIZ,No,No,No,No,Yes,No,04-27-2016,12:03 PM,Wednesday
110525,92134931435557,5630323,Female,2016-04-27 11:09:23-04:00,38,MARIA ORTIZ,No,No,No,No,Yes,No,04-27-2016,11:09 AM,Wednesday


In [104]:
#### 7. Check dataframe for null values & datatypes of dates/integers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype                     
---  ------              --------------   -----                     
 0   PatientId           110527 non-null  int64                     
 1   AppointmentID       110527 non-null  int64                     
 2   Gender              110527 non-null  object                    
 3   ScheduledDay        110527 non-null  datetime64[ns, US/Eastern]
 4   Age                 110527 non-null  int64                     
 5   Neighorhood         110527 non-null  object                    
 6   Hypertension        110527 non-null  object                    
 7   Diabetes            110527 non-null  object                    
 8   Alcoholism          110527 non-null  object                    
 9   Handcap             110328 non-null  object                    
 10  SMS                 110527 non-null  object             

In [115]:
# change date datatype from object to date
df['ScheduledDate'] = pd.to_datetime(df['ScheduledDate'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype                     
---  ------              --------------   -----                     
 0   PatientId           110527 non-null  int64                     
 1   AppointmentID       110527 non-null  int64                     
 2   Gender              110527 non-null  object                    
 3   ScheduledDay        110527 non-null  datetime64[ns, US/Eastern]
 4   Age                 110527 non-null  int64                     
 5   Neighorhood         110527 non-null  object                    
 6   Hypertension        110527 non-null  object                    
 7   Diabetes            110527 non-null  object                    
 8   Alcoholism          110527 non-null  object                    
 9   Handcap             110328 non-null  object                    
 10  SMS                 110527 non-null  object             

#### Export data to csv

In [112]:
df.to_csv('missed_appointments_csv.csv', index=False)