Split Data into train / test before doing target encoding
Use Train Data to build encoding, save the values, create values to be assigned for labels falling outside the scope of the current list.

In [83]:
#!/usr/bin/env python3
#### Import all the required libraries
import pandas as pd #### Library for working with large datsets
import numpy as np #### Library for performing numerical calculations
import matplotlib.pyplot as plt #### Basic Library for plotting graphs
#### Configuring Matplotlib to show Plots inline
%matplotlib inline 
plt.rcParams['figure.figsize'] = (12, 12) ### Setting the size of the Plots

In [84]:
### Parsing the Date
dateparse = lambda dates: [pd.datetime.strptime(d, '%Y-%m-%dT%H:%M:%SZ') for d in dates]

In [85]:
#### Loading the Data
data = pd.read_csv('noshow_appointments.csv', parse_dates=['AppointmentDay', 'ScheduledDay'], date_parser=dateparse)

  dateparse = lambda dates: [pd.datetime.strptime(d, '%Y-%m-%dT%H:%M:%SZ') for d in dates]


In [86]:
#### Looking at the overall info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   PatientId       110527 non-null  float64       
 1   AppointmentID   110527 non-null  int64         
 2   Gender          110527 non-null  object        
 3   ScheduledDay    110527 non-null  datetime64[ns]
 4   AppointmentDay  110527 non-null  datetime64[ns]
 5   Age             110527 non-null  int64         
 6   Neighbourhood   110527 non-null  object        
 7   Scholarship     110527 non-null  int64         
 8   Hipertension    110527 non-null  int64         
 9   Diabetes        110527 non-null  int64         
 10  Alcoholism      110527 non-null  int64         
 11  Handcap         110527 non-null  int64         
 12  SMS_received    110527 non-null  int64         
 13  NoShow          110527 non-null  object        
dtypes: datetime64[ns](2), float64(1), in

In [87]:
### In this dataset we have the dependant variable being 'yes' or 'no', lets convert it to 1 or 0
clean_up_categoricals = {'NoShow':{'Yes':1, 'No':0},
                         'Gender' :{"M":0, "F":1}}
data.replace(clean_up_categoricals, inplace=True)

In [88]:
#### Extract Y & X from Data
Y = data['NoShow'].values
X = data
#### Split X & Y into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [89]:
####  Find out how many unique values are there in the Train / Test Sets for Neighbourhood
print(X_train.Neighbourhood.nunique())
print(X_test.Neighbourhood.nunique())

81
79


In [92]:
#### Create Target Encoding Values for each of the neighbourhoods by calculating the percentage of no_shows in that neighbourhood

#print(X_train.groupby(['Neighbourhood']).NoShow.count())
#print(X_train[X_train.NoShow==1].groupby(['Neighbourhood']).NoShow.count())
Target_Encoding_Value = X_train[X_train.NoShow==1].groupby(['Neighbourhood']).NoShow.count()/X_train.groupby(['Neighbourhood']).NoShow.count()
print(Target_Encoding_Value)
Target_Encoding_Value.to_csv('Neighbourhood_Target_Encoded_Values.csv', index=True)

Neighbourhood
AEROPORTO              0.166667
ANDORINHAS             0.234298
ANTÔNIO HONÓRIO        0.164251
ARIOVALDO FAVALESSA    0.217949
BARRO VERMELHO         0.211111
                         ...   
SÃO JOSÉ               0.217584
SÃO PEDRO              0.215776
TABUAZEIRO             0.183387
UNIVERSITÁRIO          0.188525
VILA RUBIM             0.171852
Name: NoShow, Length: 81, dtype: float64


In [93]:
#### Replacing values in Train / Test from our Neighbourhood_Target_Encoded_Values.csv file

#### Load encoded_values data from the file
encoded_values = pd.read_csv('Neighbourhood_Target_Encoded_Values.csv', index_col='Neighbourhood')
#print(encoded_values)
encoded_values.head()

Unnamed: 0_level_0,NoShow
Neighbourhood,Unnamed: 1_level_1
AEROPORTO,0.166667
ANDORINHAS,0.234298
ANTÔNIO HONÓRIO,0.164251
ARIOVALDO FAVALESSA,0.217949
BARRO VERMELHO,0.211111


In [94]:
#### Replacing values in Train / Test from our Neighbourhood_Target_Encoded_Values.csv file

#### Create a unique list of all neighbourhoods
listA = X_train.Neighbourhood.unique()
listB = X_test.Neighbourhood.unique()
#### set function will create a set from a list helping us apply different types of set operators on the list
#### Here we've used the OR Operator
Full_List = list(set(listA)|set(listB))

#### Create new columns in both X_train & X_test
X_train['Target_Encoding_Value'] = 0
X_test['Target_Encoding_Value'] = 0

#### Iterate through the list of all neighbourhoods and set the Target Encoding Values
for i,v in enumerate(Full_List):
    X_train.loc[X_train.Neighbourhood==v,'Target_Encoding_Value'] = encoded_values.loc[v,'NoShow']
    X_test.loc[X_test.Neighbourhood==v,'Target_Encoding_Value'] = encoded_values.loc[v,'NoShow']
X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Target_Encoding_Value'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Target_Encoding_Value'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,NoShow,Target_Encoding_Value
78500,52964710000000.0,5721041,1,2016-05-19 15:14:15,2016-05-19,24,BONFIM,0,0,0,0,1,0,0,0.196573
21891,4176779000000.0,5649318,1,2016-05-02 19:01:48,2016-05-13,60,GRANDE VITÓRIA,0,1,1,0,0,0,0,0.202899
89006,456582400000.0,5757058,1,2016-06-01 08:55:55,2016-06-01,59,ITARARÉ,0,0,0,0,0,0,0,0.259837
36681,453751300000.0,5749766,1,2016-05-31 08:11:30,2016-05-31,58,RESISTÊNCIA,0,1,0,0,0,0,0,0.203652
22388,5346157000000.0,5641183,1,2016-04-29 12:45:36,2016-05-02,38,RESISTÊNCIA,1,0,0,0,0,0,1,0.203652
