In [1]:
#import all import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import timeit
from time import time
import sys
import os
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import plotly.graph_objects as go
import pickle

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score,roc_curve, accuracy_score\
,make_scorer,fbeta_score
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras

## Import Cleaned and Engineered Data from Previously worked sample

In [2]:
filename = 'cleaned_hotel_data.pkl'

with open(filename, 'rb') as file:
    data = pickle.load(file)
data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,family_type,room_change,preferred_day,had_cancellations,driving
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,0,0,Check-Out,7/1/2015,2015-07-01,Couple,False,weekday,False,False
75559,City Hotel,0,257,2015,July,27,1,0,2,1,...,0,0,Check-Out,7/3/2015,2015-07-01,Single,False,weekday,False,False
75560,City Hotel,0,257,2015,July,27,1,0,2,2,...,0,0,Check-Out,7/3/2015,2015-07-01,Couple,False,weekday,False,False
75561,City Hotel,0,257,2015,July,27,1,0,2,2,...,0,0,Check-Out,7/3/2015,2015-07-01,Couple,False,weekday,False,False
75562,City Hotel,0,257,2015,July,27,1,0,2,2,...,0,0,Check-Out,7/3/2015,2015-07-01,Couple,False,weekday,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40039,Resort Hotel,0,108,2017,August,35,31,2,5,2,...,0,1,Check-Out,9/7/2017,2017-08-31,Couple,False,weekday,False,False
40040,Resort Hotel,0,194,2017,August,35,31,2,5,2,...,1,1,Check-Out,9/7/2017,2017-08-31,Family,False,weekday,False,True
13794,Resort Hotel,1,17,2017,August,35,31,0,3,2,...,0,2,Canceled,8/14/2017,2017-08-31,Couple,False,weekday,False,False
40038,Resort Hotel,0,191,2017,August,35,31,2,5,2,...,0,0,Check-Out,9/7/2017,2017-08-31,Couple,False,weekday,False,False


## Check for class Imbalances and store some data away for test purposes

In [3]:
data['is_canceled'].value_counts()

0    73988
1    43825
Name: is_canceled, dtype: int64

In [3]:
#Randomly sample 73988 - 43825 data out and store as test set
diff = data['is_canceled'].value_counts()[0] - data['is_canceled'].value_counts()[1]
TN_test = data[data['is_canceled'] == 0].sample(n = diff)
TN_test

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,family_type,room_change,preferred_day,had_cancellations,driving
29575,Resort Hotel,0,33,2016,October,44,24,2,5,2,...,0,0,Check-Out,10/31/2016,2016-10-24,Couple,True,weekday,False,False
118131,City Hotel,0,150,2017,August,32,8,2,5,2,...,0,2,Check-Out,8/15/2017,2017-08-08,Family,False,weekday,False,False
94207,City Hotel,0,301,2016,July,31,30,1,1,1,...,0,0,Check-Out,8/1/2016,2016-07-30,Single,False,weekday,False,False
117232,City Hotel,0,125,2017,July,31,31,1,0,2,...,0,0,Check-Out,8/1/2017,2017-07-31,Family,False,weekend,False,False
38433,Resort Hotel,0,118,2017,July,27,4,4,10,2,...,0,0,Check-Out,7/18/2017,2017-07-04,Couple,False,weekday,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34671,Resort Hotel,0,9,2017,March,13,31,0,1,2,...,0,0,Check-Out,4/1/2017,2017-03-31,Couple,False,weekday,False,False
103502,City Hotel,0,3,2016,December,51,15,4,8,1,...,0,0,Check-Out,12/27/2016,2016-12-15,Single,False,weekday,False,False
81414,City Hotel,0,1,2015,December,50,10,0,1,1,...,0,0,Check-Out,12/11/2015,2015-12-10,Single,False,weekday,False,False
21364,Resort Hotel,0,40,2016,February,8,20,2,5,2,...,0,0,Check-Out,2/27/2016,2016-02-20,Couple,False,weekday,False,False


In [4]:
df = data[~(data.index.isin(TN_test.index))]

## Feature Selection and Feature Importance
Since the data had already been cleaned from previous section, and also some additional data had been engineered, this part will move straight to Feature Selection, i.e selecting only relevant and important features that are mutually exclusive to become the predictor variables

#### Perform Encoding and Standard Scaler
Since the data has many categorical variables, it is best to first perform ordinal encoding and standard scaling first as most ML algorithms work best with scaled datasets.

***Note on Encoding***: The encoding type chosen for this exercise is Ordinal Encoder, which is not as robust as OneHotEncoder due to the order can be misinterpretaed by the ML algorithm at times. But OneHotEncoder can pose another challenge which is data cardinality and huge matrix which will then need to be cleaned with PCA etc. So for that reason and in interest of time, Ordinal Encoder will be used. More information can be retrieved from this [sklearn site](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features)

**To drop a few more columns**<br>
- `reservation_status_date`: too many unique values and may not provide any information gain
- `country`: too many unique values. Although in EDA, some countries like PRT, BRA and ITA provided more cancellation rates and may provide some useful features, but the other features shall play higher role
- `arrival_date`: engineered data for EDA
- `reservation_status`: from previous discovery, this has dependency on cancellations directly and may not be a good predictor

In [5]:
df = df.drop(['reservation_status_date', 'country', 'arrival_date', 'reservation_status'], axis = 1)

In [6]:
categorical_cols = list(df.columns[(df.dtypes == object)|(df.dtypes == bool)])
categorical_cols

['hotel',
 'arrival_date_month',
 'meal',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type',
 'family_type',
 'room_change',
 'preferred_day',
 'had_cancellations',
 'driving']

In [7]:
df.var()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



is_canceled                           0.250003
lead_time                         12270.893724
arrival_date_year                     0.502088
arrival_date_week_number            181.035523
arrival_date_day_of_month            77.167133
stays_in_weekend_nights               0.988297
stays_in_week_nights                  3.530136
adults                                0.351778
children                              0.156558
babies                                0.007792
is_repeated_guest                     0.026837
previous_cancellations                0.942638
previous_bookings_not_canceled        2.028418
booking_changes                       0.358786
agent                             11069.733618
days_in_waiting_list                344.397783
adr                                2592.751753
required_car_parking_spaces           0.047919
total_of_special_requests             0.598636
room_change                           0.091966
had_cancellations                     0.065547
driving      

In [8]:
#Normalizing the actual numerical values: lead_time, days_in_waiting_list, adr
df['lead_time'] = np.log(df['lead_time'] + 1)
df['days_in_waiting_list'] = np.log(df['days_in_waiting_list'] + 1)
df['adr'] = np.log(df['adr'] + 1)

In [9]:
df.var()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



is_canceled                           0.250003
lead_time                             2.404367
arrival_date_year                     0.502088
arrival_date_week_number            181.035523
arrival_date_day_of_month            77.167133
stays_in_weekend_nights               0.988297
stays_in_week_nights                  3.530136
adults                                0.351778
children                              0.156558
babies                                0.007792
is_repeated_guest                     0.026837
previous_cancellations                0.942638
previous_bookings_not_canceled        2.028418
booking_changes                       0.358786
agent                             11069.733618
days_in_waiting_list                  0.565603
adr                                   0.460205
required_car_parking_spaces           0.047919
total_of_special_requests             0.598636
room_change                           0.091966
had_cancellations                     0.065547
driving      

In [10]:
oe = OrdinalEncoder()
#data[categorical_cols] = data[categorical_cols].apply(lambda col: le.fit_transform(col))
df[categorical_cols] = oe.fit_transform(df[categorical_cols])

In [11]:
oe.categories_

[array(['City Hotel', 'Resort Hotel'], dtype=object),
 array(['April', 'August', 'December', 'February', 'January', 'July',
        'June', 'March', 'May', 'November', 'October', 'September'],
       dtype=object),
 array(['BB', 'FB', 'HB', 'SC'], dtype=object),
 array(['Aviation', 'Complementary', 'Corporate', 'Direct', 'Groups',
        'Offline TA/TO', 'Online TA'], dtype=object),
 array(['Corporate', 'Direct', 'GDS', 'TA/TO'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'L'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L'],
       dtype=object),
 array(['No Deposit', 'Non Refund', 'Refundable'], dtype=object),
 array(['Contract', 'Group', 'Transient', 'Transient-Party'], dtype=object),
 array(['Adult Group', 'Couple', 'Family', 'Single'], dtype=object),
 array([False,  True]),
 array(['weekday', 'weekend'], dtype=object),
 array([False,  True]),
 array([False,  True])]

In [12]:
df.corr()["is_canceled"].abs().sort_values(ascending=False)

is_canceled                       1.000000
deposit_type                      0.431159
lead_time                         0.342299
room_change                       0.279114
total_of_special_requests         0.254941
had_cancellations                 0.247369
driving                           0.228850
required_car_parking_spaces       0.228334
assigned_room_type                0.187899
distribution_channel              0.179940
booking_changes                   0.155045
hotel                             0.143302
previous_cancellations            0.099085
is_repeated_guest                 0.093532
days_in_waiting_list              0.090358
adr                               0.088541
family_type                       0.079838
customer_type                     0.070826
reserved_room_type                0.067522
previous_bookings_not_canceled    0.065007
market_segment                    0.061569
adults                            0.059709
agent                             0.044863
babies     

In [13]:
#Checking for important feature via correlation with cancellations
sorted_importance = df.corr()["is_canceled"].abs().sort_values(ascending=False)

In [14]:
#Assign these features into a new dataframe for analysis
df_im = df[sorted_importance.index]

In [15]:
df_im

Unnamed: 0,is_canceled,deposit_type,lead_time,room_change,total_of_special_requests,had_cancellations,driving,required_car_parking_spaces,assigned_room_type,distribution_channel,...,babies,stays_in_week_nights,preferred_day,arrival_date_year,arrival_date_week_number,meal,arrival_date_day_of_month,arrival_date_month,children,stays_in_weekend_nights
75559,0,0.0,5.552960,0.0,0,0.0,0.0,0,0.0,3.0,...,0,2,0.0,2015,27,2.0,1,5.0,0,0
75560,0,0.0,5.552960,0.0,0,0.0,0.0,0,0.0,3.0,...,0,2,0.0,2015,27,2.0,1,5.0,0,0
75564,0,0.0,5.552960,0.0,0,0.0,0.0,0,0.0,3.0,...,0,2,0.0,2015,27,2.0,1,5.0,0,0
75565,0,0.0,5.552960,0.0,0,0.0,0.0,0,0.0,3.0,...,0,2,0.0,2015,27,2.0,1,5.0,0,0
75567,0,0.0,5.552960,0.0,0,0.0,0.0,0,0.0,3.0,...,0,2,0.0,2015,27,2.0,1,5.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13796,1,0.0,4.727388,0.0,0,0.0,0.0,0,4.0,3.0,...,0,3,0.0,2017,35,0.0,31,1.0,0,0
13795,1,0.0,4.262680,0.0,1,0.0,0.0,0,3.0,3.0,...,0,3,0.0,2017,35,0.0,31,1.0,0,0
40040,0,0.0,5.273000,0.0,1,0.0,1.0,1,6.0,3.0,...,0,5,0.0,2017,35,2.0,31,1.0,1,2
13794,1,0.0,2.890372,0.0,2,0.0,0.0,0,0.0,3.0,...,0,3,0.0,2017,35,2.0,31,1.0,0,0


In [16]:
#create a correlation matrix to check the important features
corr_mat = df_im.drop('is_canceled',axis=1).corr().abs()

In [17]:
#create an upper triangle of correlation so that we can remove the dependent variables
upper = corr_mat.where(np.triu(np.ones(corr_mat.shape),k=1).astype(bool))

In [18]:
threshold = 0.50
upper[upper>threshold]

Unnamed: 0,deposit_type,lead_time,room_change,total_of_special_requests,had_cancellations,driving,required_car_parking_spaces,assigned_room_type,distribution_channel,booking_changes,...,babies,stays_in_week_nights,preferred_day,arrival_date_year,arrival_date_week_number,meal,arrival_date_day_of_month,arrival_date_month,children,stays_in_weekend_nights
deposit_type,,,,,,,,,,,...,,,,,,,,,,
lead_time,,,,,,,,,,,...,,,,,,,,,,
room_change,,,,,,,,,,,...,,,,,,,,,,
total_of_special_requests,,,,,,,,,,,...,,,,,,,,,,
had_cancellations,,,,,,,,,,,...,,,,,,,,,,
driving,,,,,,,0.997745,,,,...,,,,,,,,,,
required_car_parking_spaces,,,,,,,,,,,...,,,,,,,,,,
assigned_room_type,,,,,,,,,,,...,,,,,,,,,,
distribution_channel,,,,,,,,,,,...,,,,,,,,,,
booking_changes,,,,,,,,,,,...,,,,,,,,,,


In [19]:
corr_features = [column for column in upper.columns if any(upper[column]>=threshold)]
corr_features

['required_car_parking_spaces',
 'reserved_room_type',
 'market_segment',
 'adults',
 'agent',
 'arrival_date_week_number']

From the exercise above, can conclude that only 17 important features are needed to run the machine learning model to predict the type of diagnosis, namely **deposit_type, lead_time, room_change, total_of_special_requests, had_cancellations, driving, assigned_room_type,** and **distribution_channel**. One important feature can also be dropped: **required_car_parking_spaces, reserved_room_type, market_segment, adults, agent**, can be described by the other 17 features

In [20]:
#Assigning a new dataframe to selected features
X = df[upper.columns].drop(corr_features,axis=1)
X

Unnamed: 0,deposit_type,lead_time,room_change,total_of_special_requests,had_cancellations,driving,assigned_room_type,distribution_channel,booking_changes,hotel,...,previous_bookings_not_canceled,babies,stays_in_week_nights,preferred_day,arrival_date_year,meal,arrival_date_day_of_month,arrival_date_month,children,stays_in_weekend_nights
75559,0.0,5.552960,0.0,0,0.0,0.0,0.0,3.0,1,0.0,...,0,0,2,0.0,2015,2.0,1,5.0,0,0
75560,0.0,5.552960,0.0,0,0.0,0.0,0.0,3.0,0,0.0,...,0,0,2,0.0,2015,2.0,1,5.0,0,0
75564,0.0,5.552960,0.0,0,0.0,0.0,0.0,3.0,0,0.0,...,0,0,2,0.0,2015,2.0,1,5.0,0,0
75565,0.0,5.552960,0.0,0,0.0,0.0,0.0,3.0,0,0.0,...,0,0,2,0.0,2015,2.0,1,5.0,0,0
75567,0.0,5.552960,0.0,0,0.0,0.0,0.0,3.0,0,0.0,...,0,0,2,0.0,2015,2.0,1,5.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13796,0.0,4.727388,0.0,0,0.0,0.0,4.0,3.0,0,1.0,...,0,0,3,0.0,2017,0.0,31,1.0,0,0
13795,0.0,4.262680,0.0,1,0.0,0.0,3.0,3.0,0,1.0,...,0,0,3,0.0,2017,0.0,31,1.0,0,0
40040,0.0,5.273000,0.0,1,0.0,1.0,6.0,3.0,3,1.0,...,0,0,5,0.0,2017,2.0,31,1.0,1,2
13794,0.0,2.890372,0.0,2,0.0,0.0,0.0,3.0,0,1.0,...,0,0,3,0.0,2017,2.0,31,1.0,0,0


In [21]:
#check if there are any null values
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87650 entries, 0 to 117424
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   deposit_type                    87650 non-null  float64
 1   lead_time                       87650 non-null  float64
 2   room_change                     87650 non-null  float64
 3   total_of_special_requests       87650 non-null  int64  
 4   had_cancellations               87650 non-null  float64
 5   driving                         87650 non-null  float64
 6   assigned_room_type              87650 non-null  float64
 7   distribution_channel            87650 non-null  float64
 8   booking_changes                 87650 non-null  int64  
 9   hotel                           87650 non-null  float64
 10  previous_cancellations          87650 non-null  int64  
 11  is_repeated_guest               87650 non-null  int64  
 12  days_in_waiting_list           

In [21]:
#fill in null in adr
X['adr'] = X['adr'].fillna(value = X['adr'].mean())

In [22]:
y = df['is_canceled']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [27]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
print("Length of training set: {}".format(len(X_train)))
print("\n")
print("Length of testing set: {}".format(len(X_test)))

Length of training set: 70120


Length of testing set: 17530


## Modelling
The dataset will be trained using classification algorithms, namely Logistic Regression, Random Forest, and XGBoost but first a metric will be set to compare with later

### Evaluation Metrics - Naive Predictor (Benchmark model)

In this section, I will attempt to create a Benchmark model which is a Naive Predictor. The assumption is to naively say that all bookings will not be cancelled. So, that will be my attempt here to set all target labels to zero, and then predict the ROC_AUC score on the Naive Predictor. It should give around 50% ROC score as it is sort of a random predictor.

Hotel booking cancellation is a solution for the Hotel Owner (Objective to predict cancellation). Hence it will be a bigger loss if there is a wrong prediction on saying that the booking will not be cancelled, but ended up cancelling it (False Negative - Type 2 Error) as there will be revenue loss rather than wrongly prediction cancellations (False Positive - Type 1 Error). Hence Type 2 error need to be avoided, which means recall score needs to be optimized. Since ROC-AUC will look to optimize both TPR and 1-TNR, this will be the main metric used. However, to optimize the grid-search algorithm below, F-Beta score with a Beta value of 2 is also used to optimize recall.

In [31]:
#create an array of zeros for the length of training set
benchmark_target = np.zeros(70120)
    
#Check the roc score
benchmark_score = roc_auc_score(y_train,benchmark_target)
print('Benchmark roc_auc score for this Hotel Booking dataset is {}'.format(benchmark_score))

Benchmark roc_auc score for this Hotel Booking dataset is 0.5


### Logistic Regression

**Advantages**
- Simple method and easy to use
- Good for linear models or for data models that has a linear decision boundaries
- Fast training times
- Can give estimates of probabilities of target outcomes
- Works well with large datasets
- Model based learning, so may give better approximation than instance based methods

**Disadvantages**
- Not very effective in non-linear models
- Not as accurate as Neural Networks, Decision Forests
- May not work well if the number of features are high vs the number of data points

**Why Logistic Regression for this problem?**
- IT IS A MUST IN ANY CLASSIFICATION PROBLEMS
- predicting hotel booking is a two-class classification problem 
- Simple and fast training time of logistic regression
- Good to compare with other models like Random Forests that uses a different splitting algorithm


### Random Forest

**Advantages**
- Very powerful algorithm, but yet simple method and easy to use and visualize
- Requires little data preparation compared to other models
- Cost of using the tree becomes relatively low compared to other models when the number of data points increases
- Can handle both numerical and categorical dataset
- Can handle multi-output problems
- Since it uses a white-box model, it can be easily troubleshooted and interpreted by simple logic
- Good accuracy if trained with high number of data points
- Works well with large datasets
- Non-parametric, hence can be used for both linear and non-linear models
- Solves overfitting problem of Decision trees by randomly sampling to construct trees

**Disadvantages**
- Does not support missing values
- If the maximum depth of tree is not properly investigated, this may lead to overfitting with a very complex tree structure
- Sensitive to variations in data, small variations can lead to different tree


**Why Random Forest for this problem?**
- predicting hotel booking is a classification problem on which Random Forest can perform well
- Easy to troubleshoot with visualization of trees
- Can be used for both linear and non linear type, so it will be good comparison with the above two methods.

### ExtraTrees

**Advantages**
- All advantages of Random Forests plus Decision Trees, with some additional boosted trees for better training and convergence
- Latest technology

**Disadvantages**
- Does not support missing values
- Sensitive to variations in data, small variations can lead to different tree


In [28]:
#create base classifiers for training
clf_lg = LogisticRegression()
clf_rf = RandomForestClassifier(max_depth=3, n_estimators=50)
clf_etc = ExtraTreesClassifier()

clf_lg.fit(X_train_scaled,y_train)
clf_rf.fit(X_train_scaled,y_train)
clf_etc.fit(X_train_scaled,y_train)

for clf in [clf_lg, clf_rf,clf_etc]:
    time_start = time()
    clf_name = clf.__class__.__name__
    print ('Classifier: {}'.format(clf_name))
    cv_score = np.mean(cross_val_score(clf, X_train_scaled, y_train, cv=3, scoring='roc_auc'))
    print ('ROC_AUC CV score is {}'.format(cv_score))
    print ("total time to train: {:.2f}s".format(time()-time_start))
    print ("")

Classifier: LogisticRegression
ROC_AUC CV score is 0.8506510969279354
total time to train: 0.31s

Classifier: RandomForestClassifier
ROC_AUC CV score is 0.8526532246454069
total time to train: 1.38s

Classifier: ExtraTreesClassifier
ROC_AUC CV score is 0.9197325298135598
total time to train: 9.98s



In [29]:
for clf in [clf_lg, clf_rf, clf_etc]:
    time_start = time()
    clf_name = clf.__class__.__name__
    print ('Classifier: {}'.format(clf_name))
    prediction = clf.predict(X_test_scaled)
    cv_score = roc_auc_score(y_test,prediction)
    print ('ROC_AUC score for test is {}'.format(cv_score))
    print ("total time to test: {:.2f}s".format(time()-time_start))
    print ("")
    print (classification_report(y_test,prediction))
    print ("*"*100)
    print ("")

Classifier: LogisticRegression
ROC_AUC score for test is 0.75433763785821
total time to test: 0.01s

              precision    recall  f1-score   support

           0       0.75      0.78      0.76      8835
           1       0.76      0.73      0.75      8695

    accuracy                           0.75     17530
   macro avg       0.76      0.75      0.75     17530
weighted avg       0.75      0.75      0.75     17530

****************************************************************************************************

Classifier: RandomForestClassifier
ROC_AUC score for test is 0.7535607926678258
total time to test: 0.04s

              precision    recall  f1-score   support

           0       0.72      0.84      0.78      8835
           1       0.80      0.67      0.73      8695

    accuracy                           0.75     17530
   macro avg       0.76      0.75      0.75     17530
weighted avg       0.76      0.75      0.75     17530

************************************

## Optimizing Best Sklearn Model Using GridSearchCV

### ExtraTreesClassifier and Logistic Regression Optimization

In [133]:
#make scoring function for f-beta score to optimize on recall score
def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    score = fbeta_score(y_true, y_predict, beta=2)
    
    # Return the score
    return score

In [140]:
#Grid Search on Logistic Regression to reduce overfitting

#Set parameters for grid search
param_grid = {'solver': ['lbfgs', 'liblinear','sag'],
              'max_iter': [50,100,200],
              'C': [0.05, 0.1, 0.5, 1.0,2.0,3.0, 10.0, 100.0]}

#make own scorer from function above to optimize recall
scorer = make_scorer(performance_metric)

gs_lg = GridSearchCV(clf_lg, param_grid, scoring=scorer, cv=3, verbose=1, n_jobs=-1)
time_start = time()

print("Performing grid search...")
gs_lg.fit(X_train_scaled,y_train)
score = gs_lg.best_score_
        
print ('GridSearch best F2 score for logistic regression is: {}'.format(score))

#get parameters from best estimator
best_parameters = gs_lg.best_estimator_.get_params()
        
print("Best parameters:")
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
print("")
print ("")
#get best estimator
clf_lg_best = gs_lg.best_estimator_

print("Total time to GridSearch: {:.2f}s".format(time()-time_start))
print("")

# Make predictions using the optimized model
best_prediction = clf_lg_best.predict(X_test_scaled)
cv_score = roc_auc_score(y_test,best_prediction)
print('ROC_AUC score for best prediction is {}'.format(cv_score))
print ("")
print (classification_report(y_test,best_prediction))
print ("")

Performing grid search...
Fitting 3 folds for each of 72 candidates, totalling 216 fits
GridSearch best F2 score for logistic regression is: nan
Best parameters:
	C: 0.05
	max_iter: 50
	solver: 'lbfgs'


Total time to GridSearch: 36.76s

ROC_AUC score for best prediction is 0.7524051816215674

              precision    recall  f1-score   support

           0       0.74      0.78      0.76      8732
           1       0.77      0.73      0.75      8798

    accuracy                           0.75     17530
   macro avg       0.75      0.75      0.75     17530
weighted avg       0.75      0.75      0.75     17530





One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]



#### ExtraTreesClassifier Optimization

In [149]:
#Grid Search on Random Forest to reduce overfitting

#Set parameters for grid search
param_grid = {'n_estimators': [100,200,500],
            'max_features': ['auto', 'sqrt', 'log2'],
            'min_samples_split': [2,3],
            'min_samples_leaf': [1,2],
            'criterion' :['gini', 'entropy']}

gs_etc = GridSearchCV(clf_etc, param_grid, scoring=scorer, cv=3, verbose=1, n_jobs=-1)
time_start = time()

print("Performing grid search...")
gs_etc.fit(X_train_scaled,y_train)
score = gs_etc.best_score_
        
print ('GridSearch best F2 score for extra trees classifier is: {}'.format(score))

#get parameters from best estimator
best_parameters = gs_etc.best_estimator_.get_params()
        
print("Best parameters:")
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
print("")
print ("")
#get best estimator
clf_etc_best = gs_etc.best_estimator_

print("Total time to GridSearch: {:.2f}s".format(time()-time_start))
print("")

# Make predictions using the optimized model
best_prediction = clf_etc_best.predict(X_test_scaled)
cv_score = roc_auc_score(y_test,best_prediction)
print('ROC_AUC score for test is {}'.format(cv_score))
print ("")
print (classification_report(y_test,best_prediction))
print ("")

Performing grid search...
Fitting 3 folds for each of 72 candidates, totalling 216 fits
GridSearch best F2 score for extra trees classifier is: 0.8265097655473986
Best parameters:
	criterion: 'entropy'
	max_features: 'sqrt'
	min_samples_leaf: 1
	min_samples_split: 3
	n_estimators: 150


Total time to GridSearch: 232.07s

ROC_AUC score for test is 0.8456901357146406

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      8732
           1       0.86      0.83      0.84      8798

    accuracy                           0.85     17530
   macro avg       0.85      0.85      0.85     17530
weighted avg       0.85      0.85      0.85     17530




## Saving best models
From sklearn model, the best model even after grid search, is the basic Extra Trees Classifier with 92% ROC AUC score on training and about 85% on test set.

### Running predictions to validate
Randomly run predictions to validate from the True Negatives set earlier

In [35]:
X_test_TN = TN_test.copy(deep=True)
y_test_TN = TN_test['is_canceled']
X_test_TN[categorical_cols] = oe.transform(X_test_TN[categorical_cols])
X_test_TN = X_test_TN[X_train.columns]
X_test_TN_scaled = scaler.transform(X_test_TN)

In [39]:
clf_etc.predict(X_test_TN_scaled[[0]])

array([0])

In [40]:
clf_etc.predict(X_test_TN_scaled[[5]])

array([0])

From the two cells above, the classifier correctly predicts random samples. More testing can be done at later stages

**Saving the model, scaler, encoder as pickle file**

In [41]:
dict_class = {'scaler': scaler, 'encoder': oe, 'usable_columns': X_train.columns,
             'encoder_categories': categorical_cols, 'clf_etc': clf_etc}

filename = 'classifier_extra_trees_files.pkl'

with open(filename, 'wb') as f:
    pickle.dump(dict_class, f)