In [11]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global random number generator once, up front.
# NOTE(review): the searches further down do not pass random_state, so they
# presumably rely on this global seed for repeatability -- confirm.
np.random.seed(1)

In [12]:
import os

# Directory that contains the hotel_res_*.csv files read in the next cell.
# The original per-user Windows path is kept as the default so existing runs
# behave the same, but it can now be overridden without editing the notebook
# by setting the HOTEL_RES_DATA_DIR environment variable before starting the kernel.
DATA_DIR = os.environ.get("HOTEL_RES_DATA_DIR", 'C:\\Users\\simra\\Downloads')
os.chdir(DATA_DIR)

In [13]:
# Feature matrices are kept as DataFrames.
X_train = pd.read_csv("hotel_res_train_X.csv")
X_test = pd.read_csv("hotel_res_test_X.csv")

# scikit-learn estimators expect a 1-D target. read_csv returns a DataFrame,
# which (for a one-column file) is a column vector and makes every fit() emit a
# DataConversionWarning -- hidden here by the global warnings filter.
# squeeze("columns") turns a single-column frame into a Series and leaves a
# multi-column frame untouched.
y_train = pd.read_csv("hotel_res_train_y.csv").squeeze("columns")
y_test = pd.read_csv("hotel_res_test_y.csv").squeeze("columns")

### Modeling data
First, let's create a dataframe to load the model performance metrics into.

In [14]:
# Empty results table: one column per reported metric, no rows yet.
performance = pd.DataFrame(
    {column: [] for column in ("model", "Accuracy", "Precision", "Recall", "F1")}
)

### Evaluation metric
In our dataset, the target we ultimately predict is the column 'booking_status_Not_Canceled', which is 1 when the reservation remains intact and 0 when it is called off. The choice of metric depends on which kind of prediction error, once reduced, saves the business the most.
Interpretation for the confusion matrix:

TP = "not canceled" (1) predicted correctly: the booking remained.

TN = "canceled" (0) predicted correctly: the booking was cancelled.

FP = "not canceled" (1) predicted wrongly: the model says the booking is intact, but it was actually cancelled.

FN = "canceled" (0) predicted wrongly: the model says the booking is cancelled, but it is actually intact.

Therefore both FN and FP need to be reduced, because an increase in either one leads to a loss, each in a different way.
Increased FP leads to underutilization of rooms and therefore to revenue loss, because rooms that were expected to be occupied will stand empty.
Increased FN leads to overbooking: when the customers arrive, someone will have to adjust or compromise on another room or another hotel, which leaves them dissatisfied with the service. This would indirectly hurt the ratings of the hotel and lead to a loss in revenue.

Precision metric takes care of the FP and Recall metric takes care of the FN, but we are interested to reduce both FP and FN.
Hence, for modeling data, we will use F1-score as it takes care of both metrics by taking their harmonic mean. 


#### Logistic Regression Random Search CV using no penalty, L1, L2 and elastic-net regularization with the saga and liblinear solvers
Conduct an initial random search across a wide range of possible parameters.

In [15]:
score_measure = "f1"
kfolds = 5

# Candidate settings sampled by the random search.
# The unpenalised option has to be the Python object None: the string 'None'
# is not an accepted `penalty` value, so every candidate that drew it failed to
# fit and could never be selected. (scikit-learn releases before 1.2 spelled
# this option 'none' in lower case -- adjust if running on such a version.)
# Some combinations remain invalid for LogisticRegression (liblinear with
# None or elasticnet, elasticnet without an l1_ratio). Those fits fail and
# receive the search's error_score (NaN by default), so they are never chosen.
param_grid = {
    'max_iter': np.arange(90, 110),
    'penalty': [None, 'l1', 'l2', 'elasticnet'],
    'solver': ['saga', 'liblinear']
}

log_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator=log_reg, param_distributions=param_grid, cv=kfolds, n_iter=100,
                                 scoring=score_measure, verbose=1, n_jobs=-1,
                                 return_train_score=True)

# np.ravel hands scikit-learn the 1-D target it expects, whether y_train is a
# one-column DataFrame or already a Series.
_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The best f1 score is 0.7794295844412918
... with parameters: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 109}


#### Logistic Regression Grid Search CV around the best penalty and solver found by the random search
Conduct an exhaustive search across a smaller range of parameters around the parameters found in the initial random search.

In [16]:
score_measure = "f1"
kfolds = 5

# Start from the winning configuration of the preceding random search.
best_random = rand_search.best_params_
max_iter = best_random['max_iter']
penalty = best_random['penalty']
solver = best_random['solver']

# Only max_iter is varied (ten consecutive values, max_iter-5 up to max_iter+4);
# penalty and solver stay pinned to the random-search choice.
param_grid = dict(
    max_iter=np.arange(max_iter - 5, max_iter + 5),
    penalty=[penalty],
    solver=[solver],
)

log_reg_model = LogisticRegression()
grid_search = GridSearchCV(
    estimator=log_reg_model,
    param_grid=param_grid,
    scoring=score_measure,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

# Keep the refitted winner under its own name for the later comparison.
bestf1_logistic = grid_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The best f1 score is 0.7794295844412918
... with parameters: {'max_iter': 104, 'penalty': 'l1', 'solver': 'saga'}


In [17]:
# Score the tuned logistic regression on the held-out test set. The named best
# estimator is used instead of whatever `grid_search` currently holds, so the row
# stays tied to this model even if cells are re-run out of order.
c_matrix = confusion_matrix(y_test, bestf1_logistic.predict(X_test))

# Rows are true labels and columns are predicted labels, so for 0/1 labels the
# flattened 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = c_matrix.ravel()

logistic_row = pd.DataFrame({'model': ["Logistic Regression"],
                             'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                             'Precision': [TP/(TP+FP)],
                             'Recall': [TP/(TP+FN)],
                             'F1': [2*TP/(2*TP+FP+FN)]
                             })

# Remove any earlier row for this model so re-running the cell replaces it rather
# than appending a duplicate, and renumber the index so rows do not all share
# the label 0.
performance = pd.concat([performance[performance['model'] != "Logistic Regression"], logistic_row],
                        ignore_index=True)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357


#### SVM Random Search CV using linear, rbf and poly kernels
Conduct an initial random search across a wide range of possible parameters.

In [18]:
score_measure = "f1"
kfolds = 3

# Search space: integer C from 1 to 14, both gamma heuristics and three kernels.
param_grid = dict(
    C=np.arange(1, 15),
    gamma=['scale', 'auto'],
    kernel=['linear', 'rbf', 'poly'],
)

svm_m1 = SVC()
# Only two candidates are sampled (n_iter=2), each scored with 3-fold CV.
rand_search = RandomizedSearchCV(
    estimator=svm_m1,
    param_distributions=param_grid,
    n_iter=2,
    scoring=score_measure,
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 3 folds for each of 2 candidates, totalling 6 fits
The best f1 score is 0.8191377481759609
... with parameters: {'kernel': 'poly', 'gamma': 'scale', 'C': 5}


#### SVM Grid Search CV using linear, rbf and poly kernels
Conduct an exhaustive search across a smaller range of parameters around the parameters found in the initial random search.

In [19]:
score_measure = "f1"
kfolds = 3

# Winning configuration of the preceding SVM random search.
C = rand_search.best_params_['C']
gamma = rand_search.best_params_['gamma']
kernel = rand_search.best_params_['kernel']

# Two-point neighbourhood {C-1, C} around the random-search winner.
# SVC requires C to be strictly positive, and the random search can return C=1,
# so the lower bound is clamped to 1 instead of letting C-1 become 0.
param_grid = {
    'C': np.arange(max(C - 1, 1), C + 1),
    'gamma': [gamma],
    'kernel': [kernel]
}

svm_model = SVC()
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=kfolds,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

# Keep the refitted winner under its own name for the later comparison.
bestf1_SVM = grid_search.best_estimator_

Fitting 3 folds for each of 2 candidates, totalling 6 fits
The best f1 score is 0.8195533072379352
... with parameters: {'C': 4, 'gamma': 'scale', 'kernel': 'poly'}


In [20]:
# Score the tuned SVM on the held-out test set. The named best estimator is used
# instead of whatever `grid_search` currently holds, so the row stays tied to
# this model even if cells are re-run out of order.
c_matrix = confusion_matrix(y_test, bestf1_SVM.predict(X_test))

# Rows are true labels and columns are predicted labels, so for 0/1 labels the
# flattened 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = c_matrix.ravel()

svm_row = pd.DataFrame({'model': ["SVM"],
                        'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                        'Precision': [TP/(TP+FP)],
                        'Recall': [TP/(TP+FN)],
                        'F1': [2*TP/(2*TP+FP+FN)]
                        })

# Remove any earlier row for this model so re-running the cell replaces it rather
# than appending a duplicate, and renumber the index so rows do not all share
# the label 0.
performance = pd.concat([performance[performance['model'] != "SVM"], svm_row],
                        ignore_index=True)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357
0,SVM,0.823211,0.910836,0.818775,0.862355


#### DTree Random Search CV

In [21]:
score_measure = "f1"
kfolds = 5

# Search space for the decision tree.
# min_samples_split starts at 2: as an integer it must be at least 2, so the
# previous lower bound of 1 produced candidates that could only fail to fit.
param_grid = {
    'min_samples_split': np.arange(2, 100),
    'min_samples_leaf': np.arange(1, 100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100),
    'max_depth': np.arange(5, 15),
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, cv=kfolds, n_iter=50,
                                 scoring=score_measure, verbose=1, n_jobs=-1,
                                 return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
The best f1 score is 0.826500027995135
... with parameters: {'min_samples_split': 22, 'min_samples_leaf': 35, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 58, 'max_depth': 14, 'criterion': 'entropy'}


#### DTree Grid Search CV

In [22]:
score_measure = "f1"
kfolds = 3

# Winning configuration of the preceding decision-tree random search.
min_samples_split = rand_search.best_params_['min_samples_split']
min_samples_leaf = rand_search.best_params_['min_samples_leaf']
min_impurity_decrease = rand_search.best_params_['min_impurity_decrease']
max_leaf_nodes = rand_search.best_params_['max_leaf_nodes']
max_depth = rand_search.best_params_['max_depth']
criterion = rand_search.best_params_['criterion']

# Narrow neighbourhood around each winning value.
# * Lower bounds are clamped to the smallest value the tree accepts, so a winner
#   near the edge of the random-search range cannot generate invalid candidates
#   (e.g. min_samples_leaf - 2 <= 0).
# * min_impurity_decrease uses linspace: np.arange with a fractional step has a
#   length that depends on floating-point rounding (the recorded run produced
#   1280 = 4*4*4*4*5 candidates), whereas linspace always yields exactly
#   5 evenly spaced values including both end points.
param_grid = {
    'min_samples_split': np.arange(max(min_samples_split - 2, 2), min_samples_split + 2),
    'min_samples_leaf': np.arange(max(min_samples_leaf - 2, 1), min_samples_leaf + 2),
    'min_impurity_decrease': np.linspace(max(min_impurity_decrease - 0.0001, 0.0),
                                         min_impurity_decrease + 0.0001, 5),
    'max_leaf_nodes': np.arange(max(max_leaf_nodes - 2, 2), max_leaf_nodes + 2),
    'max_depth': np.arange(max(max_depth - 2, 1), max_depth + 2),
    'criterion': [criterion]
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=kfolds,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

# Keep the refitted winner under its own name for the later comparison.
bestf1_dtree = grid_search.best_estimator_

Fitting 3 folds for each of 1280 candidates, totalling 3840 fits
The best f1 score is 0.8283334369927218
... with parameters: {'criterion': 'entropy', 'max_depth': 12, 'max_leaf_nodes': 59, 'min_impurity_decrease': 0.0005, 'min_samples_leaf': 33, 'min_samples_split': 20}


In [23]:
# Score the tuned decision tree on the held-out test set. The named best
# estimator is used instead of whatever `grid_search` currently holds, so the row
# stays tied to this model even if cells are re-run out of order.
c_matrix = confusion_matrix(y_test, bestf1_dtree.predict(X_test))

# Rows are true labels and columns are predicted labels, so for 0/1 labels the
# flattened 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = c_matrix.ravel()

dtree_row = pd.DataFrame({'model': ["Decision Tree"],
                          'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                          'Precision': [TP/(TP+FP)],
                          'Recall': [TP/(TP+FN)],
                          'F1': [2*TP/(2*TP+FP+FN)]
                          })

# Remove any earlier row for this model so re-running the cell replaces it rather
# than appending a duplicate, and renumber the index so rows do not all share
# the label 0.
performance = pd.concat([performance[performance['model'] != "Decision Tree"], dtree_row],
                        ignore_index=True)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357
0,SVM,0.823211,0.910836,0.818775,0.862355
0,Decision Tree,0.842047,0.896542,0.866458,0.881244


#### Best Estimators for the 3 models

In [24]:
# Tuned estimators, in the order they were fitted:
# logistic regression, SVM, decision tree.
best = [
    bestf1_logistic,
    bestf1_SVM,
    bestf1_dtree,
]
best


[LogisticRegression(max_iter=105, penalty='l1', solver='saga'),
 SVC(C=8),
 DecisionTreeClassifier(criterion='entropy', max_depth=11, max_leaf_nodes=77,
                        min_impurity_decrease=0.0005, min_samples_leaf=25,
                        min_samples_split=22)]

### Summary
Arranged by f1 score, the best models are:

In [24]:
# Rank the models from highest to lowest test-set F1.
performance.sort_values('F1', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision Tree,0.842047,0.896542,0.866458,0.881244
0,SVM,0.823211,0.910836,0.818775,0.862355
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357


Due to technical limitations, the models were tuned with only a few search iterations, which gave the F1-scores shown above. Of the three models, the best performing one on our data is the Decision Tree classifier (F1 ≈ 0.88), followed by the SVM classifier (F1 ≈ 0.86).

Predicting correct booking cancellations would help the business in two ways-

-->There would be no empty rooms, i.e. no underutilization of the property.

-->There would be no customers clashing over rooms they cannot get because of overbooking, which will eventually lead to good reviews.


### Neural Net

#### With Random Search

In [25]:
%%time

score_measure = "f1"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (70,),(50,30), (40,20)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1, 0.2, 0.5],
    'max_iter': [100]
}

ann = MLPClassifier()
grid_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=50,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestf1Tree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'solver': 'sgd', 'max_iter': 100, 'learning_rate_init': 0.2, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (70,), 'alpha': 0.2, 'activation': 'relu'}
Wall time: 1min 54s


In [27]:
%%time
y_pred = bestf1Tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.84      0.77      3522
           1       0.92      0.83      0.87      7361

    accuracy                           0.83     10883
   macro avg       0.81      0.84      0.82     10883
weighted avg       0.85      0.83      0.84     10883

Wall time: 32.3 ms


In [28]:
%%time

score_measure = "f1"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (30,), (50,), (70,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [100]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestf1Tree = grid_search.best_estimator_

print(grid_search.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'activation': 'relu', 'alpha': 0.5, 'hidden_layer_sizes': (70,), 'learning_rate': 'invscaling', 'learning_rate_init': 0.01, 'max_iter': 100, 'solver': 'adam'}
Wall time: 2min 10s


In [29]:
%%time
y_pred = bestf1Tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.82      0.74      3522
           1       0.90      0.81      0.85      7361

    accuracy                           0.81     10883
   macro avg       0.79      0.81      0.80     10883
weighted avg       0.83      0.81      0.82     10883

Wall time: 26.4 ms


In [30]:
# Score the neural network on the held-out test set, using the predictions
# (`y_pred`) computed in the preceding cell.
c_matrix = confusion_matrix(y_test, y_pred)

# Rows are true labels and columns are predicted labels, so for 0/1 labels the
# flattened 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = c_matrix.ravel()

ann_row = pd.DataFrame({'model': ["ANN"],
                        'Accuracy': [(TP+TN)/(TP+TN+FP+FN)],
                        'Precision': [TP/(TP+FP)],
                        'Recall': [TP/(TP+FN)],
                        'F1': [2*TP/(2*TP+FP+FN)]
                        })

# Remove any earlier row for this model so re-running the cell replaces it rather
# than appending a duplicate, and renumber the index so rows do not all share
# the label 0.
performance = pd.concat([performance[performance['model'] != "ANN"], ann_row],
                        ignore_index=True)
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357
0,SVM,0.823211,0.910836,0.818775,0.862355
0,Decision Tree,0.842047,0.896542,0.866458,0.881244
0,ANN,0.811817,0.90183,0.809944,0.853421


In [31]:
# Rank all four models from highest to lowest test-set F1.
performance.sort_values('F1', ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision Tree,0.842047,0.896542,0.866458,0.881244
0,SVM,0.823211,0.910836,0.818775,0.862355
0,ANN,0.811817,0.90183,0.809944,0.853421
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357


For this dataset, the ANN does not seem to be the best model with the parameters selected. It might perform better if we tune it further or use a larger dataset.