In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

In [24]:
from warnings import filterwarnings
filterwarnings('ignore')

In [25]:
%pwd

'/Users/oliviawang/Documents/Hotel_booking_prediction_Python'

In [26]:
%store -r predictors
%store -r response

In [27]:
predictors.columns

Index(['country', 'reservation_status_date', 'lead_time', 'adults',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'deposit_given'],
      dtype='object')

In [28]:
response

0         0
1         0
2         1
3         0
4         1
         ..
119205    0
119206    0
119207    0
119208    0
119209    0
Name: is_canceled, Length: 119209, dtype: int64

## 1. Split the data into training and testing data

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(predictors, response, test_size = 0.25)

In [31]:
X_train.shape

(89406, 12)

In [32]:
%store X_train
%store X_test
%store y_train
%store y_test

Stored 'X_train' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_train' (Series)
Stored 'y_test' (Series)


## 2. Logistic Regression

In [98]:
from sklearn.linear_model import LogisticRegression

In [99]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [100]:
log_pred = logreg.predict(X_test)
log_pred

array([1, 0, 1, ..., 0, 0, 0])

In [101]:
from sklearn.metrics import confusion_matrix

In [102]:
confusion_matrix(y_test, log_pred)

array([[17325,  1390],
       [ 3979,  7109]])

In [103]:
from sklearn.metrics import accuracy_score

In [104]:
accuracy_score(y_test, log_pred)

0.819850350635842

### 2.1 Cross-Validation - Logistic Regresison

In [105]:
from sklearn.model_selection import cross_val_score

In [106]:
logreg_score = cross_val_score(logreg, X_train, y_train, cv = 10)

In [107]:
print('The average accuracy of logistics regression model after cross-validation is',logreg_score.mean())

The average accuracy of logistics regression model after cross-validation is 0.8240834145893524


## 3. Decision tree model with hyperparameters tuning

In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [86]:
dt = DecisionTreeClassifier(random_state = 42)

In [87]:
dt_params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ['gini', 'entropy']
}

In [88]:
dt_grid_search = GridSearchCV(estimator = dt,
                           param_grid = dt_params,
                           cv = 10, n_jobs = -1, verbose = 1, scoring = 'accuracy')

In [89]:
dt_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring='accuracy', verbose=1)

In [90]:
dt_best = dt_grid_search.best_estimator_
dt_best 

DecisionTreeClassifier(max_depth=20, min_samples_leaf=10, random_state=42)

DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=10,
                       random_state=42)

## 4. KNN model with hyperparameter tuning

In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
knn = KNeighborsClassifier()

In [40]:
knn_params = {'n_neighbors':[5,6,7,8,9,10],
              'leaf_size':[1,2,3,5],
              'weights':['uniform', 'distance'],
              'algorithm':['auto', 'ball_tree','kd_tree','brute'],
              'n_jobs':[-1]}

In [43]:
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

In [44]:
knn_grid_search = HalvingGridSearchCV(estimator = knn,
                           param_grid = knn_params,
                           cv = 5, n_jobs = -1, verbose = 1, scoring = 'accuracy')

In [45]:
knn_grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1103
max_resources_: 89406
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 192
n_resources: 1103
Fitting 5 folds for each of 192 candidates, totalling 960 fits
----------
iter: 1
n_candidates: 64
n_resources: 3309
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 2
n_candidates: 22
n_resources: 9927
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 3
n_candidates: 8
n_resources: 29781
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 4
n_candidates: 3
n_resources: 89343
Fitting 5 folds for each of 3 candidates, totalling 15 fits


HalvingGridSearchCV(estimator=KNeighborsClassifier(), n_jobs=-1,
                    param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree',
                                              'brute'],
                                'leaf_size': [1, 2, 3, 5], 'n_jobs': [-1],
                                'n_neighbors': [5, 6, 7, 8, 9, 10],
                                'weights': ['uniform', 'distance']},
                    scoring='accuracy', verbose=1)

In [46]:
knn_best = knn_grid_search.best_estimator_
knn_best 

KNeighborsClassifier(leaf_size=1, n_jobs=-1, n_neighbors=10, weights='distance')

KNeighborsClassifier(leaf_size=1, n_jobs=-1, n_neighbors=10, weights='distance')

## 5. Support vector machine model with hyperparameters tuning

In [120]:
from sklearn.svm import SVC

In [121]:
svm = SVC()

In [123]:
svm_params = {'C': [0.1, 1, 10, 100, 1000], 
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf','poly','sigmoid','linear']} 

In [51]:
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

In [124]:
svm_grid_search = HalvingGridSearchCV(estimator = svm,
                           param_grid = svm_params,
                           cv = 5, n_jobs = -1, verbose = 1, scoring = 'accuracy')

In [None]:
svm_grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1103
max_resources_: 89406
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 100
n_resources: 1103
Fitting 5 folds for each of 100 candidates, totalling 500 fits
----------
iter: 1
n_candidates: 34
n_resources: 3309
Fitting 5 folds for each of 34 candidates, totalling 170 fits


In [57]:
svm_best = svm_grid_search.best_estimator_
svm_best 

SVC(C=10, gamma=1)

SVC(C=10, gamma=1)

## 6. Random forest model with hyperparameters tuning

In [60]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [61]:
rf_params = {
    'n_estimators': [5,20,50,100],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 120, num = 12)],
    'min_samples_split': [2, 6, 10],
    'min_samples_leaf': [1, 3, 4],
    'bootstrap': [True, False]}

In [62]:
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

In [63]:
rf_grid_search = HalvingGridSearchCV(estimator = rf,
                           param_grid = rf_params,
                           cv = 5, n_jobs = -1, verbose = 1, scoring = 'accuracy')

In [64]:
rf_grid_search.fit(X_train, y_train)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 122
max_resources_: 89406
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1728
n_resources: 122
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
----------
iter: 1
n_candidates: 576
n_resources: 366
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
----------
iter: 2
n_candidates: 192
n_resources: 1098
Fitting 5 folds for each of 192 candidates, totalling 960 fits
----------
iter: 3
n_candidates: 64
n_resources: 3294
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 4
n_candidates: 22
n_resources: 9882
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 5
n_candidates: 8
n_resources: 29646
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 6
n_candidates: 3
n_resources: 88938
Fitting 5 folds for each of 3 candidates, totalling 15 fits


HalvingGridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
                    param_grid={'bootstrap': [True, False],
                                'max_depth': [10, 20, 30, 40, 50, 60, 70, 80,
                                              90, 100, 110, 120],
                                'max_features': ['auto', 'sqrt'],
                                'min_samples_leaf': [1, 3, 4],
                                'min_samples_split': [2, 6, 10],
                                'n_estimators': [5, 20, 50, 100]},
                    scoring='accuracy', verbose=1)

In [65]:
rf_best = rf_grid_search.best_estimator_
rf_best 

RandomForestClassifier(max_depth=110, max_features='sqrt', min_samples_split=6)

## 7. Model evaluation

In [67]:
def evaluate_model(model):
    print("Train Accuracy :", accuracy_score(y_train, model.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, model.predict(X_train)))
    print("-"*40)
    print("Test Accuracy :", accuracy_score(y_test, model.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, model.predict(X_test)))

In [91]:
evaluate_model(dt_best)

Train Accuracy : 0.8879829094244234
Train Confusion Matrix:
[[52820  3475]
 [ 6540 26571]]
----------------------------------------
Test Accuracy : 0.8419622185686004
Test Confusion Matrix:
[[16901  1814]
 [ 2896  8192]]


In [93]:
evaluate_model(knn_best)

Train Accuracy : 0.9998434109567591
Train Confusion Matrix:
[[56292     3]
 [   11 33100]]
----------------------------------------
Test Accuracy : 0.8620273126866422
Test Confusion Matrix:
[[17326  1389]
 [ 2723  8365]]


In [94]:
evaluate_model(svm_best)

Train Accuracy : 0.8551998747287654
Train Confusion Matrix:
[[52779  3516]
 [ 9430 23681]]
----------------------------------------
Test Accuracy : 0.8432372579941617
Test Confusion Matrix:
[[17376  1339]
 [ 3333  7755]]


In [95]:
evaluate_model(rf_best)

Train Accuracy : 0.9771715544818022
Train Confusion Matrix:
[[56028   267]
 [ 1774 31337]]
----------------------------------------
Test Accuracy : 0.8794081132771868
Test Confusion Matrix:
[[17638  1077]
 [ 2517  8571]]


## 8. Result

In [109]:
result = {'model': ['logistics regression', 'decision tree', 'k-nearest neighbors', 'support machine vector', 'random forest'],
          'accuracy': [0.824, 0.842, 0.862, 0.843, 0.879]}

In [119]:
result_df = pd.DataFrame(result)
result_df.sort_values(by = ['accuracy'], ascending = False)

Unnamed: 0,model,accuracy
4,random forest,0.879
2,k-nearest neighbors,0.862
3,support machine vector,0.843
1,decision tree,0.842
0,logistics regression,0.824


Random forest model with 'max_depth'=110, 'max_features'='sqrt', 'min_samples_split'=6 performs the best