In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# read data
data = pd.read_csv("hotel_clean.csv")

# Intro 

In this notebook we are primarily 1) finding the best performing model and 2) evaluating results. 

In addition to this I'm doing some ad hoc feature engineering and data-cleaning. This is not optimal for clarity and readability, however I'm under significant time contraint in this project and I found it more efficient to this in the same notebook so please pardon me :)

# "saving" features that will not be included in training but used to evaluate results

In [7]:
# saving features that i will remove from model but want to include in analysis of results
# will later merge these with result table (merge/join on index)
canceled_days = data.canceled_days.to_frame()
arrival_date = data.arrival_date.to_frame()
year_week = data.year_week.to_frame()
year_mo = data.year_mo.to_frame()

# reducing cats for countries

In [9]:
# temporary dataframe with value_counts for columns to create an "other" category for countries with less than 100 bookings
data_temp = data.country.value_counts().to_frame()
# list with all countries with less than 100 bookings
country_lst = data_temp.loc[data_temp.country < 100].index.tolist()

# function to replace countries in series with "OTHER", if less than 100 bookings for that country
def other_country(x):
    if x in country_lst:
        return "OTHER"
    else:
        return x

# apply function to series
data.country = data.country.apply(other_country)

In [None]:
# The reason for doing reducing cats for countries was that it caused problem when evaluating feature_importances. 
# Perhaps not a very good reason but I'm under time contraint and need to move on. 

# changing some num-cols to cat-cols

features are categorical but saved as numbers. I'll change to cat and create dummies

In [10]:
data.agent = data.agent.astype("object")
data.company = data.company.astype("object")
# Should have been changed in data-cleaning but didn't pick up on this until now

# cols to include in model, creating X and y

In [11]:
# leaving out columns such as canceled_days since it only holds values for bookings that were canceled and thus...
# ... wouldn't be realistic training info
number_cols = ['is_canceled',
 'lead_time',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']

# leaving out 2 columns ["reservation_status", "reservation_status_date"] 
# since we wont have this information in a real world setting
categorical_cols = ['hotel', 
'arrival_date_month', 
'meal', 
'market_segment',
'distribution_channel', 
'reserved_room_type', 
'assigned_room_type',
'deposit_type', 
'customer_type',
'country',
'agent',
'company']

# new dataframe with selected cols
data = data[number_cols + categorical_cols]

# target variable and features
target_col = "is_canceled"
X = data.drop(target_col, axis = 1)
y = data[target_col]

## Optional: dropping columns that are causing problems

Iterative process to find which columns are causing problems and if it's worth spending time trying to resolve it (which it would be if you think the add much explanatory value, otherwise it's probably not worth it)

In [12]:
# Looking at number of categories for non-numeric features
# This helps me get a feel for which columns might cause problems 
for x in data.select_dtypes("object"):
    print(x, data[x].nunique())

hotel 2
arrival_date_month 12
meal 5
market_segment 8
distribution_channel 5
reserved_room_type 10
assigned_room_type 12
deposit_type 3
customer_type 4
country 39
agent 333
company 352


In [13]:
# company and agent where causing some issues and didn't seem to add much explanatory value so I'm dropping them
X.drop(["company", "agent"], axis = 1, inplace = True)

# this is possible loss of important information. For instance could be the case that there are meaningful differences in...
# ... cancelation-rate for different companies.
# However I'm under significant time contraint in this project and I feel ok dropping these in order to be able to move on. 

# Preprocessing

In [14]:
# Imports for preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Imports for model-evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Importing classifiers
from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
#from sklearn.linear_model import LogisticRegression
#from sklearn.neighbors import KNeighborsClassifier

# Selecting numeric features from X
numeric_features = X.select_dtypes("number").columns
# Transformation-pipeline for numeric features (scaler and imputer)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler()),
                                     ('imputer', SimpleImputer(strategy='median'))])

# Selecting categorical features from X
categorical_features = X.select_dtypes("object").columns
# Transformation-Pipeline for categorical features (imputer and onehotencoder)
categorical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy = "constant")),
                                          ('onehot', OneHotEncoder(handle_unknown = "ignore"))])

# Columntransformer(final preprocessing pipeline). Incorporates transformation-pipelines for both num- and cat-features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Model training

In [15]:
# preprocessing pipeline same as before, now only using the best performing model (RandomForest)
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

# Fake param_grid since I wan't to use default settings cause they had the best performance...
# .. and I don't want to have a slow runtime for this cell when i'm running through it when making minor adjustments
param_grid = {
    #"classifier__criterion": ["gini"]
    #,'classifier__max_depth': [None]
    #,"classifier__n_estimators": [100]
}

# Fake GridSearchCV because param_grid is fake
# 5 cross-folds
grid_search = GridSearchCV(clf, param_grid, cv=5)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fitting model
grid_search.fit(X_train, y_train)

# Output to evaluate
print(grid_search.best_score_)
print(grid_search.scorer_)
print(grid_search.best_params_)
results = grid_search.cv_results_

0.8819415407779567
<function _passthrough_scorer at 0x0000023A357D2790>
{}


In [2]:
# 88% accuracy. Baseline would be around 60 %. Good performance. Suspiciously good. 

# Model test

In [16]:
# Importing metrics for model evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# creating classifier from best_estimator_ from the grid-search
clf = grid_search.best_estimator_

# predictions for X_test
test_pred = clf.predict(X_test)

print(confusion_matrix(y_test, test_pred))
print("ACC:", accuracy_score(y_test, test_pred))
print("P:", precision_score(y_test, test_pred))
print("R:", recall_score(y_test, test_pred))

[[14078   915]
 [ 1778  7107]]
ACC: 0.8872183599966497
P: 0.8859386686611818
R: 0.7998874507597074


In [None]:
# significanly higher precision compared to recall

# Concats with features that wasn't allowed/included in training

I wan't to concat these features to result-table to allow for more in depth-analysis

features are:

canceled_days = data.canceled_days.to_frame() > I'll be able to see distribution of cancellations\ 
arrival_date = data.arrival_date.to_frame() > I'll be able to see if there are differences between time-periods\
year_week = data.year_week.to_frame()\
year_mo = data.year_mo.to_frame()

## concat test with canceled_days

In [17]:
# list of dataframes to concat
objs = [X_test, canceled_days, arrival_date, year_week, year_mo]

In [18]:
# concatinating on index
test_evaluation = pd.concat(objs, axis=1, join='inner', ignore_index=False)

## add true and pred to df and create cm-col

In [5]:
# test predictions to evaluation df
test_evaluation["y_true"] = y_test

# true outcome to evaluation df
test_evaluation["y_pred"] = test_pred

In [20]:
# mashing pred and true 
test_evaluation["true_pred"] = test_evaluation.y_true.astype(str) +"-"+ test_evaluation.y_pred.astype(str)

In [21]:
# function to get cm-values for mashed pred and true
def cm(true_pred):
    if true_pred == "0-0":
        return "TN"
    if true_pred == "1-0":
        return "FN"
    if true_pred == "1-1":
        return "TP"
    if true_pred == "0-1":
        return "FP"

In [22]:
# applying function cm (cell above)
test_evaluation.true_pred = test_evaluation.true_pred.apply(cm)

## Cancellations with less than 31 days notice

In [23]:
# mask to filter for bookings that were cancelled less than one month before arrival
mask1 = test_evaluation.canceled_days < 31

In [24]:
# 4004 bookings were canceled less than one month(31 days) before arrival in test-data
test_evaluation.loc[mask1].y_true.value_counts()

1    4004
Name: y_true, dtype: int64

In [6]:
# 23878 was total no. of rows in test-data so ~16.9% of total bookings were canceled less than 31 days before arrival
4040/23878

0.1691933997822263

In [25]:
# Predictions from model 
test_evaluation.loc[mask1].y_pred.value_counts()

1    2792
0    1212
Name: y_pred, dtype: int64

In [26]:
# precision for model.
2792/4004

0.6973026973026973

In [None]:
# So the models precision for total data was ~ 88.6% but less than 70% for cancellations made less than 31 days before arrival
# Significat difference! Model is not nearly as good at predicting cancellations made close to arrival
# This is of course not good for our business use case

# Comparison with predict proba for specified threshold

1) Looking at predictions made with 0.9 or above probability

2) Looking at positive predictions made within 30 days of cancellation

In [28]:
# adding models prediction probability for "1"(true) prediction to evaluation df
test_evaluation["proba_1"] = clf.predict_proba(X_test)[:,1]

In [29]:
# So when models prediction probability is > 0.9 we have precision over 99%!
test_evaluation.loc[test_evaluation.proba_1 > 0.9].true_pred.value_counts(normalize = True)

TP    0.992175
FP    0.007825
Name: true_pred, dtype: float64

## How many useful predictions is the model making and with that precision? 

In order to find out if the model is useful in a real world setting I'm interested in 2 numbers: 
1) How many actionable predictions is the model making?\
2) With what precision is the "actionable" predictions being made? 

An actionable prediction I will define as:
1) a prediction made with > 0.9 prediction probability (predict_proba)\
2) a prediction made about a booking that canceled 30 days or less before arrival



In [31]:
# df with test evaluation. Only when predict_proba for "true" (cancellation) > 0.9
proba_90 = test_evaluation.loc[test_evaluation.proba_1 > 0.9]

In [33]:
# index in proba_90 where canceled_days is more than 30
index_canceled_days_above_30 = proba_90.loc[proba_90.canceled_days > 30].index.tolist()

In [8]:
# new column where we drop rows where canceled days is more than 30. 
# This df only contains:
    # 1) predictions made for cancellations with more than 0.9 probability 
    # 2) predictions made for bookings that canceled less than 30 days before arrival
proba_90_clean = proba_90.drop(index_canceled_days_above_30)

In [35]:
# looking at y_true(true outcomes) for predictions made with proba > 0.9 and canceled_days < 31
proba_90_clean.y_true.value_counts()

# If predict proba is > 0.9 the model will have made a prediction that the booking is canceled (1/True)
# ... so comparing with true outcome gives us precision

1    1473
0      35
Name: y_true, dtype: int64

In [9]:
# Precision is ~ 97.7% for model 
1473/(1473+35)

0.976790450928382

In [11]:
# and whe are making useful predictions for ~6% of data
1508/23878

0.06315436803752408

# Conclusion

We are making actionable predictions for ~ 6% of the bookings and these predictions have in a test-environment achieved a precision of ~97.7%. 

There is definitely a possible profitable use-case for such a model. To evaluate this we could look at:
- loss/profit for FP/TP based on models precision to get an expected average value for each actionable prediction
- cost for building and maintaing model
- no. of actionable predictions that has to made to make up for costs related to model