## Problem Statement

Leverage guest data and booking behaviour patterns to devise a hotel revenue management strategy using data science and machine learning.

### Importing the required libraries


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Importing the data for ML project

In [None]:
# Read the hotel bookings data set into a DataFrame.
# NOTE(review): the path is absolute and specific to one user's machine, so the
# notebook only runs where the CSV exists at exactly this location.
df = pd.read_csv('/Users/priyankac/Downloads/hotel_bookings (1).csv')

In [None]:
# Checking the number of rows and columns
df.shape

In [None]:
# Checking the first five records
df.head()

In [None]:
# Checking different data types in the given data set
df.dtypes

### Setting display options to ensure feature name visibility

In [None]:
pd.set_option('display.max_columns', None)

### Warning Suppression

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Drop ID Feature from the dataset

In [None]:
# No ID feature is mentioned in the data set

### Defining Target and Independent Features


In [None]:
# Target: the cancellation flag, kept as a one-column DataFrame
Y = df.loc[:, ['is_canceled']]
# Independent features: every column except the target
X = df.drop(columns = ['is_canceled'])

In [None]:
Y.shape

In [None]:
X.shape

### Get the Cancellation Rate

In [None]:
# Check the event rate (cancellation rate)
Y.mean()

### Split features into Numerical and Categorical Features

In [None]:
# Separate the string (object) columns from the numeric columns
char1 = X.select_dtypes(include = ['object'])
num1 = X.select_dtypes(include = ['number'])

In [None]:
num1.head()

In [None]:
char1.head()

In [None]:
# checking the levels of feature in the numerical column
# If the level <= 20 they will be considered as categorical feature

def unique_levels(x):
    """Return the number of distinct non-null values in the Series ``x``."""
    return x.nunique()

# One row per numeric feature, holding its count of distinct levels
df_value_counts = num1.apply(unique_levels).to_frame()

In [None]:
df_value_counts

In [None]:
df_value_counts.columns = ['feature_levels']
df_value_counts.head()

In [None]:
# Numeric features with at most 20 distinct levels are treated as categorical
# and collected in their own DataFrame
slice1 = df_value_counts[df_value_counts['feature_levels'] <= 20]
cat_list = slice1.index
cat = num1[cat_list]
cat.dtypes


In [None]:
cat.shape

In [None]:
# Only features with more than 20 distinct levels stay in the numeric DataFrame
slice2 = df_value_counts[df_value_counts['feature_levels'] > 20]
num_list = slice2.index
num1 = num1[num_list]

In [None]:
num1.dtypes

In [None]:
num1.shape

In [None]:
# Append the low-cardinality numeric features to the categorical DataFrame,
# matching rows on the index
char1 = pd.concat((char1, cat), axis = 'columns', join = 'inner')

In [None]:
char1.shape

In [None]:
char1.head()

In [None]:
char1.dtypes

### Outlier Analysis of Numerical Features

In [None]:
num1.describe(percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.85, 0.90, 0.99])

### Capping and Flooring of Outliers

In [None]:
def outlier_cap(x, lower_q = 0.01, upper_q = 0.99):
    """Floor and cap a numeric Series at two of its own quantiles.

    Both bounds are computed from the original values of ``x`` before any
    clipping, so the cap does not depend on the values the floor produced.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to clip.
    lower_q : float, default 0.01
        Quantile used as the floor.
    upper_q : float, default 0.99
        Quantile used as the cap.

    Returns
    -------
    pandas.Series
        A new Series with values below the floor raised to it and values
        above the cap lowered to it; ``x`` itself is not modified.
    """
    lower_bound = x.quantile(lower_q)
    upper_bound = x.quantile(upper_q)
    return x.clip(lower = lower_bound, upper = upper_bound)

In [None]:
# Cap and floor every numeric column independently
num1 = num1.apply(outlier_cap)

In [None]:
num1.describe(percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.85, 0.99])

### Missing Values Analysis

In [None]:
num1.isnull().mean()

In [None]:
char1.isnull().mean()

### Dropping variables that have > 25% missing values

In [None]:
num1.head()

In [None]:
# Keep only the columns whose share of missing values is at most 25%
num1 = num1.loc[:, num1.isna().mean().le(0.25)]

In [None]:
num1.isnull().mean()

### Imputation of Missing Value

In [None]:
from sklearn.impute import SimpleImputer
# Numerical features: replace missing values with the column mean
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(num1)
num_1 = pd.DataFrame(imputer.transform(num1), columns = num1.columns, index = num1.index)

In [None]:
num_1.isnull().mean()

In [None]:
# Categorical features: replace missing values with the most frequent level
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
imputer.fit(char1)
char_1 = pd.DataFrame(imputer.transform(char1), columns = char1.columns, index = char1.index)

In [None]:
char_1.dtypes

In [None]:
char_1.isnull().mean()

## Feature Selection - Numerical Feature

### Part 1 : Remove Features with 0 Variance


In [None]:
from sklearn.feature_selection import VarianceThreshold
# Fit the selector only to learn which columns have non-zero variance
varselector = VarianceThreshold(threshold = 0)
varselector.fit(num_1)

# Positional indices of the surviving columns, used to slice the DataFrame
cols = varselector.get_support(indices = True)
num_2 = num_1.iloc[:, cols]

In [None]:
num_2.iloc[0]

### Part 2 : Bi Variate Analysis (Feature Discretization)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
# Bin every numeric feature into (up to) 10 quantile buckets, encoded as ordinals
discrete = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
num_binned = pd.DataFrame(
    discrete.fit_transform(num_2),
    index = num_2.index,
    columns = [name + '_Rank' for name in num_2.columns],
)
num_binned.head()

In [None]:
# Check whether each feature shows a slope across its bins.
# If it does, are some bins below the population average and others above it?
# When that is the case the slope is strong.

# Conclusion: a strong slope indicates the feature can discriminate the event from the non-event,
#             making it a good predictor

X_bin_combined = pd.concat([Y, num_binned], axis = 1, join = 'inner')

from numpy import mean
# The population cancellation rate is the same for every feature, so compute it once
overall_cancel_rate = X_bin_combined['is_canceled'].mean()
for col in (num_binned.columns):
    plt.figure()
    # Bars: mean cancellation rate per bin of the feature
    ax = sns.barplot(x = col, y = 'is_canceled', data = X_bin_combined, estimator = mean)
    # Red reference line: population cancellation rate
    ax.axhline(overall_cancel_rate, color = 'red')
plt.show()

In [None]:
# Check the descriptive statistics for the following features:
# previous_bookings_not_canceled
# days_in_waiting_list
# booking_changes

# num_2 was produced by slicing num_1; take an explicit copy so the new columns
# are written to an independent DataFrame rather than to a slice of num_1
num_2 = num_2.copy()

# Replace each count with a 0/1 indicator of whether it is greater than zero
num_2['day_wait_ind'] = np.where(num_2['days_in_waiting_list'] > 0, 1, 0)
num_2['previous_bookings_not_canceled_ind'] = np.where(num_2['previous_bookings_not_canceled'] > 0, 1, 0)
num_2['booking_changes_ind'] = np.where(num_2['booking_changes'] > 0, 1, 0)

In [None]:
# Numeric columns removed before feature selection
num_varlist = [
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
]
num_2 = num_2.drop(columns = num_varlist)

In [None]:
num_2.dtypes

In [None]:
num_2.shape

### Part 3 : Select K Best

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
# Score the numeric features against the target with chi-squared and keep the top 4
selector = SelectKBest(chi2, k = 4)
selector.fit(num_2, Y)

# Positional indices of the selected columns, used to slice the DataFrame
cols = selector.get_support(indices = True)
select_features_df_num = num_2.iloc[:, cols]

In [None]:
select_features_df_num.iloc[0]

## Feature Selection - Categorical Features

### Part 1 : Bi Variate Analysis


In [None]:
from numpy import mean

# NOTE(review): this uses char1 (before imputation), not the imputed char_1 — confirm that is intended
X_char_merged = pd.concat([char1, Y], axis = 1, join = 'inner')

# One bar chart per categorical feature: mean cancellation rate per level
for col in char1.columns:
    plt.figure()
    sns.barplot(data = X_char_merged, x = col, y = 'is_canceled', estimator = mean)
plt.show()


In [None]:
# Categorical columns removed before encoding
charlist = ['arrival_date_month', 'country', 'assigned_room_type', 'reservation_status',
           'reservation_status_date', 'arrival_month_year']
# 'arrival_month_year' is not created anywhere in the visible notebook, so drop only
# the listed columns that actually exist instead of raising KeyError on a missing one
char_1 = char_1.drop(columns = [col for col in charlist if col in char_1.columns])

In [None]:
char_1.dtypes

In [None]:
# One-hot encode the categorical features, dropping the first level of each (n-1 dummies)
X_char_dum = pd.get_dummies(data = char_1, drop_first = True)
X_char_dum.shape

### Part 2 : Select K Best

In [None]:
# Select K Best for categorical features: keep the 90 dummies with the highest chi-squared score
from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(chi2, k = 90)
selector.fit(X_char_dum, Y)

# Positional indices of the selected columns, used to slice the DataFrame
cols = selector.get_support(indices = True)
select_features_df_char = X_char_dum.iloc[:, cols]

In [None]:
select_features_df_char.iloc[0]

## Creating the Master Feature Set for Model Development

In [None]:
X_all = pd.concat([select_features_df_num, select_features_df_char], axis = 1, join = 'inner')

In [None]:
X_all.shape

## Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_all, Y, test_size = 0.3, random_state = 20)

In [None]:
print('Shape of Training Data ',X_train.shape)
print('Shape of Testing Data', X_test.shape)
print('Response rate in Training Data', y_train.mean())
print('Response rate in Testing Data', y_test.mean())
# The response rates in the training and testing data are almost the same, indicating a representative split

In [None]:
# Non-linearity is observed in the feature relationships, which makes tree-based methods a good choice
# There are few options to consider among Tree methods:
# White Box(Completely explainable set of rules )- Decision Tree
# Ensemble Method - Random forest(with Bagging)
# Ensemble method - GBM/XGBoost(Boosting)

In [None]:
# Base decision tree estimator handed to the grid search
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state = 20, criterion = 'gini')

In [None]:
from sklearn.model_selection import GridSearchCV
np.random.seed(44)

# Candidate tree depths and minimum split sizes, searched with 10-fold cross-validation
param_dist = {
    'max_depth' : [3, 5, 6, 7],
    'min_samples_split' : [140, 280, 420, 560, 700],
}
tree_grid = GridSearchCV(estimator = dtree, param_grid = param_dist, cv = 10, n_jobs = -1)
tree_grid.fit(X_train, y_train)
print('Best Parameters using Grid Search: \n', tree_grid.best_params_)


In [None]:
# NOTE(review): max_depth and min_samples_split are hard-coded here, presumably
# copied from the grid search output — confirm they match tree_grid.best_params_
dtree = DecisionTreeClassifier(
    criterion = 'gini',
    max_depth = 7,
    min_samples_split = 140,
    random_state = 20,
)
dtree.fit(X_train, y_train)

In [None]:
# Random forest built with the same depth and split settings as the decision tree
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    criterion = 'gini',
    max_depth = 7,
    min_samples_split = 140,
    random_state = 20,
)
rf.fit(X_train, y_train)

In [None]:
# Building a Gradient Boosting Model
from sklearn.ensemble import GradientBoostingClassifier
# 'squared_error' is the current scikit-learn name for the criterion formerly spelled 'mse';
# the old spelling is rejected by current releases
gbm = GradientBoostingClassifier(criterion = 'squared_error', random_state = 20, max_depth = 7, min_samples_split = 140)
gbm.fit(X_train, y_train)

### Model Evaluation

In [None]:
# Class predictions on the held-out test set, one array per fitted model
y_pred_tree, y_pred_rf, y_pred_gbm = (model.predict(X_test) for model in (dtree, rf, gbm))

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
# Evaluation for Decision Tree: test-set accuracy, precision, recall and F1
for label, scorer in (('Accuracy : ', metrics.accuracy_score),
                      ('Precision : ', metrics.precision_score),
                      ('Recall : ', metrics.recall_score),
                      ('f1_score : ', metrics.f1_score)):
    print(label, scorer(y_test, y_pred_tree))

In [None]:
# Plot the confusion matrix for the decision tree on the test set.
# ConfusionMatrixDisplay.from_estimator replaces metrics.plot_confusion_matrix,
# which current scikit-learn releases no longer provide.
metrics.ConfusionMatrixDisplay.from_estimator(dtree, X_test, y_test)

In [None]:
# Evaluation for Random Forest: test-set accuracy, precision, recall and F1
for label, scorer in (('Accuracy : ', metrics.accuracy_score),
                      ('Precision : ', metrics.precision_score),
                      ('Recall : ', metrics.recall_score),
                      ('f1_score : ', metrics.f1_score)):
    print(label, scorer(y_test, y_pred_rf))

In [None]:
# Plot the confusion matrix for the random forest on the test set.
# ConfusionMatrixDisplay.from_estimator replaces metrics.plot_confusion_matrix,
# which current scikit-learn releases no longer provide.
metrics.ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test)

In [None]:
# Evaluation for Gradient Boosting: test-set accuracy, precision, recall and F1
for label, scorer in (('Accuracy : ', metrics.accuracy_score),
                      ('Precision : ', metrics.precision_score),
                      ('Recall : ', metrics.recall_score),
                      ('f1_score : ', metrics.f1_score)):
    print(label, scorer(y_test, y_pred_gbm))

In [None]:
# Plot the confusion matrix for the gradient boosting model on the test set.
# ConfusionMatrixDisplay.from_estimator replaces metrics.plot_confusion_matrix,
# which current scikit-learn releases no longer provide.
metrics.ConfusionMatrixDisplay.from_estimator(gbm, X_test, y_test)

### Lorenz Curve

In [None]:
# Gradient Boosting gave the best results, so the Lorenz (gains) analysis uses its predicted probabilities.
# NOTE(review): X_all contains the training rows as well as the test rows, so the decile
# metrics below are optimistic compared with a test-only evaluation.

y_pred_prob = gbm.predict_proba(X_all)[:, 1]

# predict_proba returns an array; attach it to df aligned on X_all's row index
df['pred_prob_gbm'] = pd.Series(y_pred_prob, index = X_all.index)

# Split the bookings into 10 deciles by predicted cancellation probability.
# rank(method = 'first') breaks ties so the deciles are (near-)equal in size;
# codes + 1 makes the decile labels 1-based.
df['P_Rank_gbm'] = pd.qcut(df['pred_prob_gbm'].rank(method='first').values, 10, duplicates='drop').codes+1

# Booking count and actual cancellation rate per decile
rank_df_actuals = df.groupby('P_Rank_gbm')['is_canceled'].agg(['count', 'mean'])
rank_df_actuals = rank_df_actuals.rename(columns = {'mean' : 'Actual_event_rate'})

# Mean predicted cancellation probability per decile
rank_df_predicted = df.groupby('P_Rank_gbm')['pred_prob_gbm'].agg(['mean'])
rank_df_predicted = rank_df_predicted.rename(columns = {'mean': 'Predicted_event_rate'})

rank_df = pd.concat([rank_df_actuals, rank_df_predicted], axis = 1, join = 'inner')

# Order the deciles from highest predicted probability to lowest (P_Rank_gbm is the index)
sorted_rank_df = rank_df.sort_index(ascending = False)

# Events: number of cancelled bookings per decile and their cumulative capture rate
sorted_rank_df['N_events'] = sorted_rank_df['count'] * sorted_rank_df['Actual_event_rate']
sorted_rank_df['cum_events'] = sorted_rank_df['N_events'].cumsum()
sorted_rank_df['event_cap'] = sorted_rank_df['N_events'] / sorted_rank_df['N_events'].sum()
sorted_rank_df['cum_event_cap'] = sorted_rank_df['event_cap'].cumsum()

# Non-events: number of bookings that were not cancelled per decile
sorted_rank_df['N_non_events'] = sorted_rank_df['count'] - sorted_rank_df['N_events']
sorted_rank_df['cum_non_events'] = sorted_rank_df['N_non_events'].cumsum()
sorted_rank_df['non_event_cap'] = sorted_rank_df['N_non_events'] / sorted_rank_df['N_non_events'].sum()
sorted_rank_df['cum_non_event_cap'] = sorted_rank_df['non_event_cap'].cumsum()


# KS: the gap between the cumulative event and non-event capture rates at each decile;
# the decile where the gap is largest is where the two distributions separate most
sorted_rank_df['KS'] = round((sorted_rank_df['cum_event_cap'] - sorted_rank_df['cum_non_event_cap']), 4)

# Share of all bookings per decile: the capture rate a random ordering would give
sorted_rank_df['random_cap'] = sorted_rank_df['count'] / sorted_rank_df['count'].sum()
sorted_rank_df['cum_random_cap'] = sorted_rank_df['random_cap'].cumsum()

sorted_reindexed = sorted_rank_df.reset_index()
# Decile 1 is the group with the highest predicted probability
sorted_reindexed['Decile'] = sorted_reindexed.index+1
# Lift: decile cancellation rate relative to the overall cancellation rate
sorted_reindexed['Lift_over_Avg'] = sorted_reindexed['Actual_event_rate'] / (
    sorted_reindexed['N_events'].sum() / sorted_reindexed['count'].sum())


sorted_reindexed


In [None]:
# Two side-by-side charts built from the decile table
fig, axes = plt.subplots(1,2, sharex = True, figsize = (15,5))
rank_ax, gains_ax = axes[0], axes[1]

# Left: actual event rate per decile
sns.lineplot(data=sorted_reindexed, x='Decile', y='Actual_event_rate', color='red', ax=rank_ax)
#sns.barplot(ax=axes[1], x='Decile', y='Lift_over_avg', data=sorted_reindexed, color='green')
# Right: cumulative capture of events (blue) and non-events (black)
sns.lineplot(data=sorted_reindexed, x='Decile', y='cum_event_cap', color='blue', ax=gains_ax)
sns.lineplot(data=sorted_reindexed, x='Decile', y='cum_non_event_cap', color='black', ax=gains_ax)

fig.suptitle('Effectiveness of Deciles based on Model Probabilities')
rank_ax.set_title('Rank Ordering of Actual Event Rate')
#axes[1].set_title('Lift over Mean Event Rate')
gains_ax.set_title('Gains Chart')


In [None]:
# From machine learning to strategy
# Introducing the APT framework
# A-Audience
# P-Prioritization
# T-Treatment

### Audience

In [None]:
# Probability deciles 8 to 10 form the 'Top 3' audience; all lower deciles are 'Bottom 7'
df['Predicted_cancel_Rank'] = np.where(df['P_Rank_gbm'] >= 8, 'Top 3', 'Bottom 7')
df['Predicted_cancel_Rank'].value_counts()

In [None]:
# Bookings that fall in the 'Top 3' predicted-cancellation deciles
df_top3 = df[df['Predicted_cancel_Rank'] == 'Top 3']

In [None]:
df_top3.shape

### Prioritization

In [None]:
df['lead_time_RANK'] = pd.qcut(df['lead_time'].rank(method = 'first').values, 10, duplicates = 'drop').codes+1

In [None]:
df.groupby('lead_time_RANK')['lead_time'].agg(['min','mean', 'max'])

In [None]:
df.lead_time.mean()

In [None]:
# Bookings in lead-time deciles 7 to 10 are labelled 'High Lead Time'; the rest are 'Low Lead Time'
df['lead_time_segment'] = np.where(df['lead_time_RANK'] < 7, 'Low Lead Time', 'High Lead Time')
df['lead_time_segment'].value_counts()

In [None]:
df['adr_RANK'] = pd.qcut(df['adr'].rank(method = 'first').values, 10, duplicates = 'drop').codes+1

In [None]:
df.groupby('adr_RANK')['adr'].agg(['min', 'mean', 'max'])

In [None]:
df.adr.mean()

In [None]:
# Bookings in ADR deciles 7 to 10 are labelled 'High ADR'; the rest are 'Low ADR'
df['adr_segment'] = np.where(df['adr_RANK'] < 7, 'Low ADR', 'High ADR')
df['adr_segment'].value_counts()

In [None]:
pd.crosstab(index=df['adr_segment'], columns=df['lead_time_segment'], values=df['adr'],aggfunc='mean')

In [None]:
pd.crosstab(index=df['adr_segment'], columns=df['lead_time_segment'], values=df['is_canceled'], aggfunc='mean')

### Treatment

In [None]:
service_list =['stays_in_weekend_nights','total_of_special_requests', 'reserved_room_type']

In [None]:
df_top3_services = df_top3[service_list]

In [None]:
# One count plot per service feature for the 'Top 3' audience
for col in df_top3_services:
    plt.figure()
    sns.countplot(data = df_top3_services, x = col)
plt.show()