In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the hotel bookings CSV into the DataFrame used by the rest of the notebook.
DATA_PATH = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data.
data.head(5)

In [None]:
# Show the name of every column in the dataset.
data.columns

# Data Exploration

### Hotel types

In [None]:
# Number of rows for each value of `hotel`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(10, 6))
g = sns.countplot(data=data, x="hotel")

### Booking canceled

In [None]:
# Number of rows for each value of `is_canceled`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(10, 6))
g = sns.countplot(data=data, x="is_canceled")

### Days between the entering date of the booking and the arrival date

In [None]:
# Kernel density estimate of the `lead_time` column.
plt.figure(figsize=(15, 10))
g = sns.kdeplot(data["lead_time"])

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of `lead_time`.
data["lead_time"].describe()

### Year of arrival date

In [None]:
# Number of rows for each value of `arrival_date_year`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(10, 6))
g = sns.countplot(data=data, x="arrival_date_year")

### Month of arrival date

In [None]:
# Turn the month names into an ordered categorical so that plots and sorts
# follow calendar order rather than alphabetical order.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
data['arrival_date_month'] = pd.Categorical(
    values=data['arrival_date_month'],
    categories=months,
    ordered=True,
)

In [None]:
# Number of rows for each arrival month.
# No sort is needed: arrival_date_month is an ordered categorical at this point and
# seaborn lays the bars out in category order, so sorting the whole frame was wasted work.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="arrival_date_month")

In [None]:
# Peak in the summer : July and August

### Week number of arrival date

In [None]:
# Number of rows for each value of `arrival_date_week_number`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="arrival_date_week_number")

In [None]:
# Peak between weeks 28 and 34, and week 53 for Winter holidays

### Day of arrival date

In [None]:
# Number of rows for each value of `arrival_date_day_of_month`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="arrival_date_day_of_month")

### Number of weekend nights (Saturday or Sunday)

In [None]:
# Number of rows for each value of `stays_in_weekend_nights`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="stays_in_weekend_nights")

### Number of week nights (Monday to Friday)

In [None]:
# Number of rows for each value of `stays_in_week_nights`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="stays_in_week_nights")

In [None]:
# Far fewer bookings have more than 5 week nights, which corresponds to one week (Monday to Friday)

### Number of days

In [None]:
# Total length of stay: weekend nights plus week nights.
weekend_nights = data['stays_in_weekend_nights']
week_nights = data['stays_in_week_nights']
data['total_days'] = weekend_nights + week_nights

In [None]:
# Number of rows for each value of `total_days`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="total_days")

### Number of adults

In [None]:
# Number of rows for each value of `adults`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="adults")

In [None]:
# Bookings with more than 10 adults are presumably group bookings

### Number of children

In [None]:
# Number of rows for each value of `children`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="children")

In [None]:
# A lot of couples

In [None]:
# Bar plot of `children` (y) against `adults` (x); seaborn aggregates the y values
# for each x value (the mean, by default).
plt.figure(figsize=(15, 6))
g = sns.barplot(data=data, x="adults", y="children", dodge=False)

In [None]:
# Exploring children without parents: number of bookings with no adults and more than one child.
# NOTE(review): `children > 1` leaves out bookings with exactly one child — confirm `> 0` was not intended.
no_adult_mask = (data['adults'] == 0) & (data['children'] > 1)
len(data[no_adult_mask])

In [None]:
# Hotel type of the bookings with no adults and more than one child.
data.loc[(data['adults'] == 0) & (data['children'] > 1), 'hotel'].value_counts()

In [None]:
# Only City Hotels

In [None]:
# Arrival month of the bookings with no adults and more than one child.
data.loc[(data['adults'] == 0) & (data['children'] > 1), 'arrival_date_month'].value_counts()

In [None]:
# Especially in summer and during the winter holidays

In [None]:
# Length of stay of the bookings with no adults and more than one child.
data.loc[(data['adults'] == 0) & (data['children'] > 1), 'total_days'].value_counts()

In [None]:
# Stay 3 or 4 days

In [None]:
# Number of special requests of the bookings with no adults and more than one child.
data.loc[(data['adults'] == 0) & (data['children'] > 1), 'total_of_special_requests'].value_counts()

In [None]:
# Not a lot of special requests

### Number of babies

In [None]:
# Number of rows for each value of `babies`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="babies")

### Type of meal booked

In [None]:
# Number of rows for each value of `meal`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="meal")

In [None]:
# Percentage of all rows for each value of `meal`.
meal_counts = data['meal'].value_counts()
meal_counts / len(data) * 100

In [None]:
# Majority of Bed & Breakfast (77.3%)
# 12.1 % of Half board (breakfast and one other meal – usually dinner)
# 8.9% of No Meal
# 0.6% of Full board (breakfast, lunch and dinner)

### Country of origin

In [None]:
# Number of rows for the 10 most frequent values of `country`, most frequent first.
plt.figure(figsize=(15, 8))
top_countries = data['country'].value_counts().iloc[:10].index
g = sns.countplot(data=data, x='country', order=top_countries)

In [None]:
# Percentage of all rows for each of the 10 most frequent values of `country`.
top_country_counts = data['country'].value_counts().iloc[:10]
top_country_counts / len(data) * 100

In [None]:
# Majority from Portugal, we guess the hotels are in Portugal

### Market segment designation

In [None]:
# Number of rows for each value of `market_segment`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="market_segment")

In [None]:
# Percentage of all rows for each value of `market_segment`.
segment_counts = data['market_segment'].value_counts()
segment_counts / len(data) * 100

In [None]:
# Majority of Online Travel Agent

### Booking distribution channel

In [None]:
# Number of rows for each value of `distribution_channel`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="distribution_channel")

In [None]:
# Percentage of all rows for each of the 5 most frequent values of `distribution_channel`.
top_channel_counts = data['distribution_channel'].value_counts().nlargest(5)
top_channel_counts / len(data) * 100

In [None]:
# 82% of Travel Agent / Tour Operators

### Repeated guest

In [None]:
# Number of rows for each value of `is_repeated_guest`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="is_repeated_guest")

### Number of previous bookings that were cancelled by the customer prior to the current booking

In [None]:
# Number of rows for each value of `previous_cancellations`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="previous_cancellations")

### Number of previous bookings not cancelled by the customer prior to the current booking

In [None]:
# Percentage of all rows for each of the 10 most frequent values of `previous_bookings_not_canceled`.
top_not_canceled_counts = data['previous_bookings_not_canceled'].value_counts().nlargest(10)
top_not_canceled_counts / len(data) * 100

### Code of room type reserved

In [None]:
# Number of rows for each value of `reserved_room_type`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="reserved_room_type")

### Code for the type of room assigned to the booking

In [None]:
# Number of rows for each value of `assigned_room_type`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="assigned_room_type")

### Number of changes/amendments made to the booking 

In [None]:
# Number of rows for each value of `booking_changes`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="booking_changes")

### Indication on if the customer made a deposit to guarantee the booking

In [None]:
# Number of rows for each value of `deposit_type`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="deposit_type")

In [None]:
# The majority did not make a deposit

### ID of the travel agency that made the booking

In [None]:
# Percentage of all rows for each of the 5 most frequent values of `agent`.
# value_counts skips missing values, while the denominator is the full row count.
top_agent_counts = data['agent'].value_counts().nlargest(5)
top_agent_counts / len(data) * 100

### ID of the company/entity that made the booking

In [None]:
# Percentage of all rows for each of the 5 most frequent values of `company`.
# value_counts skips missing values, while the denominator is the full row count.
top_company_counts = data['company'].value_counts().nlargest(5)
top_company_counts / len(data) * 100

### Number of days the booking was in the waiting list before it was confirmed to the customer

In [None]:
# Percentage of all rows for each of the 5 most frequent values of `days_in_waiting_list`.
top_waiting_counts = data['days_in_waiting_list'].value_counts().nlargest(5)
top_waiting_counts / len(data) * 100

In [None]:
# 97% of the time, the booking is confirmed the same day

### Type of booking

In [None]:
# Number of rows for each value of `customer_type`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="customer_type")

In [None]:
# Majority of Transient (when the booking is not part of a group or contract, and is not associated to other transient booking)

### Average Daily Rate

In [None]:
# Kernel density estimate of the `adr` column.
plt.figure(figsize=(15, 10))
g = sns.kdeplot(data["adr"])

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of `adr`.
data["adr"].describe()

### Number of car parking spaces required by the customer

In [None]:
# Number of rows for each value of `required_car_parking_spaces`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="required_car_parking_spaces")

### Number of special requests made by the customer (e.g. twin bed or high floor)

In [None]:
# Number of rows for each value of `total_of_special_requests`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="total_of_special_requests")

### Reservation last status

In [None]:
# Number of rows for each value of `reservation_status`.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
plt.figure(figsize=(15, 6))
g = sns.countplot(data=data, x="reservation_status")

# Data Featuring

In [None]:
# Preview the first five rows before the feature transformations.
data.head(5)

In [None]:
# lead_time is right-skewed; it will be rescaled with min-max scaling

# Rescale feature columns
# Models perform better when values are close to normally distributed
# NOTE(review): MinMaxScaler only maps each column linearly onto the [0, 1] range;
# it does not change the shape of the distribution, so the skew itself remains.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [None]:
# Rescale lead_time and adr to the [0, 1] range.
# Both columns are fitted in one call so the scaler keeps the min/max of each of them;
# refitting the same scaler once per column would retain only the last column's parameters.
# MinMaxScaler scales every feature independently, so the values equal per-column fitting.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any train/test split.
scaled_cols = ['lead_time', 'adr']
scaled_values = scaler.fit_transform(data[scaled_cols])
# Assign 1-D slices so each column receives a flat array rather than an (n, 1) matrix.
data['lead_time'] = scaled_values[:, 0]
data['adr'] = scaled_values[:, 1]

In [None]:
# Replace each categorical column with its integer category codes.
# Missing values get the code -1 (pandas' convention for `cat.codes`).
# An ordered categorical such as arrival_date_month keeps its category order in the codes.
categorical_cols = [
    'arrival_date_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
    'reservation_status',
    'hotel',
]
for col in categorical_cols:
    data[col] = data[col].astype('category').cat.codes

In [None]:
# Count the missing values in each column (this cell only inspects; it does not fill anything)
data.isnull().sum(axis = 0)

In [None]:
# Replace missing values with 0 in the columns that still contain NaN
# (children: the 4 nan become 0; agent and company: missing IDs become 0).
for na_col in ('children', 'agent', 'company'):
    data[na_col] = data[na_col].fillna(0)

In [None]:
# Preview the first five rows after scaling, encoding and filling.
data.head(5)

# Correlation

In [None]:
# Drop the columns considered unimportant for the model.
unimportant_cols = ["arrival_date_year", "reservation_status_date"]
data = data.drop(columns=unimportant_cols)

In [None]:
# Keep only the columns whose absolute correlation with is_canceled exceeds the threshold.
# is_canceled correlates 1.0 with itself, so the target column stays in the frame.
CORR_THRESHOLD = 0.1
data_corr = data.corr()['is_canceled']
cols = data_corr[abs(data_corr) > CORR_THRESHOLD].index.tolist()
data = data[cols]

In [None]:
# Heatmap of the pairwise correlations between the remaining columns.
data_corr = data.corr()
plt.figure(figsize=(10, 8))
corr_labels = data_corr.columns
corr_palette = sns.diverging_palette(220, 20, n=200)
sns.heatmap(data_corr, xticklabels=corr_labels, yticklabels=corr_labels, cmap=corr_palette)

In [None]:
# Correlation of every remaining column with is_canceled, strongest positive first.
target_corr = data.corr()['is_canceled']
target_corr.sort_values(ascending=False)

In [None]:
# Too much correlation with the target: drop reservation_status.
# NOTE(review): presumably this column records the booking outcome, so keeping it
# would leak the label into the features — confirm.
# `columns=` is used because DataFrame.drop no longer accepts `axis` positionally (pandas 2.0).
data = data.drop(columns='reservation_status')

# Creating the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Split the frame into the feature matrix X and the target vector Y.
target_col = "is_canceled"
Y = data[target_col]
X = data.drop(columns=target_col)

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the split is the same on every run.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Logistic Regression
# max_iter is raised to 1000 — presumably so the solver has enough iterations to converge.
log = LogisticRegression(max_iter=1000)
log.fit(X_train, Y_train)
Y_pred_log = log.predict(X_test)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric so the value is unchanged,
# but the documented order avoids errors if the metric is ever swapped for an asymmetric one.
acc_log = accuracy_score(Y_test, Y_pred_log)
acc_log

In [None]:
# Decision tree, tuned with a 10-fold cross-validated grid search.
# random_state is pinned so the fitted tree and the reported accuracy are reproducible.
t = tree.DecisionTreeClassifier(random_state=0)

# search the best params
# Keep this a plain dict (no trailing comma): a 1-tuple wrapping the dict is rejected by
# recent scikit-learn versions, which validate `param_grid` as a dict or a list of dicts.
grid = {'min_samples_split': [5, 10, 20, 50, 100]}

clf_tree = GridSearchCV(t, grid, cv=10)
clf_tree.fit(X_train, Y_train)

# GridSearchCV predicts with the best estimator refitted on the whole training set.
Y_pred_tree = clf_tree.predict(X_test)

# get the accuracy score; accuracy_score expects (y_true, y_pred)
acc_tree = accuracy_score(Y_test, Y_pred_tree)
print(acc_tree)

In [None]:
# Hyper-parameter combination selected by the decision-tree grid search.
clf_tree.best_params_

In [None]:
# Random forest, tuned with a 10-fold cross-validated grid search.
# random_state is pinned: the forest is randomised, so without a seed the accuracy
# (and possibly the selected parameters) changes from run to run.
rf = RandomForestClassifier(random_state=0)

# search the best params
grid = {'n_estimators': [100, 200], 'max_depth': [2, 5, 10]}

clf_rf = GridSearchCV(rf, grid, cv=10)
clf_rf.fit(X_train, Y_train)

# GridSearchCV predicts with the best estimator refitted on the whole training set.
Y_pred_rf = clf_rf.predict(X_test)
# get the accuracy score; accuracy_score expects (y_true, y_pred)
acc_rf = accuracy_score(Y_test, Y_pred_rf)
print(acc_rf)

In [None]:
# Hyper-parameter combination selected by the random-forest grid search.
clf_rf.best_params_

# Conclusion

In [None]:
# The best model is Decision Tree 