## Import Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import cross_val_score,train_test_split,cross_val_predict,cross_validate
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [None]:
# Load the raw hotel-booking records into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='hotel_bookings.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

## About the dataset:
This [data article](https://www.sciencedirect.com/science/article/pii/S2352340918315191#s0005) describes a dataset with hotel demand data. One of the hotels (H1) is a resort hotel and the other (H2) is a city hotel. The dataset contains 32 variables describing the 40,060 observations of H1 and the 79,330 observations of H2. Each observation represents a hotel booking. The dataset comprises bookings due to arrive between the 1st of July 2015 and the 31st of August 2017, including bookings that effectively arrived and bookings that were canceled. Since this is real hotel data, all data elements pertaining to hotel or customer identification were deleted. Due to the scarcity of real business data for scientific and educational purposes, this dataset can play an important role in research and education in revenue management, machine learning, and data mining, as well as in other fields.
You can download the Hotel Booking Demand dataset from [Kaggle](https://www.kaggle.com/jessemostipak/hotel-booking-demand).

## Attribute Information:
- **hotel**: Hotel (Resort Hotel, City Hotel)
- **is_canceled**: Value indicating if the booking was canceled (1) or not (0)
- **lead_time**: Number of days that elapsed between the entering date of the booking into the PMS and the arrival date
- **arrival_date_year**: Year of arrival date
- **arrival_date_month**: Month of arrival date
- **arrival_date_week_number**: Week number of year for arrival date
- **arrival_date_day_of_month**: Day of arrival date
- **stays_in_weekend_nights**: Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel
- **stays_in_week_nights**: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel
- **adults**: Number of adults
- **children**: Number of children
- **babies**: Number of babies
- **meal**: Type of meal booked. Categories are presented in standard hospitality meal packages: Undefined/SC – no meal package; BB – Bed & Breakfast; HB – Half board (breakfast and one other meal – usually dinner); FB – Full board (breakfast, lunch and dinner)
- **country**: Country of origin. Categories are represented as ISO 3166-1 alpha-3 country codes (e.g. PRT, GBR)
- **market_segment**: Market segment designation. In categories, the term “TA” means “Travel Agents” and “TO” means “Tour Operators”
- **distribution_channel**: Booking distribution channel. The term “TA” means “Travel Agents” and “TO” means “Tour Operators”
- **is_repeated_guest**: Value indicating if the booking name was from a repeated guest (1) or not (0)
- **previous_cancellations**: Number of previous bookings that were cancelled by the customer prior to the current booking
- **previous_bookings_not_canceled**: Number of previous bookings not cancelled by the customer prior to the current booking
- **reserved_room_type**: Code of room type reserved. Code is presented instead of designation for anonymity reasons.
- **assigned_room_type**: Code for the type of room assigned to the booking. Sometimes the assigned room type differs from the reserved room type due to hotel operation reasons (e.g. overbooking) or by customer request. Code is presented instead of designation for anonymity reasons.
- **booking_changes**: Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation
- **deposit_type**: Indication on if the customer made a deposit to guarantee the booking. This variable can assume three categories: No Deposit – no deposit was made; Non Refund – a deposit was made in the value of the total stay cost; Refundable – a deposit was made with a value under the total cost of stay.
- **agent**: ID of the travel agency that made the booking
- **company**: ID of the company/entity that made the booking or responsible for paying the booking. ID is presented instead of designation for anonymity reasons
- **days_in_waiting_list**: Number of days the booking was in the waiting list before it was confirmed to the customer
- **customer_type**: Type of booking, assuming one of four categories: Contract - when the booking has an allotment or other type of contract associated to it; Group – when the booking is associated to a group; Transient – when the booking is not part of a group or contract, and is not associated to other transient booking; Transient-party – when the booking is transient, but is associated to at least other transient booking
- **adr**: Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights
- **required_car_parking_spaces**: Number of car parking spaces required by the customer
- **total_of_special_requests**: Number of special requests made by the customer (e.g. twin bed or high floor)
- **reservation_status**: Reservation last status, assuming one of three categories: Canceled – booking was canceled by the customer; Check-Out – customer has checked in but already departed; No-Show – customer did not check-in and did inform the hotel of the reason why
- **reservation_status_date**: Date at which the last status was set. This variable can be used in conjunction with the reservation_status to understand when the booking was canceled or when the customer checked out of the hotel

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Summary of the object-dtype (text) columns: count, unique values, most
# frequent value and its frequency.
df.describe(include='object')

## Does the type of payment affect the cancellation of the reservation or not?

In [None]:
# Use the conventional `go` alias. Binding plotly.graph_objects to `plt` would
# shadow matplotlib.pyplot (imported as `plt` at the top of the notebook) and
# break the later plt.figure(...) / plt.pie(...) / plt.show() calls.
import plotly.graph_objects as go

# Number of bookings per deposit type, computed from the data rather than
# typed in by hand. reindex() pins the slice order to `labels` and yields 0
# for any category that does not occur.
labels = ['No Deposit', 'Refundable', 'Non Refund']
values = df['deposit_type'].value_counts().reindex(labels, fill_value=0).tolist()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent',
                             insidetextorientation='radial')])
fig.show()

### Insights:
- 
-



## Does the length of time between the entering date of the booking and the arrival date affect the cancellation?

In [None]:
# Cancellation rate (in percent) for every distinct lead time. is_canceled is
# a 0/1 flag, so its per-group mean is the fraction of cancelled bookings.
# Only that column is aggregated: summing every column of the frame is
# wasteful, and since pandas 2.0 sum() no longer silently skips text columns.
cnl = df.groupby('lead_time')['is_canceled'].mean() * 100
cnl

In [None]:
# Scatter of cancellation rate against lead time with a fitted regression line.
lead_times = cnl.index
cancel_rates = cnl.values
sns.regplot(x=lead_times, y=cancel_rates);

In [None]:
# Insights: Yes, the length of time between the entering date of the booking and the arrival date affects the cancellation.

## Which months have the lowest number of visitors?

In [None]:
# Head count per booking: adults + children + babies.
# A missing value in any of the three columns propagates to the total.
visitors = df['adults'].add(df['children']).add(df['babies'])
df['visitors'] = visitors

In [None]:
# Total visitors per arrival month. The bars are laid out in calendar order;
# without `order` seaborn infers the category order from the data, which makes
# the seasonal pattern hard to read.
# NOTE(review): assumes arrival_date_month holds full English month names.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
plt.figure(figsize=(13, 6))
sns.barplot(data=df, x='arrival_date_month', y='visitors', estimator=sum,
            color='r', order=month_order);

In [None]:
# As we can see, the months with the lowest number of visitors are November, December, and January.
# The month with the highest number of visitors is August.
# April, May, and June are very close to each other in the number of visitors.

## Is the low number of visitors due to an increase in the number of canceled reservations?

In [None]:
# Per-month totals of every numeric column. numeric_only=True keeps the text
# columns out of the sum; since pandas 2.0 they are no longer skipped by default.
s = df.groupby('arrival_date_month').sum(numeric_only=True).reset_index()

In [None]:
# Total visitors per arrival month, split by cancellation flag. `order` puts
# the bars in calendar order instead of the order inferred from the data.
# NOTE(review): assumes arrival_date_month holds full English month names.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
plt.figure(figsize=(13, 6))
sns.barplot(data=df, x='arrival_date_month', y='visitors', hue='is_canceled',
            estimator=sum, color='r', order=month_order);

In [None]:
# As we can see, August has the highest number of canceled reservations.
# November and January have the lowest number of cancellations.

## What are the top 10 countries that visitors come from?

In [None]:
# The ten countries with the most visitors, largest first, as a two-column
# frame (country, visitors). nlargest is used instead of an ascending sort
# followed by .loc[0:10], which would pick the eleven SMALLEST totals.
contry = (
    df.groupby('country')['visitors']
      .sum()
      .nlargest(10)
      .reset_index()
)

In [None]:
# Pie chart of each selected country's share of visitors, labelled with the
# country code and the percentage to two decimals.
fig, ax = plt.subplots(figsize=(11, 11))
ax.pie(x=contry['visitors'], labels=contry['country'], autopct='%2.2f')
plt.show()

In [None]:
# PRT and GBR are the countries that most visitors come from.
# FRA and ESP have almost the same level of visitors.
# NLD, BRA, and BEL have the fewest visitors among the countries shown.

## Relation between booking changes and cancelling the reservation

In [None]:
# Mean number of booking changes for kept vs. cancelled bookings, with one bar
# per customer type.
ax = sns.barplot(data=df, x='is_canceled', y='booking_changes', hue='customer_type')
ax.set_title('Relation between booking changes and cancelling the reservation');

In [None]:
# Transient-Party customers have the highest number of booking changes.
# Group customers have the fewest canceled reservations.

### Insights:
-
-


# Data Preprocessing

In [None]:
# Number of missing values in each column before cleaning.
df.isna().sum()

In [None]:
# Remove whole columns that contain null values; errors="ignore" makes the
# cell safe to re-run after the columns are already gone.
null_columns = ['agent', 'company', 'country']
df = df.drop(columns=null_columns, errors="ignore")

In [None]:
# Discard, in place, the rows whose `children` value is missing.
df.dropna(axis="index", subset=["children"], inplace=True)

In [None]:
# Re-check the per-column missing-value counts after cleaning.
df.isna().sum()

In [None]:
# Remove the remaining categorical / text columns (presumably so that only
# numeric columns are left for the model; confirm with df.dtypes below).
# Each column is listed once ('customer_type' was previously duplicated), and
# errors="ignore" keeps the cell re-runnable.
categorical_columns = [
    'hotel', 'arrival_date_month', 'meal', 'customer_type',
    'market_segment', 'distribution_channel', 'assigned_room_type',
    'reserved_room_type', 'deposit_type', 'reservation_status',
    'reservation_status_date',
]
df = df.drop(columns=categorical_columns, errors="ignore")

In [None]:
# Show the dtype of every remaining column.
df.dtypes

In [None]:
# Target: the cancellation flag. Features: every other remaining column.
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])

### Choose any machine learning algorithm to predict cancellation 
- use train-test split
- use cross-validation split

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

In [None]:
# Hold out 40% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
splits = train_test_split(X, y, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = splits
X_train.shape, y_train.shape

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Linear-kernel SVM on standardised features. SVMs are not scale-invariant,
# so fitting on raw columns with very different ranges is slow to converge
# and lets large-valued columns dominate the decision function. The scaler
# sits inside the pipeline so that it is re-fitted on the training part only
# whenever `clf` is used in cross-validation.
clf = make_pipeline(StandardScaler(), svm.SVC(kernel='linear', C=1))
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf.score(X_test, y_test)

In [None]:
# 5-fold cross-validated accuracy. With no `scoring` argument cross_val_score
# uses the estimator's own score method (mean accuracy for a classifier).
# `scores` has to be computed here: it was previously used before any cell
# defined it.
scores = cross_val_score(clf, X, y, cv=5)
print(f"{scores.mean():0.2f} accuracy with a standard deviation of {scores.std():0.2f}")
print(f"Accuracy: {scores.mean() * 100:.1f} %")

In [None]:
from sklearn import metrics

# Macro-averaged F1 score for each of the 5 cross-validation folds.
scores = cross_val_score(estimator=clf, X=X, y=y, scoring="f1_macro", cv=5)
scores

In [None]:
# Out-of-fold predictions: every row is predicted by a model trained on the
# other folds. `predicted_values` was previously never defined.
predicted_values = cross_val_predict(clf, X, y, cv=5)

# Overall accuracy of those cross-validated predictions.
metrics.accuracy_score(y, predicted_values)