<a href="https://colab.research.google.com/github/SrishBansal/Hotel__advertising_genai/blob/main/Hotel_Booking_Demand_Analysis_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read() call.
CHUNK_SIZE = 40960
# Comma-separated entries of the form '<directory>:<percent-encoded download URL>'.
# NOTE(review): the URL embeds X-Goog-Date=20240921T093605Z and
# X-Goog-Expires=259200, so this signed link has presumably expired — confirm
# and regenerate it before relying on this cell.
DATA_SOURCE_MAPPING = 'hotel-booking-demand:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F511638%2F944030%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240921%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240921T093605Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da9741a96293d71ba82de4c6cd2f39f556cff0c32bf545b18ad390bfb43eef87faf6e170eaf149948ee30a3b3a5ac8c047e11a5d94572da98ad3b986ced32540f9f57dae64a141e3bb56cc4b7270b2f8be5b074828fc57e21d50f89d6b2a8ec509c46f6054e3fc8662cffc708eedef364a8deecfaf9d91e3defc235fcd93690761777dd2456d77f233a32d6f867547404c67766d84ce905d36781118726f08936dc05ea1d03b2e1ee824df6056481503176a531e91a98ae2625f036e9e1568f389ab95c3a67a3c5a965837eba0f30707280e70df92818ed238a91f6f6471260da5448c37bbe2fdfb69fea081f3d87bee990d3809c0bfbf92f6a55d7a2e12de0db'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. A source that fails to download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only: everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; without a total the bar stays empty
            # and only the byte counter advances.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing 'tarfile' would
                # replace the module and break the next non-zip source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# Hotel Booking Demand Analysis and Prediction Project #

This project aims to analyze and predict hotel booking cancellations using a comprehensive dataset. I will perform extensive exploratory data analysis (EDA) to understand booking patterns, build a predictive model for cancellations, and evaluate the importance of various features in predicting cancellations.

The project is divided into three main sections:
1. Exploratory Data Analysis (EDA)
2. Predicting Cancellations
3. Evaluate Feature Importance

### 1. Exploratory Data Analysis (EDA) ###

In this section, we will answer the following questions using visualizations created with Plotly:
- Where do the guests come from?
- How much do guests pay for a room per night?
- How does the price per night vary over the year?
- Which are the busiest months?
- How long do people stay at the hotels?
- Bookings by market segment
- How many bookings were canceled?
- Which month has the highest number of cancellations?

#### Load the data and clean the data ####

In [None]:
# Install the shap package, which the import cell below imports.
!pip install shap

In [None]:
# Install the eli5 package, which the feature-importance cell imports.
!pip install eli5

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, confusion_matrix
import shap

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw bookings table. A CSV next to the notebook takes precedence;
# otherwise fall back to the folder the Kaggle import cell extracts the
# 'hotel-booking-demand' archive into.
# NOTE(review): assumes the archive contains 'hotel_bookings.csv' at its top level — confirm.
from pathlib import Path

csv_path = Path('hotel_bookings.csv')
if not csv_path.exists():
    csv_path = Path('/kaggle/input/hotel-booking-demand/hotel_bookings.csv')

df = pd.read_csv(csv_path)
df

In [None]:
# Print the column names, non-null counts and dtypes of the raw table.
df.info()

In [None]:
# Number of missing values in each column of the raw table.
null_counts = df.isna().sum()
print(f'Columns with null values:\n{null_counts}')

Replace missing values:
- agent: If no agency is specified, the booking was probably made independently.
- company: If no company is given, it was most likely a private booking.

In [None]:
# Per-column fill values for missing data. The keys must match the column
# names exactly; a stray character makes fillna skip that column.
nan_replacements = {"children": 0, "country": "Unknown", "agent": 0, "company": 0}
# fillna returns a new frame, so the raw `df` is left unchanged.
df_clean = df.fillna(nan_replacements)

Drop rows that contain 0 adults, 0 children and 0 babies.

In [None]:
# Total head count per booking; rows where it is exactly zero are removed in place.
guest_total = df_clean['adults'] + df_clean['children'] + df_clean['babies']
empty_booking_index = df_clean[guest_total == 0].index
df_clean.drop(index=empty_booking_index, inplace=True)

The "meal" column contains values of "Undefined", which are equivalent to SC.

In [None]:
# Map the 'Undefined' meal code to 'SC'. The result is assigned back to the
# column: an in-place replace on a column selection may act on a temporary
# object and leave df_clean unchanged.
df_clean['meal'] = df_clean['meal'].replace('Undefined', 'SC')

In [None]:
# Display the cleaned table.
df_clean

In [None]:
# (rows, columns) of the cleaned table.
df_clean.shape

In [None]:
# Summary statistics of the cleaned table.
df_clean.describe()

#### Where do the guests come from? ####

To understand the origin of the guests, we will create a choropleth map showing the number of guests from different countries.

In [None]:
# Number of bookings per value of the 'country' column (the values become the index).
country_counts = df_clean['country'].value_counts()

# World map coloured by the booking count of each country.
# NOTE(review): presumably the country values are ISO-3 codes, which Plotly's
# default location mode expects — confirm.
fig = px.choropleth(country_counts, locations=country_counts.index, color=country_counts.values,
                    title="Where do the guests come from?",
                    labels={'color':'Number of Guests'})
fig.show()

The hotels' guests come from all over the world.

Most guests are from:
- Portugal
- UK
- France
- Spain
- Germany

Separate Datasets for Resort and City Hotels

In [None]:
# One frame per hotel type, selected by the value of the 'hotel' column.
resort_hotel = df_clean.loc[df_clean['hotel'].eq('Resort Hotel')]
city_hotel = df_clean.loc[df_clean['hotel'].eq('City Hotel')]

In [None]:
# Bookings per country, Resort Hotel rows only.
country_counts_resort = resort_hotel['country'].value_counts()

# World map coloured by the Resort Hotel booking count of each country.
fig = px.choropleth(country_counts_resort, locations=country_counts_resort.index, color=country_counts_resort.values,
                    title="Where do the guests come from to Resort Hotel?",
                    labels={'color':'Number of Guests'})
fig.show()

Most guests who visit the Resort Hotel are from Portugal, the UK and Spain.

In [None]:
# Bookings per country, City Hotel rows only.
country_counts_city = city_hotel['country'].value_counts()

# World map coloured by the City Hotel booking count of each country.
fig = px.choropleth(country_counts_city, locations=country_counts_city.index, color=country_counts_city.values,
                    title="Where do the guests come from to City Hotel?",
                    labels={'color':'Number of Guests'})
fig.show()

Most guests who visit the City Hotel are from:
- Portugal
- France
- Germany
- UK
- Spain

#### How much do guests pay for a room per night? ####

Plot the distribution of the average daily rate (ADR) to understand the price guests pay per night.

Counting adults and children as paying guests only

In [None]:
# Counting adults and children as paying guests only
df['paying_guests'] = df['adults'] + df['children']

# Filter out rows where there are no paying guests
df_clean = df[df['paying_guests'] > 0]

resort_hotel = df_clean[df_clean['hotel'] == 'Resort Hotel']
city_hotel = df_clean[df_clean['hotel'] == 'City Hotel']

In [None]:
# Combine both datasets for plotting
# Label each per-hotel frame, then stack them into one table for plotting.
for hotel_frame, hotel_label in ((resort_hotel, 'Resort Hotel'),
                                 (city_hotel, 'City Hotel')):
    hotel_frame['hotel_type'] = hotel_label
combined_df = pd.concat([resort_hotel, city_hotel])

# One box per room type and hotel type, showing the spread of 'adr'.
box_axis_labels = {
    'reserved_room_type': 'Room Type',
    'adr': 'Price [EUR]',
    'hotel_type': 'Hotel Type',
}
fig = px.box(
    combined_df,
    x='reserved_room_type',
    y='adr',
    color='hotel_type',
    title='Room Prices by Room Type for Resort and City Hotels',
    labels=box_axis_labels,
)

# Restrict the y-axis to 0-500.
fig.update_layout(yaxis_range=[0, 500])

fig.show()

The box plot shows the distribution of room prices for each room type: the median, the interquartile range and the outliers.

**Detailed Insights**
- Room Type C:
    - Resort Hotel: The prices for room type C in resort hotels are relatively high, with a median price around 200 EUR. The interquartile range (IQR) indicates variability, and there are several outliers reaching up to 400 EUR.
    - City Hotel: For city hotels, room type C has a lower median price, around 100 EUR, and also shows some variability and outliers.
- Room Type A:
    - Resort Hotel: The median price is around 100 EUR, with a wide IQR and several outliers indicating higher price variability.
    - City Hotel: Similar to resort hotels, but with a slightly higher median price around 120 EUR, and numerous outliers reaching above 200 EUR.
- Room Type D:
    - Resort Hotel: Room type D in resort hotels shows a median price around 150 EUR, with a wide IQR and significant price variability.
    - City Hotel: The median price is slightly lower than in resort hotels, around 120 EUR, with several outliers.
- Room Type E:
    - Resort Hotel: Room type E has a median price around 150 EUR, with a wider IQR and several outliers.
    - City Hotel: Similar to resort hotels but with a lower median price, around 120 EUR, and fewer outliers.
- Room Type F:
    - Resort Hotel: The median price is around 100 EUR, with a wide IQR and many outliers indicating price variability.
    - City Hotel: The prices for room type F in city hotels are higher, with a median price around 150 EUR and many outliers.
- Room Type G:
    - Resort Hotel: The prices show a median of around 200 EUR with a wide IQR and several outliers.
    - City Hotel: Lower median price around 150 EUR with a wide IQR and many outliers.
- Room Type H:
    - Resort Hotel: The prices are higher, with a median around 200 EUR and a very wide IQR, indicating high variability.
    - City Hotel: Not available.
- Room Type L:
    - Resort Hotel: Lower prices with a median around 100 EUR and fewer outliers.
    - City Hotel: Not available.
- Room Type B:
    - Resort Hotel: Lower prices with a median around 100 EUR and a narrower IQR.
    - City Hotel: Higher variability, with a median price around 120 EUR and several outliers.

**Key Takeaways:**
- Price Variability: Resort hotels generally exhibit higher price variability compared to city hotels, as seen by the wider IQRs and the presence of numerous outliers.
- Median Prices: Resort hotels tend to have higher median prices for most room types compared to city hotels.
- Room Type Specifics: Certain room types (e.g., C, G, and H) in resort hotels command higher prices compared to other types, possibly reflecting better amenities or views associated with these room types.

#### How does the price per night vary over the year? ####

Plot showing the monthly variation in ADR to observe any seasonal trends in pricing.

In [None]:
# Calendar order used for the categorical month column and for the x-axis ticks.
ordered_months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

# Average daily rate divided by the number of paying guests of the booking.
df_clean['adr_pp'] = df_clean['adr'] / df_clean['paying_guests']

price_columns = ["hotel", "arrival_date_month", "adr_pp"]
room_price_monthly = df_clean[price_columns].sort_values("arrival_date_month")

# An ordered categorical makes the grouped result follow calendar order.
room_price_monthly['arrival_date_month'] = pd.Categorical(
    room_price_monthly["arrival_date_month"],
    categories=ordered_months,
    ordered=True)

# Mean per-person rate for every (hotel, month) pair.
monthly_adr = (
    room_price_monthly
    .groupby(['hotel', 'arrival_date_month'])['adr_pp']
    .mean()
    .reset_index()
)

line_labels = {'adr_pp':'Average Daily Rate per Person (ADR_PP)', 'arrival_date_month':'Month'}
fig = px.line(
    monthly_adr,
    x='arrival_date_month',
    y='adr_pp',
    color='hotel',
    title='Monthly ADR Variation per Person for Resort and City Hotels',
    labels=line_labels,
)

# Show one tick per month, labelled with the month name.
fig.update_layout(
    xaxis_tickmode='array',
    xaxis_tickvals=ordered_months,
    xaxis_ticktext=ordered_months,
)

fig.show()

**Seasonal Trends:**
- The line plot shows that prices at the Resort hotel are much higher during the summer (June to August).
- The peak for Resort hotels in July is more pronounced compared to City hotels, indicating a higher demand or increased pricing strategy during peak vacation season.

#### Which are the busiest months? ####

In [None]:
# Bookings per arrival month for each hotel.
resort_guests_monthly = resort_hotel.groupby("arrival_date_month")["hotel"].count()
city_guests_monthly = city_hotel.groupby("arrival_date_month")["hotel"].count()

resort_guest_data = pd.DataFrame({"month": list(resort_guests_monthly.index),
                    "hotel": "Resort hotel",
                    "guests": list(resort_guests_monthly.values)})

city_guest_data = pd.DataFrame({"month": list(city_guests_monthly.index),
                    "hotel": "City hotel",
                    "guests": list(city_guests_monthly.values)})

full_guest_data = pd.concat([resort_guest_data,city_guest_data], ignore_index=True)

# order by month:
ordered_months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]
full_guest_data["month"] = pd.Categorical(full_guest_data['month'], categories=ordered_months, ordered=True)

# Dataset contains July and August data from 3 years, the other months from 2 years.
# Normalize to a per-year figure. The whole column is replaced in one step so
# that float results are never written into an integer column piecewise.
years_covered = np.where(full_guest_data["month"].isin(["July", "August"]), 3, 2)
full_guest_data["guests"] = full_guest_data["guests"] / years_covered

full_guest_data = full_guest_data.sort_values("month")

fig = px.line(
    full_guest_data,
    x='month',
    y='guests',
    color='hotel',
    title='Monthly Bookings',
    labels={'month':'Month', 'guests':'Number of guests'}
)

fig.show()

**Peak Seasons:**
- From June to September, guest numbers for the Resort hotel go down slightly.
- City hotels see high bookings in spring and autumn, indicating different peak times.
- Both hotels have the fewest number of guests in November, December and January

#### How long do people stay at the hotels? ####

In [None]:
# Total nights per booking = weekend nights + week nights, for both hotels.
for hotel_frame in (resort_hotel, city_hotel):
    hotel_frame['total_nights'] = (
        hotel_frame['stays_in_weekend_nights'] + hotel_frame['stays_in_week_nights']
    )

# How often each stay length occurs, and its share of all bookings in percent.
resort_night_counts = resort_hotel["total_nights"].value_counts()
number_nights_reseort = list(resort_night_counts.index)
number_bookings_resort = list(resort_night_counts)
rel_bookings_resort = resort_night_counts / sum(number_bookings_resort) * 100

city_night_counts = city_hotel["total_nights"].value_counts()
number_nights_city = list(city_night_counts.index)
number_bookings_city = list(city_night_counts)
rel_bookings_city = city_night_counts / sum(number_bookings_city) * 100

resort_nights = pd.DataFrame({"hotel": "Resort hotel",
                              "num_nights": number_nights_reseort,
                              "rel_num_bookings": rel_bookings_resort})
city_nights = pd.DataFrame({"hotel": "City hotel",
                            "num_nights": number_nights_city,
                            "rel_num_bookings": rel_bookings_city})

total_nights = pd.concat([resort_nights, city_nights], ignore_index=True)

# Grouped bars: share of bookings for each stay length, one colour per hotel.
stay_labels = {'num_nights':'Number of nights', 'rel_num_bookings':'Guests [%]'}
fig = px.bar(
    total_nights,
    x='num_nights',
    y='rel_num_bookings',
    color='hotel',
    title='Length of stay',
    labels=stay_labels,
    barmode='group',
)

# Restrict the x-axis to 0-20 nights.
fig.update_layout(xaxis_range=[0, 20])

fig.show()

**Length of stay trends:**
- City Hotel guests prefer to stay for 1-4 nights.
- Resort Hotel guests also prefer to stay for 1-4 nights, but 7-night stays stand out as being very popular too.

Let's calculate the average number of nights that guests stay at each hotel.

In [None]:
# Weighted mean of the stay length: each length times its share of bookings,
# summed and rounded to the nearest whole night.
resort_weighted_nights = resort_nights["num_nights"] * (resort_nights["rel_num_bookings"] / 100)
city_weighted_nights = city_nights["num_nights"] * (city_nights["rel_num_bookings"] / 100)

avg_night_resort = round(sum(resort_weighted_nights.values))
avg_night_city = round(sum(city_weighted_nights.values))

print(
    f'Average nights that guests stays in Resort Hotel is {avg_night_resort}\n'
    f'Average nights that guests stays in City Hotel is {avg_night_city}'
)

#### Bookings by market segments ####

In [None]:
# Number of bookings per market segment.
market_segments = df_clean['market_segment'].value_counts()

# Pie chart with one slice per market segment, sized by its booking count.
fig = px.pie(
    values=market_segments.values,
    names=market_segments.index,
    title='Bookings by Market Segment'
)

fig.show()

The most popular market segments are Online TA, Offline TA/TO and Groups.

In [None]:
# Bar chart of the per-person rate (adr_pp) for each market segment, split by
# reserved room type.
# NOTE(review): `ci` and `errwidth` are deprecated in recent seaborn releases
# (replaced by `errorbar` and `err_kws`) — confirm the installed version still
# accepts them before upgrading.
plt.figure(figsize=(12, 8))
sns.barplot(x="market_segment",
            y="adr_pp",
            hue="reserved_room_type",
            data=df_clean,
            ci="sd",
            errwidth=1,
            capsize=0.1)
plt.title("Price per night by market segment and room type", fontsize=16)
plt.xlabel("Market segment", fontsize=16)
# Rotate the segment names so they do not overlap.
plt.xticks(rotation=45)
plt.ylabel("Price per night per person in EUR", fontsize=16)
plt.legend(loc="upper left")
plt.show()

- On average, Groups get the best prices per night.
- Aviation has the highest price.

Why does Aviation have the highest price (approximately twice as much)?

In [None]:
# Columns summarised for the Aviation / non-Aviation comparison.
segment_stat_columns = ['is_canceled', 'adults', 'lead_time', 'adr_pp']
is_aviation = df_clean['market_segment'] == 'Aviation'

# Summary statistics for Aviation bookings and for all other bookings.
aviation = df_clean.loc[is_aviation, segment_stat_columns].describe()
non_aviation = df_clean.loc[~is_aviation, segment_stat_columns].describe()

In [None]:
# Display the summary statistics of the Aviation segment.
aviation

In [None]:
# Display the summary statistics of all other segments.
non_aviation

Why Aviation has the highest price (approximately twice as much):
- We can see a big difference in lead time between Aviation and non-Aviation bookings: 4 days vs 104 days.
- Airlines need rooms for their crews, and they usually book one room per person.

#### How many bookings were canceled? ####

In [None]:
# Number of bookings for each value of the is_canceled column.
canceled_counts = df_clean['is_canceled'].value_counts()
# One bar per is_canceled value, with the booking count as its height.
fig = px.bar(
    canceled_counts,
    x=canceled_counts.index,
    y=canceled_counts.values,
    title='Cancellations vs Non-Cancellations',
    labels={'x':'Is Canceled', 'y':'Number of Bookings'}
)
fig.show()

In [None]:
# Cancelled bookings and their share in percent (2 decimals), per hotel and overall.
resort_rows = df_clean.loc[df_clean['hotel'] == 'Resort Hotel']
resort_canceled = resort_rows['is_canceled'].sum()
resort_canceled_percent = round(resort_canceled / resort_rows.shape[0] * 100, 2)

city_rows = df_clean.loc[df_clean['hotel'] == 'City Hotel']
city_canceled = city_rows['is_canceled'].sum()
city_canceled_percent = round(city_canceled / city_rows.shape[0] * 100, 2)

total_canceled = df_clean['is_canceled'].sum()
total_canceled_percent = round(total_canceled / df_clean.shape[0] * 100, 2)

print(f"Total booking canceled: {total_canceled}, {total_canceled_percent} %")
print(f"Resort Hotel canceled: {resort_canceled}, {resort_canceled_percent}%")
print(f"City Hotel canceled: {city_canceled}, {city_canceled_percent}%")

City Hotels have a significantly higher cancellation rate compared to Resort Hotels. This could be influenced by the nature of the stays—business versus leisure—or other factors like location flexibility or booking policies.

#### Cancellation by month ####

In [None]:
# Group each hotel's bookings by arrival month once, then derive the number of
# bookings and the number of cancellations from the same grouping.
resort_by_month = df_clean.loc[df_clean['hotel'] == 'Resort Hotel'].groupby('arrival_date_month')
resort_booking_per_month = resort_by_month['hotel'].count()
resort_cancel_per_month = resort_by_month['is_canceled'].sum()

city_by_month = df_clean.loc[df_clean['hotel'] == 'City Hotel'].groupby('arrival_date_month')
city_booking_pere_month = city_by_month['hotel'].count()
city_cancel_per_month = city_by_month['is_canceled'].sum()

resort_cancel_data = pd.DataFrame({
    "Hotel": "Resort Hotel",
    "Month": list(resort_booking_per_month.index),
    "Bookings": list(resort_booking_per_month.values),
    "Cancelations": list(resort_cancel_per_month.values),
})

city_cancel_data = pd.DataFrame({
    "Hotel": "City Hotel",
    "Month": list(city_booking_pere_month.index),
    "Bookings": list(city_booking_pere_month.values),
    "Cancelations": list(city_cancel_per_month.values),
})

# Stack both hotels and express cancellations as a percentage of bookings.
canceled_df = pd.concat([resort_cancel_data, city_cancel_data], ignore_index=True)
canceled_df['Canceled_percent'] = canceled_df['Cancelations'] / canceled_df['Bookings'] * 100

# Sort the rows in calendar order rather than alphabetically.
ordered_months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]
canceled_df['Month'] = pd.Categorical(canceled_df['Month'], categories=ordered_months, ordered=True)
canceled_df = canceled_df.sort_values('Month')

In [None]:
# Grouped bars: cancellation percentage per month, one colour per hotel.
fig = px.bar(
    canceled_df,
    x='Month',
    y='Canceled_percent',
    color='Hotel',
    title='Cancelation per month by hotels',
    labels={'Month':'Month', 'Canceled_percent':'Cancelations [%]'},
    barmode='group'
)

fig.show()

**City Hotels:**
- Higher Cancellation Rates: Across almost every month, City Hotels consistently have higher cancellation rates compared to Resort Hotels, often exceeding 40%.
- Peak Cancellation Months:
    - April, May, June, and October stand out with cancellation rates around or above 45%.
    - September, October and December also see high cancellation rates, consistently close to 40%.
- Lowest Cancellations: The lowest cancellation rates in City Hotels are observed in March (around 37%).

**Resort Hotels:**
- Lower Cancellation Rates: Resort Hotels show significantly lower cancellation rates across the board, generally staying between 15% and 35%.
- Monthly Variations:
    - Summer months and September have relatively higher cancellation rates, close to 30% to 35%.
    - January and November have the lowest cancellation rates, especially in January, which drops below 20%.

**Overall Trends:**
- City Hotels experience a much higher rate of cancellations across all months compared to Resort Hotels, which suggests different booking behaviors between the two types of hotels.
- The higher cancellation rates in City Hotels may be attributed to factors such as business travel, where plans are more likely to change, or the availability of multiple accommodation options in urban areas.

**Seasonal Patterns:**
- For Resort Hotels, cancellation rates are higher during the summer months, possibly due to the higher volume of bookings during these peak vacation periods. However, they are still significantly lower than those of City Hotels.
- City Hotels see consistently high cancellation rates throughout the year, but they peak during transitional months like April, June, and October, which might correspond to changes in travel plans or business schedules.

### 2. Predicting Cancellations ###

In [None]:
# Pairwise correlations between the numeric columns only. The table also holds
# text columns (hotel, meal, country, ...), which cannot be correlated and make
# a plain .corr() call fail on current pandas releases.
is_cancel_corr = df_clean.select_dtypes(include='number').corr()

# Heat map of the correlation matrix, enlarged so the labels stay readable.
fig = px.imshow(is_cancel_corr)
fig.update_layout(
    width=1000,
    height=800
)
fig.show()

This correlation matrix shows that *is_canceled* has a weak positive correlation with *lead_time* and weak negative correlations with *total_of_special_requests*, *required_car_parking_spaces*, and *booking_changes*.

Let's look at a non-numeric feature: reservation_status

In [None]:
# Count the reservation_status values within each is_canceled group.
df_clean.groupby('is_canceled')['reservation_status'].value_counts()

Select columns to include manually. Certain columns are excluded to enhance the model's generalizability and avoid leakage (such as arrival_date_year, assigned_room_type, booking_changes, reservation_status, country, and days_in_waiting_list). While including the country column might improve accuracy, it could also reduce the model's general applicability

In [None]:
# Numeric model inputs.
numerical_features = ['lead_time', 'arrival_date_week_number','arrival_date_day_of_month',
                    'stays_in_weekend_nights', 'stays_in_week_nights', 'adults',
                    'children', 'babies', 'is_repeated_guest', 'previous_cancellations',
                    'previous_bookings_not_canceled', 'agent', 'company',
                    'required_car_parking_spaces', 'total_of_special_requests', 'adr']

# Categorical model inputs (one-hot encoded by the preprocessing pipeline).
categorical_features = ['hotel', 'arrival_date_month', 'meal', 'market_segment',
                        'distribution_channel', 'reserved_room_type', 'deposit_type',
                        'customer_type']
features = numerical_features + categorical_features

# Split the data into inputs and target. X is restricted to the selected
# features so that excluded columns such as reservation_status can never reach
# a model, whatever the downstream preprocessing passes through.
X = df_clean[features]
y = df_clean['is_canceled']

Preprocess features:
- For numerical features:
    - Imputation: Missing values are replaced with the column mean.
    - Scaling: Features are standardized (zero mean, unit variance).
- For categorical (non numerical) features:
    - Imputation: Missing values in categorical features are replaced with the constant value "Unknown".
    - One-Hot Encoding: Categorical features are transformed into a binary matrix, with new columns created for each unique category. Any unknown categories encountered during transformation are ignored.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Categorical branch: fill gaps with the literal "Unknown", then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
]
categorial_transformer = Pipeline(steps=categorical_steps)

# Numerical branch: fill gaps with the column mean, then standardize.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_transformer = Pipeline(steps=numerical_steps)

# Route each feature group to its branch. The categorical branch is listed
# first, followed by the numerical one.
preprocessor = ColumnTransformer(transformers=[
    ("cat", categorial_transformer, categorical_features),
    ("num", num_transformer, numerical_features),
])

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score

# Candidate classifiers, each created with its library-default settings.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier()
}

# Shuffled 4-fold split with a fixed seed, shared by all models.
kfolds = 4
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Accuracy summary per model name, kept so later cells can compare models
# without re-running the cross-validation.
results = {}
for model_name, model in models.items():
    # Preprocessing is part of the pipeline, so it is fitted inside each fold.
    model_steps = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])
    cv_results = cross_val_score(model_steps,
                                 X,
                                 y,
                                 cv=split,
                                 scoring="accuracy",
                                 n_jobs=-1)
    # output:
    min_score = round(min(cv_results), 4)
    max_score = round(max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    results[model_name] = {
        'mean': mean_score,
        'std': std_dev,
        'min': min_score,
        'max': max_score,
    }
    print(f'{model_name} cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}')

#### Summary: ####
- **Best Performer:** Random Forest, with the highest average accuracy (86.71%) and stable performance.
- **Most Stable:** Gradient Boosting, with the smallest standard deviation (0.0010).
- **Overall:** All models have shown good performance, but Random Forest and XGBoost stand out in terms of accuracy.

In [None]:
# Define the enhanced Random Forest model with the best parameters
# Random Forest with hand-set hyperparameters and a fixed seed.
# NOTE(review): no parameter search is visible in this notebook — confirm
# where these values come from.
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_features=0.4,
    min_samples_split=2,
    n_jobs=-1,
    random_state=0
)

# Define KFold cross-validation
kfolds = 4  # Number of folds
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Pipeline: the shared preprocessing step followed by the Random Forest.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

# Perform cross-validation
cv_results = cross_val_score(
    model_pipeline,
    X,
    y,
    cv=split,
    scoring="accuracy",
    n_jobs=-1
)

# Output the results
min_score = round(min(cv_results), 4)
max_score = round(max(cv_results), 4)
mean_score = round(np.mean(cv_results), 4)
std_dev = round(np.std(cv_results), 4)

# A space follows '+/-' so the line matches the format of the model comparison output.
print(f'Random Forest model cross-validation accuracy score: {mean_score} +/- '
      f'{std_dev} (std) min: {min_score}, max: {max_score}')

In [None]:
import eli5

# Fit the full pipeline on all data so the feature weights can be inspected.
model_pipeline.fit(X, y)

fitted_onehot = (model_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot'])

# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only get_feature_names(). Support both.
if hasattr(fitted_onehot, 'get_feature_names_out'):
    categorial_columns = list(fitted_onehot.get_feature_names_out(categorical_features))
else:
    categorial_columns = list(fitted_onehot.get_feature_names(input_features=categorical_features))

# The preprocessor lists its 'cat' transformer before 'num', and the
# transformed columns follow that order. The names must be assembled the same
# way, otherwise every importance is attached to the wrong feature.
feature_list = categorial_columns + numerical_features

importances_df = eli5.formatters.as_dataframe.explain_weights_df(
    model_pipeline.named_steps['model'],
    feature_names=feature_list
)
importances_df.head(7)

In [None]:
# Fit the full pipeline on all data before reading the importances.
model_pipeline.fit(X, y)

fitted_onehot = (model_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot'])

# Newer scikit-learn releases only provide get_feature_names_out(); older ones
# only get_feature_names(). Support both.
if hasattr(fitted_onehot, 'get_feature_names_out'):
    categorial_columns = list(fitted_onehot.get_feature_names_out(categorical_features))
else:
    categorial_columns = list(fitted_onehot.get_feature_names(input_features=categorical_features))

# The preprocessor lists its 'cat' transformer before 'num', and the
# transformed columns follow that order. The names must be assembled the same
# way, otherwise every importance is attached to the wrong feature.
feature_list = categorial_columns + numerical_features

rf_model = model_pipeline.named_steps['model']  # 'model' is the step name used in the pipeline

importances = rf_model.feature_importances_

# Create a Series for better visualization
feature_importances = pd.Series(importances, index=feature_list).sort_values(ascending=False)
feature_importances.head(7)

**Three most important features:**
- reserved_room_type_A
- market_segment_Online TA
- customer_type_Transient-Party