In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [12]:
# Loading Dataset
df = pd.read_csv('hotel_bookings.csv')


In [13]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [17]:
print(df.info())
print(df.describe())
print(df.isnull().sum()) 

<class 'pandas.core.frame.DataFrame'>
Index: 6797 entries, 18 to 119248
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           6797 non-null   object 
 1   is_canceled                     6797 non-null   int64  
 2   lead_time                       6797 non-null   int64  
 3   arrival_date_year               6797 non-null   int64  
 4   arrival_date_month              6797 non-null   object 
 5   arrival_date_week_number        6797 non-null   int64  
 6   arrival_date_day_of_month       6797 non-null   int64  
 7   stays_in_weekend_nights         6797 non-null   int64  
 8   stays_in_week_nights            6797 non-null   int64  
 9   adults                          6797 non-null   int64  
 10  children                        6797 non-null   float64
 11  babies                          6797 non-null   int64  
 12  meal                            6797

In [18]:
# Data Cleaning And Preprocessing
# Handling Missing Values
df['children'].fillna(0, inplace=True)
df['country'].fillna(df['country'].mode()[0], inplace=True)
df['agent'].fillna(0, inplace=True)
df.dropna(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['children'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna(df['country'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [19]:
# Convert Categorical Features
encoder = LabelEncoder()
df['hotel'] = encoder.fit_transform(df['hotel'])
df['deposit_type'] = encoder.fit_transform(df['deposit_type'])
df['customer_type'] = encoder.fit_transform(df['customer_type'])

In [20]:
# Outlier Detection and Handling (adr, lead_time)
df = df[df['adr'] < df['adr'].quantile(0.99)]
df = df[df['lead_time'] < df['lead_time'].quantile(0.99)]

In [21]:
# Feature Engineering
# Feature Scaling
scaler = StandardScaler()


In [22]:
# Categorical Grouping
df['lead_time_category'] = pd.cut(df['lead_time'], bins=[0,30,90, np.inf], labels=['Short','Medium','Large'])
df['lead_time_category'] = encoder.fit_transform(df['lead_time_category'])


In [23]:
# Select Features and Target
features = ['lead_time', 'adr', 'previous_cancellations', 'deposit_type', 'customer_type', 'lead_time_category']
target = 'is_canceled'
X = df[features]
y = df[target]

In [24]:
# Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Scale Data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
# Model Building
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC(probability=True)
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'--- {name} ---')
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('ROC-AUC:', roc_auc_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


--- Logistic Regression ---
Accuracy: 0.8765060240963856
ROC-AUC: 0.6975207651648629
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1096
           1       0.77      0.42      0.54       232

    accuracy                           0.88      1328
   macro avg       0.83      0.70      0.74      1328
weighted avg       0.87      0.88      0.86      1328

--- Decision Tree ---
Accuracy: 0.9028614457831325
ROC-AUC: 0.7848445758872389
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      1096
           1       0.79      0.60      0.68       232

    accuracy                           0.90      1328
   macro avg       0.86      0.78      0.81      1328
weighted avg       0.90      0.90      0.90      1328

--- Random Forest ---
Accuracy: 0.9149096385542169
ROC-AUC: 0.7921438459602317
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     

In [28]:
# Advanced Filtering And Parameterized Visualizations
# High-Risk Customers
high_risk_customers = df[df['previous_cancellations'] > 2]
print("High-risk customers:")
print(high_risk_customers[['customer_type', 'previous_cancellations']].head())

High-risk customers:
       customer_type  previous_cancellations
25707              2                       3
25708              2                       3
25709              2                       3
25710              2                       4
25711              2                       4


In [29]:
# Most Frequent Guests
frequent_guests = df[df['is_repeated_guest'] == 1]
print("Most frequent guests:")
print(frequent_guests[['customer_type', 'previous_bookings_not_canceled']].head())

Most frequent guests:
       customer_type  previous_bookings_not_canceled
14940              3                               1
14941              3                               2
14942              3                               3
14943              3                               1
14952              2                               4


In [30]:
# Seasonal Booking Trends
seasonal_trends = df.groupby('arrival_date_month')['is_canceled'].mean()
print("Seasonal Booking Trends:")
print(seasonal_trends)


Seasonal Booking Trends:
arrival_date_month
April        0.220911
August       0.155039
December     0.120548
February     0.182186
January      0.257885
July         0.153061
June         0.233645
March        0.079452
May          0.189356
November     0.158098
October      0.087520
September    0.295082
Name: is_canceled, dtype: float64


In [32]:
# Joins And Multi-Table Queries
# Hotel-Specific Cancellation Rate
hotel_cancellation_rate = df.groupby('hotel')['is_canceled'].mean()
print("Hotel-Specific Cancellation Rates:")
print(hotel_cancellation_rate)

Hotel-Specific Cancellation Rates:
hotel
0    0.211333
1    0.131949
Name: is_canceled, dtype: float64


In [33]:
# Countries With High Cancellation Rates
country_cancellation_rate = df.groupby('country')['is_canceled'].mean().sort_values(ascending=False)
print("Countries with High Cancellation Rates:")
print(country_cancellation_rate.head(10))


Countries with High Cancellation Rates:
country
PRT    0.226456
DZA    0.166667
GBR    0.127072
RUS    0.083333
FRA    0.067708
AUT    0.027778
BEL    0.024096
ITA    0.019048
NLD    0.015152
ESP    0.012766
Name: is_canceled, dtype: float64


In [35]:
# Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

gs = GridSearchCV(RandomForestClassifier(), param_grid, cv=2, scoring='accuracy') # using Random forest as it has high accuracy
gs.fit(X_train, y_train)
print("Best Parameters:", gs.best_params_)
print("Best Accuracy:", gs.best_score_)

Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
Best Accuracy: 0.9105633692963946


In [42]:
example_input = [50, 100, 2, 1, 0, 1]
best_model = gs.best_estimator_
example_input = np.array(example_input).reshape(1, -1)
prediction = best_model.predict(example_input)
print("Predicted Cancellation Status :", prediction[0])

Predicted Cancellation Status : 0


In [None]:
# 1=Cancelled, 0=Not Cancelled