### STEP 1 — Load & Preview Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the raw bookings CSV; the relative path is resolved against the
# kernel's current working directory.
DATA_PATH = "hotel_bookings.csv"
df = pd.read_csv(DATA_PATH)

# Show the first five rows as the cell output.
df.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


### STEP 2 — Basic Data Inspection

In [4]:
# A notebook cell only renders its *final* expression, so a bare `df.shape`
# on the first line is evaluated and silently discarded. Print it explicitly.
print(df.shape)

# `info()` writes its column/dtype/non-null summary straight to stdout.
df.info()

# Summary statistics of the numeric columns, shown as the cell result.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


### STEP 3 — Missing Value Handling

#### 3.1 Check missing values

In [7]:
# Boolean mask of missing cells, summed column-wise to give one count per column.
missing_mask = df.isnull()
missing_mask.sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

#### 3.2 Fill missing children values with the median (0, since most bookings have no children)

In [9]:
# Impute the missing `children` entries with the column median.
children_median = df['children'].median()
df['children'] = df['children'].fillna(children_median)

#### 3.3 Fill country with most common (mode)

In [11]:
# Impute the missing `country` entries with the most frequent value.
# `mode()` returns a Series (ties are possible); take its first entry.
country_modes = df['country'].mode()
df['country'] = df['country'].fillna(country_modes[0])

#### 3.4 Fill agent & company with 0 (because NaN means no agent/company)

In [13]:
# A missing ID is replaced by 0 in both columns (per the section heading,
# NaN is taken to mean "no agent / no company").
for id_col in ('agent', 'company'):
    df[id_col] = df[id_col].fillna(0)

#### 3.5 Convert reservation_status_date to datetime

In [15]:
# Parse the status-date column into pandas datetime values.
status_dates = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_date'] = status_dates

### STEP 4 — FEATURE ENGINEERING (3 FEATURES)

#### FE 1 — Total Stay Duration

In [18]:
# Total nights booked = weekend nights + week nights.
weekend_nights = df['stays_in_weekend_nights']
week_nights = df['stays_in_week_nights']
df['total_stay'] = weekend_nights.add(week_nights)

# Display the new column.
df['total_stay']

0         0
1         0
2         1
3         1
4         2
         ..
119385    7
119386    7
119387    7
119388    7
119389    9
Name: total_stay, Length: 119390, dtype: int64

#### FE 2 — Total Guests

In [20]:
# Party size = adults + children + babies (added in that order).
# The result is float64 because `children` is a float column.
df['total_guests'] = df['adults'].add(df['children']).add(df['babies'])

# Display the new column.
df['total_guests']

0         2.0
1         2.0
2         1.0
3         1.0
4         2.0
         ... 
119385    2.0
119386    3.0
119387    2.0
119388    2.0
119389    2.0
Name: total_guests, Length: 119390, dtype: float64

#### FE 3 — Season (based on arrival month)

In [22]:
# Months grouped by meteorological season; inverted below into the
# month -> season lookup used for the mapping.
months_by_season = {
    'Winter': ('December', 'January', 'February'),
    'Spring': ('March', 'April', 'May'),
    'Summer': ('June', 'July', 'August'),
    'Autumn': ('September', 'October', 'November'),
}
season_map = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}

# Month names that are not keys of `season_map` would map to NaN.
df['season'] = df['arrival_date_month'].map(season_map)

# Display the new column.
df['season']

0         Summer
1         Summer
2         Summer
3         Summer
4         Summer
           ...  
119385    Summer
119386    Summer
119387    Summer
119388    Summer
119389    Summer
Name: season, Length: 119390, dtype: object

### STEP 5 — ENCODING (Categorical -> Numeric)

#### 5.1 One-Hot Encode the selected categorical columns

In [25]:
# Remove the status-date column before encoding.
# `errors='ignore'` keeps this cell re-runnable: because `df` is rebound to
# the result, a second execution would otherwise raise KeyError for the
# already-removed column.
df = df.drop(columns=['reservation_status_date'], errors='ignore')

In [26]:
# Categorical columns to expand into indicator (dummy) columns.
# NOTE(review): `season` is derived from `arrival_date_month`, so encoding
# both adds redundant indicators — confirm this is intended.
categorical_cols = [
    'hotel',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'customer_type',
    'deposit_type',
    'season',
    'reservation_status',
    'arrival_date_month',
]

# `drop_first=True` omits the first level of each column.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

#### 5.2 One-Hot Encode all remaining categorical columns and drop leakage columns

In [35]:
# With no `columns=` argument, every object/category column of `df` is
# one-hot encoded (first level dropped).
# NOTE(review): this rebuilds `df_encoded` from `df`, so any earlier value of
# `df_encoded` is discarded — confirm that is intended.
df_encoded = pd.get_dummies(data=df, drop_first=True)

In [36]:
# Collect every encoded column whose name contains 'reservation_status'
# and remove them from the feature frame.
leak_cols = list(filter(lambda name: 'reservation_status' in name, df_encoded.columns))
df_encoded = df_encoded.drop(columns=leak_cols)

In [37]:
# (rows, columns) before and after encoding, shown as a pair of tuples.
tuple(frame.shape for frame in (df, df_encoded))

((119390, 34), (119390, 253))

### STEP 6 — FEATURE SCALING

In [32]:
# Numeric features to standardise.
scale_cols = [
    'lead_time',
    'total_stay',
    'total_guests',
    'adr',
]

# NOTE(review): the scaler is fitted on the whole frame here; if a train/test
# split follows, the test rows influence the fitted mean/std — confirm
# whether the scaler should be fitted on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[scale_cols])
df_encoded[scale_cols] = scaled_values

In [33]:
# Sanity check: after standardisation each column should have mean ~0 and std ~1.
scaled_view = df_encoded[scale_cols]
scaled_view.describe()

Unnamed: 0,lead_time,total_stay,total_guests,adr
count,119390.0,119390.0,119390.0,119390.0
mean,6.094277e-17,4.0469810000000005e-17,1.142677e-17,2.513889e-16
std,1.000004,1.000004,1.000004,1.000004
min,-0.9733187,-1.34037,-2.724616,-2.141286
25%,-0.8048782,-0.5583345,0.04396711,-0.643925
50%,-0.3276301,-0.1673166,0.04396711,-0.1435844
75%,0.5239303,0.2237013,0.04396711,0.4782547
max,5.923385,25.63986,73.41142,104.8404
