In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv(r'C:\Users\USER\hotel_cancellation_prediction\data\hotel_bookings.csv')

In [3]:
# Handling missing values
df['children'].fillna(0, inplace=True)
df['country'].fillna(df['country'].mode()[0], inplace=True)
df['agent'].fillna(0, inplace=True)
df['company'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['children'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna(df['country'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [4]:
# categorical variables to numerical
df = pd.get_dummies(df, columns=['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'country'], drop_first=True)

In [5]:
# 'reservation_status_date' to datetime
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])

In [6]:
# Extracting year and month from 'reservation_status_date'
df['reservation_year'] = df['reservation_status_date'].dt.year
df['reservation_month'] = df['reservation_status_date'].dt.month

In [7]:
df['reservation_year'] = df['reservation_status_date'].dt.year
df['reservation_month'] = df['reservation_status_date'].dt.month

In [8]:
X = df.drop('is_canceled', axis=1)
y = df['is_canceled']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Creating preprocessing pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])



In [11]:
print("Data preprocessing completed!")
print(f"Shape of the preprocessed dataset: {df.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Data preprocessing completed!
Shape of the preprocessed dataset: (119390, 242)
Number of features: 241
Training set size: 95512
Testing set size: 23878


In [12]:
df.to_csv('preprocessed_hotel_bookings.csv', index=False)
print("Preprocessed dataset saved as 'preprocessed_hotel_bookings.csv'")

X.to_csv('preprocessed_features.csv', index=False)
y.to_csv('target_variable.csv', index=False)
print("Features saved as 'preprocessed_features.csv'")
print("Target variable saved as 'target_variable.csv'")

Preprocessed dataset saved as 'preprocessed_hotel_bookings.csv'
Features saved as 'preprocessed_features.csv'
Target variable saved as 'target_variable.csv'
