### Split the Data into Train & Test Sets

In [9]:
from sklearn.model_selection import train_test_split

# Load the processed dataset
df = pd.read_csv("../data/processed/hotel_bookings_cleaned.csv")

# Define features (X) and target variable (y)
X = df.drop(columns=["Reservation_Status"])  # Features
y = df["Reservation_Status"]  # Target (1=Check-in, 2=Cancel, 3=No-Show)

# Split the dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (21999, 29)
Test shape: (5500, 29)


### Train a Baseline Model (Random Forest)

In [4]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

# Load the processed dataset
df = pd.read_csv("../data/processed/hotel_bookings_cleaned.csv")

# Define features (X) and target variable (y)
X = df.drop(columns=["Reservation_Status"])  # Features
y = df["Reservation_Status"]  # Target (1=Check-in, 2=Cancel, 3=No-Show)

# Split the dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Before SMOTE:", y_train.value_counts())


Before SMOTE: Reservation_Status
1    16992
2     3307
3     1700
Name: count, dtype: int64


In [5]:
# Apply SMOTE to balance the classes
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

print("After SMOTE:", y_train_sm.value_counts())  # Check the new class distribution




After SMOTE: Reservation_Status
1    16992
3    16992
2    16992
Name: count, dtype: int64


In [6]:
# Initialize and train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_sm, y_train_sm)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.65      0.17      0.28      4248
           2       0.13      0.63      0.21       827
           3       0.53      0.36      0.43       425

    accuracy                           0.26      5500
   macro avg       0.44      0.39      0.31      5500
weighted avg       0.56      0.26      0.28      5500



In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Encode labels to start from 0
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)  # Encode original labels
y_test_encoded = le.transform(y_test)  # Ensure the same encoding for test

# Apply SMOTE
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train_encoded)  # Use encoded labels

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)
    
    # Convert predictions back to original labels if needed
    y_pred = le.inverse_transform(y_pred)  
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")




Random Forest Accuracy: 0.2580


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.5984
Decision Tree Accuracy: 0.2465


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.2593


In [23]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns
num_cols = ["Age", "Room_Rate", "Discount_Rate"]  # Add other numerical columns if needed

# Scale numerical features
scaler = StandardScaler()
X_train_sm[num_cols] = scaler.fit_transform(X_train_sm[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [25]:
import numpy as np

# Calculate correlation matrix
corr_matrix = X_train_sm.corr()

# Find highly correlated features (threshold: 0.9)
high_corr_features = [column for column in corr_matrix.columns if any(corr_matrix[column] > 0.9)]
print("Highly Correlated Features:", high_corr_features)

# Drop them from the dataset
X_train_sm.drop(columns=high_corr_features, inplace=True)
X_test.drop(columns=high_corr_features, inplace=True)


Highly Correlated Features: ['Age', 'Educational_Level', 'Income', 'Adults', 'Children', 'Babies', 'Discount_Rate', 'Room_Rate', 'Gender_M', 'Ethnicity_Asian American', 'Ethnicity_Latino', 'Ethnicity_caucasian', 'Country_region_North', 'Country_region_South', 'Country_region_West', 'Hotel_Type_City Hotel', 'Hotel_Type_Resort', 'Meal_Type_FB', 'Meal_Type_HB', 'Deposit_type_Non-Refundable', 'Deposit_type_Refundable', 'Booking_channel_Direct', 'Booking_channel_Online', 'Visted_Previously_Yes', 'Previous_Cancellations_Yes', 'Required_Car_Parking_Yes', 'Use_Promotion_Yes', 'Booking_Lead_Time', 'Total_Guests']


In [31]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(learning_rate=0.05, n_estimators=300, max_depth=6, eval_metric="mlogloss")
xgb_model.fit(X_train_sm, y_train_sm)

y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost Performance:\n", classification_report(y_test, y_pred_xgb))

ValueError: feature_names mismatch: ['Age', 'Educational_Level', 'Income', 'Adults', 'Children', 'Babies', 'Discount_Rate', 'Room_Rate', 'Gender_M', 'Ethnicity_Asian American', 'Ethnicity_Latino', 'Ethnicity_caucasian', 'Country_region_North', 'Country_region_South', 'Country_region_West', 'Hotel_Type_City Hotel', 'Hotel_Type_Resort', 'Meal_Type_FB', 'Meal_Type_HB', 'Deposit_type_Non-Refundable', 'Deposit_type_Refundable', 'Booking_channel_Direct', 'Booking_channel_Online', 'Visted_Previously_Yes', 'Previous_Cancellations_Yes', 'Required_Car_Parking_Yes', 'Use_Promotion_Yes', 'Booking_Lead_Time', 'Total_Guests'] []
expected Booking_channel_Online, Meal_Type_FB, Ethnicity_Asian American, Use_Promotion_Yes, Babies, Gender_M, Ethnicity_caucasian, Discount_Rate, Country_region_West, Ethnicity_Latino, Total_Guests, Room_Rate, Hotel_Type_City Hotel, Income, Deposit_type_Non-Refundable, Country_region_South, Booking_Lead_Time, Meal_Type_HB, Hotel_Type_Resort, Visted_Previously_Yes, Country_region_North, Required_Car_Parking_Yes, Deposit_type_Refundable, Previous_Cancellations_Yes, Adults, Age, Booking_channel_Direct, Educational_Level, Children in input data