### Split the Data into Train & Test Sets

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the processed dataset
df = pd.read_csv("../data/processed/hotel_bookings_cleaned.csv")

# Define features (X) and target variable (y)
X = df.drop(columns=["Reservation_Status"])  # Features
y = df["Reservation_Status"]  # Target (1=Check-in, 2=Cancel, 3=No-Show)

# Split the dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
df.head(5)

Train shape: (21999, 29)
Test shape: (5500, 29)


Unnamed: 0,Age,Educational_Level,Income,Adults,Children,Babies,Reservation_Status,Discount_Rate,Room_Rate,Gender_M,...,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Direct,Booking_channel_Online,Visted_Previously_Yes,Previous_Cancellations_Yes,Required_Car_Parking_Yes,Use_Promotion_Yes,Booking_Lead_Time,Total_Guests
0,40,3,0.0,4,2,0.0,1,0,106.440241,True,...,False,False,False,True,False,False,True,True,41,6.0
1,49,3,2.0,1,3,0.0,1,0,158.039564,False,...,True,False,False,True,False,False,True,False,36,4.0
2,42,2,0.0,1,3,0.0,1,0,212.61078,False,...,False,True,False,True,False,False,True,False,3,4.0
3,25,3,3.0,3,3,0.0,1,5,106.146407,True,...,False,False,False,False,False,False,True,True,12,6.0
4,62,4,1.0,1,1,0.0,1,10,159.384439,False,...,False,True,True,False,False,False,False,True,13,2.0


### Train a Baseline Model (Random Forest)

In [30]:
from sklearn.preprocessing import OrdinalEncoder

# Define the order (example)
reservation_order = ['P', 'Confirmed', 'Canceled']

# Encode
ordinal_encoder = OrdinalEncoder(categories=[reservation_order])
df[['Reservation_Status']] = ordinal_encoder.fit_transform(df[['Reservation_Status']])

ValueError: invalid literal for int() with base 10: 'P'

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.58      0.19      0.29      4248
           2       0.12      0.55      0.19       827
           3       0.66      0.28      0.39       425

    accuracy                           0.25      5500
   macro avg       0.45      0.34      0.29      5500
weighted avg       0.52      0.25      0.28      5500



In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")


ModuleNotFoundError: No module named 'xgboost'