In [88]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, classification_report
import pickle
from random_forest import RandomForest  # Import your RandomForest class

In [89]:
# Read the bookings table. The path is relative to the notebook's working
# directory, so adjust it if the notebook is run from somewhere else.
df = pd.read_csv("../data/fProcessed.csv")

# Target vector: the `booking_status` column.
y = df['booking_status']
# Feature matrix: every column except the target.
X = df.drop(columns='booking_status')

In [90]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [91]:
# Balance the classes with SMOTE. Only the training split is resampled, so
# the held-out test rows never contain synthetic samples.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(
    X_train,
    y_train,
)

In [92]:
# Show the per-class row counts of the training labels before and after
# resampling, to confirm that SMOTE equalised the classes.
for caption, labels in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", y_train_sm),
):
    print(caption, pd.Series(labels).value_counts())

Before SMOTE: booking_status
0    14266
1     9939
Name: count, dtype: int64
After SMOTE: booking_status
0    14266
1    14266
Name: count, dtype: int64


In [93]:
# Convert the resampled training data and the held-out test data to NumPy
# arrays before they are passed to the RandomForest below.
# np.asarray is used instead of .to_numpy() so that this cell is idempotent:
# the names are rebound in place, and re-running the cell on values that are
# already ndarrays is a no-op instead of raising AttributeError.
X_train_sm = np.asarray(X_train_sm)
X_test = np.asarray(X_test)
y_train_sm = np.asarray(y_train_sm)
y_test = np.asarray(y_test)

In [94]:
# Build the project's RandomForest (imported from random_forest) and train it
# on the SMOTE-balanced training arrays.
# NOTE(review): `n_features` is given the column labels (a pandas Index), not
# an integer count. The training log prints a column name for each split, so
# the labels are presumably used for that -- confirm against the class.
clf = RandomForest(n_features=X.columns)
clf.fit(X_train_sm, y_train_sm)

-----------------------------
Iteration: 0
-----------------------------
Training Process Started.
Splitted Column: lead_time
Splitted Column: avg_price_per_room
Splitted Column: no_of_special_requests
Splitted Column: lead_time
Splitted Column: type_of_meal_plan
Splitted Column: lead_time
Splitted Column: lead_time
Splitted Column: avg_price_per_room
Splitted Column: avg_price_per_room
Splitted Column: no_of_adults
Splitted Column: avg_price_per_room
Splitted Column: lead_time
Splitted Column: arrival_month
Splitted Column: arrival_month
Splitted Column: no_of_weekend_nights
Splitted Column: lead_time
Splitted Column: avg_price_per_room
Splitted Column: avg_price_per_room
Splitted Column: avg_price_per_room
Splitted Column: lead_time
Splitted Column: avg_price_per_room
Splitted Column: avg_price_per_room
Splitted Column: no_of_week_nights
Splitted Column: arrival_date
Splitted Column: arrival_date
Splitted Column: avg_price_per_room
Splitted Column: avg_price_per_room
Splitted Column:

In [101]:
# Pickle the model
# A context manager closes (and flushes) the file handle deterministically,
# even if pickling raises part-way through; the bare open() call previously
# left the handle to be closed whenever the garbage collector got to it.
with open("ml_model", "wb") as model_file:
    pickle.dump(clf, model_file)

In [102]:
# Predict labels for the held-out test rows.
predictions = clf.predict(X_test)
# Predict on the SMOTE-resampled training rows as well, so that training and
# test metrics can be compared in the cells below.
y_train_pred = clf.predict(X_train_sm)

------------------------------
------------------------------
------------------------------
------------------------------
------------------------------
------------------------------
------------------------------
------------------------------
------------------------------
------------------------------


In [103]:
# Report the scalar metrics and the per-class report for the held-out test set.
# NOTE(review): roc_auc_score receives hard class labels here rather than
# scores or probabilities; with binary labels that reduces to balanced
# accuracy, which is why the two printed values coincide.
print("Test Set Metrics:")
for caption, metric in (
    ("Imbalance Accuracy: ", accuracy_score),
    ("Balanced Accuracy: ", balanced_accuracy_score),
    ("AUC Score: ", roc_auc_score),
):
    print(caption, metric(y_test, predictions))
print("Classification Report: ")
print("\n", classification_report(y_test, predictions))

Test Set Metrics:
Imbalance Accuracy:  0.7862926547137074
Balanced Accuracy:  0.7748566313460643
AUC Score:  0.7748566313460644
Classification Report: 

               precision    recall  f1-score   support

           0       0.80      0.84      0.82      6080
           1       0.76      0.71      0.73      4294

    accuracy                           0.79     10374
   macro avg       0.78      0.77      0.78     10374
weighted avg       0.79      0.79      0.79     10374



In [104]:
# Report the same metrics on the SMOTE-resampled training set, for comparison
# with the test-set figures.
# NOTE(review): roc_auc_score receives hard class labels here rather than
# scores or probabilities; with binary labels that reduces to balanced
# accuracy, which is why the two printed values coincide.
print("Training Set Metrics:")
for caption, metric in (
    ("Imbalance Accuracy: ", accuracy_score),
    ("Balanced Accuracy: ", balanced_accuracy_score),
    ("AUC Score: ", roc_auc_score),
):
    print(caption, metric(y_train_sm, y_train_pred))
print("Classification Report: ")
print("\n", classification_report(y_train_sm, y_train_pred))

Training Set Metrics:
Imbalance Accuracy:  0.8408453666059161
Balanced Accuracy:  0.8408453666059161
AUC Score:  0.8408453666059162
Classification Report: 

               precision    recall  f1-score   support

           0       0.81      0.88      0.85     14266
           1       0.87      0.80      0.83     14266

    accuracy                           0.84     28532
   macro avg       0.84      0.84      0.84     28532
weighted avg       0.84      0.84      0.84     28532

