In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
import warnings
# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation warnings raised by the plotting/modeling calls below.
warnings.filterwarnings("ignore")

In [None]:
# Raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [None]:
df.head(5)

In [None]:
print(list(df.columns))

In [None]:
# Drop the booking identifier so it is not treated as a feature downstream.
df = df.drop(columns = ['Booking_ID'])

In [None]:
df.shape

In [None]:
df.head(5)

In [None]:
# Per-column count of missing values.
print(df.isnull().sum())

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of fully duplicated rows. Booking_ID was already dropped above, so
# two bookings that agree on every remaining column count as duplicates.
df.duplicated().sum()

In [None]:
# Removes the duplicates in place; the original index labels are kept, so the
# index has gaps after this cell.
df.drop_duplicates(inplace = True)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
columns = list(df.columns)
print(columns)

In [None]:
# Show the value distribution of every low-cardinality column (< 10 distinct values).
for column_name in columns:
    if df[column_name].nunique() < 10:
        print(f"\nValue counts for: {column_name}")
        print(df[column_name].value_counts())


In [None]:
print(df.dtypes)

In [None]:
# Columns stored with object dtype are treated as categorical.
cat_cols = df.select_dtypes(include = "object").columns.tolist()

In [None]:
cat_cols

In [None]:
# Columns stored as int64/float64 are treated as numerical. Integer-coded
# flags and calendar fields, if any, also land in this list.
num_cols = df.select_dtypes(include = ['int64', 'float64']).columns.tolist()

In [None]:
num_cols

In [None]:
df.columns

In [None]:
len(cat_cols)

In [None]:
len(num_cols)

In [None]:
# Snapshot of the de-duplicated frame used by the EDA plots below; `df` itself
# is encoded, transformed and scaled later while `data` is left untouched.
data = df.copy()

In [None]:
data.shape

In [None]:
def num_cols_distribution(df, num_features):
    """Plot a histogram (with KDE) and a box plot for each numeric column.

    One subplot row is drawn per entry of ``num_features``: the histogram in
    the left column and the box plot in the right column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``num_features``.
    num_features : list of str
        Names of the columns to plot. An empty list draws nothing.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_features = len(num_features)
    if n_features == 0:
        # plt.subplots(0, 2) raises; there is simply nothing to draw.
        return

    # squeeze=False always yields a 2-D axes array, so the single-column case
    # needs no special reshape.
    fig, axes = plt.subplots(
        n_features, 2,
        figsize=(15, n_features * 5),
        squeeze=False,
    )

    for i, column in enumerate(num_features):
        # A single colour is used instead of `palette`: seaborn only applies a
        # palette when `hue` is set, and warns (or ignores it) otherwise.
        sns.histplot(data=df, x=column, ax=axes[i, 0], kde=True, color="steelblue")
        axes[i, 0].set_title(f"Histogram for {column}")

        sns.boxplot(data=df, x=column, ax=axes[i, 1], color="steelblue")
        axes[i, 1].set_title(f"Box Plot for {column}")

    plt.tight_layout()
    plt.show()

In [None]:
# Univariate plots for every int64/float64 column of the untouched `data` snapshot.
num_cols_distribution(data, num_cols)

In [None]:
def num_cols_display(df, num_cols):
    """Print a short profile and show the 10 most frequent values per column.

    For each column the number of distinct values and of missing values is
    printed, followed by a table with the count and the percentage share of
    its ten most common values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``num_cols``.
    num_cols : list of str
        Names of the columns to profile.
    """
    for col in num_cols:
        series = df[col]

        print(f"\nColumn: {col}")
        print(f"Unique values: {series.nunique()}")
        print(f"Missing values: {series.isna().sum()}")

        # Absolute frequencies and their percentage share, top 10 each.
        top_counts = series.value_counts().head(10).to_frame("Count")
        top_shares = (series.value_counts(normalize=True).head(10) * 100).to_frame("Percentage")

        # `display` is the IPython rich-display hook available in notebooks.
        display(top_counts.join(top_shares))


In [None]:
# Top-10 value tables for every numeric column of the `data` snapshot.
num_cols_display(data, num_cols)

In [None]:
def cat_cols_distribution(df, cat_cols):
    """Profile each categorical column with a frequency table and a bar chart.

    For every column the number of distinct and missing values is printed,
    a Count/Percentage table of all categories is displayed, and a bar chart
    of the category counts is drawn in its own figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``cat_cols``.
    cat_cols : list of str
        Names of the columns to profile.
    """
    for col in cat_cols:
        series = df[col]

        print(f"\nColumn: {col}")
        print("Unique values:", series.nunique())
        print("Missing values:", series.isna().sum())

        counts = series.value_counts()
        shares = series.value_counts(normalize=True) * 100

        # `display` is the IPython rich-display hook available in notebooks.
        summary = counts.to_frame("Count").join(shares.to_frame("Percentage"))
        display(summary)

        # One figure per column, bars ordered from most to least frequent.
        plt.figure(figsize=(10, 5))
        ordered = counts.sort_values(ascending=False)
        ordered.plot(kind='bar', color='steelblue')
        plt.title(f"Distribution of {col}")
        plt.ylabel("Count")
        plt.xlabel(col)
        plt.show()


In [None]:
# Frequency tables and bar charts for every object-dtype column of `data`.
cat_cols_distribution(data, cat_cols)

In [None]:
def plot_bivariate_numerical(df, target, num_cols):
    """Draw one box plot per numeric column, split by the target classes.

    The plots are laid out on a two-column grid. When the number of columns
    is odd, the unused last grid cell is removed so that no empty axes is
    shown (same behaviour as ``plot_bivariate_cat``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and every column named in ``num_cols``.
    target : str
        Name of the column used on the x axis to group the boxes.
    num_cols : list of str
        Names of the numeric columns to plot. An empty list draws nothing.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    num_plots = len(num_cols)
    if num_plots == 0:
        # plt.subplots(0, 2) raises; there is simply nothing to draw.
        return

    # Two plots per row, rounding up.
    num_rows = (num_plots + 1) // 2

    fig, axes = plt.subplots(num_rows, 2, figsize = (15, num_rows * 5))
    axes = axes.flatten()

    for i, column in enumerate(num_cols):
        sns.boxplot(x = target, y = column, ax = axes[i], data = df, palette = "Blues")
        axes[i].set_title(f"{column} VS {target}")

    # Drop the grid cells that received no plot (odd number of columns).
    for j in range(num_plots, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

In [None]:
# Box plots of each numeric column against the booking outcome.
plot_bivariate_numerical(data, "booking_status", num_cols)

In [None]:
def plot_bivariate_cat(df, target, cat_features):
    """Draw one count plot per categorical feature, coloured by the target.

    The plots are arranged on a two-column grid; grid cells that receive no
    plot are removed from the figure before it is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and every column in ``cat_features``.
    target : str
        Name of the column used as the ``hue`` of each count plot.
    cat_features : list of str
        Names of the categorical columns to plot.
    """
    # Two plots per row, rounding up.
    n_rows = (len(cat_features) + 1) // 2

    fig, grid = plt.subplots(n_rows, 2, figsize=(15, n_rows * 5))
    panels = grid.flatten()

    for panel, feature in zip(panels, cat_features):
        sns.countplot(data=df, x=feature, hue=target, palette="Set2", ax=panel)
        panel.set_title(f"{feature} VS {target}")
        # Category labels can be long; rotate them to avoid overlap.
        panel.tick_params(axis='x', rotation=90)

    # Remove the trailing panels that were not used.
    for spare in panels[len(cat_features):]:
        fig.delaxes(spare)

    plt.tight_layout()
    plt.show()

In [None]:
# The target is left out of the feature list: at this point `cat_cols` still
# appears to contain 'booking_status' (it is only filtered out further down),
# and plotting it against itself as hue adds no information.
plot_bivariate_cat(
    data,
    'booking_status',
    [col for col in cat_cols if col != 'booking_status']
)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# A single LabelEncoder instance is re-fitted for every column; after the loop
# `le` only holds the classes of the last column, so the per-column
# label -> code tables are captured in `mappings` right after each fit.
le = LabelEncoder()

mappings = {}

# NOTE(review): `cat_cols` has not been filtered yet, so 'booking_status'
# (presumably object dtype) is encoded here too; check `mappings` to see which
# outcome became class 1 before reading precision/recall/F1 further down.
# NOTE(review): re-running this cell on the already encoded `df` overwrites
# `mappings` with code -> code tables and loses the original labels.
for col in cat_cols:
    df[col] = le.fit_transform(df[col])
    mappings[col] = {label:code for label, code in zip(le.classes_, le.transform(le.classes_))}

In [None]:
mappings

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
# Variance inflation factors measure collinearity among the predictors, so the
# target column is excluded before building the design matrix. The constant
# column added by add_constant gives each auxiliary regression an intercept.
X = add_constant(df.drop(columns = "booking_status"))

vif_data = pd.DataFrame()

vif_data["features"] = X.columns

# One VIF per design-matrix column (the row for 'const' is not interpretable
# as a feature VIF).
vif_data["VIF"] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]

In [None]:
vif_data

In [None]:
plt.figure(figsize = (10,10))

# Pairwise correlations of the label-encoded frame. Codes of nominal
# categories carry no order, so their coefficients should be read with care.
sns.heatmap(df.corr(), annot = True, fmt = ".1f", linewidths = 0.5, cmap = "Blues")
plt.tight_layout()


In [None]:
df

In [None]:
num_cols

In [None]:
# Continuous/count columns whose skewness is inspected.
skew_cols = [
    'lead_time',
    'avg_price_per_room',
    'no_of_week_nights',
    'no_of_weekend_nights',
    'no_of_special_requests'
]

skewness = df[skew_cols].skew().sort_values(ascending=False)
print(skewness)


In [None]:
# Columns that receive a log1p transform.
transform_cols = [
    'no_of_week_nights',
    'lead_time'
]


In [None]:
# NOTE(review): this overwrites the columns in `df`, so re-running the cell
# applies log1p a second time to already transformed values.
for col in transform_cols:
    df[col] = np.log1p(df[col])


In [None]:
# Skewness after the transform.
df[['no_of_week_nights', 'lead_time']].skew()


In [None]:
# Columns standardised to zero mean / unit variance.
scale_cols = [
    'no_of_week_nights',
    'lead_time',
    'avg_price_per_room',
    'no_of_weekend_nights',
    'no_of_special_requests'
]


In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the complete frame, before any
# train/test split, so the statistics it learns include rows that later end up
# in the test sets.
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])


In [None]:
df[num_cols].skew()

In [None]:
# Columns listed as binary flags.
binary_cols = [
    'required_car_parking_space',
    'repeated_guest'
]


In [None]:
# Collapse three count columns into 0/1 indicators. None of them is in
# `scale_cols`, so the `> 0` test is applied to the original counts.
df['has_children'] = (df['no_of_children'] > 0).astype(int)
df['has_previous_cancellations'] = (df['no_of_previous_cancellations'] > 0).astype(int)
df['has_previous_bookings'] = (df['no_of_previous_bookings_not_canceled'] > 0).astype(int)


In [None]:
# The raw counts are removed once their indicators exist. With inplace=True
# a second run of this cell raises KeyError because the columns are gone.
df.drop(columns=[
    'no_of_children',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled'
], inplace=True)


In [None]:
# `num_cols` is redefined by hand here and replaces the dtype-based list
# built earlier in the notebook.
num_cols = [
    'no_of_adults',
    'no_of_week_nights',
    'no_of_weekend_nights',
    'lead_time',
    'avg_price_per_room',
    'no_of_special_requests'
]


In [None]:
df[num_cols]

In [None]:
type(cat_cols)

In [None]:
type(num_cols)

In [None]:
type(binary_cols)

In [None]:
# Remove the target from the categorical feature list.
cat_cols = [
    col for col in cat_cols
    if col != 'booking_status'
]


In [None]:
# Kept as a one-element list so that df[target] returns a DataFrame.
target = ['booking_status']

In [None]:
type(target)

In [None]:
df[num_cols]

In [None]:
df[binary_cols]

In [None]:
df[cat_cols]

In [None]:
df[target]

In [None]:
# Class balance of the encoded target before resampling.
df['booking_status'].value_counts(normalize=False)

In [None]:
# Feature matrix / target vector of the complete pre-processed frame.
X = df.drop(columns = "booking_status")
y = df["booking_status"]

In [None]:
X.head(5)

In [None]:
y.head(5)

In [None]:
y.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
smote = SMOTE(random_state = 42)

# NOTE(review): SMOTE is applied to the whole dataset here, and every
# train_test_split below operates on the resampled frame. Synthetic minority
# rows interpolated from rows that end up in a training fold can therefore
# land in the test fold, which makes the CV and held-out scores optimistic.
# Resampling only the training portion (e.g. via an imblearn Pipeline inside
# cross-validation) avoids this.
# NOTE(review): plain SMOTE interpolates every column, so label-encoded and
# 0/1 columns can receive non-integer values in the synthetic rows.
X_res, y_res = smote.fit_resample(X, y)

In [None]:
y_res.value_counts()

In [None]:
# Re-assemble features and target into one frame.
balanced_df = pd.DataFrame(X_res, columns = X.columns)
balanced_df["booking_status"] = y_res

In [None]:
balanced_df.columns

In [None]:
balanced_df.shape

In [None]:
# From here on `df` is the resampled frame; the un-resampled one is no longer
# reachable through this name.
df = balanced_df.copy()

In [None]:
# Feature Selection

In [None]:
X = df.drop(columns = 'booking_status')
y = df['booking_status']

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the resampled frame.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Random forest on all features; its importances drive the feature selection
# in the following cells.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

model.fit(X_train, y_train)
preds = model.predict(X_test)
# Column 1 of predict_proba is the probability of the class encoded as 1.
probs = model.predict_proba(X_test)[:, 1]

print("Random Forest")
print(classification_report(y_test, preds))
print("ROC-AUC:", roc_auc_score(y_test, probs))


In [None]:
# Impurity-based importances of the random forest fitted above.
feature_importance = model.feature_importances_

In [None]:
# X has the same columns, in the same order, as the X_train the model saw.
feature_importance_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': feature_importance
    }
)

In [None]:
feature_importance_df

In [None]:
print(feature_importance_df.sort_values(by = "Importance", ascending = False))

In [None]:
top_features = feature_importance_df.sort_values(by = "Importance", ascending = False)

In [None]:
top_features

In [None]:
# Keep the ten most important features.
selected_features = top_features['Feature'].head(10).values

In [None]:
selected_features

In [None]:
len(selected_features)

In [None]:
# Selected features plus the target.
top_df = df[selected_features.tolist() + ["booking_status"]]

In [None]:
len(top_df.columns)

In [None]:
top_df.shape

In [None]:
top_df.head(5)

In [None]:
# `df` is rebound again: from here on it only holds the selected features
# and the target.
df = top_df.copy()

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_score

In [None]:
X = df.drop(columns = 'booking_status')
y = df['booking_status']

In [None]:
from sklearn.model_selection import train_test_split

# Same split settings as before, now on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score

In [None]:
# Scorer names passed to cross_validate. The f1/recall/precision scorers
# treat the label 1 as the positive class; see `mappings` for which
# booking_status value that is.
scoring = {
    "roc_auc": "roc_auc",
    "f1": "f1",
    "recall": "recall",
    "precision": "precision",
    "accuracy": "accuracy"
}


In [None]:
# Candidate classifiers compared with cross-validation below.
# NOTE(review): y_train comes from the SMOTE-balanced frame via a stratified
# split, so class_weight="balanced" and the scale_pos_weight ratio are
# expected to be (close to) neutral here.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        random_state=42
    ),
    "AdaBoost": AdaBoostClassifier(
        random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        random_state=42
    ),
    "SVM": SVC(
        probability=True,
        class_weight="balanced"
    ),
    "KNN": KNeighborsClassifier(
        n_neighbors=7
    ),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        eval_metric="auc",
        # count of label 0 divided by count of label 1 in the training split
        scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1],
        random_state=42
    ),
    "LightGBM": LGBMClassifier(
        class_weight="balanced",
        random_state=42
    )
}


In [None]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


In [None]:
from sklearn.model_selection import cross_validate

# One summary row (mean of each metric over the folds) per model.
results = []

# NOTE(review): the loop variable rebinds `model`, which previously referred
# to the fitted RandomForest used for feature selection.
for name, model in models.items():
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )

    results.append({
        "Model": name,
        "ROC-AUC": cv_results["test_roc_auc"].mean(),
        "F1": cv_results["test_f1"].mean(),
        "Recall": cv_results["test_recall"].mean(),
        "Precision": cv_results["test_precision"].mean(),
        "Accuracy": cv_results["test_accuracy"].mean()
    })

# Rank the models by mean ROC-AUC, best first.
results_df = pd.DataFrame(results).sort_values(
    by="ROC-AUC",
    ascending=False
)

results_df


In [None]:
# Train Model

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import RocCurveDisplay

In [None]:
# Same frame and same split arguments as in the model-comparison section, so
# this reproduces the same train/test partition.
X = df.drop(columns = 'booking_status')
y = df['booking_status']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [None]:
# Base estimator whose hyper-parameters are tuned below.
xgb_base = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    # count of label 0 divided by count of label 1 in the training split
    scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1],
    random_state=42,
    n_jobs=-1
)

In [None]:
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale]
# and randint(low, high) from the integers low .. high - 1.
param_dist = {
    "n_estimators": randint(200, 600),
    "max_depth": randint(3, 8),
    "learning_rate": uniform(0.01, 0.15),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "gamma": uniform(0, 5),
    "min_child_weight": randint(1, 10)
}


In [None]:
# Same splitter configuration as in the model-comparison section.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


In [None]:
# 30 sampled configurations x 5 folds. All five metrics are recorded, and the
# configuration with the best mean ROC-AUC is refitted on the full training
# split (refit="roc_auc").
# NOTE(review): both the estimator and the search use n_jobs=-1, which can
# oversubscribe the CPU.
random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=30,
    scoring={
        "roc_auc": "roc_auc",
        "f1": "f1",
        "recall": "recall",
        "precision": "precision",
        "accuracy": "accuracy"
    },
    refit="roc_auc",
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42,
    return_train_score=True
)

random_search.fit(X_train, y_train)


In [None]:
# `cv_results` is rebound here: it was the cross_validate result dict in the
# model-comparison loop and is now the search results table.
cv_results = pd.DataFrame(random_search.cv_results_)

metrics_cols = [
    "mean_test_roc_auc",
    "mean_test_f1",
    "mean_test_recall",
    "mean_test_precision",
    "mean_test_accuracy"
]

# Five best configurations by mean validation ROC-AUC.
cv_results_summary = cv_results[
    ["params"] + metrics_cols
].sort_values(by="mean_test_roc_auc", ascending=False)

cv_results_summary.head(5)


In [None]:
# Estimator refitted on X_train with the best parameters found.
best_xgb = random_search.best_estimator_

print("Best Parameters:")
print(random_search.best_params_)


In [None]:
# Evaluation on the 20% split. classification_report and roc_auc_score come
# from the import in the random-forest cell.
# NOTE(review): X_test was cut from the SMOTE-resampled frame, so it can
# contain synthetic rows; these scores are not an estimate on untouched data.
y_pred = best_xgb.predict(X_test)
# Probability of the class encoded as 1.
y_prob = best_xgb.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Test ROC-AUC:", roc_auc_score(y_test, y_prob))


In [None]:
RocCurveDisplay.from_predictions(y_test, y_prob)
plt.title("XGBoost ROC Curve (Tuned)")
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# confusion_matrix orders rows/columns by the sorted labels that occur in
# y_test or y_pred; rebuild that order for the tick labels.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

fig, ax = plt.subplots(figsize=(5,4))
im = ax.imshow(cm, cmap="Blues")
ax.set_title("Confusion Matrix")
fig.colorbar(im, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_xticks(range(len(class_labels)))
ax.set_xticklabels(class_labels)
ax.set_yticks(range(len(class_labels)))
ax.set_yticklabels(class_labels)

# Write the count into every cell; switch to white text on dark cells so the
# number stays readable.
half_max = cm.max() / 2
for (row_idx, col_idx), count in np.ndenumerate(cm):
    ax.text(
        col_idx, row_idx, str(count),
        ha="center", va="center",
        color="white" if count > half_max else "black"
    )

plt.show()


In [None]:
# Importances of the tuned model, labelled with the training columns and
# sorted from most to least important.
importances = pd.Series(
    best_xgb.feature_importances_,
    index=X_train.columns
).sort_values(ascending=False)

# The model may have fewer than 15 features (only the selected ones are
# used), so the title reports the number of bars actually drawn.
top_n = min(15, len(importances))

plt.figure(figsize=(10,6))
importances.head(top_n).plot(kind="bar")
plt.title(f"Top {top_n} Feature Importances (XGBoost)")
plt.ylabel("Importance")
plt.show()


In [None]:
# Decision threshold stored with the model artifact. It is set by hand; no
# threshold search is visible in this notebook.
BEST_THRESHOLD = 0.5


In [None]:
import joblib
from datetime import datetime

# Everything needed to score new bookings is bundled in one object: the tuned
# model, the column order it expects, the decision threshold and the fitted
# preprocessing. Without the preprocessing entries a consumer could not
# reproduce the log transform / scaling / label encoding described in "notes".
model_artifact = {
    "model": best_xgb,
    "feature_names": list(X_train.columns),
    "threshold": BEST_THRESHOLD,
    "preprocessing": {
        # label -> integer code tables captured during label encoding
        "label_mappings": mappings,
        # columns that received np.log1p before scaling
        "log1p_cols": list(transform_cols),
        # fitted StandardScaler and the columns (in order) it was fitted on;
        # these need not all be part of "feature_names"
        "scaler": scaler,
        "scaler_cols": list(scale_cols)
    },
    "metadata": {
        "model_type": "XGBoostClassifier",
        "trained_on": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        # best mean cross-validated ROC-AUC found by the randomized search
        "roc_auc_cv_best": random_search.best_score_,
        "notes": "Log transform + StandardScaler applied before training"
    }
}


In [None]:
# Persist the artifact in the working directory with zlib compression level 3.
# Loading it back requires the libraries of the pickled objects to be importable.
joblib.dump(
    model_artifact,
    "xgboost_model_artifact.joblib",
    compress=3
)
