# CIS 9660 – Data Mining Project #2 (Q1)
**Hotel Booking Cancellation Classifier – Colab Notebook**

This notebook loads the dataset, visualizes it, preprocesses it, trains multiple classifiers with 5-fold cross-validation, evaluates them on the held-out 30% of a 70/30 split, plots a K-Means elbow curve, and exports the deployment artifacts (model, scaler, feature columns). It also writes a minimal **Streamlit** app (`app.py`) and a `requirements.txt` so you can deploy the model.

In [None]:

# (Optional) Save artifacts to Drive
# from google.colab import drive
# drive.mount('/content/drive')


## 1) Imports & Globals

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401

import joblib
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including scikit-learn convergence/deprecation warnings — consider narrowing
# the filter to specific categories.
warnings.filterwarnings("ignore")
# Global seaborn theme applied to all figures drawn below.
sns.set(style="whitegrid", font_scale=1.1)

# Single seed passed as random_state to the train/test split, the seeded
# classifiers and K-Means so that runs are repeatable.
RANDOM_STATE = 42


## 2) Load Dataset

In [None]:

# Hotel Booking Demand (new dataset; not reused)
# The CSV is read straight from the TidyTuesday GitHub repository, so this
# cell needs network access every time it runs.
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv"
df_raw = pd.read_csv(url)
# Keep df_raw untouched; df is the working copy used by the cells below.
df = df_raw.copy()

# Quick sanity check: dimensions, then the first rows as the cell output.
print("Rows, Cols:", df.shape)
df.head()


## 3) Data Visualizations

In [None]:

# One 15x10 canvas laid out as a 2x2 grid; only three of the four slots are
# populated, the bottom-right one is intentionally left empty.
fig = plt.figure(figsize=(15, 10))

# Viz 1: mean of is_canceled (i.e. the cancellation rate) per hotel type
ax_rate = fig.add_subplot(2, 2, 1)
sns.barplot(data=df, x='hotel', y='is_canceled', ax=ax_rate)
ax_rate.set_title('Cancellation Rate by Hotel Type')
ax_rate.set_xlabel('Hotel Type')
ax_rate.set_ylabel('Cancellation Rate')

# Viz 2: distribution of lead time, split by cancellation status
ax_lead = fig.add_subplot(2, 2, 2)
sns.boxplot(data=df, x='is_canceled', y='lead_time', ax=ax_lead)
ax_lead.set_title('Lead Time by Cancellation Status')
ax_lead.set_xlabel('Is Canceled (0/1)')
ax_lead.set_ylabel('Lead Time (days)')

# Viz 3: the ten most frequent values of the country column
ax_country = fig.add_subplot(2, 2, 3)
country_counts = df['country'].value_counts().head(10)
country_counts.plot(kind='bar', ax=ax_country)
ax_country.set_title('Top 10 Countries by Booking Volume')
ax_country.set_xlabel('Country')
ax_country.set_ylabel('Count')

fig.tight_layout()
fig.savefig('visualizations.png', dpi=150, bbox_inches='tight')
plt.show()


## 4) Preprocessing (features, cleaning, encoding, split, scale)

In [None]:

# Selected features: the model inputs, and the binary label to predict.
FEATURES = [
    'hotel', 'lead_time', 'arrival_date_month', 'stays_in_weekend_nights',
    'stays_in_week_nights', 'adults', 'children', 'meal', 'country',
    'market_segment', 'deposit_type', 'customer_type', 'adr'
]
TARGET = 'is_canceled'

# The subset of FEATURES that is one-hot encoded.
CATEGORICAL_FEATURES = [
    'hotel', 'arrival_date_month', 'meal', 'country',
    'market_segment', 'deposit_type', 'customer_type'
]
# How many of the most frequent countries keep their own dummy column.
N_TOP_COUNTRIES = 20

# Basic cleaning.
# Always restart from the untouched download: the previous version mutated
# `df` in place, so re-running this cell recomputed the top-N countries on data
# that already contained 'OTHER' — 'OTHER' then took one of the N slots and a
# real country was silently demoted, changing the feature columns.
df = df_raw.copy()
df['children'] = df['children'].fillna(0)
df = df.dropna(subset=['adr']).copy()

# Limit country cardinality for practical dummy size: everything outside the
# N most frequent countries (including missing values) becomes 'OTHER'.
top_countries = df['country'].value_counts().nlargest(N_TOP_COUNTRIES).index
df['country'] = np.where(df['country'].isin(top_countries), df['country'], 'OTHER')

# One-hot encode; drop_first removes one (baseline) level per categorical.
work = pd.get_dummies(
    df[FEATURES + [TARGET]],
    columns=CATEGORICAL_FEATURES,
    drop_first=True
)

n_missing = int(work.isna().sum().sum())
print("Missing values after preprocessing:", n_missing)
assert n_missing == 0, "unexpected missing values left after preprocessing"

# Train/test split 70/30, stratified on the label so both parts keep the same
# cancellation rate.
X = work.drop(columns=[TARGET])
y = work[TARGET].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y
)
assert len(X_train) + len(X_test) == len(X)

# Standardize: with_mean=False only divides each column by its standard
# deviation (no centering), so absent dummies stay exactly 0.
# The input is a dense DataFrame, therefore both outputs are dense NumPy
# arrays — they have no `.toarray()` method.
# The scaler is fitted on the training part only to avoid test-set leakage.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X.shape, X_train.shape, X_test.shape


## 5) Models + 5-Fold Cross-Validation

In [None]:

# Candidate classifiers, keyed by the display name that is reused in the
# printouts, the comparison plots and the confusion-matrix file names.
# Every estimator constructed with a seed receives RANDOM_STATE.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
    # NOTE(review): a linear-kernel SVC with probability=True is likely to be
    # by far the slowest entry on a training set of this size — confirm the
    # runtime is acceptable or consider a linear-specific estimator.
    "SVM": SVC(probability=True, kernel='linear', random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=200, random_state=RANDOM_STATE)
}

def cv_mean_accuracy(model, Xs, y):
    """Return the mean 5-fold cross-validation score of `model` on (Xs, y).

    Parameters
    ----------
    model : scikit-learn estimator (unfitted; cross_val_score clones it).
    Xs : feature matrix, either a dense array or a SciPy sparse matrix.
    y : target labels aligned with the rows of `Xs`.

    Returns
    -------
    float
        Mean of the five fold scores, using the estimator's default scorer
        (accuracy for the classifiers used in this notebook).

    Raises
    ------
    Exception
        Any error raised while fitting/scoring a fold is propagated
        (`error_score='raise'`) instead of being turned into a NaN score.
    """
    # GaussianNB requires dense arrays. Only densify when the matrix really is
    # sparse: the scaled training matrix is already a NumPy ndarray, and
    # calling `.toarray()` on an ndarray raises AttributeError.
    if isinstance(model, GaussianNB) and hasattr(Xs, "toarray"):
        Xs = Xs.toarray()
    return cross_val_score(model, Xs, y, cv=5, n_jobs=-1, error_score='raise').mean()

# Mean 5-fold CV accuracy per model name. A model whose CV run fails is
# recorded as NaN (not 0.0) so that a failure cannot be mistaken for a model
# that genuinely scored zero; it then shows up as an empty bar in the chart.
cv_results = {}
for name, model in models.items():
    print(f"Running 5-fold CV: {name}")
    try:
        acc = cv_mean_accuracy(model, X_train_scaled, y_train)
    except Exception as e:
        # Best effort: report the failure and keep going with the other models.
        print(f"{name} CV failed: {e}")
        acc = np.nan
    else:
        print(f" -> Mean CV Accuracy: {acc:.3f}")
    cv_results[name] = acc

# Horizontal bar chart, lowest score at the bottom (NaN entries sort last).
plt.figure(figsize=(10,6))
pd.Series(cv_results).sort_values().plot(kind='barh')
plt.title('5-Fold Cross Validation Accuracy')
plt.xlabel('Accuracy')
plt.tight_layout()
plt.savefig('cv_results.png', dpi=150, bbox_inches='tight')
plt.show()


## 6) Test Evaluation + Metrics Plot + Confusion Matrices

In [None]:

# Per-model test-set scores, collected column-wise for the summary DataFrame.
metrics = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1': []}

best_name, best_model_obj, best_f1 = None, None, -1

# GaussianNB needs dense input. The scaled matrices are already dense NumPy
# arrays (which have no `.toarray()`), so only densify if they are sparse.
# Done once here instead of on every loop iteration.
X_train_dense = X_train_scaled.toarray() if hasattr(X_train_scaled, "toarray") else X_train_scaled
X_test_dense = X_test_scaled.toarray() if hasattr(X_test_scaled, "toarray") else X_test_scaled

for name, model in models.items():
    # Fit & predict
    use_dense = isinstance(model, GaussianNB)
    model.fit(X_train_dense if use_dense else X_train_scaled, y_train)
    y_pred = model.predict(X_test_dense if use_dense else X_test_scaled)

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    weighted = report['weighted avg']
    metrics['Model'].append(name)
    metrics['Accuracy'].append(report['accuracy'])
    metrics['Precision'].append(weighted['precision'])
    metrics['Recall'].append(weighted['recall'])
    metrics['F1'].append(weighted['f1-score'])

    # Track best by weighted F1.
    # NOTE(review): the winner is picked on the test split, so its reported
    # test score is optimistically biased — consider selecting on the
    # cross-validation results instead.
    if weighted['f1-score'] > best_f1:
        best_name, best_model_obj, best_f1 = name, model, weighted['f1-score']

    # Confusion matrix, one figure (and one PNG) per model
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.tight_layout()
    plt.savefig(f'cm_{name.replace(" ", "_").lower()}.png', dpi=150, bbox_inches='tight')
    plt.show()

# Grouped bar chart comparing the four metrics across models.
metrics_df = pd.DataFrame(metrics).set_index('Model')
ax = metrics_df[['Accuracy','Precision','Recall','F1']].plot(kind='bar', figsize=(12,6))
ax.set_title('Model Performance Comparison (Test Set)')
ax.set_ylabel('Score')
ax.set_ylim(0, 1.0)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('model_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"Best model by weighted F1: {best_name} ({best_f1:.3f})")
metrics_df.sort_values('F1', ascending=False)


## 7) K-Means Elbow Plot (+ optional KMeans-as-classifier)

In [None]:

# Elbow: fit K-Means for K = 1..10 on the scaled training features and record
# each fitted model's inertia_.
k_values = range(1, 11)
inertia = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init="auto").fit(X_train_scaled)
    inertia.append(km.inertia_)

# Inertia against K; the "elbow" is where the curve starts to flatten.
fig_elbow, ax_elbow = plt.subplots(figsize=(8, 5))
ax_elbow.plot(k_values, inertia, marker='o')
ax_elbow.set_title('Elbow Method for Optimal K (K-Means)')
ax_elbow.set_xlabel('Number of clusters (K)')
ax_elbow.set_ylabel('Inertia (WCSS)')
fig_elbow.tight_layout()
fig_elbow.savefig('elbow_plot.png', dpi=150, bbox_inches='tight')
plt.show()

# Optional: K-Means as "classifier" by majority vote (binary target).
# Each of the two clusters is labelled with the most frequent training label
# among its members; test rows then inherit the label of their cluster.
# The step is best effort: a failure is reported and the notebook carries on.
try:
    km = KMeans(n_clusters=2, random_state=RANDOM_STATE, n_init="auto")
    km.fit(X_train_scaled)
    train_clusters = km.predict(X_train_scaled)

    # Positional NumPy view of the labels, so the boolean cluster masks below
    # do not depend on the pandas index left over from the train/test split.
    y_train_arr = np.asarray(y_train)

    # np.bincount(...).argmax() is the majority label of non-negative integer
    # labels (ties go to the smaller label, like scipy.stats.mode) and avoids
    # SciPy's version-dependent `keepdims` argument.
    fallback_label = np.bincount(y_train_arr).argmax()
    mapping = {
        c: np.bincount(y_train_arr[train_clusters == c]).argmax()
        for c in np.unique(train_clusters)
    }

    test_clusters = km.predict(X_test_scaled)
    # A cluster without any training member has no entry in `mapping`; fall
    # back to the overall majority label instead of raising KeyError.
    y_pred_km = np.array([mapping.get(c, fallback_label) for c in test_clusters])
    km_acc = (y_pred_km == np.asarray(y_test)).mean()
    print(f'K-Means-as-classifier Accuracy: {km_acc:.3f}')
except Exception as e:
    print("K-Means classifier step skipped:", e)


## 8) Save Deploy Artifacts

In [None]:

# The three files the Streamlit app loads at start-up: the winning fitted
# model, the fitted scaler, and the exact training column order.
deploy_artifacts = {
    'hotel_cancellation_model.pkl': best_model_obj,
    'scaler.pkl': scaler,
    'feature_columns.pkl': X.columns.tolist(),
}
for artifact_path, artifact in deploy_artifacts.items():
    joblib.dump(artifact, artifact_path)
print('Saved: hotel_cancellation_model.pkl, scaler.pkl, feature_columns.pkl')


## 9) Write `app.py` and `requirements.txt` for Streamlit Deployment

In [None]:

import sklearn  # local import: only needed to pin the training-time version below

# Source of the generated Streamlit app.
# `__TOP_COUNTRIES__` is a placeholder that is replaced further down with the
# literal list of countries kept during training. It cannot be recovered from
# the saved feature columns alone, because one-hot encoding with
# drop_first=True removed the baseline country's column.
app_code = '''import streamlit as st
import pandas as pd
import numpy as np
import joblib

st.set_page_config(page_title="Hotel Cancellation Classifier", layout="wide")
st.title("Hotel Cancellation Classifier")

st.markdown("""
Upload a CSV of **new booking records**. The app will:
1) apply the same preprocessing used in training,
2) scale features, and
3) predict cancellation (and probability when supported).
""")

# Load artifacts
model = joblib.load('hotel_cancellation_model.pkl')
scaler = joblib.load('scaler.pkl')
feature_columns = joblib.load('feature_columns.pkl')

TRAIN_FEATURES = [
    'hotel', 'lead_time', 'arrival_date_month', 'stays_in_weekend_nights',
    'stays_in_week_nights', 'adults', 'children', 'meal', 'country',
    'market_segment', 'deposit_type', 'customer_type', 'adr'
]
NUMERIC_FEATURES = [
    'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'children', 'adr'
]
CATEGORICAL_FEATURES = [
    'hotel', 'arrival_date_month', 'meal', 'country',
    'market_segment', 'deposit_type', 'customer_type'
]

# Countries that kept their own category during training (written into this
# file by the notebook); every other value is bucketed as 'OTHER'.
TOP_COUNTRIES = __TOP_COUNTRIES__

def preprocess(df_in: pd.DataFrame) -> pd.DataFrame:
    """Turn raw booking rows into the exact feature matrix used for training.

    Rows with a missing 'adr' are dropped, as in training. The returned frame
    keeps the index of the surviving input rows so that predictions can be
    joined back onto the uploaded data.
    """
    df = df_in.copy()

    # Basic cleaning mirroring training
    if 'children' in df.columns:
        df['children'] = df['children'].fillna(0)
    if 'adr' in df.columns:
        df = df.dropna(subset=['adr'])

    # Reduce country cardinality
    if 'country' in df.columns:
        df['country'] = np.where(df['country'].isin(TOP_COUNTRIES), df['country'], 'OTHER')

    # Ensure all required features exist (fill sensible defaults)
    for c in TRAIN_FEATURES:
        if c in df.columns:
            continue
        if c in NUMERIC_FEATURES:
            df[c] = 0
        elif c == 'country':
            df[c] = 'OTHER'
        else:
            df[c] = 'Undefined'

    df = df[TRAIN_FEATURES]

    # One-hot encode every level that is present. Do NOT use drop_first here:
    # it would drop whichever level sorts first in the *uploaded* file, which
    # need not be the level dropped during training. The reindex below removes
    # the training baseline columns and zero-fills levels that are absent.
    df_dum = pd.get_dummies(df, columns=CATEGORICAL_FEATURES)

    # Align columns with training columns
    X_new = df_dum.reindex(columns=feature_columns, fill_value=0)
    return X_new

uploaded = st.file_uploader("Upload CSV", type=["csv"])
if uploaded is not None:
    new_df = pd.read_csv(uploaded)

    st.subheader("Preview of Uploaded Data")
    st.dataframe(new_df.head())

    try:
        X_new = preprocess(new_df)

        n_skipped = len(new_df) - len(X_new)
        if n_skipped:
            st.warning(f"Skipped {n_skipped} row(s) with a missing adr value.")

        if X_new.empty:
            st.warning("No rows left to score after preprocessing.")
        else:
            try:
                X_new_scaled = scaler.transform(X_new)
            except Exception:
                X_new_scaled = scaler.transform(X_new.values)

            # Only the rows that survived preprocessing receive a prediction;
            # selecting them by index keeps predictions aligned with the input.
            out = new_df.loc[X_new.index].copy()
            out['pred_is_canceled'] = model.predict(X_new_scaled)
            if hasattr(model, "predict_proba"):
                out['prob_canceled'] = model.predict_proba(X_new_scaled)[:, 1]

            st.subheader("Predictions")
            st.dataframe(out.head(50))

            st.download_button("Download Predictions CSV",
                               data=out.to_csv(index=False).encode('utf-8'),
                               file_name="predictions.csv",
                               mime="text/csv")
    except Exception as e:
        st.error(f"Error during preprocessing/prediction: {e}")
else:
    st.info("Please upload a CSV to get predictions.")
'''

# Bake the training-time country list into the generated source.
app_code = app_code.replace(
    "__TOP_COUNTRIES__", repr(sorted(str(c) for c in top_countries))
)

with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

# The app unpickles a scikit-learn estimator, which is only supported with the
# scikit-learn version that created it, so pin the version used for training.
requirements = [
    "streamlit", "pandas", "numpy", f"scikit-learn=={sklearn.__version__}",
    "joblib", "matplotlib", "seaborn", "scipy",
]
with open('requirements.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(requirements) + "\n")

print("Wrote app.py and requirements.txt")
