# Task 2 — Predicting Customer Holiday Purchases

This notebook contains a complete, runnable workflow to prepare data, train a Random Forest model, evaluate results, and export outputs for the Forage submission. Edit `DATA_PATH` if your file has a different name, and edit `TARGET` and the feature lists if your dataset uses different column names.

In [None]:
# Cell 1: Imports and plotting defaults
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

import joblib

# Show wide/long frames in full (up to 200 columns and 200 rows) when displayed.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)
print('Imports loaded successfully')

In [None]:
# Cell 2: Load data & quick EDA
# Path of the CSV to analyse -- change this if your file has a different name.
DATA_PATH = "Customer_Booking.csv"

df = pd.read_csv(DATA_PATH)
print(f'Data shape: {df.shape}')

# Preview the first five rows with the notebook's rich table rendering.
display(df.head(5))


In [None]:
# Quick checks (run these if you'd like more information)
# Column dtypes and non-null counts (info() prints directly).
df.info()

# describe() only renders automatically when it is the LAST expression of a
# cell; here it is followed by more statements, so it must be displayed
# explicitly or its output is silently discarded.
display(df.describe(include='all').T)

# The 20 columns with the most missing values, worst first.
print('\nMissing values per column:')
print(df.isna().sum().sort_values(ascending=False).head(20))


In [None]:
# Cell 3: Define target and simple cleaning
TARGET = "BookedHoliday"   # <-- change if your target column has another name
TARGET_LABELS = {'Yes': 1, 'No': 0}  # <-- edit if your target uses other labels

# Keep only rows with target present. This must happen BEFORE the integer
# conversion below: casting a column that still contains NaN to int raises.
df = df[df[TARGET].notna()].reset_index(drop=True)

# Convert target to numeric if needed (covers object, string and categorical
# columns; numeric and boolean targets are left untouched).
if not pd.api.types.is_numeric_dtype(df[TARGET]):
    encoded_target = df[TARGET].map(TARGET_LABELS)
    unmapped = encoded_target.isna()
    if unmapped.any():
        # Fail loudly with the offending labels instead of an opaque cast error.
        unexpected = sorted(df.loc[unmapped, TARGET].astype(str).unique())
        raise ValueError(
            f"Target column {TARGET!r} contains labels not in TARGET_LABELS: {unexpected}"
        )
    df[TARGET] = encoded_target.astype(int)

# Drop obvious ID columns if present (non-mutating; absent columns are ignored,
# so re-running this cell is safe).
df = df.drop(columns=['CustomerID', 'ID', 'BookingID'], errors='ignore')

print('Shape after cleaning:', df.shape)


In [None]:
# Cell 4: Feature selection - update lists to match your data
def _keep_present(candidates):
    """Return the candidate column names that exist in ``df``, in the given order."""
    return [name for name in candidates if name in df.columns]


# Numeric features (edit)
num_cols = _keep_present(['LeadTimeDays', 'Age', 'PastHolidayPurchases'])

# Ordinal features (examples)
ord_cols = _keep_present(['LoyaltyTier'])

# Nominal categorical features
cat_cols = _keep_present(
    ['IncomeBand', 'DestinationType', 'TripType', 'BookingChannel', 'BookingOrigin']
)

for _group_label, _group_cols in (
    ('Numeric:', num_cols),
    ('Ordinal:', ord_cols),
    ('Categorical:', cat_cols),
):
    print(_group_label, _group_cols)

In [None]:
# Cell 5: Preprocessing and pipeline
def _make_one_hot_encoder():
    """Build a dense one-hot encoder that ignores categories unseen during fit.

    scikit-learn renamed the ``sparse`` argument to ``sparse_output`` in 1.2
    and removed ``sparse`` in 1.4, so the new name is tried first and the old
    one is used only when the installed version does not know ``sparse_output``.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Numeric transformer: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Ordinal transformer (define order if applicable)
loyalty_order = ['Bronze', 'Silver', 'Gold', 'Platinum']  # edit to match your dataset
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # With no ordinal columns selected there is nothing to encode.
    ('ord', OrdinalEncoder(categories=[loyalty_order]) if ord_cols else 'passthrough')
])

# Categorical transformer: fill gaps with the mode, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', _make_one_hot_encoder())
])

# Columns not listed in any group are dropped (remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('ord', ordinal_transformer, ord_cols),
    ('cat', categorical_transformer, cat_cols)
], remainder='drop', verbose_feature_names_out=False)

# Full model: preprocessing followed by a seeded Random Forest.
clf = Pipeline(steps=[
    ('preproc', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

print('Pipeline created')

In [None]:
# Cell 6: Train/test split and baseline training
feature_cols = num_cols + ord_cols + cat_cols
X = df[feature_cols]
y = df[TARGET]

# Stratified 80/20 split with a fixed seed so results are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Second probability column, when the estimator exposes probabilities.
if hasattr(clf.named_steps['rf'], 'predict_proba'):
    y_proba = clf.predict_proba(X_test)[:, 1]
else:
    y_proba = None

print('Accuracy:', accuracy_score(y_test, y_pred))
for _metric_label, _metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(_metric_label, _metric_fn(y_test, y_pred, zero_division=0))
print('\nClassification report:\n', classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Cell 7: Confusion matrix and ROC curve
# Fix the label order explicitly so matrix rows/columns and tick labels always
# agree, whatever values the target takes.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
tick_positions = np.arange(len(class_labels))

fig, ax = plt.subplots(figsize=(5, 4))
image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title("Confusion Matrix")
fig.colorbar(image, ax=ax)
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# White text is only readable on the dark (high-count) cells of the Blues
# colormap; use black on the light cells so every count stays visible.
contrast_threshold = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        text_color = "white" if cm[i, j] > contrast_threshold else "black"
        ax.text(j, i, cm[i, j], ha="center", va="center", color=text_color)
plt.show()

if y_proba is not None:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
    # Diagonal reference line: the ROC of a random classifier.
    ax.plot([0, 1], [0, 1], '--', color='grey')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend()
    plt.show()

In [None]:
# Cell 8: Feature importance (interpretable)
preproc = clf.named_steps['preproc']
rf = clf.named_steps['rf']

# Names are listed in the same group order the ColumnTransformer was built
# with: numeric, ordinal, then the expanded one-hot columns.
feat_names = [*num_cols, *ord_cols]
if cat_cols:
    ohe = preproc.named_transformers_['cat'].named_steps['ohe']
    feat_names.extend(ohe.get_feature_names_out(cat_cols).tolist())

importances = rf.feature_importances_
fi = (
    pd.DataFrame({'feature': feat_names, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
display(fi.head(20))

# Reverse the top 12 so the most important feature is drawn at the top.
top_for_plot = fi.head(12).iloc[::-1]
plt.figure(figsize=(8, 6))
plt.barh(top_for_plot['feature'], top_for_plot['importance'])
plt.title('Top feature importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

In [None]:
# Cell 9: Save model and export predictions
model_file = 'rf_pipeline_task2.joblib'
predictions_file = 'task2_test_predictions.csv'

# Persist the whole fitted pipeline (preprocessing + model) in one artifact.
joblib.dump(clf, model_file)

# Test features alongside the true label, the predicted label and, when
# available, the predicted probability.
preds = X_test.assign(actual=y_test.values, pred=y_pred)
if y_proba is not None:
    preds['probability'] = y_proba
preds.to_csv(predictions_file, index=False)

print(f'Saved {model_file} and {predictions_file}')

In [None]:
# Cell 10: Build a concise summary for PowerPoint / Forage submission
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
top_features = fi['feature'].head(5).tolist()

# Always emit five ranked lines; ranks without a feature are left blank.
_ranked_names = top_features + [''] * (5 - len(top_features))
_ranked_lines = [f'{rank}) {name}' for rank, name in enumerate(_ranked_names, start=1)]

# The leading and trailing empty entries produce the opening and closing newlines.
summary_text = '\n'.join([
    '',
    'Model: Random Forest',
    f'Accuracy: {acc:.3f}',
    f'Precision: {prec:.3f}',
    f'Recall: {rec:.3f}',
    '',
    'Top features:',
    *_ranked_lines,
    '',
])

print(summary_text)
with open('task2_summary.txt', 'w') as f:
    f.write(summary_text)
print('Summary written to task2_summary.txt')

## Notes & Next Steps

- If `BookedHoliday` is imbalanced, pass `class_weight='balanced'` to `RandomForestClassifier` or try resampling (e.g. SMOTE from the separate `imbalanced-learn` package).
- For better interpretability consider using SHAP (`pip install shap`) and generating local explanations.
- To tune the model, add a `GridSearchCV` cell (see the optional snippet below).

### Optional: GridSearch snippet

```python
param_grid = {'rf__n_estimators':[100,200], 'rf__max_depth':[None,10,20]}
search = GridSearchCV(clf, param_grid, cv=3, scoring='f1', n_jobs=-1)
search.fit(X_train, y_train)
print(search.best_params_)
```
