In [4]:
# %%
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# =============================
# LOAD DATA
# =============================
train_path = 'Quentin/dsba-m-1-challenge-purchase-prediction/train_dataset_M1_with_id.csv'
df = pd.read_csv(train_path)

# =============================
# DROP UNUSED/LEAKING COLUMNS
# =============================
drop_cols = ['id', 'Session_ID', 'Items_In_Cart']  # already decided
for c in drop_cols:
    if c in df.columns:
        df.drop(columns=c, inplace=True)

# Target and day
target = 'Purchase'

# =============================
# FEATURE ENGINEERING
# =============================

# Effective price
df['Effective_Price'] = df['Price'] - df['Discount']

# Discount amount (same as Discount but keep for clarity)
df['Discount_Amount'] = df['Discount']

# Has discount (binary)
df['Has_Discount'] = (df['Discount'] > 0).astype(int)

# High discount (binary: >20% of Price)
df['High_Discount'] = ((df['Discount'] / df['Price']) > 0.2).astype(int)

# Price category
def price_category(p):
    if p > 400: return 'Premium'
    elif p > 200: return 'High'
    else: return 'Low'
df['Price_Category'] = df['Price'].apply(price_category)

# =============================
# FINAL FEATURES LIST
# =============================
final_features = [
    'Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category',
    'Time_of_Day', 'Email_Interaction', 'Device_Type', 'Payment_Method', 'Referral_Source',
    'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine', 'PM_RS_Combo',
    'Campaign_Period', 'Effective_Price', 'Discount_Amount',
    'Has_Discount', 'High_Discount', 'Price_Category'
]

df = df[[*final_features, target, 'Day']].copy()  # include Day for potential future split

# =============================
# PREPROCESSOR CLASS
# =============================
class Preprocessor:
    def __init__(self, cat_cols, num_cols, n_neighbors=5):
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.n_neighbors = n_neighbors
        self.cat_imputer = None
        self.ohe = None
        self.knn = None
        self.scaler = None
        self.ohe_cols = None

    def fit(self, X):
        # Categorical imputer
        self.cat_imputer = SimpleImputer(strategy='most_frequent')
        X_cat = self.cat_imputer.fit_transform(X[self.cat_cols])

        # One-hot encode
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        arr = self.ohe.fit_transform(X_cat)
        self.ohe_cols = list(self.ohe.get_feature_names_out(self.cat_cols))

        # KNN imputer (numeric + OHE combined)
        X_num = X[self.num_cols].copy()
        X_knn_fit = pd.concat([pd.DataFrame(arr, index=X.index, columns=self.ohe_cols), X_num], axis=1)
        self.knn = KNNImputer(n_neighbors=self.n_neighbors, weights='distance')
        self.knn.fit(X_knn_fit)

        # Fit scaler
        X_knn_imp = pd.DataFrame(self.knn.transform(X_knn_fit), columns=X_knn_fit.columns, index=X.index)
        self.scaler = MinMaxScaler()
        self.scaler.fit(X_knn_imp)
        return self

    def transform(self, X):
        X = X.copy()
        X_cat = self.cat_imputer.transform(X[self.cat_cols])
        arr = self.ohe.transform(X_cat)
        df_ohe = pd.DataFrame(arr, index=X.index, columns=self.ohe_cols)
        X_num = X[self.num_cols].copy()
        X_knn = pd.concat([df_ohe, X_num], axis=1)

        # Ensure all OHE columns are present
        for c in [c for c in self.ohe_cols if c not in X_knn.columns]:
            X_knn[c] = 0.0

        X_knn = X_knn[self.ohe_cols + self.num_cols]  # enforce column order
        X_imp = pd.DataFrame(self.knn.transform(X_knn), columns=X_knn.columns, index=X.index)
        X_scaled = pd.DataFrame(self.scaler.transform(X_imp), columns=X_imp.columns, index=X.index)
        return X_scaled

# =============================
# DEFINE CATEGORICAL & NUMERIC
# =============================
cat_cols = [c for c in final_features if df[c].dtype == 'object' or str(df[c].dtype).startswith('category')]
num_cols = [c for c in final_features if c not in cat_cols]

print("Categorical cols:", cat_cols)
print("Numeric cols:", num_cols)

# =============================
# PREPROCESS DATA
# =============================
X = df[final_features].copy()
y = df[target].copy()

PP = Preprocessor(cat_cols=cat_cols, num_cols=num_cols, n_neighbors=5)
PP.fit(X)
X_prep = PP.transform(X)

print("Preprocessing done. Shape:", X_prep.shape)

# =============================
# RANDOM FOREST MODEL
# =============================
rf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc_scores = cross_val_score(rf, X_prep, y, cv=cv, scoring='accuracy')
f1_scores = cross_val_score(rf, X_prep, y, cv=cv, scoring='f1')
roc_scores = cross_val_score(rf, X_prep, y, cv=cv, scoring='roc_auc')

print(f"Random Forest 5-fold CV Accuracy: {acc_scores.mean():.4f} ± {acc_scores.std():.4f}")
print(f"Random Forest 5-fold CV F1-score: {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f"Random Forest 5-fold CV ROC-AUC: {roc_scores.mean():.4f} ± {roc_scores.std():.4f}")

# Train on full data for feature importance
rf.fit(X_prep, y)
feature_importances = pd.DataFrame({
    'feature': X_prep.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importances.head(20))


Categorical cols: ['Time_of_Day', 'Device_Type', 'Payment_Method', 'Referral_Source', 'PM_RS_Combo', 'Campaign_Period', 'Price_Category']
Numeric cols: ['Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Email_Interaction', 'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine', 'Effective_Price', 'Discount_Amount', 'Has_Discount', 'High_Discount']
Preprocessing done. Shape: (13735, 180)
Random Forest 5-fold CV Accuracy: 0.6799 ± 0.0078
Random Forest 5-fold CV F1-score: 0.4056 ± 0.0110
Random Forest 5-fold CV ROC-AUC: 0.7177 ± 0.0142
                        feature  importance
173            Engagement_Score    0.096807
168                       Price    0.082525
176             Effective_Price    0.080150
175                  Price_Sine    0.060202
167                Reviews_Read    0.059477
172  Socioeconomic_Status_Score    0.058753
165                         Age    0.049375
177             Discount_Amount    0.048468
169                    Disc