In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

In [None]:
# Read the cleaned event-level data and derive, in one chained step:
#   converted           -> 1 where event_name is 'purchase', otherwise 0 (the target)
#   user_activity_count -> number of rows sharing the row's user_pseudo_id
#   item_popularity     -> number of rows sharing the row's item_id
# NOTE(review): both counts are taken over the whole file, i.e. before the
# train/test split performed later in the notebook — confirm that is acceptable.
df = pd.read_csv("cleaned_behavior_features.csv").assign(
    converted=lambda frame: frame['event_name'].eq('purchase').astype(int),
    user_activity_count=lambda frame: frame['user_pseudo_id'].map(
        frame['user_pseudo_id'].value_counts()
    ),
    item_popularity=lambda frame: frame['item_id'].map(
        frame['item_id'].value_counts()
    ),
)

In [None]:
# Drop unused columns
# Identifiers, raw event/date fields and location fields are removed once the
# derived columns exist. The frame is rebound instead of mutated in place, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises KeyError.
df = df.drop(
    columns=[
        'user_pseudo_id', 'item_id', 'item_name', 'event_name',
        'discounted_price', 'event_date', 'region', 'city', 'country',
    ],
    errors='ignore',
)

In [None]:
# Name of the binary label column and the model inputs, grouped by theme.
target = 'converted'
features = [
    # pricing / product / marketing
    'original_price',
    'discount_percent',
    'item_category',
    'campaign_type',
    'channel',
    # timing
    'hour_of_day',
    'day_of_week',
    'days_since_first_event',
    # per user-product history
    # NOTE(review): 'user_product_purchase_count' may encode the purchase that
    # the target describes — verify how it was built upstream.
    'user_product_view_count',
    'user_product_purchase_count',
    'user_product_interaction_count',
    # frequency counts derived from the id columns
    'user_activity_count',
    'item_popularity',
]

# Design matrix and label vector taken from the prepared frame.
X, y = df[features], df[target]

In [None]:
# Columns that get one-hot encoded; every remaining feature is treated as
# numeric, keeping the order in which it appears in `features`.
categorical_cols = [
    'item_category',
    'campaign_type',
    'channel',
    'day_of_week',
]
numerical_cols = [
    name
    for name in features
    if name not in categorical_cols
]

In [None]:
# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. handle_unknown='ignore' keeps transform from failing on
# a category that was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

In [None]:
# Train/test split
# 80/20 hold-out split, stratified on y so both parts keep the same class
# ratio; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# ----------------------- #
# RANDOM FOREST TUNING
# ----------------------- #
# Pipeline: preprocess -> SMOTE oversampling -> random forest. Using the
# imblearn pipeline means the sampler takes part in fit only, not in predict.
rf_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Candidate values; the "classifier__" prefix routes each setting to the
# pipeline's final step.
# NOTE(review): SMOTE already rebalances the training folds, so
# class_weight='balanced' probably adds little on top — confirm.
rf_params = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[5, 10, 20, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__max_features=['sqrt', 'log2'],
    classifier__class_weight=['balanced'],
)

# 20 random draws from the grid, each scored by F1 with 3-fold CV.
rf_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_params,
    n_iter=20,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

rf_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Score the best random-forest pipeline on the held-out test split.
rf_best = rf_search.best_estimator_
rf_prob = rf_best.predict_proba(X_test)[:, 1]  # second probability column, taken as class 1
rf_pred = rf_best.predict(X_test)

# Each header and its payload go through one print call, separated by a newline.
print("\n🎯 Best Parameters - Random Forest:", rf_search.best_params_, sep="\n")
print(
    "\n📊 Random Forest Classification Report:",
    classification_report(y_test, rf_pred),
    sep="\n",
)
print(f"🔢 ROC-AUC Score: {roc_auc_score(y_test, rf_prob):.4f}")


🎯 Best Parameters - Random Forest:
{'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 5, 'classifier__class_weight': 'balanced'}

📊 Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      9198
           1       0.20      0.47      0.28       802

    accuracy                           0.81     10000
   macro avg       0.58      0.65      0.59     10000
weighted avg       0.89      0.81      0.84     10000

🔢 ROC-AUC Score: 0.6626


In [None]:
# ----------------------- #
# XGBOOST TUNING
# ----------------------- #
# Negative/positive ratio measured on the training split only, so the held-out
# rows do not influence any hyperparameter. Cast to a plain float so that
# best_params_ prints without the np.float64(...) wrapper.
neg_pos_ratio = float((y_train == 0).sum() / (y_train == 1).sum())

xgb_params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7, 10],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__subsample': [0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.7, 0.8, 1.0],
    # SMOTE in the pipeline already oversamples the minority class, so applying
    # the raw class ratio as well corrects for imbalance twice. Offer 1.0 (no
    # extra weighting) next to the ratio and let cross-validation choose.
    'classifier__scale_pos_weight': [1.0, neg_pos_ratio],
}

# Pipeline: preprocess -> SMOTE oversampling -> XGBoost.
# use_label_encoder is no longer passed: the installed XGBoost reports it as
# "not used", so it only produced a warning.
xgb_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))
])

# 20 random draws from the grid, each scored by F1 with 3-fold CV.
xgb_search = RandomizedSearchCV(
    xgb_pipeline, xgb_params, n_iter=20, scoring='f1', cv=3, verbose=1, n_jobs=-1, random_state=42
)

xgb_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Score the best XGBoost pipeline on the held-out test split.
xgb_best = xgb_search.best_estimator_
xgb_prob = xgb_best.predict_proba(X_test)[:, 1]  # second probability column, taken as class 1
xgb_pred = xgb_best.predict(X_test)

# Each header and its payload go through one print call, separated by a newline.
print("\n🎯 Best Parameters - XGBoost:", xgb_search.best_params_, sep="\n")
print(
    "\n📊 XGBoost Classification Report:",
    classification_report(y_test, xgb_pred),
    sep="\n",
)
print(f"🔢 ROC-AUC Score: {roc_auc_score(y_test, xgb_prob):.4f}")


🎯 Best Parameters - XGBoost:
{'classifier__subsample': 0.8, 'classifier__scale_pos_weight': np.float64(11.4750499001996), 'classifier__n_estimators': 300, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.1, 'classifier__colsample_bytree': 0.8}

📊 XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.77      0.85      9198
           1       0.16      0.49      0.24       802

    accuracy                           0.75     10000
   macro avg       0.55      0.63      0.54     10000
weighted avg       0.88      0.75      0.80     10000

🔢 ROC-AUC Score: 0.6494
