In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
# Load the behavior-feature table and derive the binary target:
# `converted` is 1 for rows whose event is a purchase and 0 otherwise.
df = pd.read_csv("cleaned_behavior_features.csv")
is_purchase = df['event_name'].eq('purchase')
df['converted'] = is_purchase.astype(int)

In [None]:
# Keep handles on the user and item identifier columns so they can be
# attached to the predictions after these columns are dropped from df.
user_ids, item_ids = df['user_pseudo_id'], df['item_id']

In [None]:
# Frequency-encode the identifiers: every row receives the number of rows
# in df that share its user id / item id.
# NOTE(review): the counts are taken over the whole frame, before the
# train/test split, so test rows contribute to these features - confirm
# this is acceptable.
user_row_counts = df['user_pseudo_id'].value_counts()
item_row_counts = df['item_id'].value_counts()
df['user_activity_count'] = df['user_pseudo_id'].map(user_row_counts)
df['item_popularity'] = df['item_id'].map(item_row_counts)

In [None]:
# Remove the identifiers, the raw event label (the target was derived from
# it), and the remaining columns that are not used as model inputs.
unused_columns = [
    'user_pseudo_id',
    'item_id',
    'item_name',
    'event_name',
    'discounted_price',
    'event_date',
    'region',
    'city',
    'country',
]
df = df.drop(columns=unused_columns)

In [None]:
# Model inputs and target. The order of `features` fixes the column order
# of X, so it is kept exactly as listed.
features = [
    'original_price',
    'discount_percent',
    'item_category',
    'campaign_type',
    'channel',
    'hour_of_day',
    'day_of_week',
    'days_since_first_event',
    'user_product_view_count',
    'user_product_purchase_count',
    'user_product_interaction_count',
    # frequency-encoded identifier counts
    'user_activity_count',
    'item_popularity',
]
target = 'converted'
X, y = df[features], df[target]

In [None]:
# Partition the feature names by preprocessing treatment: the columns listed
# here are handled as categorical, every other feature as numerical.
categorical_cols = [
    'item_category',
    'campaign_type',
    'channel',
    'day_of_week',
]
categorical_lookup = frozenset(categorical_cols)
numerical_cols = [name for name in features if name not in categorical_lookup]

In [None]:
# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Stratified 80/20 train-test split with a fixed seed.
# NOTE(review): the following cell repeats this split with the same
# settings (additionally carrying the user/item ids) and rebinds all four
# names, so this call looks redundant.
split_options = {'stratify': y, 'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Stratified 80/20 split (fixed seed) that carries the user and item ids
# along, so each test row can be matched back to its identifiers.
(
    X_train, X_test,
    y_train, y_test,
    user_train, user_test,
    item_train, item_test,
) = train_test_split(
    X, y, user_ids, item_ids,
    stratify=y, test_size=0.2, random_state=42,
)


In [None]:
# XGBoost pipeline with the selected ("best") hyperparameters:
# preprocessing -> SMOTE oversampling -> classifier.
#
# The negative/positive class ratio is taken from the training labels only,
# so no test-set information feeds the model configuration.
# `use_label_encoder` is no longer passed: the installed XGBoost reports it
# as an unused parameter.
# NOTE(review): SMOTE already rebalances the classes the classifier is
# trained on, so scale_pos_weight on top of it up-weights positives a second
# time - confirm this is intended.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

xgb_best_model = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=neg_pos_ratio,
        eval_metric='logloss',
        random_state=42
    ))
])

In [None]:
# Fit the whole pipeline (preprocessing, oversampling, classifier) on the
# training split.
xgb_best_model.fit(X=X_train, y=y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Score the held-out rows and keep the second predict_proba column, i.e.
# the probability of the positive class (converted == 1).
class_probabilities = xgb_best_model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

In [None]:
# Build the output frame: the test features plus the predicted probability
# and the identifiers that were split alongside them.
# original_price and discount_percent are already columns of X_test (both
# are listed in `features`), so they need no separate lookup in df.
# `.values` strips the index so the ids are assigned by position.
X_test_display = X_test.assign(
    conversion_probability=y_proba,
    user_pseudo_id=user_test.values,
    item_id=item_test.values,
)

In [None]:
# Rank the test rows by predicted conversion probability and print the ten
# highest-scoring user/item pairs.
display_columns = [
    'user_pseudo_id',
    'item_id',
    'original_price',
    'discount_percent',
    'conversion_probability',
]
top_users = (
    X_test_display[display_columns]
    .sort_values(by='conversion_probability', ascending=False)
    .head(10)
)

print("🎯 Top 10 Users Most Likely to Convert:")
print(top_users.to_string(index=False))

🎯 Top 10 Users Most Likely to Convert:
user_pseudo_id  item_id  original_price  discount_percent  conversion_probability
     user_1483 item_626           69.35                50                0.932802
     user_1586 item_150          259.96                50                0.918644
     user_1007 item_197          256.65                50                0.913032
      user_833 item_779          193.37                50                0.902941
     user_1047 item_697          467.63                50                0.901871
     user_1513 item_917          241.09                50                0.900752
      user_958 item_925          195.45                50                0.900666
     user_1585 item_920          449.67                50                0.898346
     user_1601 item_679          354.61                50                0.897740
      user_602 item_819          174.83                50                0.897123
