In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
import joblib
import shap

# Make sure the output directory exists up front; the fitted model bundle is
# written to "model/ttrace_multi.pkl" further down in this script.
os.makedirs("model", exist_ok=True)

# Input CSV locations (relative paths, resolved against the working directory).
ACTIONS_PATH = "data/user_actions.csv"
DECISIONS_PATH = "data/user_decisions.csv"

# `actions` is the event log fed to build_features(), which reads its
# user_id / event_id / event_type / category columns.
# `decisions` is only consulted for its user_id column when the target is built.
actions = pd.read_csv(ACTIONS_PATH)
decisions = pd.read_csv(DECISIONS_PATH)

# Aggregate behavioral features per user (general + per-category)
def build_features(df_actions: pd.DataFrame) -> pd.DataFrame:
    """Aggregate an action log into one row of integer count features per user.

    Args:
        df_actions: Event log with the columns ``user_id``, ``event_id``,
            ``event_type`` and ``category``.

    Returns:
        A DataFrame indexed by ``user_id`` with these columns, in order:

        * ``total_events``: number of non-null ``event_id`` values for the user.
        * ``searches``, ``watch_videos``, ``read_articles``, ``compares``,
          ``product_views``: number of rows whose ``event_type`` is
          ``search`` / ``watch_video`` / ``read_article`` / ``compare`` /
          ``view_product`` respectively.
        * ``<category>_events``: number of non-null ``event_id`` values for the
          user in each distinct non-missing ``category``, in order of first
          appearance in ``df_actions``.

        Users with no matching rows get ``0``; every column is integer-typed.
        Rows with a missing ``user_id`` are ignored, and a missing ``category``
        does not produce a feature column.
    """
    # Output feature name -> the ``event_type`` value it counts.
    event_features = {
        "searches": "search",
        "watch_videos": "watch_video",
        "read_articles": "read_article",
        "compares": "compare",
        "product_views": "view_product",
    }

    base = (
        df_actions.groupby("user_id")["event_id"]
        .count()
        .rename("total_events")
        .to_frame()
    )

    # One grouped pass instead of one Python lambda per user per feature.
    # Reindexing with fill_value=0 adds users / event types that never occur
    # without introducing NaN, so the counts stay integers.
    type_counts = (
        df_actions.groupby(["user_id", "event_type"])
        .size()
        .unstack(fill_value=0)
        .reindex(
            index=base.index,
            columns=list(event_features.values()),
            fill_value=0,
        )
    )
    for feature, event_type in event_features.items():
        base[feature] = type_counts[event_type]

    # dropna(): a missing category is not a category. Without it, a bogus
    # all-zero "nan_events" column would be emitted as a model feature.
    categories = df_actions["category"].dropna().unique()
    if len(categories):
        category_counts = (
            df_actions.groupby(["user_id", "category"])["event_id"]
            .count()
            .unstack(fill_value=0)
            # Keep first-appearance column order rather than the sorted order
            # produced by groupby/unstack.
            .reindex(index=base.index, columns=categories, fill_value=0)
        )
        for cat in categories:
            base[f"{cat}_events"] = category_counts[cat]

    return base

agg = build_features(actions)

# Binary target: 1 when the user's id occurs in decisions["user_id"], else 0.
# NOTE(review): this treats every row of the decisions table as a purchase - confirm.
purchaser_ids = decisions["user_id"]
agg["made_purchase"] = agg.index.isin(purchaser_ids).astype(int)

y = agg["made_purchase"]
X = agg.drop(columns=["made_purchase"])

# 75/25 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# The scaler is fitted on the training part only and then reused for the test part.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

model = LogisticRegression(max_iter=600)
model.fit(X_train_s, y_train)

# Hard labels feed the classification report; positive-class probabilities feed ROC-AUC.
y_pred = model.predict(X_test_s)
y_prob = model.predict_proba(X_test_s)[:, 1]
print("\n=== General Purchase Model Report ===")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

# Persist the model together with its scaler and the feature column order.
artifact = {"model": model, "scaler": scaler, "features": list(X.columns)}
joblib.dump(artifact, "model/ttrace_multi.pkl")

# SHAP - global importance (optional)
explainer = shap.LinearExplainer(model, X_train_s)
shap_vals = explainer.shap_values(X_test_s)

# Global importance of a feature = mean absolute SHAP value over the test rows.
mean_abs_shap = np.mean(np.abs(shap_vals), axis=0)
feat_imp = pd.DataFrame({"feature": X.columns, "importance": mean_abs_shap})
feat_imp = feat_imp.sort_values(by="importance", ascending=False)

print("\n=== Top Influential Behavioral Features (Global) ===")
print(feat_imp.head(10))

# Optional: visualize if you're running locally with GUI
# shap.summary_plot(shap_vals, X_test, show=True)



=== General Purchase Model Report ===
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       485
           1       0.40      0.13      0.20        15

    accuracy                           0.97       500
   macro avg       0.69      0.56      0.59       500
weighted avg       0.96      0.97      0.96       500

ROC-AUC: 0.9069415807560137

=== Top Influential Behavioral Features (Global) ===
                      feature  importance
1                    searches    1.010694
2                watch_videos    0.704586
5               product_views    0.598411
4                    compares    0.450483
8           smartphone_events    0.412989
0                total_events    0.333099
6   home_entertainment_events    0.326824
3               read_articles    0.301877
9               gaming_events    0.222636
10            computer_events    0.173376
