# In [1]:
import os
import json
import hashlib
from datetime import datetime

import pandas as pd
import numpy as np
import joblib
import shap

# --- Input / output locations ------------------------------------------------
ACTIONS_PATH = "data/user_actions.csv"
DECISIONS_PATH = "data/user_decisions.csv"
MODEL_PATH = "model/ttrace_multi.pkl"
LEDGER_PATH = "ledger/decision_influence_log.jsonl"

# Create the ledger's parent directory. It is derived from LEDGER_PATH (rather
# than repeating the directory name as a literal) so the two cannot drift apart.
ledger_dir = os.path.dirname(LEDGER_PATH)
if ledger_dir:
    os.makedirs(ledger_dir, exist_ok=True)

# --- Data --------------------------------------------------------------------
# The code below reads the columns user_id, event_id, event_type, category and
# timestamp from `actions`, and user_id, event_id, product_id, category and
# timestamp from `decisions`.
actions = pd.read_csv(ACTIONS_PATH)
decisions = pd.read_csv(DECISIONS_PATH)

# --- Model artifact ----------------------------------------------------------
# SECURITY: joblib.load unpickles the file and can therefore execute arbitrary
# code; only load artifacts from a trusted source.
model_art = joblib.load(MODEL_PATH)
model = model_art["model"]        # classifier; later called via predict_proba
scaler = model_art["scaler"]      # fitted transformer; later called via transform
features = model_art["features"]  # ordered feature names used to select columns
# NOTE(review): hard-coded label written into every ledger record; it is not
# read from the artifact, so confirm it matches the file at MODEL_PATH.
model_version = "ttrace_multi_lr_v1"

# Build background data for SHAP (same aggregation as training)
def build_features(df_actions: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw action rows into one feature row per user.

    Args:
        df_actions: Frame with at least the columns ``user_id``,
            ``event_id``, ``event_type`` and ``category``.

    Returns:
        A frame indexed by ``user_id`` holding the non-null ``event_id``
        count (``total_events``), one count per tracked event type, and one
        ``"<category>_events"`` column for every distinct value of
        ``category``. Users with no events in a category get 0.
    """
    per_user = df_actions.groupby("user_id")
    table = per_user.agg(
        total_events=("event_id", "count"),
        searches=("event_type", lambda kinds: kinds.eq("search").sum()),
        watch_videos=("event_type", lambda kinds: kinds.eq("watch_video").sum()),
        read_articles=("event_type", lambda kinds: kinds.eq("read_article").sum()),
        compares=("event_type", lambda kinds: kinds.eq("compare").sum()),
        product_views=("event_type", lambda kinds: kinds.eq("view_product").sum()),
    )

    # One extra column per category. Assignment aligns on user_id, so users
    # without any event in that category come out as NaN here.
    for category in df_actions["category"].unique():
        in_category = df_actions.loc[df_actions["category"] == category]
        table[f"{category}_events"] = in_category.groupby("user_id")["event_id"].count()

    # Turn the alignment gaps into explicit zero counts.
    return table.fillna(0)

# Aggregate every row of `actions` into one feature row per user.
# NOTE(review): this uses each user's full history, whereas the per-decision
# features further down only count events before the decision timestamp.
agg_all = build_features(actions)
# Select the model's feature columns, in the order stored with the artifact,
# and apply the same scaler that is used at prediction time.
X_bg = agg_all[features].values
X_bg_scaled = scaler.transform(X_bg)

# Linear SHAP explainer that uses the scaled per-user rows as background data.
explainer = shap.LinearExplainer(model, X_bg_scaled)

def compute_record_hash(rec_no_hash: dict, prev_hash: str) -> str:
    """Return the SHA-256 hex digest that chains a record to its predecessor.

    The record is serialised as compact JSON with sorted keys (so key order
    in ``rec_no_hash`` does not affect the result), ``prev_hash`` is
    prepended, and the UTF-8 bytes of that string are hashed.

    Args:
        rec_no_hash: JSON-serialisable record that does not yet carry its
            own hash.
        prev_hash: Hash of the previous ledger record.

    Returns:
        The 64-character hexadecimal SHA-256 digest.
    """
    canonical = json.dumps(rec_no_hash, separators=(",", ":"), sort_keys=True)
    chained = prev_hash + canonical
    hasher = hashlib.sha256()
    hasher.update(chained.encode("utf-8"))
    return hasher.hexdigest()

# -----------------------------------------------------------------------------
# Write the hash-chained decision-influence ledger.
#
# For every row of `decisions`, the user's actions that precede the decision
# timestamp are aggregated into a feature row, scored by the model, explained
# with SHAP, and logged as one JSON line. Each record stores the previous
# record's hash and its own hash, so editing or removing a line breaks the
# chain from that point on.
# -----------------------------------------------------------------------------

# Count feature -> the `event_type` value it counts. Used both to build the
# per-decision feature row and to trace a SHAP feature back to concrete events.
EVENT_TYPE_BY_FEATURE = {
    "searches": "search",
    "watch_videos": "watch_video",
    "read_articles": "read_article",
    "compares": "compare",
    "product_views": "view_product",
}
CATEGORY_SUFFIX = "_events"   # per-category features are named "<category>_events"
MIN_HISTORY_EVENTS = 5        # decisions with fewer prior events are skipped
TOP_K_FEATURES = 4            # number of SHAP features recorded per decision
EVENTS_PER_FEATURE = 3        # most recent events reported per recorded feature

# Loop-invariant work, done once instead of once per decision: parse the action
# timestamps and collect the category list. The actions are also ordered
# chronologically so that `.tail()` below really returns the most recent events
# even if the CSV rows are not stored in time order; mergesort is stable, so
# rows with equal timestamps keep their file order.
actions_by_time = actions.assign(
    _parsed_ts=pd.to_datetime(actions["timestamp"])
).sort_values("_parsed_ts", kind="mergesort")
all_categories = actions["category"].unique()

prev_hash = "0" * 64  # genesis value for the first record in the chain
logged = 0

# Mode "w": the ledger is rebuilt from scratch on every run.
with open(LEDGER_PATH, "w", encoding="utf-8") as f_ledger:
    for _, dec in decisions.iterrows():
        user_id = int(dec["user_id"])
        decision_id = dec["event_id"]
        product_id = dec["product_id"]
        product_category = dec["category"]
        dec_time = pd.to_datetime(dec["timestamp"])

        # Actions of this user strictly before the decision.
        user_hist = actions_by_time[
            (actions_by_time["user_id"] == user_id)
            & (actions_by_time["_parsed_ts"] < dec_time)
        ]
        if len(user_hist) < MIN_HISTORY_EVENTS:
            continue

        hist_types = user_hist["event_type"]
        hist_categories = user_hist["category"]

        # Feature row for this decision, using the same column names as
        # build_features.
        feat_row = {"total_events": len(user_hist)}
        for feat_name, e_type in EVENT_TYPE_BY_FEATURE.items():
            feat_row[feat_name] = (hist_types == e_type).sum()
        for cat in all_categories:
            feat_row[f"{cat}{CATEGORY_SUFFIX}"] = (hist_categories == cat).sum()

        # Select the columns in the model's stored feature order.
        feat = pd.DataFrame([feat_row])[features]
        X_scaled = scaler.transform(feat.values)
        proba = float(model.predict_proba(X_scaled)[0, 1])
        shap_vals = explainer.shap_values(X_scaled)[0]

        # Features with the largest absolute SHAP value, strongest first.
        top_idx = np.argsort(np.abs(shap_vals))[-TOP_K_FEATURES:][::-1]
        top_shap = {features[i]: float(shap_vals[i]) for i in top_idx}

        # Map each top feature back to the most recent matching events.
        influential_events = []
        for feat_name in top_shap:
            if feat_name in EVENT_TYPE_BY_FEATURE:
                matches = user_hist[hist_types == EVENT_TYPE_BY_FEATURE[feat_name]]
            elif feat_name.endswith(CATEGORY_SUFFIX):
                # Strip only the trailing suffix; str.replace would also remove
                # "_events" occurring inside the category name itself.
                # "total_events" also lands here and matches nothing unless a
                # category is literally named "total".
                cat = feat_name[: -len(CATEGORY_SUFFIX)]
                matches = user_hist[hist_categories == cat]
            else:
                continue
            # .tolist() yields native Python scalars, so integer event IDs
            # stay JSON-serialisable (NumPy integers are rejected by json).
            influential_events.extend(
                matches["event_id"].tail(EVENTS_PER_FEATURE).tolist()
            )

        rec_no_hash = {
            "decision_id": decision_id,
            "user_id": user_id,
            "timestamp": dec["timestamp"],
            "decision_type": "purchase",
            "product_id": product_id,
            "product_category": product_category,
            "model_version": model_version,
            "predicted_probability": proba,
            "top_shap_features": top_shap,
            "influential_event_ids": influential_events,
            "prev_hash": prev_hash,
        }

        rec_hash = compute_record_hash(rec_no_hash, prev_hash)
        record = {**rec_no_hash, "hash": rec_hash}

        f_ledger.write(json.dumps(record) + "\n")
        prev_hash = rec_hash  # the next record chains onto this one
        logged += 1

print(f"✓ Logged {logged} purchase decisions with hashed, explainable influence.")
print(f"→ {LEDGER_PATH}")




# Output of the cell above:
# ✓ Logged 62 purchase decisions with hashed, explainable influence.
# → ledger/decision_influence_log.jsonl


