In [None]:
import os
import json
import numpy as np
import pandas as pd
import kagglehub

from pathlib import Path
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

import sys
# Add src directory to python path
if str(Path.cwd()) not in sys.path:
    sys.path.append(str(Path.cwd()))

from src.paper_protocol import make_leave_one_out_purchase_splits

## Config

In [2]:
# --- Reproducibility ---------------------------------------------------------
# A single seed feeds the shared NumPy generator used by the sampling code.
RANDOM_SEED = 42
rng = np.random.default_rng(seed=RANDOM_SEED)

# --- Filesystem layout -------------------------------------------------------
DATA_OUT = Path("data/processed")
RAW_EVENTS_PATH = Path("data/raw/events.csv")
CLEAN_EVENTS_PATH = Path("data/processed/clean_events.parquet")
DATA_OUT.mkdir(parents=True, exist_ok=True)  # output folder must exist before any save

# --- Event handling ----------------------------------------------------------
# Interaction weight per event type; stronger intent gets a larger weight.
event_weight = {
    "view": 1.0,
    "addtocart": 3.0,
    "transaction": 10.0,
}
SUPPORTED_EVENTS = {"view", "addtocart", "transaction"}

# --- Filtering thresholds ----------------------------------------------------
min_user_events_for_cf = 3      # set to 5+ for stronger filtering
min_item_events = 1             # set to 2+ if you want to drop ultra-rare items
cap_user_item_strength = 20.0   # caps repeated interactions

# --- Split / evaluation ------------------------------------------------------
split_strategy = "loo"             # "loo" (Leave-One-Out)
neg_k = 100                        # negatives per positive for ranking eval
eval_one_positive_per_user = True  # keeps eval candidate sets lightweight

# 1. Load Data

In [3]:
# Load raw data
# Start from the local layout, then swap in the kagglehub cache for whatever
# is missing on disk.
raw_source_path = RAW_EVENTS_PATH
props1_path = Path("data/raw/item_properties_part1.csv")
props2_path = Path("data/raw/item_properties_part2.csv")

if not RAW_EVENTS_PATH.exists():
    # No local events file: take all three CSVs from the downloaded dataset.
    print("Raw events not found locally. Downloading via kagglehub...")
    kaggle_path = Path(kagglehub.dataset_download("retailrocket/ecommerce-dataset"))
    raw_source_path = kaggle_path / "events.csv"
    props1_path = kaggle_path / "item_properties_part1.csv"
    props2_path = kaggle_path / "item_properties_part2.csv"
elif not props1_path.exists() or not props2_path.exists():
    # Events are local but at least one property file is not: download and
    # use the downloaded property files only.
    print("Item properties not found locally. Downloading via kagglehub...")
    kaggle_path = Path(kagglehub.dataset_download("retailrocket/ecommerce-dataset"))
    props1_path = kaggle_path / "item_properties_part1.csv"
    props2_path = kaggle_path / "item_properties_part2.csv"

print(f"Loading raw events from: {raw_source_path}")

# Nullable integer / string dtypes so that missing values survive parsing.
dtypes = {
    "timestamp": "int64",
    "visitorid": "Int64",
    "itemid": "Int64",
    "event": "string",
    "transactionid": "Int64",
}
events = pd.read_csv(filepath_or_buffer=raw_source_path, dtype=dtypes)
print("Raw shape:", events.shape)
events.head()

Raw events not found locally. Downloading via kagglehub...
Loading raw events from: C:\Users\ayera\.cache\kagglehub\datasets\retailrocket\ecommerce-dataset\versions\2\events.csv
Raw shape: (2756101, 5)


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


# 2. Clean Data

In [4]:
# Clean data
# Rows lacking any of these key fields are dropped first.
required_cols = ["visitorid", "itemid", "timestamp", "event"]

events = (
    events
    .dropna(subset=required_cols)
    # RetailRocket timestamps are in milliseconds: parse as UTC datetimes and
    # discard anything that fails to parse (coerced to NaT).
    .assign(timestamp=lambda d: pd.to_datetime(d["timestamp"], unit="ms", utc=True, errors="coerce"))
    .dropna(subset=["timestamp"])
    # Lower-case the event names, then keep only the supported event types.
    .assign(event=lambda d: d["event"].astype(str).str.lower())
    .loc[lambda d: d["event"].isin(SUPPORTED_EVENTS)]
    # Key ids have no missing values at this point, so plain int64 is safe.
    .astype({"visitorid": "int64", "itemid": "int64"})
)

# transactionid stays nullable: it is empty for non-purchase events.
if "transactionid" in events.columns:
    events = events.assign(transactionid=events["transactionid"].astype("Int64"))

# Remove exact repeats of the same (user, item, time, event) tuple.
before_dedupe = len(events)
events = events.drop_duplicates(subset=["visitorid", "itemid", "timestamp", "event"])
print(f"Dropped {before_dedupe - len(events)} duplicates")

print("Cleaned shape:", events.shape)
events.head()

Dropped 460 duplicates
Cleaned shape: (2755641, 5)


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,2015-06-02 05:02:12.117000+00:00,257597,view,355908,
1,2015-06-02 05:50:14.164000+00:00,992329,view,248676,
2,2015-06-02 05:13:19.827000+00:00,111016,view,318965,
3,2015-06-02 05:12:35.914000+00:00,483717,view,253185,
4,2015-06-02 05:02:17.106000+00:00,951259,view,367447,


In [5]:
# Persist the cleaned event log as parquet (the parent folder is created on demand).
os.makedirs(CLEAN_EVENTS_PATH.parent, exist_ok=True)
events.to_parquet(path=CLEAN_EVENTS_PATH, index=False)
print(f"Saved cleaned events to {CLEAN_EVENTS_PATH}")

Saved cleaned events to data\processed\clean_events.parquet


In [6]:
# Validate the schema first: the statements below read these columns, so the
# check only has value if it runs before they do.
required_cols = ["visitorid","itemid","event","timestamp"]
for col in required_cols:
    assert col in events.columns, f"Missing required column {col}"

# Create short timestamp column
events["ts"] = events["timestamp"]

# Print dataset statistics
print("n_events:", len(events))
print("n_users:", events["visitorid"].nunique(), "n_items:", events["itemid"].nunique())
print("timestamp min/max:", events["timestamp"].min(), events["timestamp"].max())
print("event counts:\n", events["event"].value_counts())

events.head()

n_events: 2755641
n_users: 1407580 n_items: 235061
timestamp min/max: 2015-05-03 03:00:04.384000+00:00 2015-09-18 02:59:47.788000+00:00
event counts:
 event
view           2664218
addtocart        68966
transaction      22457
Name: count, dtype: int64


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,ts
0,2015-06-02 05:02:12.117000+00:00,257597,view,355908,,2015-06-02 05:02:12.117000+00:00
1,2015-06-02 05:50:14.164000+00:00,992329,view,248676,,2015-06-02 05:50:14.164000+00:00
2,2015-06-02 05:13:19.827000+00:00,111016,view,318965,,2015-06-02 05:13:19.827000+00:00
3,2015-06-02 05:12:35.914000+00:00,483717,view,253185,,2015-06-02 05:12:35.914000+00:00
4,2015-06-02 05:02:17.106000+00:00,951259,view,367447,,2015-06-02 05:02:17.106000+00:00


In [7]:
# Order the log by user, then time, then item, and attach the per-event
# weight looked up from `event_weight`.
events = (
    events
    .sort_values(by=["visitorid", "ts", "itemid"])
    .reset_index(drop=True)
    .assign(w=lambda d: d["event"].astype(str).map(event_weight).astype("float32"))
)

events[["visitorid","itemid","event","transactionid","ts","w"]].head()

Unnamed: 0,visitorid,itemid,event,transactionid,ts,w
0,0,285930,view,,2015-09-11 20:49:49.439000+00:00,1.0
1,0,357564,view,,2015-09-11 20:52:39.591000+00:00,1.0
2,0,67045,view,,2015-09-11 20:55:17.175000+00:00,1.0
3,1,72028,view,,2015-08-13 17:46:06.444000+00:00,1.0
4,2,325215,view,,2015-08-07 17:51:44.567000+00:00,1.0


# 3. Feature Engineering

In [8]:
# Collapse the event log to one row per (user, item) pair.
agg = events.groupby(["visitorid","itemid"], as_index=False).agg(
    ts_last=pd.NamedAgg(column="ts", aggfunc="max"),      # most recent interaction
    w_sum=pd.NamedAgg(column="w", aggfunc="sum"),         # total interaction weight
    w_max=pd.NamedAgg(column="w", aggfunc="max"),         # strongest single event
    n_events=pd.NamedAgg(column="w", aggfunc="size"),     # number of events in the pair
)

# Cap the summed weight so repeated interactions cannot dominate, and store
# both weight columns as float32.
agg = agg.assign(
    w_sum=lambda d: d["w_sum"].clip(upper=cap_user_item_strength).astype("float32"),
    w_max=lambda d: d["w_max"].astype("float32"),
)

print("agg rows:", len(agg), "| unique users:", agg["visitorid"].nunique(), "| unique items:", agg["itemid"].nunique())
agg.head()

agg rows: 2145179 | unique users: 1407580 | unique items: 235061


Unnamed: 0,visitorid,itemid,ts_last,w_sum,w_max,n_events
0,0,67045,2015-09-11 20:55:17.175000+00:00,1.0,1.0,1
1,0,285930,2015-09-11 20:49:49.439000+00:00,1.0,1.0,1
2,0,357564,2015-09-11 20:52:39.591000+00:00,1.0,1.0,1
3,1,72028,2015-08-13 17:46:06.444000+00:00,1.0,1.0,1
4,2,216305,2015-08-07 18:17:43.170000+00:00,2.0,1.0,2


In [9]:
# Optionally drop items that occur in fewer than `min_item_events`
# (user, item) rows; with the threshold at 1 nothing is filtered.
if min_item_events > 1:
    item_counts = agg["itemid"].value_counts()
    keep_items = set(item_counts.loc[item_counts.ge(min_item_events)].index)
    agg = agg.loc[agg["itemid"].isin(keep_items)].copy()
    print("after MIN_ITEM_EVENTS filter:", agg.shape)

In [10]:
# Keep only users that have at least `min_user_events_for_cf` rows in `agg`.
user_counts = agg["visitorid"].value_counts()
active_users = set(user_counts.loc[user_counts.ge(min_user_events_for_cf)].index)

agg_cf = agg.loc[agg["visitorid"].isin(active_users)].copy()

print("agg total:", agg.shape)
print("agg_cf (active users):", agg_cf.shape)

agg total: (2145179, 6)
agg_cf (active users): (684577, 6)


# 4. Splitting

In [11]:
print("Using Multi-Behavior Leave-One-Out Protocol (Target: Purchase)...")

# How many purchase events each user has (one count per visitor id).
trans_counts = events.loc[events["event"].eq("transaction"), "visitorid"].value_counts()
print(f"Users with >= 3 transactions: {(trans_counts >= 3).sum()}")
print(f"Users with >= 2 transactions: {(trans_counts >= 2).sum()}")

# Delegate the actual Leave-One-Out split to the project helper.
train_events, val_events, test_events = make_leave_one_out_purchase_splits(
    events,
    min_interactions=3,
)

print(f"Split sizes (events): Train={len(train_events)}, Val={len(val_events)}, Test={len(test_events)}")

# Aggregate events by user and item
def aggregate_events(df, cap=None):
    """Collapse an event log to one row per (visitorid, itemid) pair.

    Parameters
    ----------
    df : pd.DataFrame
        Event rows with columns ``visitorid``, ``itemid``, ``ts``, ``w`` and
        ``event``.
    cap : float, optional
        Upper bound applied to the summed weight ``w_sum``. When omitted, the
        notebook-level ``cap_user_item_strength`` setting is used, so the
        split matrices honour the same cap that is recorded in the metadata.
        Pass ``float("inf")`` to disable capping.

    Returns
    -------
    pd.DataFrame
        Columns ``visitorid``, ``itemid``, ``ts_last`` (latest timestamp),
        ``w_sum`` (capped total weight, float32), ``w_max`` (largest single
        weight, float32) and ``n_events`` (non-null event count).
    """
    if cap is None:
        cap = cap_user_item_strength

    pairs = df.groupby(["visitorid", "itemid"]).agg(
        ts_last=("ts", "max"),
        w_sum=("w", "sum"),
        w_max=("w", "max"),
        n_events=("event", "count")
    ).reset_index()

    # Same post-processing as the global user-item aggregation: cap repeated
    # interactions and keep the weights in float32.
    pairs["w_sum"] = pairs["w_sum"].clip(upper=cap).astype("float32")
    pairs["w_max"] = pairs["w_max"].astype("float32")
    return pairs

print("Aggregating train events...")
# One aggregated (user, item) frame per split, in train / val / test order.
train, val, test = (
    aggregate_events(split_events)
    for split_events in (train_events, val_events, test_events)
)

print("train/val/test (aggregated shapes):", train.shape, val.shape, test.shape)

Using Multi-Behavior Leave-One-Out Protocol (Target: Purchase)...
Users with >= 3 transactions: 1027
Users with >= 2 transactions: 2576


100%|██████████| 1027/1027 [00:02<00:00, 371.43it/s]


Split sizes (events): Train=105653, Val=1027, Test=1027
Aggregating train events...
train/val/test (aggregated shapes): (59659, 6) (1027, 6) (1027, 6)


In [12]:
# Restrict the held-out sets to users and items that also occur in train.
train_users = set(train["visitorid"].unique())
train_items = set(train["itemid"].unique())


def _seen_in_train(frame):
    """Boolean mask: True where both the row's user and item occur in train."""
    return frame["visitorid"].isin(train_users) & frame["itemid"].isin(train_items)


val = val.loc[_seen_in_train].copy()
test = test.loc[_seen_in_train].copy()

print("val/test after train-only filter:", val.shape, test.shape)

val/test after train-only filter: (794, 6) (814, 6)


# 5. Create Mappings

In [13]:
# Dense 0..n-1 indices for the users and items seen in train, assigned in
# ascending order of the raw ids.
unique_users = np.sort(train["visitorid"].unique())
unique_items = np.sort(train["itemid"].unique())

user2idx = {int(raw_id): pos for pos, raw_id in enumerate(unique_users)}
item2idx = {int(raw_id): pos for pos, raw_id in enumerate(unique_items)}

# Persist both lookups next to the other processed artifacts.
for filename, mapping in (("user2idx.json", user2idx), ("item2idx.json", item2idx)):
    with open(DATA_OUT / filename, "w") as f:
        json.dump(mapping, f)

print("n_users:", len(user2idx), "n_items:", len(item2idx))

n_users: 984 n_items: 25479


In [14]:
def add_indices(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with internal index columns ``u`` and ``i``.

    ``u`` / ``i`` are looked up from ``user2idx`` / ``item2idx``. Rows whose
    user or item has no mapping are dropped, and both index columns are
    stored as int32. All other columns (including ``ts_last``) are kept.
    """
    indexed = df.assign(
        u=df["visitorid"].map(user2idx),
        i=df["itemid"].map(item2idx),
    )
    # Unmapped ids come back as NaN from .map(); remove them before casting.
    indexed = indexed.dropna(subset=["u","i"])
    return indexed.astype({"u":"int32","i":"int32"})

# Attach internal indices to every split (train / val / test order).
train_i, val_i, test_i = (add_indices(split) for split in (train, val, test))

train_i.head()

Unnamed: 0,visitorid,itemid,ts_last,w_sum,w_max,n_events,u,i
0,3465,8523,2015-06-16 01:34:15.600000+00:00,4.0,3.0,2,0,437
1,3926,36039,2015-06-17 00:33:53.859000+00:00,2.0,1.0,2,1,1917
2,3926,335331,2015-06-02 20:01:56.963000+00:00,12.0,10.0,3,1,18298
3,4101,104752,2015-05-21 00:32:25.232000+00:00,14.0,10.0,3,2,5729
4,4101,115244,2015-05-21 00:25:10.185000+00:00,2.0,1.0,2,2,6288


# 6. Build Matrices

In [15]:
def build_csr(df: pd.DataFrame, n_users: int, n_items: int, weight_col="w_sum"):
    """Build a float32 user x item CSR matrix from an indexed interaction frame.

    Rows come from column ``u``, columns from column ``i`` and values from
    `weight_col`. The result has shape ``(n_users, n_items)``; entries that
    share the same (u, i) coordinate are summed.
    """
    values = df[weight_col].to_numpy(dtype=np.float32)
    coords = (df["u"].to_numpy(), df["i"].to_numpy())
    matrix = sparse.csr_matrix((values, coords), shape=(n_users, n_items))
    matrix.sum_duplicates()
    return matrix

# Build the training interaction matrix from the summed weights and persist it.
X_train = build_csr(
    train_i,
    n_users=len(user2idx),
    n_items=len(item2idx),
    weight_col="w_sum",
)
sparse.save_npz(DATA_OUT / "X_train_csr.npz", X_train)

print("X_train shape:", X_train.shape, "nnz:", X_train.nnz)

X_train shape: (984, 25479) nnz: 59659


In [16]:
# Item popularity counts TRANSACTIONS only (to match negative sampling and LOO target).
train_purchases = train_events.loc[train_events["event"] == "transaction"]

item_pop = (
    train_purchases
    .groupby("itemid")
    .size()
    .reset_index(name="pop")
    .sort_values("pop", ascending=False)
    # Translate raw item ids to internal indices; ids without a mapping
    # become NaN and are dropped before the integer cast.
    .assign(i=lambda d: d["itemid"].map(item2idx))
    .dropna(subset=["i"])
    .astype({"i": "int32"})
)

item_pop.to_parquet(DATA_OUT / "item_popularity.parquet", index=False)
item_pop.head(10)

Unnamed: 0,itemid,pop,i
1331,119736,89,6527
2425,213834,31,11661
5250,461686,28,25184
2834,248455,17,13542
3579,312728,16,17100
3652,320130,15,17489
4221,369158,14,20215
4785,420960,13,23036
185,17478,13,910
3791,334401,13,18239


# 7. Process Item Properties

In [17]:
def iter_item_props(paths, chunksize=5_000_000):
    """Yield item-property rows from each CSV in `paths`, one chunk at a time.

    Every file is read in pieces of at most `chunksize` rows with explicit
    column dtypes. Each yielded DataFrame carries an extra ``ts`` column: the
    millisecond ``timestamp`` parsed as a UTC datetime.
    """
    # Explicit dtypes save memory and ensure correct parsing.
    column_types = {"timestamp":"int64", "itemid":"int64", "property":"string", "value":"string"}
    for csv_path in paths:
        reader = pd.read_csv(csv_path, dtype=column_types, chunksize=chunksize)
        for part in reader:
            part["ts"] = pd.to_datetime(part["timestamp"], unit="ms", utc=True)
            yield part

# For LOO, prevent leakage by capping at the last training event.
# NOTE(review): this is one global timestamp (the latest event in train), not
# a per-user cutoff -- confirm that this matches the intended protocol.
train_cutoff = train_events["ts"].max()

# Only properties of items that have an internal index are needed.
keep_itemids = set(item2idx)

print("train_cutoff:", train_cutoff)
print("keep_itemids:", len(keep_itemids))


def _latest_per_property(frame):
    """Keep, for every (itemid, property) pair, the row with the latest ts."""
    return frame.sort_values("ts").drop_duplicates(["itemid","property"], keep="last")


latest_rows = []
for chunk in iter_item_props([props1_path, props2_path], chunksize=5_000_000):
    wanted = chunk["itemid"].isin(keep_itemids)
    if train_cutoff is not None:
        wanted &= chunk["ts"] <= train_cutoff

    # Reduce each chunk right away so only the per-chunk winners are held.
    chunk = _latest_per_property(chunk[wanted])
    latest_rows.append(chunk[["itemid","property","value","ts"]])

# A pair can win in several chunks, so reduce once more across all chunks.
item_props = _latest_per_property(pd.concat(latest_rows, ignore_index=True)).reset_index(drop=True)

print("item_props snapshot rows:", len(item_props))
print("unique items in snapshot:", item_props["itemid"].nunique())
item_props.head()

train_cutoff: 2015-09-18 02:34:21.089000+00:00
keep_itemids: 25479
item_props snapshot rows: 711102
unique items in snapshot: 24455


Unnamed: 0,itemid,property,value,ts
0,323093,112,679677,2015-05-10 03:00:00+00:00
1,21899,364,816290,2015-05-10 03:00:00+00:00
2,128318,71,376905,2015-05-10 03:00:00+00:00
3,444323,678,1290827,2015-05-10 03:00:00+00:00
4,255622,776,577331,2015-05-10 03:00:00+00:00


In [18]:
# Turn each (property, value) pair into a single whitespace-free token.
item_props["value"] = (
    item_props["value"]
    .fillna("")
    .astype(str)
    .str.replace(r"\s+", "_", regex=True)   # multi-word values become one token
)
item_props["token"] = "p" + item_props["property"].astype(str) + "=v" + item_props["value"]

# One space-separated "document" of tokens per item.
item_text = item_props.groupby("itemid")["token"].agg(" ".join).reset_index()

# Every training item gets a row, with an empty document when it has no
# properties; rows are then ordered by the internal item index.
all_items_df = pd.DataFrame({"itemid": list(keep_itemids)})
item_text = (
    all_items_df
    .merge(item_text, on="itemid", how="left")
    .assign(
        token=lambda d: d["token"].fillna(""),
        i=lambda d: d["itemid"].map(item2idx),
    )
    .sort_values("i")
    .reset_index(drop=True)
)

print("item_text rows (should equal n_items):", len(item_text))
item_text.head()

item_text rows (should equal n_items): 25479


Unnamed: 0,itemid,token,i
0,15,pcategoryid=v722 p764=v1285872 p917=v789221 p3...,0
1,17,p917=vn58500.000 p764=v1285872 p790=vn27120.00...,1
2,19,p678=v743822_552121 p283=v984060_150169_103789...,2
3,25,p159=v519769 p112=v679677 p364=v316529 p917=vn...,3
4,42,p776=v905905 p6=v1285402_1042990 p159=v519769 ...,4


In [19]:
print("Sample tokens:", item_text["token"].head().tolist())
print("Empty tokens count:", (item_text["token"] == "").sum())
print("Total items:", len(item_text))


def _fit_item_tfidf(texts, min_df):
    """Fit a space-tokenised TF-IDF model on `texts`; return (vectorizer, matrix)."""
    model = TfidfVectorizer(
        token_pattern=r"[^ ]+",   # every run of non-space characters is one token
        min_df=min_df,
        max_features=200_000,
        dtype=np.float32,
    )
    return model, model.fit_transform(texts)


# Try progressively weaker settings; each fit raises ValueError when it fails
# (for example when no term survives the document-frequency filter).
try:
    vectorizer, X_item = _fit_item_tfidf(item_text["token"].values, min_df=2)
except ValueError:
    print("Warning: min_df=2 failed (likely too sparse or empty). Retrying with min_df=1...")
    try:
        vectorizer, X_item = _fit_item_tfidf(item_text["token"].values, min_df=1)
    except ValueError:
        print("Warning: Empty vocabulary even with min_df=1. Creating dummy features.")
        # Create dummy feature if vocabulary is empty
        item_text["dummy"] = "item"
        vectorizer = TfidfVectorizer(token_pattern=r"[^ ]+", min_df=1)
        X_item = vectorizer.fit_transform(item_text["dummy"].values)

print("X_item shape:", X_item.shape, "nnz:", X_item.nnz)

# Verify TF-IDF matrix size BEFORE persisting, so a degenerate matrix is never
# written to disk for later stages to pick up.
assert X_item.shape[1] > 1000, f"TF-IDF vocabulary too small ({X_item.shape[1]}). Check item properties or tokenization."

sparse.save_npz(DATA_OUT / "item_content_tfidf.npz", X_item)
joblib.dump(vectorizer, DATA_OUT / "tfidf_vectorizer.joblib")

Sample tokens: ['pcategoryid=v722 p764=v1285872 p917=v789221 p364=v1047026 p283=v433564_245772_789221_809278_245772_1213953_429140_1322984_792235_79212_237874_654986_809278_1215254_249416_646928_750061_961877_1152409_780700_1128577_269926_754848_703408_469750_581854_1028919_1124417_484436_1256252_790607 p678=v245772 p915=v769062 p202=v789221 p698=v433564 p616=v769062 p888=v789221 p839=v245772 p227=v433564 p159=v519769 p812=v769062 p112=v679677 p693=v769062 p776=v604754 p790=vn8400.000 p591=v1116693 pavailable=v0', 'p917=vn58500.000 p764=v1285872 p790=vn27120.000 p159=v519769 p6=v245617 p283=v245617_365855_1263472_n58500.000 p776=v840531 p364=v1083699 pcategoryid=v1265 p839=v365855 p202=v1263472_n58500.000 pavailable=v0 p888=v1263472_n58500.000 p348=v245617 p392=v706848 p678=v365855 p112=v679677 p227=v245617', 'p678=v743822_552121 p283=v984060_150169_1037891_743822_552121_119805 p888=v119805 p6=v353870_1310600 p917=v119805 p604=v769062 p764=v1285872 p562=v769062_639502_n278.400_1041241_

# 8. Build Evaluation Candidates

In [20]:
# Held-out positives: every row of the indexed val / test frame is one
# (user, positive item) pair to evaluate.
val_pos = val_i.copy()
test_pos = test_i.copy()


def _items_by_user(frame):
    """Map each internal user index to the set of item indices in `frame`."""
    return frame.groupby("u")["i"].apply(lambda s: set(s.values)).to_dict()


# Items each user interacted with in train / validation.
train_user_items = _items_by_user(train_i)
val_user_items = _items_by_user(val_i)

# Build popularity distribution from TRAIN purchases only, expressed in
# internal item indices (purchases of unmapped items are dropped).
train_purchases = train_events.loc[train_events["event"] == "transaction"]
train_purchases_i = (
    train_purchases
    .assign(i=train_purchases["itemid"].map(item2idx))
    .dropna(subset=["i"])
    .astype({"i": "int32"})
)

pop_counts = train_purchases_i["i"].value_counts()
all_items = pop_counts.index.to_numpy()                    # candidate item indices
pop_probs = pop_counts.div(pop_counts.sum()).to_numpy()    # matching sampling probabilities

print(f"Popularity distribution built from {len(train_purchases_i)} purchases.")
print(f"Top 5 items: {all_items[:5]}")

def sample_negatives(u: int, k: int, split: str) -> list:
    """Sample up to `k` distinct negative item indices for user `u`.

    Items the user interacted with in train (and, when ``split == "test"``,
    in validation) are never returned. Candidates are drawn from the
    purchase-popularity distribution (``all_items`` / ``pop_probs``); if that
    cannot supply `k` items, the remainder is drawn uniformly without
    replacement from all unseen item indices.

    Returns a list of `k` item indices, or fewer (with a printed warning)
    when the catalogue does not contain `k` unseen items.
    """
    seen = set(train_user_items.get(u, []))
    if split == "test":
        seen.update(val_user_items.get(u, []))

    # The popularity support may hold fewer than k unseen items. Stopping at
    # that count avoids spinning through every try when k is unreachable,
    # while leaving the draw sequence untouched whenever k is reachable.
    seen_arr = np.fromiter((int(s) for s in seen), dtype=np.int64, count=len(seen))
    n_available = int(np.count_nonzero(~np.isin(all_items, seen_arr)))
    target = min(k, n_available)

    negs = set()
    max_tries = 50000
    tries = 0

    while len(negs) < target and tries < max_tries:
        batch_size = (k - len(negs)) * 2   # oversample to absorb rejections
        candidates = rng.choice(all_items, size=batch_size, p=pop_probs, replace=True)
        for c in candidates:
            if c not in seen:
                negs.add(int(c))
                if len(negs) == k:
                    break
        tries += 1

    if len(negs) < k:
        # Fallback to random sampling over every item index that is still free.
        print(f"Warning: Popularity sampling exhausted for user {u}. Falling back to random.")
        blocked = seen | negs
        blocked_arr = np.fromiter((int(b) for b in blocked), dtype=np.int64, count=len(blocked))
        pool = np.setdiff1d(np.arange(len(item2idx)), blocked_arr)
        need = min(k - len(negs), len(pool))
        if need > 0:
            # Without replacement: terminates in one draw instead of looping
            # until k distinct items happen to turn up.
            negs.update(int(c) for c in rng.choice(pool, size=need, replace=False))
        if len(negs) < k:
            print(f"Warning: only {len(negs)} of {k} negatives available for user {u}.")

    return list(negs)

def _refill_negatives(u, pos_i, negs, split_name, max_tries=10_000):
    """Top `negs` back up to `neg_k` items after the positive was removed.

    Replacements are drawn from the popularity distribution and must differ
    from the positive, from the negatives already chosen and from the items
    the user has seen (train, plus validation for the test split). If the
    bounded popularity loop cannot finish, the rest is drawn uniformly from
    the remaining item indices.
    """
    seen = set(train_user_items.get(u, []))
    if split_name == "test":
        seen.update(val_user_items.get(u, []))

    taken = set(negs)
    tries = 0
    while len(negs) < neg_k and tries < max_tries:
        c = int(rng.choice(all_items, p=pop_probs))
        tries += 1
        if c != pos_i and c not in seen and c not in taken:
            negs.append(c)
            taken.add(c)

    if len(negs) < neg_k:
        blocked = {int(s) for s in seen} | taken | {pos_i}
        blocked_arr = np.fromiter(blocked, dtype=np.int64, count=len(blocked))
        pool = np.setdiff1d(np.arange(len(item2idx)), blocked_arr)
        need = min(neg_k - len(negs), len(pool))
        if need > 0:
            negs.extend(int(c) for c in rng.choice(pool, size=need, replace=False))
    return negs


def build_eval_candidates(df_pos: pd.DataFrame, split_name: str):
    """Build and save the ranking-evaluation candidates for one split.

    For every row of `df_pos` (columns ``u`` and ``i``), `neg_k` negatives are
    sampled for the user. The resulting frame has columns ``u``, ``pos_i``
    and ``neg_i`` (list of negative item indices) and is written to
    ``eval_candidates_{split_name}.parquet`` under ``DATA_OUT``.

    Returns the candidate DataFrame.
    """
    rows = []
    for r in df_pos.itertuples(index=False):
        u = int(r.u)
        pos_i = int(r.i)

        negs = sample_negatives(u, neg_k, split_name)

        # The sampler does not know the positive; if it drew it, remove it
        # and refill with an item that is a valid negative for this user.
        if pos_i in negs:
            negs.remove(pos_i)
            negs = _refill_negatives(u, pos_i, negs, split_name)

        rows.append({"u": u, "pos_i": pos_i, "neg_i": negs})
    out = pd.DataFrame(rows)
    out.to_parquet(DATA_OUT / f"eval_candidates_{split_name}.parquet", index=False)
    return out

# Generate (and save) the candidate sets; validation first, then test.
print("Building validation candidates...")
val_cand = build_eval_candidates(df_pos=val_pos, split_name="val")
print("Building test candidates...")
test_cand = build_eval_candidates(df_pos=test_pos, split_name="test")

val_cand.head()

Popularity distribution built from 7482 purchases.
Top 5 items: [ 6527 11661 25184 13542 17100]
Building validation candidates...
Building test candidates...


Unnamed: 0,u,pos_i,neg_i
0,0,6248,"[8717, 16913, 22554, 22576, 23089, 9778, 12849..."
1,1,17677,"[22530, 1045, 2584, 10268, 13853, 17439, 3114,..."
2,2,12435,"[10241, 19461, 8200, 4618, 10253, 23, 1565, 15..."
3,3,6942,"[10241, 4106, 8718, 5147, 3106, 11811, 10275, ..."
4,4,15780,"[9217, 14343, 4105, 4618, 11274, 24590, 22031,..."


# 9. Save Artifacts

In [21]:
# Write the indexed interaction frames, one parquet file per split.
for frame, filename in (
    (train_i, "interactions_train.parquet"),
    (val_i, "interactions_val.parquet"),
    (test_i, "interactions_test.parquet"),
):
    frame.to_parquet(DATA_OUT / filename, index=False)

# Record the settings and headline sizes of this preprocessing run.
meta = {
    "random_seed": RANDOM_SEED,
    "event_weight": event_weight,
    "min_user_events_for_cf": min_user_events_for_cf,
    "min_item_events": min_item_events,
    "cap_user_item_strength": cap_user_item_strength,
    "split_strategy": split_strategy,
    "neg_k": neg_k,
    "eval_one_positive_per_user": eval_one_positive_per_user,
    "train_cutoff_used_for_item_props": None if train_cutoff is None else str(train_cutoff),
    "n_users": len(user2idx),
    "n_items": len(item2idx),
    "X_train_nnz": int(X_train.nnz),
    "X_item_nnz": int(X_item.nnz),
}

(DATA_OUT / "preprocess_metadata.json").write_text(json.dumps(meta, indent=2))

meta

{'random_seed': 42,
 'event_weight': {'view': 1.0, 'addtocart': 3.0, 'transaction': 10.0},
 'min_user_events_for_cf': 3,
 'min_item_events': 1,
 'cap_user_item_strength': 20.0,
 'split_strategy': 'loo',
 'neg_k': 100,
 'eval_one_positive_per_user': True,
 'train_cutoff_used_for_item_props': '2015-09-18 02:34:21.089000+00:00',
 'n_users': 984,
 'n_items': 25479,
 'X_train_nnz': 59659,
 'X_item_nnz': 558243}

In [25]:
# Report distinct users / items per split, then check the basic invariants.
for users_label, items_label, frame in (
    ("Train users:", "Train items:", train_i),
    ("Val users:", "Val items:", val_i),
    ("Test users:", "Test items:", test_i),
):
    print(users_label, frame["u"].nunique(), items_label, frame["i"].nunique())

expected_shape = (len(user2idx), len(item2idx))
assert len(val_i) > 0 and len(test_i) > 0, "Val/Test ended up empty after filtering — relax filters or adjust split."
assert X_item.shape[0] == expected_shape[1], "Item content matrix must align to item2idx."
assert X_train.shape == expected_shape, "CF matrix shape mismatch."

Train users: 984 Train items: 25479
Val users: 794 Val items: 749
Test users: 814 Test items: 745
