# 02: Data Embeddings & Baseline Models

This notebook covers:

- **Configuration & Imports**: define data paths and evaluation settings
- **Load Processed Data**: sequences, maps, and interaction tables
- **Data Inspection**: peek at samples
- **Create Interaction Matrices**:
  - Build sparse user–item matrices from sequences
  - Generate train/test splits for evaluation
- **Define Baseline Models**:
  - Popularity
  - Item-based KNN
  - User-based KNN
  - Matrix Factorization (NMF)
- **Evaluation Metrics**: Recall@K
- **Train & Evaluate** each baseline
- **Summarize & Save** results to JSON

## 1. Configuration & Imports

- Set file paths for processed data
- Define evaluation parameters (K values, sample sizes)

In [1]:
import os, random
import pandas as pd
import numpy as np
import json
from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Global reproducibility
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Paths to processed data
out_dir = Path('../data/processed/jarir/')
seq_train_path = out_dir / 'sequences_train.parquet'
seq_val_path   = out_dir / 'sequences_val.parquet'
seq_test_path  = out_dir / 'sequences_test.parquet'
item_map_path  = out_dir / 'item_id_map.parquet'
cust_map_path  = out_dir / 'customer_id_map.parquet'

# Evaluation config
K_VALUES = [5, 10, 20]
EVAL_SAMPLE_SIZE = 1000  # sample users for faster evaluation (None = all)

# Placeholder for baseline results
baseline_results = {}

## 2. Load Processed Data

- Read train/val/test sequence tables
- Read user & item mappings

In [2]:
print("Loading sequences and maps...")
seq_train = pd.read_parquet(seq_train_path, engine='fastparquet')
seq_val   = pd.read_parquet(seq_val_path,   engine='fastparquet')
seq_test  = pd.read_parquet(seq_test_path,  engine='fastparquet')
item_map  = pd.read_parquet(item_map_path, engine='fastparquet')
cust_map  = pd.read_parquet(cust_map_path, engine='fastparquet')
print(f"Train seq: {len(seq_train)} rows")
print(f"Val seq:   {len(seq_val)} rows")
print(f"Test seq:  {len(seq_test)} rows")
print(f"Items:     {len(item_map)}")
print(f"Users:     {len(cust_map)}")

Loading sequences and maps...
Train seq: 1108 rows
Val seq:   169 rows
Test seq:  160 rows
Items:     1735
Users:     929


## 3. Data Inspection

- View a few example rows from sequences, item_map, and cust_map


In [3]:
print("Sample sequence row:")
print(seq_train.head(3))
print("\nSample item map:")
print(item_map.head(3))
print("\nSample customer map:")
print(cust_map.head(3))

Sample sequence row:
   customer_id  user_idx         ts history_idx  pos_item_idx     country
0     10018322         5 2024-03-07        9 10            11  0103-PLAZA
1     10018322         5 2024-03-24     9 10 11            12  0103-PLAZA
2     10018322         5 2024-04-30  9 10 11 12            13  0103-PLAZA

Sample item map:
      stock_code  item_idx
0      RQ-CHB002         0
1  ZQ-F27318BGLD         1
2     NE-0230059         2

Sample customer map:
   customer_id  user_idx
0        11949         0
1        24811         1
2        33097         2


## 4. Create Sparse Interaction Matrix

- Build user–item matrix: history items weighted lower than positive event
- Also prepare full interactions matrix for non-sequence data

In [4]:
from scipy.sparse import csr_matrix

def build_matrix_from_sequences(seq_df, n_users, n_items):
    rows, cols, vals = [], [], []
    for _, r in seq_df.iterrows():
        u = int(r['user_idx'])
        p = int(r['pos_item_idx'])
        # positive event
        rows.append(u); cols.append(p); vals.append(1.0)
        # history events
        if pd.notna(r['history_idx']) and r['history_idx']:
            h = [int(x) for x in r['history_idx'].split()]
            for item in h:
                rows.append(u); cols.append(item); vals.append(0.5)
    return csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))

n_users = len(cust_map)
n_items = len(item_map)
train_mat = build_matrix_from_sequences(seq_train, n_users, n_items)
full_mat  = build_matrix_from_sequences(pd.concat([seq_train, seq_val, seq_test]), n_users, n_items)
print(f"Train matrix: {train_mat.shape}, nz={train_mat.nnz}")

Train matrix: (929, 1735), nz=1623


In [5]:
# Build a denser KNN source matrix from full interactions (cut at train cutoff) to help KNN
from pathlib import Path
try:
    interactions_path = out_dir / 'interactions_clean.parquet'
    if interactions_path.exists():
        interactions = pd.read_parquet(interactions_path, engine='fastparquet')
        interactions = interactions.merge(item_map, on='stock_code', how='inner')
        interactions = interactions.merge(cust_map, on='customer_id', how='inner')
        cutoff = pd.to_datetime(seq_train['ts'].max()) if 'ts' in seq_train.columns else None
        if cutoff is not None and 'invoice_date' in interactions.columns:
            interactions = interactions[interactions['invoice_date'] <= cutoff]
        rows = interactions['user_idx'].astype(int).to_numpy()
        cols = interactions['item_idx'].astype(int).to_numpy()
        vals = np.ones_like(rows, dtype='float32')
        knn_mat = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))
        print(f"KNN source matrix (from interactions): {knn_mat.shape}, nz={knn_mat.nnz}")
    else:
        knn_mat = train_mat
        print("KNN source matrix fallback to train_mat")
except Exception as e:
    knn_mat = train_mat
    print("KNN source matrix build failed, fallback to train_mat:", e)


KNN source matrix (from interactions): (929, 1735), nz=2230


## 5. Prepare Train/Test Split

- Remove validation interactions from training matrix for proper held-out evaluation
- Collect test interactions list [(user, item)]

In [6]:
test_interactions = []
for _, r in seq_val.iterrows():
    u, p = int(r['user_idx']), int(r['pos_item_idx'])
    test_interactions.append((u, p))
    train_mat[u, p] = 0
print(f"Prepared {len(test_interactions)} held-out test cases")

Prepared 169 held-out test cases


## 6. Define Baseline Models

- **Popularity**: rank by total interactions
- **ItemKNN**: cosine similarity on item columns
- **UserKNN**: cosine similarity on user rows
- **Matrix Factorization**: NMF on dense matrix


In [7]:
class Popularity:
    def fit(self, mat):
        self.pop = np.array(mat.sum(axis=0)).flatten()
        return self
    def recommend(self, u, k=10, allow_item=None):
        seen = mat[u].nonzero()[1]
        if allow_item is not None:
            seen = np.setdiff1d(seen, np.array([allow_item]))
        scores = self.pop.copy()
        scores[seen] = -1e12
        return np.argsort(scores)[-k:][::-1]

class ItemKNN:
    def __init__(self, k=50, pop_alpha=0.01):
        self.k = k
        self.pop_alpha = pop_alpha
    def fit(self, mat):
        # Prefer a denser matrix from interactions if available (pre-cutoff) to improve co-occurrence
        base = globals().get('knn_mat', mat)
        self.base = base
        # Item-item cosine similarity
        self.sim = cosine_similarity(base.T)
        # Popularity fallback
        self.pop = np.array(base.sum(axis=0)).flatten().astype(np.float32)
        if self.pop.max() > 0:
            self.pop /= self.pop.max()
        return self
    def recommend(self, u, k=10, allow_item=None):
        # User profile in the same base used for similarity
        user_vec = self.base[u].toarray().flatten()
        scores = user_vec @ self.sim
        # Add a small popularity prior to break ties in sparse regimes
        scores = scores + self.pop_alpha * self.pop
        seen = np.where(user_vec > 0)[0]
        if allow_item is not None:
            seen = np.setdiff1d(seen, np.array([allow_item]))
        scores[seen] = -1e12
        return np.argsort(scores)[-k:][::-1]

class UserKNN:
    def __init__(self, k=50):
        self.k = k
    def fit(self, mat):
        self.sim = cosine_similarity(mat)
        return self
    def recommend(self, u, k=10, allow_item=None):
        # Top similar users excluding self
        sim_u = self.sim[u].copy()
        sim_u[u] = -1e12
        top_users = np.argsort(sim_u)[-self.k:][::-1]
        scores = np.zeros(mat.shape[1])
        for v in top_users:
            scores += mat[v].toarray().flatten() * max(sim_u[v], 0)
        seen = mat[u].nonzero()[1]
        if allow_item is not None:
            seen = np.setdiff1d(seen, np.array([allow_item]))
        scores[seen] = -1e12
        return np.argsort(scores)[-k:][::-1]

class MFBaseline:
    def __init__(self, n_f=50):
        self.n_f = n_f
    def fit(self, mat):
        dense = mat.toarray()
        self.model = NMF(n_components=self.n_f, random_state=42)
        self.W = self.model.fit_transform(dense)
        self.H = self.model.components_
        return self
    def recommend(self, u, k=10, allow_item=None):
        scores = self.W[u].dot(self.H)
        seen = train_mat[u].nonzero()[1]
        if allow_item is not None:
            seen = np.setdiff1d(seen, np.array([allow_item]))
        scores[seen] = -1e12
        return np.argsort(scores)[-k:][::-1]

# bind mat for popularity
mat = train_mat

## 7. Evaluation Function (Recall@K)

- Compute Recall@K over held-out test_interactions

In [8]:
def recall_at_k(recs, true_item):
    return int(true_item in recs)

def ndcg_at_k(recs, true_item):
    for rank, item in enumerate(recs, start=1):
        if item == true_item:
            return 1.0 / np.log2(rank + 1)
    return 0.0

def mrr_at_k(recs, true_item):
    for rank, item in enumerate(recs, start=1):
        if item == true_item:
            return 1.0 / rank
    return 0.0

def evaluate(model, k=10):
    recalls, ndcgs, mrrs = [], [], []
    for u, true in test_interactions:
        # allow masking all seen items EXCEPT the held-out positive to avoid leakage
        recs = model.recommend(u, k, allow_item=true)
        recalls.append(recall_at_k(recs, true))
        ndcgs.append(ndcg_at_k(recs, true))
        mrrs.append(mrr_at_k(recs, true))
    return {
        'Recall': float(np.mean(recalls)),
        'NDCG': float(np.mean(ndcgs)),
        'MRR': float(np.mean(mrrs)),
    }

## 8. Train & Evaluate Baselines

- Fit each model on `train_mat`
- Evaluate Recall@K for K in `K_VALUES`

In [9]:
results = {}

# Popularity
pop = Popularity().fit(train_mat)
results['Popularity'] = {f'K={k}': evaluate(pop, k) for k in K_VALUES}

# ItemKNN
itemknn = ItemKNN(k=50).fit(train_mat)
results['ItemKNN'] = {f'K={k}': evaluate(itemknn, k) for k in K_VALUES}

# UserKNN
userknn = UserKNN(k=50).fit(train_mat)
results['UserKNN'] = {f'K={k}': evaluate(userknn, k) for k in K_VALUES}

# Matrix Factorization
mf = MFBaseline(n_f=50).fit(train_mat)
results['MatrixFactorization'] = {f'K={k}': evaluate(mf, k) for k in K_VALUES}

print("Baseline Results:")
for model_name, metrics_by_k in results.items():
    for k, m in metrics_by_k.items():
        print(f"  {model_name} {k}: Recall={m['Recall']:.4f}, NDCG={m['NDCG']:.4f}, MRR={m['MRR']:.4f}")

Baseline Results:
  Popularity K=5: Recall=0.0651, NDCG=0.0391, MRR=0.0303
  Popularity K=10: Recall=0.0947, NDCG=0.0488, MRR=0.0344
  Popularity K=20: Recall=0.1243, NDCG=0.0568, MRR=0.0369
  ItemKNN K=5: Recall=0.0947, NDCG=0.0947, MRR=0.0947
  ItemKNN K=10: Recall=0.1124, NDCG=0.1003, MRR=0.0969
  ItemKNN K=20: Recall=0.1479, NDCG=0.1095, MRR=0.0996
  UserKNN K=5: Recall=0.0059, NDCG=0.0059, MRR=0.0059
  UserKNN K=10: Recall=0.0118, NDCG=0.0079, MRR=0.0068
  UserKNN K=20: Recall=0.0178, NDCG=0.0093, MRR=0.0071
  MatrixFactorization K=5: Recall=0.0000, NDCG=0.0000, MRR=0.0000
  MatrixFactorization K=10: Recall=0.0059, NDCG=0.0017, MRR=0.0006
  MatrixFactorization K=20: Recall=0.0059, NDCG=0.0017, MRR=0.0006


## 9. Summarize & Save Results

- Choose best model at highest K
- Write `baseline_results.json`

In [10]:
# Persist detailed metrics; choose best by Recall@K (highest K)
final_k = K_VALUES[-1]
get_recall = lambda m: m[f'K={final_k}']['Recall']
best = max(results.items(), key=lambda x: get_recall(x[1]))

baseline_summary = {
    'held_out_interactions': results,
    'best_model': best[0],
    'best_recall': get_recall(best[1]),
    'k': final_k
}
with open(out_dir / 'baseline_results.json', 'w') as f:
    json.dump(baseline_summary, f, indent=2)

print(f"Saved baseline results, best={best[0]} Recall@{final_k}={get_recall(best[1]):.4f}")

Saved baseline results, best=ItemKNN Recall@20=0.1479


In [11]:
# Optional: build denser KNN matrix from full interactions to improve KNN baselines
# Uses interactions before validation cutoff to avoid leakage
try:
    interactions_path = out_dir / 'interactions_clean.parquet'
    if interactions_path.exists():
        interactions = pd.read_parquet(interactions_path, engine='fastparquet')
        # Map to indices
        interactions = interactions.merge(item_map, on='stock_code', how='inner')
        interactions = interactions.merge(cust_map, on='customer_id', how='inner')
        # Determine cutoff from train sequences
        cutoff = pd.to_datetime(seq_train['ts'].max()) if 'ts' in seq_train.columns else None
        if cutoff is not None and 'invoice_date' in interactions.columns:
            interactions = interactions[interactions['invoice_date'] <= cutoff]
        # Build binary interactions matrix
        rows = interactions['user_idx'].astype(int).to_numpy()
        cols = interactions['item_idx'].astype(int).to_numpy()
        vals = np.ones_like(rows, dtype='float32')
        knn_mat = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))
        print(f"KNN source matrix (from interactions): {knn_mat.shape}, nz={knn_mat.nnz}")
    else:
        knn_mat = train_mat
        print("KNN source matrix fallback to train_mat")
except Exception as e:
    knn_mat = train_mat
    print("KNN source matrix build failed, fallback to train_mat:", e)


KNN source matrix (from interactions): (929, 1735), nz=2230
