# H&M Recommendation System — 04 SVD Graph Embedding

This notebook builds on `03_ranking.ipynb`: it builds an item-item co-occurrence graph from `train_hist` and computes TruncatedSVD embeddings from it.

In [1]:
import os
import json
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import lightgbm as lgb

from collections import defaultdict
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

# ---- Paths (all relative to the notebook's working directory) ----
DATA_DIR = "../data"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
# RECALL_DIR is only read from in this notebook (recall_final_merged.pkl), so it is not created below.
RECALL_DIR = os.path.join(DATA_DIR, "recall")
HM_DATA_DIR = "../hm_data"

# Experiment outputs: result log, saved models and per-experiment JSON configs.
OUT_DIR = "../experiments"
MODEL_DIR = "../models"
CFG_DIR = "../experiments/configs"
RESULT_CSV = "../experiments/results.csv"

# Create every directory this notebook may write to (no-op when it already exists).
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(HM_DATA_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(CFG_DIR, exist_ok=True)

# Validation window is the half-open interval [VALID_START, VALID_END).
VALID_START = pd.to_datetime("2020-09-16")
VALID_END = pd.to_datetime("2020-09-23")  # exclusive

# Internal train label window (last 7 days before VALID_START)
TRAIN_LABEL_START = VALID_START - pd.Timedelta(days=7)

# Recall sizes
# NOTE(review): these recall constants are not referenced by the cells visible in this
# notebook; presumably kept for parity with the recall notebook — confirm before removing.
N_HISTORY = 30
N_POP = 20
N_CATEGORY = 20
N_COPURCHASE = 30
MAX_CANDIDATES = 100   # final merge cap per user

# Category mapping source column
CATEGORY_COL = "product_type_no"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def save_pickle(obj, path, overwrite=False):
    """Pickle `obj` to `path`.

    An existing file is left untouched (and a "[skip]" line is printed) unless
    `overwrite` is true. Returns None in both cases.
    """
    if overwrite or not os.path.exists(path):
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)
        print(f"[saved] {path}")
    else:
        print(f"[skip] exists: {path}")

def load_pickle(path):
    """Read and return the object pickled at `path`."""
    # NOTE: unpickling can execute arbitrary code; only use on files this project wrote.
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def save_json(obj, path, overwrite=False):
    """Write `obj` to `path` as UTF-8 JSON (indent=2, non-ASCII kept as-is).

    An existing file is left untouched (and a "[skip]" line is printed) unless
    `overwrite` is true. Returns None in both cases.
    """
    if overwrite or not os.path.exists(path):
        with open(path, "w", encoding="utf-8") as handle:
            json.dump(obj, handle, ensure_ascii=False, indent=2)
        print(f"[saved] {path}")
    else:
        print(f"[skip] exists: {path}")

In [3]:
# Load the pickled train / validation transaction frames from TRAIN_DIR.
train_df = load_pickle(os.path.join(TRAIN_DIR, "train_df.pkl"))
valid_df = load_pickle(os.path.join(TRAIN_DIR, "valid_df.pkl"))

# Make sure types are correct
train_df["t_dat"] = pd.to_datetime(train_df["t_dat"])
valid_df["t_dat"] = pd.to_datetime(valid_df["t_dat"])

print("Train window:", train_df["t_dat"].min(), "->", train_df["t_dat"].max(), train_df.shape)
print("Valid window:", valid_df["t_dat"].min(), "->", valid_df["t_dat"].max(), valid_df.shape)

# train_hist: transactions strictly before the internal label window. This is the only
# frame the graph embeddings and user profiles below are built from.
train_hist = train_df[train_df["t_dat"] < TRAIN_LABEL_START].copy()
# Internal label window: [TRAIN_LABEL_START, VALID_START).
train_label_period = train_df[(train_df["t_dat"] >= TRAIN_LABEL_START) & (train_df["t_dat"] < VALID_START)].copy()
# ground truth dicts: customer -> set(article_id)
train_gt = train_label_period.groupby("customer_id")["article_id"].apply(lambda s: set(s.astype(int))).to_dict()
# Validation window: [VALID_START, VALID_END).
valid_period = valid_df[(valid_df["t_dat"] >= VALID_START) & (valid_df["t_dat"] < VALID_END)].copy()
valid_gt = valid_period.groupby("customer_id")["article_id"].apply(lambda s: set(s.astype(int))).to_dict()

# Recall output; run_experiment() calls .get(customer, []) on it, so it is used as a
# customer -> candidate-collection mapping (only the collection length is read here).
cust_to_candidates = load_pickle(os.path.join(RECALL_DIR, "recall_final_merged.pkl"))

Train window: 2018-09-20 00:00:00 -> 2020-09-15 00:00:00 (31548013, 5)
Valid window: 2020-09-16 00:00:00 -> 2020-09-22 00:00:00 (240311, 5)


In [4]:
# Article metadata from the raw H&M dump.
articles_path = os.path.join(HM_DATA_DIR, "articles.csv")
articles = pd.read_csv(articles_path)
# Derive an integer id for the product group name when the CSV does not already carry one.
if "product_group_id" not in articles.columns:
    articles["product_group_id"] = pd.factorize(articles["product_group_name"])[0].astype(np.int32)
# Keep only the categorical attribute columns.
# NOTE(review): articles_use is not referenced by any later cell visible in this notebook.
articles_use = articles[[
    "article_id",
    "product_type_no",
    "product_group_id",
    "colour_group_code",
    "department_no",
    "index_group_no",
    "garment_group_no",
]].copy()
articles_use["article_id"] = articles_use["article_id"].astype(np.int64)

In [5]:
# Ranking frames without engineered features.
# NOTE(review): the later cells visible here only use the *_f (feature) variants — confirm these are still needed.
rank_df_train = load_pickle(os.path.join(TRAIN_DIR, "rank_df_train.pkl"))
rank_df_val = load_pickle(os.path.join(TRAIN_DIR, "rank_df_valid.pkl"))

In [6]:
# Ranking frames with the engineered features; these are the inputs that receive the
# graph-embedding columns further down.
rank_df_train_f = load_pickle(os.path.join(TRAIN_DIR, "rank_df_train_features.pkl"))
rank_df_val_f = load_pickle(os.path.join(TRAIN_DIR, "rank_df_valid_features.pkl"))

In [7]:
def build_item_graph_svd_embeddings(
    train_hist: pd.DataFrame,
    k: int = 16,
    top_items: int = 20000,
    user_max_items: int = 20,
    min_item_cnt: int = 5,
    seed: int = 42
):
    """
    Build item-item co-occurrence graph from train_hist and compute TruncatedSVD embeddings.

    Two items co-occur once for every customer whose recent unique items contain both.
    The symmetric count matrix is damped with log1p, factorised with TruncatedSVD and
    each item vector is L2-normalised.

    Speed controls:
      - top_items: restrict to top-N popular items
      - user_max_items: for each user, only take last N unique items
      - min_item_cnt: remove ultra-rare items before top-N

    Parameters
    ----------
    train_hist : DataFrame with columns ``customer_id``, ``t_dat``, ``article_id``.
    k : number of SVD components (embedding width).
    seed : random_state passed to TruncatedSVD.

    Returns
    -------
    (item_emb_df, item2idx, idx2item, svd)
        item_emb_df : DataFrame ``article_id, gsvd_0 .. gsvd_{k-1}`` (one row per vocab item)
        item2idx    : dict article_id -> row index
        idx2item    : int64 array, row index -> article_id
        svd         : the fitted TruncatedSVD object

    Raises
    ------
    ValueError
        If no article survives the ``min_item_cnt`` / ``top_items`` filter.
    """
    # Only these three columns are used; copying just them keeps the working set small.
    df = train_hist[["customer_id", "t_dat", "article_id"]].copy()
    df["t_dat"] = pd.to_datetime(df["t_dat"])
    df["article_id"] = df["article_id"].astype(int)

    # (1) choose item vocabulary from train_hist (value_counts is already sorted by popularity)
    item_cnt = df["article_id"].value_counts()
    item_cnt = item_cnt[item_cnt >= min_item_cnt]
    vocab_items = item_cnt.head(top_items).index.astype(int).tolist()
    if not vocab_items:
        raise ValueError(
            "no article passes the vocabulary filter "
            f"(min_item_cnt={min_item_cnt}, top_items={top_items})"
        )

    item2idx = {aid: i for i, aid in enumerate(vocab_items)}
    idx2item = np.array(vocab_items, dtype=np.int64)
    n = len(vocab_items)

    print(f"[graph] vocab size = {len(vocab_items)} (top_items={top_items}, min_item_cnt={min_item_cnt})")

    # (2) per-user recent unique items, restricted to the vocabulary.
    # Filter first (cheaper sort); the multi-key sort is stable, so the relative order of
    # the surviving rows is the same as sorting before filtering. Rows without a customer
    # id are dropped, matching what a groupby on customer_id would do.
    in_scope = df["article_id"].isin(idx2item) & df["customer_id"].notna()
    df = df[in_scope].sort_values(["customer_id", "t_dat"])

    # Keep the LAST occurrence of every (customer, article) pair, then the last
    # `user_max_items` rows per customer -> the customer's most recent unique items.
    recent = (
        df.drop_duplicates(subset=["customer_id", "article_id"], keep="last")
        .groupby("customer_id", sort=False)
        .tail(user_max_items)
    )

    n_users = df["customer_id"].nunique()
    avg_items = len(recent) / n_users if n_users else float("nan")
    print(f"[graph] users with at least 1 vocab item = {n_users}")
    print(f"[graph] avg items/user used for graph = {avg_items:.2f}")

    # (3) co-occurrence counts via a sparse product.
    # B is the binary user x item incidence matrix (each pair appears once thanks to the
    # de-duplication above), so (B^T B)[i, j] is the number of users holding both i and j.
    # The diagonal holds per-item user counts, not co-occurrences, and is removed.
    user_codes, user_index = pd.factorize(recent["customer_id"])
    item_codes = recent["article_id"].map(item2idx).to_numpy(dtype=np.int64)
    B = sparse.csr_matrix(
        (np.ones(len(recent), dtype=np.float32), (user_codes, item_codes)),
        shape=(len(user_index), n),
    )
    A = (B.T @ B).tocsr()
    A = (A - sparse.diags(A.diagonal(), format="csr")).tocsr()
    A.eliminate_zeros()
    A.sort_indices()
    A = A.astype(np.float32)

    # A is symmetric with an empty diagonal, so every unordered pair is stored twice.
    print(f"[graph] unique co-occurrence pairs = {A.nnz // 2}")

    # (4) dampen huge counts with log1p (element-wise; this is not a row normalisation)
    A.data = np.log1p(A.data)

    # (5) TruncatedSVD
    svd = TruncatedSVD(n_components=k, random_state=seed)
    emb = svd.fit_transform(A)  # (n_items, k)

    # L2 normalize embeddings (good for cosine); epsilon keeps all-zero rows finite
    norm = np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12
    emb = emb / norm

    emb_cols = [f"gsvd_{d}" for d in range(k)]
    item_emb_df = pd.DataFrame(emb, columns=emb_cols)
    item_emb_df["article_id"] = idx2item
    item_emb_df = item_emb_df[["article_id"] + emb_cols]

    print("[graph] explained variance ratio (sum):", float(np.sum(svd.explained_variance_ratio_)))

    return item_emb_df, item2idx, idx2item, svd


In [8]:
# Build 16-dimensional item embeddings from train_hist only (transactions before
# TRAIN_LABEL_START), then persist the embedding table next to the other train artifacts.
ITEM_EMB_16_DF, item2idx, idx2item, svd_model = build_item_graph_svd_embeddings(
    train_hist=train_hist,
    k=16,
    top_items=20000,      # tweak: 10000 faster, 30000 richer
    user_max_items=20,    # tweak: 10 faster, 30 richer
    min_item_cnt=5,
    seed=42
)

# overwrite=True: always replace a previously saved embedding table.
save_pickle(ITEM_EMB_16_DF, os.path.join(TRAIN_DIR, "item_graph_svd_emb16.pkl"), overwrite=True)

[graph] vocab size = 20000 (top_items=20000, min_item_cnt=5)
[graph] users with at least 1 vocab item = 1275520
[graph] avg items/user used for graph = 9.65
[graph] unique co-occurrence pairs = 45494204
[graph] explained variance ratio (sum): 0.4089445471763611
[saved] ../data/train/item_graph_svd_emb16.pkl


In [9]:
def build_user_profile_from_item_emb(
    train_hist: pd.DataFrame,
    item_emb_df: pd.DataFrame,
    k: int = 16,
    user_max_items: int = 20
):
    """
    For each user, build a k-dim profile embedding by averaging embeddings of recent unique items (from train_hist).

    The averaging is done with one merge + groupby instead of a Python loop over
    customers, so it scales to millions of users.

    Parameters
    ----------
    train_hist : DataFrame with columns ``customer_id``, ``t_dat``, ``article_id``.
    item_emb_df : DataFrame with ``article_id`` and ``gsvd_0 .. gsvd_{k-1}``.
    k : embedding width (number of ``gsvd_*`` columns to use).
    user_max_items : how many of a user's most recent unique embedded items to average.

    Returns
    -------
    DataFrame with columns ``customer_id, user_gsvd_0 .. user_gsvd_{k-1}`` (float64),
    one row per customer that has at least one item with an embedding, ordered by
    ``customer_id``. Each profile is L2-normalised. The columns are present even when
    no customer qualifies.
    """
    emb_cols = [f"gsvd_{d}" for d in range(k)]
    user_cols = [f"user_gsvd_{d}" for d in range(k)]

    # Only these three columns are used; copying just them keeps the working set small.
    df = train_hist[["customer_id", "t_dat", "article_id"]].copy()
    df["t_dat"] = pd.to_datetime(df["t_dat"])
    df["article_id"] = df["article_id"].astype(int)

    # keep only items with embeddings
    df = df[df["article_id"].isin(item_emb_df["article_id"])]
    df = df.sort_values(["customer_id", "t_dat"])

    # Keep the LAST occurrence of every (customer, article) pair, then the last
    # `user_max_items` rows per customer -> the customer's most recent unique items.
    recent = (
        df.drop_duplicates(subset=["customer_id", "article_id"], keep="last")
        .groupby("customer_id", sort=False)
        .tail(user_max_items)
    )

    # Attach each recent item's embedding and average per customer.
    joined = recent[["customer_id", "article_id"]].merge(
        item_emb_df[["article_id"] + emb_cols], on="article_id", how="inner"
    )
    mean_emb = joined.groupby("customer_id", sort=True, observed=True)[emb_cols].mean()

    # L2 normalize; epsilon keeps an all-zero mean finite
    prof = mean_emb.to_numpy(dtype=np.float64)
    prof = prof / (np.linalg.norm(prof, axis=1, keepdims=True) + 1e-12)

    user_prof = pd.DataFrame(prof, columns=user_cols)
    user_prof.insert(0, "customer_id", mean_emb.index.to_numpy())
    return user_prof

In [10]:
# Build 16-dim user profiles from the same history (train_hist) and the item embeddings
# above, using the same user_max_items=20 as the graph build.
USER_PROF_16 = build_user_profile_from_item_emb(
    train_hist=train_hist,
    item_emb_df=ITEM_EMB_16_DF,
    k=16,
    user_max_items=20
)

# overwrite=True: always replace a previously saved profile table.
save_pickle(USER_PROF_16, os.path.join(TRAIN_DIR, "user_graph_profile_emb16.pkl"), overwrite=True)
USER_PROF_16.head()

KeyboardInterrupt: 

In [None]:
def add_graph_embedding_features(rank_df: pd.DataFrame, item_emb_df: pd.DataFrame, user_prof_df: pd.DataFrame, k: int = 16):
    """Attach item graph embeddings and a user-item similarity score to a ranking frame.

    Left-joins `item_emb_df` on ``article_id`` and `user_prof_df` on ``customer_id``,
    zero-fills missing ``gsvd_*`` / ``user_gsvd_*`` values (cast to float32), and adds
    ``ui_gsvd_cosine`` = dot product of the user and item vectors (equal to the cosine
    when both inputs are L2-normalised; 0 when either side is missing). The
    ``user_gsvd_*`` columns are removed from the result. `rank_df` is not modified.
    """
    item_cols = [f"gsvd_{d}" for d in range(k)]
    prof_cols = [f"user_gsvd_{d}" for d in range(k)]

    # merge() always returns a new frame, so the caller's rank_df stays untouched
    enriched = (
        rank_df
        .merge(item_emb_df, on="article_id", how="left")
        .merge(user_prof_df, on="customer_id", how="left")
    )

    # items/users outside the embedding vocabulary get an all-zero vector
    for name in item_cols + prof_cols:
        enriched[name] = enriched[name].fillna(0.0).astype(np.float32)

    user_mat = enriched[prof_cols].values
    item_mat = enriched[item_cols].values
    enriched["ui_gsvd_cosine"] = np.multiply(user_mat, item_mat).sum(axis=1).astype(np.float32)

    # only the similarity score is kept from the user side to keep the feature space small
    return enriched.drop(columns=prof_cols)

# Add the graph features to both ranking frames. The same item embeddings and user
# profiles (both built from train_hist) are used for train and validation.
rank_df_train_g = add_graph_embedding_features(rank_df_train_f, ITEM_EMB_16_DF, USER_PROF_16, k=16)
rank_df_val_g   = add_graph_embedding_features(rank_df_val_f,   ITEM_EMB_16_DF, USER_PROF_16, k=16)

# Persist the augmented frames, replacing any earlier copy.
save_pickle(rank_df_train_g, os.path.join(TRAIN_DIR, "rank_df_train_features_plus_graph16.pkl"), overwrite=True)
save_pickle(rank_df_val_g,   os.path.join(TRAIN_DIR, "rank_df_valid_features_plus_graph16.pkl"), overwrite=True)

print("Added graph features. Columns now:", len(rank_df_train_g.columns))
# Quick sanity check on two embedding dimensions and the similarity score.
rank_df_train_g[["gsvd_0","gsvd_1","ui_gsvd_cosine"]].describe()


In [None]:
# Feature groups, by what they describe.
F_USER = ['tx_cnt', 'unique_items', 'recency_days']
F_ITEM_ATTR = [
    'product_type_no', 'product_group_id', 'colour_group_code',
    'department_no', 'index_group_no', 'garment_group_no',
]
F_ITEM_POP = ['item_popularity', 'unique_buyers', 'item_recency_days']
F_UI = ['ui_cnt', 'ui_recency_days']

# Full baseline feature list. The column order is spelled out explicitly because it
# differs from F_ITEM_ATTR (product_group_id comes last among the item attributes here).
F_ALL = [
    'tx_cnt', 'unique_items', 'recency_days',
    'product_type_no', 'colour_group_code', 'department_no',
    'index_group_no', 'garment_group_no', 'product_group_id',
    'item_popularity', 'unique_buyers', 'item_recency_days',
    'ui_cnt', 'ui_recency_days',
]

# Named feature sets for ablation experiments (insertion order is kept for the printout).
FEATURE_SETS = {}
FEATURE_SETS["all"] = F_ALL
FEATURE_SETS["user_only"] = F_USER
FEATURE_SETS["item_attr_only"] = F_ITEM_ATTR
FEATURE_SETS["item_pop_only"] = F_ITEM_POP
FEATURE_SETS["ui_only"] = F_UI
FEATURE_SETS["user+ui"] = F_USER + F_UI
FEATURE_SETS["item_attr+pop"] = F_ITEM_ATTR + F_ITEM_POP
FEATURE_SETS["all_minus_ui"] = [name for name in F_ALL if name not in F_UI]
FEATURE_SETS["all_minus_pop"] = [name for name in F_ALL if name not in F_ITEM_POP]

print({set_name: len(cols) for set_name, cols in FEATURE_SETS.items()})

In [None]:
# Graph features: the 16 item-embedding dimensions plus the user-item similarity score.
F_GRAPH_16 = [f"gsvd_{d}" for d in range(16)] + ["ui_gsvd_cosine"]

# Extend your feature sets
# ("+" builds new lists, so the baseline lists in FEATURE_SETS are not modified.)
FEATURE_SETS["all_plus_graph16"] = FEATURE_SETS["all"] + F_GRAPH_16
# This variant uses the raw item embedding dimensions only, without ui_gsvd_cosine.
FEATURE_SETS["item_pop_plus_graph16"] = FEATURE_SETS["item_pop_only"] + [f"gsvd_{d}" for d in range(16)]  # optional variant

print("all_plus_graph16:", len(FEATURE_SETS["all_plus_graph16"]))


In [None]:
def group_sizes_from_sorted(df, group_key="customer_id"):
    """Return the row count of each `group_key` group as a list, ordered by sorted key.

    For a frame already sorted by `group_key` this is the per-query group-size
    list in row order.
    """
    sizes = df.groupby(group_key).size()
    return sizes.to_list()

def make_lgb_data(rank_df, feature_cols):
    """Prepare ranker inputs from a ranking frame.

    Returns ``(sorted_df, X, y, group)``: the frame sorted by ``customer_id``, its
    `feature_cols` sub-frame, the ``label`` column as int8, and the per-customer
    group sizes.
    """
    ordered = ensure_sorted_by_group(rank_df, group_key="customer_id")
    features = ordered[feature_cols]
    labels = ordered["label"].astype(np.int8)
    query_sizes = group_sizes_from_sorted(ordered, group_key="customer_id")
    return ordered, features, labels, query_sizes

def append_result_row(row, csv_path=RESULT_CSV):
    """Append one result record (a dict) as a row to the CSV at `csv_path`.

    The header is written only when the file does not exist yet; later rows are
    appended without a header.
    """
    record = pd.DataFrame([row])
    file_exists = os.path.exists(csv_path)
    record.to_csv(
        csv_path,
        mode="a" if file_exists else "w",
        header=not file_exists,
        index=False,
    )
    print(f"[appended] {csv_path}")
    
def apk(actual, predicted, k=12):
    """Average precision at k for one query.

    Parameters
    ----------
    actual : set/list of true (relevant) items
    predicted : list of predicted items, best first
    k : cutoff; only the first k predictions are scored

    Returns
    -------
    float in [0, 1]; 0.0 when `actual` is empty.

    A predicted item only counts as a hit at its first occurrence, so duplicated
    predictions cannot inflate the score above 1.0.
    """
    top_k = predicted[:k]
    score = 0.0
    hits = 0.0
    for pos, p in enumerate(top_k):
        # membership test on the prefix (not a set) so unhashable items keep working
        if p in actual and p not in top_k[:pos]:
            hits += 1.0
            score += hits / (pos + 1)
    denom = min(len(actual), k)
    return score / denom if denom > 0 else 0.0

def mapk_from_scored_df(df, k=12):
    """Mean average precision at k over the customers present in `df`.

    `df` needs the columns ``customer_id``, ``article_id``, ``label`` and ``score``.
    For every customer the articles are ranked by descending score; the relevant set
    is the customer's articles with ``label == 1`` in `df`. Returns 0.0 for an empty
    frame.
    """
    ranked = df.sort_values(["customer_id", "score"], ascending=[True, False])

    # relevant articles per customer, taken from the labelled rows only
    positives = ranked[ranked["label"] == 1]
    relevant = {
        cust: set(arts.tolist())
        for cust, arts in positives.groupby("customer_id")["article_id"]
    }

    per_customer = [
        apk(relevant.get(cust, set()), arts.tolist(), k=k)
        for cust, arts in ranked.groupby("customer_id")["article_id"]
    ]
    if not per_customer:
        return 0.0
    return float(np.mean(per_customer))

def ensure_sorted_by_group(df, group_key="customer_id"):
    """Return a copy of `df` sorted by `group_key` with a fresh 0..n-1 index.

    LightGBM ranking expects the rows of each group to be contiguous.
    """
    ordered = df.sort_values(by=[group_key])
    return ordered.reset_index(drop=True)

In [None]:
def run_experiment(
    exp_id: str,
    exp_name: str,
    recall_name: str,
    feature_cols: list,
    lgb_params: dict,
    rank_df_train: pd.DataFrame,
    rank_df_val: pd.DataFrame,
    save_model: bool = True,
):
    """Train one LGBMRanker, evaluate MAP@12 on the validation frame and log the run.

    Parameters
    ----------
    exp_id, exp_name : identifiers; together they name the config JSON and model pickle.
    recall_name : free-text tag of the recall stage, only written to the logs.
    feature_cols : columns of the ranking frames used as model features.
    lgb_params : keyword arguments for ``lgb.LGBMRanker``; its ``eval_at`` entry
        (default ``[12]``) is also forwarded to ``fit``.
    rank_df_train, rank_df_val : ranking frames with ``customer_id``, ``article_id``,
        ``label`` and every column in `feature_cols`.
    save_model : pickle the fitted ranker into MODEL_DIR when true.

    Returns
    -------
    dict : the result row that was appended to the results CSV.

    Side effects: writes a config JSON to CFG_DIR, optionally a model pickle to
    MODEL_DIR, and appends one row via ``append_result_row``.

    Reads the notebook globals ``valid_gt``, ``cust_to_candidates``, ``VALID_START``,
    ``VALID_END``, ``TRAIN_LABEL_START``, ``CFG_DIR`` and ``MODEL_DIR``.
    """
    # The timer starts before the data preparation, so train_time_sec covers
    # make_lgb_data() as well as fit().
    start = datetime.now()

    # build lgb data
    train_sorted, X_train, y_train, group_train = make_lgb_data(rank_df_train, feature_cols)
    val_sorted, X_val, y_val, group_val = make_lgb_data(rank_df_val, feature_cols)

    # train
    ranker = lgb.LGBMRanker(**lgb_params)
    ranker.fit(
        X_train, y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        eval_at=lgb_params.get("eval_at", [12]),
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            lgb.log_evaluation(period=50)
        ]
    )

    # getattr with a default: falls back to None if the attribute is missing
    best_iter = getattr(ranker, "best_iteration_", None)
    train_time_sec = (datetime.now() - start).total_seconds()

    # eval
    # X_val comes from val_sorted, so the scores line up row-by-row with it.
    val_scores = ranker.predict(X_val, num_iteration=best_iter)
    tmp = val_sorted[["customer_id", "article_id", "label"]].copy()
    tmp["score"] = val_scores
    manual_map12 = mapk_from_scored_df(tmp, k=12)

    # recall stats on valid customers (for logging)
    # Customers missing from cust_to_candidates count as having 0 candidates.
    valid_customers = set(valid_gt.keys())
    cand_counts = [len(cust_to_candidates.get(c, [])) for c in valid_customers]
    val_avg_cand = float(np.mean(cand_counts)) if cand_counts else 0.0
    val_med_cand = float(np.median(cand_counts)) if cand_counts else 0.0
    val_min_cand = int(np.min(cand_counts)) if cand_counts else 0
    val_max_cand = int(np.max(cand_counts)) if cand_counts else 0
    val_pos_rate = float(rank_df_val["label"].mean())

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # save config
    cfg = {
        "exp_id": exp_id,
        "exp_name": exp_name,
        "recall_name": recall_name,
        "timestamp": timestamp,
        "feature_cols": feature_cols,
        "lgb_params": lgb_params,
        "num_train_rows": int(X_train.shape[0]),
        "num_val_rows": int(X_val.shape[0]),
        "num_features": int(X_train.shape[1]),
        "valid_start": str(VALID_START.date()),
        # VALID_END is exclusive; log the last included day instead
        "valid_end": str((VALID_END - pd.Timedelta(days=1)).date()),
        "train_label_start": str(TRAIN_LABEL_START.date()),
    }
    cfg_path = os.path.join(CFG_DIR, f"{exp_id}_{exp_name}.json")
    save_json(cfg, cfg_path, overwrite=True)

    # save model
    model_path = None
    if save_model:
        model_path = os.path.join(MODEL_DIR, f"{exp_id}_{exp_name}.pkl")
        save_pickle(ranker, model_path, overwrite=True)

    # One flat record per run; append_result_row writes it without a header when the
    # CSV already exists, so the key order here defines the column order of new rows.
    row = {
        "exp_id": exp_id,
        "exp_name": exp_name,
        "recall_name": recall_name,
        "timestamp": timestamp,
        "best_iteration": int(best_iter) if best_iter is not None else None,
        "train_time_sec": round(train_time_sec, 2),
        "val_map12_manual": manual_map12,
        "num_train_rows": int(X_train.shape[0]),
        "num_val_rows": int(X_val.shape[0]),
        "num_features": int(X_train.shape[1]),
        "model_path": model_path,
        "config_path": cfg_path,
        "val_avg_candidates": val_avg_cand,
        "val_med_candidates": val_med_cand,
        "val_min_candidates": val_min_cand,
        "val_max_candidates": val_max_cand,
        "val_pos_rate": val_pos_rate,
    }
    append_result_row(row)
    return row

In [None]:
SEED = 42

# Shared LGBMRanker settings; each experiment below may override individual keys.
BASE_PARAMS = dict(
    objective="lambdarank",
    metric="map",
    eval_at=[12],
    learning_rate=0.05,
    n_estimators=500,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    reg_lambda=1.0,
    random_state=SEED,
    bagging_seed=SEED,
    feature_fraction_seed=SEED,
    data_random_seed=SEED,
    n_jobs=-1,
    # GPU (optional)
    # device_type="gpu",
    # gpu_platform_id=0,
    # gpu_device_id=0,
)

# (exp_id, exp_name, feature-set key in FEATURE_SETS, parameter overrides)
# An empty override dict means the run uses BASE_PARAMS unchanged.
EXPS_EXTRA = [
    ("E13", "all_plus_graph16", "all_plus_graph16", {}),
    ("E14", "stronger_reg_plus_graph16", "all_plus_graph16", {"min_data_in_leaf":200, "reg_lambda":5.0, "reg_alpha":1.0}),
]
RECALL_NAME = "final_merged_recall_v2"

rows_extra = []
for exp_id, exp_name, feat_set_name, override in EXPS_EXTRA:
    feats = FEATURE_SETS[feat_set_name]
    # later keys win: override entries replace the matching BASE_PARAMS entries
    params = {**BASE_PARAMS, **override}

    row = run_experiment(
        exp_id=exp_id,
        exp_name=exp_name,
        recall_name=RECALL_NAME,
        feature_cols=feats,
        lgb_params=params,
        rank_df_train=rank_df_train_g,   # <-- graph-added
        rank_df_val=rank_df_val_g,       # <-- graph-added
        save_model=True
    )
    rows_extra.append(row)
    print(exp_id, exp_name, "MAP@12 =", row["val_map12_manual"])
