# H&M Recommendation System — 03 Ranking

This notebook takes the (customer, candidate_item) pairs generated by `02_recall.ipynb`, builds features and labels for them, and then trains a ranking model (LightGBM Ranker).
- Train window: <= 2020-09-15
- Valid window: 2020-09-16 ~ 2020-09-22

In [1]:
import os
import json
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import lightgbm as lgb

# --- Input locations ---
DATA_DIR = "../data"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
RECALL_DIR = os.path.join(DATA_DIR, "recall")
HM_DATA_DIR = "../hm_data"

# --- Output locations ---
OUT_DIR = "../experiments"
MODEL_DIR = "../models"
CFG_DIR = "../experiments/configs"
RESULT_CSV = "../experiments/results.csv"

# Make sure these directories exist before any cell reads from or writes to them.
for _required_dir in (TRAIN_DIR, HM_DATA_DIR, OUT_DIR, MODEL_DIR, CFG_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# --- Time windows ---
# Validation labels cover [VALID_START, VALID_END); VALID_END itself is excluded.
VALID_START = pd.Timestamp("2020-09-16")
VALID_END = pd.Timestamp("2020-09-23")  # exclusive

# Internal train label window: the 7 days immediately before VALID_START.
TRAIN_LABEL_START = VALID_START - pd.Timedelta(days=7)

# --- Recall sizes ---
# NOTE(review): none of the recall-size constants or CATEGORY_COL are read by the cells
# visible in this notebook; presumably they mirror the 02_recall settings — confirm.
N_HISTORY = 30
N_POP = 20
N_CATEGORY = 20
N_COPURCHASE = 30
MAX_CANDIDATES = 100   # final merge cap per user

# Category mapping source column
CATEGORY_COL = "product_type_no"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def save_pickle(obj, path, overwrite=False):
    """Pickle ``obj`` to ``path``.

    When ``path`` already exists and ``overwrite`` is false, nothing is written and a
    ``[skip]`` line is printed instead. Otherwise the file is (re)written and a
    ``[saved]`` line is printed. Returns ``None`` in both cases.
    """
    keep_existing = (not overwrite) and os.path.exists(path)
    if keep_existing:
        print(f"[skip] exists: {path}")
        return

    with open(path, "wb") as fh:
        pickle.dump(obj, fh)
    print(f"[saved] {path}")

def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files that this
    project wrote itself.
    """
    with open(path, "rb") as fh:
        loaded = pickle.load(fh)
    return loaded

def save_json(obj, path, overwrite=False):
    """Write ``obj`` to ``path`` as UTF-8 JSON (2-space indent, non-ASCII kept as-is).

    When ``path`` already exists and ``overwrite`` is false, nothing is written and a
    ``[skip]`` line is printed instead. Returns ``None`` in both cases.
    """
    keep_existing = (not overwrite) and os.path.exists(path)
    if keep_existing:
        print(f"[skip] exists: {path}")
        return

    with open(path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh, ensure_ascii=False, indent=2)
    print(f"[saved] {path}")

### Step 1: Load data

In [3]:
# Pickled transaction frames stored under TRAIN_DIR.
train_df = load_pickle(os.path.join(TRAIN_DIR, "train_df.pkl"))
valid_df = load_pickle(os.path.join(TRAIN_DIR, "valid_df.pkl"))

# Coerce t_dat to datetime so it can be compared against the window boundaries below.
for _tx in (train_df, valid_df):
    _tx["t_dat"] = pd.to_datetime(_tx["t_dat"])

print("Train window:", train_df["t_dat"].min(), "->", train_df["t_dat"].max(), train_df.shape)
print("Valid window:", valid_df["t_dat"].min(), "->", valid_df["t_dat"].max(), valid_df.shape)


def _purchases_by_customer(tx):
    """Map each customer_id in ``tx`` to the set of (int) article_ids on its rows."""
    return tx.groupby("customer_id")["article_id"].apply(lambda s: set(s.astype(int))).to_dict()


# Split train_df at TRAIN_LABEL_START:
#   train_hist         -> everything strictly before the label week
#   train_label_period -> [TRAIN_LABEL_START, VALID_START), source of the train labels
_in_label_week = (train_df["t_dat"] >= TRAIN_LABEL_START) & (train_df["t_dat"] < VALID_START)
train_hist = train_df[train_df["t_dat"] < TRAIN_LABEL_START].copy()
train_label_period = train_df[_in_label_week].copy()
train_gt = _purchases_by_customer(train_label_period)

# Validation ground truth: purchases inside [VALID_START, VALID_END).
_in_valid_week = (valid_df["t_dat"] >= VALID_START) & (valid_df["t_dat"] < VALID_END)
valid_period = valid_df[_in_valid_week].copy()
valid_gt = _purchases_by_customer(valid_period)

# Candidate lists per customer, read from the recall output directory.
cust_to_candidates = load_pickle(os.path.join(RECALL_DIR, "recall_final_merged.pkl"))


Train window: 2018-09-20 00:00:00 -> 2020-09-15 00:00:00 (31548013, 5)
Valid window: 2020-09-16 00:00:00 -> 2020-09-22 00:00:00 (240311, 5)


In [4]:
# Item metadata table.
articles_path = os.path.join(HM_DATA_DIR, "articles.csv")
articles = pd.read_csv(articles_path)

# Derive an integer code from product_group_name when the file has no product_group_id column.
if "product_group_id" not in articles.columns:
    _group_codes = pd.factorize(articles["product_group_name"])[0]
    articles["product_group_id"] = _group_codes.astype(np.int32)

# Columns carried into the ranking features; article_id is the join key.
_article_cols = [
    "article_id",
    "product_type_no",
    "product_group_id",
    "colour_group_code",
    "department_no",
    "index_group_no",
    "garment_group_no",
]
# Cast the key to int64, the same dtype item_pop and ui use for article_id.
articles_use = articles[_article_cols].astype({"article_id": np.int64})

### Step 2: Build ranking datasets (train / valid)
labels:
- train labels from train_gt (last 7 days inside train)
- valid labels from valid_gt (2020-09-16 ~ 2020-09-22)

In [None]:
# NOTE: this cell is left commented out. Per its saved output it was run once and wrote
# rank_df_train.pkl / rank_df_valid.pkl under TRAIN_DIR; the following cell reloads those
# pickles instead of rebuilding them.
#
# What it builds: one row per (customer, candidate article) from cust_to_candidates, with
# label = 1 when the article is in that customer's ground-truth set and 0 otherwise.
# Both frames use the SAME candidate dict; only the label source differs
# (train_gt for the train frame, valid_gt for the valid frame).
#
# NOTE(review): if 02_recall built these candidates from transactions up to 2020-09-15,
# they overlap the train label week — confirm this does not leak labels into rank_df_train.
# NOTE(review): rows are accumulated as one Python dict per pair, which is slow and
# memory-hungry at this scale; consider a vectorized build if this is ever re-enabled.
# def build_rank_df(cust_to_cand, gt_dict, name="rank"):
#     rows = []
#     for cust, cands in tqdm(cust_to_cand.items(), desc=f"Building {name}"):
#         gt = gt_dict.get(cust, set())
#         for aid in cands:
#             rows.append({
#                 "customer_id": cust,
#                 "article_id": int(aid),
#                 "label": 1 if int(aid) in gt else 0
#             })
#     df = pd.DataFrame(rows)
#     return df

# rank_df_train = build_rank_df(cust_to_candidates, train_gt, name="rank_train")
# rank_df_val = build_rank_df(cust_to_candidates, valid_gt, name="rank_valid")

# print("rank_df_train:", rank_df_train.shape, "pos_rate:", rank_df_train["label"].mean())
# print("rank_df_val:", rank_df_val.shape, "pos_rate:", rank_df_val["label"].mean())

# save_pickle(rank_df_train, os.path.join(TRAIN_DIR, "rank_df_train.pkl"), overwrite=True)
# save_pickle(rank_df_val, os.path.join(TRAIN_DIR, "rank_df_valid.pkl"), overwrite=True)

Building rank_train: 100%|██████████| 1362281/1362281 [01:39<00:00, 13726.53it/s]
Building rank_valid: 100%|██████████| 1362281/1362281 [01:45<00:00, 12925.06it/s]


rank_df_train: (269726031, 3) pos_rate: 0.0008449685006487194
rank_df_val: (269726031, 3) pos_rate: 0.00014426861158239488
[saved] ../data/train/rank_df_train.pkl
[saved] ../data/train/rank_df_valid.pkl


In [5]:
# Reload the cached (customer, candidate, label) frames instead of rebuilding them.
_rank_train_path = os.path.join(TRAIN_DIR, "rank_df_train.pkl")
_rank_valid_path = os.path.join(TRAIN_DIR, "rank_df_valid.pkl")

rank_df_train = load_pickle(_rank_train_path)
rank_df_val = load_pickle(_rank_valid_path)

### Step 3: Feature engineering (ALL computed from train_hist only)

1. User features

In [6]:
# Per-customer activity summary over train_hist:
#   tx_cnt        number of transaction rows
#   unique_items  number of distinct articles
#   recency_days  whole days between the customer's last transaction and VALID_START
user_feat = (
    train_hist
    .groupby("customer_id")
    .agg(
        tx_cnt=("article_id", "count"),
        unique_items=("article_id", "nunique"),
        last_date=("t_dat", "max"),
    )
    .reset_index()
    .assign(recency_days=lambda d: (VALID_START - d["last_date"]).dt.days.astype(np.int32))
    # last_date was only needed to derive recency_days.
    .drop(columns=["last_date"])
)

2. Item & popularity features

In [7]:
# Per-article popularity summary over train_hist:
#   item_popularity    number of transaction rows
#   unique_buyers      number of distinct customers
#   item_recency_days  whole days between the article's last sale and VALID_START
item_pop = (
    train_hist
    .groupby("article_id")
    .agg(
        item_popularity=("customer_id", "count"),
        unique_buyers=("customer_id", "nunique"),
        item_last_date=("t_dat", "max"),
    )
    .reset_index()
    .assign(item_recency_days=lambda d: (VALID_START - d["item_last_date"]).dt.days.astype(np.int32))
    # item_last_date was only needed to derive item_recency_days.
    .drop(columns=["item_last_date"])
    # int64 key, matching articles_use["article_id"].
    .astype({"article_id": np.int64})
)

3. User-item interaction features (based on train_hist only)

In [8]:
# Per (customer, article) repeat-purchase summary over train_hist:
#   ui_cnt           number of transaction rows for the pair
#   ui_recency_days  whole days between the pair's last transaction and VALID_START
ui = (
    train_hist
    .groupby(["customer_id", "article_id"])
    .agg(
        ui_cnt=("t_dat", "count"),
        ui_last_date=("t_dat", "max"),
    )
    .reset_index()
    .assign(ui_recency_days=lambda d: (VALID_START - d["ui_last_date"]).dt.days.astype(np.int32))
    # ui_last_date was only needed to derive ui_recency_days.
    .drop(columns=["ui_last_date"])
    # int64 key, matching articles_use["article_id"].
    .astype({"article_id": np.int64})
)

In [9]:
def add_features(rank_df):
    """Attach user, item-attribute, item-popularity and user-item features to ``rank_df``.

    ``rank_df`` is left-joined against the module-level tables ``user_feat``,
    ``articles_use``, ``item_pop`` and ``ui``, so every input row is kept. Values that
    are missing after the joins are filled with 0 (count / recency features) or -1
    (item attribute codes), and all feature columns are cast to int32.

    Returns the merged DataFrame; ``rank_df`` itself is not modified.
    """
    # Left joins: keys without a match produce NaN, which is filled below.
    df = (
        rank_df
        .merge(user_feat, on="customer_id", how="left")
        .merge(articles_use, on="article_id", how="left")
        .merge(item_pop, on="article_id", how="left")
        .merge(ui, on=["customer_id", "article_id"], how="left")
    )

    # Cold users / items / pairs have no history rows: fill with 0.
    # NOTE: 0 is also used for the *_recency_days columns, so there it acts as a
    # "no history" marker rather than meaning "seen on VALID_START".
    zero_filled_groups = (
        ("Fill user cold features", ["tx_cnt", "unique_items", "recency_days"]),
        ("Fill item cold features", ["item_popularity", "unique_buyers", "item_recency_days"]),
        ("Fill UI features", ["ui_cnt", "ui_recency_days"]),
    )
    for progress_label, columns in zero_filled_groups:
        for col in tqdm(columns, desc=progress_label):
            df[col] = df[col].fillna(0).astype(np.int32)

    # Articles missing from articles_use get -1 as their attribute code.
    attribute_columns = (
        "product_type_no",
        "product_group_id",
        "colour_group_code",
        "department_no",
        "index_group_no",
        "garment_group_no",
    )
    for col in attribute_columns:
        df[col] = df[col].fillna(-1).astype(np.int32)

    return df

In [None]:
# NOTE: this cell is left commented out. Per its saved output it was run once and wrote
# rank_df_train_features.pkl / rank_df_valid_features.pkl under TRAIN_DIR; the following
# cell reloads those pickles instead of recomputing the features.
# Re-enable it after changing add_features() or any of the feature tables it joins,
# otherwise the cached pickles will not reflect the change.
# rank_df_train_f = add_features(rank_df_train)
# rank_df_val_f = add_features(rank_df_val)

# save_pickle(rank_df_train_f, os.path.join(TRAIN_DIR, "rank_df_train_features.pkl"), overwrite=True)
# save_pickle(rank_df_val_f, os.path.join(TRAIN_DIR, "rank_df_valid_features.pkl"), overwrite=True)

# print("rank_df_train_f columns:", len(rank_df_train_f.columns))

[saved] ../data/train/rank_df_train_features.pkl
[saved] ../data/train/rank_df_valid_features.pkl
rank_df_train_f columns: 17


In [10]:
# Reload the cached feature frames instead of re-running add_features().
_train_features_path = os.path.join(TRAIN_DIR, "rank_df_train_features.pkl")
_valid_features_path = os.path.join(TRAIN_DIR, "rank_df_valid_features.pkl")

rank_df_train_f = load_pickle(_train_features_path)
rank_df_val_f = load_pickle(_valid_features_path)

### Step 4: Train LightGBM

In [11]:
# Feature groups, by the table each column comes from.
F_USER = ['tx_cnt', 'unique_items', 'recency_days']
F_ITEM_ATTR = ['product_type_no', 'product_group_id', 'colour_group_code', 'department_no', 'index_group_no', 'garment_group_no']
F_ITEM_POP = ['item_popularity', 'unique_buyers', 'item_recency_days']
F_UI = ['ui_cnt', 'ui_recency_days']

# Full feature list. The item-attribute columns appear here in a different order than in
# F_ITEM_ATTR (product_group_id last); that order is kept as-is because it is the column
# order handed to the model for the "all" experiments.
F_ALL = (
    F_USER
    + ['product_type_no', 'colour_group_code', 'department_no', 'index_group_no', 'garment_group_no', 'product_group_id']
    + F_ITEM_POP
    + F_UI
)

_ui_cols = set(F_UI)
_pop_cols = set(F_ITEM_POP)

# Experiment name -> ordered list of feature columns (single groups, pairs, and ablations).
FEATURE_SETS = {
    "all": F_ALL,
    "user_only": F_USER,
    "item_attr_only": F_ITEM_ATTR,
    "item_pop_only": F_ITEM_POP,
    "ui_only": F_UI,
    "user+ui": F_USER + F_UI,
    "item_attr+pop": F_ITEM_ATTR + F_ITEM_POP,
    "all_minus_ui": [col for col in F_ALL if col not in _ui_cols],
    "all_minus_pop": [col for col in F_ALL if col not in _pop_cols],
}

print({k: len(v) for k, v in FEATURE_SETS.items()})

{'all': 14, 'user_only': 3, 'item_attr_only': 6, 'item_pop_only': 3, 'ui_only': 2, 'user+ui': 5, 'item_attr+pop': 9, 'all_minus_ui': 12, 'all_minus_pop': 11}


In [12]:
def group_sizes_from_sorted(df, group_key="customer_id"):
    """Return the number of rows per ``group_key`` value as a plain list.

    The sizes come out in sorted-key order (pandas ``groupby`` default), so they only
    line up with the row order of ``df`` when ``df`` is already sorted by ``group_key``.
    """
    rows_per_group = df.groupby(group_key).size()
    return rows_per_group.tolist()

def make_lgb_data(rank_df, feature_cols):
    """Prepare the inputs for a LightGBM ranker from a labelled candidate frame.

    Returns a 4-tuple ``(df, X, y, group)``:
      df     ``rank_df`` sorted by customer_id with a fresh 0..n-1 index
      X      the ``feature_cols`` columns of ``df``
      y      the ``label`` column of ``df`` cast to int8
      group  rows per customer, in the same order as the customers appear in ``df``
    """
    ordered = ensure_sorted_by_group(rank_df, group_key="customer_id")
    group = group_sizes_from_sorted(ordered, group_key="customer_id")
    features = ordered[feature_cols]
    labels = ordered["label"].astype(np.int8)
    return ordered, features, labels, group

def append_result_row(row, csv_path=RESULT_CSV):
    """Append one result dict as a single CSV row to ``csv_path``.

    A new file is created with a header row. When the file already exists, the row is
    appended WITHOUT a header, so the keys of ``row`` must be in the same order as the
    columns already in the file.
    """
    file_exists = os.path.exists(csv_path)
    pd.DataFrame([row]).to_csv(
        csv_path,
        mode="a" if file_exists else "w",
        header=not file_exists,
        index=False,
    )
    print(f"[appended] {csv_path}")
    
def apk(actual, predicted, k=12):
    """Average precision at ``k`` for one customer.

    Parameters
    ----------
    actual : set or list
        The relevant (truly purchased) items.
    predicted : list
        Predicted items, best first. Only the first ``k`` are considered.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision@i over the ranks i where a relevant item first appears,
        divided by ``min(len(actual), k)``. Returns 0.0 when ``actual`` is empty
        (or ``k`` is 0).

    A relevant item that occurs more than once in ``predicted`` is credited only at
    its first occurrence; crediting repeats would let the score exceed 1.0.
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    hits = 0.0
    # Relevant items already credited. A list (not a set) so items need not be hashable
    # when ``actual`` is a list; it never holds more than k entries.
    credited = []
    for rank, item in enumerate(predicted, start=1):
        if item in actual and item not in credited:
            credited.append(item)
            hits += 1.0
            score += hits / rank

    denom = min(len(actual), k)
    return score / denom if denom > 0 else 0.0

def mapk_from_scored_df(df, k=12, gt_dict=None):
    """Mean average precision at ``k`` over customers, from a scored candidate frame.

    Parameters
    ----------
    df : DataFrame
        Columns ``customer_id``, ``article_id``, ``label`` (1 = relevant) and ``score``.
        Each customer's rows are ranked by descending ``score``.
    k : int
        Cut-off rank passed to ``apk``.
    gt_dict : dict, optional
        ``customer_id -> collection of truly purchased article ids``.

        * ``None`` (default, original behaviour): the relevant set of a customer is
          rebuilt from the rows with ``label == 1``, and the mean runs over every
          customer present in ``df``. Purchases that are not among the candidates are
          therefore invisible to the metric, and customers without any positive row
          contribute a score of 0.
        * given: each customer in ``gt_dict`` with a non-empty purchase collection is
          scored against that full collection, and the mean runs over exactly those
          customers. Customers with no rows in ``df`` score 0; customers absent from
          ``gt_dict`` are ignored. The ids in ``gt_dict`` must compare equal to the
          values in ``df["article_id"]``.

    Returns
    -------
    float
        The mean of the per-customer ``apk`` values, or 0.0 when there is nothing to score.
    """
    if gt_dict is None:
        ranked = df.sort_values(["customer_id", "score"], ascending=[True, False])
        positives = ranked[ranked["label"] == 1]
        gt = positives.groupby("customer_id")["article_id"].apply(list).to_dict()
        pred = ranked.groupby("customer_id")["article_id"].apply(list).to_dict()
        scores = [
            apk(set(gt.get(cust, [])), pred_list, k=k)
            for cust, pred_list in pred.items()
        ]
    else:
        # Only customers with at least one real purchase take part in the mean.
        purchasers = [cust for cust, items in gt_dict.items() if items]
        # Rank only the rows of those customers; the rest cannot affect the result.
        ranked = (
            df[df["customer_id"].isin(purchasers)]
            .sort_values(["customer_id", "score"], ascending=[True, False])
        )
        pred = ranked.groupby("customer_id")["article_id"].apply(list).to_dict()
        scores = [
            apk(set(gt_dict[cust]), pred.get(cust, []), k=k)
            for cust in purchasers
        ]

    return float(np.mean(scores)) if scores else 0.0

def ensure_sorted_by_group(df, group_key="customer_id"):
    """Return a copy of ``df`` sorted by ``group_key`` with a fresh 0..n-1 index.

    LightGBM ranking needs the rows of each group to be contiguous; sorting by the
    group key guarantees that. The order of rows inside a group is not specified.
    """
    ordered = df.sort_values(by=[group_key])
    return ordered.reset_index(drop=True)

In [13]:
def run_experiment(
    exp_id: str,
    exp_name: str,
    recall_name: str,
    feature_cols: list,
    lgb_params: dict,
    rank_df_train: pd.DataFrame,
    rank_df_val: pd.DataFrame,
    save_model: bool = True,
):
    """Train one LightGBM ranker, score the validation frame and log the outcome.

    Parameters
    ----------
    exp_id, exp_name : str
        Identify the experiment; together they name the config JSON and the model pickle.
    recall_name : str
        Free-text tag of the candidate set, stored in the config and the result row.
    feature_cols : list
        Feature columns to train on (must exist in both frames).
    lgb_params : dict
        Keyword arguments for ``lgb.LGBMRanker``; its ``eval_at`` entry (default ``[12]``)
        is also passed to ``fit``.
    rank_df_train, rank_df_val : DataFrame
        Feature frames with ``customer_id``, ``article_id``, ``label`` and ``feature_cols``.
    save_model : bool
        Pickle the fitted ranker under MODEL_DIR when true.

    Returns
    -------
    dict
        The result row that was appended to RESULT_CSV.

    Side effects: writes ``<exp_id>_<exp_name>.json`` to CFG_DIR (always overwritten),
    optionally writes the model pickle, and appends one row to RESULT_CSV.
    Reads the module-level ``valid_gt`` and ``cust_to_candidates`` for candidate statistics.

    NOTE: the timer starts before the frames are sorted, so ``train_time_sec`` covers
    data preparation plus fitting, not fitting alone.
    NOTE(review): in the saved run LightGBM reports map@12 of about 0.98 while the manual
    MAP@12 is about 0.0017; the two metrics are evidently computed differently
    (presumably in how customers without positive rows are treated) — confirm which
    one early stopping should follow.
    """
    started_at = datetime.now()

    # Sort both frames by customer and derive the LightGBM group sizes.
    train_sorted, X_train, y_train, group_train = make_lgb_data(rank_df_train, feature_cols)
    val_sorted, X_val, y_val, group_val = make_lgb_data(rank_df_val, feature_cols)

    # Fit with early stopping on the validation frame.
    training_callbacks = [
        lgb.early_stopping(stopping_rounds=50, verbose=True),
        lgb.log_evaluation(period=50),
    ]
    ranker = lgb.LGBMRanker(**lgb_params)
    ranker.fit(
        X_train,
        y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        eval_at=lgb_params.get("eval_at", [12]),
        callbacks=training_callbacks,
    )

    best_iter = getattr(ranker, "best_iteration_", None)
    train_time_sec = (datetime.now() - started_at).total_seconds()

    # Manual MAP@12 on the validation candidates, scored at the best iteration.
    scored_val = val_sorted[["customer_id", "article_id", "label"]].copy()
    scored_val["score"] = ranker.predict(X_val, num_iteration=best_iter)
    manual_map12 = mapk_from_scored_df(scored_val, k=12)

    # Candidate-list sizes for the customers that have validation ground truth (logging only).
    cand_counts = [len(cust_to_candidates.get(cust, [])) for cust in set(valid_gt.keys())]
    if cand_counts:
        val_avg_cand = float(np.mean(cand_counts))
        val_med_cand = float(np.median(cand_counts))
        val_min_cand = int(np.min(cand_counts))
        val_max_cand = int(np.max(cand_counts))
    else:
        val_avg_cand, val_med_cand, val_min_cand, val_max_cand = 0.0, 0.0, 0, 0
    val_pos_rate = float(rank_df_val["label"].mean())

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Fields shared by the config file and the result row (key order is preserved in both).
    identity = {
        "exp_id": exp_id,
        "exp_name": exp_name,
        "recall_name": recall_name,
        "timestamp": timestamp,
    }
    sizes = {
        "num_train_rows": int(X_train.shape[0]),
        "num_val_rows": int(X_val.shape[0]),
        "num_features": int(X_train.shape[1]),
    }

    # Persist the experiment configuration.
    cfg = {
        **identity,
        "feature_cols": feature_cols,
        "lgb_params": lgb_params,
        **sizes,
        "valid_start": str(VALID_START.date()),
        # VALID_END is exclusive; store the last included day.
        "valid_end": str((VALID_END - pd.Timedelta(days=1)).date()),
        "train_label_start": str(TRAIN_LABEL_START.date()),
    }
    cfg_path = os.path.join(CFG_DIR, f"{exp_id}_{exp_name}.json")
    save_json(cfg, cfg_path, overwrite=True)

    # Persist the fitted model (optional).
    model_path = None
    if save_model:
        model_path = os.path.join(MODEL_DIR, f"{exp_id}_{exp_name}.pkl")
        save_pickle(ranker, model_path, overwrite=True)

    # One CSV row per experiment; the key order defines the column order.
    row = {
        **identity,
        "best_iteration": None if best_iter is None else int(best_iter),
        "train_time_sec": round(train_time_sec, 2),
        "val_map12_manual": manual_map12,
        **sizes,
        "model_path": model_path,
        "config_path": cfg_path,
        "val_avg_candidates": val_avg_cand,
        "val_med_candidates": val_med_cand,
        "val_min_candidates": val_min_cand,
        "val_max_candidates": val_max_cand,
        "val_pos_rate": val_pos_rate,
    }
    append_result_row(row)
    return row

In [None]:
SEED = 42

# Shared LightGBM settings; each experiment may override individual keys.
BASE_PARAMS = {
    "objective": "lambdarank",
    "metric": "map",
    "eval_at": [12],
    "learning_rate": 0.05,
    "n_estimators": 500,
    "num_leaves": 63,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "reg_lambda": 1.0,
    # Every source of randomness is pinned to the same seed.
    "random_state": SEED,
    "bagging_seed": SEED,
    "feature_fraction_seed": SEED,
    "data_random_seed": SEED,
    "n_jobs": -1,
    # GPU (optional)
    # "device_type": "gpu",
    # "gpu_platform_id": 0,
    # "gpu_device_id": 0,
}

# (exp_id, exp_name, feature-set key in FEATURE_SETS, parameter overrides)
EXPS = [
    # Feature-group ablations with the base parameters.
    ("E01", "baseline_all", "all", {}),
    ("E02", "user_only", "user_only", {}),
    ("E03", "item_attr_only", "item_attr_only", {}),
    ("E04", "item_pop_only", "item_pop_only", {}),
    ("E05", "ui_only", "ui_only", {}),
    ("E06", "user_plus_ui", "user+ui", {}),
    ("E07", "item_attr_plus_pop", "item_attr+pop", {}),
    ("E08", "all_minus_ui", "all_minus_ui", {}),
    ("E09", "all_minus_pop", "all_minus_pop", {}),

    # Hyper-parameter variations on the full feature set.
    ("E10", "stronger_reg", "all", {"min_data_in_leaf": 200, "reg_lambda": 5.0, "reg_alpha": 1.0}),
    ("E11", "lr_0.03_more_trees", "all", {"learning_rate": 0.03, "n_estimators": 4000}),
    # NOTE: E11-v2 reuses E11's name but does not override n_estimators, so it trains
    # with the base 500 trees.
    ("E11-v2", "lr_0.03_more_trees", "all", {"learning_rate": 0.03}),
    ("E12", "extra_trees", "all", {"extra_trees": True}),
]

# IMPORTANT:
# - rank_df_train_f / rank_df_val_f must already contain ALL engineered columns
# - FEATURE_SETS maps feat_set_name -> list of feature column names
# - run_experiment() is the time-split version (uses rank_df_train/val DataFrames)
# NOTE(review): the saved output below shows a single experiment (1/1, E11-v2), so the
# list was evidently trimmed for that run; the output does not cover every entry above.

RECALL_NAME = "final_merged_recall_v2"

rows = []
experiment_progress = tqdm(EXPS, desc="Running LGBM Rank Experiments", total=len(EXPS))
for exp_id, exp_name, feat_set_name, override in experiment_progress:
    # Overrides win over the shared base settings.
    params = {**BASE_PARAMS, **override}
    feats = FEATURE_SETS[feat_set_name]

    row = run_experiment(
        exp_id=exp_id,
        exp_name=exp_name,
        recall_name=RECALL_NAME,
        feature_cols=feats,
        lgb_params=params,
        rank_df_train=rank_df_train_f,
        rank_df_val=rank_df_val_f,
        save_model=True,
    )
    rows.append(row)
    print(exp_id, exp_name, feat_set_name, "MAP@12 =", row["val_map12_manual"])

# Optional: show as a dataframe summary
_summary_cols = ["exp_id", "exp_name", "num_features", "best_iteration", "train_time_sec", "val_map12_manual"]
res_df = pd.DataFrame(rows).sort_values("val_map12_manual", ascending=False)
display(res_df[_summary_cols])



[LightGBM] [Info] Total groups: 1362281, total data: 269726031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.478570 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2155
[LightGBM] [Info] Number of data points in the train set: 269726031, number of used features: 14
[LightGBM] [Info] Total groups: 1362281, total data: 269726031
Training until validation scores don't improve for 50 rounds
[50]	valid_0's map@12: 0.981658
[100]	valid_0's map@12: 0.981718
[150]	valid_0's map@12: 0.9818
[200]	valid_0's map@12: 0.981846
[250]	valid_0's map@12: 0.981866
[300]	valid_0's map@12: 0.981884
[350]	valid_0's map@12: 0.981904
[400]	valid_0's map@12: 0.981913
[450]	valid_0's map@12: 0.981927
[500]	valid_0's map@12: 0.981935
Did not meet early stopping. Best iteration is:
[500]	valid_0's map@12: 0.981935




[saved] ../experiments/configs/E11-v2_lr_0.03_more_trees.json
[saved] ../models/E11-v2_lr_0.03_more_trees.pkl
[appended] ../experiments/results.csv


Running LGBM Rank Experiments: 100%|██████████| 1/1 [51:10<00:00, 3070.14s/it]

E11-v2 lr_0.03_more_trees all MAP@12 = 0.0017342082720044477





Unnamed: 0,exp_id,exp_name,num_features,best_iteration,train_time_sec,val_map12_manual
0,E11-v2,lr_0.03_more_trees,14,500,2532.95,0.001734
