### H&M Recommendation System — 03 Ranking

In this notebook, based on (customer, candidate_item) pair generated by `02_recall.ipynb`to create feature and label and then train ranking model(LightGBM Ranker)

#### Step 1: Load Dataset

In [3]:
import os
import gc
from datetime import datetime, timedelta
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle
from tqdm import tqdm

pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)

HM_DIR = "../hm_data"
RECALL_DIR = "../data/recall"
TRAIN_DIR = "../data/train"
RES_DIR = "../data/ranking"
MODEL_DIR = "../models"

VALID_START = pd.to_datetime("2020-09-16")

In [4]:
# save recall results
def save_pickle(obj, path, overwrite=False):
    if os.path.exists(path) and not overwrite:
        print(f"[Skip] {path} already exists.")
        return
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)

In [3]:
cust_cand_pair = load_pickle(os.path.join(RECALL_DIR, "recall_final_merged.pkl"))
valid_gt = load_pickle(os.path.join(TRAIN_DIR, "validation_groundtruth.pkl"))
train_df = load_pickle(os.path.join(TRAIN_DIR, "train_df.pkl"))
valid_df = load_pickle(os.path.join(TRAIN_DIR, "valid_df.pkl"))

In [4]:
len(cust_cand_pair), len(valid_gt)

(1362281, 68984)

In [None]:
NEG_LIMIT = 50
rows = []
for cust in tqdm(valid_gt.keys(), desc="Building rank_df (valid users only)"):
    candidates = cust_cand_pair.get(cust, [])
    valid = valid_gt.get(cust, set())
    negatives = 0
    for article_id in candidates:
        label = 1 if article_id in valid else 0
        if label == 0:
            negatives += 1
        rows.append({"customer_id": cust, "article_id": article_id, "label": label})
        if(negatives > NEG_LIMIT):
            break
rank_df = pd.DataFrame(rows)
    

Building rank_df (valid users only):   6%|▋         | 4466/68984 [00:00<00:02, 22645.59it/s]

Building rank_df (valid users only): 100%|██████████| 68984/68984 [00:02<00:00, 26416.01it/s]


In [6]:
print(rank_df.head(5))
print(rank_df['label'].value_counts())

                                         customer_id  article_id  label
0  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   636455004      0
1  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   625939003      0
2  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   671186001      0
3  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   504155012      0
4  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   632832001      0
label
0    3518184
1       9379
Name: count, dtype: int64


### User Features

In [None]:
# Get user information

# Number of purchases
g = train_df.groupby("customer_id")
tx_cnt = g.size()
tx_cnt.head()
# How many unique items have been bought?
unique_items = g["article_id"].nunique()
# Number of days since the last purchase
train_df["t_dat"] = pd.to_datetime(train_df["t_dat"])
last_date = g["t_dat"].max()
recency_days = (VALID_START - last_date).dt.days

user_features = pd.DataFrame({
    "tx_cnt": tx_cnt,
    "unique_items": unique_items,
    "last_date": last_date,
    "recency_days": recency_days,
})
user_features = user_features.reset_index()
save_pickle(user_features, os.path.join(RES_DIR, "user_features.pkl"))

In [None]:
# merge user info into rank_df
rank_df = rank_df.merge(
    user_features,
    on="customer_id",
    how="left"
)

In [13]:
user_features.isna().mean()

customer_id     0.0
tx_cnt          0.0
unique_items    0.0
last_date       0.0
recency_days    0.0
dtype: float64

In [11]:
user_features.head()

Unnamed: 0,customer_id,tx_cnt,unique_items,last_date,recency_days
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,21,19,2020-09-05,11
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,86,64,2020-07-08,70
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,18,14,2020-09-15,1
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,2,2,2019-06-09,465
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,13,12,2020-08-12,35


In [12]:
rank_df.head()

Unnamed: 0,customer_id,article_id,...,last_date,recency_days
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,636455004,...,2020-02-06,223.0
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,625939003,...,2020-02-06,223.0
2,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,671186001,...,2020-02-06,223.0
3,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,504155012,...,2020-02-06,223.0
4,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,632832001,...,2020-02-06,223.0


### Item Features & Item Popularity Features 

In [14]:
# Get product information
articles = pd.read_csv(os.path.join(HM_DIR, "articles.csv"))
item_meta = articles[[
   "article_id",
   "product_type_no",
   "product_group_name",
   "colour_group_code",
   "department_no",
   "index_group_no",
   "garment_group_no"
]]

g_item = train_df.groupby("article_id")

# how many times this product has been purchased
item_popularity = g_item.size()
# How many different people have bought it
unique_buyers = g_item["customer_id"].nunique()
item_last_date = g_item["t_dat"].max()
# Number of days since the last purchase
item_recency_days = (VALID_START - item_last_date).dt.days

item_agg = pd.DataFrame({
    "item_popularity": item_popularity,
    "unique_buyers": unique_buyers,
    "item_last_date": item_last_date,
    "item_recency_days": item_recency_days
})
item_agg = item_agg.reset_index()
item_features = item_meta.merge(item_agg, on="article_id", how="left")

# NA processing method
item_features["item_popularity"] = item_features["item_popularity"].fillna(0).astype(int)
item_features["unique_buyers"] = item_features["unique_buyers"].fillna(0).astype(int)
very_old_date = pd.to_datetime("1970-01-01")
item_features["item_last_date"] = item_features["item_last_date"].fillna(very_old_date)
item_features["item_recency_days"] = item_features["item_recency_days"].fillna(999).astype(int)

save_pickle(item_features, os.path.join(RES_DIR, "item_features.pkl"))

In [15]:
item_features.head()

Unnamed: 0,article_id,product_type_no,...,item_last_date,item_recency_days
0,108775015,253,...,2020-07-22,56
1,108775044,253,...,2020-09-13,3
2,108775051,253,...,2019-06-28,446
3,110065001,306,...,2020-08-02,45
4,110065002,306,...,2020-08-05,42


In [16]:
item_features.isna().mean()

article_id           0.0
product_type_no      0.0
                    ... 
item_last_date       0.0
item_recency_days    0.0
Length: 11, dtype: float64

In [None]:
rank_df = rank_df.merge(
    item_features,
    on="article_id",
    how="left"
)
save_pickle(rank_df, os.path.join(RES_DIR, "rank_df.pkl"), overwrite=True)

In [18]:
rank_df.head()

Unnamed: 0,customer_id,article_id,...,item_last_date,item_recency_days
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,636455004,...,2019-12-30,261
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,625939003,...,2020-08-25,22
2,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,671186001,...,2020-03-04,196
3,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,504155012,...,2020-02-01,228
4,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,632832001,...,2019-09-05,377


In [19]:
rank_df.columns

Index(['customer_id', 'article_id', 'label', 'tx_cnt', 'unique_items',
       'last_date', 'recency_days', 'product_type_no', 'product_group_name',
       'colour_group_code', 'department_no', 'index_group_no',
       'garment_group_no', 'item_popularity', 'unique_buyers',
       'item_last_date', 'item_recency_days'],
      dtype='object')

In [20]:
rank_df.isna().mean().sort_values(ascending=False)

unique_items         0.080814
recency_days         0.080814
                       ...   
item_last_date       0.000000
item_recency_days    0.000000
Length: 17, dtype: float64

In [None]:
group_sizes = rank_df.groupby("customer_id").size().tolist()
group_sizes
save_pickle(group_sizes, os.path.join(RES_DIR, "group_sizes.pkl"))

### User-Item Interaction Features

In [None]:
# User–Item Interaction Features
ui_df = train_df[["customer_id", "article_id", "t_dat"]].copy()
g_ui = ui_df.groupby(["customer_id", "article_id"])

# How many times did the user buy this product
ui_cnt = g_ui.size()  

# When is the last time the user buy this product
ui_last_date = g_ui["t_dat"].max()
ui_recency_days = (VALID_START - ui_last_date).dt.days

ui_features = pd.DataFrame({
    "ui_cnt": ui_cnt,
    "ui_recency_days": ui_recency_days,
}).reset_index()

# Merge back into rank_df
rank_df = rank_df.merge(
    ui_features,
    on=["customer_id", "article_id"],
    how="left"
)

# Nan value
rank_df["ui_cnt"] = rank_df["ui_cnt"].fillna(0).astype(int)
rank_df["ui_recency_days"] = rank_df["ui_recency_days"].fillna(999).astype(int)

save_pickle(ui_features, os.path.join(RES_DIR, "ui_features.pkl"))

In [6]:
rank_df.head()
rank_df[["ui_cnt", "ui_recency_days"]].describe()

Unnamed: 0,ui_cnt,ui_recency_days
count,3.527563e+06,3.527563e+06
mean,6.881133e-01,6.398531e+02
...,...,...
75%,1.000000e+00,9.990000e+02
max,1.660000e+02,9.990000e+02


In [4]:
for col in rank_df.columns:
    print(col)

customer_id
article_id
label
tx_cnt
unique_items
last_date
recency_days
product_type_no
product_group_name
colour_group_code
department_no
index_group_no
garment_group_no
item_popularity
unique_buyers
item_last_date
item_recency_days
ui_cnt
ui_recency_days


In [5]:
rank_df = rank_df.drop(columns=["last_date", "item_last_date"], errors="ignore")

In [6]:
rank_df[["product_type_no"]].describe()

Unnamed: 0,product_type_no
count,3527563.0
mean,248.4634
std,62.87538
min,-1.0
25%,252.0
50%,259.0
75%,272.0
max,762.0


In [7]:
# Change string to int
codes, uniques = pd.factorize(rank_df["product_group_name"])
rank_df["product_group_id"] = codes
rank_df = rank_df.drop(columns=["product_group_name"])

In [8]:
rank_df.dtypes

customer_id           object
article_id             int64
label                  int64
tx_cnt               float64
unique_items         float64
recency_days         float64
product_type_no        int64
colour_group_code      int64
department_no          int64
index_group_no         int64
garment_group_no       int64
item_popularity        int64
unique_buyers          int64
item_recency_days      int64
ui_cnt                 int64
ui_recency_days        int64
product_group_id       int64
dtype: object

### Step 2: Prepare train and valid dataset for Ranker

In [5]:
rank_df = load_pickle(os.path.join(RES_DIR, "rank_df.pkl"))

In [6]:
drop_cols = ["customer_id", "article_id", "label"]

feature_cols = [c for c in rank_df.columns if c not in drop_cols]
print("Num features:", len(feature_cols))
print(feature_cols)

rank_df = rank_df.sort_values("customer_id").reset_index(drop=True)

Num features: 14
['tx_cnt', 'unique_items', 'recency_days', 'product_type_no', 'colour_group_code', 'department_no', 'index_group_no', 'garment_group_no', 'item_popularity', 'unique_buyers', 'item_recency_days', 'ui_cnt', 'ui_recency_days', 'product_group_id']


In [7]:
X_all = rank_df[feature_cols]
y_all = rank_df["label"].values

group_sizes_all = rank_df.groupby("customer_id").size().tolist()

print("X_all shape:", X_all.shape)
print("y_all shape:", y_all.shape)
print("num groups (customers):", len(group_sizes_all))


X_all shape: (3527563, 14)
y_all shape: (3527563,)
num groups (customers): 68984


In [8]:
# shuffle and 80% training and 20% valid
customers = rank_df["customer_id"].unique()
rng = np.random.default_rng(42)   # fix random seed
rng.shuffle(customers)

cut = int(len(customers) * 0.8)
train_customers = set(customers[:cut])
val_customers   = set(customers[cut:])

print(len(train_customers), len(val_customers))

# Mask and Split
train_mask = rank_df["customer_id"].isin(train_customers)
val_mask   = ~train_mask

X_train = X_all[train_mask]
y_train = y_all[train_mask]

X_val = X_all[val_mask]
y_val = y_all[val_mask]

print("Train shape:", X_train.shape, " Val shape:", X_val.shape)

group_train = rank_df[train_mask].groupby("customer_id").size().tolist()
group_val   = rank_df[val_mask].groupby("customer_id").size().tolist()

print(len(group_train), len(group_val))

55187 13797
Train shape: (2822015, 14)  Val shape: (705548, 14)
55187 13797


### Step 3: Train LightGBM Ranker

In [16]:
def apk(actual_set, predicted_list, k=12):
    if not actual_set:
        return 0.0
    score = 0.0
    hit = 0
    for i, p in enumerate(predicted_list[:k], start=1):
        if p in actual_set:
            hit += 1
            score += hit / i
    return score / min(len(actual_set), k)

def mapk_from_scored_df(df, k=12):
    ap_list = []
    for cust, g in df.groupby("customer_id"):
        actual = set(g.loc[g["label"] == 1, "article_id"].tolist())
        pred = g.sort_values("score", ascending=False)["article_id"].tolist()
        ap_list.append(apk(actual, pred, k=k))
    return sum(ap_list) / len(ap_list)

In [17]:
import json

EXP_DIR = "../experiments"
CFG_DIR = os.path.join(EXP_DIR, "configs")
MODEL_DIR = "../models"
os.makedirs(EXP_DIR, exist_ok=True)
os.makedirs(CFG_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

RESULTS_CSV = os.path.join(EXP_DIR, "exp_results.csv")

def append_result_row(row: dict, csv_path: str = RESULTS_CSV):
    df_row = pd.DataFrame([row])
    if os.path.exists(csv_path):
        df_row.to_csv(csv_path, mode="a", header=False, index=False)
    else:
        df_row.to_csv(csv_path, mode="w", header=True, index=False)

def save_json(obj: dict, path: str, overwrite: bool = False):
    if (not overwrite) and os.path.exists(path):
        return
    with open(path, "w") as f:
        json.dump(obj, f, indent=2, default=str)

In [26]:
import lightgbm as lgb
import time

def run_experiment(
    exp_id: str,
    exp_name: str,
    recall_name: str,
    lgb_params: dict,
    X_train, y_train, group_train,
    X_val, y_val, group_val,
    rank_df_val_for_eval: pd.DataFrame = None,  # optional: df with customer_id/article_id/label for val rows
    save_model: bool = True,
):
    """
    Train LGBMRanker + evaluate + save config/model + append results to CSV.
    exp_id: e.g. "E01"
    exp_name: e.g. "baseline"
    recall_name: e.g. "history+recent_pop+category+copurchase"
    lgb_params: parameters for lgb.LGBMRanker(...)
    rank_df_val_for_eval: if provided, will compute manual MAP@12 using predicted scores
    """
    start = time.time()

    # --- train ---
    ranker = lgb.LGBMRanker(**lgb_params)
    ranker.fit(
        X_train, y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        eval_at=lgb_params.get("eval_at", [12]),
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            lgb.log_evaluation(period=50)
        ]
    )

    train_time_sec = time.time() - start
    best_iter = getattr(ranker, "best_iteration_", None)

    # --- evaluate ---
    val_scores = ranker.predict(X_val, num_iteration=best_iter)
    manual_map12 = None
    if rank_df_val_for_eval is not None:
        tmp = rank_df_val_for_eval.copy()
        tmp["score"] = val_scores
        manual_map12 = mapk_from_scored_df(tmp, k=12)

    # --- save config ---
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    cfg = {
        "exp_id": exp_id,
        "exp_name": exp_name,
        "recall_name": recall_name,
        "timestamp": timestamp,
        "lgb_params": lgb_params,
        "num_train_rows": int(X_train.shape[0]),
        "num_val_rows": int(X_val.shape[0]),
        "num_features": int(X_train.shape[1]),
        "feature_cols": list(X_train.columns),
    }
    cfg_path = os.path.join(CFG_DIR, f"{exp_id}_{exp_name}.json")
    save_json(cfg, cfg_path, overwrite=True)

    # --- save model ---
    model_path = None
    if save_model:
        model_path = os.path.join(MODEL_DIR, f"{exp_id}_{exp_name}.pkl")
        save_pickle(ranker, model_path, overwrite=True)

    # --- append results row ---
    row = {
        "exp_id": exp_id,
        "exp_name": exp_name,
        "recall_name": recall_name,
        "timestamp": timestamp,
        "best_iteration": best_iter,
        "train_time_sec": round(train_time_sec, 2),
        "val_map12_manual": manual_map12,
        "num_train_rows": int(X_train.shape[0]),
        "num_val_rows": int(X_val.shape[0]),
        "num_features": int(X_train.shape[1]),
        "model_path": model_path,
        "config_path": cfg_path,
    }
    append_result_row(row)

    return row


In [19]:
rank_df_val_for_eval = rank_df[val_mask][["customer_id", "article_id", "label"]].copy()
rank_df_val_for_eval = rank_df_val_for_eval.sort_values("customer_id").reset_index(drop=True)

In [20]:
# 檢查同一筆是否對齊
print(rank_df_val_for_eval.iloc[0][["customer_id","article_id","label"]])
print(rank_df[val_mask].sort_values("customer_id").reset_index(drop=True).iloc[0][["customer_id","article_id","label"]])

customer_id    00039306476aaf41a07fed942884f16b30abfa83a2a8be...
article_id                                             636455004
label                                                          0
Name: 0, dtype: object
customer_id    00039306476aaf41a07fed942884f16b30abfa83a2a8be...
article_id                                             636455004
label                                                          0
Name: 0, dtype: object


In [30]:
params = dict(
    objective="lambdarank",
    metric="map",
    eval_at=[12],
    learning_rate=0.05,
    n_estimators=2000,
    num_leaves=63,
    min_data_in_leaf=50,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

row = run_experiment(
    exp_id="E01",
    exp_name="baseline",
    recall_name="final_merged_recall_v1",
    lgb_params=params,
    X_train=X_train, y_train=y_train, group_train=group_train,
    X_val=X_val, y_val=y_val, group_val=group_val,
    rank_df_val_for_eval=rank_df_val_for_eval
)






[LightGBM] [Info] Total groups: 55187, total data: 2822015
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2185
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 14
[LightGBM] [Info] Total groups: 13797, total data: 705548
Training until validation scores don't improve for 50 rounds
[50]	valid_0's map@12: 0.91918
Early stopping, best iteration is:
[41]	valid_0's map@12: 0.919414




In [31]:
row

{'exp_id': 'E01',
 'exp_name': 'baseline',
 'recall_name': 'final_merged_recall_v1',
 'timestamp': '2025-12-22 06:00:21',
 'best_iteration': 41,
 'train_time_sec': 288.41,
 'val_map12_manual': 0.004921578479235953,
 'num_train_rows': 2822015,
 'num_val_rows': 705548,
 'num_features': 14,
 'model_path': '../models/E01_baseline.pkl',
 'config_path': '../experiments/configs/E01_baseline.json'}

In [32]:
pd.read_csv("../experiments/exp_results.csv").tail(5)

Unnamed: 0,exp_id,exp_name,recall_name,timestamp,best_iteration,train_time_sec,val_map12_manual,num_train_rows,num_val_rows,num_features,model_path,config_path
0,E01,baseline,final_merged_recall_v1,2025-12-22 06:00:21,41,288.41,0.004922,2822015,705548,14,../models/E01_baseline.pkl,../experiments/configs/E01_baseline.json
