In [1]:
import os
import json
import time
from datetime import datetime
import joblib
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from dataclasses import dataclass, asdict

# Notebook-wide configuration: display options, random seed, and project paths.
# Every name defined here is a notebook global that later cells read.

# Make wide DataFrames display nicely in cell output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# Single seed reused for the customer split and the LightGBM parameters below.
SEED = 42
np.random.seed(SEED)

# Paths (adjust these to match your own project layout).
DATA_DIR = "../data"
RANKING_DIR = os.path.join(DATA_DIR, "ranking")
EXP_DIR = "../experiments"
MODEL_DIR = "../models"
CFG_DIR = os.path.join(EXP_DIR, "configs")

# Create the output directories up front so later writes do not fail.
# RANKING_DIR is only read from in the visible cells, so it is not created here.
os.makedirs(EXP_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(CFG_DIR, exist_ok=True)

# CSV that accumulates one row per experiment run.
RESULTS_CSV = os.path.join(EXP_DIR, "exp_results.csv")

# NOTE(review): VALID_START is not referenced by any cell visible here —
# presumably the start date of the validation window; confirm before relying on it.
VALID_START = pd.to_datetime("2020-09-16")

### Step 1: Prepare the training and validation datasets for the ranker

In [2]:
# Pickle helpers for saving and reloading intermediate artifacts (e.g. recall results)
def save_pickle(obj, path, overwrite=False):
    """Pickle ``obj`` to ``path`` unless a file is already there.

    Args:
        obj: Any picklable Python object.
        path: Destination file path.
        overwrite: When true, replace an existing file; otherwise an existing
            file is left untouched and a "[Skip]" notice is printed.

    Returns:
        None.
    """
    if overwrite or not os.path.exists(path):
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)
    else:
        print(f"[Skip] {path} already exists.")

def load_pickle(path):
    """Read and return the object pickled at ``path``.

    Unpickling can execute arbitrary code, so only use this on files that
    this project produced itself.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload

In [3]:
# Load the candidate table used to train the ranker.
# The code below relies on it having `customer_id`, `article_id` and `label` columns.
rank_df = load_pickle(os.path.join(RANKING_DIR, "rank_df.pkl"))

# Identifier and target columns that must not be fed to the model.
drop_cols = ["customer_id", "article_id", "label"]

# Every remaining column is treated as a model feature.
feature_cols = [c for c in rank_df.columns if c not in drop_cols]
print("Num features:", len(feature_cols))
print(feature_cols)

# Keep each customer's rows contiguous; the ranker's group sizes are derived per customer later.
rank_df = rank_df.sort_values("customer_id").reset_index(drop=True)

Num features: 14
['tx_cnt', 'unique_items', 'recency_days', 'product_type_no', 'colour_group_code', 'department_no', 'index_group_no', 'garment_group_no', 'item_popularity', 'unique_buyers', 'item_recency_days', 'ui_cnt', 'ui_recency_days', 'product_group_id']


In [4]:
# Customer-level 80/20 split: all rows of a customer land in exactly one side,
# so a ranking group is never divided between train and validation.
customers = rank_df["customer_id"].unique()
rng = np.random.default_rng(SEED)
rng.shuffle(customers)  # in-place shuffle, reproducible through SEED

cut = int(len(customers) * 0.8)
train_customers = set(customers[:cut])

# Validation is defined as "every customer not in the training set".
# Both frames are sorted by customer_id so each customer's rows are contiguous.
train_df = rank_df[rank_df["customer_id"].isin(train_customers)].sort_values("customer_id").reset_index(drop=True)
val_df   = rank_df[~rank_df["customer_id"].isin(train_customers)].sort_values("customer_id").reset_index(drop=True)

X_train = train_df[feature_cols]
y_train = train_df["label"].values
# Group sizes (rows per customer) for the ranker. groupby yields them in sorted
# key order, which lines up with the row order because train_df is sorted by customer_id.
group_train = train_df.groupby("customer_id").size().tolist()

X_val = val_df[feature_cols]
y_val = val_df["label"].values
group_val = val_df.groupby("customer_id").size().tolist()

# Row-aligned copy of the validation ids/labels; prediction scores are attached
# to it positionally when MAP@12 is computed.
rank_df_val_for_eval = val_df[["customer_id", "article_id", "label"]].copy()

# Sanity check: the group sizes must account for every row.
assert sum(group_train) == len(X_train)
assert sum(group_val) == len(X_val)


# X_all = rank_df[feature_cols]
# y_all = rank_df["label"].values

# group_sizes_all = rank_df.groupby("customer_id").size().tolist()

# print("X_all shape:", X_all.shape)
# print("y_all shape:", y_all.shape)
# print("num groups (customers):", len(group_sizes_all))

# # shuffle and 80% training and 20% valid
# customers = rank_df["customer_id"].unique()
# rng = np.random.default_rng(42)   # fix random seed
# rng.shuffle(customers)

# cut = int(len(customers) * 0.8)
# train_customers = set(customers[:cut])
# val_customers   = set(customers[cut:])

# print(len(train_customers), len(val_customers))

# # Mask and Split
# train_mask = rank_df["customer_id"].isin(train_customers)
# val_mask   = ~train_mask

# X_train = X_all[train_mask]
# y_train = y_all[train_mask]

# X_val = X_all[val_mask]
# y_val = y_all[val_mask]

# print("Train shape:", X_train.shape, " Val shape:", X_val.shape)

# group_train = rank_df[train_mask].groupby("customer_id").size().tolist()
# group_val   = rank_df[val_mask].groupby("customer_id").size().tolist()

# print(len(group_train), len(group_val))

### Step 2: Run Experiments

In [5]:
def apk(actual_set, predicted_list, k=12):
    """Average precision at ``k`` for a single ranked prediction list.

    Args:
        actual_set: Collection of relevant items (membership-tested with ``in``).
        predicted_list: Items ordered from best to worst; only the first ``k``
            are considered. Items must be hashable.
        k: Cut-off rank.

    Returns:
        The sum of precision-at-hit values divided by ``min(len(actual_set), k)``,
        or 0.0 when ``actual_set`` is empty. A repeated prediction is only
        counted at its first position, but it still occupies a rank.
    """
    if not actual_set:
        return 0.0

    # Map each distinct prediction to the rank of its first appearance.
    # Dicts keep insertion order, so iterating below walks ranks in ascending order.
    first_rank = {}
    for rank, item in enumerate(predicted_list[:k], start=1):
        first_rank.setdefault(item, rank)

    hits = 0
    precision_sum = 0.0
    for item, rank in first_rank.items():
        if item in actual_set:
            hits += 1
            precision_sum += hits / rank

    return precision_sum / min(len(actual_set), k)


def mapk_from_scored_df(df, k=12):
    """Mean average precision at ``k`` over all customers in a scored frame.

    Args:
        df: DataFrame with columns ``customer_id``, ``article_id``, ``label``
            and ``score``. Rows with ``label == 1`` are the relevant articles.
        k: Cut-off rank passed through to ``apk``.

    Returns:
        The unweighted mean of each customer's AP@k, with candidates ranked by
        descending ``score``. Customers without any positive row contribute
        0.0 to the mean. Returns 0.0 when ``df`` has no rows, instead of
        raising ZeroDivisionError.
    """
    ap_list = []
    for _, g in df.groupby("customer_id"):
        actual = set(g.loc[g["label"] == 1, "article_id"].tolist())
        pred = g.sort_values("score", ascending=False)["article_id"].tolist()
        ap_list.append(apk(actual, pred, k=k))

    # An empty frame produces no groups; avoid dividing by zero.
    if not ap_list:
        return 0.0
    return sum(ap_list) / len(ap_list)

In [6]:
def append_result_row(row: dict, csv_path: str = RESULTS_CSV):
    """Append one experiment result to the results CSV, keeping columns aligned.

    Args:
        row: Mapping of column name to value for a single experiment.
        csv_path: Path of the results CSV; created if missing.

    Behaviour:
        * Missing or zero-byte file: the row is written together with a header.
        * Existing file whose header already covers every key of ``row``: the
          row is reordered to the header's column order and appended; columns
          absent from ``row`` are left blank.
        * ``row`` introduces columns the file does not have yet: the file is
          rewritten with the widened header (old rows get blanks in the new
          columns) and the row is appended.

    Appending blindly with ``header=False`` would put values under the wrong
    headers whenever the key set or key order of ``row`` differs from the file.
    """
    df_row = pd.DataFrame([row])

    has_content = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0
    if not has_content:
        df_row.to_csv(csv_path, mode="w", header=True, index=False)
        return

    existing_cols = list(pd.read_csv(csv_path, nrows=0).columns)
    new_cols = [c for c in df_row.columns if c not in existing_cols]

    if new_cols:
        # Widen the stored table first. Reading everything as text (and keeping
        # empty strings instead of NaN) leaves previously logged values unchanged.
        all_cols = existing_cols + new_cols
        existing = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        existing.reindex(columns=all_cols).to_csv(csv_path, mode="w", header=True, index=False)
    else:
        all_cols = existing_cols

    df_row.reindex(columns=all_cols).to_csv(csv_path, mode="a", header=False, index=False)

def save_json(obj: dict, path: str, overwrite: bool = False):
    """Write ``obj`` to ``path`` as indented JSON.

    Args:
        obj: Data to serialise; values the json module cannot encode are
            converted with ``str``.
        path: Destination file path.
        overwrite: When false, an existing file is left untouched and the
            call returns without writing.

    Returns:
        None.
    """
    if overwrite or not os.path.exists(path):
        with open(path, "w") as sink:
            json.dump(obj, sink, default=str, indent=2)

def candidate_stats(df):
    """Summarise candidate counts per customer and the overall positive rate.

    Args:
        df: DataFrame with at least ``customer_id`` and ``label`` columns,
            one row per (customer, candidate).

    Returns:
        Dict with the mean, median, min and max number of rows per customer
        (``val_avg_candidates``, ``val_med_candidates``, ``val_min_candidates``,
        ``val_max_candidates``) and the mean of ``label`` (``val_pos_rate``).
    """
    sizes = df.groupby("customer_id").size()

    # (output key, result type, Series reduction) for each per-customer size summary.
    size_summaries = (
        ("val_avg_candidates", float, "mean"),
        ("val_med_candidates", float, "median"),
        ("val_min_candidates", int, "min"),
        ("val_max_candidates", int, "max"),
    )
    stats = {key: cast(getattr(sizes, reducer)()) for key, cast, reducer in size_summaries}
    stats["val_pos_rate"] = float(df["label"].mean())
    return stats

In [7]:
def run_experiment(
    exp_id: str,
    exp_name: str,
    recall_name: str,
    lgb_params: dict,
    X_train, y_train, group_train,
    X_val, y_val, group_val,
    rank_df_val_for_eval: pd.DataFrame = None,  # optional: customer_id/article_id/label, row-aligned with X_val
    save_model: bool = True,
):
    """Train an LGBMRanker, evaluate it, and record the run.

    Steps, in order: fit with early stopping on the validation set, score the
    validation rows, optionally compute MAP@12 and candidate statistics, write
    the run config as JSON, optionally dump the fitted model, and append a
    summary row to the results CSV.

    Args:
        exp_id: Short experiment identifier, e.g. "E01".
        exp_name: Human-readable name, e.g. "baseline". Together with
            ``exp_id`` it forms the config and model file names.
        recall_name: Label of the recall/candidate strategy, e.g.
            "history+recent_pop+category+copurchase"; stored as metadata only.
        lgb_params: Keyword arguments for ``lgb.LGBMRanker(...)``. Its
            ``eval_at`` entry (default ``[12]``) is also passed to ``fit``.
        X_train, y_train, group_train: Training features, labels and group sizes.
            ``X_train`` must expose ``.shape`` and ``.columns``.
        X_val, y_val, group_val: Validation counterparts, used for early stopping.
        rank_df_val_for_eval: If given, a frame with ``customer_id``,
            ``article_id`` and ``label`` whose rows line up with ``X_val``;
            prediction scores are attached to a copy of it to compute the
            manual MAP@12 and the candidate statistics.
        save_model: When true, the fitted ranker is dumped with joblib into
            ``MODEL_DIR``.

    Returns:
        The dict that was appended to the results CSV.
    """
    t0 = time.time()

    # --- train ---
    model = lgb.LGBMRanker(**lgb_params)
    stop_cb = lgb.early_stopping(stopping_rounds=50, verbose=True)
    log_cb = lgb.log_evaluation(period=50)
    model.fit(
        X_train, y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        eval_at=lgb_params.get("eval_at", [12]),
        callbacks=[stop_cb, log_cb],
    )
    elapsed_sec = time.time() - t0

    # --- evaluate ---
    # Predict with the early-stopped iteration when the model reports one.
    best_iter = getattr(model, "best_iteration_", None)
    predict_opts = {} if best_iter is None else {"num_iteration": best_iter}
    val_scores = model.predict(X_val, **predict_opts)

    if rank_df_val_for_eval is None:
        manual_map12 = None
        stats = {}
    else:
        scored = rank_df_val_for_eval.copy()
        scored["score"] = val_scores
        manual_map12 = mapk_from_scored_df(scored, k=12)
        stats = candidate_stats(scored)

    # Fields shared by the config file and the results row.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    identity = {
        "exp_id": exp_id,
        "exp_name": exp_name,
        "recall_name": recall_name,
        "timestamp": timestamp,
    }
    sizes = {
        "num_train_rows": int(X_train.shape[0]),
        "num_val_rows": int(X_val.shape[0]),
        "num_features": int(X_train.shape[1]),
    }
    file_stem = f"{exp_id}_{exp_name}"

    # --- save config (always overwritten for the same id/name) ---
    cfg_path = os.path.join(CFG_DIR, f"{file_stem}.json")
    cfg = {
        **identity,
        "lgb_params": lgb_params,
        **sizes,
        "feature_cols": list(X_train.columns),
    }
    save_json(cfg, cfg_path, overwrite=True)

    # --- save model ---
    if save_model:
        model_path = os.path.join(MODEL_DIR, f"{file_stem}.pkl")
        joblib.dump(model, model_path)
    else:
        model_path = None

    # --- append results row (key order defines the CSV column order) ---
    row = {
        **identity,
        "best_iteration": best_iter,
        "train_time_sec": round(elapsed_sec, 2),
        "val_map12_manual": manual_map12,
        **sizes,
        "model_path": model_path,
        "config_path": cfg_path,
        **stats,
    }
    append_result_row(row)

    return row


In [8]:
# Full feature list, written out by hand.
# It should stay in sync with the feature columns derived from rank_df in the loading cell.
F_ALL = [
    'tx_cnt', 'unique_items', 'recency_days',
    'product_type_no', 'colour_group_code', 'department_no', 'index_group_no', 'garment_group_no', 'product_group_id',
    'item_popularity', 'unique_buyers', 'item_recency_days',
    'ui_cnt', 'ui_recency_days'
]

# Feature families used for ablations. The grouping follows the column names —
# presumably user-level, item-attribute, item-popularity and user-item interaction
# features; verify against the feature-engineering notebook.
F_USER = ['tx_cnt', 'unique_items', 'recency_days']
F_ITEM_ATTR = ['product_type_no', 'product_group_id', 'colour_group_code', 'department_no', 'index_group_no', 'garment_group_no']
F_ITEM_POP = ['item_popularity', 'unique_buyers', 'item_recency_days']
F_UI = ['ui_cnt', 'ui_recency_days']

# Lookup from feature-set name to column list; the experiment table refers to these keys.
FEATURE_SETS = {
    "all": F_ALL,
    "user_only": F_USER,
    "item_attr_only": F_ITEM_ATTR,
    "item_pop_only": F_ITEM_POP,
    "ui_only": F_UI,
    "user+ui": F_USER + F_UI,
    "item_attr+pop": F_ITEM_ATTR + F_ITEM_POP,
    "all_minus_ui": [c for c in F_ALL if c not in set(F_UI)],
    "all_minus_pop": [c for c in F_ALL if c not in set(F_ITEM_POP)],
}
# Display the number of columns in each feature set as a quick sanity check.
{ k: len(v) for k,v in FEATURE_SETS.items() }

{'all': 14,
 'user_only': 3,
 'item_attr_only': 6,
 'item_pop_only': 3,
 'ui_only': 2,
 'user+ui': 5,
 'item_attr+pop': 9,
 'all_minus_ui': 12,
 'all_minus_pop': 11}

In [9]:
# Default LGBMRanker parameters shared by every experiment; individual
# experiments override single entries through the dict in their EXPS tuple.
BASE_PARAMS = dict(
    objective="lambdarank",
    metric="ndcg",
    eval_at=[12],
    learning_rate=0.05,
    n_estimators=2000,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    reg_lambda=1.0,
    random_state=SEED,
    bagging_seed=SEED,
    feature_fraction_seed=SEED,
    data_random_seed=SEED,
    n_jobs=-1
)

# Experiment table: (exp_id, exp_name, FEATURE_SETS key, parameter overrides).
# E01-E09 vary only the feature set; E10-E12 keep all features and vary parameters.
EXPS = [
    ("E01", "baseline_all",          "all",            {}),
    ("E02", "user_only",             "user_only",       {}),
    ("E03", "item_attr_only",        "item_attr_only",  {}),
    ("E04", "item_pop_only",         "item_pop_only",   {}),
    ("E05", "ui_only",               "ui_only",         {}),
    ("E06", "user_plus_ui",          "user+ui",         {}),
    ("E07", "item_attr_plus_pop",    "item_attr+pop",   {}),
    ("E08", "all_minus_ui",          "all_minus_ui",    {}),
    ("E09", "all_minus_pop",         "all_minus_pop",   {}),

    ("E10", "stronger_reg",          "all", {"min_data_in_leaf":200, "reg_lambda":5.0, "reg_alpha":1.0}),
    ("E11", "lr_0.03_more_trees",    "all", {"learning_rate":0.03, "n_estimators":4000}),
    ("E12", "extra_trees",           "all", {"extra_trees":True}),
]

# Run every experiment in order. Each call trains a model, writes its config
# (and model file), and appends one row to the results CSV, so re-running this
# cell appends another set of rows to that CSV.
rows = []
for exp_id, exp_name, feat_set_name, override in EXPS:
    feats = FEATURE_SETS[feat_set_name]
    # Entries in `override` take precedence over BASE_PARAMS.
    params = {**BASE_PARAMS, **override}

    # Select feature columns only; rows, labels and group sizes are shared across experiments.
    X_train_fs = train_df[feats]
    X_val_fs   = val_df[feats]

    row = run_experiment(
        exp_id=exp_id,
        exp_name=exp_name,
        recall_name="final_merged_recall_v1",
        lgb_params=params,
        X_train=X_train_fs, y_train=y_train, group_train=group_train,
        X_val=X_val_fs, y_val=y_val, group_val=group_val,
        rank_df_val_for_eval=rank_df_val_for_eval
    )
    rows.append(row)
    print(exp_id, exp_name, feat_set_name, row["val_map12_manual"])




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2185
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 14
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.928125
[100]	valid_0's ndcg@12: 0.928376
Early stopping, best iteration is:
[51]	valid_0's ndcg@12: 0.92848




E01 baseline_all all 0.03691019111445646




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 3
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.893668
Early stopping, best iteration is:
[1]	valid_0's ndcg@12: 0.893668




E02 user_only user_only 0.0065428509216108035




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 6
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.908278
[100]	valid_0's ndcg@12: 0.908745
Early stopping, best iteration is:
[83]	valid_0's ndcg@12: 0.909141




E03 item_attr_only item_attr_only 0.01904076377974687




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 3
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.918156
[100]	valid_0's ndcg@12: 0.917268
[150]	valid_0's ndcg@12: 0.917091
[200]	valid_0's ndcg@12: 0.917122
Early stopping, best iteration is:
[189]	valid_0's ndcg@12: 0.919005




E04 item_pop_only item_pop_only 0.02709499311514235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 270
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 2




Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.917613
[100]	valid_0's ndcg@12: 0.917685
Early stopping, best iteration is:
[79]	valid_0's ndcg@12: 0.918162




E05 ui_only ui_only 0.027275898295503954




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1035
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 5
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.917478
Early stopping, best iteration is:
[22]	valid_0's ndcg@12: 0.918133




E06 user_plus_ui user+ui 0.027600650230932135




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1150
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 9
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.921746
Early stopping, best iteration is:
[37]	valid_0's ndcg@12: 0.922323




E07 item_attr_plus_pop item_attr+pop 0.030441786800868446




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1915
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 12
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.921995
[100]	valid_0's ndcg@12: 0.922466
[150]	valid_0's ndcg@12: 0.922683
Early stopping, best iteration is:
[122]	valid_0's ndcg@12: 0.923155




E08 all_minus_ui all_minus_ui 0.030972516942655313




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1421
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 11
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.923634
[100]	valid_0's ndcg@12: 0.923485
[150]	valid_0's ndcg@12: 0.923666
Early stopping, best iteration is:
[139]	valid_0's ndcg@12: 0.924323




E09 all_minus_pop all_minus_pop 0.0330972623971101




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2185
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 14
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.927521
[100]	valid_0's ndcg@12: 0.927837
[150]	valid_0's ndcg@12: 0.928269
[200]	valid_0's ndcg@12: 0.928258
Early stopping, best iteration is:
[197]	valid_0's ndcg@12: 0.928623




E10 stronger_reg all 0.03700724652308324




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2185
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 14
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.927931
Early stopping, best iteration is:
[20]	valid_0's ndcg@12: 0.928547




E11 lr_0.03_more_trees all 0.036904771056216895




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2185
[LightGBM] [Info] Number of data points in the train set: 2822015, number of used features: 14
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@12: 0.924573
[100]	valid_0's ndcg@12: 0.925341
[150]	valid_0's ndcg@12: 0.926228
[200]	valid_0's ndcg@12: 0.927195
[250]	valid_0's ndcg@12: 0.927153
Early stopping, best iteration is:
[238]	valid_0's ndcg@12: 0.927387




E12 extra_trees all 0.03607907892643696


In [None]:
# params = dict(
#     objective="lambdarank",
#     metric="ndcg",
#     eval_at=[12],
#     learning_rate=0.05,
#     n_estimators=2000,
#     num_leaves=63,
#     min_data_in_leaf=50,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     reg_lambda=1.0,
#     random_state=SEED,
#     bagging_seed=SEED,
#     feature_fraction_seed=SEED,
#     data_random_seed=SEED,
#     n_jobs=-1
# )

# row = run_experiment(
#     exp_id="E01",
#     exp_name="baseline",
#     recall_name="final_merged_recall_v1",
#     lgb_params=params,
#     X_train=X_train, y_train=y_train, group_train=group_train,
#     X_val=X_val, y_val=y_val, group_val=group_val,
#     rank_df_val_for_eval=rank_df_val_for_eval
# )


In [None]:
# Display the result dict currently bound to `row`. Among the visible cells it
# is last assigned inside the experiment loop, so after a full run this shows
# the final experiment's row.
row

In [10]:
# Show the five most recent rows of the experiment log. Read it through
# RESULTS_CSV (the path the experiment runner appends to) rather than a second
# hard-coded copy of the path, so the two cannot drift apart.
pd.read_csv(RESULTS_CSV).tail(5)

Unnamed: 0,exp_id,exp_name,recall_name,timestamp,best_iteration,train_time_sec,val_map12_manual,num_train_rows,num_val_rows,num_features,model_path,config_path,val_avg_candidates,val_med_candidates,val_min_candidates,val_max_candidates,val_pos_rate
6,E08,all_minus_ui,final_merged_recall_v1,2025-12-30 23:01:46,122,11.11,0.030973,2822015,705548,12,../models/E08_all_minus_ui.pkl,../experiments/configs/E08_all_minus_ui.json,51.137784,51.0,51,57,0.002694
7,E09,all_minus_pop,final_merged_recall_v1,2025-12-30 23:02:03,139,10.18,0.033097,2822015,705548,11,../models/E09_all_minus_pop.pkl,../experiments/configs/E09_all_minus_pop.json,51.137784,51.0,51,57,0.002694
8,E10,stronger_reg,final_merged_recall_v1,2025-12-30 23:02:24,197,14.43,0.037007,2822015,705548,14,../models/E10_stronger_reg.pkl,../experiments/configs/E10_stronger_reg.json,51.137784,51.0,51,57,0.002694
9,E11,lr_0.03_more_trees,final_merged_recall_v1,2025-12-30 23:02:36,20,5.03,0.036905,2822015,705548,14,../models/E11_lr_0.03_more_trees.pkl,../experiments/configs/E11_lr_0.03_more_trees....,51.137784,51.0,51,57,0.002694
10,E12,extra_trees,final_merged_recall_v1,2025-12-30 23:02:59,238,16.93,0.036079,2822015,705548,14,../models/E12_extra_trees.pkl,../experiments/configs/E12_extra_trees.json,51.137784,51.0,51,57,0.002694
