In [1]:
# # 03 — Обучение LightGBM реранкера (lambdarank, ndcg@10)

In [2]:
import os
import json
import polars as pl
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from typing import List

# Fixed seed; also seeds NumPy's legacy global random generator.
SEED = 42
np.random.seed(SEED)

In [4]:
# ----- Paths -----
# BASE_DIR is the parent of the current working directory; all data files are
# resolved inside its "Data" subdirectory.

BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")
TRAIN_PATH = os.path.join(DATA_DIR, "train-dset.parquet")
TEST_PATH  = os.path.join(DATA_DIR, "test-dset-small.parquet")

TRAIN_COS = os.path.join(DATA_DIR, "train_cos.parquet")
TEST_COS  = os.path.join(DATA_DIR, "test_cos.parquet")

# Feature tables read by this notebook.
TRAIN_FEATS_PATH = os.path.join(DATA_DIR, "train_feats.parquet")
TEST_FEATS_PATH  = os.path.join(DATA_DIR, "test_feats.parquet")

# Output locations (relative to the current working directory).
SUBMISSION_PATH  = "../solution.csv"
MODEL_TXT_PATH   = "lgbm_ranker.txt"
MODEL_PKL_PATH   = "lgbm_ranker.pkl"

In [5]:
# ## 1) Loading and basic checks

# Fail early if either feature table is missing on disk.
assert os.path.exists(TRAIN_FEATS_PATH), f"Не найден {TRAIN_FEATS_PATH}"
assert os.path.exists(TEST_FEATS_PATH), f"Не найден {TEST_FEATS_PATH}"

In [6]:
# Read the train and test feature tables from parquet into Polars frames.
train_feats = pl.read_parquet(TRAIN_FEATS_PATH)
test_feats  = pl.read_parquet(TEST_FEATS_PATH)

In [7]:
# Report (rows, columns) of both tables as a quick sanity check.
print("[load] train_feats:", train_feats.shape)
print("[load] test_feats :", test_feats.shape)

[load] train_feats: (7781790, 60)
[load] test_feats : (335348, 59)


In [8]:
# ## 2) Feature set and column definitions

# Ordered list of model input columns; the order defines the column order of
# the feature matrices and of the monotone-constraint vector built from it.
FEATURES = [
    # embeddings
    "cos_q_title",

    # text features
    "query_len", "title_len", "abs_len_diff",
    "overlap_q_title", "jaccard_q_title", "dice_q_title", "ratio_overlap_title",
    "title_contains_query", "has_query_text", "has_title",

    # categories / locations
    "same_cat", "same_mcat", "same_loc", "same_cat_loc", "triple_match",
    "freq_item_cat", "freq_item_mcat", "freq_item_loc", "is_query_mcat_missing",

    # price
    "price_clip", "price_log1p", "price_rank_in_query", "price_z_in_query", "price_is_zero", "price_vs_median_query",

    # behavioural
    "conv_known", "conv_filled", "conv_z_in_query", "conv_rank_in_query",

    # per-query group features
    "n_items_in_query", "max_cos_in_query", "cos_minus_max",

    # interactions
    "same_cat__cos", "same_loc__cos", "conv__cos", "conv__same_loc", "price_rank__same_loc",
]


In [9]:
# Label column used as the ranking target.
TARGET = "item_contact"
# Column whose values define the ranking groups (one group per query).
GROUP  = "query_id"
# Identifier columns that must be present in the test frame.
ID_COLS = ["query_id","item_id"]

In [10]:
# Check that every feature, the target and the group column exist in train_feats.
missing_cols = [c for c in FEATURES + [TARGET, GROUP] if c not in train_feats.columns]
assert not missing_cols, f"В train_feats нет колонок: {missing_cols}"

In [11]:
# Same check for the test frame: all features plus the identifier columns.
missing_test = [c for c in FEATURES + ID_COLS if c not in test_feats.columns]
assert not missing_test, f"В test_feats нет колонок: {missing_test}"

In [14]:
def sanitize_features(df: pl.DataFrame, features: list[str]) -> pl.DataFrame:
    """Return `df` with the listed columns cleaned of nulls and non-finite values.

    Columns named in `features` that are absent from `df` are skipped. For the
    remaining ones the replacement depends on the column dtype:

    * Float32 / Float64: values that are not finite become 0.0, nulls are
      filled with 0.0, and the result is cast to Float32.
    * Boolean: cast to Int8, nulls filled with 0.
    * integer dtypes: nulls filled with 0, dtype kept.
    * anything else: cast to Float32, nulls filled with 0.0.

    Args:
        df: input frame.
        features: names of the columns to clean.

    Returns:
        A frame in which each processed column is replaced under its own name.
    """

    def _cleaned(name, dtype):
        # Build the replacement expression for one column.
        column = pl.col(name)
        if dtype in (pl.Float32, pl.Float64):
            finite_or_zero = pl.when(column.is_finite()).then(column).otherwise(0.0)
            return finite_or_zero.fill_null(0.0).cast(pl.Float32).alias(name)
        if dtype == pl.Boolean:
            return column.cast(pl.Int8).fill_null(0).alias(name)
        if dtype.is_integer():
            return column.fill_null(0).alias(name)
        return column.cast(pl.Float32).fill_null(0.0).alias(name)

    present = [name for name in features if name in df.columns]
    replacements = [_cleaned(name, df.schema[name]) for name in present]
    return df.with_columns(replacements)

In [16]:
def prepare_matrix(df: pl.DataFrame, features: List[str]) -> np.ndarray:
    """Build a NumPy matrix from the `features` columns of `df`.

    Each selected column is cast to Float32 and nulls are filled with 0.0
    before conversion; columns appear in the order given by `features`.
    """
    as_float32 = [pl.col(name).cast(pl.Float32) for name in features]
    dense = df.select(as_float32).fill_null(0.0)
    return dense.to_numpy()

In [17]:
# Sanitize train/test.
# NOTE(review): the target, group and id columns are passed through
# sanitize_features as well; a column that is neither float, boolean nor
# integer would be cast to Float32 there — confirm query_id / item_id are integer.
train_feats = sanitize_features(train_feats, FEATURES + [TARGET, GROUP])
test_feats  = sanitize_features(test_feats,  FEATURES + ID_COLS)

In [18]:
# Build the feature matrices and the target / group vectors.
y = train_feats[TARGET].to_numpy()
groups_vec = train_feats["query_id"].to_numpy()  # per-row query id, aligned with X and y
X = prepare_matrix(train_feats, FEATURES)
X_test = prepare_matrix(test_feats, FEATURES)

In [19]:
# Report the shapes of the assembled arrays.
print("[matrix] X:", X.shape, "y:", y.shape, "X_test:", X_test.shape)

[matrix] X: (7781790, 38) y: (7781790,) X_test: (335348, 38)


In [21]:
# ## 4) Hold-out split by query_id and training (fast)

# numpy and lightgbm are re-imported here (already imported in the first cell);
# tqdm and CallbackEnv are first imported in this cell.
import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from lightgbm.callback import CallbackEnv

VAL_FRACTION = 0.1   # 10% of queries go to validation
SEED = 42
USE_GPU = True

# Dedicated generator used to shuffle the query ids for the split.
rng = np.random.RandomState(SEED)

In [22]:
# --- 4.1 Split by unique query_id ---
# Whole queries go to either train or validation, so no query is shared
# between the two subsets.
uniq_q = np.unique(groups_vec)
rng.shuffle(uniq_q)
n_val = max(1, int(len(uniq_q) * VAL_FRACTION))  # at least one validation query
val_q = set(uniq_q[:n_val])

# Vectorised membership test: same boolean mask as checking every row against
# `val_q` in a Python loop, without iterating over the rows in Python.
val_mask = np.isin(groups_vec, uniq_q[:n_val])
tr_mask  = ~val_mask


In [23]:
# --- 4.2 Order each subset by query_id (important for group sizes) ---
tr_idx = np.where(tr_mask)[0]
va_idx = np.where(val_mask)[0]
# Stable sort (mergesort): rows of the same query become contiguous while
# keeping their original relative order.
tr_idx = tr_idx[np.argsort(groups_vec[tr_idx], kind="mergesort")]
va_idx = va_idx[np.argsort(groups_vec[va_idx], kind="mergesort")]

X_tr, y_tr, q_tr = X[tr_idx], y[tr_idx], groups_vec[tr_idx]
X_va, y_va, q_va = X[va_idx], y[va_idx], groups_vec[va_idx]


In [24]:
def group_sizes_from_sorted_ids(ids: np.ndarray) -> np.ndarray:
    """Return the number of rows per distinct id.

    The counts come back in ascending order of the id values; for `ids` that
    are already sorted by query_id this is also the order in which the groups
    appear in the array.
    """
    return np.unique(ids, return_counts=True)[1]

# Per-query row counts for the (already query-sorted) train and validation subsets.
tr_groups = group_sizes_from_sorted_ids(q_tr)
va_groups = group_sizes_from_sorted_ids(q_va)


In [25]:
# Summary of the split: number of queries and number of rows on each side.
print(f"[holdout] queries: total={len(uniq_q)}, train={len(uniq_q)-n_val}, val={n_val}")
print(f"[holdout] rows: train={X_tr.shape[0]}, val={X_va.shape[0]}")

[holdout] queries: total=678190, train=610371, val=67819
[holdout] rows: train=7001690, val=780100


In [26]:
# --- 4.3 Progress bar over boosting rounds ---
class TQDMCallback:
    """LightGBM training callback that shows a tqdm progress bar.

    The bar is created lazily on the first call, advanced by one on every
    call, and closed once the last scheduled iteration has been reported.

    Args:
        total_rounds: fallback number of rounds, used only when the callback
            environment does not expose `end_iteration`.
        desc: label shown next to the bar.
    """

    def __init__(self, total_rounds: int, desc: str = "boosting"):
        self.total_rounds = total_rounds
        self.desc = desc
        self.pbar = None

    def close(self) -> None:
        """Close the bar if one is open (safe to call more than once)."""
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None

    def __call__(self, env: CallbackEnv):
        begin_it = getattr(env, "begin_iteration", 0)
        end_it = getattr(env, "end_iteration", self.total_rounds)
        if env.iteration == begin_it:
            # First iteration of a run: drop a bar left open by a previous run
            # that ended early (e.g. through early stopping) so it is not reused.
            self.close()
        if self.pbar is None:
            self.pbar = tqdm(total=end_it - begin_it, desc=self.desc, leave=False)
        self.pbar.update(1)
        # Close only at the real end of the run. Comparing against
        # `total_rounds` as well would close and recreate the bar on every
        # iteration whenever it differs from the run's actual end iteration.
        if env.iteration + 1 >= end_it:
            self.close()

In [27]:
# --- 4.4 Monotone constraints (same as before) ---
# Constraint value per constrained feature; every feature not listed here gets 0.
_MONOTONE_DIRECTION = {
    "cos_q_title": 1,
    "conv_filled": 1,
    "jaccard_q_title": 1,
    "dice_q_title": 1,
    "ratio_overlap_title": 1,
    "max_cos_in_query": 1,
    "cos_minus_max": -1,
}
# One entry per feature, in the same order as FEATURES.
monotone = [_MONOTONE_DIRECTION.get(f, 0) for f in FEATURES]

In [28]:
# --- 4.5 LightGBM parameters (+GPU when available) ---
params = dict(
    objective="lambdarank",          # learning-to-rank objective
    metric="ndcg",
    ndcg_eval_at=[10],               # evaluation is reported as ndcg@10
    learning_rate=0.05,
    num_leaves=127,
    min_data_in_leaf=100,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbose=-1,
    seed=SEED,
    monotone_constraints=monotone,   # one entry per column of FEATURES
    num_threads=0,
    device="gpu" if USE_GPU else "cpu",  # the training cell switches this to "cpu" on failure
)

In [29]:
# Upper bound on boosting rounds, early-stopping patience and logging period.
num_boost_round = 3000
early_stopping_rounds = 100
log_every = 100

# Datasets for train and validation; `group` holds the per-query row counts
# of the query-sorted subsets.
dtr = lgb.Dataset(X_tr, label=y_tr, group=tr_groups, feature_name=FEATURES)
dva = lgb.Dataset(X_va, label=y_va, group=va_groups, feature_name=FEATURES)

In [30]:
def _train_holdout(train_params: dict, desc: str):
    """Train the ranker on the hold-out split and return the booster.

    Uses the module-level datasets (`dtr`, `dva`) and round settings. The
    progress bar is closed in all cases, including when early stopping or an
    error ends training before the last scheduled round.
    """
    progress = TQDMCallback(num_boost_round, desc=desc)
    try:
        return lgb.train(
            train_params,
            dtr,
            valid_sets=[dva],
            num_boost_round=num_boost_round,
            callbacks=[
                progress,
                lgb.early_stopping(early_stopping_rounds, verbose=False),
                lgb.log_evaluation(log_every),
            ],
        )
    finally:
        # The callback only closes its bar on the final scheduled iteration,
        # so close it here for every other way training can end.
        if progress.pbar is not None:
            progress.pbar.close()
            progress.pbar = None


try:
    model = _train_holdout(params, "holdout")
except lgb.basic.LightGBMError as e:
    if not USE_GPU:
        raise
    # Any LightGBM error during the GPU attempt triggers one retry on CPU.
    first_line = (str(e).splitlines() or [""])[0]
    print("[warn] GPU недоступен, переключаюсь на CPU:", first_line)
    params["device"] = "cpu"
    model = _train_holdout(params, "holdout (cpu)")

holdout:   3%|▎         | 100/3000 [00:31<13:17,  3.64it/s]

[100]	valid_0's ndcg@10: 0.883235


holdout:   7%|▋         | 200/3000 [00:59<11:56,  3.91it/s]

[200]	valid_0's ndcg@10: 0.883838


holdout:  10%|█         | 300/3000 [01:21<09:55,  4.54it/s]

[300]	valid_0's ndcg@10: 0.883743


                                                           

In [31]:
# --- 4.6 Custom NDCG@10 (decay 0.97^pos) on the validation set ---
def calc_dcg_at_k(v: np.ndarray, k: int = 10) -> float:
    """Discounted gain of the first `k` entries of `v`, weighting position i by 0.97**i."""
    discounts = np.power(0.97, np.arange(len(v)))
    weighted = v * discounts
    return weighted[:k].sum()


def calc_ndcg_at_k(labels: np.ndarray, preds: np.ndarray, groups: np.ndarray, k: int = 10) -> float:
    """Mean per-group NDCG@k with the 0.97**position discount.

    Rows are first grouped by `groups` with a stable sort. Within each group
    items are ranked by descending `preds` (stable for ties) and the DCG is
    divided by the DCG of the labels sorted in descending order, plus 1e-12
    to avoid division by zero. A group without positive labels therefore
    contributes 0 to the mean.
    """
    by_group = np.argsort(groups, kind="mergesort")
    labels, preds, groups = labels[by_group], preds[by_group], groups[by_group]
    sizes = np.unique(groups, return_counts=True)[1]
    # Group g occupies rows bounds[g]:bounds[g + 1] of the sorted arrays.
    bounds = np.concatenate(([0], np.cumsum(sizes)))
    scores = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        rel = labels[lo:hi]
        ranking = np.argsort(-preds[lo:hi], kind="mergesort")
        ideal = calc_dcg_at_k(np.sort(rel)[::-1], k) + 1e-12
        scores.append(calc_dcg_at_k(rel[ranking], k) / ideal)
    return float(np.mean(scores))

# Score the validation rows with the best iteration and report the custom metric.
preds_va = model.predict(X_va, num_iteration=model.best_iteration)
ndcg_val = calc_ndcg_at_k(y_va, preds_va, q_va, k=10)
print(f"[holdout] custom ndcg@10: {ndcg_val:.5f}")

[holdout] custom ndcg@10: 0.30902


In [33]:
# ## 5) Test inference and submission

SUBMISSION_PATH = "../solution.csv"

test_pred = model.predict(X_test, num_iteration=model.best_iteration)
# Attach predictions to the id columns (row order of X_test matches test_feats).
sub_df = test_feats.select(["query_id", "item_id"]).with_columns(pl.Series("pred", test_pred))
# Within each query, items are ordered by descending prediction; only the ids are written.
submission = (
    sub_df.sort(["query_id", "pred"], descending=[False, True])
          .select(["query_id", "item_id"])
)
submission.write_csv(SUBMISSION_PATH, include_header=True)
print(f"[save] submission -> {SUBMISSION_PATH}, rows={submission.height}")



[save] submission -> solution.csv, rows=335348


In [34]:
# per-query ndcg@10 (custom 0.97^pos)
def ndcg_per_query(labels, preds, groups, k=10):
    """Compute the custom NDCG@k (discount 0.97**position) for every query.

    Args:
        labels: relevance labels, one per row.
        preds: model scores, one per row.
        groups: query id of each row; rows need not be pre-sorted.
        k: cutoff; only the first `k` ranked items of a query contribute.

    Returns:
        A list of `(query_id, ndcg, label_sum)` tuples in ascending order of
        query id. A query without positive labels gets an ndcg of 0.
    """
    import numpy as np
    order = np.argsort(groups, kind="mergesort")
    labels, preds, groups = labels[order], preds[order], groups[order]
    uq, counts = np.unique(groups, return_counts=True)
    res = []
    s = 0
    # Size the discount vector to the largest group: a fixed-length vector
    # breaks the element-wise product for any query with more rows than that.
    max_group = int(counts.max()) if len(counts) else 0
    w = 0.97 ** np.arange(max_group)
    for q, c in zip(uq, counts):
        sl = slice(s, s + c)
        l = labels[sl]
        p = preds[sl]
        # Stable sort so that ties keep their original relative order.
        idx = np.argsort(-p, kind="mergesort")
        dcg = (l[idx] * w[:c])[:k].sum()
        idcg = (np.sort(l)[::-1] * w[:c])[:k].sum() + 1e-12  # epsilon avoids 0/0
        res.append((q, dcg / idcg, l.sum()))
        s += c
    return res

# Examples of "bad" validation queries: at least one positive label but a
# custom ndcg@10 below 0.1. Only the first 10 such queries are kept.
perq = ndcg_per_query(y_va, preds_va, q_va, k=10)
bad = [(q,score,pos) for q,score,pos in perq if pos>0 and score<0.1][:10]
print("bad queries (q, ndcg, positives):", bad[:10])


bad queries (q, ndcg, positives): [(np.int64(910), np.float64(0.0), np.float32(1.0)), (np.int64(2116), np.float64(0.0), np.float32(1.0)), (np.int64(3275), np.float64(0.0), np.float32(1.0)), (np.int64(3782), np.float64(0.0), np.float32(2.0)), (np.int64(3886), np.float64(0.0), np.float32(1.0)), (np.int64(4004), np.float64(0.0), np.float32(2.0)), (np.int64(4161), np.float64(0.0), np.float32(1.0)), (np.int64(4358), np.float64(0.0), np.float32(1.0)), (np.int64(5223), np.float64(0.0), np.float32(1.0)), (np.int64(5508), np.float64(0.0), np.float32(1.0))]
