
# 03 — Сборка фичей → LightGBM реранкер → сабмит
- Собираем все подготовленные фичи по ключу `['query_id','item_id']`.
- Обучаем ранжирующую модель `LightGBM` (`lgb.train`, objective `lambdarank`) с hold-out сплитом по `query_id`.
- Считаем кастомный `NDCG@10` с затуханием веса позиции `0.97^position`.
- Делаем предсказания на тесте и пишем `solution.csv` (пары `query_id,item_id`, внутри каждого `query_id` — в порядке убывания скора).


In [None]:

##%%
# Требования:
# pip install polars pyarrow lightgbm tqdm numpy


In [1]:

##%%
import os, json, math
import numpy as np
import polars as pl
from tqdm.auto import tqdm

# ML
try:
    import lightgbm as lgb
    HAS_LGBM = True
except Exception as e:
    HAS_LGBM = False
    print("[warn] LightGBM не установлен:", e)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

##%%
# ---- Поиск корня проекта ----
def find_project_root(start=None):
    """Walk upwards from *start* until a directory holding Data/ and Features/ is found.

    Args:
        start: Directory to begin the search from; the current working
            directory is used when it is falsy.

    Returns:
        Absolute path of the first ancestor (including *start* itself) that
        contains both a ``Data`` and a ``Features`` sub-directory.

    Raises:
        RuntimeError: The filesystem root was reached without finding one.
    """
    markers = ("Data", "Features")
    here = os.path.abspath(start or os.getcwd())
    while not all(os.path.isdir(os.path.join(here, m)) for m in markers):
        up = os.path.dirname(here)
        if up == here:
            # dirname() of the filesystem root is the root itself: nothing left to try.
            raise RuntimeError("Не нашёл корень проекта (ожидаю папки Data/ и Features/).")
        here = up
    return here

# Project layout: every path below hangs off the detected project root.
BASE_DIR = find_project_root()
DATA_DIR, FEAT_DIR = (os.path.join(BASE_DIR, sub) for sub in ("Data", "Features"))

TRAIN_PATH, TEST_PATH = (
    os.path.join(DATA_DIR, fname)
    for fname in ("train-dset.parquet", "test-dset-small.parquet")
)

# Feature sources (adjust to whichever files are actually present).
TAB_DIR, COS_DESC_BASE_DIR = (
    os.path.join(FEAT_DIR, sub) for sub in ("table-basic", "cos-e5-base-desc")
)
# Candidate locations of the query-title cosine files, in priority order.
COS_TIT_DIR_CAND = [
    os.path.join(FEAT_DIR, "cos-e5-small"),  # primary location
    DATA_DIR,  # fallback: train_cos.parquet / test_cos.parquet next to the raw data
]

def pick_cos_tit_dir():
    """Return the first directory in COS_TIT_DIR_CAND that holds both cosine files.

    A candidate qualifies only when ``train_cos.parquet`` and
    ``test_cos.parquet`` both exist inside it.

    Raises:
        FileNotFoundError: No candidate directory contains both files.
    """
    needed = ("train_cos.parquet", "test_cos.parquet")
    for candidate in COS_TIT_DIR_CAND:
        if all(os.path.exists(os.path.join(candidate, fname)) for fname in needed):
            return candidate
    raise FileNotFoundError("Не нашёл train_cos.parquet/test_cos.parquet ни в Features/cos-e5-small, ни в Data/.")

COS_TIT_DIR = pick_cos_tit_dir()

# Echo the resolved locations so a wrong project root is noticed immediately.
print("[paths]")
for _label, _path in (
    ("BASE_DIR", BASE_DIR),
    ("DATA_DIR", DATA_DIR),
    ("FEAT_DIR", FEAT_DIR),
    ("TAB_DIR ", TAB_DIR),  # trailing space keeps the "=" aligned in the output
    ("COS_TIT_DIR", COS_TIT_DIR),
    ("COS_DESC_BASE_DIR", COS_DESC_BASE_DIR),
):
    print(f"{_label} =", _path)


[paths]
BASE_DIR = C:\Users\idine\PycharmProjects\Avito_Test
DATA_DIR = C:\Users\idine\PycharmProjects\Avito_Test\Data
FEAT_DIR = C:\Users\idine\PycharmProjects\Avito_Test\Features
TAB_DIR  = C:\Users\idine\PycharmProjects\Avito_Test\Features\table-basic
COS_TIT_DIR = C:\Users\idine\PycharmProjects\Avito_Test\Features\cos-e5-small
COS_DESC_BASE_DIR = C:\Users\idine\PycharmProjects\Avito_Test\Features\cos-e5-base-desc


In [3]:

##%%
# ---- Загрузка и сборка ----

# Base (query_id, item_id) pairs; the train file also provides the target.
# Only the listed columns are requested from each parquet file. A plain
# column-projected read replaces scan_parquet(...).collect(streaming=True):
# the `streaming` flag is deprecated in recent polars releases (this cell
# appears to have emitted a warning on exactly these two statements).
train_base = pl.read_parquet(
    TRAIN_PATH,
    columns=["query_id", "item_id", "item_contact"],  # item_contact is the target
)

test_base = pl.read_parquet(
    TEST_PATH,
    columns=["query_id", "item_id"],
)

# Tabular features (no cosine similarities).
train_tab, test_tab = (
    pl.read_parquet(os.path.join(TAB_DIR, fname))
    for fname in ("train_feats_tab.parquet", "test_feats_tab.parquet")
)

# Query-title cosine (e5-small); the feature column is expected to be cos_q_title.
train_cos_tit, test_cos_tit = (
    pl.read_parquet(os.path.join(COS_TIT_DIR, fname))
    for fname in ("train_cos.parquet", "test_cos.parquet")
)

# Query-description cosine (e5-base); optional, used only when BOTH files exist.
_desc_paths = [
    os.path.join(COS_DESC_BASE_DIR, fname)
    for fname in ("train_cos_desc_base.parquet", "test_cos_desc_base.parquet")
]
has_desc_cos = all(os.path.exists(p) for p in _desc_paths)
if not has_desc_cos:
    train_cos_desc = test_cos_desc = None
else:
    train_cos_desc = pl.read_parquet(_desc_paths[0])
    test_cos_desc = pl.read_parquet(_desc_paths[1])

_desc_shape = train_cos_desc.shape if train_cos_desc is not None else None
print("[loaded]",
      "train_base:", train_base.shape,
      "train_tab:", train_tab.shape,
      "train_cos_tit:", train_cos_tit.shape,
      "train_cos_desc:", _desc_shape)


  train_base = pl.scan_parquet(TRAIN_PATH).select([
  test_base  = pl.scan_parquet(TEST_PATH).select([


[loaded] train_base: (7781790, 3) train_tab: (7781790, 56) train_cos_tit: (7781790, 3) train_cos_desc: (7781790, 3)


In [None]:

##%%
# ---- Джойним всё по ключу ['query_id','item_id'] ----
def join_all(base_df: pl.DataFrame,
             tab_df: pl.DataFrame,
             cos_tit_df: pl.DataFrame,
             cos_desc_df: pl.DataFrame | None) -> pl.DataFrame:
    """Left-join every feature table onto the base pairs by (query_id, item_id).

    Args:
        base_df: Frame of (query_id, item_id) pairs that defines the output rows.
        tab_df: Tabular features.
        cos_tit_df: Query-title cosine features.
        cos_desc_df: Query-description cosine features, or None to skip them.

    Returns:
        ``base_df`` with the feature tables joined on, in the order
        tab -> cos_tit -> cos_desc.
    """
    keys = ["query_id","item_id"]
    feature_tables = [tab_df, cos_tit_df]
    if cos_desc_df is not None:
        feature_tables.append(cos_desc_df)

    joined = base_df
    for table in feature_tables:
        joined = joined.join(table, on=keys, how="left")
    return joined

# Assemble the full train/test frames from the base pairs and all feature tables.
train_full, test_full = (
    join_all(*parts)
    for parts in (
        (train_base, train_tab, train_cos_tit, train_cos_desc),
        (test_base, test_tab, test_cos_tit, test_cos_desc),
    )
)

print("[assembled] train_full:", train_full.shape, "test_full:", test_full.shape)


In [None]:

##%%
# ---- QC: покрытие пар ----
def coverage(pairs: pl.DataFrame, feats: pl.DataFrame, name: str):
    """Print how many (query_id, item_id) rows of *pairs* have no match in *feats*.

    Args:
        pairs: Frame whose key pairs must be covered.
        feats: Frame that is expected to contain those key pairs.
        name: Label used in the printed line.
    """
    keys = ["query_id","item_id"]
    feat_keys = feats.select(keys)
    # Anti-join keeps only the rows of `pairs` that find no partner in `feat_keys`.
    missing = pairs.join(feat_keys, on=keys, how="anti")
    print(f"[coverage] {name}: missing pairs =", missing.height)

# Check every feature SOURCE against the base pairs. Comparing the base with
# train_full/test_full can never report anything: those frames are built by
# left-joining onto the base, so they contain every base pair by construction.
for _split, _base, _sources in (
    ("train", train_base, (("tab", train_tab), ("cos_tit", train_cos_tit), ("cos_desc", train_cos_desc))),
    ("test", test_base, (("tab", test_tab), ("cos_tit", test_cos_tit), ("cos_desc", test_cos_desc))),
):
    for _src_name, _feats in _sources:
        if _feats is None:
            continue  # optional source that was not loaded
        coverage(_base, _feats, f"{_split}/{_src_name}")

# Быстрая проверка NaN/Inf
def non_finite_report(df: pl.DataFrame, name: str, cols_sample=20):
    """Print, per column, how many rows are null, NaN or infinite.

    Only the first *cols_sample* numeric columns are inspected (keys and the
    target are skipped), so this is a quick spot check, not a full audit.

    Nulls are counted explicitly: a null cell yields a null (not True) under
    ``~is_finite()`` and is skipped by ``sum()``, so without the separate
    null count the values left missing by the left joins would go unreported.

    Args:
        df: Frame to inspect.
        name: Label used as the prefix of each printed line.
        cols_sample: Maximum number of numeric columns to check.
    """
    skipped = ("query_id","item_id","item_contact")
    num_cols = [c for c, t in df.schema.items() if t.is_numeric() and c not in skipped]
    for c in num_cols[:cols_sample]:
        s = df.get_column(c)
        n_null = s.null_count()
        n_nonfinite = int((~s.is_finite()).sum())
        bad = n_null + n_nonfinite
        if bad:
            print(f"[non-finite] {name}.{c}: {bad} rows (null={n_null}, nan/inf={n_nonfinite})")

# Spot-check both assembled frames for bad numeric values.
for _frame, _label in ((train_full, "train"), (test_full, "test")):
    non_finite_report(_frame, _label)


In [None]:

##%%
# ---- Подготовка матриц для LGBM ----

# Features = every numeric column of the train frame except the keys and the target.
drop_cols = {"query_id","item_id","item_contact"}
feat_cols = [
    col_name
    for col_name, col_dtype in train_full.schema.items()
    if col_name not in drop_cols and col_dtype.is_numeric()
]

# Заполняем NaN/Inf нулями
def sanitize(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Build a float32 feature matrix from *cols*, replacing null/NaN/Inf with 0.

    The matrix is filled column by column into a preallocated buffer instead
    of collecting per-column arrays and stacking them. This has three effects:

    * no in-place write ever touches the array returned by ``to_numpy()``,
      which may be a read-only view of the column buffer (the original wrote
      into it whenever the float32 cast was a no-op);
    * peak memory is one matrix rather than the column list plus its stack;
    * an empty *cols* yields an ``(n_rows, 0)`` matrix instead of an error.

    Args:
        df: Frame holding the feature columns.
        cols: Names of the numeric columns to export, in output column order.

    Returns:
        Array of shape ``(df.height, len(cols))`` and dtype float32.
    """
    # Fortran order matches the layout of the former `np.vstack(...).T` result
    # and makes each column a contiguous slice.
    X = np.empty((df.height, len(cols)), dtype=np.float32, order="F")
    for j, c in enumerate(cols):
        raw = df.get_column(c).fill_null(0.0).to_numpy()
        dst = X[:, j]
        dst[...] = raw  # casts to float32 on assignment
        # Checked after the cast, so values that overflow float32 are zeroed too.
        dst[~np.isfinite(dst)] = 0.0
    return X

# Dense feature matrices for both splits (same feature order in each).
X_train, X_test = (sanitize(frame, feat_cols) for frame in (train_full, test_full))

# Target and per-row query ids of the training rows.
_target = train_full.get_column("item_contact")
_query_ids = train_full.get_column("query_id")
y_train = _target.to_numpy().astype(np.int32)
qids = _query_ids.to_numpy()

print("[matrix] X_train:", X_train.shape, "X_test:", X_test.shape, "features:", len(feat_cols))


In [None]:

##%%
# ---- Hold-out сплит по query_id ----
# Hold out 10% of the distinct queries (at least one), chosen by a seeded shuffle,
# so every query lands entirely in either the training or the validation part.
rng = np.random.default_rng(42)
uniq_q = np.unique(qids)
rng.shuffle(uniq_q)
n_val = max(1, int(0.1 * len(uniq_q)))
val_set = set(uniq_q[:n_val])

val_mask = np.isin(qids, list(val_set))
tr_mask = np.logical_not(val_mask)


def _rows_grouped_by_query(mask):
    """Return the row indices selected by *mask*, stably ordered by query id."""
    rows = np.flatnonzero(mask)
    return rows[np.argsort(qids[rows], kind="mergesort")]


# Rows of one query must be contiguous so that group sizes describe them.
tr_idx = _rows_grouped_by_query(tr_mask)
va_idx = _rows_grouped_by_query(val_mask)

X_tr, X_va = X_train[tr_idx], X_train[va_idx]
y_tr, y_va = y_train[tr_idx], y_train[va_idx]
q_tr, q_va = qids[tr_idx], qids[va_idx]

def group_sizes_from_sorted_ids(ids: np.ndarray) -> np.ndarray:
    """Return how many times each distinct id occurs, in ascending id order.

    For ids already sorted ascending this is the length of each consecutive run.

    Args:
        ids: Array of ids, expected to be sorted.

    Returns:
        Integer array with one count per distinct id.
    """
    occurrences = np.unique(ids, return_counts=True)[1]
    return occurrences.astype(int)

# Per-query row counts for each part of the split.
tr_groups, va_groups = (group_sizes_from_sorted_ids(q) for q in (q_tr, q_va))

_n_q_tr = len(np.unique(q_tr))
_n_q_va = len(np.unique(q_va))
print(f"[split] train rows={X_tr.shape[0]}, val rows={X_va.shape[0]}, queries train/val={_n_q_tr}/{_n_q_va}")


In [None]:

##%%
# ---- Кастомный NDCG@10 (0.97^pos) для отчёта ----
def calc_dcg_at_k(v: np.ndarray, k: int = 10, decay: float = 0.97) -> float:
    """Return the discounted gain of the first *k* entries of *v*.

    The entry at zero-based position ``i`` is weighted by ``decay ** i``.

    Args:
        v: Relevance values already arranged in ranked order.
        k: Cut-off; only ``v[:k]`` contributes.
        decay: Per-position discount base (0.97 reproduces the original metric).

    Returns:
        The weighted sum as a Python float (0.0 for an empty input).
    """
    top = np.asarray(v, dtype=float)[:k]
    # Weights are built for the kept prefix only, not for the whole group.
    weights = decay ** np.arange(len(top))
    return float((top * weights).sum())

def calc_ndcg_at_k(labels: np.ndarray, preds: np.ndarray, groups: np.ndarray, k: int = 10) -> float:
    """Return the mean per-query NDCG@k, using calc_dcg_at_k as the gain function.

    Args:
        labels: Relevance label of every row.
        preds: Model score of every row.
        groups: Query id of every row (NOT group sizes); rows need not be
            pre-sorted, they are stably sorted by this id here.
        k: Cut-off passed to calc_dcg_at_k.

    Returns:
        Average over queries of DCG(predicted order) / DCG(ideal order), or
        0.0 when there are no rows. A query without any positive label
        contributes 0 because its ideal DCG is only the 1e-12 guard term.
    """
    by_query = np.argsort(groups, kind="mergesort")
    labels, preds, groups = labels[by_query], preds[by_query], groups[by_query]

    _, sizes = np.unique(groups, return_counts=True)
    ends = np.cumsum(sizes)
    starts = ends - sizes

    per_query = []
    for lo, hi in zip(starts, ends):
        q_labels = labels[lo:hi]
        q_preds = preds[lo:hi]
        # Stable descending sort: tied scores keep their incoming row order.
        ranking = np.argsort(-q_preds, kind="mergesort")
        ideal = calc_dcg_at_k(np.sort(q_labels)[::-1], k) + 1e-12  # guard against /0
        per_query.append(calc_dcg_at_k(q_labels[ranking], k) / ideal)

    if not per_query:
        return 0.0
    return float(np.mean(per_query))


In [None]:

##%%
# ---- Обучение LGBM ----
# Training cannot proceed without the optional LightGBM import.
if not HAS_LGBM:
    raise RuntimeError("LightGBM недоступен в окружении. Установи пакет lightgbm.")

# LambdaRank objective; the validation metric is LightGBM's own NDCG at 10.
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [10],
    "learning_rate": 0.05,
    "num_leaves": 127,
    "min_data_in_leaf": 100,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbose": -1,
    "seed": 42,
    "device_type": "cpu",  # switch to "gpu" if a GPU-enabled LightGBM build is installed
}

# Group sizes tell LightGBM which consecutive rows belong to the same query.
dtr = lgb.Dataset(X_tr, label=y_tr, group=tr_groups, feature_name=feat_cols)
dva = lgb.Dataset(X_va, label=y_va, group=va_groups, feature_name=feat_cols, reference=dtr)

_callbacks = [
    lgb.early_stopping(100, verbose=False),
    lgb.log_evaluation(100),
]
model = lgb.train(
    params,
    dtr,
    num_boost_round=3000,
    valid_sets=[dva],
    callbacks=_callbacks,
)
print("[train] best_iteration:", model.best_iteration)


In [None]:

##%%
# ---- Валидация кастомным NDCG@10 ----
# Score the hold-out rows with the best iteration and report the custom metric.
preds_va = model.predict(X_va, num_iteration=model.best_iteration)
ndcg_val = calc_ndcg_at_k(
    y_va.astype(float),
    preds_va.astype(float),
    q_va,
    k=10,
)
print(f"[holdout] custom NDCG@10 (0.97^pos) = {ndcg_val:.5f}")


In [None]:

##%%
# ---- Test prediction and submission ----
# No refit on the full train set happens here: the model trained on the
# training part of the split is used as is (refitting on X_train is optional).
test_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Submission: within each query_id, items ordered by descending score.
_scored = test_full.select(["query_id","item_id"]).with_columns(
    pl.Series("pred", test_pred)
)
_ranked = _scored.sort(["query_id","pred"], descending=[False, True])
sub = _ranked.select(["query_id","item_id"])

SUB_PATH = os.path.join(BASE_DIR, "solution.csv")
sub.write_csv(SUB_PATH, include_header=True)
print("[save] submission ->", SUB_PATH, "rows=", sub.height)
