In [1]:
# CatBoost Ranker (YetiRankPairwise) — CV, GPU, прогресс, OOF/Test сохранение

import os
import numpy as np
import polars as pl
from typing import List
from tqdm.auto import tqdm

from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupKFold


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reproducibility: seed NumPy's global RNG once, up front.
SEED = 42
np.random.seed(SEED)

# Input parquet files are expected in "<parent of the working directory>/Data".
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")
TRAIN_FEATS_PATH, TEST_FEATS_PATH = (
    os.path.join(DATA_DIR, parquet_name)
    for parquet_name in ("train_feats.parquet", "test_feats.parquet")
)


In [3]:
# Column roles used throughout the notebook.
TARGET = "item_contact"            # label column fed to the ranker
GROUP = "query_id"                 # grouping key for CV splits and ranking pools
ID_COLS = ["query_id", "item_id"]  # row identifiers kept in every saved artefact

# Folder for all artefacts produced by this notebook; created if missing.
OUT_DIR = "out_cat"
os.makedirs(OUT_DIR, exist_ok=True)


In [4]:
# Fail fast (reporting the offending path) if either feature file is missing.
for required_path in (TRAIN_FEATS_PATH, TEST_FEATS_PATH):
    assert os.path.exists(required_path), required_path

In [5]:
# Read both feature tables into Polars DataFrames (train first, then test).
train_feats, test_feats = (
    pl.read_parquet(parquet_path)
    for parquet_path in (TRAIN_FEATS_PATH, TEST_FEATS_PATH)
)

In [6]:
# Report the (rows, columns) shape of each loaded table.
for shape_label, loaded in (
    ("[load] train_feats:", train_feats),
    ("[load] test_feats :", test_feats),
):
    print(shape_label, loaded.shape)


[load] train_feats: (7781790, 60)
[load] test_feats : (335348, 59)


In [7]:
# 2) Features — the same candidate set as in your pipeline; names that are not
#    actually present in the data are filtered out in the next cell.

FEATURES = [
    # embeddings
    "cos_q_title",

    # texts
    "query_len",
    "title_len",
    "abs_len_diff",
    "ratio_overlap_title",
    "title_contains_query",
    "has_query_text",
    "has_title",

    # categories / locations
    "same_cat",
    "same_mcat",
    "same_loc",
    "same_cat_loc",
    "triple_match",
    "freq_item_cat",
    "freq_item_mcat",
    "freq_item_loc",

    # price
    "price_clip",
    "price_log1p",
    "price_rank_in_query",
    "price_z_in_query",
    "price_is_zero",
    "price_vs_median_query",

    # behavioural
    "conv_known",
    "conv_filled",
    "conv_z_in_query",
    "conv_rank_in_query",

    # group-level
    "n_items_in_query",

    # interactions
    "same_cat__cos",
    "same_loc__cos",
    "conv__cos",
    "conv__same_loc",
    "price_rank__same_loc",
]


In [8]:
# Keep only candidates that exist in the train table, dropping repeated names
# while preserving first-seen order.
FEATURES = list(
    dict.fromkeys(name for name in FEATURES if name in train_feats.columns)
)
print(f"[features] using {len(FEATURES)} columns")

[features] using 32 columns


In [9]:
# Service columns (target, group key, identifiers) are never rewritten as features.
SAFE_EXCLUDE = {TARGET, GROUP, *ID_COLS}

def sanitize_features(df: pl.DataFrame, features: list[str]) -> pl.DataFrame:
    """Return ``df`` with the requested feature columns cleaned of nulls/non-finite values.

    Only names from ``features`` that exist in ``df`` and are not in
    ``SAFE_EXCLUDE`` are touched; every other column is passed through as is.

    Per-dtype handling:
      * Float32 / Float64: non-finite values and nulls become 0.0, result is Float32.
      * Boolean: cast to Int8, nulls become 0.
      * integer dtypes: nulls become 0, dtype is kept.
      * anything else: cast to Float32, nulls become 0.0.

    Args:
        df: source frame.
        features: candidate feature column names.

    Returns:
        A frame with the cleaned columns replacing the originals.
    """
    float_types = (pl.Float32, pl.Float64)

    def _clean(name):
        # Build the replacement expression for a single column based on its dtype.
        column = pl.col(name)
        dtype = df.schema[name]
        if dtype in float_types:
            finite_only = pl.when(column.is_finite()).then(column).otherwise(0.0)
            return finite_only.fill_null(0.0).cast(pl.Float32).alias(name)
        if dtype == pl.Boolean:
            return column.cast(pl.Int8).fill_null(0).alias(name)
        if dtype.is_integer():
            return column.fill_null(0).alias(name)
        return column.cast(pl.Float32).fill_null(0.0).alias(name)

    return df.with_columns([
        _clean(name)
        for name in features
        if name in df.columns and name not in SAFE_EXCLUDE
    ])

In [10]:
# 1) Sanitise the FEATURES columns only.
train_feats = sanitize_features(train_feats, FEATURES)
test_feats = sanitize_features(test_feats, FEATURES)

# 2) Service columns: null-fill only (no casts, no renames). Names are
#    de-duplicated first because GROUP is also listed in ID_COLS, so each
#    column is filled in a single batched pass per frame.
service_cols = list(dict.fromkeys([TARGET, GROUP, *ID_COLS]))
train_feats = train_feats.with_columns(
    [pl.col(name).fill_null(0) for name in service_cols if name in train_feats.columns]
)
# The test table only gets its identifier columns filled.
test_feats = test_feats.with_columns(
    [pl.col(name).fill_null(0) for name in ID_COLS if name in test_feats.columns]
)


In [11]:
# This cell used to be a verbatim copy of the sanitisation cell above. That step
# is idempotent (a second pass leaves every column unchanged), so repeating it
# only cost an extra sweep over ~7.8M rows. It is replaced by a cheap check of the
# invariant sanitisation establishes: no sanitised feature column contains nulls.
_service_cols = {TARGET, GROUP, *ID_COLS}
for _frame_name, _frame in (("train_feats", train_feats), ("test_feats", test_feats)):
    # Only columns that sanitize_features actually rewrites are checked.
    _checked = [c for c in FEATURES if c in _frame.columns and c not in _service_cols]
    if not _checked:
        continue
    # null_count() yields a one-row frame with the per-column null totals.
    _null_total = sum(_frame.select(_checked).null_count().row(0))
    assert _null_total == 0, (
        f"[sanitize] {_frame_name}: {_null_total} nulls left in feature columns"
    )


In [12]:
# Guard: target / group / identifier columns must not be used as model inputs.
bad = [name for name in FEATURES if name in SAFE_EXCLUDE]
if len(bad) > 0:
    print("[warn] service columns accidentally in FEATURES:", bad)


In [13]:
def prepare_matrix(df: pl.DataFrame, features: List[str]) -> np.ndarray:
    """Extract ``features`` from ``df`` as a NumPy array.

    Every selected column is cast to Float32 before conversion; columns appear
    in the same order as ``features``.
    """
    as_float32 = [pl.col(name).cast(pl.Float32) for name in features]
    return df.select(as_float32).to_numpy()

In [14]:
# Feature matrices for train and test, built with the same column order.
X, X_test = (prepare_matrix(frame, FEATURES) for frame in (train_feats, test_feats))
# Labels and group ids, row-aligned with X.
y, g = (train_feats[column].to_numpy() for column in (TARGET, GROUP))

In [15]:
# Print the array shapes so row counts (X vs y) and feature counts (X vs X_test)
# can be compared by eye.
print("[matrix] X:", X.shape, "y:", y.shape, "X_test:", X_test.shape)

[matrix] X: (7781790, 32) y: (7781790,) X_test: (335348, 32)


In [16]:
# 4) Cross-validation split by query_id (GroupKFold), so a query's rows are
#    never divided between the training and validation parts of a fold.

N_FOLDS = 5
gkf = GroupKFold(n_splits=N_FOLDS)
# Materialise the (train_idx, valid_idx) pairs once so they can be reused.
folds = list(gkf.split(X, y, groups=g))

In [None]:
# Fast parameter set: fewer iterations with a higher learning rate.
cat_params_fast = dict(
    loss_function="YetiRankPairwise",
    task_type="GPU", devices="0",
    iterations=1200,               # shorter run
    learning_rate=0.12,            # faster convergence
    depth=6,
    l2_leaf_reg=6.0,
    random_seed=SEED,
    # Silences per-iteration logging. NOTE(review): the captured output of this
    # cell still reports PFound being computed on CPU, so this flag alone does
    # not appear to switch metric evaluation off — confirm before relying on it.
    verbose=False,
)

# Out-of-fold scores for every train row and fold-averaged scores for test rows.
oof_cat = np.zeros(len(y), dtype=np.float32)
test_pred_cat = np.zeros(len(X_test), dtype=np.float32)

# The test pool is identical for every fold, so it is built once here instead of
# being re-sorted and re-wrapped inside the loop.
g_test = test_feats[GROUP].to_numpy()
order_test = np.argsort(g_test, kind="mergesort")   # stable sort: rows grouped by query
inv = np.empty_like(order_test)
inv[order_test] = np.arange(len(order_test))         # inverse permutation back to file order
test_pool_sorted = Pool(X_test[order_test],
                        group_id=g_test[order_test],
                        feature_names=FEATURES)

for fold, (tr_idx, va_idx) in enumerate(tqdm(folds, desc="CatBoost CV folds"), 1):
    # Reorder each split so that rows of the same query are contiguous.
    tr_sorted = tr_idx[np.argsort(g[tr_idx], kind="mergesort")]
    va_sorted = va_idx[np.argsort(g[va_idx], kind="mergesort")]

    pool_tr = Pool(X[tr_sorted], y[tr_sorted], group_id=g[tr_sorted], feature_names=FEATURES)
    # No eval_set is passed — training is much faster without it.
    model_cat = CatBoostRanker(**cat_params_fast)
    model_cat.fit(pool_tr)

    # OOF predictions, written back at the original row positions.
    pool_va = Pool(X[va_sorted], group_id=g[va_sorted], feature_names=FEATURES)
    oof_cat[va_sorted] = model_cat.predict(pool_va)

    # Test predictions: score the sorted pool, restore file order, average over folds.
    pred_sorted = model_cat.predict(test_pool_sorted)
    test_pred_cat += pred_sorted[inv] / N_FOLDS


Default metric period is 5 because PFound is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


In [None]:
# 7) Persist artefacts for the meta-model

oof_path = os.path.join(OUT_DIR, "oof_cat.parquet")
test_path = os.path.join(OUT_DIR, "test_pred_cat.parquet")

# Identifier columns plus the out-of-fold score for every train row.
oof_frame = train_feats.select(["query_id", "item_id"]).with_columns(
    pl.Series("oof_cat", oof_cat)
)
oof_frame.write_parquet(oof_path)

# Identifier columns plus the fold-averaged score for every test row.
test_frame = test_feats.select(["query_id", "item_id"]).with_columns(
    pl.Series("pred_cat", test_pred_cat)
)
test_frame.write_parquet(test_path)

print("[save]", oof_path)
print("[save]", test_path)

# 8) (optional) Submission straight from CatBoost, if one is needed right away

DO_SUBMIT = True
if DO_SUBMIT:
    scored = test_feats.select(ID_COLS).with_columns(pl.Series("pred", test_pred_cat))
    # Order rows by query, and within each query by descending score.
    ranked = scored.sort(["query_id", "pred"], descending=[False, True])
    # Only the identifier columns go into the file; the score is dropped.
    sub = ranked.select(ID_COLS)
    sub_path = "solution_catboost.csv"
    sub.write_csv(sub_path, include_header=True)
    print("[save] submission ->", sub_path)