In [1]:
import os, json, polars as pl, numpy as np, lightgbm as lgb
from sklearn.model_selection import GroupKFold
from typing import List
from tqdm.auto import tqdm
from lightgbm.callback import CallbackEnv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reproducibility: seed NumPy's global RNG once, up front.
SEED = 42
np.random.seed(SEED)

# Feature files are read from "<parent of the working directory>/Data".
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")
TRAIN_FEATS_PATH = os.path.join(DATA_DIR, "train_feats2.parquet")
TEST_FEATS_PATH = os.path.join(DATA_DIR, "test_feats2.parquet")

# Predictions are written here (path is relative to the working directory).
OUT_DIR = "out_lgbm"
os.makedirs(OUT_DIR, exist_ok=True)

# Column roles: label, query-group key, and the row identifier columns.
TARGET = "item_contact"
GROUP = "query_id"
ID_COLS = ["query_id", "item_id"]

In [3]:
# Fail fast if either feature file is missing, before attempting to read them.
for _feats_path in (TRAIN_FEATS_PATH, TEST_FEATS_PATH):
    assert os.path.exists(_feats_path)

train_feats = pl.read_parquet(TRAIN_FEATS_PATH)
test_feats = pl.read_parquet(TEST_FEATS_PATH)
print("[load] train_feats:", train_feats.shape, "| test_feats:", test_feats.shape)

[load] train_feats: (7781790, 72) | test_feats: (335348, 71)


In [4]:
# ## 2) Description of the feature set and columns

FEATURES = [
    # embeddings
    "cos_q_title",

    # text
    "query_len", "title_len", "abs_len_diff",
    "overlap_q_title", "jaccard_q_title", "dice_q_title", "ratio_overlap_title",
    "title_contains_query", "has_query_text", "has_title",

    # categories / locations
    "same_cat", "same_mcat", "same_loc", "same_cat_loc", "triple_match",
    "freq_item_cat", "freq_item_mcat", "freq_item_loc", "is_query_mcat_missing",

    # price
    "price_clip", "price_log1p", "price_rank_in_query", "price_z_in_query", "price_is_zero", "price_vs_median_query",

    # behavioural
    "conv_known", "conv_filled", "conv_z_in_query", "conv_rank_in_query",

    # group-level
    "n_items_in_query", "max_cos_in_query", "cos_minus_max",

    # interactions
    "same_cat__cos", "same_loc__cos", "conv__cos", "conv__same_loc", "price_rank__same_loc",
]

In [5]:
def sanitize_features(df: pl.DataFrame, features: list[str]) -> pl.DataFrame:
    """Return ``df`` with the listed columns cleaned of nulls and non-finite values.

    Names in ``features`` that are not columns of ``df`` are skipped. Every
    remaining column is replaced according to its dtype:

    * Float32 / Float64: null, NaN and +/-inf become 0.0; result is Float32.
    * Boolean: cast to Int8, nulls become 0.
    * integer dtypes: nulls become 0, dtype unchanged.
    * anything else: cast to Float32, nulls become 0.0.

    Columns not named in ``features`` are left untouched.
    """
    present = [name for name in features if name in df.columns]
    cleaned = []
    for name in present:
        dtype = df.schema[name]
        col = pl.col(name)
        if dtype in (pl.Float32, pl.Float64):
            # Keep finite values only; everything else collapses to 0.0.
            expr = pl.when(col.is_finite()).then(col).otherwise(0.0).fill_null(0.0).cast(pl.Float32)
        elif dtype == pl.Boolean:
            expr = col.cast(pl.Int8).fill_null(0)
        elif dtype.is_integer():
            expr = col.fill_null(0)
        else:
            expr = col.cast(pl.Float32).fill_null(0.0)
        cleaned.append(expr.alias(name))
    return df.with_columns(cleaned)


In [6]:
def prepare_matrix(df: pl.DataFrame, features: List[str]) -> np.ndarray:
    """Build the model input matrix from ``df``.

    Selects ``features`` in the given order, casts every column to Float32,
    replaces any remaining nulls with 0.0 and converts the frame to a NumPy array.
    """
    as_float32 = [pl.col(name).cast(pl.Float32) for name in features]
    matrix_frame = df.select(as_float32).fill_null(0.0)
    return matrix_frame.to_numpy()

In [7]:
# Clean the model features plus target/group columns for train,
# and the model features plus id columns for test.
train_feats = sanitize_features(train_feats, [*FEATURES, TARGET, GROUP])
test_feats = sanitize_features(test_feats, [*FEATURES, *ID_COLS])


In [8]:
# Dense feature matrices for both splits, built with the same column order.
X, X_test = (prepare_matrix(frame, FEATURES) for frame in (train_feats, test_feats))

# Labels and query ids as NumPy vectors taken from the same train frame as X.
y = train_feats[TARGET].to_numpy()
groups_vec = train_feats[GROUP].to_numpy()

print("[matrix] X:", X.shape, "y:", y.shape, "X_test:", X_test.shape)

[matrix] X: (7781790, 38) y: (7781790,) X_test: (335348, 38)


In [9]:
N_FOLDS = 5

# Split by query id so that all rows of a query land in the same fold.
gkf = GroupKFold(n_splits=N_FOLDS)

# Materialise the (train_idx, valid_idx) pairs once so they can be reused.
folds = list(gkf.split(X, y, groups=groups_vec))

In [10]:
def group_sizes_from_sorted_ids(ids: np.ndarray) -> np.ndarray:
    """Return the length of each run of equal consecutive ids, in row order.

    This is the layout expected by the ``group`` argument of ``lgb.Dataset``:
    one size per query, listed in the order the queries appear in the rows.

    For ids sorted ascending the result equals
    ``np.unique(ids, return_counts=True)[1]``. Unlike ``np.unique`` it does not
    re-sort the input (O(n) instead of O(n log n)), and it stays correct when
    equal ids are contiguous but the blocks are not in ascending order.

    Parameters
    ----------
    ids : np.ndarray
        1-D array of group ids with equal ids stored contiguously.

    Returns
    -------
    np.ndarray
        Integer array of run lengths; empty when ``ids`` is empty.
    """
    ids = np.asarray(ids).ravel()
    if ids.size == 0:
        return np.zeros(0, dtype=np.intp)
    # Positions where a new run starts (first row excluded), then add both ends.
    run_starts = np.flatnonzero(ids[1:] != ids[:-1]) + 1
    edges = np.concatenate(([0], run_starts, [ids.size]))
    return np.diff(edges)

class TQDMCallback:
    """LightGBM per-iteration callback that drives a tqdm progress bar.

    The bar is created lazily on the first call and advanced by one step per
    call. It is closed (and ``p`` reset to ``None``) once the iteration count
    reaches ``total_rounds`` or the environment's ``end_iteration``.

    NOTE(review): if training ends before that round (for example via early
    stopping), nothing in this class closes the bar.
    """

    def __init__(self, total_rounds: int, desc: str = "boosting"):
        self.t = total_rounds  # fallback round count when env has no end_iteration
        self.desc = desc       # label shown next to the bar
        self.p = None          # the live tqdm bar, or None when no bar is open

    def __call__(self, env: CallbackEnv):
        last_round = getattr(env, "end_iteration", self.t)
        if self.p is None:
            first_round = getattr(env, "begin_iteration", 0)
            from tqdm.auto import tqdm as _tqdm
            self.p = _tqdm(total=last_round - first_round, desc=self.desc, leave=False)
        self.p.update(1)
        rounds_done = env.iteration + 1
        if rounds_done >= self.t or rounds_done >= last_round:
            self.p.close()
            self.p = None

def calc_dcg_at_k(v: np.ndarray, k: int = 10) -> float:
    """Discounted gain of the first ``k`` entries of ``v``.

    Position ``i`` (0-based) is weighted by ``0.97 ** i``; the weighted values
    of the first ``k`` positions are summed.
    """
    discounts = np.power(0.97, np.arange(len(v)))
    weighted = v * discounts
    return weighted[:k].sum()


def calc_ndcg_at_k(labels: np.ndarray, preds: np.ndarray, groups: np.ndarray, k: int = 10) -> float:
    """Mean per-group ratio DCG@k / ideal DCG@k, using ``calc_dcg_at_k``.

    Rows are first stably sorted by ``groups`` so each group is contiguous.
    Within a group, items are ranked by descending ``preds`` (stable on ties)
    and the resulting DCG is divided by the DCG of the labels sorted in
    descending order plus ``1e-12``. A group whose labels are all zero
    therefore scores 0.

    NOTE(review): the discount (0.97 ** position) and the zero score for
    all-zero groups mean this is not expected to match LightGBM's built-in
    ``ndcg`` values — presumably this is the task's own metric; confirm.
    """
    perm = np.argsort(groups, kind="mergesort")
    labels_s, preds_s, groups_s = labels[perm], preds[perm], groups[perm]

    sizes = np.unique(groups_s, return_counts=True)[1]
    stops = np.cumsum(sizes)
    starts = stops - sizes

    scores = []
    for lo, hi in zip(starts, stops):
        grp_labels = labels_s[lo:hi]
        grp_preds = preds_s[lo:hi]
        ranking = np.argsort(-grp_preds, kind="mergesort")
        ideal = calc_dcg_at_k(np.sort(grp_labels)[::-1], k) + 1e-12
        achieved = calc_dcg_at_k(grp_labels[ranking], k)
        scores.append(achieved / ideal)
    return float(np.mean(scores))

In [11]:
# LightGBM LambdaRank configuration; the validation metric is NDCG evaluated at 10.
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [10],
    "learning_rate": 0.01,
    "num_leaves": 63,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 1.0,
    "verbose": -1,
    "seed": SEED,
    "num_threads": 0,
    "device": "gpu",
}

# Upper bound on boosting rounds, and the early-stopping patience in rounds.
num_boost_round = 1200
early_stopping_rounds = 80

In [12]:
def _train_fold(fold_params: dict, dtrain, dvalid, desc: str):
    """Fit one booster with the shared callbacks and always close the progress bar.

    Uses the notebook-level ``num_boost_round`` and ``early_stopping_rounds``.
    ``TQDMCallback`` only closes its bar when the final round is reached, so the
    bar is closed here as well to cover early stopping and training errors.
    """
    progress = TQDMCallback(num_boost_round, desc)
    try:
        return lgb.train(
            fold_params, dtrain, valid_sets=[dvalid], num_boost_round=num_boost_round,
            callbacks=[progress,
                       lgb.early_stopping(early_stopping_rounds, verbose=False),
                       lgb.log_evaluation(100)])
    finally:
        bar = getattr(progress, "p", None)
        if bar is not None:
            bar.close()
            progress.p = None


# Out-of-fold predictions (indexed by original train row) and fold-averaged test predictions.
oof_lgb = np.zeros(len(y), dtype=np.float32)
test_pred_lgb = np.zeros(len(X_test), dtype=np.float32)

for fold, (tr_idx, va_idx) in enumerate(folds, 1):
    # Ranking datasets need each query's rows contiguous: stable-sort the fold's rows by query id.
    tr_s = tr_idx[np.argsort(groups_vec[tr_idx], kind="mergesort")]
    va_s = va_idx[np.argsort(groups_vec[va_idx], kind="mergesort")]
    dtr = lgb.Dataset(X[tr_s], label=y[tr_s], group=group_sizes_from_sorted_ids(groups_vec[tr_s]), feature_name=FEATURES)
    dva = lgb.Dataset(X[va_s], label=y[va_s], group=group_sizes_from_sorted_ids(groups_vec[va_s]), feature_name=FEATURES)
    try:
        model = _train_fold(params, dtr, dva, f"LGB fold {fold}")
    except lgb.basic.LightGBMError as err:
        if params.get("device") == "cpu":
            # Already on CPU: repeating the identical call cannot help, surface the error.
            raise
        print(f"[LGBM] fold {fold}: training with device={params.get('device')!r} failed ({err}); retrying on CPU")
        # Deliberately persistent: later folds go straight to CPU instead of retrying the GPU.
        params["device"] = "cpu"
        model = _train_fold(params, dtr, dva, f"LGB fold {fold} (cpu)")
    # va_s holds original row indices, so OOF scores land back in train row order.
    oof_lgb[va_s] = model.predict(X[va_s], num_iteration=model.best_iteration)
    test_pred_lgb += model.predict(X_test, num_iteration=model.best_iteration) / N_FOLDS
    print(f"fold {fold}: ndcg@10={calc_ndcg_at_k(y[va_s], oof_lgb[va_s], groups_vec[va_s]):.5f}")

print("[LGBM] CV ndcg@10:", calc_ndcg_at_k(y, oof_lgb, groups_vec))


LGB fold 1:   8%|▊         | 100/1200 [00:27<05:29,  3.34it/s]

[100]	valid_0's ndcg@10: 0.877316


LGB fold 1:  17%|█▋        | 200/1200 [00:54<04:24,  3.78it/s]

[200]	valid_0's ndcg@10: 0.879173


LGB fold 1:  25%|██▌       | 300/1200 [01:20<03:43,  4.03it/s]

[300]	valid_0's ndcg@10: 0.880244


LGB fold 1:  33%|███▎      | 400/1200 [01:45<03:12,  4.15it/s]

[400]	valid_0's ndcg@10: 0.880986


LGB fold 1:  42%|████▏     | 501/1200 [02:07<02:19,  5.02it/s]

[500]	valid_0's ndcg@10: 0.881471


LGB fold 1:  50%|█████     | 601/1200 [02:27<01:56,  5.13it/s]

[600]	valid_0's ndcg@10: 0.881653


LGB fold 1:  58%|█████▊    | 700/1200 [02:45<01:33,  5.35it/s]

[700]	valid_0's ndcg@10: 0.881872


LGB fold 1:  67%|██████▋   | 801/1200 [03:02<01:04,  6.16it/s]

[800]	valid_0's ndcg@10: 0.88193


LGB fold 1:  75%|███████▌  | 901/1200 [03:18<00:44,  6.66it/s]

[900]	valid_0's ndcg@10: 0.882075


LGB fold 1:  83%|████████▎ | 1001/1200 [03:34<00:30,  6.52it/s]

[1000]	valid_0's ndcg@10: 0.882177


LGB fold 1:  92%|█████████▏| 1101/1200 [03:49<00:15,  6.51it/s]

[1100]	valid_0's ndcg@10: 0.882165


                                                               

[1200]	valid_0's ndcg@10: 0.882232
fold 1: ndcg@10=0.31033


LGB fold 2:   8%|▊         | 100/1200 [00:25<04:59,  3.68it/s]

[100]	valid_0's ndcg@10: 0.878148


LGB fold 2:  17%|█▋        | 200/1200 [00:51<04:16,  3.89it/s]

[200]	valid_0's ndcg@10: 0.879468


LGB fold 2:  25%|██▌       | 300/1200 [01:17<04:03,  3.69it/s]

[300]	valid_0's ndcg@10: 0.881206


LGB fold 2:  33%|███▎      | 400/1200 [01:42<02:51,  4.67it/s]

[400]	valid_0's ndcg@10: 0.88182


LGB fold 2:  42%|████▏     | 501/1200 [02:04<02:23,  4.86it/s]

[500]	valid_0's ndcg@10: 0.882315


LGB fold 2:  50%|█████     | 600/1200 [02:24<01:49,  5.47it/s]

[600]	valid_0's ndcg@10: 0.88249


LGB fold 2:  58%|█████▊    | 700/1200 [02:42<01:42,  4.90it/s]

[700]	valid_0's ndcg@10: 0.882648


LGB fold 2:  67%|██████▋   | 801/1200 [02:59<01:09,  5.74it/s]

[800]	valid_0's ndcg@10: 0.882822


LGB fold 2:  75%|███████▌  | 901/1200 [03:16<00:50,  5.98it/s]

[900]	valid_0's ndcg@10: 0.88294


                                                              

fold 2: ndcg@10=0.30830


LGB fold 3:   8%|▊         | 100/1200 [00:25<04:42,  3.90it/s]

[100]	valid_0's ndcg@10: 0.877894


LGB fold 3:  17%|█▋        | 200/1200 [00:51<04:15,  3.91it/s]

[200]	valid_0's ndcg@10: 0.879697


LGB fold 3:  25%|██▌       | 300/1200 [01:16<03:35,  4.18it/s]

[300]	valid_0's ndcg@10: 0.881041


LGB fold 3:  33%|███▎      | 400/1200 [01:41<03:06,  4.28it/s]

[400]	valid_0's ndcg@10: 0.881799


LGB fold 3:  42%|████▏     | 501/1200 [02:02<02:05,  5.56it/s]

[500]	valid_0's ndcg@10: 0.882284


LGB fold 3:  50%|█████     | 601/1200 [02:21<01:53,  5.28it/s]

[600]	valid_0's ndcg@10: 0.882569


LGB fold 3:  58%|█████▊    | 700/1200 [02:39<01:35,  5.22it/s]

[700]	valid_0's ndcg@10: 0.882692


LGB fold 3:  67%|██████▋   | 801/1200 [02:57<01:07,  5.92it/s]

[800]	valid_0's ndcg@10: 0.882891


LGB fold 3:  75%|███████▌  | 901/1200 [03:13<00:50,  5.98it/s]

[900]	valid_0's ndcg@10: 0.88291


                                                              

fold 3: ndcg@10=0.31035


LGB fold 4:   8%|▊         | 100/1200 [00:25<04:46,  3.83it/s]

[100]	valid_0's ndcg@10: 0.878356


LGB fold 4:  17%|█▋        | 200/1200 [00:53<04:32,  3.67it/s]

[200]	valid_0's ndcg@10: 0.879526


LGB fold 4:  25%|██▌       | 300/1200 [01:20<03:31,  4.25it/s]

[300]	valid_0's ndcg@10: 0.880878


LGB fold 4:  33%|███▎      | 400/1200 [01:43<03:05,  4.31it/s]

[400]	valid_0's ndcg@10: 0.881661


LGB fold 4:  42%|████▏     | 500/1200 [02:04<02:30,  4.66it/s]

[500]	valid_0's ndcg@10: 0.881975


LGB fold 4:  50%|█████     | 601/1200 [02:23<01:51,  5.37it/s]

[600]	valid_0's ndcg@10: 0.882174


LGB fold 4:  58%|█████▊    | 701/1200 [02:41<01:34,  5.29it/s]

[700]	valid_0's ndcg@10: 0.882351


LGB fold 4:  67%|██████▋   | 801/1200 [02:58<01:10,  5.67it/s]

[800]	valid_0's ndcg@10: 0.882457


LGB fold 4:  75%|███████▌  | 901/1200 [03:14<00:51,  5.79it/s]

[900]	valid_0's ndcg@10: 0.882575


LGB fold 4:  83%|████████▎ | 1001/1200 [03:30<00:29,  6.63it/s]

[1000]	valid_0's ndcg@10: 0.882742


LGB fold 4:  92%|█████████▏| 1101/1200 [03:46<00:13,  7.16it/s]

[1100]	valid_0's ndcg@10: 0.882772


                                                               

[1200]	valid_0's ndcg@10: 0.882885
fold 4: ndcg@10=0.30966


LGB fold 5:   8%|▊         | 100/1200 [00:25<04:35,  3.99it/s]

[100]	valid_0's ndcg@10: 0.877957


LGB fold 5:  17%|█▋        | 200/1200 [00:50<04:07,  4.04it/s]

[200]	valid_0's ndcg@10: 0.880332


LGB fold 5:  25%|██▌       | 300/1200 [01:14<03:29,  4.30it/s]

[300]	valid_0's ndcg@10: 0.881367


LGB fold 5:  33%|███▎      | 400/1200 [01:37<03:06,  4.28it/s]

[400]	valid_0's ndcg@10: 0.881892


LGB fold 5:  42%|████▏     | 500/1200 [01:59<02:23,  4.87it/s]

[500]	valid_0's ndcg@10: 0.882514


LGB fold 5:  50%|█████     | 601/1200 [02:19<01:39,  6.01it/s]

[600]	valid_0's ndcg@10: 0.883005


LGB fold 5:  58%|█████▊    | 701/1200 [02:36<01:36,  5.16it/s]

[700]	valid_0's ndcg@10: 0.883133


LGB fold 5:  67%|██████▋   | 801/1200 [02:54<01:02,  6.41it/s]

[800]	valid_0's ndcg@10: 0.883353


LGB fold 5:  75%|███████▌  | 901/1200 [03:11<00:49,  5.99it/s]

[900]	valid_0's ndcg@10: 0.883413


LGB fold 5:  83%|████████▎ | 1001/1200 [03:27<00:33,  5.89it/s]

[1000]	valid_0's ndcg@10: 0.883479


LGB fold 5:  92%|█████████▏| 1101/1200 [03:42<00:16,  5.94it/s]

[1100]	valid_0's ndcg@10: 0.883619


                                                               

[1200]	valid_0's ndcg@10: 0.883694
fold 5: ndcg@10=0.31020
[LGBM] CV ndcg@10: 0.30976922610916974


In [13]:
# Persist out-of-fold and test predictions next to their (query_id, item_id) keys.
oof_path = os.path.join(OUT_DIR, "oof_lgb.parquet")
test_pred_path = os.path.join(OUT_DIR, "test_pred_lgb.parquet")

oof_frame = pl.DataFrame(
    {"query_id": train_feats["query_id"], "item_id": train_feats["item_id"], "oof_lgb": oof_lgb})
oof_frame.write_parquet(oof_path)

test_pred_frame = pl.DataFrame(
    {"query_id": test_feats["query_id"], "item_id": test_feats["item_id"], "pred_lgb": test_pred_lgb})
test_pred_frame.write_parquet(test_pred_path)

print("[save] out_lgbm/oof_lgb.parquet & out_lgbm/test_pred_lgb.parquet")

[save] out_lgbm/oof_lgb.parquet & out_lgbm/test_pred_lgb.parquet
