In [2]:
# # BM25 + LightGBMRanker (hold-out, без шардирования) с лемматизацией spaCy
#
# Быстрый энд-ту-энд ноутбук:
# - Polars: ленивое чтение и подготовка уникальных queries/items
# - spaCy ru_core_news_sm: токенизация + леммы (можно отключить)
# - BM25 по (title + первые N лемм из description)
# - Hold-out по query_id (одна валидация)
# - LightGBMRanker (LambdaRank), метрика NDCG@10 с decay 0.97^pos

In [1]:
# ## Импорт и настройки

import os
import time
import gc
import polars as pl
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from rank_bm25 import BM25Okapi
import lightgbm as lgb

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Data locations: the "Data" folder is expected next to the current working directory's parent.
BASE_DIR = os.path.split(os.getcwd())[0]
DATA_DIR = os.path.join(BASE_DIR, "Data")
TRAIN_PATH = os.path.join(BASE_DIR, "Data", "train-dset.parquet")
TEST_PATH = os.path.join(BASE_DIR, "Data", "test-dset-small.parquet")

# Pipeline parameters
RANDOM_STATE = 42
# Cut-off for the ranking metric (NDCG@10).
EVAL_AT = 10
# Character-level pre-truncation of the item description.
DESC_CHARS_LIMIT = 1500
# Maximum number of description tokens (lemmas) fed to BM25.
DESC_TOKENS_LIMIT = 80
# Switch: True - spaCy lemmas, False - simple tokenization.
USE_SPACY_LEMMAS = False
# Drop stop words in the spaCy branch.
REMOVE_STOPWORDS = True
# Discard shorter tokens in the spaCy branch.
MIN_LEMMA_LEN = 2

In [3]:
# ## Утилиты

def ndcg_at_k(y_true: np.ndarray, y_score: np.ndarray, qids: np.ndarray, k: int = 10) -> float:
    """Mean NDCG@k over queries, using a geometric position discount of 0.97**pos.

    Positions are counted from zero, so the top-ranked item has weight 1.0.

    Args:
        y_true: Relevance label per row.
        y_score: Predicted score per row (higher means ranked earlier).
        qids: Query identifier per row; rows of one query need not be contiguous.
        k: Number of top positions that contribute to DCG and IDCG.

    Returns:
        The average of DCG/IDCG over queries whose ideal DCG is positive.
        Queries without any positive gain are skipped; 0.0 is returned when
        no query qualifies (including empty input).
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    qids = np.asarray(qids)
    top_k = max(int(k), 0)

    # Stable sort makes each query a contiguous slice while keeping row order inside it.
    order = np.argsort(qids, kind="mergesort")
    y_true = y_true[order]
    y_score = y_score[order]
    qids = qids[order]

    _, starts = np.unique(qids, return_index=True)
    bounds = np.append(starts, len(qids))

    # Only the first k positions are ever used, so the discount table is sized by k.
    # (A fixed-size table multiplied before truncation breaks on queries longer than the table.)
    discount = 0.97 ** np.arange(top_k)

    total = 0.0
    cnt = 0
    for s, e in zip(bounds[:-1], bounds[1:]):
        y = y_true[s:e]
        ranked = np.argsort(y_score[s:e])[::-1][:top_k]
        gains = y[ranked]
        dcg = (gains * discount[:len(gains)]).sum()
        ideal = np.sort(y)[::-1][:top_k]
        idcg = (ideal * discount[:len(ideal)]).sum()
        if idcg > 0:
            total += dcg / idcg
            cnt += 1
    return total / max(cnt, 1)

def group_sizes(qids: np.ndarray) -> list[int]:
    """Return the number of rows per distinct query id.

    The counts follow the sorted order of the unique ids (as produced by
    ``np.unique``), not the order of first appearance in ``qids``.
    """
    per_query = np.unique(qids, return_counts=True)[1]
    return per_query.tolist()

def print_step(msg: str):
    """Print a progress message prefixed with the ``[step]`` tag."""
    line = "[step] {}".format(msg)
    print(line)


In [4]:
# ## Ленивое чтение Parquet и подготовка уникальных сущностей

# scan_parquet only builds LazyFrames here; the data is materialized later via collect().
t0 = time.time()
train_lf, test_lf = (pl.scan_parquet(path) for path in (TRAIN_PATH, TEST_PATH))
print_step(f"lazy sources ready in {time.time()-t0:.2f}s")

[step] lazy sources ready in 0.00s


In [5]:
# Unique queries across train and test: one row is kept per query_id.
queries_lf = pl.concat(
    [lf.select("query_id", "query_text") for lf in (train_lf, test_lf)]
).unique(subset=["query_id"])

In [6]:
# Unique items across train and test; the description is cut to DESC_CHARS_LIMIT
# up front so later text processing works on a bounded string.
items_lf = (
    pl.concat(
        [
            lf.select("item_id", "item_title", "item_description")
            for lf in (train_lf, test_lf)
        ]
    )
    .unique(subset=["item_id"])
    .with_columns(
        desc_short=pl.col("item_description")
        .cast(pl.Utf8)
        .fill_null("")
        .str.slice(0, DESC_CHARS_LIMIT)
    )
    .select("item_id", "item_title", "desc_short")
)

In [7]:
t1 = time.time()
# The `streaming=True` flag is deprecated in recent polars (it triggers a warning on this call);
# the streaming engine is selected through the `engine` argument instead.
queries_df = queries_lf.collect(engine="streaming")

  queries_df = queries_lf.collect(streaming=True)


In [11]:
# The `streaming=True` flag is deprecated in recent polars (it triggers a warning on this call);
# the streaming engine is selected through the `engine` argument instead.
items_df = items_lf.collect(engine="streaming")
print_step(f"collect uniques: queries={queries_df.shape}, items={items_df.shape}, time={time.time()-t1:.2f}s")

  items_df   = items_lf.collect(streaming=True)


[step] collect uniques: queries=(690695, 2), items=(5986464, 3), time=129.98s


In [24]:
def norm_expr(col) -> pl.Expr:
    """Build a polars expression that normalizes text before whitespace tokenization.

    Args:
        col: Column name, or an already constructed expression.

    Returns:
        An expression that casts to string, replaces nulls with "", lowercases,
        turns runs of the listed punctuation/symbol characters into a single
        space, collapses whitespace and strips the ends.
    """
    source = col if not isinstance(col, str) else pl.col(col)
    text = source.cast(pl.Utf8).fill_null("")
    lowered = text.str.to_lowercase()
    # Punctuation and symbols act as token separators.
    without_punct = lowered.str.replace_all(
        r"[.,;:!?\(\)\[\]\{\}\"'«»“”’/\\\-\+_=~`|<>^€$#@%&*]+", " "
    )
    single_spaced = without_punct.str.replace_all(r"\s+", " ")
    return single_spaced.str.strip_chars()


In [25]:
# Tokens for the unique queries: normalized text split on single spaces.
queries_tok = queries_df.select(
    "query_id",
    norm_expr("query_text").str.split(" ").alias("query_tokens"),
)

In [28]:
# Document tokens per item: all title tokens followed by at most
# DESC_TOKENS_LIMIT tokens from the (already shortened) description.
items_tok = items_df.select(
    "item_id",
    pl.concat_list(
        [
            norm_expr("item_title").str.split(" "),
            norm_expr("desc_short").str.split(" ").list.head(DESC_TOKENS_LIMIT),
        ]
    ).alias("doc_tokens"),
)

In [29]:
# --- attach query/item tokens to the (query, item) pairs; materialized later ---
train_join_lf = train_lf.join(
    queries_tok.lazy(), how="left", on="query_id"
).join(
    items_tok.lazy(), how="left", on="item_id"
).select("query_id", "item_id", "item_contact", "query_tokens", "doc_tokens")

In [30]:
# Same token join for the test pairs (no item_contact column is selected here).
test_join_lf = test_lf.join(
    queries_tok.lazy(), how="left", on="query_id"
).join(
    items_tok.lazy(), how="left", on="item_id"
).select("query_id", "item_id", "query_tokens", "doc_tokens")

In [None]:
t2 = time.time()
# The `streaming=True` flag is deprecated in recent polars (it triggers a warning on this call);
# the streaming engine is selected through the `engine` argument instead.
train_df = train_join_lf.collect(engine="streaming")

  train_df = train_join_lf.collect(streaming=True)


In [None]:
# The `streaming=True` flag is deprecated in recent polars;
# the streaming engine is selected through the `engine` argument instead.
test_df = test_join_lf.collect(engine="streaming")
print_step(f"joined: train={train_df.shape}, test={test_df.shape}, time={time.time()-t2:.2f}s")


In [None]:
# --- BM25 по query_id с progress bar ---
from tqdm.auto import tqdm
from rank_bm25 import BM25Okapi
import numpy as np

In [None]:
def bm25_scores_with_progress(df: pl.DataFrame, desc: str) -> pl.DataFrame:
    """Score every (query, item) row with BM25, fitting one model per query.

    For each distinct ``query_id`` the ``doc_tokens`` of that query's rows form
    the corpus of a fresh ``BM25Okapi`` model, which is then scored against the
    ``query_tokens`` taken from the first row of the group.

    Args:
        df: Frame with ``query_id``, ``query_tokens`` and ``doc_tokens`` columns;
            the token columns are expected to hold lists of strings (or null).
        desc: Label shown on the tqdm progress bar.

    Returns:
        ``df`` with an additional Float32 column ``bm25_doc`` aligned row by row.
    """

    def _tokens(value) -> list:
        # Null cells and the [""] that str.split(" ") yields for an empty string
        # both collapse to a single placeholder token, so BM25 never sees an
        # empty document or an empty query.
        cleaned = [tok for tok in value if tok] if isinstance(value, list) else []
        return cleaned or ["<empty>"]

    scores = np.zeros(df.height, dtype=np.float32)
    if df.height == 0:
        return df.with_columns(pl.Series("bm25_doc", scores))

    # Group row positions by query_id with numpy: a stable argsort makes every
    # query a contiguous run, which is then split at the first index of each id.
    qids = df["query_id"].to_numpy()
    order = np.argsort(qids, kind="stable")
    _, starts = np.unique(qids[order], return_index=True)
    groups = np.split(order, starts[1:])

    doc_col = df["doc_tokens"]
    query_col = df["query_tokens"]
    for idxs in tqdm(groups, total=len(groups), desc=desc, leave=True):
        idxs = idxs.astype(np.int64, copy=False)
        docs = [_tokens(d) for d in doc_col[idxs].to_list()]
        bm25 = BM25Okapi(docs)
        # Indexing a List column with a plain int returns a pl.Series rather
        # than a Python list, so the query is fetched through to_list() instead.
        q = _tokens(query_col[idxs[:1]].to_list()[0])
        scores[idxs] = bm25.get_scores(q).astype(np.float32)
    return df.with_columns(pl.Series("bm25_doc", scores))