In [1]:
# LGBM refit-only submit (no OOF)
#-----------------------------------------------
import os, polars as pl, numpy as np, lightgbm as lgb
from typing import List
from tqdm.auto import tqdm
from lightgbm.callback import CallbackEnv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Global seed: used for NumPy's legacy global RNG here and passed to LightGBM via `params`.
SEED = 42
np.random.seed(SEED)

# Paths are resolved relative to the PARENT of the current working directory,
# so they depend on where the kernel was started.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")
# Feature tables for train and test, read below with polars.
TRAIN_FEATS_PATH = os.path.join(DATA_DIR, "train_feats2.parquet")
TEST_FEATS_PATH  = os.path.join(DATA_DIR, "test_feats2.parquet")

In [3]:

# Output file name for the submission written by the refit model.
SUB_REFIT = "solution_lgbm_refit.csv"

# Label column read from the train frame.
TARGET = "item_contact"
# Column whose values define the ranking groups (one group per query).
GROUP  = "query_id"
# Columns kept in the submission file, in this order.
ID_COLS = ["query_id","item_id"]


In [4]:
# === load ===
# Fail early and name the missing file, instead of a bare AssertionError
# that does not say which of the two paths is absent.
assert os.path.exists(TRAIN_FEATS_PATH), f"train features not found: {TRAIN_FEATS_PATH}"
assert os.path.exists(TEST_FEATS_PATH), f"test features not found: {TEST_FEATS_PATH}"
train_feats = pl.read_parquet(TRAIN_FEATS_PATH)
test_feats  = pl.read_parquet(TEST_FEATS_PATH)
print("[load] train_feats:", train_feats.shape, "| test_feats:", test_feats.shape)

[load] train_feats: (7781790, 72) | test_feats: (335348, 71)


In [5]:
# === features (your list) ===
FEATURES = [
    "cos_q_title",
    "query_len","title_len","abs_len_diff",
    "overlap_q_title","jaccard_q_title","dice_q_title","ratio_overlap_title",
    "title_contains_query","has_query_text","has_title",
    "same_cat","same_mcat","same_loc","same_cat_loc","triple_match",
    "freq_item_cat","freq_item_mcat","freq_item_loc","is_query_mcat_missing",
    "price_clip","price_log1p","price_rank_in_query","price_z_in_query","price_is_zero","price_vs_median_query",
    "conv_known","conv_filled","conv_z_in_query","conv_rank_in_query",
    "n_items_in_query","max_cos_in_query","cos_minus_max",
    "same_cat__cos","same_loc__cos","conv__cos","conv__same_loc","price_rank__same_loc",
]
# Keep only features present in BOTH frames. Filtering on train alone would let a
# column that is missing from test through, and building the test matrix would
# then fail on the unknown column.
_available = set(train_feats.columns) & set(test_feats.columns)
_dropped = [c for c in FEATURES if c not in _available]
if _dropped:
    # Make silent drops visible: a typo in the list above would otherwise go unnoticed.
    print("[features] dropped (missing in train or test):", _dropped)
FEATURES = [c for c in FEATURES if c in _available]

In [6]:

# === sanitize & prepare ===
def sanitize_features(df: pl.DataFrame, features: list[str]) -> pl.DataFrame:
    """Return `df` with the listed feature columns cleaned for model input.

    Names in `features` that are not columns of `df` are skipped. Each
    remaining column is replaced, under its own name, according to its dtype:

    * Float32 / Float64: non-finite values and nulls become 0.0, then the
      column is cast to Float32.
    * Boolean: cast to Int8, nulls become 0.
    * integer dtypes: nulls become 0 (dtype is left as is).
    * anything else: cast to Float32, nulls become 0.0.

    Columns not named in `features` are returned unchanged.
    """

    def _clean(name: str) -> pl.Expr:
        # Build the replacement expression for one column based on its dtype.
        column = pl.col(name)
        dtype = df.schema[name]
        if dtype in (pl.Float32, pl.Float64):
            finite_only = pl.when(column.is_finite()).then(column).otherwise(0.0)
            return finite_only.fill_null(0.0).cast(pl.Float32).alias(name)
        if dtype == pl.Boolean:
            return column.cast(pl.Int8).fill_null(0).alias(name)
        if dtype.is_integer():
            return column.fill_null(0).alias(name)
        return column.cast(pl.Float32).fill_null(0.0).alias(name)

    present = [name for name in features if name in df.columns]
    return df.with_columns([_clean(name) for name in present])

In [7]:

def prepare_matrix(df: pl.DataFrame, features: List[str]) -> np.ndarray:
    """Select `features` from `df`, cast each to Float32, and return a NumPy array.

    Columns appear in the array in the order given by `features`.
    """
    as_float32 = [pl.col(name).cast(pl.Float32) for name in features]
    matrix = df.select(as_float32).to_numpy()
    return matrix

In [8]:
# Clean the feature columns of both frames. The frames are rebound to their
# sanitized versions, so the raw values are no longer available after this cell.
train_feats = sanitize_features(train_feats, FEATURES)
test_feats  = sanitize_features(test_feats,  FEATURES)

# Labels and group keys, in the same row order as the train frame.
y = train_feats[TARGET].to_numpy()
g = train_feats[GROUP].to_numpy()
# Float32 feature matrices; column order follows FEATURES for both train and test.
X = prepare_matrix(train_feats, FEATURES)
X_test = prepare_matrix(test_feats, FEATURES)

In [9]:
print("[matrix] X:", X.shape, "y:", y.shape, "X_test:", X_test.shape)

# === group sizes for refit (rows must be ordered by query_id) ===
# Mergesort is stable, so rows keep their original relative order inside each query.
ord_idx = np.argsort(g, kind="mergesort")
X_all = X[ord_idx]
y_all = y[ord_idx]
g_all = g[ord_idx]
# np.unique returns counts in sorted-key order, which matches the row order of g_all,
# so group_sizes[i] is the length of the i-th contiguous query block.
_, group_sizes = np.unique(g_all, return_counts=True)

[matrix] X: (7781790, 38) y: (7781790,) X_test: (335348, 38)


In [10]:
# Full-train ranking dataset: rows are sorted by query, and `group` gives the
# number of consecutive rows belonging to each query.
dall = lgb.Dataset(X_all, label=y_all, group=group_sizes, feature_name=FEATURES)

In [12]:
# === fast/stable hyperparameters
# NOTE(review): `metric` / `ndcg_eval_at` only matter when evaluation sets are
# supplied; the training call in this notebook passes none — confirm they are
# kept intentionally.
# NOTE(review): device="gpu" presumably requires a GPU-enabled LightGBM build,
# and num_threads=0 presumably means "library default" — verify against the
# LightGBM parameter docs for the installed version.
params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    learning_rate=0.08,     # 0.05–0.10 is also a reasonable range
    num_leaves=63,
    min_data_in_leaf=200,
    feature_fraction=0.85,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    verbose=-1,
    seed=SEED,
    num_threads=0,
    device="gpu",
)

In [13]:
# Fixed number of boosting rounds for the refit. No validation set is passed to
# training in this notebook, so nothing here stops training earlier.
num_boost_round = 900

In [14]:
# === train a single final booster on the whole train set ===
print("[fit] refit on full train...")
model = lgb.train(params, dall, num_boost_round=num_boost_round)

[fit] refit on full train...


In [15]:
# === predictions on the test set ===
# One score per row of X_test, in the same row order as test_feats.
test_pred = model.predict(X_test)

In [16]:
# === submission ===
# Attach the scores to the id columns, order rows by query (ascending) and by
# score within each query (descending), then keep only the id columns.
sub = test_feats.select(ID_COLS)
sub = sub.with_columns(pl.Series("pred", test_pred))
sub = sub.sort(["query_id","pred"], descending=[False, True])
sub = sub.select(ID_COLS)

sub.write_csv(SUB_REFIT, include_header=True)
print(f"[save] {SUB_REFIT}  rows={sub.height}")

[save] solution_lgbm_refit.csv  rows=335348


In [None]:
# === (optional) second submission — per-query z-normalisation of the scores ===
# NOTE(review): within one query the z-score is (pred - mean) / (std + 1e-6), i.e. a
# strictly increasing function of `pred` (the divisor is always positive). Sorting by it
# inside each query should therefore give the same order as sorting by `pred`, up to
# floating-point ties — confirm this second file is actually needed.
do_z = True
if do_z:
    df_pred = test_feats.select(ID_COLS).with_columns(pl.Series("pred", test_pred))

    # Per-query mean and population std (ddof=0) of the raw scores.
    df_pred = df_pred.with_columns([
        pl.col("pred").mean().over("query_id").alias("__m"),
        pl.col("pred").std(ddof=0).fill_null(0.0).over("query_id").alias("__s"),
    ])
    # Standardise; the 1e-6 keeps the division defined when a query has zero spread.
    df_pred = df_pred.with_columns([
        ((pl.col("pred") - pl.col("__m")) / (pl.col("__s")+1e-6)).alias("pred_z_in_query")
    ])
    df_pred = df_pred.drop(["__m","__s"])

    sub_z = df_pred.sort(["query_id","pred_z_in_query"], descending=[False, True])
    sub_z = sub_z.select(ID_COLS)

    sub_z.write_csv("solution_lgbm_refit_z.csv", include_header=True)
    print("[save] solution_lgbm_refit_z.csv  rows=", sub_z.height)