In [1]:
import os
import polars as pl
import numpy as np

In [2]:
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "Data")
TRAIN_PATH = os.path.join(DATA_DIR, "train-dset.parquet")
TEST_PATH  = os.path.join(DATA_DIR, "test-dset-small.parquet")

TRAIN_COS = os.path.join(DATA_DIR, "train_cos.parquet")
TEST_COS  = os.path.join(DATA_DIR, "test_cos.parquet")

In [3]:
def tokenize(col: str) -> pl.Expr:
    # простая нормализация: lower, убрать небукв/цифр, свести повторные пробелы, split
    return (
        pl.col(col).cast(pl.Utf8).fill_null("")
        .str.to_lowercase()
        .str.replace_all(r"ё", "е")
        .str.replace_all(r"[^0-9\p{L}]+", " ")
        .str.strip_chars()
        .str.split(" ")
        .list.eval(pl.element().filter(pl.element() != ""))
    )

In [4]:
def add_text_feats(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns([
        tokenize("query_text").alias("query_tokens"),
        tokenize("item_title").alias("title_tokens"),
    ])
    df = df.with_columns([
        pl.col("query_tokens").list.len().alias("query_len"),
        pl.col("title_tokens").list.len().alias("title_len"),
    ])
    # пересечения и нормализованные меры
    df = df.with_columns([
        pl.col("query_tokens").list.set_intersection(pl.col("title_tokens")).list.len().alias("overlap_q_title"),
    ])
    df = df.with_columns([
        (pl.col("overlap_q_title") / (pl.col("query_len")+pl.col("title_len")-pl.col("overlap_q_title")+1e-6)).alias("jaccard_q_title"),
        (2*pl.col("overlap_q_title") / (pl.col("query_len")+pl.col("title_len")+1e-6)).alias("dice_q_title"),
        (pl.col("overlap_q_title") / (pl.col("query_len")+1)).alias("ratio_overlap_title"),
        (pl.col("title_len") - pl.col("query_len")).abs().alias("abs_len_diff"),
    ])
    # строгая проверка вхождения строки
    norm_q = (
        pl.col("query_text").cast(pl.Utf8).fill_null("").str.to_lowercase()
        .str.replace_all(r"ё","е").str.replace_all(r"[^0-9\p{L}]+"," ").str.strip_chars()
    )
    norm_t = (
        pl.col("item_title").cast(pl.Utf8).fill_null("").str.to_lowercase()
        .str.replace_all(r"ё","е").str.replace_all(r"[^0-9\p{L}]+"," ").str.strip_chars()
    )
    df = df.with_columns([
        norm_q.alias("__qnorm"), norm_t.alias("__tnorm"),
        (pl.col("query_text").fill_null("") != "").alias("has_query_text"),
        (pl.col("item_title").fill_null("") != "").alias("has_title"),
    ]).with_columns([
        pl.col("__tnorm").str.contains(pl.col("__qnorm")).fill_null(False).alias("title_contains_query")
    ]).drop(["__qnorm","__tnorm"])
    return df

In [5]:
def build_global_freqs(train_lf: pl.LazyFrame, test_lf: pl.LazyFrame) -> dict[str, pl.DataFrame]:
    both = pl.concat([train_lf, test_lf])
    # частоты по item_* (глобальные)
    freq_cat  = both.select("item_cat_id").group_by("item_cat_id").agg(pl.len().alias("freq_item_cat"))
    freq_mcat = both.select("item_mcat_id").group_by("item_mcat_id").agg(pl.len().alias("freq_item_mcat"))
    freq_loc  = both.select("item_loc").group_by("item_loc").agg(pl.len().alias("freq_item_loc"))
    return {"cat": freq_cat.collect(streaming=True),
            "mcat": freq_mcat.collect(streaming=True),
            "loc": freq_loc.collect(streaming=True)}

In [15]:
def add_basic_feats(df: pl.DataFrame, cos_df: pl.DataFrame, freqs: dict[str, pl.DataFrame]) -> pl.DataFrame:
    # косинус
    df = df.join(cos_df, on=["query_id","item_id"], how="left")

    # совпадения категорий/локаций
    df = df.with_columns([
        (pl.col("query_cat")==pl.col("item_cat_id")).cast(pl.Int8).alias("same_cat"),
        (pl.col("query_mcat")==pl.col("item_mcat_id")).cast(pl.Int8).alias("same_mcat"),
        (pl.col("query_loc")==pl.col("item_loc")).cast(pl.Int8).alias("same_loc"),
    ])
    df = df.with_columns([
        (pl.col("same_cat") & pl.col("same_mcat") & pl.col("same_loc")).cast(pl.Int8).alias("triple_match"),
        (pl.col("same_cat") & pl.col("same_loc")).cast(pl.Int8).alias("same_cat_loc"),
        (pl.col("query_mcat").is_null()).cast(pl.Int8).alias("is_query_mcat_missing"),
    ])

    # частоты (джойним глобальные)
    df = df.join(freqs["cat"],  on="item_cat_id",  how="left")
    df = df.join(freqs["mcat"], on="item_mcat_id", how="left")
    df = df.join(freqs["loc"],  on="item_loc",     how="left")
    for c in ["freq_item_cat","freq_item_mcat","freq_item_loc"]:
        if c in df.columns:
            df = df.with_columns(pl.col(c).fill_null(0).cast(pl.Int32))

    # цена: клип/лог/ранги/з-скор внутри запроса/медиана
    p99 = float(df.select(pl.col("price").quantile(0.99)).item())
    df = df.with_columns([
        pl.col("price").clip(0, p99).alias("price_clip"),
        pl.col("price").clip(0, p99).log1p().alias("price_log1p"),
        (pl.col("price")==0).cast(pl.Int8).alias("price_is_zero"),
    ])
    # пер-запросные агрегаты по цене
    q_price_aggs = df.group_by("query_id").agg([
        pl.col("price_clip").median().alias("q_price_median"),
        pl.col("price_clip").mean().alias("q_price_mean"),
        pl.col("price_clip").std(ddof=0).fill_null(0.0).alias("q_price_std"),
        pl.len().alias("n_items_in_query"),
        pl.col("cos_q_title").max().alias("max_cos_in_query"),
    ])
    df = df.join(q_price_aggs, on="query_id", how="left")
    df = df.with_columns([
        (pl.col("price_clip") / (pl.col("q_price_median")+1)).alias("price_vs_median_query"),
        ((pl.col("price_clip") - pl.col("q_price_mean"))/(pl.col("q_price_std")+1e-6)).alias("price_z_in_query"),
        pl.col("price_clip").rank("ordinal").over("query_id").alias("price_rank_in_query"),
        (pl.col("cos_q_title") - pl.col("max_cos_in_query")).alias("cos_minus_max"),
    ])

    # поведенческие: conv -1 → NaN → заполнения/ранги/з-скор
    df = df.with_columns([
        (pl.col("item_query_click_conv") != -1).alias("conv_known"),
        pl.when(pl.col("item_query_click_conv")==-1).then(None).otherwise(pl.col("item_query_click_conv")).alias("conv_raw")
    ])
    q_conv_aggs = df.group_by("query_id").agg([
        pl.col("conv_raw").mean().alias("q_conv_mean"),
        pl.col("conv_raw").std(ddof=0).fill_null(0.0).alias("q_conv_std"),
    ])
    df = df.join(q_conv_aggs, on="query_id", how="left")
    df = df.with_columns([
        pl.col("conv_raw").fill_null(0.0).alias("conv_filled"),
        ((pl.col("conv_raw") - pl.col("q_conv_mean"))/(pl.col("q_conv_std")+1e-6)).alias("conv_z_in_query"),
        pl.col("conv_raw").fill_null(-1.0).rank("ordinal").over("query_id").alias("conv_rank_in_query"),
    ])

    # интеракции
    df = df.with_columns([
        (pl.col("same_cat") * pl.col("cos_q_title")).alias("same_cat__cos"),
        (pl.col("same_loc") * pl.col("cos_q_title")).alias("same_loc__cos"),
        (pl.col("conv_filled") * pl.col("cos_q_title")).alias("conv__cos"),
        (pl.col("conv_filled") * pl.col("same_loc")).alias("conv__same_loc"),
        (pl.col("price_rank_in_query") * pl.col("same_loc")).alias("price_rank__same_loc"),
    ])

    return df

In [7]:
# ---------- сборка train/test ----------

train_lf = pl.scan_parquet(TRAIN_PATH)
test_lf  = pl.scan_parquet(TEST_PATH)

In [8]:
freqs = build_global_freqs(
    train_lf.select(["item_cat_id","item_mcat_id","item_loc"]),
    test_lf.select(["item_cat_id","item_mcat_id","item_loc"])
)

  return {"cat": freq_cat.collect(streaming=True),
  "mcat": freq_mcat.collect(streaming=True),
  "loc": freq_loc.collect(streaming=True)}


In [9]:
test_base  = test_lf.collect(streaming=True)

  test_base  = test_lf.collect(streaming=True)


In [10]:
train_base = train_lf.collect(streaming=True)

  train_base = train_lf.collect(streaming=True)


In [11]:
# текстовые фичи
test_txt  = add_text_feats(test_base)

In [12]:
train_txt = add_text_feats(train_base)

In [13]:
# косинусы
train_cos = pl.read_parquet(TRAIN_COS)
test_cos  = pl.read_parquet(TEST_COS)

In [16]:
# базовые + цена/поведение/интеракции/группы
train_feats = add_basic_feats(train_txt, train_cos, freqs)
test_feats  = add_basic_feats(test_txt,  test_cos,  freqs)

In [17]:
print("[train_feats]", train_feats.shape)
print("[test_feats ]", test_feats.shape)

[train_feats] (7781790, 60)
[test_feats ] (335348, 59)


In [18]:
train_feats.write_parquet("train_feats.parquet", compression="zstd")
test_feats.write_parquet("test_feats.parquet", compression="zstd")
