### Feature Engineering

В этом ноутбуке:

1. проверю, что в данных есть все нужные колонки;
2. добавлю простые и устойчивые признаки:
    - `price_log`, `price_norm`, `is_free`;
    - клики `click_conv` (с клиппингом выбросов);
    - длины текстов, матчи по категориям/локациям;
    - TF-IDF косинус между запросом и заголовком/описанием (`tfidf_cosine_title`, `tfidf_cosine_desc`);
    - **новое**: `price_rank_in_query`, `q_in_title_frac`, `q_in_desc_frac`, частотные признаки `*_freq`.
3. сохраню результат в Parquet.


In [1]:
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys, os

# Put the parent directory on the import path so that `src.*` can be imported below.
sys.path.append(os.path.abspath(".."))

# Default look for every figure drawn in this notebook.
plt.rcParams.update({"figure.figsize": (8, 4), "axes.grid": True})

# Seed NumPy's global RNG so that any random step is repeatable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input locations: the train sample and the raw test set.
DATA_DIR = "../data/raw"
SAMPLE_PATH = "../data/transformed/train-sample.parquet"
TEST_PATH = os.path.join(DATA_DIR, "test-dset-small.parquet")


##### Загрузка данных


In [2]:
# Load the train sample and the test set.
sample_df = pd.read_parquet(SAMPLE_PATH, engine="pyarrow")
test_df = pd.read_parquet(TEST_PATH, engine="pyarrow")

# Columns the feature functions below rely on.
required_columns = [
    # identifiers and target
    "query_id",
    "item_id",
    "item_contact",
    # texts
    "query_text",
    "item_title",
    "item_description",
    # query-side category / location
    "query_cat",
    "query_mcat",
    "query_loc",
    # item-side category / location
    "item_cat_id",
    "item_mcat_id",
    "item_loc",
    # numeric signals
    "price",
    "item_query_click_conv",
]

# The target `item_contact` is only required in train; it is skipped for test.
missing_train = [col for col in required_columns if col not in sample_df.columns]
missing_test = [
    col
    for col in required_columns
    if col != "item_contact" and col not in test_df.columns
]

# Fail fast, train first, with the list of absent columns in the message.
if missing_train:
    raise ValueError(f"sample_df missing columns: {missing_train}")
if missing_test:
    raise ValueError(f"test_df missing columns: {missing_test}")

print("OK: columns present in train/test")

sample_df.head(3)


OK: columns present in train/test


Unnamed: 0,query_id,item_id,query_text,item_title,item_description,query_cat,query_mcat,query_loc,item_cat_id,item_mcat_id,item_loc,price,item_query_click_conv,item_contact
0,112133,4479307056,памперсы для взрослых,Памперсы для взрослых l 30 штук,несколько упаковок в наличии,88.0,430.0,624030.0,88,430,624030,550.0,0.286,1.0
1,112133,2531208467,памперсы для взрослых,Памперсы для взрослых Senso,Памперсы для взрослых Senso\r\nРазмер Medium,88.0,430.0,624030.0,88,430,624030,450.0,0.0,0.0
2,112133,4427760183,памперсы для взрослых,Памперсы для взрослых Senso,Подгузники для взрослых. Размер М (70-120 см)....,88.0,430.0,624030.0,88,430,624030,500.0,0.0,0.0


Мы заранее считаем две вещи **по train**:

-   `price_median_cat`: медиана цены в каждой категории — чтобы делать нормировку цены;
-   `clip_val` для кликов — 99-й перцентиль, чтобы обрезать выбросы.


In [3]:
# 2.1 Median price per item_cat_id, computed on the train sample only.
med_price_ser = (
    sample_df.groupby("item_cat_id", observed=False)["price"]
    .median()
    .rename("price_median_cat")
)
# Plain {item_cat_id -> median price} lookup with object-typed keys.
# The grouped index already carries the dtype of sample_df["item_cat_id"], so the
# former "cast back to the column dtype" step (wrapped in a blanket
# `except Exception`) could not change the result and has been dropped.
med_price_map = dict(zip(med_price_ser.index.astype("object"), med_price_ser))

# 2.2 Clipping threshold for CTR: the 99th percentile of the train click conversion.
clip_val = float(sample_df["item_query_click_conv"].quantile(0.99))
if np.isnan(clip_val):
    # The quantile is NaN when the column has no valid values; fall back to 0.
    clip_val = 0.0

print(
    "Lookup ready: price_median_cat (per item_cat_id), clip_val for CTR =",
    round(clip_val, 6),
)


Lookup ready: price_median_cat (per item_cat_id), clip_val for CTR = 0.2


##### Функции для добавления фичей

-   **Цена**: логарифм, нормализация по медиане категории, флаг бесплатного товара.
-   **Клики**: заменяем пропуски на 0 и обрезаем слишком большие значения.
-   **Тексты**: добавляем длины (в символах/словах) + доли попаданий токенов запроса.
-   **Матчи**: равны ли категории/локации запроса и объявления (1/0).
-   **TF-IDF косинус**: насколько `query_text` похож на `item_title` (`tfidf_cosine_title`) и на описание (`tfidf_cosine_desc`).
-   **Внутризапросная цена**: место цены среди соседей (`price_rank_in_query`).
-   **Частоты значений**: как часто встречаются категории/локации (частотное кодирование).


In [4]:
from src.features import (
    fill_missing_cats,
    add_price_features,
    add_ctr_features,
    add_text_features,
    add_match_features,
    add_price_in_query_feats,
    add_price_z_in_query,
    add_tfidf_groupwise_cosine,
    add_2d_freq,
    add_freq_enc,
    tweak_click_conv,
    add_basic_interactions,
    add_group_norms,
    add_initial_rank,
    add_tfidf_rank,
    add_numeric_and_unit_matches,
    add_price_bins_by_category,
    add_price_over_median_cat_loc,
)

Применяю функции к train/test


In [5]:
# Work on copies so the loaded frames stay as they were read from disk.
train_fe, test_fe = sample_df.copy(), test_df.copy()

In [6]:
# Fill missing values of the categorical columns with "MISSING"
cat_cols = [
    # query side
    "query_cat",
    "query_mcat",
    "query_loc",
    # item side
    "item_cat_id",
    "item_mcat_id",
    "item_loc",
]
train_fe, test_fe = (
    fill_missing_cats(train_fe, cat_cols),
    fill_missing_cats(test_fe, cat_cols),
)


In [7]:
# Add the initial position of each row within its query_id
train_fe, test_fe = add_initial_rank(train_fe), add_initial_rank(test_fe)

[START] add_initial_rank | shape=(342886, 14)
[END]   add_initial_rank | shape=(342886, 20) | Δcols=6 | time=0.09s
------------------------------------------------------------
[START] add_initial_rank | shape=(335348, 13)
[END]   add_initial_rank | shape=(335348, 19) | Δcols=6 | time=0.05s
------------------------------------------------------------


In [8]:
# Add price_log, price_norm, is_free, price_median_cat
# (medians come from the train-only lookup `med_price_map`)
train_fe, test_fe = (
    add_price_features(train_fe, med_price_map),
    add_price_features(test_fe, med_price_map),
)

# Add price_z_in_query
train_fe, test_fe = add_price_z_in_query(train_fe), add_price_z_in_query(test_fe)

# Price bins per category; the same column arguments are used for train and test.
price_bin_kwargs = dict(cat_col="item_cat_id", price_col="price")
train_fe, test_fe = (
    add_price_bins_by_category(train_fe, **price_bin_kwargs),
    add_price_bins_by_category(test_fe, **price_bin_kwargs),
)

# Price relative to the median of the (category, location) pair.
cat_loc_kwargs = dict(cat_col="item_cat_id", loc_col="item_loc", price_col="price")
train_fe, test_fe = (
    add_price_over_median_cat_loc(train_fe, **cat_loc_kwargs),
    add_price_over_median_cat_loc(test_fe, **cat_loc_kwargs),
)

[START] add_price_features | shape=(342886, 20)
[END]   add_price_features | shape=(342886, 23) | Δcols=3 | time=0.12s
------------------------------------------------------------
[START] add_price_features | shape=(335348, 19)
[END]   add_price_features | shape=(335348, 22) | Δcols=3 | time=0.11s
------------------------------------------------------------
[START] add_price_z_in_query | shape=(342886, 23)
[END]   add_price_z_in_query | shape=(342886, 24) | Δcols=1 | time=0.03s
------------------------------------------------------------
[START] add_price_z_in_query | shape=(335348, 22)
[END]   add_price_z_in_query | shape=(335348, 23) | Δcols=1 | time=0.02s
------------------------------------------------------------
[START] add_price_bins_by_category | shape=(342886, 24)
[END]   add_price_bins_by_category | shape=(342886, 27) | Δcols=3 | time=0.12s
------------------------------------------------------------
[START] add_price_bins_by_category | shape=(335348, 23)


  grp = out.groupby(cat_col, dropna=False)
  grp = out.groupby(cat_col, dropna=False)


[END]   add_price_bins_by_category | shape=(335348, 26) | Δcols=3 | time=0.08s
------------------------------------------------------------
[START] add_price_over_median_cat_loc | shape=(342886, 27)
[END]   add_price_over_median_cat_loc | shape=(342886, 28) | Δcols=1 | time=0.07s
------------------------------------------------------------
[START] add_price_over_median_cat_loc | shape=(335348, 26)
[END]   add_price_over_median_cat_loc | shape=(335348, 27) | Δcols=1 | time=0.08s
------------------------------------------------------------


  grp = out.groupby([cat_col, loc_col], dropna=False)
  grp = out.groupby([cat_col, loc_col], dropna=False)


In [9]:
# Matches of units of measurement
# (slow step: roughly 70 s per frame in the recorded run)
train_fe, test_fe = (
    add_numeric_and_unit_matches(train_fe),
    add_numeric_and_unit_matches(test_fe),
)

[START] add_numeric_and_unit_matches | shape=(342886, 28)
[END]   add_numeric_and_unit_matches | shape=(342886, 35) | Δcols=7 | time=68.40s
------------------------------------------------------------
[START] add_numeric_and_unit_matches | shape=(335348, 27)
[END]   add_numeric_and_unit_matches | shape=(335348, 34) | Δcols=7 | time=69.79s
------------------------------------------------------------


In [10]:
# Attribute matches between the query and the item
train_fe, test_fe = add_match_features(train_fe), add_match_features(test_fe)

[START] add_match_features | shape=(342886, 35)
[END]   add_match_features | shape=(342886, 38) | Δcols=3 | time=0.15s
------------------------------------------------------------
[START] add_match_features | shape=(335348, 34)
[END]   add_match_features | shape=(335348, 37) | Δcols=3 | time=0.13s
------------------------------------------------------------


In [11]:
# Popularity: CTR features using the clipping threshold computed on train
train_fe, test_fe = (
    add_ctr_features(train_fe, clip_val),
    add_ctr_features(test_fe, clip_val),
)

# Additional click-conversion columns
train_fe, test_fe = tweak_click_conv(train_fe), tweak_click_conv(test_fe)

[START] add_ctr_features | shape=(342886, 38)
[END]   add_ctr_features | shape=(342886, 39) | Δcols=1 | time=0.04s
------------------------------------------------------------
[START] add_ctr_features | shape=(335348, 37)
[END]   add_ctr_features | shape=(335348, 38) | Δcols=1 | time=0.03s
------------------------------------------------------------
[START] tweak_click_conv | shape=(342886, 39)
[END]   tweak_click_conv | shape=(342886, 41) | Δcols=2 | time=0.04s
------------------------------------------------------------
[START] tweak_click_conv | shape=(335348, 38)
[END]   tweak_click_conv | shape=(335348, 40) | Δcols=2 | time=0.03s
------------------------------------------------------------


In [12]:
# Simple lexical features: text lengths, matching tokens
train_fe, test_fe = add_text_features(train_fe), add_text_features(test_fe)

[START] add_text_features | shape=(342886, 41)


[END]   add_text_features | shape=(342886, 49) | Δcols=8 | time=27.76s
------------------------------------------------------------
[START] add_text_features | shape=(335348, 40)
[END]   add_text_features | shape=(335348, 48) | Δcols=8 | time=26.76s
------------------------------------------------------------


In [13]:
# Per-query TF-IDF cosine features; with_desc=True requests the description
# score in addition to the title score (two columns are added in the run log).
train_fe, test_fe = (
    add_tfidf_groupwise_cosine(train_fe, with_desc=True),
    add_tfidf_groupwise_cosine(test_fe, with_desc=True),
)

# Rank features for each cosine column, train first and then test.
# NOTE(review): the recorded log shows Δcols=0 for the "tfidf_cosine_desc" calls.
# Presumably add_tfidf_rank writes to a fixed output column, so the second pass
# overwrites the title rank instead of adding a new one. Verify in src.features.
for cosine_col in ("tfidf_cosine_title", "tfidf_cosine_desc"):
    train_fe = add_tfidf_rank(train_fe, tfidf_col=cosine_col)
    test_fe = add_tfidf_rank(test_fe, tfidf_col=cosine_col)

[START] add_tfidf_groupwise_cosine | shape=(342886, 49)


TFIDF per query: 100%|██████████| 30000/30000 [04:36<00:00, 108.38it/s]


[END]   add_tfidf_groupwise_cosine | shape=(342886, 51) | Δcols=2 | time=277.22s
------------------------------------------------------------
[START] add_tfidf_groupwise_cosine | shape=(335348, 48)


TFIDF per query: 100%|██████████| 12505/12505 [02:51<00:00, 72.73it/s]


[END]   add_tfidf_groupwise_cosine | shape=(335348, 50) | Δcols=2 | time=172.21s
------------------------------------------------------------
[START] add_tfidf_rank | shape=(342886, 51)
[END]   add_tfidf_rank | shape=(342886, 52) | Δcols=1 | time=0.12s
------------------------------------------------------------
[START] add_tfidf_rank | shape=(335348, 50)
[END]   add_tfidf_rank | shape=(335348, 51) | Δcols=1 | time=0.10s
------------------------------------------------------------
[START] add_tfidf_rank | shape=(342886, 52)
[END]   add_tfidf_rank | shape=(342886, 52) | Δcols=0 | time=0.15s
------------------------------------------------------------
[START] add_tfidf_rank | shape=(335348, 51)
[END]   add_tfidf_rank | shape=(335348, 51) | Δcols=0 | time=0.12s
------------------------------------------------------------


In [14]:
# For every query (query_id), turn the item's price into a rank within the group (from 0 to 1)
train_fe, test_fe = (
    add_price_in_query_feats(train_fe),
    add_price_in_query_feats(test_fe),
)

[START] add_price_in_query_feats | shape=(342886, 52)
[END]   add_price_in_query_feats | shape=(342886, 53) | Δcols=1 | time=0.16s
------------------------------------------------------------
[START] add_price_in_query_feats | shape=(335348, 51)
[END]   add_price_in_query_feats | shape=(335348, 52) | Δcols=1 | time=0.08s
------------------------------------------------------------


In [15]:
# Popularity of feature pairs: frequency of the (item_cat_id, item_loc) pair,
# stored under the name "item_cat_loc_freq". Train and test are passed together
# and both frames are returned.
train_fe, test_fe = add_2d_freq(
    train_fe, test_fe, "item_cat_id", "item_loc", "item_cat_loc_freq"
)

In [16]:
# Popularity of categories: frequency encoding of the listed columns.
# Train and test are passed together and both frames are returned.
freq_cols = ["query_loc", "item_loc", "item_cat_id", "item_mcat_id"]
train_fe, test_fe = add_freq_enc(train_fe, test_fe, freq_cols)

In [17]:
# Group-level normalisations (six columns are added per frame in the recorded run).
train_fe, test_fe = add_group_norms(train_fe), add_group_norms(test_fe)

[START] add_group_norms | shape=(342886, 58)
[END]   add_group_norms | shape=(342886, 64) | Δcols=6 | time=0.14s
------------------------------------------------------------
[START] add_group_norms | shape=(335348, 57)
[END]   add_group_norms | shape=(335348, 63) | Δcols=6 | time=0.11s
------------------------------------------------------------


In [18]:
# Basic interaction features (four columns are added per frame in the recorded run).
train_fe, test_fe = (
    add_basic_interactions(train_fe),
    add_basic_interactions(test_fe),
)

[START] add_basic_interactions | shape=(342886, 64)
[END]   add_basic_interactions | shape=(342886, 68) | Δcols=4 | time=0.06s
------------------------------------------------------------
[START] add_basic_interactions | shape=(335348, 63)
[END]   add_basic_interactions | shape=(335348, 67) | Δcols=4 | time=0.04s
------------------------------------------------------------


##### Финальные колонки и типы

Собираем ID, таргет и все фичи в один датафрейм.  
Вещественные колонки (`float64`) приводим к `float32`, а таргет `item_contact` — к `int8`: это экономит память.


In [19]:
# Target -> int8 (0/1); missing labels are treated as 0.
train_fe["item_contact"] = train_fe["item_contact"].fillna(0).astype("int8")

# Downcast float64 columns to float32 in both frames.
# Only columns that exist in train are considered, and the target is never touched.
for frame in (train_fe, test_fe):
    for col in train_fe.columns:
        if col == "item_contact" or col not in frame:
            continue
        if frame[col].dtype == "float64":
            frame[col] = frame[col].astype("float32")

print("Train shape:", train_fe.shape, "| Test shape:", test_fe.shape)


Train shape: (342886, 68) | Test shape: (335348, 67)


##### Сохранение


In [20]:
# Destination folder for the engineered tables; created when absent.
OUTPUT_DIR = "../data/transformed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

train_out_path = os.path.join(OUTPUT_DIR, "train-features.parquet")
test_out_path = os.path.join(OUTPUT_DIR, "test-features.parquet")

# Write both frames without the index, then report the written paths.
for frame, out_path in ((train_fe, train_out_path), (test_fe, test_out_path)):
    frame.to_parquet(out_path, index=False)

for out_path in (train_out_path, test_out_path):
    print("Saved:", out_path)


Saved: ../data/transformed/train-features.parquet
Saved: ../data/transformed/test-features.parquet


##### Получившиеся фичи


In [None]:
from ydata_profiling import ProfileReport
import pandas as pd

# Profile the engineered feature table that the save step wrote to disk.
# This cell used to read SAMPLE_PATH, i.e. the raw input sample without any of the
# engineered columns, which does not match this section ("resulting features").
train_fe = pd.read_parquet(train_out_path)
ProfileReport(train_fe)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

