In [1]:
import pyarrow.parquet as pq
import pandas as pd
import polars as pl
import datetime
from polars.datatypes import (
    Boolean, UInt8, UInt16, UInt32, UInt64, Utf8,
    Int8, Int16, Int32, Int64,
    Float32, Float64
)
from polars.datatypes import Utf8, Datetime, Date, Time
import xgboost as xgb
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
from geopy.distance import geodesic
from src.utils.utils import evaluate_feature
from src.train_new_base import train_base_model
rows = (
    pl.scan_parquet("data/train.parquet")        # or scan_parquet/scan_ndjson/…
      .select(pl.len())           # len() == COUNT(*)
      .collect(streaming=True)    # constant-memory execution
      .item()                     # get the scalar
)

  .collect(streaming=True)    # constant-memory execution


In [2]:
data = pl.read_parquet("data/clean_data_cut.parquet")

In [14]:
feature_cols = ['company_count', 'company_freq', 'totalPrice', 'route_distance_km', 'frequentFlyer_n_programs', 'unique_ranker_count', 'hour_sin', 
                'hour_cos', 'wday_sin', 'wday_cos', 'twoway_route', 'avg_oneway_price', 'total_duration_hours_leg0', 'total_duration_hours_leg1', 'legs0_segments0_aircraft_code_freq',
                'legs1_segments0_aircraft_code_freq', 'frequentFlyer', 'isVip', 'miniRules0_monetaryAmount', 'miniRules0_percentage',
                'miniRules1_monetaryAmount', 'miniRules1_percentage', 'sex', 'total_segments_count', 'legs0_num_segments', 'legs1_num_segments', 'tariff_code_filled', 'nationality_cat', 'miniRules0_statusInfos',
                'miniRules1_statusInfos', 'pricingInfo_isAccessTP', 'legs0_departureAt_period', 'legs0_arrivalAt_period', 'legs1_departureAt_period', 'legs1_arrivalAt_period',
                'legs0_segments0_baggageAllowance_quantity_kg', 'legs1_segments0_baggageAllowance_quantity_kg', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat']
FEATURE_COLUMNS_V1 = [
    # 1-2. Пересадки
    "layover_hours_leg0",
    "layover_hours_leg1",

    # 3. Сколько часов до вылёта
    "days_before_flight_leg0",

    # 4 + 6. Смена аэропорта внутри одной ноги
    "legs0_airport_changes_count",
    "legs1_airport_changes_count",

    # 5. Ночная пересадка
    "night_layover_leg0",
    "night_layover_leg1",

    # 7 + 11. Одна ли авиакомпания-оператор
    "same_operator_carrier_leg0",
    "same_operator_carrier_leg1",

    # 8. Участие билета в программе лояльности
    "ticket_is_in_FFprogramms_leg0",
    "ticket_is_in_FFprogramms_leg1",

    # 9. Ранг по цене
    "totalPrice_rank",

    # 10. Ранг по общему времени перелёта
    "totalTime_hours_ranked",

    # 12. Согласованность багажа
    "baggage_kg_equal_flag",
    "baggage_units_equal_flag",

    # 13. Сколько билетов показано в поисковой сессии
    "tickets_in_session",

    # 14. Метрики оставшихся билетов
    "remainingTickets_avg",
    "remainingTickets_rank",

    # 15. Частота покупок пользователя
    "user_search_freq",

    # 16. Доля совпадений оператор-продавец
    "operator_marketer_match_rate",

    # 17. Билет ≤ 20 % дороже минимального
    "within_20pct_of_min",

    # 18. День недели вылета (синус/косинус)
    "leg0_depday_sin", "leg1_depday_sin",
    "leg0_depday_cos", "leg1_depday_cos",

    # 19. День недели прилёта (синус/косинус)
    "leg0_arrday_sin", "leg1_arrday_sin",
    "leg0_arrday_cos", "leg1_arrday_cos",

    # 20. Комплексная «оптимальность» билета
    "opt_ticket_score",
]
FEATURE_COLUMNS_V2 = [
    # Cчётчики сегментов и направления
    "n_segments_leg0",
    "n_segments_leg1",
    "is_one_way",
    "total_segments",
    "is_direct_leg0",
    "is_direct_leg1",
    "both_direct",

    # Стоимость, налоги, сборы
    "price_per_tax",
    "tax_rate",
    "log_price",
    "total_fees",
    "has_fees",
    "fee_rate",

    # Продолжительности
    "duration_ratio",

    # Тарифы и привилегии
    "has_corporate_tariff",
    "has_access_tp",
    "n_ff_programs",
    "is_vip_freq",

    # Класс обслуживания
    "avg_cabin_class",
    "cabin_class_diff",

    # Популярные маршруты
    "is_popular_route",

    # Относительные метрики цены и сегментов внутри поиска
    "price_pct_rank",
    "is_cheapest",
    "price_from_median",
    "is_min_segments",
    "is_direct_cheapest",
]
feature_list = feature_cols + FEATURE_COLUMNS_V1 + FEATURE_COLUMNS_V2
cat_features_final = ['tariff_code_filled', 'nationality_cat', 'legs0_departureAt_period', 'legs0_arrivalAt_period',
                      'legs1_departureAt_period', 'legs1_arrivalAt_period', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat', 'frequentFlyer']
#feature_cols = ['totalPrice']
#cat_features_final = []
#X = data.select(feature_cols)
#y = data.select("selected")            # Polars DataFrame with 1 col
#groups = data.select("ranker_id")      # Polars DataFrame with 1 col
n2 = rows

# 5. Set params to use pairwise ranking + ndcg@3 + histogram tree builder
params = {
    "objective":   "rank:pairwise",
    "eval_metric": "ndcg@3",
    "tree_method": "hist",      # 🔥 much faster
    "seed":        42,
    "n_jobs":      -1,
    'eta': 0.14381590394747168,
    "learning_rate": 0.022641389657079056,
    'max_depth': 14,
    'min_child_weight': 2,
    'subsample': 0.7308722227055789,
    "colsample_bytree": 0.45840689146263086,
    "gamma": 3.3084297630544888,
    "lambda": 6.952586917313028,
    "alpha": 0.6395254133055179,
    'num_boost_round': 1500
}


In [5]:
cv_stats = pd.read_csv('model/cv_results.csv')
print('mean top@3 - ', round(cv_stats['val-top@3'].mean(), 4))
print('std top@3 - ', round(cv_stats['val-top@3'].std(), 4))
print('mean ndcg@3 - ', round(cv_stats['val-ndcg@3'].mean(), 4))
print('std ndcg@3 - ', round(cv_stats['val-ndcg@3'].std(), 4))

mean top@3 -  0.4192
std top@3 -  0.0027
mean ndcg@3 -  0.3848
std ndcg@3 -  0.002


In [5]:
feature_list = ['totalPrice', 'ranker_id', 'selected', 'company_count', 'route_distance_km', 'frequentFlyer_n_programs', 'unique_ranker_count']

In [None]:
top_sigma = np.std(pd.read_csv('model/cv_results.csv')['val-top@3']), np.mean(pd.read_csv('model/cv_results.csv')['val-top@3'])
meta, df = evaluate_feature(data[:n2][feature_list], 
                       new_feature_name='unique_ranker_count', 
                       group_col='ranker_id', 
                       label_col='selected', 
                       baseline_model='model/base.json', 
                       params=params, 
                       sigma0=top_sigma[1],
                       corr_threshold=0.00,
                       cat_features_final=cat_features_final)
meta

top@3 before -  0.2258  top@3 after -  0.2237
ndcg@3 before -  0.2622  top@3 after -  0.2614
mean top@3 -  0.2239
std top@3 -  0.003
mean ndcg@3 -  0.2621
std ndcg@3 -  0.0012


{'corr': np.float64(0.005832222368000487),
 'warm_delta': np.float64(-0.0021618937392779147),
 'mini_delta': None}

In [47]:
if round(cv_stats['val-top@3'].mean(), 4) < round(df['val-top@3'].mean(), 4):
    print('accepted by top@3')
if round(cv_stats['val-ndcg@3'].mean(), 4)  < round(df['val-ndcg@3'].mean(), 4):
    print('accepted by ncdg3')

accepted by ncdg3


In [43]:
df.to_pandas().to_csv('model/cv_results.csv')

In [None]:
res = train_base_model(data=data[:n2], 
                 features=feature_list, 
                 label_col='selected', 
                 group_col='ranker_id', 
                 params=params, 
                 num_boost_round=1000, 
                 baseline_model_path='model/', 
                 seed=42, 
                 verbose_eval_size=5, 
                 full_cv = False,
                 cat_features_final=cat_features_final)

res

Parameters: { "num_boost_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.4694783687591553


In [6]:
booster = xgb.Booster()
booster.load_model('model/base.json')

In [9]:
def get_group_sizes(ranker_ids: np.ndarray) -> np.ndarray:
    # unique in order of first appearance + their counts
    uniq, idx, counts = np.unique(ranker_ids, return_index=True, return_counts=True)
    return counts[np.argsort(idx)]
data_xgb = data[n2:].with_columns([
                                (pl.col(c).rank("dense") - 1)
                                .fill_null(-1)
                                .cast(pl.Int32)
                                .alias(c)
                                for c in cat_features_final
                            ])
label_col = 'selected'
group_col = 'ranker_id'
feature_cols_all = feature_list
data_test = xgb.DMatrix(data_xgb.select(feature_cols_all).to_numpy(), feature_names=feature_cols_all, group=get_group_sizes(data_xgb.select(group_col)))

In [10]:
preds = booster.predict(data_test)

In [11]:
test = pl.read_parquet('data/test.parquet')

In [12]:
submission_xgb = (
    test.select(['Id', 'ranker_id'])
    .with_columns(pl.Series('pred_score', preds))
    .with_columns(
        pl.col('pred_score')
        .rank(method='ordinal', descending=True)
        .over('ranker_id')
        .cast(pl.Int32)
        .alias('selected')
    )
    .select(['Id', 'ranker_id', 'selected', 'pred_score'])
)

In [13]:
submission_xgb.write_csv('submission_v2.csv')

In [None]:
import optuna
LABEL_COL = "selected"          # binary 1/0 – chosen by the user
GROUP_COL = "ranker_id"         # search/session id
FEATURE_COLUMNS = feature_list
CAT_FEATURES_FINAL = cat_features_final
def get_group_sizes(ranker_ids: np.ndarray) -> np.ndarray:
    """Return group‐sizes array for XGBoost ranking.

    Parameters
    ----------
    ranker_ids : np.ndarray
        1‑D array with group id (ranker_id) of every row in the same order as X.

    Returns
    -------
    np.ndarray
        Array of group sizes, ordered by the first appearance of each id (exactly
        what xgb.DMatrix wants for ranking).
    """
    uniq, idx, counts = np.unique(ranker_ids, return_index=True, return_counts=True)
    return counts[np.argsort(idx)]
def train_single_split(
    data_xgb: pl.DataFrame,
    params: dict,
    num_boost_round: int,
    seed: int = 42,
) -> float:
    """Train XGBoost on a random 80/20 group‑based split and return hit@3."""

    # Ordinal‑encode categorical features ------------------------------------------------


    rng = np.random.default_rng(seed)
    groups_unique = data_xgb.select(GROUP_COL).unique().to_numpy().ravel()
    val_groups = rng.choice(groups_unique, size=max(1, int(0.2 * len(groups_unique))), replace=False)

    val_df = data_xgb.filter(pl.col(GROUP_COL).is_in(val_groups))
    train_df = data_xgb.filter(~pl.col(GROUP_COL).is_in(val_groups))

    feature_cols = [c for c in FEATURE_COLUMNS if c not in {LABEL_COL, GROUP_COL}]

    dtrain = xgb.DMatrix(
        train_df.select(feature_cols).to_numpy(),
        label=train_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(train_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )

    dval = xgb.DMatrix(
        val_df.select(feature_cols).to_numpy(),
        label=val_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(val_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    # NDCG@3 produced by XGBoost is geometric mean across groups; for hit@3 we compute manually
    preds = bst.predict(dval)

    # Hit@3 – at least one correct item in top‑3 per group ------------------
    # Reconstruct group slices
    ranker_ids = val_df.select(GROUP_COL).to_numpy().ravel()
    labels = dval.get_label()
    order = np.argsort(ranker_ids, kind="stable")
    ranker_ids, labels, preds = ranker_ids[order], labels[order], preds[order]

    hit_total, group_total = 0, 0
    start = 0
    while start < len(ranker_ids):
        group = ranker_ids[start]
        end = start
        while end < len(ranker_ids) and ranker_ids[end] == group:
            end += 1
        if (end - start) > 10:
            group_labels = labels[start:end]
            group_preds = preds[start:end]
            top3_idx = np.argsort(group_preds)[::-1][:3]
            hit_total += int(group_labels[top3_idx].sum() > 0)
            group_total += 1
        start = end
    return hit_total / group_total


# ---------------------------------------------------------------------------
# Optuna objective ----------------------------------------------------------
# ---------------------------------------------------------------------------

def objective(trial: optuna.Trial, data: pl.DataFrame) -> float:
    params = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg@3",
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "tree_method": "hist",
        "single_precision_histogram": True,
        "max_bin": 128,
        "seed": 42,
        'device': 'cuda'
    }

    num_boost_round = trial.suggest_int("num_boost_round", 300, 1500, step=100)
    data_xgb = data.with_columns([
    (pl.col(c).rank("dense") - 1)  # rank starts at 1 → shift to 0
    .fill_null(-1)
    .cast(pl.Int32)
    .alias(c)
    for c in CAT_FEATURES_FINAL
    ])
    hit_at_3 = train_single_split(
        data_xgb=data_xgb,
        params=params,
        num_boost_round=num_boost_round,
        seed=42,
    )

    return hit_at_3  # maximise



  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from pathlib import Path
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, data[:n2]), n_trials=50, show_progress_bar=True, gc_after_trial=True)

# ---------------------------------------------------------------------
# 3) Report best params & save
# ---------------------------------------------------------------------
print("\nBest hit@3:", round(study.best_value, 4))
print("Best params:\n", study.best_params)

out_dir = Path("optuna_results")
out_dir.mkdir(exist_ok=True, parents=True)
(out_dir / "best_params.json").write_text(study.best_trial.params.__repr__())

[I 2025-07-31 12:31:00,571] A new study created in memory with name: no-name-ca9840d4-f07f-47cf-bc7a-bff5524b0287
Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 0. Best value: 0.35665:   2%|▏         | 1/50 [01:58<1:36:35, 118.28s/it]

[I 2025-07-31 12:32:58,824] Trial 0 finished with value: 0.3566495878814881 and parameters: {'eta': 0.016799433682372827, 'max_depth': 5, 'min_child_weight': 20, 'subsample': 0.9614366002961072, 'colsample_bytree': 0.7213201036469383, 'gamma': 1.7137185245609894, 'lambda': 6.041193036497697e-07, 'alpha': 0.00901802221641974, 'num_boost_round': 1500}. Best is trial 0 with value: 0.3566495878814881.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:   4%|▍         | 2/50 [03:24<1:19:30, 99.39s/it] 

[I 2025-07-31 12:34:24,991] Trial 1 finished with value: 0.42007704762436493 and parameters: {'eta': 0.08870660003385719, 'max_depth': 7, 'min_child_weight': 17, 'subsample': 0.6373344331189786, 'colsample_bytree': 0.9551034843619213, 'gamma': 1.321789700331823, 'lambda': 1.9735635430171912e-07, 'alpha': 1.868435583931355, 'num_boost_round': 600}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:   6%|▌         | 3/50 [04:42<1:10:08, 89.55s/it]

[I 2025-07-31 12:35:42,821] Trial 2 finished with value: 0.38929765886287626 and parameters: {'eta': 0.279992293813335, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.8329077229633369, 'colsample_bytree': 0.9755969990496046, 'gamma': 4.028052183805037, 'lambda': 0.0022659354869553608, 'alpha': 2.484745077423099e-06, 'num_boost_round': 600}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:   8%|▊         | 4/50 [04:55<45:30, 59.36s/it]  

[I 2025-07-31 12:35:55,907] Trial 3 finished with value: 0.3230366783547615 and parameters: {'eta': 0.019613388631192816, 'max_depth': 3, 'min_child_weight': 13, 'subsample': 0.6283157514185657, 'colsample_bytree': 0.9749313229151269, 'gamma': 1.7926576767561364, 'lambda': 0.0004645264473988765, 'alpha': 0.0013261274951899466, 'num_boost_round': 800}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  10%|█         | 5/50 [06:03<46:55, 62.56s/it]

[I 2025-07-31 12:37:04,139] Trial 4 finished with value: 0.38335834583958645 and parameters: {'eta': 0.07759584290394192, 'max_depth': 5, 'min_child_weight': 13, 'subsample': 0.613231870575633, 'colsample_bytree': 0.7834919228219692, 'gamma': 0.5292424251646599, 'lambda': 2.20853621773693e-05, 'alpha': 5.233631405198676e-07, 'num_boost_round': 900}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  12%|█▏        | 6/50 [06:17<33:42, 45.97s/it]

[I 2025-07-31 12:37:17,904] Trial 5 finished with value: 0.33837566285235837 and parameters: {'eta': 0.018692502341774456, 'max_depth': 4, 'min_child_weight': 13, 'subsample': 0.882777588658926, 'colsample_bytree': 0.7931523649436593, 'gamma': 1.2666357708789022, 'lambda': 0.001087182027942776, 'alpha': 0.00021539394147242086, 'num_boost_round': 1100}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  14%|█▍        | 7/50 [08:00<46:15, 64.55s/it]

[I 2025-07-31 12:39:00,718] Trial 6 finished with value: 0.3791242419184332 and parameters: {'eta': 0.049109061777369165, 'max_depth': 5, 'min_child_weight': 20, 'subsample': 0.8372603618462015, 'colsample_bytree': 0.7296440584886602, 'gamma': 2.801850963046772, 'lambda': 8.196257040742894e-08, 'alpha': 0.0020984229363452235, 'num_boost_round': 1000}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  16%|█▌        | 8/50 [08:58<43:50, 62.64s/it]

[I 2025-07-31 12:39:59,257] Trial 7 finished with value: 0.38486971140375453 and parameters: {'eta': 0.1732801296789772, 'max_depth': 4, 'min_child_weight': 3, 'subsample': 0.7402426770728454, 'colsample_bytree': 0.9292892535882693, 'gamma': 0.3818279685768533, 'lambda': 0.30360461534911104, 'alpha': 1.5397847978721687e-05, 'num_boost_round': 800}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  18%|█▊        | 9/50 [10:46<52:30, 76.85s/it]

[I 2025-07-31 12:41:47,344] Trial 8 finished with value: 0.3970653871903593 and parameters: {'eta': 0.12814813040209747, 'max_depth': 5, 'min_child_weight': 12, 'subsample': 0.9729651257124419, 'colsample_bytree': 0.7754946738282393, 'gamma': 2.5076427575152755, 'lambda': 0.00019214359854910734, 'alpha': 4.0718966672602004e-05, 'num_boost_round': 800}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  20%|██        | 10/50 [12:00<50:34, 75.86s/it]

[I 2025-07-31 12:43:00,984] Trial 9 finished with value: 0.41838555022856505 and parameters: {'eta': 0.2030763743146656, 'max_depth': 7, 'min_child_weight': 8, 'subsample': 0.9129843985466237, 'colsample_bytree': 0.8340645830062385, 'gamma': 2.669174423631815, 'lambda': 0.00011539863330105868, 'alpha': 6.142685196289024e-05, 'num_boost_round': 600}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  22%|██▏       | 11/50 [12:48<43:49, 67.41s/it]

[I 2025-07-31 12:43:49,259] Trial 10 finished with value: 0.3941446484549933 and parameters: {'eta': 0.04027692450272782, 'max_depth': 8, 'min_child_weight': 17, 'subsample': 0.713966511598936, 'colsample_bytree': 0.8819154098667217, 'gamma': 4.848215831054434, 'lambda': 1.658773793482456e-08, 'alpha': 4.595545946375133, 'num_boost_round': 300}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 1. Best value: 0.420077:  24%|██▍       | 12/50 [13:48<41:11, 65.05s/it]

[I 2025-07-31 12:44:48,904] Trial 11 finished with value: 0.4056057060069096 and parameters: {'eta': 0.10989727842322797, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.9050533185793992, 'colsample_bytree': 0.6041077421307431, 'gamma': 3.2336837689589473, 'lambda': 3.2372777797230518e-06, 'alpha': 9.350888229440912, 'num_boost_round': 400}. Best is trial 1 with value: 0.42007704762436493.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 12. Best value: 0.424018:  26%|██▌       | 13/50 [14:55<40:30, 65.70s/it]

[I 2025-07-31 12:45:56,094] Trial 12 finished with value: 0.42401824452108133 and parameters: {'eta': 0.26719262721796244, 'max_depth': 7, 'min_child_weight': 8, 'subsample': 0.7293171122808932, 'colsample_bytree': 0.8680306488294656, 'gamma': 3.414701970831686, 'lambda': 0.17194469533114948, 'alpha': 2.109701442545636e-08, 'num_boost_round': 500}. Best is trial 12 with value: 0.42401824452108133.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  28%|██▊       | 14/50 [16:07<40:35, 67.66s/it]

[I 2025-07-31 12:47:08,268] Trial 13 finished with value: 0.43283331467592073 and parameters: {'eta': 0.2887408193928653, 'max_depth': 7, 'min_child_weight': 9, 'subsample': 0.6897831242711121, 'colsample_bytree': 0.8927024009906425, 'gamma': 3.5828756631243586, 'lambda': 8.901198548020144, 'alpha': 0.16135884133456024, 'num_boost_round': 500}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  30%|███       | 15/50 [16:49<34:57, 59.92s/it]

[I 2025-07-31 12:47:50,249] Trial 14 finished with value: 0.4164438502673797 and parameters: {'eta': 0.2749183483793218, 'max_depth': 8, 'min_child_weight': 8, 'subsample': 0.7294063716982953, 'colsample_bytree': 0.8900946952899895, 'gamma': 3.7718976495652634, 'lambda': 5.459975882670884, 'alpha': 0.08339006219704657, 'num_boost_round': 400}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  32%|███▏      | 16/50 [18:25<40:05, 70.75s/it]

[I 2025-07-31 12:49:26,148] Trial 15 finished with value: 0.41049692908989394 and parameters: {'eta': 0.17215737611440313, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.6945674534458719, 'colsample_bytree': 0.8632886641996118, 'gamma': 4.847343385947882, 'lambda': 0.06435647592950272, 'alpha': 1.667763292915514e-08, 'num_boost_round': 1200}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  34%|███▍      | 17/50 [19:33<38:27, 69.93s/it]

[I 2025-07-31 12:50:34,163] Trial 16 finished with value: 0.4073620315197416 and parameters: {'eta': 0.2831365852666034, 'max_depth': 6, 'min_child_weight': 9, 'subsample': 0.781873369468132, 'colsample_bytree': 0.9030966811901286, 'gamma': 3.969223140237168, 'lambda': 5.382382603513352, 'alpha': 1.602589981715294e-08, 'num_boost_round': 500}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  36%|███▌      | 18/50 [20:22<33:56, 63.64s/it]

[I 2025-07-31 12:51:23,161] Trial 17 finished with value: 0.39802024495274313 and parameters: {'eta': 0.038072451486822076, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.6694924769197279, 'colsample_bytree': 0.9992620176787986, 'gamma': 3.313915606151014, 'lambda': 0.06131124628633514, 'alpha': 0.05893150885720593, 'num_boost_round': 300}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  38%|███▊      | 19/50 [22:37<43:56, 85.05s/it]

[I 2025-07-31 12:53:38,099] Trial 18 finished with value: 0.39778634915311084 and parameters: {'eta': 0.026668051242005195, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.7704129807607896, 'colsample_bytree': 0.8407037699240886, 'gamma': 4.367402093998503, 'lambda': 0.5338144244439076, 'alpha': 0.3675316796862543, 'num_boost_round': 1300}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  40%|████      | 20/50 [22:57<32:40, 65.35s/it]

[I 2025-07-31 12:53:57,551] Trial 19 finished with value: 0.3559624832514515 and parameters: {'eta': 0.012189762885118336, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.6673017029150318, 'colsample_bytree': 0.6411714391433039, 'gamma': 3.2937287668918898, 'lambda': 0.010581272298300887, 'alpha': 4.467541778296972e-07, 'num_boost_round': 700}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  42%|████▏     | 21/50 [24:10<32:42, 67.67s/it]

[I 2025-07-31 12:55:10,614] Trial 20 finished with value: 0.40661120107208665 and parameters: {'eta': 0.06708686890458095, 'max_depth': 7, 'min_child_weight': 11, 'subsample': 0.8108767769864405, 'colsample_bytree': 0.9308671781226242, 'gamma': 2.099051916994381, 'lambda': 0.7882444156311661, 'alpha': 0.014615470432388703, 'num_boost_round': 500}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 13. Best value: 0.432833:  44%|████▍     | 22/50 [25:35<34:06, 73.09s/it]

[I 2025-07-31 12:56:36,340] Trial 21 finished with value: 0.4169120506694816 and parameters: {'eta': 0.10340081330363342, 'max_depth': 7, 'min_child_weight': 15, 'subsample': 0.6533291673867245, 'colsample_bytree': 0.929785762625, 'gamma': 1.4776078069288898, 'lambda': 9.27580258444157, 'alpha': 0.9428564250784821, 'num_boost_round': 600}. Best is trial 13 with value: 0.43283331467592073.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 22. Best value: 0.444507:  46%|████▌     | 23/50 [26:50<33:04, 73.50s/it]

[I 2025-07-31 12:57:50,808] Trial 22 finished with value: 0.44450653850452665 and parameters: {'eta': 0.20305532176595223, 'max_depth': 8, 'min_child_weight': 17, 'subsample': 0.6074425029850297, 'colsample_bytree': 0.8268822028822889, 'gamma': 0.9968846331283965, 'lambda': 0.012562012050778977, 'alpha': 0.9517182997033248, 'num_boost_round': 500}. Best is trial 22 with value: 0.44450653850452665.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 23. Best value: 0.446447:  48%|████▊     | 24/50 [27:51<30:17, 69.89s/it]

[I 2025-07-31 12:58:52,270] Trial 23 finished with value: 0.44644650228718064 and parameters: {'eta': 0.2103804410553803, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.6042263814161108, 'colsample_bytree': 0.8379584556161299, 'gamma': 0.6894195135928064, 'lambda': 0.012001765450301594, 'alpha': 0.23041162967038759, 'num_boost_round': 400}. Best is trial 23 with value: 0.44644650228718064.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 23. Best value: 0.446447:  50%|█████     | 25/50 [28:53<28:04, 67.37s/it]

[I 2025-07-31 12:59:53,755] Trial 24 finished with value: 0.4342920971795588 and parameters: {'eta': 0.1381787315572436, 'max_depth': 8, 'min_child_weight': 15, 'subsample': 0.6201534830523364, 'colsample_bytree': 0.8272773376318108, 'gamma': 1.0675345657330064, 'lambda': 0.006310480637820436, 'alpha': 0.25203622182951346, 'num_boost_round': 400}. Best is trial 23 with value: 0.44644650228718064.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 23. Best value: 0.446447:  52%|█████▏    | 26/50 [29:41<24:41, 61.71s/it]

[I 2025-07-31 13:00:42,264] Trial 25 finished with value: 0.42134078212290504 and parameters: {'eta': 0.14727533860134556, 'max_depth': 8, 'min_child_weight': 16, 'subsample': 0.6046049728739498, 'colsample_bytree': 0.7434408372179612, 'gamma': 0.8094972892832922, 'lambda': 0.0071889585321587264, 'alpha': 0.5283826990662592, 'num_boost_round': 300}. Best is trial 23 with value: 0.44644650228718064.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 23. Best value: 0.446447:  54%|█████▍    | 27/50 [30:42<23:35, 61.53s/it]

[I 2025-07-31 13:01:43,380] Trial 26 finished with value: 0.43990398034946687 and parameters: {'eta': 0.21041870053144116, 'max_depth': 8, 'min_child_weight': 15, 'subsample': 0.6120713018226301, 'colsample_bytree': 0.8232459852588678, 'gamma': 0.025542478915377753, 'lambda': 0.011211373264882077, 'alpha': 0.011147742611755137, 'num_boost_round': 400}. Best is trial 23 with value: 0.44644650228718064.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 23. Best value: 0.446447:  56%|█████▌    | 28/50 [32:03<24:42, 67.41s/it]

[I 2025-07-31 13:03:04,489] Trial 27 finished with value: 0.4389172965762561 and parameters: {'eta': 0.2078342399266211, 'max_depth': 8, 'min_child_weight': 18, 'subsample': 0.6018242527644581, 'colsample_bytree': 0.7012947967061594, 'gamma': 0.06846754505654183, 'lambda': 0.03956910238656795, 'alpha': 0.01911525464716762, 'num_boost_round': 700}. Best is trial 23 with value: 0.44644650228718064.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 23. Best value: 0.446447:  58%|█████▊    | 29/50 [33:04<22:54, 65.47s/it]

[I 2025-07-31 13:04:05,445] Trial 28 finished with value: 0.4377792672028597 and parameters: {'eta': 0.20469826412877434, 'max_depth': 8, 'min_child_weight': 15, 'subsample': 0.6416899297407693, 'colsample_bytree': 0.8105863783411896, 'gamma': 0.2581667522976163, 'lambda': 6.764969218219065e-05, 'alpha': 0.0015815447182384568, 'num_boost_round': 400}. Best is trial 23 with value: 0.44644650228718064.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 29. Best value: 0.456293:  60%|██████    | 30/50 [35:42<31:03, 93.18s/it]

[I 2025-07-31 13:06:43,278] Trial 29 finished with value: 0.45629331544824503 and parameters: {'eta': 0.2119913143426212, 'max_depth': 8, 'min_child_weight': 20, 'subsample': 0.6830709286506377, 'colsample_bytree': 0.7555971552751897, 'gamma': 0.005464206831312372, 'lambda': 9.937903090998755e-06, 'alpha': 0.029987385801414235, 'num_boost_round': 1500}. Best is trial 29 with value: 0.45629331544824503.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 29. Best value: 0.456293:  62%|██████▏   | 31/50 [37:39<31:42, 100.13s/it]

[I 2025-07-31 13:08:39,632] Trial 30 finished with value: 0.39883779404369446 and parameters: {'eta': 0.06464847198539596, 'max_depth': 6, 'min_child_weight': 19, 'subsample': 0.6883715253924405, 'colsample_bytree': 0.6856792222379116, 'gamma': 0.7901338820086434, 'lambda': 2.947970132253066e-06, 'alpha': 2.592459841108678, 'num_boost_round': 1500}. Best is trial 29 with value: 0.45629331544824503.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 29. Best value: 0.456293:  64%|██████▍   | 32/50 [39:16<29:47, 99.32s/it] 

[I 2025-07-31 13:10:17,076] Trial 31 finished with value: 0.4502417069511585 and parameters: {'eta': 0.20077916460214343, 'max_depth': 8, 'min_child_weight': 19, 'subsample': 0.6515129179390527, 'colsample_bytree': 0.7560348852804613, 'gamma': 0.6726709844646308, 'lambda': 1.7351872590228793e-05, 'alpha': 0.007055891259721372, 'num_boost_round': 1400}. Best is trial 29 with value: 0.45629331544824503.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 32. Best value: 0.460573:  66%|██████▌   | 33/50 [41:41<32:00, 112.96s/it]

[I 2025-07-31 13:12:41,855] Trial 32 finished with value: 0.46057327682355564 and parameters: {'eta': 0.16294447195189452, 'max_depth': 8, 'min_child_weight': 20, 'subsample': 0.6484825388288731, 'colsample_bytree': 0.7708348393224855, 'gamma': 0.7165007134041275, 'lambda': 7.5440807588555035e-06, 'alpha': 0.062420940263057806, 'num_boost_round': 1400}. Best is trial 32 with value: 0.46057327682355564.


Parameters: { "single_precision_histogram" } are not used.

  self.starting_round = model.num_boosted_rounds()
Best trial: 32. Best value: 0.460573:  66%|██████▌   | 33/50 [42:54<22:06, 78.01s/it] 


[W 2025-07-31 13:13:54,838] Trial 33 failed with parameters: {'eta': 0.1683192252772687, 'max_depth': 8, 'min_child_weight': 20, 'subsample': 0.650877028538525, 'colsample_bytree': 0.7636606745146671, 'gamma': 0.4982249771945027, 'lambda': 1.715039819604432e-05, 'alpha': 0.036004453144688475, 'num_boost_round': 1400} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\Work\FlightRank 2025 Aeroclub RecSys Cup\.venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\1tiii\AppData\Local\Temp\ipykernel_12180\2573677029.py", line 3, in <lambda>
    study.optimize(lambda trial: objective(trial, data[:n2]), n_trials=50, show_progress_bar=True, gc_after_trial=True)
                                 ~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "C:\Users\1tiii\AppData\Local\Temp\ipykernel_12180\1600262994.py", line 123, in objective
    hit_at_3 = train_single_split(
        data_xgb=data_xg

KeyboardInterrupt: 

In [12]:
study.best_params

{'eta': 0.16294447195189452,
 'max_depth': 8,
 'min_child_weight': 20,
 'subsample': 0.6484825388288731,
 'colsample_bytree': 0.7708348393224855,
 'gamma': 0.7165007134041275,
 'lambda': 7.5440807588555035e-06,
 'alpha': 0.062420940263057806,
 'num_boost_round': 1400}

In [5]:
import numpy as np
import polars as pl
import xgboost as xgb
from copy import deepcopy
from typing import List, Tuple


# ─────────────────────────────────────────────────────────────────────────
# 1) Train / evaluate on a subset of columns
# ─────────────────────────────────────────────────────────────────────────
def train_single_split_subset(
    data_xgb: pl.DataFrame,
    feature_cols: List[str],
    params: dict,
    num_boost_round: int,
    seed: int = 42,
) -> float:
    """Return hit@3 on an 80/20 group‑based split using only feature_cols."""
    rng = np.random.default_rng(seed)
    groups_unique = data_xgb.select(GROUP_COL).unique().to_numpy().ravel()
    val_groups = rng.choice(
        groups_unique, size=max(1, int(0.2 * len(groups_unique))), replace=False
    )

    val_df   = data_xgb.filter(pl.col(GROUP_COL).is_in(val_groups))
    train_df = data_xgb.filter(~pl.col(GROUP_COL).is_in(val_groups))

    dtrain = xgb.DMatrix(
        train_df.select(feature_cols).to_numpy(),
        label=train_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(train_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )
    dval = xgb.DMatrix(
        val_df.select(feature_cols).to_numpy(),
        label=val_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(val_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    # --- hit@3 -----------------------------------------------------------
    preds       = bst.predict(dval)
    ranker_ids  = val_df.select(GROUP_COL).to_numpy().ravel()
    labels      = dval.get_label()

    order = np.argsort(ranker_ids, kind="stable")
    ranker_ids, labels, preds = ranker_ids[order], labels[order], preds[order]

    hit_total = group_total = 0
    i = 0
    while i < len(ranker_ids):
        g = ranker_ids[i]
        j = i
        while j < len(ranker_ids) and ranker_ids[j] == g:
            j += 1
        if (j - i) > 10:                       # skip tiny groups
            top3 = np.argsort(preds[i:j])[::-1][:3]
            hit_total += int(labels[i:j][top3].sum() > 0)
            group_total += 1
        i = j
    return hit_total / group_total if group_total else 0.0


# ─────────────────────────────────────────────────────────────────────────
# 2) Greedy step‑forward selection
# ─────────────────────────────────────────────────────────────────────────
def step_forward_selection(
    data_xgb: pl.DataFrame,
    all_features: List[str],
    params: dict,
    num_boost_round: int,
    seed: int = 42,
    min_improvement: float = 1e-4,
) -> Tuple[List[str], float]:
    """
    Greedy SFS: start with empty set, add best column each round.
    Stops when no remaining feature improves hit@3 by > min_improvement.
    Returns (selected_features, best_score).
    """
    remaining = set(all_features)
    selected: List[str] = []
    best_score = 0.0

    while remaining:
        trial_scores = {}
        for feat in remaining:
            score = train_single_split_subset(
                data_xgb,
                selected + [feat],
                deepcopy(params),          # XGBoost mutates dict → copy!
                num_boost_round,
                seed,
            )
            trial_scores[feat] = score
            
        # pick the feature with the highest score
        feat_best, score_best = max(trial_scores.items(), key=lambda kv: kv[1])

        # check if it is a real improvement
        if score_best - best_score < min_improvement:
            print(f"Stopping: no feature improves hit@3 by ≥{min_improvement:.1e}")
            break

        selected.append(feat_best)
        remaining.remove(feat_best)
        best_score = score_best
        print(f"Added {feat_best:>30}  →  hit@3 = {best_score:.4f}")

    return selected, best_score


In [6]:
params = {
"objective":   "rank:pairwise",
"eval_metric": "ndcg@3",
"tree_method": "hist",      # 🔥 much faster
"seed":        42,
"n_jobs":      -1,
'eta': 0.11258528754406343,
 'max_depth': 7,
 'min_child_weight': 9,
 'subsample': 0.6386911919274852,
 'colsample_bytree': 0.8167665572930206,
 'gamma': 3.3978723412953182,
 'lambda': 0.0015573270572558455,
 'alpha': 0.0007046464277948389,
 'device': 'cuda'}

In [7]:
data_ready = data.with_columns([
(pl.col(c).rank("dense") - 1)  # rank starts at 1 → shift to 0
.fill_null(-1)
.cast(pl.Int32)
.alias(c)
for c in cat_features_final
])
chosen_feats, final_hit3 = step_forward_selection(
     data_xgb         = data_ready,
     all_features     = feature_list,
     params           = params,
     num_boost_round  = 1000,
     seed             = 42,
 )

Added               opt_ticket_score  →  hit@3 = 0.5022
Added      miniRules1_monetaryAmount  →  hit@3 = 0.5263
Added               avg_oneway_price  →  hit@3 = 0.5383
Added                      log_price  →  hit@3 = 0.5562
Added                   company_freq  →  hit@3 = 0.5749
Added       legs0_departureAt_period  →  hit@3 = 0.5813
Added       legs1_departureAt_period  →  hit@3 = 0.5962


KeyboardInterrupt: 