In [1]:
import pyarrow.parquet as pq
import pandas as pd
import polars as pl
import datetime
from polars.datatypes import (
    Boolean, UInt8, UInt16, UInt32, UInt64, Utf8,
    Int8, Int16, Int32, Int64,
    Float32, Float64
)
from polars.datatypes import Utf8, Datetime, Date, Time
import xgboost as xgb
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
from geopy.distance import geodesic
from src.utils.utils import evaluate_feature
from src.train_new_base import train_base_model
rows = (
    pl.scan_parquet("data/train.parquet")        # or scan_parquet/scan_ndjson/…
      .select(pl.len())           # len() == COUNT(*)
      .collect(streaming=True)    # constant-memory execution
      .item()                     # get the scalar
)
from src.metric import hitrate_at_3

  .collect(streaming=True)    # constant-memory execution


In [2]:
data = pl.read_parquet("data/clean_data_cut.parquet")

In [3]:
feature_cols = ['company_count', 'company_freq', 'totalPrice', 'route_distance_km', 'frequentFlyer_n_programs', 'unique_ranker_count', 'hour_sin', 
                'hour_cos', 'wday_sin', 'wday_cos', 'twoway_route', 'avg_oneway_price', 'total_duration_hours_leg0', 'total_duration_hours_leg1', 'legs0_segments0_aircraft_code_freq',
                'legs1_segments0_aircraft_code_freq', 'frequentFlyer', 'isVip', 'miniRules0_monetaryAmount', 'miniRules0_percentage',
                'miniRules1_monetaryAmount', 'miniRules1_percentage', 'sex', 'total_segments_count', 'legs0_num_segments', 'legs1_num_segments', 'tariff_code_filled', 'nationality_cat', 'miniRules0_statusInfos',
                'miniRules1_statusInfos', 'pricingInfo_isAccessTP', 'legs0_departureAt_period', 'legs0_arrivalAt_period', 'legs1_departureAt_period', 'legs1_arrivalAt_period',
                'legs0_segments0_baggageAllowance_quantity_kg', 'legs1_segments0_baggageAllowance_quantity_kg', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat']
FEATURE_COLUMNS_V1 = [
    # 1-2. Пересадки
    "layover_hours_leg0",
    "layover_hours_leg1",

    # 3. Сколько часов до вылёта
    "days_before_flight_leg0",

    # 4 + 6. Смена аэропорта внутри одной ноги
    "legs0_airport_changes_count",
    "legs1_airport_changes_count",

    # 5. Ночная пересадка
    "night_layover_leg0",
    "night_layover_leg1",

    # 7 + 11. Одна ли авиакомпания-оператор
    "same_operator_carrier_leg0",
    "same_operator_carrier_leg1",

    # 8. Участие билета в программе лояльности
    "ticket_is_in_FFprogramms_leg0",
    "ticket_is_in_FFprogramms_leg1",

    # 9. Ранг по цене
    "totalPrice_rank",

    # 10. Ранг по общему времени перелёта
    "totalTime_hours_ranked",

    # 12. Согласованность багажа
    "baggage_kg_equal_flag",
    "baggage_units_equal_flag",

    # 13. Сколько билетов показано в поисковой сессии
    "tickets_in_session",

    # 14. Метрики оставшихся билетов
    "remainingTickets_avg",
    "remainingTickets_rank",

    # 15. Частота покупок пользователя
    "user_search_freq",

    # 16. Доля совпадений оператор-продавец
    "operator_marketer_match_rate",

    # 17. Билет ≤ 20 % дороже минимального
    "within_20pct_of_min",

    # 18. День недели вылета (синус/косинус)
    "leg0_depday_sin", "leg1_depday_sin",
    "leg0_depday_cos", "leg1_depday_cos",

    # 19. День недели прилёта (синус/косинус)
    "leg0_arrday_sin", "leg1_arrday_sin",
    "leg0_arrday_cos", "leg1_arrday_cos",

    # 20. Комплексная «оптимальность» билета
    "opt_ticket_score",
]
FEATURE_COLUMNS_V2 = [
    # Cчётчики сегментов и направления
    "n_segments_leg0",
    "n_segments_leg1",
    "is_one_way",
    "total_segments",
    "is_direct_leg0",
    "is_direct_leg1",
    "both_direct",

    # Стоимость, налоги, сборы
    "price_per_tax",
    "tax_rate",
    "log_price",
    "total_fees",
    "has_fees",
    "fee_rate",

    # Продолжительности
    "duration_ratio",

    # Тарифы и привилегии
    "has_corporate_tariff",
    "has_access_tp",
    "n_ff_programs",
    "is_vip_freq",

    # Класс обслуживания
    "avg_cabin_class",
    "cabin_class_diff",

    # Популярные маршруты
    "is_popular_route",

    # Относительные метрики цены и сегментов внутри поиска
    "price_pct_rank",
    "is_cheapest",
    "price_from_median",
    "is_min_segments",
    "is_direct_cheapest",
]
cat_feat_add = [
    'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata',
    'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata',
    'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code',
    'legs0_segments0_flightNumber',
    'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code',
    'legs0_segments1_flightNumber',
    # Leg 1 segments 0-1
    'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata',
    'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_flightNumber',
    'legs1_segments1_aircraft_code', 'legs1_segments1_arrivalTo_airport_city_iata',
    'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_flightNumber',
]

feature_list = feature_cols + FEATURE_COLUMNS_V1 + FEATURE_COLUMNS_V2 + cat_feat_add
cat_features_final = ['tariff_code_filled', 'nationality_cat', 'legs0_departureAt_period', 'legs0_arrivalAt_period',
                      'legs1_departureAt_period', 'legs1_arrivalAt_period', 'legs0_segments0_cabinClass_cat', 'legs1_segments0_cabinClass_cat', 'frequentFlyer'] + cat_feat_add
#feature_cols = ['totalPrice']
#cat_features_final = []
#X = data.select(feature_cols)
#y = data.select("selected")            # Polars DataFrame with 1 col
#groups = data.select("ranker_id")      # Polars DataFrame with 1 col
n2 = rows

# 5. Set params to use pairwise ranking + ndcg@3 + histogram tree builder
params = {
    "objective":   "rank:pairwise",
    "eval_metric": "ndcg@3",
    "tree_method": "hist",      # 🔥 much faster
    "seed":        42,
    "n_jobs":      -1,
    'eta': 0.21097440411031887,
    'max_depth': 11,
    'min_child_weight': 14,
    'subsample': 0.9941042210100538,
    'colsample_bytree': 0.6866804869216795,
    'gamma': 1.7235898102326062,
    'lambda': 1.0938350888836036e-05,
    'alpha': 0.23093425602077636,
    'num_boost_round': 1200,
    'device':'cuda'
}


In [5]:
cv_stats = pd.read_csv('model/cv_results.csv')
print('mean top@3 - ', round(cv_stats['val-top@3'].mean(), 4))
print('std top@3 - ', round(cv_stats['val-top@3'].std(), 4))
print('mean ndcg@3 - ', round(cv_stats['val-ndcg@3'].mean(), 4))
print('std ndcg@3 - ', round(cv_stats['val-ndcg@3'].std(), 4))

mean top@3 -  0.4192
std top@3 -  0.0027
mean ndcg@3 -  0.3848
std ndcg@3 -  0.002


In [5]:
feature_list = ['totalPrice', 'ranker_id', 'selected', 'company_count', 'route_distance_km', 'frequentFlyer_n_programs', 'unique_ranker_count']

In [None]:
top_sigma = np.std(pd.read_csv('model/cv_results.csv')['val-top@3']), np.mean(pd.read_csv('model/cv_results.csv')['val-top@3'])
meta, df = evaluate_feature(data[:n2][feature_list], 
                       new_feature_name='unique_ranker_count', 
                       group_col='ranker_id', 
                       label_col='selected', 
                       baseline_model='model/base.json', 
                       params=params, 
                       sigma0=top_sigma[1],
                       corr_threshold=0.00,
                       cat_features_final=cat_features_final)
meta

top@3 before -  0.2258  top@3 after -  0.2237
ndcg@3 before -  0.2622  top@3 after -  0.2614
mean top@3 -  0.2239
std top@3 -  0.003
mean ndcg@3 -  0.2621
std ndcg@3 -  0.0012


{'corr': np.float64(0.005832222368000487),
 'warm_delta': np.float64(-0.0021618937392779147),
 'mini_delta': None}

In [47]:
if round(cv_stats['val-top@3'].mean(), 4) < round(df['val-top@3'].mean(), 4):
    print('accepted by top@3')
if round(cv_stats['val-ndcg@3'].mean(), 4)  < round(df['val-ndcg@3'].mean(), 4):
    print('accepted by ncdg3')

accepted by ncdg3


In [43]:
df.to_pandas().to_csv('model/cv_results.csv')

In [4]:
data_xgb=data[:n2] 
features=feature_list 
label_col='selected' 
group_col='ranker_id' 
params=params
num_boost_round=1200
baseline_model_path='model/'
seed=42
verbose_eval_size=100
full_cv = False
cat_features_final=cat_features_final
def get_group_sizes(ranker_ids: np.ndarray) -> np.ndarray:
    # unique in order of first appearance + their counts
    uniq, idx, counts = np.unique(ranker_ids, return_index=True, return_counts=True)
    return counts[np.argsort(idx)]
data_xgb = data[:n2].with_columns([
                            (pl.col(c).rank("dense") - 1)
                            .fill_null(-1)
                            .cast(pl.Int32)
                            .alias(c)
                            for c in cat_features_final
                        ])
rng = np.random.default_rng(seed=seed)
groups = data_xgb.select('ranker_id').unique().to_numpy().ravel()
val_groups = rng.choice(groups, size=max(1, int(0.001 * len(groups))), replace=False)
val_df = data_xgb.filter(pl.col('ranker_id').is_in(val_groups))
train_df = data_xgb.filter(~pl.col('ranker_id').is_in(val_groups))


y_val        = val_df.select(label_col).to_numpy().ravel()
y_train        = train_df.select(label_col).to_numpy().ravel()

feature_cols_all = [c for c in features if c not in {label_col, group_col}]

dtrain = xgb.DMatrix(train_df.select(feature_cols_all).to_numpy(),
                        label=y_train,
                        group=get_group_sizes(train_df.select(group_col)),
                        feature_names=feature_cols_all)

dval = xgb.DMatrix(val_df.select(feature_cols_all).to_numpy(),
                        label=y_val,
                        group=get_group_sizes(val_df.select(group_col)),
                        feature_names=feature_cols_all)

tiny_model = xgb.train(
    params,
    dtrain,
    #early_stopping_rounds=100,
    evals=[(dtrain, 'train'), (dval, 'val')],
    num_boost_round=num_boost_round,
    verbose_eval=100,
)

Parameters: { "num_boost_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	train-ndcg@3:0.35275	val-ndcg@3:0.31621
[100]	train-ndcg@3:0.64204	val-ndcg@3:0.44954
[200]	train-ndcg@3:0.73916	val-ndcg@3:0.44954
[300]	train-ndcg@3:0.82917	val-ndcg@3:0.48889
[400]	train-ndcg@3:0.84049	val-ndcg@3:0.50544
[500]	train-ndcg@3:0.84049	val-ndcg@3:0.50544
[600]	train-ndcg@3:0.84220	val-ndcg@3:0.49592
[700]	train-ndcg@3:0.87978	val-ndcg@3:0.51270
[800]	train-ndcg@3:0.88120	val-ndcg@3:0.51270
[900]	train-ndcg@3:0.88335	val-ndcg@3:0.51394
[1000]	train-ndcg@3:0.88404	val-ndcg@3:0.51043
[1100]	train-ndcg@3:0.88404	val-ndcg@3:0.51043
[1199]	train-ndcg@3:0.88404	val-ndcg@3:0.51043


In [5]:
preds = tiny_model.predict(dval)

NameError: name 'tiny_model' is not defined

In [5]:
xgb_importance = tiny_model.get_score(importance_type='gain')
xgb_importance_df = pl.DataFrame(
    [{'feature': k, 'importance': v} for k, v in xgb_importance.items()]
).sort('importance', descending=bool(1))

In [9]:
score = hitrate_at_3(dval.get_label(), preds, val_df.select(group_col).to_series().to_numpy())

In [10]:
score

0.6524184346199036

In [None]:
res = train_base_model(data=data[:n2], 
                 features=feature_list, 
                 label_col='selected', 
                 group_col='ranker_id', 
                 params=params, 
                 num_boost_round=1500, 
                 baseline_model_path='model/', 
                 seed=42, 
                 verbose_eval_size=5, 
                 full_cv = False,
                 cat_features_final=cat_features_final)

res

Parameters: { "num_boost_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.4694783687591553


In [5]:
booster = xgb.Booster()
booster.load_model('model/base.json')
booster = tiny_model

In [7]:
def get_group_sizes(ranker_ids: np.ndarray) -> np.ndarray:
    # unique in order of first appearance + their counts
    uniq, idx, counts = np.unique(ranker_ids, return_index=True, return_counts=True)
    return counts[np.argsort(idx)]
data_xgb = data[n2:].with_columns([
                                (pl.col(c).rank("dense") - 1)
                                .fill_null(-1)
                                .cast(pl.Int32)
                                .alias(c)
                                for c in cat_features_final
                            ])
label_col = 'selected'
group_col = 'ranker_id'
feature_cols_all = feature_list
data_test = xgb.DMatrix(data_xgb.select(feature_cols_all).to_numpy(), feature_names=feature_cols_all, group=get_group_sizes(data_xgb.select(group_col)))

In [8]:
preds = booster.predict(data_test)

In [9]:
test = pl.read_parquet('data/test.parquet')

In [10]:
submission_xgb = (
    test.select(['Id', 'ranker_id'])
    .with_columns(pl.Series('pred_score', preds))
    .with_columns(
        pl.col('pred_score')
        .rank(method='ordinal', descending=True)
        .over('ranker_id')
        .cast(pl.Int32)
        .alias('selected')
    )
    .select(['Id', 'ranker_id', 'selected', 'pred_score'])
)

In [11]:
submission_xgb.write_csv('submission_v4.csv')

In [4]:
import optuna
LABEL_COL = "selected"          # binary 1/0 – chosen by the user
GROUP_COL = "ranker_id"         # search/session id
FEATURE_COLUMNS = feature_list
CAT_FEATURES_FINAL = cat_features_final
def get_group_sizes(ranker_ids: np.ndarray) -> np.ndarray:
    """Return group‐sizes array for XGBoost ranking.

    Parameters
    ----------
    ranker_ids : np.ndarray
        1‑D array with group id (ranker_id) of every row in the same order as X.

    Returns
    -------
    np.ndarray
        Array of group sizes, ordered by the first appearance of each id (exactly
        what xgb.DMatrix wants for ranking).
    """
    uniq, idx, counts = np.unique(ranker_ids, return_index=True, return_counts=True)
    return counts[np.argsort(idx)]
def train_single_split(
    data_xgb: pl.DataFrame,
    params: dict,
    num_boost_round: int,
    seed: int = 42,
) -> float:
    """Train XGBoost on a random 80/20 group‑based split and return hit@3."""

    # Ordinal‑encode categorical features ------------------------------------------------


    rng = np.random.default_rng(seed)
    groups_unique = data_xgb.select(GROUP_COL).unique().to_numpy().ravel()
    val_groups = rng.choice(groups_unique, size=max(1, int(0.2 * len(groups_unique))), replace=False)

    val_df = data_xgb.filter(pl.col(GROUP_COL).is_in(val_groups))
    train_df = data_xgb.filter(~pl.col(GROUP_COL).is_in(val_groups))

    feature_cols = [c for c in FEATURE_COLUMNS if c not in {LABEL_COL, GROUP_COL}]

    dtrain = xgb.DMatrix(
        train_df.select(feature_cols).to_numpy(),
        label=train_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(train_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )

    dval = xgb.DMatrix(
        val_df.select(feature_cols).to_numpy(),
        label=val_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(val_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    # NDCG@3 produced by XGBoost is geometric mean across groups; for hit@3 we compute manually
    preds = bst.predict(dval)

    # Hit@3 – at least one correct item in top‑3 per group ------------------
    # Reconstruct group slices
    ranker_ids = val_df.select(GROUP_COL).to_numpy().ravel()
    labels = dval.get_label()
    order = np.argsort(ranker_ids, kind="stable")
    ranker_ids, labels, preds = ranker_ids[order], labels[order], preds[order]

    hit_total, group_total = 0, 0
    start = 0
    while start < len(ranker_ids):
        group = ranker_ids[start]
        end = start
        while end < len(ranker_ids) and ranker_ids[end] == group:
            end += 1
        if (end - start) > 10:
            group_labels = labels[start:end]
            group_preds = preds[start:end]
            top3_idx = np.argsort(group_preds)[::-1][:3]
            hit_total += int(group_labels[top3_idx].sum() > 0)
            group_total += 1
        start = end
    return hit_total / group_total


# ---------------------------------------------------------------------------
# Optuna objective ----------------------------------------------------------
# ---------------------------------------------------------------------------

def objective(trial: optuna.Trial, data: pl.DataFrame) -> float:
    params = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg@3",
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "tree_method": "hist",
        "max_bin": 128,
        "seed": 42,
        'device': 'cuda'
    }

    num_boost_round = trial.suggest_int("num_boost_round", 300, 1500, step=100)
    data_xgb = data.with_columns([
    (pl.col(c).rank("dense") - 1)  # rank starts at 1 → shift to 0
    .fill_null(-1)
    .cast(pl.Int32)
    .alias(c)
    for c in CAT_FEATURES_FINAL
    ])
    hit_at_3 = train_single_split(
        data_xgb=data_xgb,
        params=params,
        num_boost_round=num_boost_round,
        seed=42,
    )

    return hit_at_3  # maximise



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from pathlib import Path
import optuna
store_dir = Path("optuna_results")
store_dir.mkdir(parents=True, exist_ok=True)

# 2) делаем корректный абсолютный URI
storage_uri = f"sqlite:///{(store_dir / 'study.db').resolve()}"

In [None]:
# 3) создаём или открываем Study
study = optuna.create_study(
    study_name="xgb_search",
    direction="maximize",
    storage=storage_uri,
    load_if_exists=True
)

In [6]:
study = optuna.load_study(
    study_name = "xgb_search",
    storage    = storage_uri
)

In [7]:
study.optimize(lambda trial: objective(trial, data[:n2]), n_trials=50, show_progress_bar=True, gc_after_trial=True)

# ---------------------------------------------------------------------
# 3) Report best params & save
# ---------------------------------------------------------------------
print("\nBest hit@3:", round(study.best_value, 4))
print("Best params:\n", study.best_params)

out_dir = Path("optuna_results")
out_dir.mkdir(exist_ok=True, parents=True)
(out_dir / "best_params.json").write_text(study.best_trial.params.__repr__())

Best trial: 4. Best value: 0.543184:   2%|▏         | 1/50 [02:05<1:42:41, 125.75s/it]

[I 2025-07-31 15:11:50,446] Trial 8 finished with value: 0.4662334777296457 and parameters: {'eta': 0.054089595287261844, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.7565366851178921, 'colsample_bytree': 0.9452070555978844, 'gamma': 3.1517328032733167, 'lambda': 2.9844649491816266e-07, 'alpha': 0.006713227456176927, 'num_boost_round': 300}. Best is trial 4 with value: 0.5431843575418994.


Best trial: 4. Best value: 0.543184:   4%|▍         | 2/50 [04:50<1:58:55, 148.65s/it]

[I 2025-07-31 15:14:35,136] Trial 9 finished with value: 0.45340401785714285 and parameters: {'eta': 0.0320284992549291, 'max_depth': 6, 'min_child_weight': 6, 'subsample': 0.7204405189364425, 'colsample_bytree': 0.7958177859102721, 'gamma': 0.608829736013396, 'lambda': 0.00010011110392002817, 'alpha': 0.005916200202277913, 'num_boost_round': 600}. Best is trial 4 with value: 0.5431843575418994.


Best trial: 4. Best value: 0.543184:   6%|▌         | 3/50 [07:16<1:55:35, 147.57s/it]

[I 2025-07-31 15:17:01,410] Trial 10 finished with value: 0.47411777802515864 and parameters: {'eta': 0.17529212513767822, 'max_depth': 5, 'min_child_weight': 9, 'subsample': 0.7938616749364508, 'colsample_bytree': 0.9590306958501253, 'gamma': 4.46323441898894, 'lambda': 0.2115451133027719, 'alpha': 0.01017524009021287, 'num_boost_round': 1300}. Best is trial 4 with value: 0.5431843575418994.


Best trial: 4. Best value: 0.543184:   8%|▊         | 4/50 [08:08<1:24:06, 109.70s/it]

[I 2025-07-31 15:17:53,136] Trial 11 finished with value: 0.38417322395222947 and parameters: {'eta': 0.020700655209104434, 'max_depth': 6, 'min_child_weight': 20, 'subsample': 0.9704843297697588, 'colsample_bytree': 0.9432714376950468, 'gamma': 4.963358092742362, 'lambda': 5.835599300879498, 'alpha': 2.994447468599529e-05, 'num_boost_round': 300}. Best is trial 4 with value: 0.5431843575418994.


Best trial: 12. Best value: 0.562081:  10%|█         | 5/50 [09:55<1:21:33, 108.75s/it]

[I 2025-07-31 15:19:40,151] Trial 12 finished with value: 0.5620812862885216 and parameters: {'eta': 0.2713322362860683, 'max_depth': 12, 'min_child_weight': 1, 'subsample': 0.9003033337492424, 'colsample_bytree': 0.6264713043381429, 'gamma': 2.271616000943315, 'lambda': 3.3617174705593664e-08, 'alpha': 3.856122179288259, 'num_boost_round': 900}. Best is trial 12 with value: 0.5620812862885216.


Best trial: 12. Best value: 0.562081:  12%|█▏        | 6/50 [11:29<1:16:05, 103.75s/it]

[I 2025-07-31 15:21:14,210] Trial 13 finished with value: 0.5540683732084101 and parameters: {'eta': 0.2872260842966097, 'max_depth': 12, 'min_child_weight': 1, 'subsample': 0.8977515535078602, 'colsample_bytree': 0.6304803694467846, 'gamma': 2.4643971147913613, 'lambda': 1.2484361732743704e-08, 'alpha': 6.104132168523236, 'num_boost_round': 1500}. Best is trial 12 with value: 0.5620812862885216.


Best trial: 12. Best value: 0.562081:  14%|█▍        | 7/50 [13:30<1:18:24, 109.40s/it]

[I 2025-07-31 15:23:15,232] Trial 14 finished with value: 0.5428714412811388 and parameters: {'eta': 0.29415708903682625, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.8956856424839637, 'colsample_bytree': 0.6037125758118901, 'gamma': 2.4994054703232487, 'lambda': 1.4499523533667203e-08, 'alpha': 7.9739094962389325, 'num_boost_round': 1500}. Best is trial 12 with value: 0.5620812862885216.


Best trial: 12. Best value: 0.562081:  16%|█▌        | 8/50 [15:35<1:20:08, 114.49s/it]

[I 2025-07-31 15:25:20,658] Trial 15 finished with value: 0.5493961150999054 and parameters: {'eta': 0.2858193348280146, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8982564777838123, 'colsample_bytree': 0.6105286184222108, 'gamma': 2.4798772576540773, 'lambda': 1.5817695489804202e-08, 'alpha': 6.834227828244898, 'num_boost_round': 900}. Best is trial 12 with value: 0.5620812862885216.


Best trial: 12. Best value: 0.562081:  18%|█▊        | 9/50 [17:53<1:23:08, 121.68s/it]

[I 2025-07-31 15:27:38,112] Trial 16 finished with value: 0.5368632707774799 and parameters: {'eta': 0.16333806254628663, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8905474836085736, 'colsample_bytree': 0.6749431070677041, 'gamma': 3.2706149415610692, 'lambda': 0.000947496063024176, 'alpha': 0.29170620064939495, 'num_boost_round': 1000}. Best is trial 12 with value: 0.5620812862885216.


Best trial: 17. Best value: 0.562658:  20%|██        | 10/50 [20:10<1:24:20, 126.50s/it]

[I 2025-07-31 15:29:55,370] Trial 17 finished with value: 0.5626575541986444 and parameters: {'eta': 0.21097440411031887, 'max_depth': 11, 'min_child_weight': 14, 'subsample': 0.9941042210100538, 'colsample_bytree': 0.6866804869216795, 'gamma': 1.7235898102326062, 'lambda': 1.0938350888836036e-05, 'alpha': 0.23093425602077636, 'num_boost_round': 1200}. Best is trial 17 with value: 0.5626575541986444.


Best trial: 17. Best value: 0.562658:  22%|██▏       | 11/50 [22:27<1:24:13, 129.59s/it]

[I 2025-07-31 15:32:11,949] Trial 18 finished with value: 0.5622633103141012 and parameters: {'eta': 0.12975503016044343, 'max_depth': 11, 'min_child_weight': 15, 'subsample': 0.9891604903333733, 'colsample_bytree': 0.6856003097753048, 'gamma': 1.939637792800385, 'lambda': 0.0009081994817525512, 'alpha': 0.2061072333494555, 'num_boost_round': 1200}. Best is trial 17 with value: 0.5626575541986444.


Best trial: 17. Best value: 0.562658:  24%|██▍       | 12/50 [26:04<1:38:57, 156.24s/it]

[I 2025-07-31 15:35:49,165] Trial 19 finished with value: 0.5353529738743746 and parameters: {'eta': 0.11578251152816867, 'max_depth': 9, 'min_child_weight': 16, 'subsample': 0.9966829269297759, 'colsample_bytree': 0.7041730959875934, 'gamma': 1.6355592802491676, 'lambda': 0.002978943506440341, 'alpha': 0.06258517851942154, 'num_boost_round': 1200}. Best is trial 17 with value: 0.5626575541986444.


Best trial: 17. Best value: 0.562658:  24%|██▍       | 12/50 [26:12<1:22:59, 131.03s/it]


[W 2025-07-31 15:35:57,191] Trial 20 failed with parameters: {'eta': 0.1296035972621578, 'max_depth': 11, 'min_child_weight': 16, 'subsample': 0.6675234480457631, 'colsample_bytree': 0.7018701030290455, 'gamma': 1.787834608639515, 'lambda': 1.2953335132722847e-05, 'alpha': 0.37146198362050187, 'num_boost_round': 1300} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\Work\FlightRank 2025 Aeroclub RecSys Cup\.venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\1tiii\AppData\Local\Temp\ipykernel_22964\4015532063.py", line 1, in <lambda>
    study.optimize(lambda trial: objective(trial, data[:n2]), n_trials=50, show_progress_bar=True, gc_after_trial=True)
                                 ~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "C:\Users\1tiii\AppData\Local\Temp\ipykernel_22964\3748191809.py", line 122, in objective
    hit_at_3 = train_single_split(
        data_xgb=data_x

KeyboardInterrupt: 

In [10]:
study.best_params

{'eta': 0.21097440411031887,
 'max_depth': 11,
 'min_child_weight': 14,
 'subsample': 0.9941042210100538,
 'colsample_bytree': 0.6866804869216795,
 'gamma': 1.7235898102326062,
 'lambda': 1.0938350888836036e-05,
 'alpha': 0.23093425602077636,
 'num_boost_round': 1200}

In [9]:
study.trials

[FrozenTrial(number=0, state=0, values=None, datetime_start=datetime.datetime(2025, 7, 31, 14, 51, 46, 274228), datetime_complete=None, params={'eta': 0.013782869342439474, 'max_depth': 12, 'min_child_weight': 18, 'subsample': 0.9104912727780459, 'colsample_bytree': 0.7325712310823745, 'gamma': 0.6878229253767104, 'lambda': 1.0430375163161512, 'alpha': 1.657867169817773e-06, 'num_boost_round': 1300}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'eta': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'max_depth': IntDistribution(high=12, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=20, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'lambda': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'alpha': FloatDistribution(

In [5]:
import numpy as np
import polars as pl
import xgboost as xgb
from copy import deepcopy
from typing import List, Tuple


# ─────────────────────────────────────────────────────────────────────────
# 1) Train / evaluate on a subset of columns
# ─────────────────────────────────────────────────────────────────────────
def train_single_split_subset(
    data_xgb: pl.DataFrame,
    feature_cols: List[str],
    params: dict,
    num_boost_round: int,
    seed: int = 42,
) -> float:
    """Return hit@3 on an 80/20 group‑based split using only feature_cols."""
    rng = np.random.default_rng(seed)
    groups_unique = data_xgb.select(GROUP_COL).unique().to_numpy().ravel()
    val_groups = rng.choice(
        groups_unique, size=max(1, int(0.2 * len(groups_unique))), replace=False
    )

    val_df   = data_xgb.filter(pl.col(GROUP_COL).is_in(val_groups))
    train_df = data_xgb.filter(~pl.col(GROUP_COL).is_in(val_groups))

    dtrain = xgb.DMatrix(
        train_df.select(feature_cols).to_numpy(),
        label=train_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(train_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )
    dval = xgb.DMatrix(
        val_df.select(feature_cols).to_numpy(),
        label=val_df.select(LABEL_COL).to_numpy().ravel(),
        group=get_group_sizes(val_df.select(GROUP_COL).to_numpy().ravel()),
        feature_names=feature_cols,
    )

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    # --- hit@3 -----------------------------------------------------------
    preds       = bst.predict(dval)
    ranker_ids  = val_df.select(GROUP_COL).to_numpy().ravel()
    labels      = dval.get_label()

    order = np.argsort(ranker_ids, kind="stable")
    ranker_ids, labels, preds = ranker_ids[order], labels[order], preds[order]

    hit_total = group_total = 0
    i = 0
    while i < len(ranker_ids):
        g = ranker_ids[i]
        j = i
        while j < len(ranker_ids) and ranker_ids[j] == g:
            j += 1
        if (j - i) > 10:                       # skip tiny groups
            top3 = np.argsort(preds[i:j])[::-1][:3]
            hit_total += int(labels[i:j][top3].sum() > 0)
            group_total += 1
        i = j
    return hit_total / group_total if group_total else 0.0


# ─────────────────────────────────────────────────────────────────────────
# 2) Greedy step‑forward selection
# ─────────────────────────────────────────────────────────────────────────
def step_forward_selection(
    data_xgb: pl.DataFrame,
    all_features: List[str],
    params: dict,
    num_boost_round: int,
    seed: int = 42,
    min_improvement: float = 1e-4,
) -> Tuple[List[str], float]:
    """
    Greedy SFS: start with empty set, add best column each round.
    Stops when no remaining feature improves hit@3 by > min_improvement.
    Returns (selected_features, best_score).
    """
    remaining = set(all_features)
    selected: List[str] = []
    best_score = 0.0

    while remaining:
        trial_scores = {}
        for feat in remaining:
            score = train_single_split_subset(
                data_xgb,
                selected + [feat],
                deepcopy(params),          # XGBoost mutates dict → copy!
                num_boost_round,
                seed,
            )
            trial_scores[feat] = score
            
        # pick the feature with the highest score
        feat_best, score_best = max(trial_scores.items(), key=lambda kv: kv[1])

        # check if it is a real improvement
        if score_best - best_score < min_improvement:
            print(f"Stopping: no feature improves hit@3 by ≥{min_improvement:.1e}")
            break

        selected.append(feat_best)
        remaining.remove(feat_best)
        best_score = score_best
        print(f"Added {feat_best:>30}  →  hit@3 = {best_score:.4f}")

    return selected, best_score


In [6]:
params = {
"objective":   "rank:pairwise",
"eval_metric": "ndcg@3",
"tree_method": "hist",      # 🔥 much faster
"seed":        42,
"n_jobs":      -1,
'eta': 0.11258528754406343,
 'max_depth': 7,
 'min_child_weight': 9,
 'subsample': 0.6386911919274852,
 'colsample_bytree': 0.8167665572930206,
 'gamma': 3.3978723412953182,
 'lambda': 0.0015573270572558455,
 'alpha': 0.0007046464277948389,
 'device': 'cuda'}

In [7]:
data_ready = data.with_columns([
(pl.col(c).rank("dense") - 1)  # rank starts at 1 → shift to 0
.fill_null(-1)
.cast(pl.Int32)
.alias(c)
for c in cat_features_final
])
chosen_feats, final_hit3 = step_forward_selection(
     data_xgb         = data_ready,
     all_features     = feature_list,
     params           = params,
     num_boost_round  = 1000,
     seed             = 42,
 )

Added               opt_ticket_score  →  hit@3 = 0.5022
Added      miniRules1_monetaryAmount  →  hit@3 = 0.5263
Added               avg_oneway_price  →  hit@3 = 0.5383
Added                      log_price  →  hit@3 = 0.5562
Added                   company_freq  →  hit@3 = 0.5749
Added       legs0_departureAt_period  →  hit@3 = 0.5813
Added       legs1_departureAt_period  →  hit@3 = 0.5962


KeyboardInterrupt: 