In [1]:
import sys
import os
from dotenv import load_dotenv

root_dir = os.path.abspath("..")
sys.path.append(root_dir)
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [23]:
import xgboost as xgb
import polars as pl
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from datetime import datetime

In [3]:
train_df = pl.read_parquet(os.path.join(root_dir, "data", "v1", "train.parquet"))
train_target = pl.read_parquet(os.path.join(root_dir, "data", "v1", "train_target.parquet"))
test_df = pl.read_parquet(os.path.join(root_dir, "data", "v1", "test.parquet"))

In [4]:
train_df_seg = train_df.select([
    "row_id",
    "ranker_id",
    
    "go_seg_number",
    "go_bag_allow_0",
    "go_travel_distance",
    "go_total_flight_time",
    "go_exchange_wait",
    "go_airline_is_lcc_0",
    "go_international",
    
    "rtn_seg_number",
    "rtn_bag_allow_0",
    "rtn_travel_distance",
    "rtn_total_flight_time",
    "rtn_exchange_wait",
    "rtn_airline_is_lcc_0", # Major carrier?
    
    "is_vip",
    "is_access_tp",
    "is_access_3d",
    "has_corporate_tariff",
    "frequent_flyer_count",
    "any_segment_in_ff",

    "price_rank",
    "price_delta",
    "departure_rank",
    "is_preferred_airline",
    "lead_time_days",
    "departure_month",
])

test_df_seg = test_df.select([
    "row_id",
    "ranker_id",
    
    "go_seg_number",
    "go_bag_allow_0",
    "go_travel_distance",
    "go_total_flight_time",
    "go_exchange_wait",
    "go_airline_is_lcc_0",
    "go_international",
    
    "rtn_seg_number",
    "rtn_bag_allow_0",
    "rtn_travel_distance",
    "rtn_total_flight_time",
    "rtn_exchange_wait",
    "rtn_airline_is_lcc_0", # Major carrier?
    
    "is_vip",
    "is_access_tp",
    "is_access_3d",
    "has_corporate_tariff",
    "frequent_flyer_count",
    "any_segment_in_ff",

    "price_rank",
    "price_delta",
    "departure_rank",
    "is_preferred_airline",
    "lead_time_days",
    "departure_month",
])

In [5]:
ranker_ids = train_df_seg.select("ranker_id").unique()

In [6]:
ranker_id_train, ranker_id_valid = train_test_split(
    ranker_ids.to_numpy().reshape(-1),
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [7]:
train_mask = train_df_seg["ranker_id"].is_in(ranker_id_train)
valid_mask = train_df_seg["ranker_id"].is_in(ranker_id_valid)

train_idx = train_mask.to_numpy().nonzero()[0]
valid_idx = valid_mask.to_numpy().nonzero()[0]

In [8]:
X_train, X_val = train_df_seg[train_idx], train_df_seg[valid_idx]
y_train, y_val = train_target[train_idx], train_target[valid_idx]

In [9]:
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

X_train: (14575710, 27), y_train: (14575710, 2)
X_val: (3569662, 27), y_val: (3569662, 2)


In [10]:
X_train_np = X_train.drop("ranker_id").drop("row_id").to_numpy()
y_train_np = y_train['selected'].to_numpy()

X_val_np = X_val.drop("ranker_id").drop("row_id").to_numpy()
y_val_np = y_val['selected'].to_numpy()

In [11]:
X_test_np = test_df_seg.drop("ranker_id").drop("row_id").to_numpy()

In [12]:
train_group_sizes = X_train.group_by("ranker_id").len().sort("ranker_id")["len"].to_numpy()
val_group_sizes = X_val.group_by("ranker_id").len().sort("ranker_id")["len"].to_numpy()

In [13]:
dtrain = xgb.DMatrix(X_train_np, label=y_train_np)
dtrain.set_group(train_group_sizes)

dval = xgb.DMatrix(X_val_np, label=y_val_np)
dval.set_group(val_group_sizes)

In [14]:
dtest = xgb.DMatrix(X_test_np)

#### Optuna - hyperparameter tuning

In [24]:
# Optimize the hyper parameter using optuna
def groupwise_ndcg(y_true, y_pred, group_ids, k=3):
    df = pd.DataFrame({
        "group": group_ids,
        "y_true": y_true,
        "y_pred": y_pred
    })

    grouped = df.groupby("group")

    scores = grouped.apply(
        lambda g: ndcg_score(
            [g["y_true"].values],
            [g["y_pred"].values],
            k=k
        ) if len(g) >= 2 else None
    ).dropna()

    return scores.mean() if not scores.empty else 0.0


def objective(trial):
    param = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg@3",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 6, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "gamma": trial.suggest_float("gamma", 0.01, 10, log=True),
        "lambda": trial.suggest_float("lambda", 0.01, 10, log=True),
        "alpha": trial.suggest_float("alpha", 0.01, 5, log=True),
        "verbosity": 1,
        "tree_method": "hist",
        "n_jobs": -1,
        "seed": 42,
    }

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, "eval")],
    )

    preds = model.predict(dval)
    score = groupwise_ndcg(y_val_np, preds, X_val['ranker_id'].to_numpy(), k=3)
    return score

In [None]:
study = optuna.create_study(
    direction="maximize", 
    storage=f"sqlite:///{os.path.join(root_dir, 'db.sqlite3')}", 
    study_name=f'xgb_{datetime.now().strftime('%Y%m%d_%H%M%S')}'
)
study.optimize(objective, n_trials=50)  # ⏱️ takes time

print("[BEST PARAMS]")
print(study.best_params)
print("Best NDCG@3:", study.best_value)

[I 2025-07-29 13:03:29,482] A new study created in RDB with name: xgb_20250729_130329


[0]	eval-ndcg@3:0.78627
[1]	eval-ndcg@3:0.79452
[2]	eval-ndcg@3:0.79782
[3]	eval-ndcg@3:0.79914
[4]	eval-ndcg@3:0.80027
[5]	eval-ndcg@3:0.80080
[6]	eval-ndcg@3:0.80108
[7]	eval-ndcg@3:0.80141
[8]	eval-ndcg@3:0.80218
[9]	eval-ndcg@3:0.80271
[10]	eval-ndcg@3:0.80300
[11]	eval-ndcg@3:0.80290
[12]	eval-ndcg@3:0.80313
[13]	eval-ndcg@3:0.80318
[14]	eval-ndcg@3:0.80315
[15]	eval-ndcg@3:0.80343
[16]	eval-ndcg@3:0.80401
[17]	eval-ndcg@3:0.80389
[18]	eval-ndcg@3:0.80438
[19]	eval-ndcg@3:0.80478
[20]	eval-ndcg@3:0.80470
[21]	eval-ndcg@3:0.80430
[22]	eval-ndcg@3:0.80509
[23]	eval-ndcg@3:0.80550
[24]	eval-ndcg@3:0.80531


In [None]:
# XGBoost parameters
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    "learning_rate": 0.025,
    "max_depth": 14,
    "min_child_weight": 2,
    "subsample": 0.8842234913702768,
    "colsample_bytree": 0.45840689146263086,
    "gamma": 3.3084297630544888,
    "lambda": 6.952586917313028,
    "alpha": 0.6395254133055179,
    'seed': 42,
    'n_jobs': -1,
    # 'device': 'cuda'
}

watchlist = [(dtrain, "train"), (dval, "eval")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    # early_stopping_rounds=100,
)

In [None]:
xgb_importance = model.get_score(importance_type='gain')
xgb_importance_df = pl.DataFrame(
    [{'feature': k, 'importance': v} for k, v in xgb_importance.items()]
).sort('importance', descending=bool(1))
print(xgb_importance_df.to_pandas().to_string())

In [None]:
pred_scores = model.predict(dtest)

In [None]:
original = pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))
original = (
    original
    .select(["Id", "ranker_id"])
    .with_columns(
        pl.Series("score", pred_scores)
    )
    .with_columns(
        pl.col('score')
        .rank(method='ordinal', descending=True)
        .over('ranker_id')
        .alias('selected')
    )
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64), 
        pl.col("selected").cast(pl.Int64)
    )
)

In [None]:
from datetime import datetime
original.write_csv(os.path.join(root_dir, "submission", f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"))