In [1]:
import sys
import os
from dotenv import load_dotenv

# The project root is the parent of the current working directory.
root_dir = os.path.abspath("..")
dotenv_path = os.path.join(root_dir, ".env")

# Add the project root to the import search path.
sys.path.append(root_dir)

# Kept as the last expression so the cell displays load_dotenv's return value.
load_dotenv(dotenv_path)

True

In [2]:
import xgboost as xgb
import polars as pl
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# All three input frames are read from <root>/data/v2.
data_dir = os.path.join(root_dir, "data", "v2")

train_df = pl.read_parquet(os.path.join(data_dir, "train.parquet"))
train_target = pl.read_parquet(os.path.join(data_dir, "train_target.parquet"))
test_df = pl.read_parquet(os.path.join(data_dir, "test.parquet"))

In [4]:
# Columns kept from both the train and the test frame. Declared once so the two
# selections below always stay identical (same columns, same order).
SEGMENT_COLUMNS = [
    # Identifier columns
    "row_id",
    "ranker_id",

    # go_* columns
    "go_seg_number",
    "go_bag_allow_0",
    "go_travel_distance",
    "go_total_flight_time",
    "go_exchange_wait",
    "go_airline_is_lcc_0",
    "go_international",

    # rtn_* columns
    "rtn_seg_number",
    "rtn_bag_allow_0",
    "rtn_travel_distance",
    "rtn_total_flight_time",
    "rtn_exchange_wait",
    "rtn_airline_is_lcc_0", # Major carrier?

    # Flags and counters
    "is_vip",
    "is_access_tp",
    "is_access_3d",
    "has_corporate_tariff",
    "frequent_flyer_count",
    "any_segment_in_ff",

    # Remaining columns
    "price_rank",
    "price_delta",
    "departure_rank",
    "is_preferred_airline",
    "lead_time_days",
    "departure_month",
]

train_df_seg = train_df.select(SEGMENT_COLUMNS)
test_df_seg = test_df.select(SEGMENT_COLUMNS)

In [5]:
# One row per distinct ranker_id. Polars' `unique()` does not guarantee the order
# of the rows it returns, so the result is sorted: the seeded train/validation
# split then receives the ids in the same order on every run and is reproducible.
ranker_ids = train_df_seg.select("ranker_id").unique().sort("ranker_id")

In [6]:
# Report how many rows `ranker_ids` holds (one per distinct ranker_id).
print(f"I have {len(ranker_ids)} ranker_ids")

I have 105539 ranker_ids


In [7]:
# Split at the ranker_id level (not the row level) so that every row of a given
# ranker_id ends up on the same side of the train/validation split.
all_ranker_ids = ranker_ids.to_numpy().reshape(-1)

ranker_id_train, ranker_id_valid = train_test_split(
    all_ranker_ids, test_size=0.35, random_state=42, shuffle=True
)

print("Train size:", len(ranker_id_train))
print("Valid size:", len(ranker_id_valid))

Train size: 68600
Valid size: 36939


In [8]:
# Boolean row masks: does the row's ranker_id belong to the train / validation ids?
train_mask, valid_mask = (
    train_df_seg["ranker_id"].is_in(split_ids)
    for split_ids in (ranker_id_train, ranker_id_valid)
)

# Positions of the True entries, as NumPy integer arrays.
train_idx, valid_idx = (
    row_mask.to_numpy().nonzero()[0] for row_mask in (train_mask, valid_mask)
)

In [9]:
# Feature rows for each split.
X_train = train_df_seg[train_idx]
X_val = train_df_seg[valid_idx]

# Targets are taken at the same row positions.
# NOTE(review): assumes train_target is row-aligned with train_df — confirm.
y_train = train_target[train_idx]
y_val = train_target[valid_idx]

In [10]:
# Sanity check: within each split, features and targets should have the same row count.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

X_train: (11844170, 27), y_train: (11844170, 2)
X_val: (6301202, 27), y_val: (6301202, 2)


In [11]:
# The identifier columns are removed before conversion; only the remaining
# columns go into the feature matrices.
id_columns = ["ranker_id", "row_id"]

X_train_np = X_train.drop(id_columns).to_numpy()
X_val_np = X_val.drop(id_columns).to_numpy()

# Labels come from the `selected` column of the target frames.
y_train_np = y_train["selected"].to_numpy()
y_val_np = y_val["selected"].to_numpy()

In [12]:
# Test feature matrix: same identifier columns removed as for train/validation.
X_test_np = test_df_seg.drop(["ranker_id", "row_id"]).to_numpy()

In [13]:
def _group_sizes_in_row_order(frame):
    """Return the row count of each ranker_id, in the order the groups appear in ``frame``.

    XGBoost's ``set_group`` interprets the sizes positionally: the first size
    covers the first rows of the DMatrix, the second size the rows after that,
    and so on. The sizes must therefore follow the row order of the data, not
    the sort order of the ids, and all rows of one ranker_id must be adjacent.

    Args:
        frame: polars DataFrame with a ``ranker_id`` column.

    Returns:
        NumPy array with one size per ranker_id, in order of first appearance.

    Raises:
        ValueError: if the rows of some ranker_id are not contiguous.
    """
    ids = frame["ranker_id"]

    # Number of runs of consecutive identical ids. The first row compares against
    # a null (from the shift) and is counted as the start of a run.
    n_runs = int((ids != ids.shift(1)).fill_null(True).sum())
    if n_runs != ids.n_unique():
        raise ValueError(
            "Rows of the same ranker_id are not contiguous; sort the features and "
            "targets by ranker_id before building the DMatrix."
        )

    return frame.group_by("ranker_id", maintain_order=True).len()["len"].to_numpy()


train_group_sizes = _group_sizes_in_row_order(X_train)
val_group_sizes = _group_sizes_in_row_order(X_val)

In [14]:
def _ranking_dmatrix(features, labels, group_sizes):
    """Build a labelled DMatrix and attach its per-group sizes."""
    matrix = xgb.DMatrix(features, label=labels)
    matrix.set_group(group_sizes)
    return matrix


dtrain = _ranking_dmatrix(X_train_np, y_train_np, train_group_sizes)
dval = _ranking_dmatrix(X_val_np, y_val_np, val_group_sizes)

In [15]:
# Test matrix carries features only: no labels and no group sizes are attached.
dtest = xgb.DMatrix(data=X_test_np)

#### Optuna - hyperparameter tuning

In [16]:
# Optuna objective used for hyperparameter optimisation
def objective(trial):
    """Train one XGBoost ranker with trial-sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters and, through the
            pruning callback, to report intermediate ``eval-ndcg@3`` values.

    Returns:
        The trained booster's ``best_score``.
    """
    # Tuned hyperparameters. The suggest_* calls are kept in a fixed order.
    search_space = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 12, 14),
        # Higher values fight overfitting by preventing splits on tiny groups.
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 100),
        # Row subsampling forces diversity and more generalizable splits.
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.8),
        # gamma / lambda / alpha: regularisation against excessive memorization.
        "gamma": trial.suggest_float("gamma", 1e-3, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 0.1, 10, log=True),
        "alpha": trial.suggest_float("alpha", 0.01, 1.0, log=True),
    }

    # Settings shared by every trial.
    fixed_settings = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg@3",
        "verbosity": 1,
        "tree_method": "hist",
        "n_jobs": -1,
        "seed": 42,
    }
    param = {**fixed_settings, **search_space}

    # Bridges XGBoost training and Optuna pruning: the callback watches the
    # "eval-ndcg@3" value logged each round and reports it to the trial, so the
    # study's pruner can stop unpromising trials early.
    prune_callback = optuna.integration.XGBoostPruningCallback(trial, "eval-ndcg@3")

    booster = xgb.train(
        param,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dval, "eval")],
        callbacks=[prune_callback],
        early_stopping_rounds=100,
    )

    # `best_score` is only defined because early stopping is enabled above.
    return booster.best_score

In [None]:
# Timestamped study name and SQLite storage URL, built before the call.
# The name previously reused single quotes inside a single-quoted f-string,
# which is a SyntaxError on Python versions before 3.12.
study_name = f"xgb_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
storage_url = f"sqlite:///{os.path.join(root_dir, 'db.sqlite3')}"

study = optuna.create_study(
    direction="maximize",
    storage=storage_url,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    study_name=study_name,
)
study.optimize(objective, n_trials=50)  # ⏱️ takes time

print("[BEST PARAMS]")
print(study.best_params)
print("Best NDCG@3:", study.best_value)

In [None]:
# Display the best trial's hyperparameters as the cell output.
study.best_params

In [None]:
# XGBoost parameters for the final model: the fixed settings used by the Optuna
# objective plus the best hyperparameters found by the study.
# `tree_method` is set explicitly because it is not part of `study.best_params`
# (it was fixed, not tuned); without it the final model would fall back to the
# library default, which may differ from the "hist" method the trials used.
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    'tree_method': 'hist',
    'seed': 42,
    'n_jobs': -1,
    **study.best_params,
    # 'device': 'cuda'
}

# Early stopping monitors the last entry of the watchlist (the validation set).
watchlist = [(dtrain, "train"), (dval, "eval")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=100,
)

In [None]:
# The training DMatrix was built from a bare NumPy array, so XGBoost reports
# features as "f0", "f1", ... in column order. Map those keys back to the column
# names of the feature matrix (every selected column except the two identifiers).
feature_columns = [
    column for column in train_df_seg.columns if column not in ("row_id", "ranker_id")
]
feature_name_by_key = {f"f{position}": name for position, name in enumerate(feature_columns)}

xgb_importance = model.get_score(importance_type='gain')
xgb_importance_df = pl.DataFrame(
    [
        # Fall back to the raw key if it is not one of the generated "f<i>" names.
        {'feature': feature_name_by_key.get(key, key), 'importance': gain}
        for key, gain in xgb_importance.items()
    ]
).sort('importance', descending=True)
print(xgb_importance_df.to_pandas().to_string())

In [None]:
# `xgb.train` with early stopping returns the booster from the last boosting
# round, not from the best one. Restrict prediction to the trees up to and
# including the best iteration so the test scores come from the model that
# achieved the best validation score.
pred_scores = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

In [None]:
# Rank of each row inside its ranker_id: 1 for the highest predicted score.
# "ordinal" assigns distinct ranks even when scores tie.
selected_rank = (
    pl.col("score")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .alias("selected")
)

# NOTE(review): assumes the rows of kaggle/test.parquet are in the same order as
# the rows that produced `pred_scores` — confirm.
original = pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))
original = (
    original
    .select(["Id", "ranker_id"])
    .with_columns(pl.Series("score", pred_scores))
    .with_columns(selected_rank)
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64),
    )
)

In [None]:
# Create the output directory if it is missing; writing the CSV fails otherwise.
submission_dir = os.path.join(root_dir, "submission")
os.makedirs(submission_dir, exist_ok=True)

# Timestamped file name so earlier submissions are never overwritten.
# `datetime` comes from the notebook's import cell.
submission_path = os.path.join(
    submission_dir, f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
)
original.write_csv(submission_path)