In [None]:
import sys
import os
from dotenv import load_dotenv

# Project root = parent of the current working directory; data files and the
# .env file are resolved relative to it.
root_dir = os.path.abspath("..")
# Guard the append so re-running this cell does not pile up duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)
# Load environment variables from the project-level .env file.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

In [None]:
import xgboost as xgb
import polars as pl
from sklearn.model_selection import train_test_split
import optuna
from datetime import datetime

In [None]:
# Read the train, train-target and test parquet files from <root>/data/v2.
train_df, train_target, test_df = (
    pl.read_parquet(os.path.join(root_dir, "data", "v2", file_name))
    for file_name in ("train.parquet", "train_target.parquet", "test.parquet")
)

In [None]:
def _read_ranker_ids(file_name):
    """Return the `ranker_id` column of a parquet file under <root>/kaggle.

    A positional `row_id` index column is added so the ids can be joined
    onto other frames by row position.
    """
    parquet_path = os.path.join(root_dir, "kaggle", file_name)
    lazy_ids = pl.scan_parquet(parquet_path).select("ranker_id")
    return lazy_ids.collect().with_row_index("row_id")


train_ranker_ids = _read_ranker_ids("train.parquet")
test_ranker_ids = _read_ranker_ids("test.parquet")

In [None]:
# Attach each row's `ranker_id` (its query group) through the shared `row_id` key.
# NOTE(review): later cells index `train_target` and the test predictions by row
# position, so this relies on the join keeping the left frame's row order —
# confirm for the installed polars version.
train_df = train_df.join(train_ranker_ids, on="row_id")
test_df = test_df.join(test_ranker_ids, on="row_id")

In [None]:
# Distinct query groups. `unique()` gives no ordering guarantee by default, so
# keep first-appearance order; otherwise the seeded group split would receive
# differently ordered input between runs and stop being reproducible.
ranker_ids = train_df.select("ranker_id").unique(maintain_order=True)

In [None]:
# Split on group ids rather than rows, so every row of a given `ranker_id`
# ends up on the same side of the train/validation boundary.
unique_ranker_ids = ranker_ids.to_numpy().ravel()
ranker_id_train, ranker_id_valid = train_test_split(
    unique_ranker_ids,
    shuffle=True,
    random_state=41,
    test_size=0.25,
)

In [None]:
# Boolean row masks: does the row's group belong to the train / validation ids?
train_mask, valid_mask = (
    train_df["ranker_id"].is_in(group_ids)
    for group_ids in (ranker_id_train, ranker_id_valid)
)

# Positional row indices of the selected rows, in ascending order.
train_idx, valid_idx = (
    mask.to_numpy().nonzero()[0] for mask in (train_mask, valid_mask)
)

In [None]:
# Select feature rows and target rows with the same positional indices.
X_train = train_df[train_idx]
X_val = train_df[valid_idx]
y_train = train_target[train_idx]
y_val = train_target[valid_idx]

# Quick shape check of the resulting splits.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

In [None]:
def _group_sizes_in_row_order(frame):
    """Return per-`ranker_id` row counts in the order the groups appear in `frame`.

    XGBoost interprets group sizes as consecutive runs of rows, so the sizes
    must follow the row order of the matrix (not sorted-key order) and every
    group must occupy one contiguous run of rows.

    Raises:
        ValueError: if some `ranker_id` occupies more than one run of rows.
    """
    ids = frame["ranker_id"]
    # Number of runs of identical consecutive ids must equal the number of distinct ids.
    if ids.len() and ids.rle_id().max() + 1 != ids.n_unique():
        raise ValueError("rows of the same ranker_id must be contiguous")
    return frame.group_by("ranker_id", maintain_order=True).len()["len"].to_numpy()


# Feature matrices: drop the bookkeeping columns that are not model inputs.
X_train_np = X_train.drop("ranker_id", "row_id").to_numpy()
y_train_np = y_train["selected"].to_numpy()

X_val_np = X_val.drop("ranker_id", "row_id").to_numpy()
y_val_np = y_val["selected"].to_numpy()

X_test_np = test_df.drop("ranker_id", "row_id").to_numpy()

# Group sizes aligned with the row order of the matrices above.
train_group_sizes = _group_sizes_in_row_order(X_train)
val_group_sizes = _group_sizes_in_row_order(X_val)
test_group_sizes = _group_sizes_in_row_order(test_df)

In [None]:
# Show the per-group row counts of each split.
for label, sizes in (
    ("Train group:", train_group_sizes),
    ("Validation group:", val_group_sizes),
    ("Test group:", test_group_sizes),
):
    print(label, sizes)

In [None]:
def _ranking_dmatrix(features, group_sizes, labels=None):
    """Wrap a feature matrix in a DMatrix and attach its query-group sizes."""
    matrix = xgb.DMatrix(features, label=labels)
    matrix.set_group(group_sizes)
    return matrix


dtrain = _ranking_dmatrix(X_train_np, train_group_sizes, y_train_np)
dval = _ranking_dmatrix(X_val_np, val_group_sizes, y_val_np)
# The test matrix carries no labels.
dtest = _ranking_dmatrix(X_test_np, test_group_sizes)

In [None]:
# Tune the hyperparameters with Optuna
def hitrate_at_3(y_true, y_pred, groups):
    """Mean, over large groups, of the best label among each group's top-3 scored rows.

    Args:
        y_true: per-row labels.
        y_pred: per-row scores; higher scores rank earlier.
        groups: per-row group keys.

    Returns:
        For every group with more than 10 rows, the maximum label among its
        three highest-scored rows is taken; the mean of those maxima is returned.
    """
    scored = pl.DataFrame({"group": groups, "pred": y_pred, "true": y_true})

    # Only groups with more than 10 rows take part in the metric.
    group_size = pl.col("group").count().over("group")
    large_groups = scored.filter(group_size > 10)

    # Order each group by descending score and keep its first three rows.
    ranked = large_groups.sort(["group", "pred"], descending=[False, True])
    top_three = ranked.group_by("group", maintain_order=True).head(3)

    # Per group: the largest label among the retained rows; then average over groups.
    best_label_per_group = top_three.group_by("group").agg(pl.col("true").max())
    return best_label_per_group.select(pl.col("true").mean()).item()

def objective(trial):
    """Optuna objective: train a pairwise XGBoost ranker and return validation NDCG@3.

    Args:
        trial: the Optuna trial used to sample hyperparameters and report
            intermediate scores for pruning.

    Returns:
        The booster's `best_score`, i.e. the best `eval-ndcg@3` seen before
        early stopping. Hitrate@3 on the validation split is printed as a
        side effect but is not part of the optimised value.
    """
    param = {
        "objective": "rank:pairwise",
        "eval_metric": "ndcg@3",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 12, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 100),  # Increase to prevent overfit. Prevent splits on tiny groups
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),  # Force diversity. Force generalizable splits
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.8),
        "gamma": trial.suggest_float("gamma", 1, 10.0, log=True),  # Avoid excessive memorization.
        "lambda": trial.suggest_float("lambda", 5, 50, log=True),  # Avoid excessive memorization.
        "alpha": trial.suggest_float("alpha", 1.0, 5.0, log=True),  # Avoid excessive memorization.
        "verbosity": 1,
        "tree_method": "hist",
        "n_jobs": -1,
        "seed": 42,
    }

    # Connects XGBoost's training process to Optuna's pruning mechanism:
    # during training XGBoost logs the eval-ndcg@3 score, and the callback
    # reports it to the trial at every boosting iteration.
    prune_callback = optuna.integration.XGBoostPruningCallback(trial, "eval-ndcg@3")

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=800,
        # Early stopping monitors the last entry of `evals`, i.e. the validation set.
        evals=[(dtrain, "train"), (dval, "eval")],
        callbacks=[prune_callback],
        early_stopping_rounds=30,
    )

    score = model.best_score  # `best_score` is only defined when early stopping is used.

    # Calculate hitrate@3 with the same trees that produced `best_score`:
    # the native Booster.predict would otherwise use every tree, including
    # the rounds trained after the best iteration.
    y_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    # Pass the label values, not the whole target DataFrame.
    hitrate_at_3_score = hitrate_at_3(y_val_np, y_pred, X_val["ranker_id"])
    print(f"Hitrate@3: {hitrate_at_3_score}")

    return score

In [None]:
# Timestamped study name so every run gets its own entry in the sqlite storage.
# Built with double outer quotes: reusing the same quote character inside an
# f-string is a SyntaxError before Python 3.12.
study_name = f"xgb_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

study = optuna.create_study(
    direction="maximize",
    storage=f"sqlite:///{os.path.join(root_dir, 'db.sqlite3')}",
    # Median pruning, with the first 10 steps of each trial exempt.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    study_name=study_name,
)
study.optimize(objective, n_trials=50)  # ⏱️ takes time

print("[BEST PARAMS]")
print(study.best_params)
print("Best NDCG@3:", study.best_value)

In [None]:
# Final training run with the best hyperparameters found by the study.
# `study.best_params` only holds the sampled values, so the settings that were
# fixed during tuning (including `tree_method`) are repeated here to train the
# final model under the same configuration that was tuned.
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    'tree_method': 'hist',
    'seed': 42,
    'n_jobs': -1,
    **study.best_params,
    # 'device': 'cuda'
}

# Early stopping monitors the last entry, i.e. the validation set.
watchlist = [(dtrain, "train"), (dval, "eval")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=100,
)

In [None]:
# Score the test rows with the trees up to the best early-stopping iteration;
# the native Booster.predict would otherwise also use the rounds trained after it.
pred_scores = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

In [None]:
# Only the two identifier columns are needed for the submission, so avoid
# loading the rest of the test file.
original = pl.read_parquet(
    os.path.join(root_dir, "kaggle", "test.parquet"),
    columns=["Id", "ranker_id"],
)

# Scores are attached by position, so there must be exactly one per test row.
if original.height != len(pred_scores):
    raise ValueError(
        f"got {len(pred_scores)} predictions for {original.height} test rows"
    )

original = (
    original
    .select(["Id", "ranker_id"])
    .with_columns(
        pl.Series("score", pred_scores)
    )
    .with_columns(
        # Rank within each group: the highest score gets rank 1, ties broken by row order.
        pl.col('score')
        .rank(method='ordinal', descending=True)
        .over('ranker_id')
        .alias('selected')
    )
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64)
    )
)

In [None]:
# Write the submission to a timestamped CSV, creating the output directory on
# first use so the write does not fail on a fresh checkout.
submission_dir = os.path.join(root_dir, "submission")
os.makedirs(submission_dir, exist_ok=True)
submission_name = f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
original.write_csv(os.path.join(submission_dir, submission_name))