In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root is taken to be the parent of the current working directory.
root_dir = os.path.abspath("..")
dotenv_path = os.path.join(root_dir, ".env")

# Make modules that live under the project root importable from this notebook.
sys.path.append(root_dir)

# Kept as the last expression so the cell displays load_dotenv's return value.
load_dotenv(dotenv_path=dotenv_path)

True

In [13]:
import polars as pl
from sklearn.model_selection import train_test_split


In [3]:
# Read the three parquet files stored under <root>/data/v1.
v1_dir = os.path.join(root_dir, "data", "v1")

train_df = pl.read_parquet(os.path.join(v1_dir, "train.parquet"))
train_target = pl.read_parquet(os.path.join(v1_dir, "train_target.parquet"))
test_df = pl.read_parquet(os.path.join(v1_dir, "test.parquet"))

In [7]:
# One row per distinct ranker_id (the query/group key used for the split below).
# maintain_order=True keeps the ids in first-appearance order: without it polars
# does not guarantee the row order of unique(), so the seeded train/validation
# split derived from this frame could differ from one run to the next.
ranker_ids = train_df.select("ranker_id").unique(maintain_order=True)

In [14]:
# Split on ranker_id values rather than on rows, so every row of a given
# ranker_id lands on the same side of the 80/20 split.
ranker_id_values = ranker_ids.to_numpy().reshape(-1)

ranker_id_train, ranker_id_valid = train_test_split(
    ranker_id_values, test_size=0.2, random_state=42, shuffle=True
)

In [21]:
# Boolean membership masks over the rows of train_df, one per split.
ranker_id_col = train_df["ranker_id"]
train_mask = ranker_id_col.is_in(ranker_id_train)
valid_mask = ranker_id_col.is_in(ranker_id_valid)

# Positions of the True entries; nonzero() on a 1-D array returns a 1-tuple.
(train_idx,) = train_mask.to_numpy().nonzero()
(valid_idx,) = valid_mask.to_numpy().nonzero()

In [25]:
# Take the same row positions from the features and from the targets.
# NOTE(review): assumes train_target is row-aligned with train_df - confirm.
X_train = train_df[train_idx]
y_train = train_target[train_idx]

X_val = train_df[valid_idx]
y_val = train_target[valid_idx]

In [28]:
# Sanity check: features and targets of each split must have the same row count.
train_shapes = f"X_train: {X_train.shape}, y_train: {y_train.shape}"
val_shapes = f"X_val: {X_val.shape}, y_val: {y_val.shape}"
print(train_shapes, val_shapes, sep="\n")

X_train: (14459340, 83), y_train: (14459340, 2)
X_val: (3686032, 83), y_val: (3686032, 2)


In [29]:
# Feature matrices: every column except the ranker_id group key.
X_train_np = X_train.drop("ranker_id").to_numpy()
X_val_np = X_val.drop("ranker_id").to_numpy()

# Label vectors: the 'selected' column of the target frames.
y_train_np = y_train.get_column("selected").to_numpy()
y_val_np = y_val.get_column("selected").to_numpy()

In [36]:
# Test features, with the ranker_id group key removed as for the train/val splits.
# NOTE(review): assumes test_df's remaining columns match the training feature
# columns in number and order - confirm.
test_features = test_df.drop("ranker_id")
X_test_np = test_features.to_numpy()

In [33]:
def _group_sizes_in_row_order(frame):
    """Return the per-ranker_id row counts in the order the groups occur in `frame`.

    XGBoost's `set_group` interprets the sizes positionally: the first size
    covers the first rows of the matrix, the next size the following rows, and
    so on. The sizes therefore have to follow the row order of the frame the
    matrix is built from; sorting them by ranker_id is only correct when the
    rows themselves are sorted by ranker_id.

    Args:
        frame: polars DataFrame with a "ranker_id" column.

    Returns:
        1-D NumPy array of group sizes in first-appearance order.

    Raises:
        ValueError: if the rows of some ranker_id are not contiguous, in which
            case no sequence of sizes can describe the groups correctly.
    """
    ids = frame["ranker_id"]
    sizes = frame.group_by("ranker_id", maintain_order=True).len()

    # Count positions where the id differs from the previous row. The first
    # row compares against null and is excluded by fill_null(False).
    boundaries = int((ids != ids.shift(1)).fill_null(False).sum())
    n_runs = boundaries + 1 if frame.height > 0 else 0
    if n_runs != sizes.height:
        raise ValueError(
            "rows sharing a ranker_id are not contiguous; sort the features and "
            "labels together by ranker_id before building the DMatrix"
        )
    return sizes["len"].to_numpy()


train_group_sizes = _group_sizes_in_row_order(X_train)
val_group_sizes = _group_sizes_in_row_order(X_val)

In [34]:
import xgboost as xgb

# Wrap each split's features and labels in a DMatrix.
dtrain = xgb.DMatrix(data=X_train_np, label=y_train_np)
dval = xgb.DMatrix(data=X_val_np, label=y_val_np)

# Attach the per-ranker_id group sizes that the ranking objective needs.
for matrix, group_sizes in ((dtrain, train_group_sizes), (dval, val_group_sizes)):
    matrix.set_group(group_sizes)

In [37]:
# Test matrix: features only, no labels or group sizes are attached.
dtest = xgb.DMatrix(data=X_test_np)

In [51]:
params = {
    # Learning task: NDCG ranking objective, scored on the top 3 positions.
    "objective": "rank:ndcg",
    "eval_metric": "ndcg@3",
    "tree_method": "hist",
    "verbosity": 1,
    # Step size and tree complexity.
    "eta": 0.05,
    "max_depth": 8,
    "min_child_weight": 10,
    # Row and column subsampling.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # L2 regularization.
    "lambda": 1.0,
}

# Both splits are scored every round; the validation split is listed last.
# Per XGBoost's documentation, early stopping monitors the last entry of evals.
watchlist = [(dtrain, "train"), (dval, "eval")]

# Train for at most 1000 rounds, with early stopping after 20 rounds
# without improvement.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=20,
)

[0]	train-ndcg@3:0.77745	eval-ndcg@3:0.78264
[1]	train-ndcg@3:0.78657	eval-ndcg@3:0.78957
[2]	train-ndcg@3:0.78926	eval-ndcg@3:0.79211
[3]	train-ndcg@3:0.79089	eval-ndcg@3:0.79401
[4]	train-ndcg@3:0.79441	eval-ndcg@3:0.79677
[5]	train-ndcg@3:0.79520	eval-ndcg@3:0.79760
[6]	train-ndcg@3:0.79567	eval-ndcg@3:0.79838
[7]	train-ndcg@3:0.79602	eval-ndcg@3:0.79880
[8]	train-ndcg@3:0.79649	eval-ndcg@3:0.79851
[9]	train-ndcg@3:0.79670	eval-ndcg@3:0.79867
[10]	train-ndcg@3:0.79764	eval-ndcg@3:0.79948
[11]	train-ndcg@3:0.79842	eval-ndcg@3:0.79963
[12]	train-ndcg@3:0.79873	eval-ndcg@3:0.80020
[13]	train-ndcg@3:0.79901	eval-ndcg@3:0.80048
[14]	train-ndcg@3:0.79935	eval-ndcg@3:0.80025
[15]	train-ndcg@3:0.79946	eval-ndcg@3:0.80064
[16]	train-ndcg@3:0.79957	eval-ndcg@3:0.80076
[17]	train-ndcg@3:0.79992	eval-ndcg@3:0.80094
[18]	train-ndcg@3:0.80042	eval-ndcg@3:0.80109
[19]	train-ndcg@3:0.80070	eval-ndcg@3:0.80119
[20]	train-ndcg@3:0.80083	eval-ndcg@3:0.80122
[21]	train-ndcg@3:0.80104	eval-ndcg@3:0.8010

In [55]:
# Score the test rows with the trees up to and including the best early-stopping
# round. The native Booster.predict otherwise uses every tree that was trained,
# including the rounds added after the validation metric stopped improving.
pred_scores = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

In [56]:
# Build the submission frame: Id, ranker_id and the within-query rank of each row.
# NOTE(review): assumes kaggle/test.parquet has the same row order as the frame
# the predictions were computed from - confirm.
kaggle_test = pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))

# Rank 1 = highest score within each ranker_id; 'ordinal' gives distinct ranks on ties.
rank_within_query = (
    pl.col("score").rank(method="ordinal", descending=True).over("ranker_id")
)

original = (
    kaggle_test
    .select(["Id", "ranker_id"])
    .with_columns(pl.Series("score", pred_scores))
    .with_columns(rank_within_query.alias("selected"))
    .select(["Id", "ranker_id", "selected"])
    .with_columns(pl.col(["Id", "selected"]).cast(pl.Int64))
)

In [57]:
from datetime import datetime
# Write the submission to <root>/submission/submission_<timestamp>.csv.
# The directory is created first so a missing folder does not fail the write
# after the whole pipeline has already run.
submission_dir = os.path.join(root_dir, "submission")
os.makedirs(submission_dir, exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
original.write_csv(os.path.join(submission_dir, f"submission_{timestamp}.csv"))