In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the parent of the current working directory; it is used below as the
# project root for imports, the .env file, and all data paths.
root_dir = os.path.abspath("..")
dotenv_path = os.path.join(root_dir, ".env")

# Make modules located under the project root importable from this notebook.
sys.path.append(root_dir)

# Kept as the last expression so the cell still displays load_dotenv's return value.
load_dotenv(dotenv_path=dotenv_path)

True

In [2]:
import polars as pl
from sklearn.model_selection import train_test_split


In [3]:
# Read the three v1 parquet files (features, targets, test features) from
# <root_dir>/data/v1 in one pass and unpack them in the same order.
train_df, train_target, test_df = (
    pl.read_parquet(os.path.join(root_dir, "data", "v1", filename))
    for filename in ("train.parquet", "train_target.parquet", "test.parquet")
)

In [4]:
# One row per distinct query id (ranker_id).
# maintain_order=True keeps ids in first-appearance order. Without it, unique()
# gives no ordering guarantee, so the array fed to the seeded train_test_split
# could differ between runs and the train/validation split would not be reproducible.
ranker_ids = train_df.select("ranker_id").unique(maintain_order=True)

In [5]:
# Split at the query level (80% train / 20% validation) so that every row of a
# given ranker_id ends up on the same side of the split.
ranker_id_train, ranker_id_valid = train_test_split(
    ranker_ids.to_numpy().ravel(),  # flatten the single-column frame to a 1-D array of ids
    random_state=42,
    shuffle=True,
    test_size=0.2,
)

In [6]:
# Boolean row masks marking which rows of train_df belong to each side of the split.
train_mask = train_df.get_column("ranker_id").is_in(ranker_id_train)
valid_mask = train_df.get_column("ranker_id").is_in(ranker_id_valid)

# Positional indices of the selected rows; nonzero() on a 1-D mask returns a
# one-element tuple, which is unpacked directly.
(train_idx,) = train_mask.to_numpy().nonzero()
(valid_idx,) = valid_mask.to_numpy().nonzero()

In [7]:
# Select feature rows for each side of the split by position.
X_train = train_df[train_idx]
X_val = train_df[valid_idx]

# Targets are selected with the same positional indices.
# NOTE(review): assumes train_target rows are positionally aligned with train_df — confirm.
y_train = train_target[train_idx]
y_val = train_target[valid_idx]

In [8]:
# Sanity check: features and targets must have the same number of rows per split.
print(
    f"X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"X_val: {X_val.shape}, y_val: {y_val.shape}",
    sep="\n",
)

X_train: (14510161, 83), y_train: (14510161, 2)
X_val: (3635211, 83), y_val: (3635211, 2)


In [9]:
# Dense feature matrices: every column except the query id is passed to the model.
X_train_np = X_train.drop("ranker_id").to_numpy()
X_val_np = X_val.drop("ranker_id").to_numpy()

# Label vectors taken from the 'selected' column of the target frames.
y_train_np = y_train.get_column("selected").to_numpy()
y_val_np = y_val.get_column("selected").to_numpy()

In [10]:
# Test-set feature matrix; the query id column is removed before conversion.
# NOTE(review): assumes test_df has the same remaining columns, in the same order,
# as the training frame — confirm.
X_test_np = test_df.drop(["ranker_id"]).to_numpy()

In [11]:
def _group_sizes_in_row_order(frame):
    """Return per-query row counts in the order the queries appear in ``frame``.

    ``DMatrix.set_group`` interprets the sizes positionally: the first ``sizes[0]``
    rows form the first query, the next ``sizes[1]`` rows the second, and so on.
    The sizes therefore have to follow the row order of the feature matrix.
    Sorting them by ``ranker_id`` only matches when the frame itself is sorted
    by ``ranker_id``.

    Raises:
        ValueError: if the rows of some ``ranker_id`` are not stored contiguously,
            in which case no sequence of sizes can describe the groups.
    """
    sizes = frame.group_by("ranker_id", maintain_order=True).len()

    # Count contiguous runs of ranker_id: one more than the number of positions
    # where the id differs from the previous row (the first row compares against
    # null and is not counted).
    ids = frame.get_column("ranker_id")
    n_runs = int((ids != ids.shift(1)).sum()) + 1 if frame.height > 0 else 0
    if n_runs != sizes.height:
        raise ValueError(
            "Rows of the same ranker_id are not contiguous; sort the frame by "
            "ranker_id before building the feature matrix and group sizes."
        )
    return sizes.get_column("len").to_numpy()


train_group_sizes = _group_sizes_in_row_order(X_train)
val_group_sizes = _group_sizes_in_row_order(X_val)

In [12]:
import xgboost as xgb

# Wrap the numpy matrices and labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train_np, label=y_train_np)
dval = xgb.DMatrix(data=X_val_np, label=y_val_np)

# Attach query group sizes so the ranking objective knows which rows are ranked
# against each other. The sizes are consumed in row order.
dtrain.set_group(train_group_sizes)
dval.set_group(val_group_sizes)

In [13]:
# Test matrix carries features only: no labels or groups are set for scoring.
dtest = xgb.DMatrix(data=X_test_np)

In [16]:
# Hyperparameters passed to the native XGBoost training API.
params = {
    # Learning task: pairwise ranking objective, monitored with NDCG@3.
    "objective": "rank:pairwise",
    "eval_metric": "ndcg@3",
    # Boosting and tree-shape settings.
    "learning_rate": 0.022641389657079056,
    "max_depth": 14,
    "min_child_weight": 2,
    # Row / column subsampling.
    "subsample": 0.8842234913702768,
    "colsample_bytree": 0.45840689146263086,
    # Regularisation terms.
    "gamma": 3.3084297630544888,
    "lambda": 6.952586917313028,
    "alpha": 0.6395254133055179,
    # Reproducibility and threading.
    "seed": 42,
    "n_jobs": -1,
    # Disabled GPU option, kept for reference:
    # "device": "cuda",
}

# Datasets evaluated after every boosting round, with the labels used in the log.
watchlist = [(dtrain, "train"), (dval, "eval")]

# Train for at most 2000 rounds with early_stopping_rounds=100.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=100,
)

[0]	train-ndcg@3:0.77440	eval-ndcg@3:0.77371
[1]	train-ndcg@3:0.80298	eval-ndcg@3:0.79673
[2]	train-ndcg@3:0.81431	eval-ndcg@3:0.80196
[3]	train-ndcg@3:0.82056	eval-ndcg@3:0.80358
[4]	train-ndcg@3:0.82436	eval-ndcg@3:0.80541
[5]	train-ndcg@3:0.82838	eval-ndcg@3:0.80687
[6]	train-ndcg@3:0.83139	eval-ndcg@3:0.80697
[7]	train-ndcg@3:0.83459	eval-ndcg@3:0.80871
[8]	train-ndcg@3:0.83548	eval-ndcg@3:0.80968
[9]	train-ndcg@3:0.83811	eval-ndcg@3:0.81058
[10]	train-ndcg@3:0.83881	eval-ndcg@3:0.81033
[11]	train-ndcg@3:0.84084	eval-ndcg@3:0.81080
[12]	train-ndcg@3:0.84162	eval-ndcg@3:0.81118
[13]	train-ndcg@3:0.84335	eval-ndcg@3:0.81169
[14]	train-ndcg@3:0.84414	eval-ndcg@3:0.81217
[15]	train-ndcg@3:0.84452	eval-ndcg@3:0.81252
[16]	train-ndcg@3:0.84481	eval-ndcg@3:0.81279
[17]	train-ndcg@3:0.84595	eval-ndcg@3:0.81318
[18]	train-ndcg@3:0.84762	eval-ndcg@3:0.81416
[19]	train-ndcg@3:0.84870	eval-ndcg@3:0.81420
[20]	train-ndcg@3:0.84981	eval-ndcg@3:0.81457
[21]	train-ndcg@3:0.85048	eval-ndcg@3:0.8147

In [None]:
# With the native API, Booster.predict() uses every boosted tree, including the
# rounds added after the best validation score was reached while early stopping
# was waiting. Restrict prediction to the best iteration when one was recorded.
best_iteration = getattr(model, "best_iteration", None)
if best_iteration is None:
    # No early-stopping information on the booster: score with the full model.
    pred_scores = model.predict(dtest)
else:
    # iteration_range is half-open, hence the +1 to include the best round itself.
    pred_scores = model.predict(dtest, iteration_range=(0, best_iteration + 1))

In [None]:
# Build the submission frame: within every ranker_id, rows are ranked by predicted
# score (highest score -> rank 1, ties broken by row order via method="ordinal").
# NOTE(review): assumes the rows of kaggle/test.parquet are in the same order as the
# rows of test_df that produced pred_scores — confirm.
original = (
    pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))
    .select("Id", "ranker_id")
    .with_columns(pl.Series(name="score", values=pred_scores))
    .with_columns(
        selected=pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
    )
    # Dropping the raw score leaves exactly Id, ranker_id, selected (in that order).
    .drop("score")
    .with_columns(pl.col("Id", "selected").cast(pl.Int64))
)

In [None]:
from datetime import datetime
# Write the submission under <root_dir>/submission with a timestamped file name
# so earlier submissions are never overwritten.
submission_dir = os.path.join(root_dir, "submission")
# Create the output directory on first use; write_csv fails if it does not exist.
os.makedirs(submission_dir, exist_ok=True)
submission_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
original.write_csv(os.path.join(submission_dir, f"submission_{submission_stamp}.csv"))