In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root is the parent of the notebook's working directory.
root_dir = os.path.abspath("..")

# Make project modules importable. Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Load environment variables from the project-level .env file. Kept as the
# last expression so load_dotenv's return value is shown as the cell output.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
import polars as pl
from sklearn.model_selection import train_test_split


In [3]:
# All model inputs come from the "v1" dataset directory under the project root.
data_v1_dir = os.path.join(root_dir, "data", "v1")

# Training features, training labels, and the test features to score.
train_df = pl.read_parquet(os.path.join(data_v1_dir, "train.parquet"))
train_target = pl.read_parquet(os.path.join(data_v1_dir, "train_target.parquet"))
test_df = pl.read_parquet(os.path.join(data_v1_dir, "test.parquet"))

In [4]:
# Distinct query ids, one row each. maintain_order=True keeps them in order of
# first appearance; without it polars does not guarantee any particular order,
# so the seeded train/validation split built from this frame could differ
# between runs even with a fixed random_state.
ranker_ids = train_df.select("ranker_id").unique(maintain_order=True)

In [5]:
# Split at the query (ranker_id) level so that every row of a given query
# lands entirely in either the training or the validation part.
all_ranker_ids = ranker_ids.to_numpy().ravel()  # flatten the (n, 1) array
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
ranker_id_train, ranker_id_valid = train_test_split(all_ranker_ids, **split_options)

In [6]:
# Boolean membership masks over the rows of train_df, one per split.
ranker_id_column = train_df["ranker_id"]
train_mask = ranker_id_column.is_in(ranker_id_train)
valid_mask = ranker_id_column.is_in(ranker_id_valid)

# Positions of the True entries; nonzero() on a 1-D array returns a 1-tuple.
(train_idx,) = train_mask.to_numpy().nonzero()
(valid_idx,) = valid_mask.to_numpy().nonzero()

In [7]:
# Take the same row positions from the feature frame and from the target
# frame. NOTE(review): this assumes train_target is row-aligned with
# train_df — confirm against the code that produced the parquet files.
X_train = train_df[train_idx]
X_val = train_df[valid_idx]
y_train = train_target[train_idx]
y_val = train_target[valid_idx]

In [8]:
# Feature matrices: every column except the query id.
train_features = X_train.drop("ranker_id")
val_features = X_val.drop("ranker_id")
X_train_np, X_val_np = train_features.to_numpy(), val_features.to_numpy()

# Label vectors: the 'selected' column of the target frames.
y_train_np, y_val_np = y_train['selected'].to_numpy(), y_val['selected'].to_numpy()

In [9]:
# Build the test matrix from the same columns, in the same order, as the
# training matrix (all train_df columns except "ranker_id"). Selecting by name
# keeps the column order aligned with the training matrix even if the two
# parquet files order their columns differently, and raises immediately if the
# test frame is missing a training feature instead of silently mis-scoring.
feature_columns = [name for name in train_df.columns if name != "ranker_id"]
X_test_np = test_df.select(feature_columns).to_numpy()

In [10]:
def _group_sizes_in_row_order(frame):
    """Return per-query row counts in the order the queries appear in `frame`.

    LightGBM interprets `group` as consecutive block sizes over the rows of
    the feature matrix, so the sizes must follow row order (not sorted id
    order) and each ranker_id must occupy one contiguous run of rows.

    Raises:
        ValueError: if the rows of some ranker_id are not contiguous, in which
            case no group-size array can describe the data correctly.
    """
    sizes = frame.group_by("ranker_id", maintain_order=True).len()["len"].to_numpy()

    # Count runs of identical consecutive ids; with contiguous queries the
    # number of runs equals the number of distinct ids.
    ids = frame["ranker_id"].to_numpy()
    n_runs = (int((ids[1:] != ids[:-1]).sum()) + 1) if len(ids) else 0
    if n_runs != len(sizes):
        raise ValueError(
            "Rows of the same ranker_id are not contiguous; sort the features "
            "and targets by ranker_id before computing group sizes."
        )
    return sizes


train_group_sizes = _group_sizes_in_row_order(X_train)
val_group_sizes = _group_sizes_in_row_order(X_val)

In [11]:
from lightgbm import LGBMRanker, early_stopping, log_evaluation

# Hyper-parameters for the LambdaRank model, gathered in one mapping so they
# can be inspected or tweaked separately from the estimator construction.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[3],        # report NDCG at cutoff 3
    learning_rate=0.03,
    num_leaves=512,
    max_depth=-1,
    min_child_samples=50,
    feature_fraction=0.7,    # use 70% of the features per tree
    bagging_fraction=0.7,    # use 70% of the rows per bagging round
    bagging_freq=1,          # re-bag at every iteration
    boosting_type="gbdt",
    device="cpu",            # or "gpu"
    n_estimators=5000,       # upper bound; early stopping usually ends sooner
    verbose=-1,
)
ranker = LGBMRanker(**ranker_params)

# Stop once the validation metric has not improved for 100 rounds, and log
# the validation metric every 10 rounds.
training_callbacks = [
    early_stopping(stopping_rounds=100),
    log_evaluation(period=10),
]

ranker.fit(
    X_train_np,
    y_train_np,
    group=train_group_sizes,            # per-query row counts for training
    eval_set=[(X_val_np, y_val_np)],
    eval_group=[val_group_sizes],       # per-query row counts for validation
    callbacks=training_callbacks,
)



Training until validation scores don't improve for 100 rounds
[10]	valid_0's ndcg@3: 0.801301
[20]	valid_0's ndcg@3: 0.802933
[30]	valid_0's ndcg@3: 0.80497
[40]	valid_0's ndcg@3: 0.806342
[50]	valid_0's ndcg@3: 0.806208
[60]	valid_0's ndcg@3: 0.807118
[70]	valid_0's ndcg@3: 0.808815
[80]	valid_0's ndcg@3: 0.809288
[90]	valid_0's ndcg@3: 0.810158
[100]	valid_0's ndcg@3: 0.810688
[110]	valid_0's ndcg@3: 0.811322
[120]	valid_0's ndcg@3: 0.811736
[130]	valid_0's ndcg@3: 0.812611
[140]	valid_0's ndcg@3: 0.812783
[150]	valid_0's ndcg@3: 0.812902
[160]	valid_0's ndcg@3: 0.81333
[170]	valid_0's ndcg@3: 0.814092
[180]	valid_0's ndcg@3: 0.81455
[190]	valid_0's ndcg@3: 0.815127
[200]	valid_0's ndcg@3: 0.815498
[210]	valid_0's ndcg@3: 0.816481
[220]	valid_0's ndcg@3: 0.816739
[230]	valid_0's ndcg@3: 0.817004
[240]	valid_0's ndcg@3: 0.817243
[250]	valid_0's ndcg@3: 0.817746
[260]	valid_0's ndcg@3: 0.818499
[270]	valid_0's ndcg@3: 0.818903
[280]	valid_0's ndcg@3: 0.818901
[290]	valid_0's ndcg@3: 0.

0,1,2
,boosting_type,'gbdt'
,num_leaves,512
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [12]:
# Score the test rows using the iteration selected by early stopping.
best_iteration = ranker.best_iteration_
test_preds = ranker.predict(X_test_np, num_iteration=best_iteration)



In [13]:
# Rank rows inside each query by descending score: the highest-scoring row
# gets rank 1, and ties are broken by row order ("ordinal").
rank_within_query = (
    pl.col('score')
    .rank(method='ordinal', descending=True)
    .over('ranker_id')
    .alias('selected')
)

# Attach the predictions to the original Kaggle test rows and keep only the
# submission columns, both id-like columns cast to Int64.
# NOTE(review): assumes the Kaggle test file has the same row order as
# test_df, since scores are attached positionally — confirm.
kaggle_test_path = os.path.join(root_dir, "kaggle", "test.parquet")
original = pl.read_parquet(kaggle_test_path)
original = (
    original
    .select(["Id", "ranker_id"])
    .with_columns(pl.Series("score", test_preds))
    .with_columns(rank_within_query)
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64),
    )
)

In [14]:
from datetime import datetime

# Write the submission to <root>/submission/submission_<timestamp>.csv.
# The directory is created first so a fresh checkout does not fail on the
# write; exist_ok=True makes this a no-op when it is already there.
submission_dir = os.path.join(root_dir, "submission")
os.makedirs(submission_dir, exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
original.write_csv(os.path.join(submission_dir, f"submission_{timestamp}.csv"))