In [1]:
import sys
import os
from dotenv import load_dotenv

# The notebook is expected to run from a sub-directory of the project, so the
# parent directory is treated as the project root.
root_dir = os.path.abspath(os.pardir)
dotenv_path = os.path.join(root_dir, ".env")

# Expose the project root on the import path for later imports.
sys.path.append(root_dir)

# Load variables from the project's .env file; kept as the last expression so
# the cell still displays load_dotenv's return value.
load_dotenv(dotenv_path)

True

In [2]:
import polars as pl
from sklearn.model_selection import train_test_split


In [3]:
# All model inputs for this notebook are read from <root>/data/v1.
_data_v1_dir = os.path.join(root_dir, "data", "v1")

# Training features, training targets and test features, one parquet file each.
train_df = pl.read_parquet(os.path.join(_data_v1_dir, "train.parquet"))
train_target = pl.read_parquet(os.path.join(_data_v1_dir, "train_target.parquet"))
test_df = pl.read_parquet(os.path.join(_data_v1_dir, "test.parquet"))

In [4]:
# Distinct query identifiers, used as the unit for the train/validation split.
# DataFrame.unique() does not guarantee row order by default, so the ids are
# sorted explicitly: a seeded shuffle downstream is only reproducible when its
# input order is the same on every run.
ranker_ids = train_df.select("ranker_id").unique().sort("ranker_id")

In [5]:
# Split at the ranker_id level (not the row level) so every row of a given
# query lands on the same side of the train/validation boundary.
_unique_ranker_ids = ranker_ids.to_numpy().reshape(-1)  # flatten the one-column frame to 1-D

# 80/20 shuffled split with a fixed seed.
ranker_id_train, ranker_id_valid = train_test_split(
    _unique_ranker_ids, test_size=0.2, random_state=42, shuffle=True
)

In [6]:
# Boolean row masks: True where the row's ranker_id belongs to the given split.
_ranker_id_column = train_df["ranker_id"]
train_mask = _ranker_id_column.is_in(ranker_id_train)
valid_mask = _ranker_id_column.is_in(ranker_id_valid)

# Positions of the True entries, i.e. integer row indices for each split.
train_idx, valid_idx = (
    mask.to_numpy().nonzero()[0] for mask in (train_mask, valid_mask)
)

In [7]:
# Select the rows of each split from the feature frame.
X_train = train_df[train_idx]
X_val = train_df[valid_idx]

# The same positional indices are applied to the target frame.
# NOTE(review): this assumes train_target is row-aligned with train_df — confirm.
y_train = train_target[train_idx]
y_val = train_target[valid_idx]

In [8]:
# Feature matrices: every column except the ranker_id grouping key.
X_train_np, X_val_np = (
    frame.drop("ranker_id").to_numpy() for frame in (X_train, X_val)
)

# Label vectors taken from the 'selected' column of the target frames.
y_train_np, y_val_np = (
    target["selected"].to_numpy() for target in (y_train, y_val)
)

In [9]:
# Test feature matrix, built the same way as the train/validation matrices:
# the ranker_id grouping key is removed before converting to numpy.
_test_features = test_df.drop("ranker_id")
X_test_np = _test_features.to_numpy()

In [10]:
def _group_sizes_in_row_order(frame, key="ranker_id"):
    """Return per-query row counts in the order the queries appear in ``frame``.

    The ``group`` argument of a LightGBM ranker is positional: the first
    ``sizes[0]`` rows are treated as one query, the next ``sizes[1]`` rows as
    the following query, and so on. The sizes must therefore follow the row
    order of the feature matrix (which keeps the frame's order), not the sorted
    order of the key.

    Parameters
    ----------
    frame : polars.DataFrame
        Frame whose row order matches the feature matrix passed to the model.
    key : str
        Name of the query identifier column.

    Returns
    -------
    numpy.ndarray
        Number of rows of each query, in order of first appearance.

    Raises
    ------
    ValueError
        If the rows of some query are not contiguous, in which case no list of
        sizes can describe the grouping correctly.
    """
    keys = frame[key]
    # A run starts wherever the key differs from the previous row. The first
    # row has no predecessor (the comparison is null) and counts as a start.
    # NOTE(review): assumes the key column contains no nulls — confirm.
    n_runs = int((keys != keys.shift(1)).fill_null(True).sum() or 0)
    if n_runs != keys.n_unique():
        raise ValueError(
            f"Rows sharing the same {key!r} are not contiguous; "
            f"sort the data by {key!r} before building group sizes."
        )
    return frame.group_by(key, maintain_order=True).len()[key and "len"].to_numpy()


# Query sizes aligned with the row order of X_train_np / X_val_np.
train_group_sizes = _group_sizes_in_row_order(X_train)
val_group_sizes = _group_sizes_in_row_order(X_val)

In [17]:
from lightgbm import LGBMRanker, early_stopping, log_evaluation

# Hyperparameters for the LambdaRank model, gathered in one mapping so they can
# be inspected or reused without re-reading the constructor call.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[3],  # the training log reports ndcg@3
    learning_rate=0.05,
    num_leaves=255,
    max_depth=-1,
    min_child_samples=100,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    boosting_type="gbdt",
    device="cpu",  # or "gpu"
    n_estimators=3000,  # upper bound; early stopping below may end training sooner
    verbose=-1,  # suppress warnings
)
ranker = LGBMRanker(**ranker_params)

# Stop once the validation score has not improved for 50 rounds, and log the
# validation metric every 10 iterations.
training_callbacks = [
    early_stopping(stopping_rounds=50),
    log_evaluation(period=10),
]

# Fit on the training queries and monitor the held-out queries. `group` and
# `eval_group` carry the per-query row counts for the respective matrices.
# Kept as the last expression so the fitted estimator is displayed.
ranker.fit(
    X_train_np,
    y_train_np,
    group=train_group_sizes,
    eval_set=[(X_val_np, y_val_np)],
    eval_group=[val_group_sizes],
    callbacks=training_callbacks,
)



Training until validation scores don't improve for 50 rounds
[10]	valid_0's ndcg@3: 0.800489
[20]	valid_0's ndcg@3: 0.8029
[30]	valid_0's ndcg@3: 0.805989
[40]	valid_0's ndcg@3: 0.806236
[50]	valid_0's ndcg@3: 0.807207
[60]	valid_0's ndcg@3: 0.808666
[70]	valid_0's ndcg@3: 0.809713
[80]	valid_0's ndcg@3: 0.81049
[90]	valid_0's ndcg@3: 0.811712
[100]	valid_0's ndcg@3: 0.812533
[110]	valid_0's ndcg@3: 0.813196
[120]	valid_0's ndcg@3: 0.814221
[130]	valid_0's ndcg@3: 0.815444
[140]	valid_0's ndcg@3: 0.815909
[150]	valid_0's ndcg@3: 0.816512
[160]	valid_0's ndcg@3: 0.817548
[170]	valid_0's ndcg@3: 0.817567
[180]	valid_0's ndcg@3: 0.817923
[190]	valid_0's ndcg@3: 0.818401
[200]	valid_0's ndcg@3: 0.81916
[210]	valid_0's ndcg@3: 0.819381
[220]	valid_0's ndcg@3: 0.819562
[230]	valid_0's ndcg@3: 0.820063
[240]	valid_0's ndcg@3: 0.820407
[250]	valid_0's ndcg@3: 0.820896
[260]	valid_0's ndcg@3: 0.821476
[270]	valid_0's ndcg@3: 0.821831
[280]	valid_0's ndcg@3: 0.822092
[290]	valid_0's ndcg@3: 0.82

0,1,2
,boosting_type,'gbdt'
,num_leaves,255
,max_depth,-1
,learning_rate,0.05
,n_estimators,3000
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [20]:
# Score the test rows using the iteration recorded as best during training,
# rather than whatever iteration training happened to end on.
_best_iteration = ranker.best_iteration_
test_preds = ranker.predict(X_test_np, num_iteration=_best_iteration)



In [22]:
# The raw Kaggle test file supplies the Id / ranker_id columns of the submission.
_kaggle_test_path = os.path.join(root_dir, "kaggle", "test.parquet")

# Rank of each row inside its ranker_id group: the highest score gets rank 1,
# and 'ordinal' assigns distinct ranks even when scores tie.
_rank_within_query = (
    pl.col("score")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .alias("selected")
)

# NOTE(review): attaching test_preds positionally assumes the rows of this file
# are in the same order as test_df — confirm.
original = (
    pl.read_parquet(_kaggle_test_path)
    .select(["Id", "ranker_id"])
    .with_columns(pl.Series("score", test_preds))
    .with_columns(_rank_within_query)
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64),
    )
)

In [23]:
from datetime import datetime
# Write the submission to <root>/submission/submission_<timestamp>.csv.
# The directory is created first so a fresh checkout does not fail at the very
# last step, after training and prediction have already been paid for.
_submission_dir = os.path.join(root_dir, "submission")
os.makedirs(_submission_dir, exist_ok=True)

# A timestamped file name keeps earlier submissions from being overwritten.
_submission_name = f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
original.write_csv(os.path.join(_submission_dir, _submission_name))