In [1]:
import sys
import os
from dotenv import load_dotenv

root_dir = os.path.abspath("..")
sys.path.append(root_dir)
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
import polars as pl
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRanker, early_stopping, log_evaluation
import optuna
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
train_df = pl.read_parquet(os.path.join(root_dir, "data", "v2", "train.parquet"))
train_target = pl.read_parquet(os.path.join(root_dir, "data", "v2", "train_target.parquet"))
test_df = pl.read_parquet(os.path.join(root_dir, "data", "v2", "test.parquet"))

In [6]:
train_ranker_ids = (
    pl.scan_parquet(os.path.join(root_dir, "kaggle", "train.parquet"))
    .select("ranker_id")
    .collect()
    .with_row_index("row_id")
)
test_ranker_ids = (
    pl.scan_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))
    .select("ranker_id")
    .collect()
    .with_row_index("row_id")
)

In [7]:
train_df = train_df.join(train_ranker_ids, on="row_id")
test_df = test_df.join(test_ranker_ids, on="row_id")

In [18]:
ranker_ids = train_df['ranker_id'].unique()

In [19]:
ranker_id_train, ranker_id_valid = train_test_split(
    ranker_ids.to_numpy().reshape(-1),
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [20]:
train_mask = train_df["ranker_id"].is_in(ranker_id_train)
valid_mask = train_df["ranker_id"].is_in(ranker_id_valid)

train_idx = train_mask.to_numpy().nonzero()[0]
valid_idx = valid_mask.to_numpy().nonzero()[0]

In [21]:
X_train, X_val = train_df[train_idx], train_df[valid_idx]
y_train, y_val = train_target[train_idx], train_target[valid_idx]

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")

X_train: (14556169, 68), y_train: (14556169, 2)
X_val: (3589203, 68), y_val: (3589203, 2)


In [23]:
X_train_np = X_train.drop("ranker_id", "row_id").to_numpy()
y_train_np = y_train['selected'].to_numpy()

X_val_np = X_val.drop("ranker_id", "row_id").to_numpy()
y_val_np = y_val['selected'].to_numpy()

X_test_np = test_df.drop("ranker_id", "row_id").to_numpy()

In [24]:
train_group_sizes = X_train.group_by("ranker_id").len().sort("ranker_id")["len"].to_numpy()
val_group_sizes = X_val.group_by("ranker_id").len().sort("ranker_id")["len"].to_numpy()
test_group_sizes = test_df.group_by("ranker_id").len().sort("ranker_id")["len"].to_numpy()

# Basic LightGBM

In [None]:
def hitrate_at_3(y_true, y_pred, groups):
    df = pl.DataFrame({
        'group': groups,
        'pred': y_pred,
        'true': y_true
    })
    
    return (
        df.filter(pl.col("group").count().over("group") > 10)
        .sort(["group", "pred"], descending=[False, True])
        .group_by("group", maintain_order=True)
        .head(3)
        .group_by("group")
        .agg(pl.col("true").max())
        .select(pl.col("true").mean())
        .item()
    )

def objective(trial):
    params = {
        "objective": "lambdarank",
        "metric": "ndcg",
        "ndcg_eval_at": [3],
        "bagging_freq": 1,
        "boosting_type": "gbdt",
        "device": "cpu",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.07, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256, step=16),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "verbose": -1,
        "n_estimators": 5000,
    }

    model = LGBMRanker(**params)

    model.fit(
        X_train_np,
        y_train_np,
        group=train_group_sizes,
        eval_set=[(X_val_np, y_val_np)],
        eval_group=[val_group_sizes],
        callbacks=[
            early_stopping(100, verbose=False),
            log_evaluation(period=10),
        ],
    )

    # Must return a scalar score Optuna will *maximize*
    score = list(model.best_score_.values())[0]['ndcg@3']
    return score

In [39]:
hitrate_at_3(y_val_np, val_preds, X_val['ranker_id'])

0.4369336816943688

In [32]:
val_preds = model.predict(X_val_np, num_iteration=model.best_iteration_)



In [29]:
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting_type": "gbdt",
    "n_estimators": 5000,
    "learning_rate": 0.05,
    "num_leaves": 63,
    "max_depth": 9,
    "ndcg_eval_at": [3],
    "bagging_freq": 1,
    "verbose": -1,
    "device": "cpu",
    "random_state": 42,
}

model = LGBMRanker(**params)

model.fit(
    X_train_np,
    y_train_np,
    group=train_group_sizes,
    eval_set=[
        (X_train_np, y_train_np),
        (X_val_np, y_val_np)
    ],
    eval_group=[
        train_group_sizes,
        val_group_sizes
    ],
    callbacks=[
        early_stopping(100, verbose=False),
        log_evaluation(period=10),
    ],
)



[10]	training's ndcg@3: 0.798699	valid_1's ndcg@3: 0.793576
[20]	training's ndcg@3: 0.80091	valid_1's ndcg@3: 0.796279
[30]	training's ndcg@3: 0.803293	valid_1's ndcg@3: 0.798637
[40]	training's ndcg@3: 0.805302	valid_1's ndcg@3: 0.799028
[50]	training's ndcg@3: 0.80728	valid_1's ndcg@3: 0.800324
[60]	training's ndcg@3: 0.809312	valid_1's ndcg@3: 0.801339
[70]	training's ndcg@3: 0.811536	valid_1's ndcg@3: 0.80198
[80]	training's ndcg@3: 0.813556	valid_1's ndcg@3: 0.802681
[90]	training's ndcg@3: 0.815195	valid_1's ndcg@3: 0.803524
[100]	training's ndcg@3: 0.816935	valid_1's ndcg@3: 0.804355
[110]	training's ndcg@3: 0.818667	valid_1's ndcg@3: 0.80484
[120]	training's ndcg@3: 0.820237	valid_1's ndcg@3: 0.805788
[130]	training's ndcg@3: 0.821785	valid_1's ndcg@3: 0.805888
[140]	training's ndcg@3: 0.823476	valid_1's ndcg@3: 0.806485
[150]	training's ndcg@3: 0.824762	valid_1's ndcg@3: 0.806684
[160]	training's ndcg@3: 0.826262	valid_1's ndcg@3: 0.807029
[170]	training's ndcg@3: 0.82774	vali

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,9
,learning_rate,0.05
,n_estimators,5000
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
study = optuna.create_study(
    direction="maximize", 
    storage=f"sqlite:///{os.path.join(root_dir, 'db.sqlite3')}", 
    study_name=f'lgbm_{datetime.now().strftime('%Y%m%d_%H%M%S')}'
)
study.optimize(objective, n_trials=50)  # ⏱️ takes time

print("[BEST PARAMS]")
print(study.best_params)
print("Best NDCG@3:", study.best_value)

In [None]:
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [3],
    "bagging_freq": 1,
    "boosting_type": "gbdt",
    "device": "cpu",
    "verbose": -1,
    "n_estimators": 5000,
    **study.best_params
}


model = LGBMRanker(**params)

model.fit(
    X_train_np,
    y_train_np,
    group=train_group_sizes,
    eval_set=[(X_val_np, y_val_np)],
    eval_group=[val_group_sizes],
    callbacks=[
        early_stopping(100, verbose=False),
        log_evaluation(period=10),
    ],
)

In [30]:
test_preds = model.predict(X_test_np, num_iteration=model.best_iteration_)



In [None]:
original = pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))
original = (
    original
    .select(["Id", "ranker_id"])
    .with_columns(
        pl.Series("score", test_preds)
    )
    .with_columns(
        pl.col('score')
        .rank(method='ordinal', descending=True)
        .over('ranker_id')
        .alias('selected')
    )
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64), 
        pl.col("selected").cast(pl.Int64)
    )
)

In [None]:
original.write_csv(os.path.join(root_dir, "submission", f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"))