#### Because of memory constraints, I wasn't able to run the XGBoost training

In [1]:
import sys
import os
from dotenv import load_dotenv

# Treat the parent of the working directory as the project root, make it
# importable, and load environment variables from its .env file.
root_dir = os.path.abspath("..")
dotenv_path = os.path.join(root_dir, ".env")
sys.path.append(root_dir)
load_dotenv(dotenv_path=dotenv_path)

True

In [2]:
import polars as pl
import xgboost as xgb
import numpy as np

In [None]:
# Load the train/validation splits stored under <root_dir>/data.
train = os.path.join(root_dir, "data", "train_split.parquet")
train_df = pl.read_parquet(train)

valid = os.path.join(root_dir, "data", "valid_split.parquet")
valid_df = pl.read_parquet(valid)

In [None]:
# Name of the label column: excluded from the features and used as the
# DMatrix label when the train/valid matrices are built.
TARGET_COL = "selected"

In [None]:
# Identifier and label columns that must not be used as model inputs.
exclude_cols = {TARGET_COL, "row_id", "ranker_id"}

# Every remaining column of the training frame is a feature.
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

In [None]:
# -----------------------------
# Helper: build DMatrix from a Polars DataFrame
# -----------------------------
def build_dmatrix(df: "pl.DataFrame", target_col: str) -> "xgb.DMatrix":
    """Build a grouped ``xgb.DMatrix`` for ranking from a Polars DataFrame.

    Every column other than ``ranker_id``, ``row_id`` and ``target_col`` is
    cast to float32 and used as a feature; ``target_col`` supplies the labels
    and ``ranker_id`` defines the query groups.

    Args:
        df: Frame holding the feature columns, ``target_col`` and
            ``ranker_id``.
        target_col: Name of the label column.

    Returns:
        A DMatrix with labels and per-group row counts set.

    Raises:
        ValueError: If ``target_col`` is not a column of ``df``.
    """
    # Check target
    if target_col not in df.columns:
        raise ValueError(f"{target_col} not found in DataFrame columns")

    # Group sizes are interpreted as consecutive runs of rows, so all rows of
    # a ranker_id must be contiguous and appear in the same order as the
    # sizes. Sort only when needed to avoid copying an already-ordered frame.
    if not df["ranker_id"].is_sorted():
        df = df.sort("ranker_id", maintain_order=True)

    non_feature_cols = {"ranker_id", "row_id", target_col}
    feature_cols = [c for c in df.columns if c not in non_feature_cols]

    # Cast features to float32 to save memory, then extract numpy arrays
    X = df.select([pl.col(c).cast(pl.Float32) for c in feature_cols]).to_numpy()
    y = df[target_col].to_numpy()

    # Rows per ranker_id, in row order (the frame is sorted by ranker_id, so
    # first-appearance order is exactly the order of the rows in X and y).
    group_sizes = (
        df.group_by("ranker_id", maintain_order=True)
          .len()["len"]
          .to_list()
    )

    dmat = xgb.DMatrix(X, label=y)
    dmat.set_group(group_sizes)
    print(f"[INFO] Built DMatrix with {X.shape[0]} rows and {X.shape[1]} features.")
    return dmat

In [None]:
# Build train/valid matrices, dropping each source frame as soon as its
# DMatrix exists so the training frame is not kept alive while the
# validation matrix is being built (lower peak memory).
import gc

dtrain = build_dmatrix(train_df, TARGET_COL)
del train_df
gc.collect()

dvalid = build_dmatrix(valid_df, TARGET_COL)
del valid_df
gc.collect()


In [None]:
# Booster configuration: pairwise ranking objective (rank:ndcg is an
# alternative) with NDCG@3 as the evaluation metric.
params = dict(
    objective="rank:pairwise",
    tree_method="hist",
    max_depth=6,
    learning_rate=0.1,
    eval_metric="ndcg@3",
)

print("[INFO] Training...")
evals_result = {}  # passed to xgb.train to collect the evaluation history
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    evals_result=evals_result,
    verbose_eval=5,
)

print(
    f"[RESULT] Best iteration: {bst.best_iteration}",
    f"[RESULT] Best validation score: {bst.best_score}",
    sep="\n",
)


# Save model: write the trained booster as JSON into the project root.
model_path = os.path.join(root_dir, "rank_xgb_model.json")
bst.save_model(fname=model_path)
print(f"[INFO] Model saved to {model_path}")