In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root is taken to be the parent of the current working directory,
# so this notebook must be launched from a direct subfolder of the repo.
root_dir = os.path.abspath("..")

# Make project packages importable. The membership guard keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Load environment variables from the project-level .env file. The call stays
# the last expression so its boolean result is still shown as the cell output.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [None]:
from src.data_insert import ParquetRankDataset
from src.model import RankerNN

from torch.utils.data import DataLoader
import torch
from datetime import datetime

In [None]:
# All input files live under <root_dir>/data.
_DATA_DIR = os.path.join(root_dir, "data")

# Parquet file passed to the dataset as `normalization_parquet`.
NORMALIZATION = os.path.join(_DATA_DIR, "train", "train_split_0.parquet")

# Parquet file(s) to score; kept as a list because the dataset takes `parquet_paths`.
test_file_paths = [
    os.path.join(_DATA_DIR, "test", "test.parquet"),
]

In [None]:
# Column roles handed to the dataset.
GROUP_COL = 'ranker_id'
LABEL_COL = 'dummy_selected'  # the inference loop discards the label it receives
EXCLUDED_COLS = ['row_id', 'ranker_id', 'selected']  # passed as exclude_feature_cols

# Dataset over the test parquet file(s), configured with the normalization parquet.
# NOTE(review): the meaning of max_rows=4096 is defined by ParquetRankDataset
# (presumably a per-batch row cap) -- confirm against src.data_insert.
test_dataset_stream = ParquetRankDataset(
    parquet_paths=test_file_paths,
    normalization_parquet=NORMALIZATION,
    exclude_feature_cols=EXCLUDED_COLS,
    label_col=LABEL_COL,
    group_col=GROUP_COL,
    max_rows=4096,
)

# batch_size=None turns off the DataLoader's automatic batching, so each item
# produced by the dataset is passed through as-is; shuffling is off.
test_loader = DataLoader(
    test_dataset_stream,
    batch_size=None,
    shuffle=False,
)

In [None]:
# --- Network architecture (passed to RankerNN and used in the checkpoint name) ---
HIDDEN_LAYERS = [512, 512, 256, 128]
HIDDEN_LAYERS_STR = "_".join(str(width) for width in HIDDEN_LAYERS)
DROP_RATE = 0.2

# --- Optimisation / early-stopping settings ---
# None of these are referenced by the cells visible in this notebook; they are
# kept so the configuration block stays unchanged for any code that reads them.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 3
VAL_INTERVAL = 500
PATIENCE = 3
BEST_VAL_LOSS = float("inf")
NO_IMPROVE_COUNT = 0

# --- Checkpoint location: <root_dir>/models/best_model_4_<layer sizes>.pt ---
# Original author's note on this file name: "First of 10 models".
MODEL_NAME = f"best_model_4_{HIDDEN_LAYERS_STR}.pt"
MODEL_OUTPUT = os.path.join(root_dir, "models", MODEL_NAME)

In [None]:
# Pick the compute device in priority order: Apple Metal (MPS), then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network with the feature count reported by the test dataset.
model = RankerNN(
    n_features=test_dataset_stream.feature_len,
    hidden_layers=HIDDEN_LAYERS,
    dropout=DROP_RATE,
)

# Restore the saved weights, mapping tensors onto the selected device.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True where the torch version supports it).
print("[INFO] Loading model from", MODEL_OUTPUT)
state_dict = torch.load(MODEL_OUTPUT, map_location=device)
model.load_state_dict(state_dict)

# Move the model to the device and switch it to evaluation mode.
model.to(device)
model.eval()

In [None]:
# Inference: score every batch from the test loader and collect the per-row
# scores together with the group tensor the loader yields alongside them.
all_scores = []
all_groups = []

with torch.no_grad():  # no gradients needed for scoring
    for X, _, G in test_loader:  # labels are dummy
        X = X.to(device)
        scores = model(X)  # (batch_size, 1)
        # Flatten each batch to 1-D before collecting. A bare .squeeze() on the
        # concatenated result collapses *every* size-1 dimension, so a result
        # holding a single row would become a 0-d tensor and break the
        # DataFrame construction downstream; reshape(-1) always yields 1-D.
        all_scores.append(scores.reshape(-1).cpu())
        all_groups.append(G)

# One flat score vector, in the order the loader produced the rows.
all_scores = torch.cat(all_scores)
all_groups = torch.cat(all_groups)

print("[INFO] Inference complete:", all_scores.shape, all_groups.shape)

In [None]:
import polars as pl

# POST-PROCESS TO SUBMISSION
# Predictions are matched back to their identifiers purely by row position,
# so each frame gets a positional "id" index to join on.
df_pred = (
    pl.DataFrame({
        "score": all_scores.numpy()
    })
    .with_row_index("id")
)

# Replace `ranker_id` with original column
original_ranker_id = (
    pl.read_parquet(test_file_paths[0])
    .select(["ranker_id"])
    .with_row_index("id")
)

# Replace `Id` with the original columns
original = pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"))
original = original.select("Id").with_row_index("id")

# The joins below are inner joins on the positional index: if the number of
# predictions differs from the number of rows in either parquet file, rows
# would be dropped silently and the submission would be incomplete.
# Fail loudly instead.
row_counts = {
    "predictions": df_pred.height,
    "ranker_id rows": original_ranker_id.height,
    "Id rows": original.height,
}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"Row count mismatch, cannot align by position: {row_counts}")

df_pred = (
    df_pred
    .join(original_ranker_id, on="id")
    .join(original, on="id")
)

In [None]:
# Rank rows inside each ranker_id group by descending score. The 'ordinal'
# method assigns distinct ranks, so tied scores never share a rank.
rank_within_group = (
    pl.col("score")
    .rank(method='ordinal', descending=True)
    .over("ranker_id")
)

# Keep only the submission columns and store Id and selected as 64-bit integers.
final_result = (
    df_pred
    .with_columns(rank_within_group.alias("selected"))
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64),
    )
)


In [None]:
# Write the submission to <root_dir>/submission/submission_<timestamp>.csv.
# The output directory is created first so the write does not fail on a
# fresh checkout where the folder does not exist yet.
submission_dir = os.path.join(root_dir, "submission")
os.makedirs(submission_dir, exist_ok=True)

# The timestamp in the file name keeps earlier submissions from being overwritten.
submission_path = os.path.join(
    submission_dir,
    f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
)
final_result.write_csv(submission_path)

In [None]:
# Sanity check: within every ranker_id, the number of distinct `selected`
# values must equal the number of rows (i.e. no rank is used twice).
rank_summary = final_result.group_by("ranker_id").agg(
    [
        pl.len().alias("total_rows"),
        pl.col("selected").n_unique().alias("unique_ranks"),
    ]
)

# Groups that violate the invariant; empty when everything is fine.
check = rank_summary.filter(pl.col("total_rows") != pl.col("unique_ranks"))

if check.height != 0:
    print("❌ Duplicate ranks found!")
    print(check)
else:
    print("✅ No duplicate ranks per ranker_id")