In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the repository root as the parent of the current working directory
# (assumes the notebook is started from a direct subfolder of the repo — TODO confirm).
root_dir = os.path.abspath("..")

# Make the project's `src` package importable. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Load environment variables from the repo-level .env file. Kept as the cell's
# last expression so the notebook still displays load_dotenv's return value.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
from src.data_insert import ParquetRankDataset
from src.model import RankerNN
from src.metric import pairwise_loss

from torch.utils.data import DataLoader
import torch
from datetime import datetime

In [3]:
# Both inputs live under <root_dir>/data.
data_dir = os.path.join(root_dir, "data")

# Training split passed to the dataset as its `normalization_parquet` source.
NORMALIZATION = os.path.join(data_dir, "train", "train_split_0.parquet")

# Parquet file(s) to score; kept as a list because the dataset takes `parquet_paths`.
test_file_paths = [
    os.path.join(data_dir, "test", "test.parquet"),
]

In [6]:
import polars as pl

# Lazy handles on the test file (`d`) and the normalization file (`d2`) for
# ad-hoc inspection; nothing is materialized until a query is collected.
d, d2 = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (test_file_paths[0], NORMALIZATION)
)

In [None]:
# Column roles handed to the dataset.
GROUP_COL = 'ranker_id'        # grouping key for ranking
LABEL_COL = 'dummy_selected'   # placeholder label column for the test set
EXCLUDED_COLS = ['row_id', GROUP_COL, 'selected']  # not used as model features

# Dataset over the test parquet file(s), normalized with the statistics file.
# NOTE(review): the exact meaning of `max_rows` is defined by ParquetRankDataset
# (presumably a per-batch row cap) — confirm against src/data_insert.py.
dataset_options = dict(
    parquet_paths=test_file_paths,
    exclude_feature_cols=EXCLUDED_COLS,
    label_col=LABEL_COL,
    group_col=GROUP_COL,
    max_rows=4096,
    normalization_parquet=NORMALIZATION,
)
test_dataset_stream = ParquetRankDataset(**dataset_options)

  self.feature_cols: list[str] = [c for c in first_schema.columns if c not in self.exclude_feature_cols]


[INFO] Normalization stats loaded from /Users/goonzard/Developer/data-science-09-kaggle-airplane/data/train/train_split_0.parquet
[INFO] Example mean/std: [('emb_0', 0.044884927570819855), ('emb_1', -0.026697352528572083), ('emb_2', -0.10396946221590042), ('emb_3', 0.04172823578119278), ('emb_4', -0.01801719330251217)]


In [None]:
# Network architecture used to rebuild the model before loading its weights.
HIDDEN_LAYERS = [512, 256, 128]
HIDDEN_LAYERS_STR = "_".join(map(str, HIDDEN_LAYERS))

DROP_RATE = 0.2
LEARNING_RATE = 1e-3
TRAIN_DATE = datetime.now().strftime("%Y%m%dT%H%M%S")

# NOTE(review): the settings below look like training-loop leftovers; none of
# the cells visible in this notebook read them — confirm before removing.
VAL_INTERVAL = 500
PATIENCE = 5
BEST_VAL_LOSS = float("inf")
NO_IMPROVE_COUNT = 0

NUM_EPOCHS = 5

# Checkpoint file name, derived from the hyperparameters above so the file that
# gets loaded always corresponds to the architecture that gets built. With the
# current values this evaluates to "best_model_2_512_256_128_0.2_0.001.pt".
# NOTE(review): the literal "2" is kept from the original name; its meaning
# (run/version index?) is not visible here.
SAVED_MODEL_NAME = f"best_model_2_{HIDDEN_LAYERS_STR}_{DROP_RATE}_{LEARNING_RATE}.pt"
# Full path of the checkpoint inside <root_dir>/models.
models_dir = os.path.join(root_dir, "models")
SAVED_MODEL_FILE = os.path.join(models_dir, SAVED_MODEL_NAME)

In [14]:
# Rebuild the network with an input width taken from the dataset's feature
# count and the layer sizes / dropout from the config cell.
n_features = test_dataset_stream.feature_len
model = RankerNN(n_features=n_features, hidden_layers=HIDDEN_LAYERS, dropout=DROP_RATE)

In [15]:
# batch_size=None turns off DataLoader's automatic batching, so whatever the
# dataset yields per iteration is treated as a ready-made batch.
test_loader = DataLoader(test_dataset_stream, batch_size=None, shuffle=False)

# Device preference: Apple Silicon GPU (Metal) first, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the best model so far.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids executing arbitrary pickled code if the
# checkpoint file were ever swapped or corrupted.
state_dict = torch.load(SAVED_MODEL_FILE, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)

# Switch to inference mode (disables dropout, uses BatchNorm running stats).
# Kept as the last expression so the notebook displays the model summary.
model.eval()

RankerNN(
  (net): Sequential(
    (0): Linear(in_features=418, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): GELU(approximate='none')
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=256, out_features=128, bias=True)
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): GELU(approximate='none')
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [None]:
# Inference
# Per-batch results are collected in their own lists so that `all_scores` and
# `all_groups` only ever refer to the final concatenated tensors.
score_batches = []
group_batches = []

with torch.no_grad():  # no gradients needed for scoring
    for X, _, G in test_loader:  # labels are dummy
        X = X.to(device)
        scores = model(X)  # (batch_size, 1)
        # Move scores back to the CPU so the accumulated results do not hold
        # device memory.
        score_batches.append(scores.cpu())
        group_batches.append(G)

# Flatten to a 1-D vector of scores. reshape(-1) is used instead of a bare
# squeeze() because squeeze() would collapse a single-row result to a 0-d
# tensor, which cannot be turned into a one-row column downstream.
all_scores = torch.cat(score_batches).reshape(-1)
all_groups = torch.cat(group_batches)

print("[INFO] Inference complete:", all_scores.shape, all_groups.shape)

In [None]:
# POST-PROCESS TO SUBMISSION
# Scores are matched to their identifiers purely by row position ("id" below),
# so the score vector must line up one-to-one with both source files.
df_pred = (
    pl.DataFrame({
        "score": all_scores.numpy()
    })
    .with_row_index("id")
)

# Original `ranker_id` values, read from the same file that was scored.
# Only the needed column is loaded.
original_ranker_id = (
    pl.read_parquet(test_file_paths[0], columns=["ranker_id"])
    .with_row_index("id")
)

# Original `Id` values from the Kaggle test file.
original = (
    pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"), columns=["Id"])
    .with_row_index("id")
)

# The joins below are inner joins: without these checks a row-count mismatch
# would silently drop rows instead of failing.
n_scores = df_pred.height
assert original_ranker_id.height == n_scores, (
    f"row count mismatch: {n_scores} scores vs "
    f"{original_ranker_id.height} rows in {test_file_paths[0]}"
)
assert original.height == n_scores, (
    f"row count mismatch: {n_scores} scores vs "
    f"{original.height} rows in the kaggle test file"
)

df_pred = (
    df_pred
    .join(original_ranker_id, on="id")
    .join(original, on="id")
    # Join output order is not guaranteed across Polars versions; restore the
    # original row order explicitly so later order-sensitive steps (ordinal
    # tie-breaking, CSV row order) are deterministic.
    .sort("id")
)

In [56]:
# Rank of each row inside its `ranker_id` group: highest score gets 1, and
# 'ordinal' gives every row a distinct rank (ties broken by row order).
rank_within_group = (
    pl.col("score")
    .rank(method='ordinal', descending=True)
    .over("ranker_id")
)

# Keep only the submission columns and store Id / selected as 64-bit integers.
final_result = (
    df_pred
    .with_columns(rank_within_group.alias("selected"))
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64),
    )
)


In [57]:
from datetime import datetime
# Write the submission to <root_dir>/submission/ under a timestamped file name
# so each run gets its own file (to one-second resolution).
submission_dir = os.path.join(root_dir, "submission")

# Create the output folder on first use; a missing folder would otherwise make
# the write fail after the whole inference run has already completed.
os.makedirs(submission_dir, exist_ok=True)

submission_path = os.path.join(
    submission_dir,
    f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
)
final_result.write_csv(submission_path)

In [62]:
# Sanity check: within every `ranker_id` group the number of distinct ranks
# must equal the number of rows, i.e. no rank is assigned twice.
rank_counts = final_result.group_by("ranker_id").agg(
    pl.len().alias("total_rows"),
    pl.col("selected").n_unique().alias("unique_ranks"),
)

# Groups that violate the invariant (empty when everything is fine).
check = rank_counts.filter(pl.col("total_rows") != pl.col("unique_ranks"))

if check.height > 0:
    print("❌ Duplicate ranks found!")
    print(check)
else:
    print("✅ No duplicate ranks per ranker_id")

✅ No duplicate ranks per ranker_id
