In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the project root as the parent of the current working directory so
# that the project's packages and its .env file can be located from here.
root_dir = os.path.abspath("..")

# Only register the root once: re-running this cell would otherwise append the
# same entry to sys.path on every execution.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Load environment variables from the project's .env file. The call is kept as
# the last expression so its return value is displayed as the cell output.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
from src.data_insert import ParquetRankDataset
from src.model import SetRank

import polars as pl
from torch.utils.data import DataLoader
import torch
from datetime import datetime

In [3]:
# All input files live under <root>/data.
_data_dir = os.path.join(root_dir, "data")

# Training split handed to the dataset as its normalization source.
NORMALIZATION = os.path.join(_data_dir, "train", "train_split_0.parquet")

# Parquet files that make up the test set (currently a single file).
test_file_paths = [
    os.path.join(_data_dir, "test", "test.parquet"),
]

In [4]:
# Column roles handed to the ranking dataset.
GROUP_COL = 'ranker_id'
LABEL_COL = 'dummy_selected'

# Columns that must not be used as model features.
EXCLUDED_COLS = ['row_id', 'ranker_id', 'selected']

# Gather the constructor arguments in one place so they are easy to inspect.
_dataset_options = dict(
    parquet_paths=test_file_paths,
    exclude_feature_cols=EXCLUDED_COLS,
    label_col=LABEL_COL,
    group_col=GROUP_COL,
    max_rows=4096,  # NOTE(review): exact meaning is defined by ParquetRankDataset — confirm
    normalization_parquet=NORMALIZATION,
)

# Dataset over the test parquet files, consumed by the inference DataLoader.
test_dataset_stream = ParquetRankDataset(**_dataset_options)

  self.feature_cols: list[str] = [c for c in first_schema.columns if c not in self.exclude_feature_cols]


[INFO] Normalization stats loaded from /Users/goonzard/Developer/data-science-09-kaggle-airplane/data/train/train_split_0.parquet
[INFO] Example mean/std: [('emb_0', 0.044884927570819855), ('emb_1', -0.026697352528572083), ('emb_2', -0.10396946221590042), ('emb_3', 0.04172823578119278), ('emb_4', -0.01801719330251217)]


In [5]:
# Optimizer / training-loop settings. None of the inference cells visible in
# this notebook reads them.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 3

# Validation cadence and early-stopping bookkeeping.
VAL_INTERVAL = 40000
PATIENCE = 3
BEST_VAL_LOSS = float("inf")
NO_IMPROVE_COUNT = 0

# Checkpoints that make up the ensemble: best_model_3_SETRANK_1.pt through
# best_model_3_SETRANK_5.pt inside <root>/models.
SAVED_MODEL_FILE = [
    os.path.join(root_dir, "models", f"best_model_3_SETRANK_{fold_no}.pt")
    for fold_no in range(1, 6)
]

In [6]:
# Wrap the test dataset in a DataLoader without shuffling. With batch_size=None
# PyTorch's automatic batching is disabled, so every item the dataset yields is
# passed through unchanged.
test_loader = DataLoader(test_dataset_stream, batch_size=None, shuffle=False)

# Pick the compute backend in priority order: Apple Silicon GPU (Metal),
# then CUDA, then plain CPU.
if torch.backends.mps.is_available():
    _device_name = "mps"
elif torch.cuda.is_available():
    _device_name = "cuda"
else:
    _device_name = "cpu"

device = torch.device(_device_name)

In [7]:
def build_model():
    """Create a fresh SetRank network for loading a saved checkpoint.

    The input width is taken from ``test_dataset_stream.feature_len``; the
    remaining hyperparameters are fixed (hidden_dim=128, num_heads=4,
    num_layers=2).

    Returns:
        A newly constructed ``SetRank`` instance.
    """
    return SetRank(
        input_dim=test_dataset_stream.feature_len,
        hidden_dim=128,
        num_heads=4,
        num_layers=2,
    )

In [9]:
# Ensemble inference: score the whole test stream with every saved checkpoint
# and average the per-row scores across checkpoints.
if not SAVED_MODEL_FILE:
    raise ValueError("SAVED_MODEL_FILE is empty; there is nothing to ensemble")

ensemble_scores = None  # running sum of scores, turned into a mean after the loop
all_groups = None       # group ids in stream order, taken from the first checkpoint

for model_file in SAVED_MODEL_FILE:
    print(f"[INFO] Loading model {model_file}")
    model = build_model()
    # NOTE(review): torch.load unpickles the file, so only trusted checkpoints
    # should be loaded here (consider weights_only=True if the installed torch
    # version supports it).
    state_dict = torch.load(model_file, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    # Collect one tensor per batch and concatenate once at the end. Building a
    # Python list with one element per row and converting it with torch.tensor
    # is extremely slow for millions of rows.
    score_chunks = []
    group_chunks = []

    with torch.no_grad():
        for X, _, G in test_loader:
            X = X.to(device)
            scores = model(X)
            # reshape(-1) flattens each batch to 1-D, so a batch that comes back
            # as a 0-d tensor is handled as well.
            score_chunks.append(scores.cpu().reshape(-1))
            group_chunks.append(G.cpu().reshape(-1))

    if not score_chunks:
        raise RuntimeError("test_loader produced no batches; nothing to score")

    fold_scores = torch.cat(score_chunks)
    fold_groups = torch.cat(group_chunks)

    if ensemble_scores is None:
        ensemble_scores = fold_scores
        all_groups = fold_groups
    else:
        # Summing scores is only meaningful when every checkpoint saw the rows
        # in the same order, so verify the group ids line up.
        if not torch.equal(fold_groups, all_groups):
            raise RuntimeError(
                f"Group ids produced for {model_file} do not match those of the first model"
            )
        ensemble_scores += fold_scores  # Accumulate scores

ensemble_scores = ensemble_scores / len(SAVED_MODEL_FILE)
print("[INFO] Ensemble inference completed", ensemble_scores.shape, all_groups.shape)

[INFO] Loading model /Users/goonzard/Developer/data-science-09-kaggle-airplane/models/best_model_3_SETRANK_1.pt
[INFO] Loading model /Users/goonzard/Developer/data-science-09-kaggle-airplane/models/best_model_3_SETRANK_2.pt
[INFO] Loading model /Users/goonzard/Developer/data-science-09-kaggle-airplane/models/best_model_3_SETRANK_3.pt
[INFO] Loading model /Users/goonzard/Developer/data-science-09-kaggle-airplane/models/best_model_3_SETRANK_4.pt
[INFO] Loading model /Users/goonzard/Developer/data-science-09-kaggle-airplane/models/best_model_3_SETRANK_5.pt
[INFO] Ensemble inference completed torch.Size([6897776]) torch.Size([6897776])


In [11]:
# POST-PROCESS TO SUBMISSION
# The averaged scores carry no identifiers, so a positional row index ("id") is
# attached to the scores and to both identifier sources, then used as join key.
df_pred = (
    pl.DataFrame({
        "score": ensemble_scores.numpy()
    })
    .with_row_index("id")
)

# Replace `ranker_id` with original column
# (only the needed column is read from the parquet file)
original_ranker_id = (
    pl.read_parquet(test_file_paths[0], columns=["ranker_id"])
    .with_row_index("id")
)

# Replace `Id` with the original columns
original = (
    pl.read_parquet(os.path.join(root_dir, "kaggle", "test.parquet"), columns=["Id"])
    .with_row_index("id")
)

# The joins below are inner joins on the positional index: if the three sources
# had different lengths, rows would be dropped silently and the submission would
# be incomplete. Fail loudly instead.
n_scored = df_pred.height
if original_ranker_id.height != n_scored or original.height != n_scored:
    raise ValueError(
        f"Row count mismatch: {n_scored} scores, "
        f"{original_ranker_id.height} ranker_id rows, {original.height} Id rows"
    )

df_pred = (
    df_pred
    .join(original_ranker_id, on="id")
    .join(original, on="id")
)

# Within each ranker_id group, rank rows by descending score: rank 1 is the
# highest score, and method='ordinal' gives every row a distinct rank.
final_result = (
    df_pred
    .with_columns(
        pl.col("score")
        .rank(method='ordinal', descending=True)
        .over("ranker_id")
        .alias("selected")
    )
    .select(["Id", "ranker_id", "selected"])
    .with_columns(
        pl.col("Id").cast(pl.Int64),
        pl.col("selected").cast(pl.Int64)
    )
)

# Make sure the output directory exists before writing the timestamped file.
submission_dir = os.path.join(root_dir, "submission")
os.makedirs(submission_dir, exist_ok=True)

final_result.write_csv(
    os.path.join(submission_dir, f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
)