# DA5401-2025-Data-Challenge

## By : B.S.Tejas

In [None]:
import os, json, random
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

from sentence_transformers import SentenceTransformer

# ------------------ SEED ------------------
RND = 42
random.seed(RND); np.random.seed(RND)



In [5]:
# ------------------ HUGGINGFACE LOGIN ------------------
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
except Exception:
    print("Kaggle Secrets not available. Checking HF_TOKEN env var...")
    HF_TOKEN = os.environ.get("HF_TOKEN")

if HF_TOKEN:
    from huggingface_hub import login
    login(token=HF_TOKEN, add_to_git_credential=False)
    print("Successfully logged into Hugging Face.")
else:
    print("HF_TOKEN not found. If Gemma is gated, this may fail.")

Successfully logged into Hugging Face.


## Dataset Loadin & Feature Engineering

In [6]:
# ------------------ CONFIG ------------------
KAGGLE_DIR = "/kaggle/input/da5401-dataset"
TRAIN_JSON = os.path.join(KAGGLE_DIR, "train_data.json")
TEST_JSON  = os.path.join(KAGGLE_DIR, "test_data.json")
METRIC_NAMES = os.path.join(KAGGLE_DIR, "metric_names.json")
METRIC_EMBS  = os.path.join(KAGGLE_DIR, "metric_name_embeddings.npy")
SUBMISSION_FILE = os.path.join(KAGGLE_DIR, "sample_submission.csv")

OUT_TRAIN = "/kaggle/working/train_oof_meta_learn.csv"
OUT_AUG   = "/kaggle/working/train_aug_meta_learn.csv"
OUT_COMB  = "/kaggle/working/combined_oof_meta_learn.csv"
OUT_SUB   = "/kaggle/working/submission_da5401_meta_learn_bias.csv"

# tuned hyperparams
NEG_PER_SAMPLE   = 1
SVD_DIM          = 145
TFIDF_MAX_FEAT   = 20000
N_SPLITS         = 5
RIDGE_ALPHA      = 1.0          # base Ridge
RF_N_ESTIMATORS  = 1000         # tuned RF size
RF_MAX_DEPTH     = None         # tuned (None = unlimited)
RF_MIN_LEAF      = 1            # tuned
META_ALPHA       = 1          # tuned meta-learner Ridge
BIAS_ALPHA       = 3.0          # tuned bias smoothing

# ------------------ SANITY CHECK ------------------
for p in [TRAIN_JSON, METRIC_NAMES, METRIC_EMBS, TEST_JSON, SUBMISSION_FILE]:
    if not os.path.exists(p):
        raise FileNotFoundError(f"Required file not found: {p}")

# ------------------ LOAD TRAIN DATA ------------------
print("Loading raw json files...")
with open(TRAIN_JSON, "r", encoding="utf-8") as f:
    train_raw = json.load(f)
with open(METRIC_NAMES, "r", encoding="utf-8") as f:
    metric_names = json.load(f)

metric_map = {name: i for i, name in enumerate(metric_names)}
metric_embs = np.load(METRIC_EMBS)   # (145, 768)
print("Metric embeddings shape:", metric_embs.shape)

rows = []
for rec in train_raw:
    rows.append({
        "metric_name": rec.get("metric_name"),
        "metric_idx": metric_map.get(rec.get("metric_name"), -1),
        "prompt": (rec.get("prompt","") or ""),
        "system_prompt": (rec.get("system_prompt","") or ""),
        "response": (rec.get("response","") or ""),
        "score": float(rec.get("score", 0.0))
    })
df = pd.DataFrame(rows)
print("Train rows:", len(df))

# ------------------ TEXT CONCAT ------------------
combined_texts = (df["prompt"].fillna("") + " [SEP] " +
                  df["system_prompt"].fillna("") + " [SEP] " +
                  df["response"].fillna("")).tolist()

Loading raw json files...
Metric embeddings shape: (145, 768)
Train rows: 5000


In [7]:

# ------------------ TF-IDF + SVD ------------------
print("\n=== Building TF-IDF & SVD text features ===")
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEAT, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(combined_texts)
print("TF-IDF shape:", X_tfidf.shape)

svd = TruncatedSVD(n_components=SVD_DIM, random_state=RND)
X_text_svd = svd.fit_transform(X_tfidf)
print("Text SVD shape:", X_text_svd.shape)

# ------------------ METRIC SVD ------------------
print("Reducing metric embeddings via SVD...")
svd_metric = TruncatedSVD(n_components=SVD_DIM, random_state=RND)
metric_embs_reduced = svd_metric.fit_transform(metric_embs)
print("Metric reduced shape:", metric_embs_reduced.shape)

metric_vecs = np.vstack([
    metric_embs_reduced[idx] if (0 <= idx < metric_embs_reduced.shape[0]) else np.zeros(SVD_DIM)
    for idx in df["metric_idx"].values
])


=== Building TF-IDF & SVD text features ===
TF-IDF shape: (5000, 20000)
Text SVD shape: (5000, 145)
Reducing metric embeddings via SVD...
Metric reduced shape: (145, 145)


In [8]:

# ------------------ PAIRWISE FEATS ------------------
def pairwise_feats(A, B):
    assert A.shape == B.shape, f"pairwise_feats: shape mismatch {A.shape} vs {B.shape}"
    dot = np.sum(A * B, axis=1)
    an  = np.linalg.norm(A, axis=1)
    bn  = np.linalg.norm(B, axis=1)
    cos = dot / (an * bn + 1e-9)
    l1  = np.sum(np.abs(A - B), axis=1)
    l2  = np.sqrt(np.sum((A - B)**2, axis=1))
    prod = A * B
    return {
        "dot": dot, "cos": cos,
        "l1": l1, "l2": l2,
        "prod_mean": prod.mean(axis=1),
        "prod_std":  prod.std(axis=1),
        "ad_mean": np.abs(A - B).mean(axis=1),
        "ad_std":  np.abs(A - B).std(axis=1)
    }

print("\n=== metric-vs-text pairwise features (SVD space) ===")
metric_pairs = pairwise_feats(metric_vecs, X_text_svd)
feat_df = pd.DataFrame(metric_pairs)



=== metric-vs-text pairwise features (SVD space) ===


In [9]:
# ------------------ PROTOTYPES IN TEXT SVD SPACE ------------------
print("Computing prototypes per metric in text SVD space...")
proto_by_metric = {}
for mid, g in df.groupby("metric_idx"):
    idxs = g.index.values
    if len(idxs) > 0:
        proto_by_metric[int(mid)] = X_text_svd[idxs].mean(axis=0)
    else:
        proto_by_metric[int(mid)] = np.zeros(SVD_DIM)

n_metrics = metric_embs_reduced.shape[0]
proto_matrix = np.vstack([proto_by_metric.get(i, np.zeros(SVD_DIM)) for i in range(n_metrics)])
proto_mat_self = np.vstack([proto_by_metric.get(int(mid), np.zeros(SVD_DIM))
                            for mid in df["metric_idx"].values])

proto_pairs = pairwise_feats(proto_mat_self, X_text_svd)
for k, v in proto_pairs.items():
    feat_df[f"{k}_pt"] = v

# ------------------ PROTOTYPE MARGIN ------------------
print("Prototype margin (cosine to all prototypes) ...")
P = proto_matrix
P_norm = np.linalg.norm(P, axis=1) + 1e-9
T = X_text_svd
T_norm = np.linalg.norm(T, axis=1) + 1e-9
cos_mat = (T @ P.T) / (T_norm[:, None] * P_norm[None, :])

metric_idx_arr = df["metric_idx"].values.astype(int)
row_idx = np.arange(len(df))
self_cos = cos_mat[row_idx, metric_idx_arr]
cos_mat_wo = cos_mat.copy()
cos_mat_wo[row_idx, metric_idx_arr] = -1e9
best_other = cos_mat_wo.max(axis=1)
margin = self_cos - best_other

feat_df["cos_pt_self_full"] = self_cos
feat_df["cos_pt_other_max"] = best_other
feat_df["cos_pt_margin"]    = margin

Computing prototypes per metric in text SVD space...
Prototype margin (cosine to all prototypes) ...


In [10]:
# ------------------ GEMMA TEXT ENCODING ------------------
print("\n=== Encoding with Gemma (google/embeddinggemma-300m) for text ===")
gemma_model = SentenceTransformer("google/embeddinggemma-300m")
X_text_gemma = gemma_model.encode(
    combined_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)
print("Gemma embeddings shape:", X_text_gemma.shape)

G = metric_embs              # (145, 768) in same space as Gemma
G_norm = np.linalg.norm(G, axis=1) + 1e-9
Tg = X_text_gemma            # (5000, 768)
Tg_norm = np.linalg.norm(Tg, axis=1) + 1e-9
cos_gemma = (Tg @ G.T) / (Tg_norm[:, None] * G_norm[None, :])

self_cos_g = cos_gemma[row_idx, metric_idx_arr]
cos_g_wo = cos_gemma.copy()
cos_g_wo[row_idx, metric_idx_arr] = -1e9
best_other_g = cos_g_wo.max(axis=1)
margin_g = self_cos_g - best_other_g

feat_df["gm_cos_self"]      = self_cos_g
feat_df["gm_cos_other_max"] = best_other_g
feat_df["gm_cos_margin"]    = margin_g

# ------------------ SIMPLE SCALARS + LANG FLAGS ------------------
feat_df["len_prompt"]   = df["prompt"].apply(lambda x: len(x or ""))
feat_df["len_response"] = df["response"].apply(lambda x: len(x or ""))
feat_df["ratio_resp_pr"] = (feat_df["len_response"] + 1) / (feat_df["len_prompt"] + 1)

def lang_flag(s):
    if any("\u0B80" <= ch <= "\u0BFF" for ch in s): return "ta_te"
    if any("\u0900" <= ch <= "\u097F" for ch in s): return "hi"
    if any("\u0980" <= ch <= "\u09FF" for ch in s): return "bn"
    return "latin"

langs = (df["prompt"].fillna("") + " " + df["response"].fillna("")).apply(lang_flag)
feat_df["lang_is_latin"] = (langs == "latin").astype(int)
feat_df["lang_is_hi"]    = (langs == "hi").astype(int)
feat_df["lang_is_ta_te"] = (langs == "ta_te").astype(int)

feat_df["metric_idx"] = df["metric_idx"].astype(int)
feat_df["score"]      = df["score"].astype(float)

metric_te = feat_df.groupby("metric_idx")["score"].mean().to_dict()
feat_df["metric_te"] = feat_df["metric_idx"].map(metric_te).fillna(feat_df["score"].mean())

feat_df.to_csv(OUT_TRAIN, index=False)
print("Saved base train features to:", OUT_TRAIN)


=== Encoding with Gemma (google/embeddinggemma-300m) for text ===


modules.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/997 [00:00<?, ?B/s]



README.md:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/312 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/9.44M [00:00<?, ?B/s]

3_Dense/model.safetensors:   0%|          | 0.00/9.44M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Gemma embeddings shape: (5000, 768)
Saved base train features to: /kaggle/working/train_oof_meta_learn.csv


In [11]:

# ------------------ AUGMENTATION (NEGATIVE SAMPLES) ------------------
print("\n=== Generating augmented rows (NEG_PER_SAMPLE = %d ) ===" % NEG_PER_SAMPLE)
unique_metrics = df["metric_idx"].unique().tolist()
aug_rows = []
for i in tqdm(range(len(feat_df)), desc="Creating aug rows"):
    base = feat_df.iloc[i].to_dict()
    base["is_synth"] = False
    aug_rows.append(base)

    cos_row_proto = cos_mat[i].copy()
    cos_row_g     = cos_gemma[i].copy()
    for _ in range(NEG_PER_SAMPLE):
        wrong = int(base["metric_idx"])
        while wrong == base["metric_idx"]:
            wrong = int(random.choice(unique_metrics))

        wrong_vec  = metric_embs_reduced[wrong]
        text_vec   = X_text_svd[i]
        pf         = pairwise_feats(wrong_vec.reshape(1,-1), text_vec.reshape(1,-1))
        wrong_proto = proto_by_metric.get(wrong, np.zeros(SVD_DIM))
        pf_pt       = pairwise_feats(wrong_proto.reshape(1,-1), text_vec.reshape(1,-1))

        # prototype margin for wrong metric
        self_cos_w = cos_row_proto[wrong]
        cos_row_proto_wo = cos_row_proto.copy()
        cos_row_proto_wo[wrong] = -1e9
        best_other_w = cos_row_proto_wo.max()
        margin_w     = self_cos_w - best_other_w

        # Gemma margin for wrong metric
        self_cos_g_w = cos_row_g[wrong]
        cos_row_g_wo = cos_row_g.copy()
        cos_row_g_wo[wrong] = -1e9
        best_other_g_w = cos_row_g_wo.max()
        margin_g_w     = self_cos_g_w - best_other_g_w

        neg = {
            "dot": float(pf["dot"][0]),
            "cos": float(pf["cos"][0]),
            "l1":  float(pf["l1"][0]),
            "l2":  float(pf["l2"][0]),
            "prod_mean": float(pf["prod_mean"][0]),
            "prod_std":  float(pf["prod_std"][0]),
            "ad_mean":   float(pf["ad_mean"][0]),
            "ad_std":    float(pf["ad_std"][0]),

            "dot_pt": float(pf_pt["dot"][0]),
            "cos_pt": float(pf_pt["cos"][0]),
            "l1_pt":  float(pf_pt["l1"][0]),
            "l2_pt":  float(pf_pt["l2"][0]),
            "prod_mean_pt": float(pf_pt["prod_mean"][0]),
            "prod_std_pt":  float(pf_pt["prod_std"][0]),
            "ad_mean_pt":   float(pf_pt["ad_mean"][0]),
            "ad_std_pt":    float(pf_pt["ad_std"][0]),

            "cos_pt_self_full": float(self_cos_w),
            "cos_pt_other_max": float(best_other_w),
            "cos_pt_margin":    float(margin_w),

            "gm_cos_self":      float(self_cos_g_w),
            "gm_cos_other_max": float(best_other_g_w),
            "gm_cos_margin":    float(margin_g_w),

            "len_prompt":   float(base["len_prompt"]),
            "len_response": float(base["len_response"]),
            "ratio_resp_pr":float(base["ratio_resp_pr"]),
            "lang_is_latin":int(base["lang_is_latin"]),
            "lang_is_hi":   int(base["lang_is_hi"]),
            "lang_is_ta_te":int(base["lang_is_ta_te"]),
            "metric_idx":   int(wrong),
            "score":        float(random.choice([0,1,2])),
            "metric_te":    float(metric_te.get(wrong, feat_df["score"].mean())),
            "is_synth": True
        }
        aug_rows.append(neg)

aug_df = pd.DataFrame(aug_rows)
aug_df.to_csv(OUT_AUG, index=False)
print("Saved augmented dataframe to:", OUT_AUG, "shape:", aug_df.shape)


=== Generating augmented rows (NEG_PER_SAMPLE = 1 ) ===


Creating aug rows:   0%|          | 0/5000 [00:00<?, ?it/s]

Saved augmented dataframe to: /kaggle/working/train_aug_meta_learn.csv shape: (10000, 32)


In [12]:
# ------------------ FEATURE COLS ------------------
feature_cols = [
    "dot","cos","l1","l2","prod_mean","prod_std","ad_mean","ad_std",
    "dot_pt","cos_pt","l1_pt","l2_pt","prod_mean_pt","prod_std_pt","ad_mean_pt","ad_std_pt",
    "cos_pt_self_full","cos_pt_other_max","cos_pt_margin",
    "gm_cos_self","gm_cos_other_max","gm_cos_margin",
    "len_prompt","len_response","ratio_resp_pr",
    "lang_is_latin","lang_is_hi","lang_is_ta_te","metric_te"
]
feature_cols = [c for c in feature_cols if c in feat_df.columns]
print("\nUsing feature columns:", feature_cols)


Using feature columns: ['dot', 'cos', 'l1', 'l2', 'prod_mean', 'prod_std', 'ad_mean', 'ad_std', 'dot_pt', 'cos_pt', 'l1_pt', 'l2_pt', 'prod_mean_pt', 'prod_std_pt', 'ad_mean_pt', 'ad_std_pt', 'cos_pt_self_full', 'cos_pt_other_max', 'cos_pt_margin', 'gm_cos_self', 'gm_cos_other_max', 'gm_cos_margin', 'len_prompt', 'len_response', 'ratio_resp_pr', 'lang_is_latin', 'lang_is_hi', 'lang_is_ta_te', 'metric_te']


## Models

In [13]:
# ------------------ BASE MODELS (Ridge + RF) ------------------
X = feat_df[feature_cols].fillna(0).values
y = feat_df["score"].values
groups = feat_df["metric_idx"].values

gkf = GroupKFold(n_splits=N_SPLITS)
oof_ridge = np.zeros(len(y))
oof_rf    = np.zeros(len(y))
ridges = []
rfs    = []

print("\n=== Training OOF base models... ===")
for fold, (tr, val) in enumerate(gkf.split(X, y, groups)):
    print(" Fold", fold)
    r = Ridge(alpha=RIDGE_ALPHA, random_state=RND)
    r.fit(X[tr], y[tr])
    oof_ridge[val] = r.predict(X[val])
    ridges.append(r)

    rf = RandomForestRegressor(
        n_estimators   = RF_N_ESTIMATORS,
        max_depth      = RF_MAX_DEPTH,
        min_samples_leaf = RF_MIN_LEAF,
        n_jobs=-1,
        random_state=RND
    )
    rf.fit(X[tr], y[tr])
    oof_rf[val] = rf.predict(X[val])
    rfs.append(rf)

print("Ridge OOF RMSE:", mean_squared_error(y, oof_ridge, squared=False))
print("RF    OOF RMSE:", mean_squared_error(y, oof_rf,    squared=False))


=== Training OOF base models... ===
 Fold 0
 Fold 1
 Fold 2
 Fold 3
 Fold 4
Ridge OOF RMSE: 0.8678186308819579
RF    OOF RMSE: 0.9136772813864203


In [14]:
# ------------------ RF ON AUGMENTED DATA ------------------
print("\nTraining augmented-RF model on augmented data …")
X_aug = aug_df[feature_cols].fillna(0).values
y_aug = aug_df["score"].values
rf_aug = RandomForestRegressor(
    n_estimators   = RF_N_ESTIMATORS,
    max_depth      = RF_MAX_DEPTH,
    min_samples_leaf = RF_MIN_LEAF,
    n_jobs=-1,
    random_state=RND
)
rf_aug.fit(X_aug, y_aug)
aug_preds_on_real = rf_aug.predict(feat_df[feature_cols].fillna(0).values)

combined = feat_df[["metric_idx","score"]].copy()
combined["oof_ridge"]     = oof_ridge
combined["oof_rf"]        = oof_rf
combined["oof_aug_model"] = aug_preds_on_real
combined.to_csv(OUT_COMB, index=False)
print("Saved combined OOFs to:", OUT_COMB)


Training augmented-RF model on augmented data …
Saved combined OOFs to: /kaggle/working/combined_oof_meta_learn.csv


In [15]:
# ------------------ META-LEARNER (Ridge on [ridge, rf, aug, metric_te]) ------------------
meta_X = combined[["oof_ridge","oof_rf","oof_aug_model"]].values
meta_extra = combined[["metric_idx","score"]].copy()
meta_extra["metric_te"] = combined["metric_idx"].map(metric_te)
meta_X_extra = np.concatenate([meta_X, meta_extra[["metric_te"]].values], axis=1)

meta_y = combined["score"].values
meta_groups = combined["metric_idx"].values

gkf_meta = GroupKFold(n_splits=N_SPLITS)
meta_oof = np.zeros(len(meta_y))
from sklearn.linear_model import Ridge as MetaRidge
meta_models = []

print("\n=== Training meta-learner (Ridge) … ===")
for fold, (tr, val) in enumerate(gkf_meta.split(meta_X_extra, meta_y, meta_groups)):
    print(" Meta fold", fold)
    m = MetaRidge(alpha=META_ALPHA, random_state=RND)
    m.fit(meta_X_extra[tr], meta_y[tr])
    meta_oof[val] = m.predict(meta_X_extra[val])
    meta_models.append(m)

print("Meta-learner OOF RMSE (no bias):", mean_squared_error(meta_y, meta_oof, squared=False))

combined["meta_oof_pred"] = meta_oof


=== Training meta-learner (Ridge) … ===
 Meta fold 0
 Meta fold 1
 Meta fold 2
 Meta fold 3
 Meta fold 4
Meta-learner OOF RMSE (no bias): 0.6067000760669571


In [16]:
# ------------------ BIAS CORRECTION ON META PRED ------------------
print("\n=== Bias correction (metric-wise) ===")
ms = combined.groupby("metric_idx").agg(
    n=("score","size"),
    mean_true=("score","mean"),
    mean_pred=("meta_oof_pred","mean")
).reset_index()
ms["bias_raw"] = ms["mean_true"] - ms["mean_pred"]
ms["bias_smoothed"] = (ms["n"] * ms["bias_raw"]) / (ms["n"] + BIAS_ALPHA)
bias_map = ms.set_index("metric_idx")["bias_smoothed"].to_dict()

combined["bias"] = combined["metric_idx"].map(bias_map).fillna(0.0)
meta_oof_corr = np.clip(combined["meta_oof_pred"] + combined["bias"], 0.0, 10.0)
print("Meta-learner + bias OOF RMSE:", mean_squared_error(meta_y, meta_oof_corr, squared=False))

# update feat_df with OOFs (for any analysis you want)
feat_df["oof_ridge"]   = oof_ridge
feat_df["oof_rf"]      = oof_rf
feat_df["oof_aug_model"] = aug_preds_on_real
feat_df["meta_oof"]    = meta_oof
feat_df["meta_oof_corr"] = meta_oof_corr
feat_df.to_csv(OUT_TRAIN, index=False)


=== Bias correction (metric-wise) ===
Meta-learner + bias OOF RMSE: 0.5840728457241864


## Inference

In [17]:

# ------------------ TEST INFERENCE ------------------
print("\n=== Preparing test predictions … ===")
with open(TEST_JSON, "r", encoding="utf-8") as f:
    test_raw = json.load(f)
test_rows = []
for rec in test_raw:
    test_rows.append({
        "metric_idx": metric_map.get(rec.get("metric_name"), -1),
        "prompt": (rec.get("prompt","") or ""),
        "system_prompt": (rec.get("system_prompt","") or ""),
        "response": (rec.get("response","") or "")
    })
test_df = pd.DataFrame(test_rows)
test_texts = (test_df["prompt"].fillna("") + " [SEP] " +
              test_df["system_prompt"].fillna("") + " [SEP] " +
              test_df["response"].fillna("")).tolist()

# text features
X_test_tfidf = tfidf.transform(test_texts)
X_test_svd   = svd.transform(X_test_tfidf)

test_metric_vecs = np.vstack([
    metric_embs_reduced[idx] if 0 <= idx < metric_embs_reduced.shape[0] else np.zeros(SVD_DIM)
    for idx in test_df["metric_idx"].values
])
test_pairs = pairwise_feats(test_metric_vecs, X_test_svd)

test_proto_self = np.vstack([
    proto_by_metric.get(int(mid), np.zeros(SVD_DIM))
    for mid in test_df["metric_idx"].values
])
test_proto_pairs = pairwise_feats(test_proto_self, X_test_svd)

T_test   = X_test_svd
Tt_norm  = np.linalg.norm(T_test, axis=1) + 1e-9
cos_mat_test = (T_test @ P.T) / (Tt_norm[:, None] * P_norm[None, :])
metric_idx_test = test_df["metric_idx"].values.astype(int)
row_idx_test    = np.arange(len(test_df))
self_cos_test   = cos_mat_test[row_idx_test, metric_idx_test]
cos_mat_test_wo = cos_mat_test.copy()
cos_mat_test_wo[row_idx_test, metric_idx_test] = -1e-9
best_other_test = cos_mat_test_wo.max(axis=1)
margin_test     = self_cos_test - best_other_test

X_test_gemma = gemma_model.encode(
    test_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)
Tg_test   = X_test_gemma
Tgn_test  = np.linalg.norm(Tg_test, axis=1) + 1e-9
cos_gemma_test = (Tg_test @ G.T) / (Tgn_test[:, None] * G_norm[None, :])
self_cos_g_test = cos_gemma_test[row_idx_test, metric_idx_test]
cos_g_test_wo = cos_gemma_test.copy()
cos_g_test_wo[row_idx_test, metric_idx_test] = -1e-9
best_other_g_test = cos_g_test_wo.max(axis=1)
margin_g_test     = self_cos_g_test - best_other_g_test

test_feat = pd.DataFrame(test_pairs)
for k, v in test_proto_pairs.items():
    test_feat[f"{k}_pt"] = v
test_feat["cos_pt_self_full"] = self_cos_test
test_feat["cos_pt_other_max"] = best_other_test
test_feat["cos_pt_margin"]    = margin_test
test_feat["gm_cos_self"]      = self_cos_g_test
test_feat["gm_cos_other_max"] = best_other_g_test
test_feat["gm_cos_margin"]    = margin_g_test
test_feat["len_prompt"]       = test_df["prompt"].apply(lambda x: len(x or ""))
test_feat["len_response"]     = test_df["response"].apply(lambda x: len(x or ""))
test_feat["ratio_resp_pr"]    = (test_feat["len_response"] + 1)/(test_feat["len_prompt"] + 1)
langs_test = (test_df["prompt"].fillna("") + " " + test_df["response"].fillna("")).apply(lang_flag)
test_feat["lang_is_latin"] = (langs_test == "latin").astype(int)
test_feat["lang_is_hi"]    = (langs_test == "hi").astype(int)
test_feat["lang_is_ta_te"] = (langs_test == "ta_te").astype(int)
test_feat["metric_idx"]    = test_df["metric_idx"].values
test_feat["metric_te"]     = test_feat["metric_idx"].map(metric_te).fillna(feat_df["score"].mean())

X_test_feat = test_feat[feature_cols].fillna(0).values
test_ridge_preds = np.column_stack([m.predict(X_test_feat) for m in ridges]).mean(axis=1)
test_rf_preds    = np.column_stack([m.predict(X_test_feat) for m in rfs]).mean(axis=1)
test_aug_preds   = rf_aug.predict(X_test_feat)

meta_test_X = np.vstack([test_ridge_preds, test_rf_preds, test_aug_preds]).T
test_extra  = test_feat["metric_te"].values.reshape(-1,1)
meta_test_X_extra = np.concatenate([meta_test_X, test_extra], axis=1)
test_meta_preds   = np.mean([m.predict(meta_test_X_extra) for m in meta_models], axis=0)

test_biases   = np.array([bias_map.get(int(mi), 0.0) for mi in test_df["metric_idx"].values])
test_meta_corr = np.clip(test_meta_preds + test_biases, 0.0, 10.0)

sample = pd.read_csv(SUBMISSION_FILE)
if len(sample) == len(test_meta_corr):
    out_sub = pd.DataFrame({"ID": sample.iloc[:,0].values, "score": test_meta_corr})
else:
    out_sub = pd.DataFrame({"ID": np.arange(1, len(test_meta_corr)+1), "score": test_meta_corr})

out_sub.to_csv(OUT_SUB, index=False)
print("\nSaved submission to:", OUT_SUB)
print(out_sub.head())

print("\nDONE.")


=== Preparing test predictions … ===


Batches:   0%|          | 0/57 [00:00<?, ?it/s]


Saved submission to: /kaggle/working/submission_da5401_meta_learn_bias.csv
   ID     score
0   1  9.134753
1   2  9.365601
2   3  8.989640
3   4  9.227561
4   5  2.666913

DONE.
