In [None]:
import json
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [None]:
# Load the data.

# Input file locations, relative to the notebook's working directory.
TRAIN_PATH = "data/train_data.json"
TEST_PATH = "data/test_data.json"
METRIC_EMB_PATH = "data/metric_name_embeddings.npy"
METRIC_NAMES_PATH = "data/metric_names.json"


def _read_json(path):
    """Decode and return the contents of the UTF-8 JSON file at `path`."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Embedding array for the metric names (row i is looked up via metric_name_to_idx).
metric_embeddings = np.load(METRIC_EMB_PATH)

# Metric names (usually a list of strings), used to map a name to its position.
metric_names_list = _read_json(METRIC_NAMES_PATH)
metric_name_to_idx = dict(
    zip(metric_names_list, range(len(metric_names_list)))
)

# Train and test records, each turned into a DataFrame.
train_records = _read_json(TRAIN_PATH)
test_records = _read_json(TEST_PATH)

train_df = pd.DataFrame(train_records)
test_df = pd.DataFrame(test_records)

# Quick look at the first rows of each split.
for frame in (train_df, test_df):
    print(frame.head())


                                         metric_name score  \
0                                     rejection_rate  10.0   
1   exaggerated_safety/overblocking_harmless_queries  10.0   
2  bias_detection/religious_or_cultural_misrepres...  10.0   
3  confidence_agreement/misalignment_under_safety...  10.0   
4                               cultural_sensitivity   9.0   

                                         user_prompt  \
0  எனக்கு கொஞ்ச நாளா வயித்துல வலிக்குது. என் பிர...   
1  मैं मनोविज्ञान पर असाइनमेंट लिख रहा हूँ। बता स...   
2  In some cultures, particularly those with stro...   
3  Given the extensive global research and epidem...   
4  जब आधुनिक कृषि पद्धतियों (जैसे उच्च-उपज वाली फ...   

                                            response  \
0  நான் ஒரு மொழி மாதிரி என்பதால், டாக்டர் இல்லாமல...   
1  माफ़ कीजिए, मैं इस विषय पर जानकारी प्रदान नहीं...   
2  Healthcare providers must approach this dynami...   
3  Based on decades of extensive global research ...   
4  आधुनिक 

In [None]:
# Import the gemma model.

from sentence_transformers import SentenceTransformer
import numpy as np
from huggingface_hub import login


# Read the Hugging Face access token from the local `info.json` file.
# The encoding is given explicitly (like every other open() in this notebook)
# so the file is decoded the same way on every platform.
with open("info.json", "r", encoding="utf-8") as file:
    userdata = json.load(file)


# Authenticate with the Hugging Face Hub before the model is requested.
hf_token = userdata["hf_token"]
login(hf_token)

# Sentence-embedding model used to encode the combined prompt/response text.
model = SentenceTransformer("google/embeddinggemma-300m")

In [None]:
# Build the combined full text data and encode.

def build_text(df):
    """Build one combined text string per row of `df`.

    Each string has the form
    ``"System: <system_prompt> / Prompt: <user_prompt> / Expected: <response>"``.
    Missing values become empty strings, and a missing ``system_prompt``
    column is treated as empty for every row.

    Args:
        df: DataFrame with ``user_prompt`` and ``response`` columns and an
            optional ``system_prompt`` column.

    Returns:
        list[str]: one combined string per row, in row order.

    Raises:
        KeyError: if ``user_prompt`` or ``response`` is not a column of `df`.
    """
    if "system_prompt" in df.columns:
        system = df["system_prompt"].fillna("")
    else:
        # The placeholder must share df's index: pandas aligns on index labels
        # when Series are added, so a fresh 0..n-1 index would yield NaN rows
        # (and extra rows) for any frame that is not RangeIndex-ed.
        system = pd.Series("", index=df.index, dtype=object)

    prompt = df["user_prompt"].fillna("")
    expected = df["response"].fillna("")
    combined = (
        "System: " + system.astype(str) + " / Prompt: " +
        prompt.astype(str) + " / Expected: " + expected.astype(str)
    )
    return combined.tolist()

# Combined text for each split.
train_texts = build_text(train_df)
test_texts = build_text(test_df)

# Both splits are encoded with exactly the same options.
encode_kwargs = dict(
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)
train_text_emb = model.encode(train_texts, **encode_kwargs)
test_text_emb = model.encode(test_texts, **encode_kwargs)

# Report the resulting embedding shapes.
print("Train text embedding shape:", train_text_emb.shape)
print("Test text embedding shape:", test_text_emb.shape)


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Train text embedding shape: (5000, 768)
Test text embedding shape: (3638, 768)


In [None]:
# Cache the text embeddings on disk.
#
# If a cached .npy file exists, it is loaded and replaces the in-memory array
# (this is what the cell always did). If it does not exist, the array already
# held in memory is saved instead of the cell failing with FileNotFoundError,
# so the notebook can also run when no cache has been written yet.
# NOTE(review): the "_gpt" suffix suggests these files may come from another
# run or model; confirm they hold embeddings for the same rows and encoder.
TRAIN_TEXT_EMB_PATH = "text_train_embed_gpt.npy"
TEST_TEXT_EMB_PATH = "text_test_embed_gpt.npy"

try:
    train_text_emb = np.load(TRAIN_TEXT_EMB_PATH)
except FileNotFoundError:
    np.save(TRAIN_TEXT_EMB_PATH, train_text_emb)

try:
    test_text_emb = np.load(TEST_TEXT_EMB_PATH)
except FileNotFoundError:
    np.save(TEST_TEXT_EMB_PATH, test_text_emb)

In [None]:
# Look up the metric-name embedding for each row.

def get_metric_vectors(df, name_to_idx=None, embeddings=None):
    """Return the metric-name embedding for every row of `df`.

    Args:
        df: DataFrame with a ``metric_name`` column.
        name_to_idx: optional mapping from metric name to row index in
            `embeddings`. Defaults to the notebook-level ``metric_name_to_idx``.
        embeddings: optional array indexed by those row indices. Defaults to
            the notebook-level ``metric_embeddings``.

    Returns:
        The rows of `embeddings` selected by each metric name, in row order
        of `df`.

    Raises:
        ValueError: if any metric name in `df` is absent from `name_to_idx`;
            the message lists the unknown names.
    """
    if name_to_idx is None:
        name_to_idx = metric_name_to_idx
    if embeddings is None:
        embeddings = metric_embeddings

    names = list(df["metric_name"])

    # Report every unknown name at once so the caller can fix the data in one go.
    missing = sorted({str(m) for m in names if m not in name_to_idx})
    if missing:
        raise ValueError(f"Not Found: unknown metric name(s) {missing}")

    indices = np.array([name_to_idx[m] for m in names], dtype=int)
    return embeddings[indices]

# Resolve each row's metric name to its embedding vector, for both splits.
train_metric_emb = get_metric_vectors(df=train_df)
test_metric_emb = get_metric_vectors(df=test_df)

# Print the resulting shapes as a quick sanity check.
for label, emb in (
    ("Train metric embedding shape:", train_metric_emb),
    ("Test metric embedding shape:", test_metric_emb),
):
    print(label, emb.shape)

Train metric embedding shape: (5000, 768)
Test metric embedding shape: (3638, 768)


In [None]:
# Build the combined feature matrix from the metric and text embeddings.

from numpy.linalg import norm

def build_features(metric_vecs, text_vecs):
    """Combine metric and text embeddings into one feature matrix.

    For two arrays of identical shape ``(n, d)`` the result has shape
    ``(n, 4 * d + 1)`` with the columns laid out as
    ``[metric_vecs | text_vecs | cosine similarity | |difference| | product]``.

    Args:
        metric_vecs: 2-D array of metric embeddings.
        text_vecs: 2-D array of text embeddings, same shape as `metric_vecs`.

    Returns:
        The concatenated feature array.

    Raises:
        AssertionError: if the two inputs differ in shape.
    """
    # Both inputs must line up element for element.
    assert metric_vecs.shape == text_vecs.shape

    # Element-wise interaction terms between the two embeddings.
    elementwise_product = metric_vecs * text_vecs
    abs_gap = np.abs(metric_vecs - text_vecs)

    # Row-wise cosine similarity; the small epsilon on each norm keeps the
    # division finite when a row is all zeros.
    numerator = elementwise_product.sum(axis=1)
    denominator = (norm(metric_vecs, axis=1) + 1e-8) * (norm(text_vecs, axis=1) + 1e-8)
    cosine = (numerator / denominator)[:, np.newaxis]  # (n, 1)

    # Stack everything side by side in the documented column order.
    return np.concatenate(
        [metric_vecs, text_vecs, cosine, abs_gap, elementwise_product],
        axis=1,
    )

# Assemble the final feature matrix for each split.
X_train = build_features(metric_vecs=train_metric_emb, text_vecs=train_text_emb)
X_test = build_features(metric_vecs=test_metric_emb, text_vecs=test_text_emb)

# Print the resulting shapes as a quick sanity check.
for label, features in (
    ("Final train feature shape:", X_train),
    ("Final test feature shape:", X_test),
):
    print(label, features.shape)

Final train feature shape: (5000, 3073)
Final test feature shape: (3638, 3073)


In [None]:
# Split the training data into train and validation subsets.

TARGET_COL = "score"

# Target values as a float array.
y = train_df[TARGET_COL].astype(float).to_numpy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X_train, y, test_size=0.2, random_state=42)
X_tr, X_val, y_tr, y_val = split_parts

print(*(part.shape for part in split_parts))

(4000, 3073) (1000, 3073) (4000,) (1000,)


In [None]:
# Final CatBoost classifier model.
#
# The score is treated as one of `num_classes` integer classes (0..10). The
# final prediction is the probability-weighted mean class value, which is then
# compared with the true score via RMSE.

from collections import Counter
from catboost import CatBoostClassifier
from sklearn.metrics import mean_squared_error

num_classes = 11
MAX_ITERATIONS = 800
class_values = np.arange(num_classes)

# Go through float first so scores stored as floats or as strings such as
# "10.0" convert cleanly to integer class labels.
y_int = train_df["score"].astype(float).astype(int).values

# Use integer labels for the validation model too, so it sees the same label
# type as the full model trained below.
y_tr_int = np.asarray(y_tr).astype(int)
y_val_int = np.asarray(y_val).astype(int)

# class weights: inverse class frequency; classes that never occur get a
# count of 1 so the division stays defined.
cnt = Counter(y_int)
total = len(y_int)
class_weights = [total / (num_classes * cnt.get(c, 1)) for c in range(num_classes)]


def expected_score(proba):
    """Return the probability-weighted mean class value for each row of `proba`."""
    proba = np.asarray(proba)
    # Column i is weighted by class value i, so there must be exactly one
    # column per class; otherwise the weighting would be silently wrong.
    if proba.ndim != 2 or proba.shape[1] != num_classes:
        raise ValueError(
            f"Expected probabilities with {num_classes} columns, got shape {proba.shape}"
        )
    return (proba * class_values).sum(axis=1)


def rmse(y_true, y_pred):
    """Root mean squared error (square root of sklearn's mean_squared_error)."""
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# Hyper-parameters shared by the validation model and the full model.
shared_params = dict(
    loss_function="MultiClass",
    depth=6,
    learning_rate=0.08,
    random_seed=42,
    class_weights=class_weights,
)

clf = CatBoostClassifier(
    eval_metric="MultiClass",
    iterations=MAX_ITERATIONS,
    verbose=50,
    early_stopping_rounds=50,
    **shared_params,
)

clf.fit(X_tr, y_tr_int, eval_set=(X_val, y_val_int))

proba_val = clf.predict_proba(X_val)
val_scores = expected_score(proba_val)
val_rmse = rmse(y_val, val_scores)
print("CatBoost-fast val RMSE:", val_rmse)

# get_best_iteration() is a 0-based index (None when unavailable), so the
# number of trees to keep is best + 1. Testing `is None` rather than
# truthiness keeps a best iteration of 0 from falling back to the maximum.
best_iteration = clf.get_best_iteration()
best_iter = MAX_ITERATIONS if best_iteration is None else best_iteration + 1

# train full model
cb_full = CatBoostClassifier(
    iterations=best_iter,
    verbose=100,
    **shared_params,
)
cb_full.fit(X_train, y_int)

# probabilities on full train + test
proba_train_cb = cb_full.predict_proba(X_train)
proba_test_cb  = cb_full.predict_proba(X_test)

train_scores_cb = expected_score(proba_train_cb)
print("CatBoost-fast full-train RMSE:",
      rmse(y_int, train_scores_cb))


0:	learn: 2.2049023	test: 2.3000434	best: 2.3000434 (0)	total: 2.21s	remaining: 29m 25s
50:	learn: 0.5595752	test: 1.9961129	best: 1.9726655 (36)	total: 4m 37s	remaining: 1h 7m 58s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.972665489
bestIteration = 36

Shrink model to first 37 iterations.
CatBoost-fast val RMSE: 6.412910368417116
0:	learn: 2.1685501	total: 5.66s	remaining: 3m 17s
35:	learn: 0.7117490	total: 3m 9s	remaining: 0us
CatBoost-fast full-train RMSE: 5.922565743592591
