# Preprocessing

This embedding pipeline transforms raw JSON evaluation data into reusable vector representations.


In [None]:
import numpy as np
import pandas as pd
import json

import numpy as np
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

In [3]:
# --- Configuration ---
# Single seed value handed to every random number generator this notebook seeds.
RANDOM_SEED = 42

# Model identifier passed to SentenceTransformer when the encoder is created.
MODEL_NAME = "google/embeddinggemma-300m"

# Training hyperparameters. They are not referenced by the preprocessing cells
# visible here -- presumably consumed by a later modeling stage (TODO confirm).
BATCH_SIZE = 32
NUM_EPOCHS = 20  # bumped up slightly in the hope of better convergence
LEARNING_RATE = 1e-4

In [1]:
# --- File Paths ---
# Inputs: the list of metric names and the array of their precomputed embeddings.
metric_embeddings_path = "metric_name_embeddings.npy"
metric_names_path = "metric_names.json"

# Inputs: raw JSON records for the two data splits.
train_data_path, test_data_path = "train_data.json", "test_data.json"

# Artifact locations that the cells visible here never write to --
# presumably used by a later training / inference stage (TODO confirm).
best_model_save_path = "best_model.pth"
output_submission_path = "submission.csv"

In [4]:
# Ensure reproducibility
# Seed NumPy's global RNG and PyTorch's default generator from the same constant.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
    # Only touch the CUDA / cuDNN settings when a GPU is actually visible:
    # switch off cuDNN benchmark mode, request deterministic algorithms,
    # then seed every CUDA device.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(RANDOM_SEED)

In [None]:
# --- 1. Load Data ---

print("--- Loading Data ---")
# Load metric names. The encoding is given explicitly (like the other JSON
# loads below) so decoding does not depend on the platform's default codec.
with open(metric_names_path, "r", encoding="utf-8") as f:
    metric_names = json.load(f)

# Load metric embeddings
metric_embeddings_array = np.load(metric_embeddings_path)

# Row i of the array is paired with metric_names[i]. Fail loudly if the two
# files disagree in length rather than building a silently misaligned or
# truncated mapping.
if len(metric_names) != len(metric_embeddings_array):
    raise ValueError(
        f"{metric_names_path} lists {len(metric_names)} names but "
        f"{metric_embeddings_path} holds {len(metric_embeddings_array)} embeddings."
    )

# Create a mapping from metric name to its embedding
# (iterating the array yields the same rows as indexing it with i).
metric_name_to_embedding = dict(zip(metric_names, metric_embeddings_array))

# Load training data
with open(train_data_path, "r", encoding="utf-8") as f:
    train_data_raw = json.load(f)

# Load test data
with open(test_data_path, "r", encoding="utf-8") as f:
    test_data_raw = json.load(f)


--- Loading Data ---


In [6]:
# Summarise what was loaded: one line per dataset, emitted by a single print call.
print(
    f"Loaded {len(metric_names)} metric names and {metric_embeddings_array.shape} embeddings.",
    f"Loaded {len(train_data_raw)} training examples.",
    f"Loaded {len(test_data_raw)} test examples.",
    sep="\n",
)

Loaded 145 metric names and (145, 768) embeddings.
Loaded 5000 training examples.
Loaded 3638 test examples.


In [7]:
# --- 2. Initialize Sentence Transformer Model ---
print(f"\n--- Initializing Sentence Transformer Model: {MODEL_NAME} ---")
embedding_model = SentenceTransformer(MODEL_NAME)
EMBEDDING_DIM = embedding_model.get_sentence_embedding_dimension()
# Move the model to the GPU when one is visible; otherwise keep it on the CPU
# so this cell does not raise on machines without CUDA.
embedding_model = embedding_model.to('cuda' if torch.cuda.is_available() else 'cpu')
print(f"SentenceTransformer model loaded. Embedding dimension: {EMBEDDING_DIM}")



--- Initializing Sentence Transformer Model: google/embeddinggemma-300m ---
SentenceTransformer model loaded. Embedding dimension: 768


# Processing Each Data Entry

Every training or test sample is passed through a standardized function:

```
process_entry(...)
```

This function extracts:

- `metric_name`
- `system_prompt`
- `user_prompt`
- `response`

Each field is cleaned using:

```
safe_text(x) → returns x if it is a string, otherwise "" (covers None and any other non-string value)
```

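Then five embeddings are gathered: the first is looked up from the precomputed metric embeddings, and the other four are computed with the SentenceTransformer model: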

| Field | Description |
|---|---|
| metric_embedding | Precomputed reference embedding for this metric |
| metric_name_embedding | SentenceTransformer embedding of the metric string |
| system_prompt_embedding | Embedding of system-level instructions |
| user_prompt_embedding | Embedding of user-provided query |
| response_embedding | Embedding of the generated output |

If the record is from training data, we also attach the numeric `score`.

This function creates a complete structured representation of every sample in embedding space.


In [8]:
def safe_text(x):
    """Return ``x`` unchanged when it is a ``str``; otherwise return ``""``.

    ``None`` and every other non-string value (numbers, lists, dicts, ...)
    collapse to the empty string.
    """
    # isinstance(None, str) is already False, so no separate None check is needed.
    return x if isinstance(x, str) else ""

def process_entry(entry, metric_embedding_map, embedding_model, is_train=True, device=None):
    """Turn one raw record into a dict of embeddings (plus the score for training rows).

    Args:
        entry: Mapping-like record supporting ``.get``. The keys ``metric_name``,
            ``system_prompt``, ``user_prompt`` and ``response`` are read; missing
            or non-string values are treated as ``""`` via ``safe_text``.
        metric_embedding_map: Mapping from metric name to its precomputed embedding.
        embedding_model: Object exposing
            ``encode(text, convert_to_numpy=True, device=...)``.
        is_train: When true, ``entry["score"]`` is converted to ``float`` and
            returned under the ``"score"`` key.
        device: Forwarded to ``embedding_model.encode``. The default ``None``
            leaves device selection to the model instead of forcing ``'cuda'``.

    Returns:
        dict with keys ``metric_embedding``, ``metric_name_embedding``,
        ``system_prompt_embedding``, ``user_prompt_embedding`` and
        ``response_embedding``; also ``score`` when ``is_train`` is true.

    Raises:
        KeyError: If the metric name has no entry in ``metric_embedding_map``,
            or if ``is_train`` is true and ``entry`` has no ``"score"``.
    """
    metric_name = safe_text(entry.get('metric_name'))
    system_prompt = safe_text(entry.get('system_prompt'))
    user_prompt = safe_text(entry.get('user_prompt'))
    response = safe_text(entry.get('response'))

    # ---- embeddings ----
    try:
        metric_embedding = metric_embedding_map[metric_name]
    except KeyError:
        # Same exception type as the bare lookup, but the message now says
        # which metric name (possibly "" for a missing one) was not found.
        raise KeyError(
            f"No precomputed embedding for metric_name {metric_name!r}"
        ) from None

    def _encode(text):
        # One encode call per field, exactly as before; only the device
        # argument is no longer hard-coded.
        return embedding_model.encode(text, convert_to_numpy=True, device=device)

    result = {
        "metric_embedding": metric_embedding,
        "metric_name_embedding": _encode(metric_name),
        "system_prompt_embedding": _encode(system_prompt),
        "user_prompt_embedding": _encode(user_prompt),
        "response_embedding": _encode(response)
    }

    if is_train:
        result["score"] = float(entry["score"])

    return result


In [None]:
# -------- reuse the already-loaded test json --------
# `test_data_raw` was parsed from `test_data_path` in the data-loading cell.
# Reusing it keeps this cell tied to the configured path instead of a second,
# hard-coded "test_data.json" read. The `test_data` name is kept as an alias.
test_data = test_data_raw

metric_embeds = []
metric_name_embeds = []
system_embeds = []
user_embeds = []
response_embeds = []

# -------- process each entry --------
for entry in test_data:
    out = process_entry(entry, metric_name_to_embedding, embedding_model, is_train=False)

    metric_embeds.append(out["metric_embedding"])
    metric_name_embeds.append(out["metric_name_embedding"])
    system_embeds.append(out["system_prompt_embedding"])
    user_embeds.append(out["user_prompt_embedding"])
    response_embeds.append(out["response_embedding"])

# -------- convert to numpy --------
metric_embeds = np.array(metric_embeds)
metric_name_embeds = np.array(metric_name_embeds)
system_embeds = np.array(system_embeds)
user_embeds = np.array(user_embeds)
response_embeds = np.array(response_embeds)

# Every saved array must have exactly one row per test record.
assert all(
    len(arr) == len(test_data)
    for arr in (metric_embeds, metric_name_embeds, system_embeds, user_embeds, response_embeds)
), "test embedding arrays are misaligned with test_data"

# -------- save separately --------
np.save("test_metric_embedding.npy", metric_embeds)
np.save("test_metric_name_embedding.npy", metric_name_embeds)
np.save("test_system_prompt_embedding.npy", system_embeds)
np.save("test_user_prompt_embedding.npy", user_embeds)
np.save("test_response_embedding.npy", response_embeds)
print("Test embeddings saved.")

Test embeddings saved.


In [10]:
import numpy as np
import json

# The name -> embedding mapping was already built in the data-loading cell as
# `metric_name_to_embedding`; alias it instead of constructing an identical dict.
metric_embedding_map = metric_name_to_embedding

# ---- embed the training records already loaded into `train_data_raw` ----

metric_embeds = []
metric_name_embeds = []
system_embeds = []
user_embeds = []
response_embeds = []
scores = []

for i, entry in enumerate(train_data_raw):
    out = process_entry(entry, metric_embedding_map, embedding_model, is_train=True)

    metric_embeds.append(out["metric_embedding"])
    metric_name_embeds.append(out["metric_name_embedding"])
    system_embeds.append(out["system_prompt_embedding"])
    user_embeds.append(out["user_prompt_embedding"])
    response_embeds.append(out["response_embedding"])
    scores.append(out["score"])
    # Lightweight progress report every 1000 records.
    if (i+1) % 1000 == 0:
        print(f"Processed {i+1} training entries.")


# ---- convert → numpy ----
metric_embeds = np.array(metric_embeds)
metric_name_embeds = np.array(metric_name_embeds)
system_embeds = np.array(system_embeds)
user_embeds = np.array(user_embeds)
response_embeds = np.array(response_embeds)
scores = np.array(scores)

# Every saved array must have exactly one row per training record.
assert all(
    len(arr) == len(train_data_raw)
    for arr in (metric_embeds, metric_name_embeds, system_embeds, user_embeds, response_embeds, scores)
), "train embedding arrays are misaligned with train_data_raw"

# ---- save ----
np.save("train_metric_embedding.npy", metric_embeds)
np.save("train_metric_name_embedding.npy", metric_name_embeds)
np.save("train_system_prompt_embedding.npy", system_embeds)
np.save("train_user_prompt_embedding.npy", user_embeds)
np.save("train_response_embedding.npy", response_embeds)
np.save("train_scores.npy", scores)
print("Train embeddings saved.")


Processed 1000 training entries.
Processed 2000 training entries.
Processed 3000 training entries.
Processed 4000 training entries.
Processed 5000 training entries.


### Save to Disk

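Each embedding type is saved as its own `.npy` file. For the test split:

```
test_metric_embedding.npy
test_metric_name_embedding.npy
test_system_prompt_embedding.npy
test_user_prompt_embedding.npy
test_response_embedding.npy
```

The training split is saved the same way, together with the target scores:

```
train_metric_embedding.npy
train_metric_name_embedding.npy
train_system_prompt_embedding.npy
train_user_prompt_embedding.npy
train_response_embedding.npy
train_scores.npy
```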

These files are used later for downstream modeling:

- Matching models  
- Regression predictions  
- Post-processing