# Step 2 - Embedding-based Model

In this notebook we use a pre-trained sentence-embedding model (MiniLM) to embed each prompt+response pair,
and we train a logistic-regression classifier on the concatenated A/B embeddings to predict the winner (A, B, or tie).


In [1]:
# Imports
import pandas as pd, numpy as np, re, torch
from pathlib import Path

In [2]:
# Loading the data

# Input locations (relative to the notebook's working directory)
DATA = Path("../data")
TRAIN_PATH = DATA / 'train.csv'
TEST_PATH = DATA / 'test.csv'

# Output locations; each directory is created if it does not exist yet
OUT_DIR = Path("../outputs")
ART_DIR = Path("../artifacts")
for _dir in (OUT_DIR, ART_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

# Raw competition frames
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [3]:
# Build 3-class target: 0=A wins, 1=B wins, 2=Tie
WINNER_COLS = ['winner_model_a', 'winner_model_b', 'winner_tie']

# np.select falls back to default=0 when no condition matches and takes the first
# match when several do. A row whose winner flags are not exactly one-hot would
# therefore be labelled silently (e.g. as "A wins"), so reject such rows explicitly.
n_invalid = int((train_df[WINNER_COLS].eq(1).sum(axis=1) != 1).sum())
if n_invalid:
    raise ValueError(
        f"{n_invalid} training rows do not have exactly one winner flag set "
        f"among {WINNER_COLS}"
    )

y = np.select(
    [train_df[col] == 1 for col in WINNER_COLS],
    [0, 1, 2]
)
train_df['target'] = y

Text Preprocessing for Embeddings

In [4]:
import ast

def extract_text_from_field(text_field):
    """Flatten a serialized list field into a single plain-text string.

    The field is expected to hold the string form of a list
    (e.g. '["text"]' -> 'text'), written either as JSON or as a Python literal.

    Parsing strategy:
      1. JSON is tried first. JSON `null` is not a Python literal, so
         `ast.literal_eval` alone rejects such input and the raw bracketed
         string would be returned; JSON parsing also decodes `\\uXXXX`
         surrogate-pair escapes into a single character.
      2. If the JSON result is not a list, `ast.literal_eval` is used, which
         preserves the handling of Python-literal input (single-quoted lists,
         numbers, quoted strings).
      3. If neither parser accepts the input, the raw string is returned.

    Parameters
    ----------
    text_field : Any
        Cell value to flatten. `None` and float NaN are treated as missing.

    Returns
    -------
    str
        "" for missing input; list items joined by single spaces (items that
        are `None` are skipped); otherwise `str()` of the parsed value, or the
        unparsed input as a string.
    """
    import json  # local import keeps this cell self-contained

    if text_field is None or (isinstance(text_field, float) and pd.isna(text_field)):
        return ""

    raw = str(text_field)

    # Step 1: JSON. Only a list result is accepted here so that scalar inputs
    # keep going through the literal_eval path below.
    try:
        candidate = json.loads(raw)
    except (ValueError, RecursionError):  # JSONDecodeError subclasses ValueError
        candidate = None

    if isinstance(candidate, list):
        parsed = candidate
    else:
        # Step 2: Python literal. Catch only the errors literal_eval is documented
        # to raise on malformed input, rather than a bare `except:` that would
        # also swallow KeyboardInterrupt/SystemExit.
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return raw

    if isinstance(parsed, list):
        # Skip missing items so they do not leak into the text as the word "None".
        return ' '.join(str(item) for item in parsed if item is not None)
    return str(parsed)

# Apply to both train and test
for df, df_name in ((train_df, 'train'), (test_df, 'test')):
    # Flatten each serialized list column into a plain-text "<col>_text" column
    for raw_col in ('prompt', 'response_a', 'response_b'):
        df[raw_col + '_text'] = df[raw_col].apply(extract_text_from_field)

    # Combine prompt with responses (using [SEP] token)
    prompt_with_sep = df['prompt_text'] + " [SEP] "
    df['text_a'] = prompt_with_sep + df['response_a_text']
    df['text_b'] = prompt_with_sep + df['response_b_text']

Embedding Generation

In [5]:
from sentence_transformers import SentenceTransformer
import time

# Load pre-trained embedding model
# NOTE(review): the later cells visible in this notebook encode with `st_model`
# and `BATCH`; `model` and `BATCH_SIZE` are not referenced there.
BATCH_SIZE = 32
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

# Instantiate the encoder from the checkpoint name above
model = SentenceTransformer(
    EMBEDDING_MODEL,
)

  from tqdm.autonotebook import tqdm, trange


For the Kaggle notebook, use the code below instead, since there is no internet connection in the competition environment.

In [6]:
#from sentence_transformers import SentenceTransformer

# Load from the correct nested path
#BATCH_SIZE = 32
#model = SentenceTransformer('/address/of/uploaded/all-minilm-l6-v2-model/', trust_remote_code=True)

In [7]:
# --- Paths for cached embeddings
ART_DIR = Path("../artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

# One .npy file per (split, response side)
EMBED_A_TRAIN = ART_DIR / 'train_embeddings_a.npy'
EMBED_B_TRAIN = ART_DIR / 'train_embeddings_b.npy'
EMBED_A_TEST = ART_DIR / 'test_embeddings_a.npy'
EMBED_B_TEST = ART_DIR / 'test_embeddings_b.npy'

# --- Encoder setup
BATCH = 32  # batch size passed to st_model.encode

# Prefer the GPU when torch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Loading SentenceTransformer:", EMBEDDING_MODEL, "on", device)
st_model = SentenceTransformer(EMBEDDING_MODEL, device=device)
# Token limit used by the encoder; presumably longer inputs are truncated
# (see the sentence-transformers docs) — TODO confirm
st_model.max_seq_length = 512

def load_or_encode(prefix, a_path, b_path, df):
    """Return (embeddings_a, embeddings_b) for `df`, using the on-disk cache if valid.

    Parameters
    ----------
    prefix : str
        Label used only in the log messages (e.g. "train", "test").
    a_path, b_path : pathlib.Path
        .npy files holding the cached embeddings of `df["text_a"]` / `df["text_b"]`.
    df : pandas.DataFrame
        Frame providing the `text_a` and `text_b` columns.

    Returns
    -------
    tuple of numpy.ndarray
        Embeddings for side A and side B.

    Notes
    -----
    The cache is accepted when both files exist and each array has `len(df)` rows.
    Only the row count is compared, so a cache produced from different text with
    the same number of rows is still loaded; delete the files to force a recompute.
    Encoding relies on the module-level `st_model` and `BATCH`.
    """
    expected_rows = len(df)

    # Try cache
    if a_path.exists() and b_path.exists():
        cached_a = np.load(a_path)
        cached_b = np.load(b_path)
        rows_ok = cached_a.shape[0] == expected_rows and cached_b.shape[0] == expected_rows
        if rows_ok:
            print(f"[cache] Loaded {prefix} embeddings:", cached_a.shape, cached_b.shape)
            return cached_a, cached_b
        print(f"[cache] Shape mismatch for {prefix} cache → recomputing.")

    # Compute and save
    t0 = time.time()
    print(f"[encode] Computing {prefix} embeddings...")
    encode_opts = dict(batch_size=BATCH, show_progress_bar=True, convert_to_numpy=True)
    fresh_a = st_model.encode(df["text_a"].tolist(), **encode_opts)
    fresh_b = st_model.encode(df["text_b"].tolist(), **encode_opts)
    np.save(a_path, fresh_a)
    np.save(b_path, fresh_b)
    print(f"[encode] Saved {prefix} embeddings to {ART_DIR}  (elapsed {(time.time()-t0)/60:.2f} min)")
    return fresh_a, fresh_b

# --- Train/Test embeddings (A and B)
train_a, train_b = load_or_encode(
    "train", EMBED_A_TRAIN, EMBED_B_TRAIN, train_df
)
test_a, test_b = load_or_encode(
    "test", EMBED_A_TEST, EMBED_B_TEST, test_df
)

# --- concat A‖B
# Feature row = [embedding of text_a | embedding of text_b]
X_emb_train = np.concatenate((train_a, train_b), axis=1)
X_emb_test = np.concatenate((test_a, test_b), axis=1)
print("Emb shapes:", X_emb_train.shape, X_emb_test.shape)

Loading SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2 on cpu
[cache] Loaded train embeddings: (57477, 384) (57477, 384)
[cache] Loaded test embeddings: (3, 384) (3, 384)
Emb shapes: (57477, 768) (3, 768)


CROSS-VALIDATION

In [8]:
# Reuse the 3-class labels already stored in train_df['target']
# (0=A wins, 1=B wins, 2=Tie) instead of rebuilding them with a second,
# duplicated np.select call that could drift from the first one.
y_emb = train_df['target'].to_numpy().astype(int)

# === Cross-validation on embeddings (scaled LR) ===
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# 5-fold stratified CV. The scaler is re-fit on the training part of every fold,
# so validation rows never contribute to the standardization statistics.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_losses = []

for fold, (train_idx, valid_idx) in enumerate(cv.split(X_emb_train, y_emb), start=1):
    scaler = StandardScaler()
    X_fold_train = scaler.fit_transform(X_emb_train[train_idx])
    X_fold_valid = scaler.transform(X_emb_train[valid_idx])

    clf = LogisticRegression(max_iter=2000, C=1.0, random_state=42)
    clf.fit(X_fold_train, y_emb[train_idx])

    # Score the held-out part with multi-class log loss over the fixed label set
    proba = clf.predict_proba(X_fold_valid)
    loss = log_loss(y_emb[valid_idx], proba, labels=[0,1,2])
    cv_losses.append(loss)
    print(f"Fold {fold}: log_loss = {loss:.5f}")

mean_loss = np.mean(cv_losses)
std_loss = np.std(cv_losses)
print(f"\nCross-val log_loss → mean={mean_loss:.5f}, std={std_loss:.5f}")



Fold 1: log_loss = 1.07008
Fold 2: log_loss = 1.07554
Fold 3: log_loss = 1.06972
Fold 4: log_loss = 1.07088
Fold 5: log_loss = 1.07205

Cross-val log_loss → mean=1.07165, std=0.00210


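The embedding model (CV log loss 1.072) performs marginally worse than the lexical baseline (1.071); the 0.001 gap is smaller than the fold-to-fold standard deviation (0.002), so the two are effectively on par.
This suggests that lexical features capture preference signals that the sentence embeddings alone do not improve on.
The embedding approach might benefit from being combined with lexical features.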

In [9]:
# Final fit on full training data & submission

# Standardize using statistics from the full training matrix
scaler_final = StandardScaler()
X_emb_train_s = scaler_final.fit_transform(X_emb_train)
X_emb_test_s = scaler_final.transform(X_emb_test)

# Same classifier settings as in cross-validation
final_lr = LogisticRegression(max_iter=2000, C=1.0, random_state=42)
final_lr.fit(X_emb_train_s, y_emb)
test_proba = final_lr.predict_proba(X_emb_test_s)

# Probability columns follow the label encoding 0=A wins, 1=B wins, 2=Tie
submission_path = OUT_DIR / 'submission_embeddings.csv'
submission_columns = {'id': test_df['id'].values}
for class_idx, col_name in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']):
    submission_columns[col_name] = test_proba[:, class_idx]
submission_embed = pd.DataFrame(submission_columns)

submission_embed.to_csv(submission_path, index=False)
print("Saved:", submission_path)

Saved: ..\outputs\submission_embeddings.csv
