# ðŸš€ Modeling and Results

In [1]:
# Imports
import pandas as pd, numpy as np, re
from pathlib import Path

In [2]:
#Loading the data
DATA = Path("../data")
TRAIN_PATH = DATA / 'train.csv'
TEST_PATH = DATA / 'test.csv'

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [3]:
# Build 3-class target: 0=A wins, 1=B wins, 2=Tie
y = np.select(
    [train_df['winner_model_a']==1, train_df['winner_model_b']==1, train_df['winner_tie']==1],
    [0, 1, 2]
)
train_df['target'] = y

In [4]:
# Helper functions for feature engineering

# count regex matches safely
def count_pattern(text, pattern):
    if not isinstance(text, str):
        return 0
    return len(re.findall(pattern, text))

# Paragraph breaks: "\n\n"
def paragraph_count(text):
    return text.count('\\n\\n')

# List / bullet usage: "-", "â€¢", or numbered lists
def list_count(text):
    return count_pattern(text, r"(^\s*[\-\*â€¢]\s|\d+\.)")

# Quote / markdown emphasis: ">", "**", or blockquotes
def quote_count(text):
    return count_pattern(text, r">|\*\*")

# This choice of features comes from our Exploratory Data Analysis in which we tested the correlation between prefered answers and different lexical features

In [5]:
# Compute structural features for A and B responses
for df in [train_df, test_df]:
    for col in ['response_a', 'response_b']:
        df[f'para_count_{col[-1]}'] = df[col].apply(paragraph_count)
        df[f'list_count_{col[-1]}'] = df[col].apply(list_count)
        df[f'quote_count_{col[-1]}'] = df[col].apply(quote_count)

    # Compute relative differences (A - B)
    for feat in ['para_count', 'list_count', 'quote_count']:
        df[f'{feat}_diff'] = df[f'{feat}_a'] - df[f'{feat}_b']

In [6]:
# Length features (characters)
train_df['len_a'] = train_df['response_a'].astype(str).apply(len)
train_df['len_b'] = train_df['response_b'].astype(str).apply(len)
train_df['len_diff'] = train_df['len_a'] - train_df['len_b']

# Compute metrics difference between a and b:
train_df['para_count_diff']  = train_df['para_count_a']  - train_df['para_count_b']
train_df['list_count_diff']  = train_df['list_count_a']  - train_df['list_count_b']
train_df['quote_count_diff'] = train_df['quote_count_a'] - train_df['quote_count_b']

# Final feature matrix
feature_cols = ['len_diff', 'para_count_diff', 'list_count_diff', 'quote_count_diff']
X = train_df[feature_cols].fillna(0)
X.head()

Unnamed: 0,len_diff,para_count_diff,list_count_diff,quote_count_diff
0,3332,17,8,24
1,-535,-7,-3,0
2,-914,1,0,0
3,1620,0,5,0
4,528,-2,0,0


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import numpy as np

# Define modeling pipeline
pipe = Pipeline([
    ('normalize', StandardScaler()),
    ('model', LogisticRegression(
        max_iter=2000,
        C=1.0,
        class_weight=None
    ))
])

# Stratified 5-fold cross-validation setup
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []
for fold_idx, (train_idx, val_idx) in enumerate(cv_splitter.split(X, y), start=1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    pipe.fit(X_train, y_train)
    preds = pipe.predict_proba(X_val)
    loss = log_loss(y_val, preds, labels=[0, 1, 2])
    fold_scores.append(loss)
    print(f"Fold {fold_idx}: log_loss = {loss:.5f}")

fold_scores = np.array(fold_scores)
print(f"\nCross-val log_loss â†’ mean={fold_scores.mean():.5f}, std={fold_scores.std():.5f}")

Fold 1: log_loss = 1.07255
Fold 2: log_loss = 1.07106
Fold 3: log_loss = 1.07194
Fold 4: log_loss = 1.07216
Fold 5: log_loss = 1.06893

Cross-val log_loss â†’ mean=1.07133, std=0.00130


In [8]:
# Build test features
test_df['len_a'] = test_df['response_a'].astype(str).apply(len)
test_df['len_b'] = test_df['response_b'].astype(str).apply(len)
test_df['len_diff'] = test_df['len_a'] - test_df['len_b']

test_df['para_count_diff']  = test_df['para_count_a']  - test_df['para_count_b']
test_df['list_count_diff']  = test_df['list_count_a']  - test_df['list_count_b']
test_df['quote_count_diff'] = test_df['quote_count_a'] - test_df['quote_count_b']

X_test = test_df[feature_cols].fillna(0)

# Fit on full training data
pipe.fit(X, y)

# Predict probabilities for the 3 classes in Kaggle order:
# winner_model_a (class 0), winner_model_b (class 1), winner_tie (class 2)
proba_test = pipe.predict_proba(X_test)  # (n_test, 3)

# Build submission
sub = pd.DataFrame({
    'id': test_df['id'].values,
    'winner_model_a': proba_test[:,0],
    'winner_model_b': proba_test[:,1],
    'winner_tie':     proba_test[:,2],
})

# Save
sub.to_csv('submission_lr.csv', index=False)
print("Saved submission_lr.csv")

Saved submission_lr.csv


Text Preprocessing for Embeddings

In [9]:
import ast

def extract_text_from_field(text_field):
    """Extract text from string representation of list (e.g., '["text"]' -> 'text')."""
    try:
        parsed = ast.literal_eval(text_field)
        return ' '.join(parsed) if isinstance(parsed, list) else str(parsed)
    except:
        return str(text_field)

# Apply to both train and test
for df, df_name in [(train_df, 'train'), (test_df, 'test')]:
    df['prompt_text'] = df['prompt'].apply(extract_text_from_field)
    df['response_a_text'] = df['response_a'].apply(extract_text_from_field)
    df['response_b_text'] = df['response_b'].apply(extract_text_from_field)
    
    # Combine prompt with responses (using [SEP] token)
    df['text_a'] = df['prompt_text'] + " [SEP] " + df['response_a_text']
    df['text_b'] = df['prompt_text'] + " [SEP] " + df['response_b_text']

Embedding Generation

In [11]:
from sentence_transformers import SentenceTransformer
import time

# Load pre-trained embedding model
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
BATCH_SIZE = 32

model = SentenceTransformer(EMBEDDING_MODEL)

In [13]:
# Generate TRAINING embeddings
start_time = time.time()

print("\nEmbedding response A...")
train_embeddings_a = model.encode(
    train_df['text_a'].tolist(),
    show_progress_bar=True,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True
)

print("\nEmbedding response B...")
train_embeddings_b = model.encode(
    train_df['text_b'].tolist(),
    show_progress_bar=True,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True
)

elapsed = time.time() - start_time
print(f"\n Training embeddings generated in {elapsed/60:.2f} minutes")
print(f"   Shape A: {train_embeddings_a.shape}")
print(f"   Shape B: {train_embeddings_b.shape}")

# Save embeddings
np.save('train_embeddings_a.npy', train_embeddings_a)
np.save('train_embeddings_b.npy', train_embeddings_b)


Embedding response A...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1797/1797 [01:09<00:00, 25.69it/s] 



Embedding response B...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1797/1797 [01:16<00:00, 23.49it/s] 



 Training embeddings generated in 2.55 minutes
   Shape A: (57477, 384)
   Shape B: (57477, 384)


In [14]:
# Generate TEST embeddings
start_time = time.time()

print("\nEmbedding response A...")
test_embeddings_a = model.encode(
    test_df['text_a'].tolist(),
    show_progress_bar=True,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True
)

print("\nEmbedding response B...")
test_embeddings_b = model.encode(
    test_df['text_b'].tolist(),
    show_progress_bar=True,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True
)

elapsed = time.time() - start_time
print(f"\nâœ… Test embeddings generated in {elapsed/60:.2f} minutes")
print(f"   Shape A: {test_embeddings_a.shape}")
print(f"   Shape B: {test_embeddings_b.shape}")

# Save embeddings
np.save('test_embeddings_a.npy', test_embeddings_a)
np.save('test_embeddings_b.npy', test_embeddings_b)


Embedding response A...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 21.07it/s]



Embedding response B...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00, 21.94it/s]


âœ… Test embeddings generated in 0.00 minutes
   Shape A: (3, 384)
   Shape B: (3, 384)





In [18]:
# Concatenate embeddings for A and B
X_train_embed = np.concatenate([train_embeddings_a, train_embeddings_b], axis=1)
y_train = train_df['target'].values

CROSS-VALIDATION

In [19]:
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold_idx, (train_idx, val_idx) in enumerate(cv_splitter.split(X_train_embed, y_train), start=1):
    X_train_fold = X_train_embed[train_idx]
    X_val_fold = X_train_embed[val_idx]
    y_train_fold = y_train[train_idx]
    y_val_fold = y_train[val_idx]
    
    # Scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_val_scaled = scaler.transform(X_val_fold)
    
    # Train
    model_lr = LogisticRegression(max_iter=2000, C=1.0, random_state=42)
    model_lr.fit(X_train_scaled, y_train_fold)
    
    # Predict and evaluate
    y_pred_proba = model_lr.predict_proba(X_val_scaled)
    fold_loss = log_loss(y_val_fold, y_pred_proba)
    cv_scores.append(fold_loss)
    
    print(f"Fold {fold_idx}: log_loss = {fold_loss:.5f}")

print(f"\nCross-val log_loss â†’ mean={np.mean(cv_scores):.5f}, std={np.std(cv_scores):.5f}")

Fold 1: log_loss = 1.07008
Fold 2: log_loss = 1.07554
Fold 3: log_loss = 1.06972
Fold 4: log_loss = 1.07088
Fold 5: log_loss = 1.07210

Cross-val log_loss â†’ mean=1.07166, std=0.00210


Embedding model (1.072) is performing worse than the lexical baseline (1.071). 
This implies lexical features capture important preference signals.
The embedding approach might benefit from combining with lexical features.

In [20]:
# Prepare test features
X_test_embed = np.concatenate([test_embeddings_a, test_embeddings_b], axis=1)

# Train on full dataset
scaler_final = StandardScaler()
X_train_scaled_full = scaler_final.fit_transform(X_train_embed)
X_test_scaled = scaler_final.transform(X_test_embed)

# Train final model
final_model = LogisticRegression(max_iter=2000, C=1.0, random_state=42)
final_model.fit(X_train_scaled_full, y_train)

# Predict on test set
test_proba = final_model.predict_proba(X_test_scaled)

# Create submission
submission_embed = pd.DataFrame({
    'id': test_df['id'].values,
    'winner_model_a': test_proba[:, 0],
    'winner_model_b': test_proba[:, 1],
    'winner_tie': test_proba[:, 2]
})

submission_embed.to_csv('submission_embeddings.csv', index=False)
print("Saved submission_embeddings.csv")

Saved submission_embeddings.csv
