In [1]:
# Pick the compute device: Apple-Silicon MPS when usable, otherwise CPU.
import platform
import torch

print(f"Python: {platform.python_version()}")
print(f"PyTorch: {torch.__version__}")

# MPS counts as usable only when the runtime reports it available AND this
# PyTorch build was compiled with MPS support.
mps_ok = (
    torch.backends.mps.is_available()
    and torch.backends.mps.is_built()
)
device = torch.device("mps" if mps_ok else "cpu")
print(f"MPS available: {mps_ok}")
print(f"Using device: {device}")

# Smoke test: one 1000x1000 matrix product on the chosen device.
x = torch.randn(1000, 1000, device=device)
y = x @ x.T
print("Matmul ok:", y.shape)

Python: 3.13.2
PyTorch: 2.9.0
MPS available: True
Using device: mps
Matmul ok: torch.Size([1000, 1000])
Matmul ok: torch.Size([1000, 1000])


In [2]:
# !pip -q install pandas numpy scikit-learn lightgbm sentence-transformers transformers peft accelerate datasets evaluate
import pandas as pd, numpy as np, os, gc, math, random
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix, classification_report
# Single seed shared by Python's `random` module and NumPy's global generator.
RNG = 42
random.seed(RNG)
np.random.seed(RNG)

In [5]:
# Paths and data loading
import pandas as pd, numpy as np, os, json, math, re, string
from pathlib import Path
from typing import List, Dict, Tuple
# Competition data directory and the three CSV files this notebook reads.
BASE = Path('llm-classification-finetuning')
TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH = (
    BASE / csv_name
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Fail fast, naming the missing file, before any parsing starts.
for _csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH):
    assert _csv_path.exists(), f"Missing {_csv_path}"

# Read CSVs in full; column dtypes are left to pandas' default inference.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

print('Train shape:', train.shape)
print('Test shape:', test.shape)
print('Train columns:', train.columns.tolist())
print('Test columns:', test.columns.tolist())
print('Sample submission columns:', sample_sub.columns.tolist())

# Text columns that must be present in both CSVs (train is checked first).
TEXT_COLS = ['prompt', 'response_a', 'response_b']
for _csv_name, _frame in (('train.csv', train), ('test.csv', test)):
    for c in TEXT_COLS:
        if c not in _frame.columns:
            raise ValueError(f"Expected column '{c}' in {_csv_name}")

# Row identifier for submissions: 'id' when present, else the first test column.
if 'id' in test.columns:
    ID_COL = 'id'
else:
    ID_COL = test.columns[0]
print('ID column:', ID_COL)

# Resolve the training target. Preference order:
#   1. the three one-hot winner_* columns,
#   2. a single `winner` column holding string/integer labels,
#   3. a single integer `label` column.
# In every case `y` ends up as integer class ids with 0=A, 1=B, 2=Tie.
SOFT_TARGETS = ['winner_model_a','winner_model_b','winner_tie']
HARD_TARGET = None
if set(SOFT_TARGETS).issubset(train.columns):
    y_soft = train[SOFT_TARGETS].to_numpy().astype(float)
    # The position of the largest entry in SOFT_TARGETS order is the class id.
    y = np.argmax(y_soft, axis=1)
    print('Using soft targets (converted to hard labels via argmax).')
elif 'winner' in train.columns:
    # Accepted spellings for each class, grouped by the class id they map to.
    mapping = {
        **dict.fromkeys(('model_a', 'a', 'A', 0), 0),
        **dict.fromkeys(('model_b', 'b', 'B', 1), 1),
        **dict.fromkeys(('tie', 'TIE', 2), 2),
    }
    y = train['winner'].map(mapping)
    # Values absent from `mapping` come back as NaN; refuse to guess.
    if y.isnull().any():
        raise ValueError('Unknown labels in winner column; please adjust mapping.')
    y = y.astype(int).to_numpy()
    y_soft = None
    print('Using hard labels from winner column.')
elif 'label' in train.columns:
    # Values are taken as-is and assumed to already be 0/1/2 -> A/B/Tie.
    y = train['label'].astype(int).to_numpy()
    y_soft = None
    print('Using hard integer labels from label column.')
else:
    raise ValueError('Could not find target columns. Expected winner_model_* or winner/label.')
num_classes = 3
print('Num classes:', num_classes)

Train shape: (57477, 9)
Test shape: (3, 4)
Train columns: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']
Test columns: ['id', 'prompt', 'response_a', 'response_b']
Sample submission columns: ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
ID column: id
Using soft targets (converted to hard labels via argmax).
Num classes: 3


In [None]:
# Baseline feature engineering: lexical/length features + Logistic Regression
import numpy as np, pandas as pd, re, string, math, gc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, classification_report

# ASCII punctuation characters, kept as a set for O(1) membership tests.
puncts = {*string.punctuation}

def safe_len(s: str) -> int:
    """Return ``len(s)`` for a string; any non-string value counts as length 0."""
    if not isinstance(s, str):
        return 0
    return len(s)

def words(s: str):
    """Split ``s`` into lower-cased word tokens.

    A token is a run of ``\\w`` characters delimited by word boundaries.
    Non-string input yields an empty list.
    """
    return re.findall(r"\b\w+\b", s.lower()) if isinstance(s, str) else []

def text_feats(s: str) -> dict:
    """Summarise one text as a dict of surface statistics.

    Anything that is not a ``str`` (for example a missing value) is treated as
    the empty string, for which every statistic is 0.

    Returns:
        A dict with the keys ``char_len``, ``word_count``, ``avg_word_len``,
        ``num_digits``, ``num_upper``, ``num_punct``, ``excl``, ``ques``,
        ``commas``, ``periods``, ``newlines``, ``cap_ratio``, ``digit_ratio``
        and ``punct_ratio``. The three ratios are counts per character.
    """
    text = s if isinstance(s, str) else ""
    tokens = words(text)
    n_chars, n_words = len(text), len(tokens)

    # Character-class counts
    n_digits = sum(map(str.isdigit, text))
    n_upper = sum(map(str.isupper, text))
    n_punct = sum(1 for ch in text if ch in puncts)

    # Dividing by max(n_chars, 1) yields 0.0 for empty text without a branch.
    per_char = max(n_chars, 1)

    return {
        'char_len': n_chars,
        'word_count': n_words,
        'avg_word_len': sum(map(len, tokens)) / n_words if n_words else 0.0,
        'num_digits': n_digits,
        'num_upper': n_upper,
        'num_punct': n_punct,
        'excl': text.count('!'),
        'ques': text.count('?'),
        'commas': text.count(','),
        'periods': text.count('.'),
        'newlines': text.count('\n'),
        'cap_ratio': n_upper / per_char,
        'digit_ratio': n_digits / per_char,
        'punct_ratio': n_punct / per_char,
    }

def build_pair_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the lexical feature matrix for prompt / response-A / response-B rows.

    ``text_feats`` is computed for the ``prompt``, ``response_a`` and
    ``response_b`` columns of every row, then A-versus-B comparison features
    are appended.

    Args:
        df: Frame with ``prompt``, ``response_a`` and ``response_b`` columns.

    Returns:
        A numeric frame indexed like ``df`` with these column groups:
        ``p_*`` / ``a_*`` / ``b_*`` (raw statistics), ``diff_*`` (A minus B),
        ``abs_*`` (absolute A-B difference), ``ratio_*``
        (``(A + eps) / (B + eps)``) and ``a_is_longer`` (1 when response A has
        more characters than response B). NaN/inf values are replaced by 0.
    """
    # Asking text_feats for the empty string gives the feature names up front,
    # so an empty `df` still produces a frame with the right columns.
    feat_names = list(text_feats(""))

    def _stats(col: str) -> pd.DataFrame:
        # One frame from a list of dicts; expanding rows with
        # ``Series.apply(pd.Series)`` builds a Series per row and is far slower.
        records = df[col].map(text_feats).tolist()
        return pd.DataFrame(records, index=df.index, columns=feat_names).astype(float)

    p_raw, a_raw, b_raw = _stats('prompt'), _stats('response_a'), _stats('response_b')
    p_feats = p_raw.add_prefix('p_')
    a_feats = a_raw.add_prefix('a_')
    b_feats = b_raw.add_prefix('b_')

    # Pairwise differences. The subtraction must use the UNPREFIXED frames:
    # DataFrame arithmetic aligns on column labels, so subtracting the 'a_*'
    # frame from the 'b_*' frame matches no columns and yields only NaN.
    delta = a_raw - b_raw
    diff = delta.add_prefix('diff_')
    ad = delta.abs().add_prefix('abs_')

    # Ratios; the small constant keeps 0/0 and x/0 finite.
    eps = 1e-6
    ratio = (
        ((a_raw + eps) / (b_raw + eps))
        .replace([np.inf, -np.inf], np.nan)
        .fillna(1.0)
        .add_prefix('ratio_')
    )

    # Verbosity (length) bias proxy: explicit flag for which response is longer.
    longer_a = (a_raw['char_len'] > b_raw['char_len']).astype(int).rename('a_is_longer')

    # Combine
    X = pd.concat([p_feats, a_feats, b_feats, diff, ad, ratio, longer_a], axis=1)
    # Clean NaNs/Infs
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return X

# Lexical feature matrices for the train and test pairs
X_baseline, X_test_baseline = (build_pair_features(frame) for frame in (train, test))
print('Baseline feature matrix shapes:', X_baseline.shape, X_test_baseline.shape)

# Standardise features: statistics are learned from the train matrix and then
# re-applied unchanged to the test matrix.
# NOTE(review): the scaler sees every training row before the CV folds are
# split, so each fold's validation rows contribute to the mean/std used on its
# training rows.
scaler = StandardScaler(with_mean=True, with_std=True).fit(X_baseline.to_numpy())
X_all = scaler.transform(X_baseline.to_numpy())
X_tst = scaler.transform(X_test_baseline.to_numpy())

# CV training: 5-fold stratified logistic regression on the scaled lexical features
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RNG)
oof = np.zeros((len(train), 3), dtype=float)       # out-of-fold class probabilities, one row per training pair
test_pred = np.zeros((len(test), 3), dtype=float)  # running mean of the fold models' test probabilities
fold_reports = []                                  # (fold number, validation log-loss)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y), 1):
    X_tr, X_va = X_all[tr_idx], X_all[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    clf = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, C=2.0, class_weight=None, random_state=RNG)
    clf.fit(X_tr, y_tr)
    proba_va = clf.predict_proba(X_va)
    oof[va_idx] = proba_va
    # Name the full label set explicitly (0=A, 1=B, 2=Tie), as the calibration
    # step does: without it log_loss infers the classes from y_true and raises
    # when a validation fold happens to miss one of them.
    ll = log_loss(y_true=y_va, y_pred=proba_va, labels=[0, 1, 2])
    fold_reports.append((fold, ll))
    # Each fold contributes 1/n_splits, so the sum over folds is the mean.
    test_pred += clf.predict_proba(X_tst) / skf.n_splits
    print(f"Fold {fold} log_loss: {ll:.5f}")
    gc.collect()

oof_ll = log_loss(y_true=y, y_pred=oof, labels=[0, 1, 2])
print(f"OOF log_loss: {oof_ll:.5f}")

# Baseline submission: id column followed by one probability column per class,
# taken from test_pred in column order (A, B, tie).
sub_baseline = pd.DataFrame({
    ID_COL: test[ID_COL].to_numpy(),
    **dict(zip(('winner_model_a', 'winner_model_b', 'winner_tie'), test_pred.T)),
})
sub_baseline.to_csv('submission_baseline.csv', index=False)
print('Wrote submission_baseline.csv')

In [None]:
# Optional: Submit to Kaggle (set COMPETITION and ensure Kaggle API is configured)
import os, subprocess, sys
# Competition slug comes from the KAGGLE_COMPETITION environment variable; the
# placeholder default must be replaced before submitting (e.g. 'llm-classification-finetuning').
COMPETITION = os.getenv('KAGGLE_COMPETITION', 'REPLACE_WITH_COMPETITION_SLUG')
SUB_FILE = 'submission_baseline.csv'   # file handed to the Kaggle CLI
MESSAGE = 'baseline lexical+LR'        # submission description

def try_kaggle_submit(competition: str, file_path: str, message: str):
    """Submit ``file_path`` to a Kaggle competition through the ``kaggle`` CLI.

    This is best-effort: every failure is reported with ``print`` instead of
    being raised, so a notebook run is never aborted by a failed submission.

    Args:
        competition: Kaggle competition slug. Empty values and ``REPLACE_*``
            placeholders (any letter case) are rejected.
        file_path: Path of the CSV file to upload.
        message: Submission description passed to the CLI with ``-m``.

    Returns:
        ``True`` when the CLI ran and exited with status 0, ``False`` otherwise
        (invalid arguments, CLI not installed, non-zero exit, launch failure).
    """
    import shlex
    import shutil

    # Validate the arguments first: these checks are cheap and independent of
    # whether Kaggle tooling is installed.
    if not competition or competition.upper().startswith('REPLACE_'):
        print('Set COMPETITION to the correct Kaggle competition slug before submitting.')
        return False
    if not os.path.isfile(file_path):
        print('Submission file not found:', file_path)
        return False

    # The submission is made by the `kaggle` executable, so that is what must
    # be present. Importing the Python package is not needed for this, and a
    # failed import would not necessarily mean the package is missing.
    if shutil.which('kaggle') is None:
        print('Kaggle CLI not found; install with `pip install kaggle` and set credentials.')
        return False

    # Run Kaggle CLI (argument list, no shell: the message needs no escaping)
    cmd = ['kaggle', 'competitions', 'submit', '-c', competition, '-f', file_path, '-m', message]
    # shlex.join quotes arguments containing spaces, so the echoed command can
    # be copied into a shell as-is.
    print('Running:', shlex.join(cmd))
    try:
        out = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print('Submission failed with exception:', e)
        return False
    print('Return code:', out.returncode)
    print('STDOUT:\n', out.stdout)
    print('STDERR:\n', out.stderr)
    return out.returncode == 0

# try_kaggle_submit(COMPETITION, SUB_FILE, MESSAGE)  # Uncomment to submit

In [None]:
# Embedding-based model: Sentence-Transformers (MiniLM) + Logistic Regression
from sentence_transformers import SentenceTransformer, util as st_util
import numpy as np, pandas as pd, gc
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Sentence-embedding model. It runs on the notebook-wide `device` when that
# global has been defined, and on CPU otherwise.
embed_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
st_device = str(globals().get('device', 'cpu'))
print('Loading embedding model:', embed_model_name, 'on', st_device)
st_model = SentenceTransformer(embed_model_name, device=st_device)

def encode_texts(texts, batch_size=256, show_progress_bar=True):
    """Encode ``texts`` with the module-level ``st_model``.

    Embeddings are requested as NumPy output with ``normalize_embeddings=True``.
    """
    encode_options = dict(
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=show_progress_bar,
        normalize_embeddings=True,
    )
    return st_model.encode(texts, **encode_options)

def build_embedding_features(df: pd.DataFrame) -> np.ndarray:
    """Build the embedding feature matrix for prompt / response pairs.

    The prompt and both responses are embedded separately (missing text is
    encoded as the empty string). Each output row is the concatenation of:
    A, B and prompt embeddings, the A-B difference, its absolute value, and
    the three pairwise dot products A.B, A.prompt and B.prompt.
    """
    # Collect all texts before encoding anything.
    texts = {
        col: df[col].fillna('').tolist()
        for col in ('prompt', 'response_a', 'response_b')
    }
    p_emb = encode_texts(texts['prompt'])      # [N, D]
    a_emb = encode_texts(texts['response_a'])  # [N, D]
    b_emb = encode_texts(texts['response_b'])  # [N, D]

    def _row_dot(u, v):
        # encode_texts requests normalized embeddings, so the row-wise dot
        # product is the cosine similarity; result is a column of shape [N, 1].
        return (u * v).sum(axis=1, keepdims=True)

    ab_gap = a_emb - b_emb
    blocks = [
        a_emb, b_emb, p_emb,
        ab_gap, np.abs(ab_gap),
        _row_dot(a_emb, b_emb), _row_dot(a_emb, p_emb), _row_dot(b_emb, p_emb),
    ]
    return np.concatenate(blocks, axis=1)

# Embedding feature matrices for the train and test pairs
X_emb_tr, X_emb_te = (build_embedding_features(frame) for frame in (train, test))
print('Embedding feature shapes:', X_emb_tr.shape, X_emb_te.shape)

# Standardise: statistics are learned from the train matrix and re-applied to test.
# NOTE(review): the scaler is fit on all training rows before the CV split, so
# each fold's validation rows contribute to the scaling statistics.
scaler_emb = StandardScaler(with_mean=True, with_std=True).fit(X_emb_tr)
Xtr = scaler_emb.transform(X_emb_tr)
Xte = scaler_emb.transform(X_emb_te)

# 5-fold stratified logistic regression on the scaled embedding features
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RNG)
oof_e = np.zeros((len(train), 3), dtype=float)       # out-of-fold class probabilities
test_pred_e = np.zeros((len(test), 3), dtype=float)  # running mean of the fold models' test probabilities

for fold, (tr_idx, va_idx) in enumerate(skf.split(Xtr, y), 1):
    X_tr, X_va = Xtr[tr_idx], Xtr[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    clf = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, C=4.0, class_weight=None, random_state=RNG)
    clf.fit(X_tr, y_tr)
    proba_va = clf.predict_proba(X_va)
    oof_e[va_idx] = proba_va
    # Name the full label set explicitly (0=A, 1=B, 2=Tie), as the calibration
    # step does: without it log_loss infers the classes from y_true and raises
    # when a validation fold happens to miss one of them.
    ll = log_loss(y_true=y_va, y_pred=proba_va, labels=[0, 1, 2])
    print(f"[Emb] Fold {fold} log_loss: {ll:.5f}")
    # Each fold contributes 1/n_splits, so the sum over folds is the mean.
    test_pred_e += clf.predict_proba(Xte) / skf.n_splits
    gc.collect()

oof_ll_e = log_loss(y_true=y, y_pred=oof_e, labels=[0, 1, 2])
print(f"[Emb] OOF log_loss: {oof_ll_e:.5f}")

# Embedding-model submission: id column followed by one probability column per
# class, taken from test_pred_e in column order (A, B, tie).
sub_emb = pd.DataFrame({
    ID_COL: test[ID_COL].to_numpy(),
    **dict(zip(('winner_model_a', 'winner_model_b', 'winner_tie'), test_pred_e.T)),
})
sub_emb.to_csv('submission_embedding.csv', index=False)
print('Wrote submission_embedding.csv')

In [None]:
# Calibration and simple ensembling
import numpy as np, pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def calibrate_probs(oof_probs: np.ndarray, y_true: np.ndarray, test_probs: np.ndarray) -> Tuple[np.ndarray, float]:
    """Recalibrate class probabilities with a logistic regression stacked on top.

    The calibrator takes the predicted probabilities as its input features and
    is fit on ``oof_probs`` against ``y_true``.

    Returns:
        ``(cal_test, ll)``: the recalibrated ``test_probs`` and the log-loss of
        the recalibrated ``oof_probs``. That loss is measured on the same rows
        the calibrator was fit on, so it is an in-sample figure.
    """
    calibrator = LogisticRegression(max_iter=2000, solver='lbfgs', C=1.0).fit(oof_probs, y_true)
    in_sample_ll = log_loss(y_true, calibrator.predict_proba(oof_probs), labels=[0,1,2])
    return calibrator.predict_proba(test_probs), in_sample_ll

# Recalibrate the lexical baseline's predictions
cal_test_base, ll_base_cal = calibrate_probs(oof_probs=oof, y_true=y, test_probs=test_pred)
print(f"Baseline calibrated OOF log_loss (stacking): {ll_base_cal:.5f}")

# Recalibrate the embedding model's predictions
cal_test_emb, ll_emb_cal = calibrate_probs(oof_probs=oof_e, y_true=y, test_probs=test_pred_e)
print(f"Embedding calibrated OOF log_loss (stacking): {ll_emb_cal:.5f}")

# Blend the two calibrated models with equal weight
ens_test = 0.5 * cal_test_base + 0.5 * cal_test_emb

# Ensemble submission: id column followed by one probability column per class,
# taken from ens_test in column order (A, B, tie).
sub_ens = pd.DataFrame({
    ID_COL: test[ID_COL].to_numpy(),
    **dict(zip(('winner_model_a', 'winner_model_b', 'winner_tie'), ens_test.T)),
})
sub_ens.to_csv('submission_ensemble.csv', index=False)
print('Wrote submission_ensemble.csv')

In [None]:
# Optional: Lightweight fine-tuning (DeBERTa-small + LoRA)
from dataclasses import dataclass
from typing import Optional
import numpy as np, pandas as pd, torch, os, gc
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model

# LoRA fine-tuning configuration
RUN_LORA = False  # Set True to run a tiny LoRA finetune (can be slow on CPU)
MODEL_NAME = 'microsoft/deberta-v3-small'  # base checkpoint that gets the adapters
NUM_LABELS = 3                             # size of the classification head
MAX_LEN, BATCH, EPOCHS = 512, 8, 1         # token cap per example, per-device batch size, training epochs

class PairDataset(Dataset):
    """Torch dataset that renders one prompt / response-A / response-B row as a single text.

    Each item is the tokenizer output for that text (truncated to ``max_len``
    tokens) converted to tensors, plus a ``labels`` tensor when labels were given.
    """

    def __init__(self, df: pd.DataFrame, y: Optional[np.ndarray], tok: AutoTokenizer, max_len: int = 512):
        # Positional row access in __getitem__ needs a clean 0..N-1 index.
        self.df = df.reset_index(drop=True)
        self.y = y.astype(int) if y is not None else None
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = f"Prompt:\n{row['prompt']}\n\nResponse A:\n{row['response_a']}\n\nResponse B:\n{row['response_b']}"
        encoded = self.tok(text, truncation=True, max_length=self.max_len)
        item = {}
        for key, values in encoded.items():
            item[key] = torch.tensor(values)
        if self.y is None:
            return item
        item['labels'] = torch.tensor(self.y[idx], dtype=torch.long)
        return item

def run_lora_training(train_df: pd.DataFrame, y: np.ndarray, eval_frac: float = 0.05):
    """Fine-tune ``MODEL_NAME`` with LoRA adapters for ``NUM_LABELS``-way classification.

    A random evaluation subset (seeded with ``RNG``) is held out and the
    remaining rows are trained on with the Hugging Face ``Trainer``.

    Args:
        train_df: Frame with ``prompt``, ``response_a`` and ``response_b`` columns.
        y: Integer class labels aligned row-for-row with ``train_df``.
        eval_frac: Fraction of rows held out for evaluation. At least 100 rows
            are held out whenever the frame is large enough for that.

    Returns:
        ``(model, tok)``: the LoRA-wrapped model after training, and its tokenizer.
    """
    import inspect

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Small eval split
    n = len(train_df)
    n_eval = max(100, int(n * eval_frac))
    if n_eval >= n:
        # Tiny frame (e.g. a smoke test on a handful of rows): the 100-row floor
        # would leave nothing to train on, or make the sampling below fail,
        # so fall back to an even split.
        n_eval = n // 2
    eval_idx = np.random.RandomState(RNG).choice(n, size=n_eval, replace=False)
    tr_mask = np.ones(n, dtype=bool)
    tr_mask[eval_idx] = False
    dtrain = PairDataset(train_df[tr_mask], y[tr_mask], tok, MAX_LEN)
    deval = PairDataset(train_df[~tr_mask], y[~tr_mask], tok, MAX_LEN)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
    lconf = LoraConfig(task_type='SEQ_CLS', r=8, lora_alpha=16, lora_dropout=0.05)
    model = get_peft_model(model, lconf)

    # transformers renamed `evaluation_strategy` to `eval_strategy`; use
    # whichever keyword the installed version accepts, preferring the new one.
    arg_names = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = 'eval_strategy' if 'eval_strategy' in arg_names else 'evaluation_strategy'
    args = TrainingArguments(
        output_dir='out_lora',
        per_device_train_batch_size=BATCH,
        per_device_eval_batch_size=BATCH,
        learning_rate=5e-5,
        num_train_epochs=EPOCHS,
        logging_steps=50,
        eval_steps=200,
        save_strategy='no',
        report_to=[],
        **{eval_key: 'steps'},
    )

    # Pads every batch to the length of its longest example.
    collate = DataCollatorWithPadding(tok)

    # Likewise, Trainer's `tokenizer` argument was renamed to `processing_class`.
    trainer_names = inspect.signature(Trainer.__init__).parameters
    tok_key = 'processing_class' if 'processing_class' in trainer_names else 'tokenizer'
    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=deval,
        data_collator=collate,
    )
    trainer_kwargs[tok_key] = tok
    trainer = Trainer(**trainer_kwargs)
    trainer.train()
    return model, tok

if RUN_LORA:
    model_lora, tok_lora = run_lora_training(train, y)

    # Inference on test
    class TstDataset(Dataset):
        """Unlabelled rows rendered with the prompt / response-A / response-B text template."""
        def __init__(self, df, tok):
            self.df = df.reset_index(drop=True)
            self.tok = tok
        def __len__(self):
            return len(self.df)
        def __getitem__(self, idx):
            row = self.df.iloc[idx]
            text = f"Prompt:\n{row['prompt']}\n\nResponse A:\n{row['response_a']}\n\nResponse B:\n{row['response_b']}"
            # return_tensors='pt' gives tensors that already carry a leading
            # batch axis of size 1, i.e. a ready-made single-example batch.
            return self.tok(text, truncation=True, max_length=MAX_LEN, return_tensors='pt')
    td = TstDataset(test, tok_lora)

    # Simple loop (avoid Trainer.predict to keep it lightweight)
    model_lora.eval()
    preds = []
    for i in range(len(td)):
        batch = td[i]
        # Keep the batch axis: the model expects [batch, seq] inputs, so the
        # tensors are only moved to the model's device, not squeezed.
        batch = {k: v.to(model_lora.device) for k, v in batch.items()}
        with torch.no_grad():
            out = model_lora(**batch)
            p = torch.softmax(out.logits, dim=-1).cpu().numpy()  # shape [1, 3]
        preds.append(p)
    preds = np.vstack(preds)  # [N, 3]

    sub_lora = pd.DataFrame({
        ID_COL: test[ID_COL].values,
        'winner_model_a': preds[:,0],
        'winner_model_b': preds[:,1],
        'winner_tie': preds[:,2],
    })
    sub_lora.to_csv('submission_lora.csv', index=False)
    print('Wrote submission_lora.csv')
else:
    print('LoRA training skipped. Set RUN_LORA=True to run (may take time).')