# Step 3 — Extensions: Bias features, Calibration, Ensembling, and LoRA fine-tuning

This notebook implements the full Step 3 pipeline without touching prior notebooks:

- Bias-aware lexical features (verbosity and structure)
- Calibrated classifiers (sigmoid and isotonic)
- Embeddings-based model (reusing precomputed .npy when available)
- Simple ensembling via OOF-weight search
- Optional lightweight LoRA fine-tuning with temperature scaling

Outputs: submission CSVs for each component and a blended ensemble.

In [61]:
# Imports
import os, re, ast, random, warnings, time
from pathlib import Path
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

# 0) CONFIG

In [62]:
# --- Run configuration ---
# MODE picks the pipeline branch executed by the later cells:
#   "lexical" | "embeddings" | "ensemble" | "lora"
MODE = "embeddings"
N_SPLITS = 5
SEED = 42
# NOTE(review): no visible cell reads RUN_LORA; the LoRA cell is gated on MODE == "lora".
RUN_LORA = False

# Seed both global RNGs (Python's `random` and NumPy's legacy generator).
random.seed(SEED)
np.random.seed(SEED)

# --- Project layout: data, outputs and artifacts live one level above this notebook ---
DATA_DIR = Path("../data")
OUT_DIR = Path("../outputs")
ART_DIR = Path("../artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)
ART_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")

# Sentence-Transformer checkpoint name, fetched online when it has to be loaded.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# 1) DATA LOADING


In [63]:
# Training rows with any missing value are dropped; the index is rebuilt so positional
# (NumPy) and label-based (pandas) indexing agree for the rest of the notebook.
train_df = pd.read_csv(TRAIN_PATH).dropna().reset_index(drop=True)

# Test rows must never be dropped: every id needs a row in the submission file.
# Missing text fields are replaced by empty strings instead.
test_df = pd.read_csv(TEST_PATH).fillna({"prompt": "", "response_a": "", "response_b": ""})

# 3-class target: 0=A, 1=B, 2=Tie
# default=-1 marks rows where none of the three winner flags equals 1, so they can be
# reported instead of being silently labelled as class 0 ("A wins").
y = np.select(
    [train_df["winner_model_a"].eq(1), train_df["winner_model_b"].eq(1), train_df["winner_tie"].eq(1)],
    [0, 1, 2],
    default=-1,
)
if (y < 0).any():
    raise ValueError(f"{int((y < 0).sum())} training rows have no winner flag set to 1")
classes = [0, 1, 2]

# Normalize JSON-ish fields and build text_a / text_b
def extract_text_from_field(v):
    """Flatten a JSON-ish cell value into one plain-text string.

    Parsing order:
      1. If ``v`` is a string holding a JSON list, decode it with ``json.loads``. This
         accepts ``null`` entries and JSON unicode escapes, both of which make
         ``ast.literal_eval`` either fail or return lone surrogates.
      2. Otherwise try ``ast.literal_eval`` (Python-literal lists, quoted strings, numbers).
      3. If neither parser accepts the value, return ``str(v)`` unchanged (best effort).

    Args:
        v: Raw cell value; usually a string, but any object is accepted.

    Returns:
        str: List items joined by single spaces (``None``/``null`` items are skipped),
        or ``str()`` of the parsed scalar, or ``str(v)`` when parsing fails.
    """
    import json  # local import: only this helper needs it

    parsed = None
    if isinstance(v, str):
        try:
            candidate = json.loads(v)
        except (ValueError, RecursionError):
            candidate = None
        # Only JSON *lists* are taken from this path; scalars keep the literal_eval behaviour.
        if isinstance(candidate, list):
            parsed = candidate

    if parsed is None:
        try:
            parsed = ast.literal_eval(v)
        except Exception:
            # Deliberate best effort: anything unparsable is used as-is.
            return str(v)

    if isinstance(parsed, list):
        return " ".join(str(item) for item in parsed if item is not None)
    return str(parsed)

# Add plain-text versions of the three raw fields to both frames, then the
# "prompt [SEP] response" strings that are later embedded for side A and side B.
for df in (train_df, test_df):
    for raw_col in ("prompt", "response_a", "response_b"):
        df[f"{raw_col}_text"] = df[raw_col].apply(extract_text_from_field)
    for side in ("a", "b"):
        df[f"text_{side}"] = df["prompt_text"] + " [SEP] " + df[f"response_{side}_text"]

# 2) HELPERS (CV, calibration, prediction, blending)

In [64]:
# Materialise the stratified split once so cv_calibrated_probs_numpy reuses the same
# (train, validation) index pairs on every call.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# A zero vector stands in for the features; only y is passed as the stratification target.
fold_indices = list(skf.split(np.zeros(len(y)), y))

def cv_calibrated_probs_numpy(X, y, method="sigmoid"):
    """Cross-validate a calibrated logistic regression over the notebook-level folds.

    For every (train, validation) pair in ``fold_indices`` the features are standardised
    on the training part, a ``LogisticRegression`` wrapped in
    ``CalibratedClassifierCV(method=method, cv=3)`` is fitted, and the validation
    probabilities are written into the out-of-fold matrix. Per-fold and overall
    log-losses are printed.

    Returns:
        (oof, models, scalers): the ``(len(X), 3)`` out-of-fold probability matrix, the
        fitted calibrated classifiers and their matching scalers, one per fold.
    """
    oof = np.zeros((len(X), 3), dtype=float)
    models = []
    scalers = []

    fold = 0
    for train_idx, valid_idx in fold_indices:
        fold += 1

        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X[train_idx])
        valid_scaled = scaler.transform(X[valid_idx])

        clf = CalibratedClassifierCV(
            estimator=LogisticRegression(max_iter=2000, C=1.0, random_state=SEED),
            method=method,
            cv=3,
        )
        clf.fit(train_scaled, y[train_idx])

        proba = clf.predict_proba(valid_scaled)
        oof[valid_idx] = proba
        print(f"[CV {method}] fold {fold} logloss = {log_loss(y[valid_idx], proba, labels=classes):.5f}")

        models.append(clf)
        scalers.append(scaler)

    print(f"[CV {method}] OOF logloss = {log_loss(y, oof, labels=classes):.5f}")
    return oof, models, scalers

def predict_from_folds(models, scalers, X):
    """Average the class probabilities predicted by a set of fold models.

    Each model is paired with the scaler fitted on the same fold: ``X`` is transformed by
    that scaler, passed to ``predict_proba``, and the results are averaged with equal weight.

    Args:
        models: Fitted classifiers exposing ``predict_proba``.
        scalers: Fitted transformers exposing ``transform``; one per model, same order.
        X: Feature matrix to predict on.

    Returns:
        np.ndarray: Mean probabilities. The number of columns is taken from the models'
        own output rather than being fixed at 3.

    Raises:
        ValueError: If no models are given (the old code returned an all-zero matrix), or
            if the numbers of models and scalers differ (``zip`` would silently truncate
            while the divisor stayed ``len(models)``, so rows would not sum to 1).
    """
    models = list(models)
    scalers = list(scalers)
    if not models:
        raise ValueError("predict_from_folds needs at least one fitted model")
    if len(models) != len(scalers):
        raise ValueError(
            f"got {len(models)} models but {len(scalers)} scalers; they must be paired one-to-one"
        )

    n_folds = len(models)
    averaged = None
    for clf, scaler in zip(models, scalers):
        contribution = np.asarray(clf.predict_proba(scaler.transform(X)), dtype=float) / n_folds
        averaged = contribution if averaged is None else averaged + contribution
    return averaged

def best_weight_for_blend(y, p1, p2, steps=101):
    """Grid-search the mixing weight ``w`` that minimises the log-loss of ``w*p1 + (1-w)*p2``.

    Args:
        y: True labels.
        p1, p2: Probability matrices of identical shape.
        steps: Number of evenly spaced weights tried on [0, 1], end points included.

    Returns:
        (best_w, best_loss): the weight given to ``p1`` and the corresponding log-loss.
        On ties the smallest weight wins because only strict improvements replace it.

    Raises:
        ValueError: If ``steps < 2``. Previously ``steps=1`` raised ZeroDivisionError and
            ``steps=0`` silently returned the placeholder result.
    """
    if steps < 2:
        raise ValueError("steps must be >= 2 so that the grid contains both 0 and 1")

    best_w, best_loss = 0.5, float("inf")
    for i in range(steps):
        w = i / (steps - 1)
        loss = log_loss(y, w * p1 + (1 - w) * p2, labels=classes)
        if loss < best_loss:
            best_w, best_loss = w, loss
    return best_w, best_loss

## Bias-aware and structural lexical features

In [65]:
def count_pattern(text, pattern):
    """Count the non-overlapping matches of ``pattern`` in ``text``.

    The pattern is applied with ``re.MULTILINE``, so ``^`` and ``$`` anchor at every line.
    Any value that is not a ``str`` counts as zero matches.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(pattern, text, flags=re.MULTILINE))


def paragraph_count(t):
    """Number of blank-line separators (two consecutive newlines) in ``t``; 0 for non-strings."""
    return t.count("\n\n") if isinstance(t, str) else 0


def list_count(t):
    """Number of list-item lines: a leading ``-``, ``*`` or bullet followed by whitespace, or digits and a dot.

    The bullet is written as the escape ``\\u2022`` so the pattern does not depend on how
    the notebook file is encoded (a mis-decoded bullet makes the class match unrelated
    characters instead).
    """
    return count_pattern(t, r"^\s*(?:[\-\*\u2022]\s|\d+\.)")


def quote_count(t):
    """Number of lines starting with ``>`` plus whitespace, together with ``**bold**`` spans."""
    return count_pattern(t, r"(^>\s|(?<!\*)\*\*[^*]+\*\*)")


def sentence_count(t):
    """Number of ``.``, ``!`` or ``?`` characters followed by whitespace or a line end."""
    return count_pattern(t, r"[.!?](?:\s|$)")


def code_block_count(t):
    """Number of triple-backtick fences plus inline single-backtick spans."""
    return count_pattern(t, r"```|`[^`]+`")


def heading_count(t):
    """Number of lines starting with one to six ``#`` characters followed by whitespace."""
    return count_pattern(t, r"^(?:#{1,6})\s")


def word_count(t):
    """Number of whitespace-separated tokens in ``t``; 0 for non-strings."""
    return len(t.split()) if isinstance(t, str) else 0

def build_lex_features(df, a_col="response_a_text", b_col="response_b_text"):
    """Build length and structure features for the two responses in each row.

    Per response (suffix ``_a`` / ``_b``) the raw counts are: characters, words,
    sentences, paragraph breaks, list items, quotes, code spans and headings. On top of
    those the frame gets A-minus-B differences, smoothed A/B ratios for the three length
    measures, and per-100-words densities (with their differences) for the structural counts.

    Args:
        df: Frame holding the two response text columns.
        a_col, b_col: Names of the response A / response B text columns.

    Returns:
        pd.DataFrame: Float features indexed like ``df``; infinities and NaNs are set to 0.
    """
    # (feature prefix, counting function); this order fixes the column order.
    counters = (
        ("len", len),
        ("wc", word_count),
        ("sent", sentence_count),
        ("para", paragraph_count),
        ("list", list_count),
        ("quote", quote_count),
        ("code", code_block_count),
        ("hdr", heading_count),
    )
    responses = {
        "a": df[a_col].fillna("").astype(str),
        "b": df[b_col].fillna("").astype(str),
    }

    raw_counts = {
        f"{prefix}_{side}": texts.map(counter)
        for side, texts in responses.items()
        for prefix, counter in counters
    }
    F = pd.DataFrame(raw_counts).astype(float)

    # A-minus-B difference for every raw count
    for prefix, _ in counters:
        F[f"{prefix}_diff"] = F[f"{prefix}_a"] - F[f"{prefix}_b"]

    # Smoothed A/B ratios; the +1 keeps the denominator away from zero
    for prefix in ("len", "wc", "sent"):
        F[f"{prefix}_ratio"] = (F[f"{prefix}_a"] + 1.0) / (F[f"{prefix}_b"] + 1.0)

    # Structural counts normalised per 100 words; eps guards against empty responses
    eps = 1e-6
    for prefix in ("sent", "para", "list", "quote", "code", "hdr"):
        density_a = 100.0 * F[f"{prefix}_a"] / (F["wc_a"] + eps)
        density_b = 100.0 * F[f"{prefix}_b"] / (F["wc_b"] + eps)
        F[f"{prefix}_per100w_a"] = density_a
        F[f"{prefix}_per100w_b"] = density_b
        F[f"{prefix}_per100w_diff"] = density_a - density_b

    without_inf = F.replace([np.inf, -np.inf], 0.0)
    return without_inf.fillna(0.0)

# Only the pairwise comparison columns feed the lexical model:
# the eight A-minus-B differences followed by the three smoothed A/B ratios.
lex_cols_keep = (
    [f"{prefix}_diff" for prefix in ("len", "wc", "sent", "para", "list", "quote", "code", "hdr")]
    + [f"{prefix}_ratio" for prefix in ("len", "wc", "sent")]
)

X_lex_train_df = build_lex_features(train_df)
X_lex_test_df = build_lex_features(test_df)

# Float copies restricted to the selected columns (train and test share the column order).
X_lex = X_lex_train_df.loc[:, lex_cols_keep].astype(float)
X_lex_test = X_lex_test_df.loc[:, lex_cols_keep].astype(float)

## Calibrated lexical model (sigmoid and isotonic) with OOF predictions

In [66]:
def cv_calibrated_probs(X, y, base_model=None, method='sigmoid', n_splits=5, seed=SEED):
    """Cross-validate a calibrated classifier on a DataFrame of features.

    The default seed is the notebook-level ``SEED`` from the config cell. It used to be
    ``RANDOM_STATE``, which is not defined anywhere in this notebook and therefore raised
    ``NameError`` on a fresh kernel.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: NumPy array of labels aligned with the rows of ``X``.
        base_model: Estimator to calibrate; defaults to
            ``LogisticRegression(max_iter=2000, C=1.0, random_state=seed)``.
        method: Calibration method handed to ``CalibratedClassifierCV``.
        n_splits: Number of stratified folds.
        seed: Random state for the fold split and the default base model.

    Returns:
        (oof, models, scalers): the ``(len(X), 3)`` out-of-fold probability matrix, the
        fitted calibrated classifiers and their matching scalers, one per fold.
    """
    if base_model is None:
        base_model = LogisticRegression(max_iter=2000, C=1.0, random_state=seed)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros((len(X), 3), dtype=float)
    models = []
    scalers = []
    for fold, (tr, va) in enumerate(skf.split(X, y), 1):
        Xtr, Xva = X.iloc[tr], X.iloc[va]
        ytr, yva = y[tr], y[va]
        # The scaler is fitted on the training part only and then applied to validation.
        scaler = StandardScaler()
        Xtr_s = scaler.fit_transform(Xtr)
        Xva_s = scaler.transform(Xva)
        clf = CalibratedClassifierCV(estimator=base_model, method=method, cv=3)
        clf.fit(Xtr_s, ytr)
        proba = clf.predict_proba(Xva_s)
        oof[va] = proba
        loss = log_loss(yva, proba, labels=classes)
        print(f'[Lex {method}] Fold {fold}: log_loss={loss:.5f}')
        models.append(clf)
        scalers.append(scaler)
    print(f'[Lex {method}] OOF log_loss: {log_loss(y, oof, labels=classes):.5f}')
    return oof, models, scalers

# Fit both calibration variants on the lexical features and keep their OOF predictions.
oof_lex_sigmoid, lex_sigmoid_models, lex_sigmoid_scalers = cv_calibrated_probs(X_lex, y, method='sigmoid')
oof_lex_isotonic, lex_isotonic_models, lex_isotonic_scalers = cv_calibrated_probs(X_lex, y, method='isotonic')

# Keep the variant with the lowest OOF log-loss; the first listed entry wins ties.
lex_oof_list = [('sigmoid', oof_lex_sigmoid), ('isotonic', oof_lex_isotonic)]
lex_best_name, lex_best_oof, _lex_best_loss = None, None, None
for _name, _oof in lex_oof_list:
    _loss = log_loss(y, _oof, labels=classes)
    if _lex_best_loss is None or _loss < _lex_best_loss:
        lex_best_name, lex_best_oof, _lex_best_loss = _name, _oof, _loss
print('Best lexical calibration:', lex_best_name, 'OOF log_loss=', log_loss(y, lex_best_oof, labels=classes))

[Lex sigmoid] Fold 1: log_loss=1.07004
[Lex sigmoid] Fold 2: log_loss=1.06940
[Lex sigmoid] Fold 3: log_loss=1.07066
[Lex sigmoid] Fold 4: log_loss=1.07088
[Lex sigmoid] Fold 5: log_loss=1.06935
[Lex sigmoid] OOF log_loss: 1.07007
[Lex isotonic] Fold 1: log_loss=1.06096
[Lex isotonic] Fold 2: log_loss=1.06248
[Lex isotonic] Fold 3: log_loss=1.06179
[Lex isotonic] Fold 4: log_loss=1.06018
[Lex isotonic] Fold 5: log_loss=1.06076
[Lex isotonic] OOF log_loss: 1.06124
Best lexical calibration: isotonic OOF log_loss= 1.0612351518282728


# 4) LEXICAL MODEL (if MODE in {"lexical", "ensemble"})

In [67]:
# Defaults used when MODE skips the lexical branch.
lex_best_name, lex_best_oof = None, None
lex_best_models, lex_best_scalers = [], []
lex_proba_test = None

if MODE in {"lexical","ensemble"}:
    oof_lex_sig, lex_sig_models, lex_sig_scalers = cv_calibrated_probs_numpy(X_lex.values, y, method="sigmoid")
    oof_lex_iso, lex_iso_models, lex_iso_scalers = cv_calibrated_probs_numpy(X_lex.values, y, method="isotonic")

    # Isotonic is chosen only when it is strictly better; otherwise sigmoid is kept.
    isotonic_wins = log_loss(y, oof_lex_iso, labels=classes) < log_loss(y, oof_lex_sig, labels=classes)
    lex_best_name = "isotonic" if isotonic_wins else "sigmoid"
    lex_best_oof = oof_lex_iso if isotonic_wins else oof_lex_sig
    lex_best_models = lex_iso_models if isotonic_wins else lex_sig_models
    lex_best_scalers = lex_iso_scalers if isotonic_wins else lex_sig_scalers

    # Test predictions: equal-weight average over the fold models of the chosen variant.
    lex_proba_test = predict_from_folds(lex_best_models, lex_best_scalers, X_lex_test.values)

# 5) EMBEDDINGS (if MODE in {"embeddings","ensemble"})

In [None]:
# Defaults used when MODE skips the embeddings branch.
X_emb_train = None; X_emb_test = None
emb_best_name = None; emb_best_oof = None
emb_best_models = []; emb_best_scalers = []
emb_proba_test = None

if MODE in {"embeddings","ensemble"}:
    # --- Paths for cached embeddings (ART_DIR is created in the config cell)
    EMBED_A_TRAIN = ART_DIR / 'train_embeddings_a.npy'
    EMBED_B_TRAIN = ART_DIR / 'train_embeddings_b.npy'
    EMBED_A_TEST  = ART_DIR / 'test_embeddings_a.npy'
    EMBED_B_TEST  = ART_DIR / 'test_embeddings_b.npy'

    BATCH = 32
    st_model = None  # filled in by _get_st_model() the first time encoding is needed

    def _get_st_model():
        """Load the SentenceTransformer on first use and reuse it afterwards.

        Loading is deferred so that a run served entirely from the .npy caches needs
        neither torch / sentence-transformers nor a model download.
        """
        global st_model
        if st_model is None:
            import torch
            from sentence_transformers import SentenceTransformer

            device = "cuda" if torch.cuda.is_available() else "cpu"
            print("Loading SentenceTransformer:", EMBEDDING_MODEL, "on", device)
            st_model = SentenceTransformer(EMBEDDING_MODEL, device=device)
            st_model.max_seq_length = 512
        return st_model

    def load_or_encode(prefix, a_path, b_path, df):
        """Return the (A, B) embedding matrices for ``df``, using the .npy cache when valid.

        The cache is accepted when both files exist and each has ``len(df)`` rows;
        otherwise ``df["text_a"]`` / ``df["text_b"]`` are encoded and the results saved.
        NOTE(review): only the row count is checked, so a cache built from different text
        with the same number of rows would still be accepted.
        """
        # Try cache
        if a_path.exists() and b_path.exists():
            Ea = np.load(a_path)
            Eb = np.load(b_path)
            if Ea.shape[0] == len(df) and Eb.shape[0] == len(df):
                print(f"[cache] Loaded {prefix} embeddings:", Ea.shape, Eb.shape)
                return Ea, Eb
            else:
                print(f"[cache] Shape mismatch for {prefix} cache → recomputing.")
        # Compute and save
        model = _get_st_model()
        start = time.time()
        print(f"[encode] Computing {prefix} embeddings...")
        Ea = model.encode(df["text_a"].tolist(), batch_size=BATCH, show_progress_bar=True, convert_to_numpy=True)
        Eb = model.encode(df["text_b"].tolist(), batch_size=BATCH, show_progress_bar=True, convert_to_numpy=True)
        np.save(a_path, Ea); np.save(b_path, Eb)
        print(f"[encode] Saved {prefix} embeddings to {ART_DIR}  (elapsed {(time.time()-start)/60:.2f} min)")
        return Ea, Eb

    # --- Train/Test embeddings (A and B)
    train_a, train_b = load_or_encode("train", EMBED_A_TRAIN, EMBED_B_TRAIN, train_df)
    test_a,  test_b  = load_or_encode("test",  EMBED_A_TEST,  EMBED_B_TEST,  test_df)

    # --- concat A‖B: each row is [embedding of text_a, embedding of text_b]
    X_emb_train = np.concatenate([train_a, train_b], axis=1)
    X_emb_test  = np.concatenate([test_a,  test_b],  axis=1)
    print("Emb shapes:", X_emb_train.shape, X_emb_test.shape)

    # --- Calibrated CV (best of sigmoid / isotonic)
    oof_emb_sig, emb_sig_models, emb_sig_scalers = cv_calibrated_probs_numpy(X_emb_train, y, method="sigmoid")
    oof_emb_iso, emb_iso_models, emb_iso_scalers = cv_calibrated_probs_numpy(X_emb_train, y, method="isotonic")

    # Isotonic is chosen only when it is strictly better; otherwise sigmoid is kept.
    if log_loss(y, oof_emb_iso, labels=classes) < log_loss(y, oof_emb_sig, labels=classes):
        emb_best_name, emb_best_oof = "isotonic", oof_emb_iso
        emb_best_models, emb_best_scalers = emb_iso_models, emb_iso_scalers
    else:
        emb_best_name, emb_best_oof = "sigmoid", oof_emb_sig
        emb_best_models, emb_best_scalers = emb_sig_models, emb_sig_scalers

    # --- Test-time probs (fold average)
    emb_proba_test = predict_from_folds(emb_best_models, emb_best_scalers, X_emb_test)

else:
    print("MODE does not include embeddings → skipping.")


Loading SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2 on cpu
[cache] Loaded train embeddings: (57477, 384) (57477, 384)
[cache] Loaded test embeddings: (3, 384) (3, 384)
Emb shapes: (57477, 768) (3, 768)
[CV sigmoid] fold 1 logloss = 1.06671
[CV sigmoid] fold 2 logloss = 1.06958
[CV sigmoid] fold 3 logloss = 1.06626
[CV sigmoid] fold 4 logloss = 1.06707
[CV sigmoid] fold 5 logloss = 1.06730
[CV sigmoid] OOF logloss = 1.06738
[CV isotonic] fold 1 logloss = 1.06589
[CV isotonic] fold 2 logloss = 1.06898
[CV isotonic] fold 3 logloss = 1.06579
[CV isotonic] fold 4 logloss = 1.06626
[CV isotonic] fold 5 logloss = 1.06662
[CV isotonic] OOF logloss = 1.06671


## Optional: Lightweight LoRA fine-tuning with temperature scaling

In [69]:
if MODE == "lora":
    # Heavy optional dependencies are imported here so the other modes run without them.
    # numpy, pandas, train_test_split and log_loss come from the imports cell; the paths
    # and SEED come from the config cell.
    import torch
    from datasets import Dataset
    from transformers import (
        AutoTokenizer, AutoModelForSequenceClassification,
        DataCollatorWithPadding, Trainer, TrainingArguments
    )
    from peft import LoraConfig, get_peft_model

    # NOTE(review): the section title mentions temperature scaling, but this cell applies a
    # plain softmax to the logits; no temperature is fitted anywhere below.

    # --- 1) Load data
    # The CSVs are re-read without dropna(). They are kept under LoRA-specific names so the
    # notebook-level train_df / test_df / y (and the CV folds built from them) stay intact.
    lora_train_df = pd.read_csv(TRAIN_PATH)
    lora_test_df  = pd.read_csv(TEST_PATH)

    # Build 3-class labels: 0=A, 1=B, 2=Tie
    y_lora = np.select(
        [
            lora_train_df["winner_model_a"].eq(1),
            lora_train_df["winner_model_b"].eq(1),
            lora_train_df["winner_tie"].eq(1),
        ],
        [0, 1, 2]
    ).astype(int)

    # Extract text fields (handle '["..."]' cases) with the notebook-level helper from the
    # data-loading cell instead of redefining a second, diverging copy here.
    for lora_df in (lora_train_df, lora_test_df):
        lora_df["prompt_text"]     = lora_df["prompt"].apply(extract_text_from_field)
        lora_df["response_a_text"] = lora_df["response_a"].apply(extract_text_from_field)
        lora_df["response_b_text"] = lora_df["response_b"].apply(extract_text_from_field)

    # Concatenate prompt + A + B so the model compares both responses in context
    def build_triplet(row):
        """Join prompt, response A and response B into one "[SEP]"-separated string."""
        return f"{row['prompt_text']} [SEP] {row['response_a_text']} [SEP] {row['response_b_text']}"

    train_texts = lora_train_df.apply(build_triplet, axis=1).tolist()
    test_texts  = lora_test_df.apply(build_triplet,  axis=1).tolist()

    # --- 2) Tokenizer & datasets (online loading)
    BASE_MODEL = "microsoft/deberta-v3-base"  # robust encoder for classification
    tokenizer  = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

    def tok(batch):
        """Tokenize a batch of texts, truncating each to 512 tokens."""
        return tokenizer(batch["text"], truncation=True, max_length=512)

    df_all = pd.DataFrame({"text": train_texts, "label": y_lora})
    # 90/10 stratified hold-out; the 10% part is used for evaluation during training.
    tr_df, va_df = train_test_split(
        df_all, test_size=0.1, random_state=SEED, stratify=df_all["label"]
    )
    te_df = pd.DataFrame({"text": test_texts})

    ds_tr = Dataset.from_pandas(tr_df.reset_index(drop=True)).map(tok, batched=True, remove_columns=["text"])
    ds_va = Dataset.from_pandas(va_df.reset_index(drop=True)).map(tok, batched=True, remove_columns=["text"])
    ds_te = Dataset.from_pandas(te_df.reset_index(drop=True)).map(tok, batched=True, remove_columns=["text"])

    collator = DataCollatorWithPadding(tokenizer)

    # --- 3) Base model + LoRA (download online)
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL, num_labels=3
    )

    # LoRA config. DeBERTa-v2/v3 names its attention projections query_proj / key_proj /
    # value_proj. The RoBERTa-style names "query" / "key" / "value" match no module in this
    # model, which would leave only the "dense" layers adapted.
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["query_proj", "key_proj", "value_proj", "dense"],
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(model, lora_cfg)

    # --- 4) Training setup
    use_fp16 = torch.cuda.is_available()
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy`; confirm against the installed version.
    args = TrainingArguments(
        output_dir=str(ART_DIR / "lora_cls"),
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        num_train_epochs=2,          # start small; tune later
        weight_decay=0.01,
        warmup_ratio=0.06,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        load_best_model_at_end=True,
        fp16=use_fp16,
        report_to="none",
        seed=SEED,
    )

    # Log-loss metric for validation
    def compute_metrics(eval_pred):
        """Turn the Trainer's (logits, labels) pair into a 3-class log-loss."""
        logits, labels = eval_pred
        probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
        return {"log_loss": float(log_loss(labels, probs, labels=[0, 1, 2]))}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tr,
        eval_dataset=ds_va,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
    )

    # --- 5) Train & report validation log-loss
    trainer.train()
    va_logits = trainer.predict(ds_va).predictions
    va_probs  = torch.softmax(torch.tensor(va_logits), dim=-1).numpy()
    va_loss   = log_loss(va_df["label"].values, va_probs, labels=[0, 1, 2])
    print(f"[LoRA] Validation log_loss: {va_loss:.5f}")

    # --- 6) Predict on test and build submission
    te_logits = trainer.predict(ds_te).predictions
    te_probs  = torch.softmax(torch.tensor(te_logits), dim=-1).numpy()

    sub_lora = pd.DataFrame({
        "id": lora_test_df["id"].values,
        "winner_model_a": te_probs[:, 0],
        "winner_model_b": te_probs[:, 1],
        "winner_tie":     te_probs[:, 2],
    })
    # NOTE(review): the other submissions in this notebook are named "submission_step3_*";
    # confirm whether "step6" is intended here.
    out_path = OUT_DIR / "submission_step6_lora.csv"
    sub_lora.to_csv(out_path, index=False)
    print("Saved:", out_path)

else:
    print("MODE is not 'lora' → skipping LoRA step.")

MODE is not 'lora' â†’ skipping LoRA step.


# 7) FINAL SUBMISSIONS

In [70]:
def save_submission(name, probs):
    """Write a 3-class probability matrix as a submission CSV under ``OUT_DIR``.

    Args:
        name: Suffix for the file name; the file is ``submission_step3_{name}.csv``.
        probs: Array-like of shape (n_test_rows, 3) with the columns in the order
            [winner_model_a, winner_model_b, winner_tie]; rows follow ``test_df``.

    Raises:
        ValueError: If ``probs`` is None (for example because the branch producing it was
            skipped for the current MODE) or is not a two-dimensional, 3-column array.
    """
    if probs is None:
        raise ValueError(f"No probabilities available for submission '{name}'")
    probs = np.asarray(probs, dtype=float)
    if probs.ndim != 2 or probs.shape[1] != 3:
        raise ValueError(f"Expected probabilities of shape (n_rows, 3), got {probs.shape}")

    sub = pd.DataFrame({
        "id": test_df["id"].values,
        "winner_model_a": probs[:, 0],
        "winner_model_b": probs[:, 1],
        "winner_tie":     probs[:, 2],
    })
    out = OUT_DIR / f"submission_step3_{name}.csv"
    sub.to_csv(out, index=False)
    print("✅ Saved:", out)


# ===========================
# (A) If MODE = "lexical"
# ===========================
if MODE == "lexical":
    save_submission(f"lexical_calibrated_{lex_best_name}", lex_proba_test)

elif MODE == "embeddings":
    save_submission(f"embeddings_calibrated_{emb_best_name}", emb_proba_test)

elif MODE == "ensemble":
    # The blend weight is searched on the out-of-fold predictions, then applied to test.
    w_blend, loss_blend = best_weight_for_blend(y, lex_best_oof, emb_best_oof)
    print("\n🔗 Ensemble mode active")
    print(f"  → Best weight for lexical = {w_blend:.2f}")
    print(f"  → OOF log_loss (blend)    = {loss_blend:.5f}")

    blend_test = w_blend * lex_proba_test + (1 - w_blend) * emb_proba_test

    save_submission("blend_lex_emb", blend_test)

elif MODE != "lora":
    # "lora" is handled by the LoRA cell, which writes its own file. Any other value
    # matches no branch in this notebook, so say so instead of finishing silently.
    print(f"Unknown MODE {MODE!r}: no submission was written by this cell.")

print("\nDone ✅")


âœ… Saved: ..\outputs\submission_step3_embeddings_calibrated_isotonic.csv

Done âœ…


### Notes
- To enable LoRA fine-tuning, set `MODE = "lora"` in the config cell; the LoRA cell is gated on `MODE`, and the `RUN_LORA` flag is not read by any cell.
- LoRA section uses PEFT; ensure `peft`, `transformers`, `datasets`, and `torch` are installed.
- Calibrated models use scikit-learn's `CalibratedClassifierCV` with both `sigmoid` and `isotonic` methods tested via OOF.
- The ensemble weight is found by minimizing OOF log_loss over a simple 1D grid.
- All submissions are written to `OUT_DIR` (`../outputs`, relative to the notebook), not to the working directory.