# Step 3 — Extensions: Bias features, Calibration, Ensembling, and LoRA fine-tuning

This notebook implements the full Step 3 pipeline without touching prior notebooks:

- Bias-aware lexical features (verbosity and structure)
- Calibrated classifiers (sigmoid and isotonic)
- Embeddings-based model (reusing precomputed .npy when available)
- Simple ensembling via OOF-weight search
- Optional lightweight LoRA fine-tuning with temperature scaling

Outputs: submission CSVs for each component and a blended ensemble.

In [None]:
# Imports & setup
import re, ast, random, warnings
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
warnings.filterwarnings('ignore')

# Reproducibility: seed the RNGs used by numpy and the stdlib up front.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Input locations, relative to the notebook's working directory.
DATA = Path('../data')
TRAIN_PATH = DATA / 'train.csv'
TEST_PATH = DATA / 'test.csv'

train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# Build 3-class target: 0=A, 1=B, 2=Tie
WINNER_COLS = ['winner_model_a', 'winner_model_b', 'winner_tie']
winner_flags = train_df[WINNER_COLS].eq(1)

# np.select falls back to 0 when no condition matches and takes the first match when
# several do, so a malformed row would silently become "A wins". Fail loudly instead.
n_winners = winner_flags.sum(axis=1)
if not n_winners.eq(1).all():
    raise ValueError(
        f'{int(n_winners.ne(1).sum())} training rows do not have exactly one winner flag set'
    )

# Fixed-width int64 labels, independent of the platform's default integer size.
y = np.select([winner_flags[c] for c in WINNER_COLS], [0, 1, 2]).astype(np.int64)
train_df['target'] = y
classes = [0,1,2]

FileNotFoundError: [Errno 2] No such file or directory: '..\\llm-classification-finetuning\\train.csv'

In [None]:
# Text extraction utilities
def extract_text_from_field(text_field):
    """Flatten a serialized list-of-turns field into a single plain string.

    The field is first parsed as JSON (so `null` entries and JSON escapes are
    understood) and, failing that, as a Python literal. A parsed list is joined
    with single spaces; `None` turns are skipped and non-string turns are
    converted with `str`. Any other parsed value is returned as `str(value)`.

    Parameters
    ----------
    text_field : Any
        Usually a string such as '["turn 1", "turn 2"]'. Non-string values
        (e.g. NaN for a missing cell) are returned as `str(text_field)`.

    Returns
    -------
    str
        The flattened text, or the original field as a string when it cannot
        be parsed.
    """
    import json  # function-scope import, like the other lazy imports in this notebook

    if not isinstance(text_field, str):
        return str(text_field)

    # Prefer JSON: ast.literal_eval rejects `null`, which previously made the whole
    # field fall through to the raw, still-bracketed string.
    try:
        parsed = json.loads(text_field)
    except (ValueError, RecursionError):
        parsed = None

    # Only trust the JSON result when it is a list; otherwise keep the original
    # Python-literal parsing so scalar fields behave as before.
    if not isinstance(parsed, list):
        try:
            parsed = ast.literal_eval(text_field)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return str(text_field)

    if isinstance(parsed, list):
        # str.join raises on non-string items, so convert them and drop empty turns.
        return ' '.join(str(turn) for turn in parsed if turn is not None)
    return str(parsed)

# Add flattened text columns to both frames, then the "prompt [SEP] response" pair inputs.
for df in (train_df, test_df):
    for raw_col in ('prompt', 'response_a', 'response_b'):
        df[f'{raw_col}_text'] = df[raw_col].apply(extract_text_from_field)
    for side in ('a', 'b'):
        df[f'text_{side}'] = df['prompt_text'] + ' [SEP] ' + df[f'response_{side}_text']

## Bias-aware and structural lexical features

In [None]:
# Structural counters

def count_pattern(text, pattern, flags=0):
    """Count non-overlapping matches of regex `pattern` in `text`.

    Returns 0 when `text` is not a string. `flags` is forwarded to `re.findall`
    (default 0, i.e. the previous two-argument behaviour).
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(pattern, text, flags))


def paragraph_count(t):
    """Count blank-line breaks (two consecutive newlines) in `t`; 0 for non-strings."""
    return t.count('\n\n') if isinstance(t, str) else 0


def list_count(t):
    """Count list items: lines starting with '-', '*', '•' or 'N.' followed by whitespace.

    Uses re.MULTILINE so every line is examined, not just the first, and anchors
    numbered items to the line start so decimals such as '3.14' are not counted.
    """
    return count_pattern(t, r'^[ \t]*(?:[-*•]|\d+\.)\s', re.MULTILINE)


def quote_count(t):
    """Count occurrences of '>' and of the bold marker '**' in `t`."""
    return count_pattern(t, r'>|\*\*')


def sentence_count(t):
    """Count '.', '!' or '?' characters followed by whitespace or the end of the text."""
    return count_pattern(t, r'[.!?](\s|$)')


def code_block_count(t):
    """Count triple-backtick fences (each fence counts once) and inline `code` spans."""
    return count_pattern(t, r'```|`[^`]+`')


def heading_count(t):
    """Count lines that start with 1-6 '#' characters followed by whitespace.

    Uses re.MULTILINE so headings anywhere in the text are counted, not only a
    heading on the very first line.
    """
    return count_pattern(t, r'^#{1,6}\s', re.MULTILINE)


def word_count(t):
    """Count whitespace-separated tokens in `t`; 0 for non-strings."""
    return len(t.split()) if isinstance(t, str) else 0

# Compute per-side features using explicit suffix mapping ('a' / 'b')
for df in (train_df, test_df):
    # Raw per-response counts, stored as <feature>_a / <feature>_b.
    for suffix in ('a', 'b'):
        response = df[f'response_{suffix}_text']
        response_str = response.astype(str)  # length and word counts use the str-cast text
        df[f'len_{suffix}'] = response_str.apply(len)
        df[f'wc_{suffix}'] = response_str.apply(word_count)
        structural_counters = (
            ('sent', sentence_count),
            ('para', paragraph_count),
            ('list', list_count),
            ('quote', quote_count),
            ('code', code_block_count),
            ('hdr', heading_count),
        )
        for name, counter in structural_counters:
            df[f'{name}_{suffix}'] = response.apply(counter)

    # Signed differences (A - B): verbosity and structure bias.
    for base in ('len', 'wc', 'sent', 'para', 'list', 'quote', 'code', 'hdr'):
        side_a, side_b = df[f'{base}_a'], df[f'{base}_b']
        df[f'{base}_diff'] = side_a - side_b

    # Ratios A / (B + 1): scale-invariant verbosity bias; the +1 avoids division by zero.
    for base in ('len', 'wc', 'sent'):
        denominator = df[f'{base}_b'] + 1.0
        df[f'{base}_ratio'] = df[f'{base}_a'] / denominator

# Model inputs: all eight diffs followed by the three ratios.
LEX_FEATURES = (
    [f'{b}_diff' for b in ('len', 'wc', 'sent', 'para', 'list', 'quote', 'code', 'hdr')]
    + [f'{b}_ratio' for b in ('len', 'wc', 'sent')]
)
X_lex = train_df[LEX_FEATURES].fillna(0).astype(float)
X_lex_test = test_df[LEX_FEATURES].fillna(0).astype(float)
X_lex.shape, X_lex_test.shape

((57477, 11), (3, 11))

## Calibrated lexical model (sigmoid and isotonic) with OOF predictions

In [None]:
def cv_calibrated_probs(X, y, base_model=None, method='sigmoid', n_splits=5, seed=RANDOM_STATE):
    """Stratified-CV out-of-fold probabilities from a calibrated classifier.

    For every fold a StandardScaler is fitted on the training rows only, a
    CalibratedClassifierCV (inner cv=3) wraps `base_model`, and the validation
    rows' probabilities are written into the OOF matrix. Per-fold and overall
    log-loss are printed, scored against the module-level `classes` labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected with `.iloc`.
    y : array-like indexable by integer arrays
        Class labels.
    base_model : estimator or None
        Estimator to calibrate; defaults to LogisticRegression(max_iter=2000, C=1.0).
    method : str
        Calibration method passed to CalibratedClassifierCV.
    n_splits : int
        Number of outer folds.
    seed : int
        Random state for the splitter and the default estimator.

    Returns
    -------
    (oof, models, scalers)
        `oof` has shape (len(X), 3); `models` and `scalers` hold one fitted
        object per fold, in fold order.
    """
    estimator = base_model
    if estimator is None:
        estimator = LogisticRegression(max_iter=2000, C=1.0, random_state=seed)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros((len(X), 3), dtype=float)
    fitted_pairs = []  # (calibrated model, scaler) per fold

    fold_no = 0
    for train_idx, valid_idx in splitter.split(X, y):
        fold_no += 1

        # Scale with statistics from the training part only to avoid leakage.
        fold_scaler = StandardScaler()
        X_train_scaled = fold_scaler.fit_transform(X.iloc[train_idx])
        X_valid_scaled = fold_scaler.transform(X.iloc[valid_idx])

        calibrated = CalibratedClassifierCV(estimator=estimator, method=method, cv=3)
        calibrated.fit(X_train_scaled, y[train_idx])

        fold_proba = calibrated.predict_proba(X_valid_scaled)
        oof[valid_idx] = fold_proba

        fold_loss = log_loss(y[valid_idx], fold_proba, labels=classes)
        print(f'[Lex {method}] Fold {fold_no}: log_loss={fold_loss:.5f}')
        fitted_pairs.append((calibrated, fold_scaler))

    print(f'[Lex {method}] OOF log_loss: {log_loss(y, oof, labels=classes):.5f}')
    models = [model for model, _ in fitted_pairs]
    scalers = [scaler for _, scaler in fitted_pairs]
    return oof, models, scalers

# Run both calibration methods; each call yields (OOF probabilities, fold models, fold scalers).
oof_lex_sigmoid, lex_sigmoid_models, lex_sigmoid_scalers = cv_calibrated_probs(X_lex, y, method='sigmoid')
oof_lex_isotonic, lex_isotonic_models, lex_isotonic_scalers = cv_calibrated_probs(X_lex, y, method='isotonic')

# Keep whichever calibration has the lower OOF log-loss (ties go to the first entry).
lex_oof_list = [('sigmoid', oof_lex_sigmoid), ('isotonic', oof_lex_isotonic)]
lex_oof_losses = [log_loss(y, candidate_oof, labels=classes) for _, candidate_oof in lex_oof_list]
lex_best_idx = min(range(len(lex_oof_list)), key=lex_oof_losses.__getitem__)
lex_best_name, lex_best_oof = lex_oof_list[lex_best_idx]
print('Best lexical calibration:', lex_best_name, 'OOF log_loss=', lex_oof_losses[lex_best_idx])

[Lex sigmoid] Fold 1: log_loss=1.06936
[Lex sigmoid] Fold 2: log_loss=1.06919
[Lex sigmoid] Fold 3: log_loss=1.07030
[Lex sigmoid] Fold 4: log_loss=1.07064
[Lex sigmoid] Fold 5: log_loss=1.06862
[Lex sigmoid] OOF log_loss: 1.06962
[Lex isotonic] Fold 1: log_loss=1.06078
[Lex isotonic] Fold 2: log_loss=1.06198
[Lex isotonic] Fold 3: log_loss=1.06130
[Lex isotonic] Fold 4: log_loss=1.05976
[Lex isotonic] Fold 5: log_loss=1.05932
[Lex isotonic] OOF log_loss: 1.06063
Best lexical calibration: isotonic OOF log_loss= 1.060625553766379


## Embedding features (reuse precomputed .npy if available)

In [None]:
# Cache files for the sentence embeddings, one per (split, response side),
# resolved relative to the current working directory.
EMBED_A_TRAIN, EMBED_B_TRAIN = (Path(f'train_embeddings_{side}.npy') for side in 'ab')
EMBED_A_TEST, EMBED_B_TEST = (Path(f'test_embeddings_{side}.npy') for side in 'ab')

def ensure_embeddings(train_df, test_df, batch_size=32, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Encode the `text_a` / `text_b` columns of both frames and cache the results.

    Always recomputes: the "reuse if present" check lives in the calling cell.
    The four matrices are written to EMBED_A_TRAIN, EMBED_B_TRAIN, EMBED_A_TEST
    and EMBED_B_TEST with `np.save`.

    Returns
    -------
    tuple
        (train_a, train_b, test_a, test_b) embedding arrays.
    """
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer(model_name)

    def _embed(texts):
        # One encode call per column, with the same options for every column.
        return encoder.encode(
            texts.tolist(),
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
        )

    # Order matters for the return value: train A, train B, test A, test B.
    jobs = [
        (EMBED_A_TRAIN, train_df['text_a']),
        (EMBED_B_TRAIN, train_df['text_b']),
        (EMBED_A_TEST, test_df['text_a']),
        (EMBED_B_TEST, test_df['text_b']),
    ]
    matrices = [_embed(texts) for _, texts in jobs]

    # Persist only after every column has been encoded.
    for (cache_path, _), matrix in zip(jobs, matrices):
        np.save(cache_path, matrix)
    return tuple(matrices)

# Reuse cached embeddings when all four files exist AND still match the current data.
embed_cache_paths = (EMBED_A_TRAIN, EMBED_B_TRAIN, EMBED_A_TEST, EMBED_B_TEST)
if all(path.exists() for path in embed_cache_paths):
    train_a, train_b, test_a, test_b = (np.load(path) for path in embed_cache_paths)

    # A cache written for different CSVs (for example a different test.csv) has the wrong
    # number of rows; using it would misalign features with train_df / test_df.
    cache_is_stale = (
        len(train_a) != len(train_df) or len(train_b) != len(train_df)
        or len(test_a) != len(test_df) or len(test_b) != len(test_df)
    )
    if cache_is_stale:
        print('Cached embeddings do not match the current data; recomputing.')
        train_a, train_b, test_a, test_b = ensure_embeddings(train_df, test_df)
else:
    train_a, train_b, test_a, test_b = ensure_embeddings(train_df, test_df)

# Feature matrix: response-A embedding concatenated with response-B embedding per row.
X_emb = np.concatenate([train_a, train_b], axis=1)
X_emb_test = np.concatenate([test_a, test_b], axis=1)
X_emb.shape, X_emb_test.shape

((57477, 768), (3, 768))

## Calibrated embeddings model with OOF predictions

In [None]:
def cv_calibrated_probs_numpy(X, y, method='sigmoid', n_splits=5, seed=RANDOM_STATE):
    """Out-of-fold calibrated probabilities for a numpy feature matrix.

    Each stratified fold standardizes features using its training rows, fits a
    CalibratedClassifierCV (inner cv=3) around a fresh
    LogisticRegression(max_iter=2000, C=1.0), and stores the validation
    probabilities. Fold and overall log-loss are printed, scored against the
    module-level `classes` labels.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed directly with integer index arrays.
    y : array-like indexable by integer arrays
        Class labels.
    method : str
        Calibration method passed to CalibratedClassifierCV.
    n_splits : int
        Number of outer folds.
    seed : int
        Random state for the splitter and the logistic regression.

    Returns
    -------
    (oof, models, scalers)
        `oof` has shape (len(X), 3); the two lists hold one fitted object per fold.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros((len(X), 3), dtype=float)
    models = []
    scalers = []

    for fold_no, (train_idx, valid_idx) in enumerate(folds.split(X, y), start=1):
        y_train = y[train_idx]
        y_valid = y[valid_idx]

        # Standardize with training-fold statistics only.
        fold_scaler = StandardScaler()
        X_train_scaled = fold_scaler.fit_transform(X[train_idx])
        X_valid_scaled = fold_scaler.transform(X[valid_idx])

        calibrated = CalibratedClassifierCV(
            estimator=LogisticRegression(max_iter=2000, C=1.0, random_state=seed),
            method=method,
            cv=3,
        )
        calibrated.fit(X_train_scaled, y_train)

        fold_proba = calibrated.predict_proba(X_valid_scaled)
        oof[valid_idx] = fold_proba
        fold_loss = log_loss(y_valid, fold_proba, labels=classes)
        print(f'[Emb {method}] Fold {fold_no}: log_loss={fold_loss:.5f}')

        models.append(calibrated)
        scalers.append(fold_scaler)

    overall_loss = log_loss(y, oof, labels=classes)
    print(f'[Emb {method}] OOF log_loss: {overall_loss:.5f}')
    return oof, models, scalers

# Run both calibration methods on the embedding features.
oof_emb_sigmoid, emb_sigmoid_models, emb_sigmoid_scalers = cv_calibrated_probs_numpy(X_emb, y, method='sigmoid')
oof_emb_isotonic, emb_isotonic_models, emb_isotonic_scalers = cv_calibrated_probs_numpy(X_emb, y, method='isotonic')

# Keep whichever calibration has the lower OOF log-loss (ties go to the first entry).
emb_oof_list = [('sigmoid', oof_emb_sigmoid), ('isotonic', oof_emb_isotonic)]
emb_oof_losses = [log_loss(y, candidate_oof, labels=classes) for _, candidate_oof in emb_oof_list]
emb_best_idx = min(range(len(emb_oof_list)), key=emb_oof_losses.__getitem__)
emb_best_name, emb_best_oof = emb_oof_list[emb_best_idx]
print('Best embedding calibration:', emb_best_name, 'OOF log_loss=', emb_oof_losses[emb_best_idx])

TypeError: CalibratedClassifierCV.__init__() got an unexpected keyword argument 'base_estimator'

## Simple ensemble (OOF-weight search on lexical + embeddings)

In [None]:
# Grid search weights w in [0..1] for p = w*p_lex + (1-w)*p_emb minimizing OOF log_loss
def best_weight_for_blend(y, p_lex, p_emb, steps=101):
    """Grid-search the blend weight that minimizes log-loss.

    Evaluates w = i / (steps - 1) for i in range(steps) on the blend
    w * p_lex + (1 - w) * p_emb, scored with `log_loss` against the
    module-level `classes` labels. The first weight reaching the lowest loss
    wins; if no candidate scores below 1e9 the initial (0.5, 1e9) is returned.

    Returns
    -------
    (best_w, best_loss)
    """
    incumbent = (0.5, 1e9)  # (weight, loss) of the best candidate so far
    for step in range(steps):
        weight = step / (steps - 1)
        candidate_loss = log_loss(y, weight * p_lex + (1 - weight) * p_emb, labels=classes)
        if candidate_loss < incumbent[1]:
            incumbent = (weight, candidate_loss)
    return incumbent

# Tune the lexical weight on the OOF predictions of the two selected models.
blend_search = best_weight_for_blend(y, p_lex=lex_best_oof, p_emb=emb_best_oof)
w_blend, loss_blend = blend_search
print(f'Ensemble best weight (lexical): {w_blend:.2f}, OOF log_loss: {loss_blend:.5f}')

## Optional: Lightweight LoRA fine-tuning with temperature scaling

In [None]:
RUN_LORA = False  # set True to train (can be slow)
LORA_MODEL_NAME = 'distilbert-base-uncased'  # small and fast; change to 'microsoft/deberta-v3-small' if desired

# Populated only when RUN_LORA is True; the submissions cell checks `lora_test_proba is not None`.
lora_oof = None
lora_test_proba = None

if RUN_LORA:
    # Heavy optional dependencies are imported lazily so the rest of the notebook runs without them.
    import inspect
    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
    from peft import LoraConfig, get_peft_model
    from datasets import Dataset

    # Seed torch as well, so the new classification head and LoRA weights initialise reproducibly.
    torch.manual_seed(RANDOM_STATE)

    # Build text inputs for classification (prompt + A + B)
    def build_input(df):
        """Concatenate prompt and both responses into one tagged string per row."""
        return (
            '[PROMPT] ' + df['prompt_text'] +
            ' [A] ' + df['response_a_text'] +
            ' [B] ' + df['response_b_text']
        )

    train_inputs = build_input(train_df)
    test_inputs  = build_input(test_df)

    tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_NAME)

    def tokenize_fn(batch):
        """Tokenize a batch; inputs beyond 512 tokens are truncated, shorter ones padded to 512."""
        return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512)

    ds_train = Dataset.from_pandas(pd.DataFrame({'text': train_inputs, 'label': y}))
    ds_test  = Dataset.from_pandas(pd.DataFrame({'text': test_inputs}))

    model = AutoModelForSequenceClassification.from_pretrained(LORA_MODEL_NAME, num_labels=3)

    # Auto-detect common attention module names for LoRA targets
    target_keywords = ['q_proj','v_proj','k_proj','query','key','value','q_lin','v_lin']
    all_module_names = [n for n,_ in model.named_modules()]
    target_modules = sorted({n.split('.')[-1] for n in all_module_names if any(k in n for k in target_keywords)})
    if not target_modules:
        # fallback for DistilBERT attention names
        target_modules = ['q_lin','v_lin']

    peft_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05, task_type='SEQ_CLS', target_modules=target_modules)
    model = get_peft_model(model, peft_config)

    tokenized_train = ds_train.map(tokenize_fn, batched=True, remove_columns=['text'])
    tokenized_test  = ds_test.map(tokenize_fn, batched=True, remove_columns=['text'])

    def compute_metrics(eval_pred):
        """Log-loss of softmaxed logits, reported by the Trainer at each evaluation step."""
        logits, labels = eval_pred
        probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
        return {'log_loss': log_loss(labels, probs, labels=classes)}

    # Simple split for evaluation and temperature fitting
    tr_idx, va_idx = train_test_split(np.arange(len(ds_train)), test_size=0.15, random_state=RANDOM_STATE, stratify=y)
    ds_tr = tokenized_train.select(tr_idx.tolist())
    ds_va = tokenized_train.select(va_idx.tolist())

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
    # pass whichever keyword the installed version accepts.
    training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_kw = 'eval_strategy' if 'eval_strategy' in training_arg_names else 'evaluation_strategy'

    args = TrainingArguments(
        output_dir='out_lora',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        learning_rate=2e-4,
        eval_steps=200,
        logging_steps=100,
        save_strategy='no',
        report_to=[],
        **{eval_strategy_kw: 'steps'},
    )

    trainer = Trainer(model=model, args=args, train_dataset=ds_tr, eval_dataset=ds_va, compute_metrics=compute_metrics)
    trainer.train()

    # Temperature scaling on validation logits
    with torch.no_grad():
        va_logits = torch.tensor(trainer.predict(ds_va).predictions)
        # cross_entropy needs int64 class indices; numpy's default int can be 32-bit.
        va_labels = torch.tensor(y[va_idx], dtype=torch.long)

    temperature = torch.nn.Parameter(torch.ones(()))
    opt = torch.optim.LBFGS([temperature], lr=0.1, max_iter=50)

    def nll_with_temperature():
        """LBFGS closure: cross-entropy of the temperature-scaled validation logits."""
        opt.zero_grad()
        scaled = va_logits / temperature.clamp_min(1e-3)
        loss = torch.nn.functional.cross_entropy(scaled, va_labels)
        loss.backward()
        return loss

    opt.step(nll_with_temperature)
    # Apply the same lower bound used inside the closure so the reported T is the one optimised.
    T = float(temperature.detach().clamp_min(1e-3))
    print(f'Fitted temperature: {T:.3f}')

    # OOF-like predictions via simple CV (one split used above); approximate OOF by combining tr/va
    # For simplicity we will treat validation as OOF and train part as model predictions on train subset.
    with torch.no_grad():
        tr_logits = torch.tensor(trainer.predict(ds_tr).predictions)
        tr_probs = torch.softmax(tr_logits / T, dim=1).numpy()
        va_probs = torch.softmax(va_logits / T, dim=1).numpy()
    lora_oof = np.zeros((len(train_df), 3), dtype=float)
    lora_oof[tr_idx] = tr_probs
    lora_oof[va_idx] = va_probs
    # The held-out figure is the honest one: the pseudo-OOF number below also scores rows the
    # model was trained on, and T itself was fitted on the validation rows.
    print('LoRA held-out validation log_loss:', log_loss(y[va_idx], va_probs, labels=classes))
    print('LoRA pseudo-OOF log_loss:', log_loss(y, lora_oof, labels=classes))

    # Test predictions
    with torch.no_grad():
        test_logits = torch.tensor(trainer.predict(tokenized_test).predictions)
        lora_test_proba = torch.softmax(test_logits / T, dim=1).numpy()
else:
    print('Skipping LoRA training (RUN_LORA=False).')

## Average the CV fold models on the test set and produce submissions

In [None]:
def _fold_probas(models, scalers, X_test):
    """Return each fold's test-set probabilities, scaling X_test with that fold's scaler."""
    return [clf.predict_proba(scaler.transform(X_test)) for clf, scaler in zip(models, scalers)]


def _submission_frame(proba):
    """Build a submission table from an (n_test, 3) probability array ordered A, B, tie."""
    return pd.DataFrame({
        'id': test_df['id'].values,
        'winner_model_a': proba[:,0],
        'winner_model_b': proba[:,1],
        'winner_tie':     proba[:,2],
    })


# 1) Lexical: take the better calibration and average its CV fold models on the test set
#    (no refit on the full training data is performed).
if lex_best_name == 'sigmoid':
    lex_best_models, lex_best_scalers = lex_sigmoid_models, lex_sigmoid_scalers
else:
    lex_best_models, lex_best_scalers = lex_isotonic_models, lex_isotonic_scalers
lex_proba_test_list = _fold_probas(lex_best_models, lex_best_scalers, X_lex_test)
lex_proba_test = np.mean(lex_proba_test_list, axis=0)

sub_lex = _submission_frame(lex_proba_test)
sub_lex.to_csv('submission_step3_lexical_calibrated.csv', index=False)
print('Saved submission_step3_lexical_calibrated.csv')

# 2) Embeddings: same procedure with the embedding models
if emb_best_name == 'sigmoid':
    emb_best_models, emb_best_scalers = emb_sigmoid_models, emb_sigmoid_scalers
else:
    emb_best_models, emb_best_scalers = emb_isotonic_models, emb_isotonic_scalers
emb_proba_test_list = _fold_probas(emb_best_models, emb_best_scalers, X_emb_test)
emb_proba_test = np.mean(emb_proba_test_list, axis=0)

sub_emb = _submission_frame(emb_proba_test)
sub_emb.to_csv('submission_step3_embeddings_calibrated.csv', index=False)
print('Saved submission_step3_embeddings_calibrated.csv')

# 3) LoRA submission if available
if lora_test_proba is not None:
    sub_lora = _submission_frame(lora_test_proba)
    sub_lora.to_csv('submission_step3_lora.csv', index=False)
    print('Saved submission_step3_lora.csv')

# 4) Ensemble using OOF-optimal weight between lexical and embeddings
blend_test = w_blend * lex_proba_test + (1 - w_blend) * emb_proba_test
sub_blend = _submission_frame(blend_test)
sub_blend.to_csv('submission_step3_ensemble.csv', index=False)
print('Saved submission_step3_ensemble.csv')

### Notes
- To enable LoRA fine-tuning, set `RUN_LORA = True` in the LoRA cell.
- LoRA section uses PEFT; ensure `peft`, `transformers`, `datasets`, and `torch` are installed.
- Calibrated models use scikit-learn's `CalibratedClassifierCV` with both `sigmoid` and `isotonic` methods tested via OOF.
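- The ensemble weight is found by minimizing OOF log_loss over a simple 1D grid; it blends only the lexical and embedding models (LoRA predictions, when produced, are saved separately and not blended).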
- All submissions are written to the working directory.