In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/config.json
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/merges.txt
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/trainer_state.json
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/training_args.bin
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/tokenizer.json
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/vocab.json
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/tokenizer_config.json
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/scaler.pt
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/scheduler.pt
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/model.safetensors
/kaggle/input/distillroberta/distilroberta-base_finetuned/checkpoint-6464/special_tokens_map.json
/kaggle/input

In [2]:
# --- Protobuf compatibility shim (must run BEFORE any other imports) ---
import os
os.environ.setdefault("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION", "python")  # safer in Kaggle

from google.protobuf import __version__ as _pb_ver
from google.protobuf import message_factory as _mf
from google.protobuf import symbol_database as _symdb

# Make both names available regardless of protobuf version.
# Some libs call GetPrototype (3.x), others call GetMessageClass (4.x).

# Patch MessageFactory
if not hasattr(_mf.MessageFactory, "GetMessageClass") and hasattr(_mf.MessageFactory, "GetPrototype"):
    # Env has only GetPrototype -> define GetMessageClass in terms of it
    def _GetMessageClass(self, descriptor):
        return self.GetPrototype(descriptor)
    _mf.MessageFactory.GetMessageClass = _GetMessageClass

if not hasattr(_mf.MessageFactory, "GetPrototype") and hasattr(_mf.MessageFactory, "GetMessageClass"):
    # Env has only GetMessageClass -> define GetPrototype in terms of it
    def _GetPrototype(self, descriptor):
        return self.GetMessageClass(descriptor)
    _mf.MessageFactory.GetPrototype = _GetPrototype

# Patch SymbolDatabase
_sym = _symdb.Default()
if not hasattr(_sym, "GetMessageClass") and hasattr(_sym, "GetPrototype"):
    def _sym_GetMessageClass(descriptor):
        return _sym.GetPrototype(descriptor)
    _sym.GetMessageClass = _sym_GetMessageClass

if not hasattr(_sym, "GetPrototype") and hasattr(_sym, "GetMessageClass"):
    def _sym_GetPrototype(descriptor):
        return _sym.GetMessageClass(descriptor)
    _sym.GetPrototype = _sym_GetPrototype

print(f"[protobuf shim ready] protobuf={_pb_ver}, "
      f"MessageFactory.has(GetPrototype={hasattr(_mf.MessageFactory,'GetPrototype')}, "
      f"GetMessageClass={hasattr(_mf.MessageFactory,'GetMessageClass')})")
# --- end shim ---


[protobuf shim ready] protobuf=6.33.0, MessageFactory.has(GetPrototype=False, GetMessageClass=False)


In [3]:
import os, re, ast, gc
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- Paths (edit MODEL_DIR to your dataset path) ---
COMP_DIR  = "/kaggle/input/lmsys-chatbot-arena"
TEST_CSV  = f"{COMP_DIR}/test.csv"

# Example: if your dataset slug is yourname/lmsys-roberta-weights
# and the folder inside is export_model/, then the path is:
MODEL_DIR = "/kaggle/input/lmsys-distillroberta/distilroberta-base_finetuned/checkpoint-6464"   # <<< EDIT THIS

DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LEN = 512
BATCH   = 16


In [4]:
def set_seed(seed=42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

def _strip_surrogates(text: str) -> str:
    if not isinstance(text, str):
        text = str(text)
    # replace invalid code points to keep kernel output stable
    return text.encode("utf-8", "replace").decode("utf-8")

def _strip_control(s: str) -> str:
    # keep \n and \t; drop others
    return re.sub(r"[\x00-\x08\x0B-\x1F\x7F]", "", s)

def _safe_literal_list(text: str):
    if isinstance(text, str) and text.startswith("[") and text.endswith("]"):
        try:
            parsed = ast.literal_eval(text)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            return None
    return None

def clean_text(x):
    if x is None:
        return ""
    if isinstance(x, float) and (np.isnan(x) or np.isinf(x)):
        return ""
    if isinstance(x, list):
        return " ".join(_strip_surrogates(_strip_control(str(t))) for t in x)
    s = str(x).strip()
    parsed = _safe_literal_list(s)
    if parsed is not None:
        return " ".join(_strip_surrogates(_strip_control(str(t))) for t in parsed)
    s = _strip_surrogates(s)
    s = _strip_control(s)
    return s


In [5]:
class LMSYSDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int):
        # Clean text once; DO NOT drop rows in code submission (must predict all ids)
        self.df = df.reset_index(drop=True).copy()
        for c in ("prompt", "response_a", "response_b"):
            self.df[c] = self.df[c].apply(clean_text)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        text = f"{r['prompt']} [SEP] {r['response_a']} [SEP] {r['response_b']}"
        enc = self.tok(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "id": int(r["id"]),
        }

def predict_proba(df, tokenizer, model, batch_size=BATCH, device=DEVICE):
    ds = LMSYSDataset(df, tokenizer, MAX_LEN)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0)
    ids_all, probs_all = [], []
    model.eval()
    with torch.no_grad():
        for batch in dl:
            input_ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            logits = model(input_ids=input_ids, attention_mask=attn).logits
            probs = F.softmax(logits, dim=-1).detach().cpu().numpy()
            probs_all.append(probs)
            ids_all.extend(batch["id"].tolist())
    if not probs_all:
        return np.zeros((0,3)), []
    return np.vstack(probs_all), ids_all


In [6]:
torch.cuda.is_available()

True

In [7]:
# Load test CSV (Kaggle replaces this with full hidden test during scoring)
test_df = pd.read_csv(TEST_CSV)

# Load local (attached) model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, num_labels=3)
if model.config.pad_token_id is None and tokenizer.pad_token_id is not None:
    model.config.pad_token_id = tokenizer.pad_token_id
model.to(DEVICE)


2025-11-30 21:02:20.760927: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764536540.931793      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764536540.982254      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [8]:
test_df.head()

Unnamed: 0,id,prompt,response_a,response_b
0,136060,"[""I have three oranges today, I ate an orange ...","[""You have two oranges today.""]","[""You still have three oranges. Eating an oran..."
1,211333,"[""You are a mediator in a heated political deb...","[""Thank you for sharing the details of the sit...","[""Mr Reddy and Ms Blue both have valid points ..."
2,1233961,"[""How to initialize the classification head wh...","[""When you want to initialize the classificati...","[""To initialize the classification head when p..."


In [9]:
# TTA 1: normal order (prompt, A, B)
p_norm, ids_norm = predict_proba(test_df, tokenizer, model, BATCH, DEVICE)

# TTA 2: swap A/B
test_sw = test_df.copy()
test_sw[["response_a","response_b"]] = test_sw[["response_b","response_a"]]
p_swap, ids_swap = predict_proba(test_sw, tokenizer, model, BATCH, DEVICE)

# Align ids just in case (should match)
if ids_norm != ids_swap:
    idx = pd.Series(np.arange(len(ids_swap)), index=ids_swap)
    p_swap = p_swap[idx.loc[ids_norm].values]

# Fix swapped class meaning: class0=first wins(B), class1=second wins(A)
p_swap_fix = np.zeros_like(p_swap)
p_swap_fix[:,0] = p_swap[:,1]  # B->A
p_swap_fix[:,1] = p_swap[:,0]  # A->B
p_swap_fix[:,2] = p_swap[:,2]  # tie

final = (p_norm + p_swap_fix) / 2.0

# Build submission
sub = pd.DataFrame({
    "id": ids_norm,
    "winner_model_a": final[:,0],
    "winner_model_b": final[:,1],
    "winner_tie":     final[:,2],
})

# Sanity checks
assert sub.columns.tolist() == ["id","winner_model_a","winner_model_b","winner_tie"]
assert np.isfinite(sub[["winner_model_a","winner_model_b","winner_tie"]].values).all()

out_path = "/kaggle/working/submission.csv"
sub.to_csv(out_path, index=False)
print(f"Wrote {out_path} with {len(sub)} rows")


Wrote /kaggle/working/submission.csv with 3 rows
