In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Random Forest Baseline

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

data_path = "/kaggle/input/llm-classification-finetuning/"
train_file = os.path.join(data_path, "train.csv")
test_file = os.path.join(data_path, "test.csv")
submission_template_file = os.path.join(data_path, "sample_submission.csv")

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
submission_template = pd.read_csv(submission_template_file)
train['target'] = train[['winner_model_a', 'winner_model_b', 'winner_tie']].idxmax(axis=1)
target_encoder = LabelEncoder()
train['target_encoded'] = target_encoder.fit_transform(train['target'])
X = train[['response_a', 'response_b', 'prompt']]
y = train['target_encoded']

def create_features(X):
    X['a_length'] = X['response_a'].str.len()
    X['b_length'] = X['response_b'].str.len()
    X['prompt_length'] = X['prompt'].str.len()
    X['a_words'] = X['response_a'].apply(lambda x: len(x.split()))
    X['b_words'] = X['response_b'].apply(lambda x: len(x.split()))
    X['prompt_words'] = X['prompt'].apply(lambda x: len(x.split()))
    return X

X = create_features(X)
X = X.drop(['response_a', 'response_b', 'prompt'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_val_pred = model.predict_proba(X_val)
logloss = log_loss(y_val, y_val_pred)
print(f"Validation Log Loss: {logloss}")

test = create_features(test)
test_X = test.drop(['response_a', 'response_b', 'prompt', 'id'], axis=1)
predictions = model.predict_proba(test_X)

submission = pd.DataFrame({
    "id": test['id'],
    "winner_model_a": predictions[:, 0],
    "winner_model_b": predictions[:, 1],
    "winner_tie": predictions[:, 2]
})

submission.to_csv("submission.csv", index=False)

## Main model

In [1]:
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    DebertaV2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

2025-06-19 15:50:27.736449: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750348227.762676     332 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750348227.770459     332 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/debertav3_tokenizer/transformers/default/1/debertav3_tokenizer")
SPECIAL_TOKENS = {
    "sep_token": "[SEP]",
    "response_a_token": "[RESP_A]",
    "response_b_token": "[RESP_B]"
}
tokenizer.add_tokens(list(SPECIAL_TOKENS.values()), special_tokens=True)

2

In [3]:
model = DebertaV2ForSequenceClassification.from_pretrained(
    "/kaggle/input/debertav3_model/transformers/default/1/debertav3_model",
    num_labels=3,
    problem_type="multi_label_classification"
)

In [4]:
model.resize_token_embeddings(len(tokenizer))

Embedding(128003, 768, padding_idx=0)

In [5]:
data_path = "/kaggle/input/llm-classification-finetuning/"
train_file = os.path.join(data_path, "train.csv")
test_file = os.path.join(data_path, "test.csv")
submission_template_file = os.path.join(data_path, "sample_submission.csv")

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

In [None]:
train.shape

In [None]:
train.head()

In [6]:
train, val = train_test_split(train, test_size=0.33, random_state=42)

In [7]:
dataset_train = Dataset.from_pandas(train)
dataset_val = Dataset.from_pandas(val)

In [8]:
def preprocess_function(examples):
    # Structure: [PROMPT] [SEP] [RESP_A] response_a [SEP] [RESP_B] response_b [SEP]
    texts = [
        f"{p} {tokenizer.sep_token} {SPECIAL_TOKENS['response_a_token']} {a} "
        f"{tokenizer.sep_token} {SPECIAL_TOKENS['response_b_token']} {b} {tokenizer.sep_token}"
        for p, a, b in zip(examples['prompt'], examples['response_a'], examples['response_b'])
    ]
    
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    
    labels = [
        [a, b, tie] 
        for a, b, tie in zip(examples['winner_model_a'], examples['winner_model_b'], examples['winner_tie'])
    ]
    
    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': labels
    }

In [11]:
tokenized_dataset_train = dataset_train.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_train.column_names
)
tokenized_dataset_val = dataset_val.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_val.column_names
)

Map:   0%|          | 0/38509 [00:00<?, ? examples/s]

Map:   0%|          | 0/18968 [00:00<?, ? examples/s]

In [12]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query_proj", "value_proj"], 
    inference_mode=False
)

In [13]:
model_lora = get_peft_model(model, lora_config)

In [14]:
model_lora.print_trainable_parameters()

trainable params: 297,219 || all params: 184,647,174 || trainable%: 0.1610


In [15]:
model_lora

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DebertaV2ForSequenceClassification(
      (deberta): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(128003, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
        

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    pred_probs = torch.softmax(torch.tensor(predictions), dim=-1).numpy()
    
    pred_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(labels, axis=1)
    
    return {
        "accuracy": accuracy_score(true_classes, pred_classes),
        "log_loss": log_loss(true_classes, pred_probs, normalize = True),
        "class_distribution": dict(zip(
            ["prefer_a", "prefer_b", "tie"],
            np.bincount(true_classes, minlength=3) / len(true_classes)
        ))}

In [17]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'labels': torch.tensor([item['labels'] for item in batch], dtype=torch.float)
    }

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = nn.CrossEntropyLoss()
        
    def compute_loss(self, model, inputs,num_items_in_batch = 1, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Convert probability labels to class indices for CE loss
        class_labels = torch.argmax(labels, dim=1)
        loss = self.loss_fct(logits, class_labels)
        
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="./debertv3_preference_results",
    learning_rate=2e-5, 
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_log_loss",
    logging_dir='./logs',
    logging_steps=10,
    gradient_accumulation_steps=2,
    fp16=True,
    report_to="none"
)


trainer = CustomTrainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Log Loss,Class Distribution,Runtime,Samples Per Second,Steps Per Second
1,1.1036,1.090222,0.377478,1.090222,"{'prefer_a': 0.35153943483762123, 'prefer_b': 0.34020455504006747, 'tie': 0.30825601012231124}",475.7697,39.868,2.493
2,1.0875,1.085305,0.385491,1.085305,"{'prefer_a': 0.35153943483762123, 'prefer_b': 0.34020455504006747, 'tie': 0.30825601012231124}",475.9802,39.85,2.492




TrainOutput(global_step=4814, training_loss=1.093341258635004, metrics={'train_runtime': 5729.1466, 'train_samples_per_second': 13.443, 'train_steps_per_second': 0.84, 'total_flos': 2.0335154429251584e+16, 'train_loss': 1.093341258635004, 'epoch': 2.0})

In [None]:
def predict_preference(prompt, response_a, response_b):
    text = (
        f"{prompt} {tokenizer.sep_token} {SPECIAL_TOKENS['response_a_token']} {response_a} "
        f"{tokenizer.sep_token} {SPECIAL_TOKENS['response_b_token']} {response_b} {tokenizer.sep_token}"
    )
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to("cuda")
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    
    probs = torch.softmax(outputs.logits, dim=-1).squeeze()
    
    return {
        "winner_model_a": probs[0].item(),
        "winner_model_b": probs[1].item(),
        "winner_tie": probs[2].item(),
        
    }

In [None]:
prompt = train.iloc[5]["prompt"]
response_a = train.iloc[5]["response_a"]
response_b = train.iloc[5]["response_b"]

In [None]:
predict_preference(prompt, response_a, response_b)

In [None]:
def batch_generator(df, batch_size):
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:i + batch_size]

def generate_submission_csv(
    test_csv_path: str,
    output_csv_path: str = "submission.csv",
    batch_size: int = 8,
    max_length: int = 512
) -> None:
   
    test_df = pd.read_csv(test_csv_path)
    predictions = []
    for batch in tqdm(batch_generator(test_df, batch_size), desc="Processing batches"):
        texts = [
            f"{row.prompt} [SEP] [RESP_A] {row.response_a} [SEP] [RESP_B] {row.response_b} [SEP]"
            for _, row in batch.iterrows()
        ]

        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            batch_probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
        
        predictions.extend(batch_probs)
    
    submission = pd.DataFrame({
        "id": list(test_df['id'].values),
        "winner_model_a": [p[0] for p in predictions],
        "winner_model_b": [p[1] for p in predictions],
        "winner_tie": [p[2] for p in predictions]
    })
    

    submission.to_csv(output_csv_path, index=False)
    print(f"Submission saved to {output_csv_path}")

generate_submission_csv(
    test_csv_path="/kaggle/input/llm-classification-finetuning/test.csv"
)