In [1]:
# !pip install -U transformers accelerate peft bitsandbytes -q

In [2]:
### https://www.kaggle.com/code/johnsonhk88/multilingual-chatbot-arena-llm-fine-tune

In [3]:
!nvidia-smi

Tue Apr 22 13:00:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.02              Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0  On |                  Off |
|  0%   44C    P8             11W /  450W |     725MiB /  24564MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
TARGET_MODEL = "Qwen/Qwen2.5-0.5B"

DEBUG = False

In [5]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoModelForSequenceClassification
import torch
import numpy  as np
import random
import os
import json

os.environ['hf_cache '] = '///mnt/c/Personal/Competitions/HFCache'

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(42)

In [7]:
# Define the tutor classes
TUTOR_CLASSES = [
    "Expert",
    "Novice",
    "Gemini",
    "GPT4",
    "Llama31405B",
    "Llama318B",
    "Mistral",
    "Phi3",
    "Sonnet"
]

# Create label mappings
id2label = {i: label for i, label in enumerate(TUTOR_CLASSES)}
label2id = {v: k for k, v in id2label.items()}
print(id2label, label2id)

{0: 'Expert', 1: 'Novice', 2: 'Gemini', 3: 'GPT4', 4: 'Llama31405B', 5: 'Llama318B', 6: 'Mistral', 7: 'Phi3', 8: 'Sonnet'} {'Expert': 0, 'Novice': 1, 'Gemini': 2, 'GPT4': 3, 'Llama31405B': 4, 'Llama318B': 5, 'Mistral': 6, 'Phi3': 7, 'Sonnet': 8}


In [8]:

# Data Processing Functions
def load_data(dev_data_path='///mnt/c/Personal/Competitions/BEA_2025/data/mrbench_v3_devset.json'):
    """
    Load development and (optionally) test datasets
    """
    # Load development data
    with open(dev_data_path, 'r') as f:
        dev_data = json.load(f)
    
    # Process development data
    dev_examples = []
    for dialogue in dev_data:
        conversation_id = dialogue["conversation_id"]
        conversation_history = dialogue["conversation_history"]
        
        for tutor_id, tutor_data in dialogue["tutor_responses"].items():
            if tutor_id in TUTOR_CLASSES or any(cls_name in tutor_id for cls_name in TUTOR_CLASSES):
                # Map the tutor_id to one of our classes
                tutor_class = next((cls for cls in TUTOR_CLASSES if cls in tutor_id), tutor_id)
                
                dev_examples.append({
                    "conversation_id": conversation_id,
                    "conversation_history": conversation_history,
                    "tutor_response": tutor_data["response"],
                    "tutor_class": tutor_class
                })
    

    return dev_examples

In [9]:
import pandas as pd
train = pd.DataFrame(load_data())
train['target'] = train['tutor_class'].map(label2id)
train.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)

In [10]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6


In [11]:
rows = []
for i, row in train.iterrows():
    question = row.Question
    response = row.Response
    convid = row.conversation_id
    context = train[(train.conversation_id==convid)&(train.Response!=response)].Response.values
    context = ' [SEP] '.join(context)
    train.loc[i, 'context'] = context

In [12]:
fold_map = pd.read_csv('///mnt/c/Personal/Competitions/BEA_2025/debetav3_context_multisampleDropout/oofs.csv')

train = pd.merge(train,fold_map[['conversation_id','fold','tutor_class']],on=['conversation_id','tutor_class'], how='left')


In [13]:
train['input'] = "Question: " + train['Question'] + '; Answer: ' + train['Response'] + '; Context: ' + train["context"]

In [14]:
train = train.rename(columns={'target': 'label'})

In [15]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,label,context,fold,input
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...,0,"Question: Tutor: Hi, could you please provide ..."
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."


In [16]:
# load model with 4bit bnb

from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch


In [17]:
import torch
from transformers import AutoModelForSequenceClassification, Cache, Qwen2PreTrainedModel, Qwen2Model
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
import torch.nn as nn
from typing import Optional, Union, List, Tuple
from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss

class Qwen2ForSequenceClassificationPlus(Qwen2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2Model(config)

        self.score = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(config.hidden_size, config.hidden_size // 2),
            nn.Dropout(0.1),
            nn.GELU(),
            nn.Linear(config.hidden_size // 2, self.num_labels),
        )
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

In [18]:
from transformers import AutoTokenizer, LlamaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
tokenizer.add_eos_token = True
# tokenizer.truncation_side = "right"
tokenizer.padding_side = "left"

# processor.tokenizer.padding_side = "left"

In [19]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        
        all_param += param.numel()
        if param.requires_grad:
            print(_)
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [20]:
def preprocess_function(examples, max_length=4096):
    return tokenizer(examples["input"], truncation=True, max_length=max_length, padding=True)

In [21]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average="macro")
    
    # Add per-class F1 scores
    f1_per_class = f1.compute(predictions=predictions, references=labels, average=None)
    per_class_scores = {f"f1_{id2label[i]}": score for i, score in enumerate(f1_per_class["f1"])}
    
    metrics = {
        "accuracy": accuracy_score["accuracy"],
        "f1_macro": f1_score["f1"],
        **per_class_scores
    }
    
    return metrics

In [23]:
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

In [24]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset

In [25]:
import typing as tp
def find_all_linear_modules(model) -> tp.List[str]:
    r"""
    Finds all available modules to apply lora.
    """
    linear_cls = torch.nn.Linear

    output_layer_names = ["lm_head", "embed_tokens"]

    module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, linear_cls) and not any(
            [output_layer in name for output_layer in output_layer_names]
        ):
            module_names.add(name.split(".")[-1])
    return list(module_names)

In [26]:
# !pip install flash-attn --no-build-isolation


In [27]:

steps = 5 if DEBUG else 50
# train = train.sample(n=400)# if DEBUG else train

for fold in range(5):
    print(f'Fold {fold+1}/5')

    valid_df = train[train["fold"] == fold]
    train_df = train[train["fold"] != fold]
    
    # from pandas
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)
    
    train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True)
    
    base_model = Qwen2ForSequenceClassificationPlus.from_pretrained(
        TARGET_MODEL,
        num_labels=len(id2label),
        id2label=id2label,          # Add this line
        label2id=label2id,          # Add this line
        device_map={"":0},
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    base_model.config.pad_token_id = tokenizer.pad_token_id
    # Model
    
    peft_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        target_modules=['q_proj','k_proj','v_proj','up_proj','down_proj','gate_proj'],
        # modules_to_save=["score"],
    )

    model = get_peft_model(base_model, peft_config)
    model.print_trainable_parameters()

    training_args = TrainingArguments(
        output_dir=f"///mnt/c/Personal/Competitions/BEA_2025/Qwen25_0.5/outputs/fold{fold}",
        learning_rate=1e-4,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        max_grad_norm= 0.5,#0.3,
        optim='paged_adamw_32bit',
        lr_scheduler_type="cosine",
        num_train_epochs=10,
        weight_decay=0.0001,
        save_total_limit=1,
        eval_strategy="steps",
        save_strategy="steps",
        save_steps = 100,
        eval_steps = 100,
        logging_steps= 100,
        load_best_model_at_end=True,
        push_to_hub=False,
        warmup_steps=10,
        report_to='none', # if DEBUG else 'wandb',
        metric_for_best_model="f1_macro",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized_ds,
        eval_dataset=valid_tokenized_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)
    )
    
    trainer.train()
    # validation 
    pred_output = trainer.predict(valid_tokenized_ds)
    logits = pred_output.predictions
    probas = softmax(logits)
    np.save(f'///mnt/c/Personal/Competitions/BEA_2025/Qwen25_0.5/outputs/fold{fold}.npy', probas)

Fold 1/5


Map: 100%|██████████| 1982/1982 [00:01<00:00, 1935.83 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 2618.25 examples/s]
Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 32,846,217 || all params: 527,284,882 || trainable%: 6.2293


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Expert,F1 Novice,F1 Gemini,F1 Gpt4,F1 Llama31405b,F1 Llama318b,F1 Mistral,F1 Phi3,F1 Sonnet
100,1.9793,1.123276,0.617409,0.627211,0.776978,0.526316,0.819048,0.604167,0.509804,0.451064,0.666667,0.708333,0.582524
200,0.7622,0.633783,0.757085,0.763488,0.87395,0.846154,0.859504,0.80916,0.660194,0.604027,0.806723,0.78125,0.630435
300,0.4559,0.536888,0.834008,0.838631,0.896,0.923077,0.92562,0.765957,0.738739,0.615385,0.878049,0.892857,0.912
400,0.3105,0.578309,0.84413,0.849446,0.896552,0.88,0.898305,0.782609,0.823529,0.724409,0.851852,0.877193,0.910569
500,0.2019,0.580704,0.8583,0.860708,0.935484,0.88,0.866667,0.827586,0.788991,0.759124,0.896,0.884956,0.907563
600,0.0773,0.778616,0.864372,0.867653,0.890756,0.923077,0.932203,0.828125,0.824427,0.699029,0.918033,0.868852,0.92437
700,0.0502,0.821473,0.87247,0.879742,0.915254,0.962963,0.915254,0.84127,0.808824,0.728972,0.905983,0.902655,0.936508
800,0.0055,0.806361,0.87247,0.877758,0.9,0.928571,0.913793,0.852459,0.845528,0.770492,0.883333,0.896552,0.909091
900,0.0061,0.813999,0.870445,0.878193,0.9,0.962963,0.907563,0.832,0.832,0.765217,0.87395,0.886957,0.943089
1000,0.0002,0.822098,0.868421,0.876379,0.9,0.962963,0.905983,0.832,0.828125,0.761062,0.859504,0.886957,0.95082


Fold 2/5


Map: 100%|██████████| 1982/1982 [00:00<00:00, 2511.54 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 3223.81 examples/s]
Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 32,846,217 || all params: 527,284,882 || trainable%: 6.2293


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Expert,F1 Novice,F1 Gemini,F1 Gpt4,F1 Llama31405b,F1 Llama318b,F1 Mistral,F1 Phi3,F1 Sonnet
100,2.177,1.290359,0.550607,0.569037,0.705882,0.782609,0.673077,0.25,0.396694,0.383929,0.37037,0.785047,0.773723
200,0.8951,0.668793,0.777328,0.781503,0.807692,0.833333,0.913793,0.8,0.657895,0.494845,0.776978,0.871795,0.877193
300,0.5523,0.421013,0.84413,0.844547,0.915254,0.846154,0.907692,0.842105,0.765217,0.672,0.83871,0.913793,0.9
400,0.3272,0.582588,0.831984,0.83487,0.899225,0.88,0.920635,0.790323,0.707965,0.661157,0.876033,0.867925,0.910569
500,0.1989,0.509731,0.864372,0.869306,0.958678,0.928571,0.941176,0.875,0.722222,0.738462,0.859649,0.866667,0.933333
600,0.077,0.7017,0.864372,0.871615,0.932203,0.962963,0.920635,0.870968,0.74,0.713178,0.9,0.912281,0.892308
700,0.0392,0.907628,0.850202,0.859454,0.957265,0.962963,0.902256,0.848,0.711538,0.706767,0.871795,0.878505,0.896
800,0.0217,0.806921,0.878543,0.885149,0.93913,0.962963,0.95082,0.910569,0.72381,0.742424,0.868852,0.942149,0.92562
900,0.0043,0.857961,0.878543,0.885494,0.948276,0.962963,0.958678,0.909091,0.72381,0.736842,0.870968,0.942149,0.916667
1000,0.0001,0.841355,0.892713,0.89916,0.948276,0.962963,0.958678,0.901639,0.767857,0.75,0.912,0.974359,0.916667


Fold 3/5


Map: 100%|██████████| 1979/1979 [00:00<00:00, 2499.78 examples/s]
Map: 100%|██████████| 497/497 [00:00<00:00, 2457.24 examples/s]
Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 32,846,217 || all params: 527,284,882 || trainable%: 6.2293


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Expert,F1 Novice,F1 Gemini,F1 Gpt4,F1 Llama31405b,F1 Llama318b,F1 Mistral,F1 Phi3,F1 Sonnet
100,2.2621,1.804829,0.325956,0.252573,0.56,0.0,0.55914,0.086957,0.26,0.078947,0.364532,0.031746,0.331839
200,1.0942,0.698069,0.762575,0.768353,0.833333,0.918919,0.883721,0.695652,0.79646,0.527473,0.738739,0.701987,0.818898
300,0.5343,0.587632,0.800805,0.811666,0.882883,0.918919,0.914729,0.867257,0.702128,0.701493,0.697987,0.774775,0.844828
400,0.3052,0.582063,0.841046,0.848981,0.910569,0.944444,0.921739,0.814286,0.777778,0.741379,0.809524,0.824561,0.896552
500,0.1939,0.624616,0.828974,0.83368,0.904348,0.944444,0.95935,0.80303,0.77037,0.597938,0.827586,0.814159,0.88189
600,0.0828,0.749373,0.855131,0.863377,0.983051,1.0,0.967213,0.826087,0.770642,0.686869,0.80916,0.857143,0.870229
700,0.0564,0.795278,0.853119,0.864516,0.95,1.0,0.899225,0.866667,0.811321,0.752,0.760331,0.87931,0.861789
800,0.0164,0.781912,0.869215,0.878279,0.915254,0.971429,0.958678,0.892562,0.872727,0.766917,0.786325,0.87395,0.866667
900,0.002,0.732779,0.867203,0.872618,0.910569,0.944444,0.935484,0.887097,0.840336,0.77686,0.817391,0.862069,0.87931
1000,0.0002,0.716751,0.877264,0.884777,0.92562,0.971429,0.95082,0.885246,0.854701,0.784,0.830508,0.881356,0.87931


Fold 4/5


Map: 100%|██████████| 1981/1981 [00:00<00:00, 2528.71 examples/s]
Map: 100%|██████████| 495/495 [00:00<00:00, 2678.14 examples/s]
Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 32,846,217 || all params: 527,284,882 || trainable%: 6.2293


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Expert,F1 Novice,F1 Gemini,F1 Gpt4,F1 Llama31405b,F1 Llama318b,F1 Mistral,F1 Phi3,F1 Sonnet
100,2.3081,1.97702,0.19596,0.109279,0.388571,0.0,0.149254,0.0,0.0,0.167539,0.0,0.0,0.278146
200,1.3239,0.696683,0.783838,0.784123,0.686869,0.814815,0.910569,0.829268,0.763889,0.647619,0.844037,0.746479,0.813559
300,0.5921,0.451712,0.842424,0.842811,0.857143,0.846154,0.919355,0.864407,0.82243,0.740157,0.822581,0.870968,0.842105
400,0.3338,0.467679,0.872727,0.879939,0.92562,0.967742,0.932203,0.866667,0.839286,0.754386,0.848485,0.894309,0.890756
500,0.1845,0.640544,0.858586,0.867007,0.890756,0.965517,0.928,0.846154,0.882883,0.75,0.852459,0.849057,0.838235
600,0.0744,0.661983,0.89697,0.90187,0.903226,0.967742,0.915254,0.935484,0.885246,0.785047,0.834646,0.957983,0.932203
700,0.0351,0.617238,0.89899,0.904776,0.912,0.965517,0.958678,0.926829,0.859813,0.787402,0.859504,0.957983,0.915254
800,0.0135,0.584258,0.915152,0.922101,0.920635,1.0,0.95,0.918033,0.910714,0.842975,0.864407,0.95,0.942149
900,0.0011,0.606306,0.913131,0.920688,0.926829,1.0,0.95,0.910569,0.920354,0.836066,0.868852,0.949153,0.92437
1000,0.0004,0.602754,0.917172,0.924246,0.92562,1.0,0.95,0.894309,0.921739,0.85,0.885246,0.949153,0.942149


Fold 5/5


Map: 100%|██████████| 1980/1980 [00:00<00:00, 2634.06 examples/s]
Map: 100%|██████████| 496/496 [00:00<00:00, 2549.95 examples/s]
Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 32,846,217 || all params: 527,284,882 || trainable%: 6.2293


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Expert,F1 Novice,F1 Gemini,F1 Gpt4,F1 Llama31405b,F1 Llama318b,F1 Mistral,F1 Phi3,F1 Sonnet
100,2.1005,1.347003,0.508065,0.50213,0.707483,0.608696,0.788462,0.121212,0.45614,0.298137,0.41791,0.505747,0.615385
200,0.9012,0.827824,0.731855,0.721331,0.825397,0.8,0.888889,0.641975,0.676692,0.27027,0.777778,0.818898,0.792079
300,0.5495,0.609078,0.8125,0.819175,0.933333,0.888889,0.897638,0.815385,0.730435,0.671533,0.788462,0.803922,0.842975
400,0.3273,0.524433,0.836694,0.846858,0.933333,0.941176,0.929825,0.817518,0.723077,0.66087,0.844828,0.852174,0.918919
500,0.1775,0.750772,0.834677,0.845351,0.921875,0.967742,0.93913,0.828125,0.742857,0.678261,0.810811,0.8,0.919355
600,0.0938,0.664916,0.862903,0.870913,0.926829,0.967742,0.918033,0.883721,0.803419,0.705882,0.857143,0.882883,0.892562
700,0.0388,0.781856,0.856855,0.860284,0.931034,0.909091,0.933333,0.87395,0.776119,0.660377,0.854962,0.920354,0.883333
800,0.0231,0.746349,0.868952,0.875221,0.905983,0.941176,0.90625,0.857143,0.8,0.770492,0.871795,0.875,0.949153
900,0.0036,0.720428,0.872984,0.878578,0.92562,0.9375,0.910569,0.857143,0.816,0.770492,0.888889,0.859813,0.941176
1000,0.0013,0.706372,0.870968,0.874444,0.916667,0.909091,0.912,0.873016,0.803279,0.768,0.886957,0.859813,0.941176


In [28]:
del trainer, model, base_model

In [29]:
tokenizer.sep_token

In [30]:
# cuda cache clear
import torch
torch.cuda.empty_cache()

## Get OOF

In [31]:

# Data Processing Functions
def load_test_data(test_data_path='///mnt/c/Personal/Competitions/BEA_2025/data/mrbench_v3_testset.json'):
    """
    Load development and (optionally) test datasets
    """    
    # Load test data if provided
    test_examples = []
    test_data = None
    if test_data_path:
        with open(test_data_path, 'r',encoding="utf-8") as f:
            test_data = json.load(f)
        
        for dialogue in test_data:
            conversation_id = dialogue["conversation_id"]
            conversation_history = dialogue["conversation_history"]
            
            for tutor_id, tutor_data in dialogue["tutor_responses"].items():
                test_examples.append({
                    "conversation_id": conversation_id,
                    "conversation_history": conversation_history,
                    "tutor_response": tutor_data["response"],
                    "tutor_id": tutor_id
                })
    
    return test_examples, test_data

In [32]:
test_examples,test_data = load_test_data()
test = pd.DataFrame(test_examples)
test.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)
test.head()

Unnamed: 0,conversation_id,Question,Response,tutor_id
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5


In [33]:
rows = []
for i, row in test.iterrows():
    question = row.Question
    response = row.Response
    convid = row.conversation_id
    context = test[(test.conversation_id==convid)&(test.Response!=response)].Response.values
    context = ' [SEP] '.join(context)
    test.loc[i, 'context'] = context

In [34]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd

test['input'] = "Question: " + test['Question'] + '; Answer: ' + test['Response'] + '; Context: ' + train["context"]
test_ds = Dataset.from_pandas(test)
test_tokenized_ds = test_ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 1547/1547 [00:00<00:00, 2387.45 examples/s]


In [35]:
target_cols = [f'target_{i}' for i in range(len(id2label))]

In [36]:
CKPTS =  ["checkpoint-700", "checkpoint-1000", "checkpoint-1200", "checkpoint-1100", "checkpoint-900"]
final_preds = []
for fold, ckpt in enumerate(CKPTS):
    print("-------------------------------------------------------")
    print(f"Fold {fold}")
    print("-------------------------------------------------------")
    
    base_model = Qwen2ForSequenceClassificationPlus.from_pretrained(
        TARGET_MODEL,
        num_labels=len(id2label),
        id2label=id2label,          # Add this line
        label2id=label2id,          # Add this line
        device_map={"":0},
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    base_model.config.pad_token_id = tokenizer.pad_token_id
    
    model = PeftModel.from_pretrained(base_model, f"///mnt/c/Personal/Competitions/BEA_2025/Qwen25_0.5/outputs/fold{fold}/{ckpt}")
    
    trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
    # valid
    valid_df = train[train["fold"] == fold]
    idxs = valid_df.index
    valid_ds = Dataset.from_pandas(valid_df)
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True)
    
    pred_output = trainer.predict(valid_tokenized_ds)
    logits = pred_output.predictions
    probas = softmax(logits)
    train.loc[idxs, target_cols] = probas
    
    # test
    
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    probas = softmax(logits)
    final_preds.append(probas)
    

-------------------------------------------------------
Fold 0
-------------------------------------------------------


Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 494/494 [00:00<00:00, 2689.16 examples/s]


-------------------------------------------------------
Fold 1
-------------------------------------------------------


Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 494/494 [00:00<00:00, 3179.43 examples/s]


-------------------------------------------------------
Fold 2
-------------------------------------------------------


Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 497/497 [00:00<00:00, 2517.17 examples/s]


-------------------------------------------------------
Fold 3
-------------------------------------------------------


Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 495/495 [00:00<00:00, 2679.86 examples/s]


-------------------------------------------------------
Fold 4
-------------------------------------------------------


Some weights of Qwen2ForSequenceClassificationPlus were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.1.bias', 'score.1.weight', 'score.4.bias', 'score.4.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 496/496 [00:00<00:00, 2569.33 examples/s]


In [37]:
test[target_cols] = np.mean(final_preds, 0)

In [38]:
test['pred'] = test[target_cols].idxmax(axis=1).apply(lambda x: x.split("_")[1])
test['pred'] = test['pred'].astype(int)
test['pred']

0       2
1       3
2       0
3       4
4       7
       ..
1542    8
1543    7
1544    3
1545    2
1546    6
Name: pred, Length: 1547, dtype: int64

In [39]:
pred_labels = test['pred'].values
pred_labels

array([2, 3, 0, ..., 3, 2, 6], shape=(1547,))

In [41]:
test.to_csv('///mnt/c/Personal/Competitions/BEA_2025/Qwen25_0.5/outputs/test_probas.csv',index=
            False)

In [42]:
# Create submission file
submission = []
unique_conversation_ids = list(ex["conversation_id"] for ex in test_examples)

for conversation_id in unique_conversation_ids:
    conversation_data = next(d for d in test_data if d["conversation_id"] == conversation_id)
    submission_item = {
        "conversation_id": conversation_id,
        "conversation_history": conversation_data["conversation_history"],
        "tutor_responses": {}
    }
        
    for tutor_id, tutor_data in conversation_data["tutor_responses"].items():
        # Find the corresponding prediction
        idx = next(i for i, ex in enumerate(test_examples) 
                    if ex["conversation_id"] == conversation_id and ex["tutor_id"] == tutor_id)
        
        predicted_class = id2label[pred_labels[idx]]
        
        submission_item["tutor_responses"][tutor_id] = {
            "response": tutor_data["response"],
            "annotation": {
                "Tutor_Identification": predicted_class
            }
        }
    
    submission.append(submission_item)

In [4]:
submission

In [43]:
with open(os.path.join("///mnt/c/Personal/Competitions/BEA_2025/Qwen25_0.5/outputs", "predictions.json"), "w") as f:
    json.dump(submission, f, indent=2)