In [1]:
# !pip install -U transformers accelerate peft bitsandbytes -q

In [2]:
!nvidia-smi

Wed Apr 23 17:47:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.02              Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0  On |                  Off |
|  0%   41C    P8              8W /  450W |     638MiB /  24564MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
TARGET_MODEL = "HuggingFaceH4/zephyr-7b-alpha"

DEBUG = False

In [4]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch
import numpy  as np
import random
import os
import json

os.environ['HF_HOME '] = '///mnt/c/Personal/Competitions/HFCache'

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(42)

In [6]:
# Define the tutor classes
TUTOR_CLASSES = [
    "Expert",
    "Novice",
    "Gemini",
    "GPT4",
    "Llama31405B",
    "Llama318B",
    "Mistral",
    "Phi3",
    "Sonnet"
]

# Create label mappings
id2label = {i: label for i, label in enumerate(TUTOR_CLASSES)}
label2id = {v: k for k, v in id2label.items()}
print(id2label, label2id)

MAX_LEN = 512

{0: 'Expert', 1: 'Novice', 2: 'Gemini', 3: 'GPT4', 4: 'Llama31405B', 5: 'Llama318B', 6: 'Mistral', 7: 'Phi3', 8: 'Sonnet'} {'Expert': 0, 'Novice': 1, 'Gemini': 2, 'GPT4': 3, 'Llama31405B': 4, 'Llama318B': 5, 'Mistral': 6, 'Phi3': 7, 'Sonnet': 8}


In [7]:

# Data Processing Functions
def load_data(dev_data_path='///mnt/c/Personal/Competitions/BEA_2025/data/mrbench_v3_devset.json'):
    """
    Load development and (optionally) test datasets
    """
    # Load development data
    with open(dev_data_path, 'r') as f:
        dev_data = json.load(f)
    
    # Process development data
    dev_examples = []
    for dialogue in dev_data:
        conversation_id = dialogue["conversation_id"]
        conversation_history = dialogue["conversation_history"]
        
        for tutor_id, tutor_data in dialogue["tutor_responses"].items():
            if tutor_id in TUTOR_CLASSES or any(cls_name in tutor_id for cls_name in TUTOR_CLASSES):
                # Map the tutor_id to one of our classes
                tutor_class = next((cls for cls in TUTOR_CLASSES if cls in tutor_id), tutor_id)
                
                dev_examples.append({
                    "conversation_id": conversation_id,
                    "conversation_history": conversation_history,
                    "tutor_response": tutor_data["response"],
                    "tutor_class": tutor_class
                })
    

    return dev_examples

In [8]:
import pandas as pd
train = pd.DataFrame(load_data())
train['target'] = train['tutor_class'].map(label2id)
train.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)

In [9]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6


In [10]:
train.shape

(2476, 5)

In [11]:
rows = []
for i, row in train.iterrows():
    question = row.Question
    response = row.Response
    convid = row.conversation_id
    context = train[(train.conversation_id==convid)&(train.Response!=response)].Response.values
    context = ' [SEP] '.join(context)
    train.loc[i, 'context'] = context

In [12]:
train.shape

(2476, 6)

In [13]:
fold_map = pd.read_csv('///mnt/c/Personal/Competitions/BEA_2025/debetav3_context_multisampleDropout/oofs.csv')
fold_map.head(2)
fold_map.shape

(2476, 17)

In [14]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target,context
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of..."
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of..."
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of..."
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of..."


In [15]:
fold_map.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target,context,fold,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,pred
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...,0,0.00043,0.000975,0.004653,0.00099,0.00167,0.00336,0.006057,0.001408,0.980455,8
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of...",0,0.000411,0.000347,0.000558,0.000706,0.001815,0.994394,0.000382,0.000257,0.001131,5
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of...",0,0.000498,0.000256,0.0003,0.000437,0.992729,0.004073,0.000793,0.000214,0.0007,4
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of...",0,0.004078,0.000667,0.001449,0.983959,0.001099,0.005142,0.001323,0.001366,0.000917,3
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of...",0,0.000712,0.000428,0.001301,0.000389,0.000633,0.000496,0.994973,0.000704,0.000364,6


In [16]:
train = pd.merge(train,fold_map[['conversation_id','fold','tutor_class']],on=['conversation_id','tutor_class'], how='left')
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target,context,fold
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...,0
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of...",0
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of...",0
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of...",0
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of...",0


In [17]:
train['fold'].value_counts()

fold
2    497
4    496
3    495
0    494
1    494
Name: count, dtype: int64

In [18]:
print(train.shape)

(2476, 7)


In [19]:
# from sklearn.model_selection import GroupKFold
# folds = GroupKFold(n_splits=5, shuffle=False)
# train['fold'] = -1
# for i,(train_index, test_index) in enumerate(folds.split(train,train['target'], groups=train['Question'])): 
#     train.loc[test_index,'fold'] = i

In [20]:
train['input'] = "Question: " + train['Question'] + '; Answer: ' + train['Response'] #+ '; Context: ' + train["context"]

In [21]:
train = train.rename(columns={'target': 'label'})

In [22]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,label,context,fold,input
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...,0,"Question: Tutor: Hi, could you please provide ..."
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."


In [23]:
# load model with 4bit bnb

from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch


In [24]:
from transformers import AutoTokenizer, LlamaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [25]:
tokenizer.eos_token

'</s>'

In [26]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        
        all_param += param.numel()
        if param.requires_grad:
            print(_)
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [27]:
def preprocess_function(examples, max_length=MAX_LEN):
    return tokenizer(examples["input"], truncation=True, max_length=max_length, padding=True)
    # return tokenizer(examples["input"], 
    #                 #  text_target = examples['label'],
    #                  truncation=True, 
    #                  max_length=max_length, 
    #                  padding="max_length",)

In [28]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [29]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average="macro")
    
    # Add per-class F1 scores
    f1_per_class = f1.compute(predictions=predictions, references=labels, average=None)
    per_class_scores = {f"f1_{id2label[i]}": score for i, score in enumerate(f1_per_class["f1"])}
    
    metrics = {
        "accuracy": accuracy_score["accuracy"],
        "f1_macro": f1_score["f1"],
        **per_class_scores
    }
    
    return metrics

In [30]:
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

In [31]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset

In [32]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,label,context,fold,input
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...,0,"Question: Tutor: Hi, could you please provide ..."
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of...",0,"Question: Tutor: Hi, could you please provide ..."


In [33]:

# steps = 5 if DEBUG else 50

# for fold in range(5):
#     print(f'Fold {fold+1}/5')

#     valid_df = train[train["fold"] == fold].reset_index(drop=True)
#     train_df = train[train["fold"] != fold].reset_index(drop=True)
    
#     # from pandas
#     train_ds = Dataset.from_pandas(train_df)
#     valid_ds = Dataset.from_pandas(valid_df)
#     remove = ['conversation_id', 'Question', 'Response', 'tutor_class','context', 'fold']
#     train_tokenized_ds = train_ds.map(preprocess_function, batched=True,remove_columns=remove)
#     valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True,remove_columns=remove)
    
#     # Model
    
#     peft_config = LoraConfig(
#         r=16,
#         lora_alpha=32,
#         lora_dropout=0.1,
#         bias="none",
#         task_type=TaskType.SEQ_CLS,
#         inference_mode=False,
#         target_modules=[
#             "q_proj",
#             "v_proj"
#         ],
#     )

#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_compute_dtype=torch.bfloat16
#     )
    
#     base_model = LlamaForSequenceClassification.from_pretrained(
#         TARGET_MODEL,
#         num_labels=len(id2label),
#         quantization_config=bnb_config,
#         id2label=id2label,          # Add this line
#         label2id=label2id,          # Add this line
#         device_map={"":0},
#         # classifier_dropout = 0.4,
#     )
#     base_model.config.pretraining_tp = 1 # 1 is 7b
#     base_model.config.pad_token_id = tokenizer.pad_token_id
    
#     model = get_peft_model(base_model, peft_config)
    
#     training_args = TrainingArguments(
#         output_dir=f"///mnt/c/Personal/Competitions/BEA_2025/Zephyr/outputs/fold{fold}",
#         learning_rate=1e-4,
#         per_device_train_batch_size=4,
#         per_device_eval_batch_size=4,
#         gradient_accumulation_steps=4,
#         max_grad_norm= 0.5,#0.3,
#         optim='paged_adamw_32bit',
#         lr_scheduler_type="cosine",
#         num_train_epochs=10,
#         weight_decay=0.0001,
#         save_total_limit=1,
#         eval_strategy="steps",
#         save_strategy="steps",
#         save_steps = 50,
#         eval_steps = 50,
#         logging_steps= 50,
#         load_best_model_at_end=True,
#         push_to_hub=False,
#         warmup_steps=10,
#         report_to='none', # if DEBUG else 'wandb',
#         metric_for_best_model="f1_macro",
#         greater_is_better=True,
#         overwrite_output_dir=True,
#     )
    
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_tokenized_ds,
#         eval_dataset=valid_tokenized_ds,
#         tokenizer=tokenizer,
#         data_collator=data_collator,
#         compute_metrics=compute_metrics,
#         # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)
#     )
    
#     trainer.train()
#     # validation 
#     pred_output = trainer.predict(valid_tokenized_ds)
#     logits = pred_output.predictions
#     probas = softmax(logits)
#     np.save(f'///mnt/c/Personal/Competitions/BEA_2025/Zephyr/outputs/fold{fold}.npy', probas)

#     del trainer, model, base_model
#     import torch
#     torch.cuda.empty_cache()

In [34]:
import torch
torch.cuda.empty_cache()

In [35]:
# cuda cache clear
import torch
torch.cuda.empty_cache()

## Get OOF

In [36]:

# Data Processing Functions
def load_test_data(test_data_path='///mnt/c/Personal/Competitions/BEA_2025/data/mrbench_v3_testset.json'):
    """
    Load development and (optionally) test datasets
    """    
    # Load test data if provided
    test_examples = []
    test_data = None
    if test_data_path:
        with open(test_data_path, 'r',encoding="utf-8") as f:
            test_data = json.load(f)
        
        for dialogue in test_data:
            conversation_id = dialogue["conversation_id"]
            conversation_history = dialogue["conversation_history"]
            
            for tutor_id, tutor_data in dialogue["tutor_responses"].items():
                test_examples.append({
                    "conversation_id": conversation_id,
                    "conversation_history": conversation_history,
                    "tutor_response": tutor_data["response"],
                    "tutor_id": tutor_id
                })
    
    return test_examples, test_data

In [37]:
test_examples,test_data = load_test_data()
test = pd.DataFrame(test_examples)
test.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)
test.head()


# import pandas as pd
# train = pd.DataFrame(load_data())
# train['target'] = train['tutor_class'].map(label2id)
# train.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)

Unnamed: 0,conversation_id,Question,Response,tutor_id
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5


In [38]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd

test['input'] = "Question: " + test['Question'] + '; Answer: ' + test['Response'] #+ '; Context: ' + train["context"]
test_ds = Dataset.from_pandas(test)
test_tokenized_ds = test_ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 1547/1547 [00:01<00:00, 1002.87 examples/s]


In [39]:
target_cols = [f'target_{i}' for i in range(len(id2label))]

In [40]:
CKPTS =  ["checkpoint-800", "checkpoint-1050", "checkpoint-650", "checkpoint-750", "checkpoint-1000"]
final_preds = []
for fold, ckpt in enumerate(CKPTS):
    print("-------------------------------------------------------")
    print(f"Fold {fold}")
    print("-------------------------------------------------------")
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    base_model = LlamaForSequenceClassification.from_pretrained(
        TARGET_MODEL,
        num_labels=len(id2label),
        quantization_config=bnb_config,
        device_map={"":0}
    )
    base_model.config.pretraining_tp = 1 # 1 is 7b
    base_model.config.pad_token_id = tokenizer.pad_token_id
    
    model = PeftModel.from_pretrained(base_model, f"///mnt/c/Personal/Competitions/BEA_2025/Zephyr/outputs/fold{fold}/{ckpt}")
    
    trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
    # valid
    valid_df = train[train["fold"] == fold]
    idxs = valid_df.index
    valid_ds = Dataset.from_pandas(valid_df)
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True)
    
    pred_output = trainer.predict(valid_tokenized_ds)
    logits = pred_output.predictions
    probas = softmax(logits)
    train.loc[idxs, target_cols] = probas
    
    # test
    
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    probas = softmax(logits)
    final_preds.append(probas)

    del model
    del trainer
    del base_model
    

-------------------------------------------------------
Fold 0
-------------------------------------------------------


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to

-------------------------------------------------------
Fold 1
-------------------------------------------------------


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.02s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-alpha and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 494/494 [00:00<00:00, 1317.62 examples/s]


-------------------------------------------------------
Fold 2
-------------------------------------------------------


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.04it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-alpha and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 497/497 [00:00<00:00, 1335.71 examples/s]


-------------------------------------------------------
Fold 3
-------------------------------------------------------


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.05it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-alpha and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 495/495 [00:00<00:00, 1315.14 examples/s]


-------------------------------------------------------
Fold 4
-------------------------------------------------------


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.06it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-alpha and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Map: 100%|██████████| 496/496 [00:00<00:00, 1253.78 examples/s]


In [41]:
test[target_cols] = np.mean(final_preds, 0)

In [42]:
test[target_cols]

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8
0,0.0,0.000000e+00,1.000000e+00,0.000000,0.000000e+00,0.000000e+00,1.177192e-04,0.000000e+00,0.000000
1,0.0,0.000000e+00,0.000000e+00,1.000000,0.000000e+00,5.960464e-08,0.000000e+00,0.000000e+00,0.000000
2,1.0,0.000000e+00,1.072884e-06,0.000000,0.000000e+00,9.191036e-05,1.561642e-05,0.000000e+00,0.000000
3,0.0,0.000000e+00,0.000000e+00,0.000000,1.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
4,0.0,5.960464e-08,0.000000e+00,0.000001,0.000000e+00,7.152557e-07,9.536743e-06,1.000000e+00,0.000117
...,...,...,...,...,...,...,...,...,...
1542,0.0,0.000000e+00,0.000000e+00,0.000000,2.384186e-07,1.829863e-05,1.430511e-06,5.364418e-07,1.000000
1543,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000
1544,0.0,0.000000e+00,5.960464e-08,1.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
1545,0.0,0.000000e+00,1.000000e+00,0.000000,0.000000e+00,0.000000e+00,2.384186e-07,0.000000e+00,0.000000


In [43]:
test['pred'] = test[target_cols].idxmax(axis=1).apply(lambda x: x.split("_")[1])
test['pred'] = test['pred'].astype(int)
test['pred']

0       2
1       3
2       0
3       4
4       7
       ..
1542    8
1543    7
1544    3
1545    2
1546    6
Name: pred, Length: 1547, dtype: int64

In [44]:
test.to_csv('///mnt/c/Personal/Competitions/BEA_2025/Zephyr/outputs/test_probas.csv',index=False)

In [None]:
pred_labels = test['pred'].values
pred_labels

In [None]:
len(test_examples)

In [48]:
# Create submission file
submission = []
unique_conversation_ids = list(ex["conversation_id"] for ex in test_examples)

for conversation_id in unique_conversation_ids:
    conversation_data = next(d for d in test_data if d["conversation_id"] == conversation_id)
    submission_item = {
        "conversation_id": conversation_id,
        "conversation_history": conversation_data["conversation_history"],
        "tutor_responses": {}
    }
        
    for tutor_id, tutor_data in conversation_data["tutor_responses"].items():
        # Find the corresponding prediction
        idx = next(i for i, ex in enumerate(test_examples) 
                    if ex["conversation_id"] == conversation_id and ex["tutor_id"] == tutor_id)
        
        predicted_class = id2label[pred_labels[idx]]
        
        submission_item["tutor_responses"][tutor_id] = {
            "response": tutor_data["response"],
            "annotation": {
                "Tutor_Identification": predicted_class
            }
        }
    
    submission.append(submission_item)

In [None]:
submission

In [50]:
with open(os.path.join("///mnt/c/Personal/Competitions/BEA_2025/Zephyr/outputs", "predictions.json"), "w") as f:
    json.dump(submission, f, indent=2)

In [None]:
## load