In [1]:
!nvidia-smi

Wed Apr 23 17:41:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.02              Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0  On |                  Off |
|  0%   39C    P8              9W /  450W |     640MiB /  24564MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Imports

In [2]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from torch.utils.data import Dataset,DataLoader
from torch.optim import AdamW
import json
from sklearn import metrics
from torch.nn import functional as F
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)

from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Configs

In [3]:
# Central experiment configuration. Later cells read (and extend) this dict as a
# module-level global, so key names must stay stable.
cfg = {
    "model_name": "microsoft/deberta-v3-base",
    "max_len": 2048,   # tokenizer truncation length used by the Dataset
    "freeze": False,   # True freezes every backbone parameter in Model
    # Train Configs
    "fold_num": 5,
    "val_fold": 0,
    "learning_rate": 2e-05,
    "min_lr": 1e-7,
    "T_max": 500,
    "valid_batch_size": 32,
    "train_batch_size": 16,

    "epochs": 25,  # full training run; lower to 1 for a quick demo / smoke test
    # NOTE(review): "accumulation_steps" is not read by the cells visible here;
    # the training loop uses "n_accumulate" below.
    "accumulation_steps": 1,
    "val_steps": 375,
    "n_accumulate": 2,  # micro-batches accumulated per optimizer step

    # GPU Optimize Settings
    "scheduler": 'cosine',
    "warmup_epochs": 1,

    "gradient_checkpoint": False,
    # Keep this checkpoint name in sync with "model_name" above.
    "tokenizer": AutoTokenizer.from_pretrained("microsoft/deberta-v3-base"),

    # Path
    "output": "///mnt/c/Personal/Competitions/BEA_2025/debetav3_base_context_multisampleDropout",
    "seed": 42,
}

# parents=True so a machine without the intermediate directories does not fail here.
Path(cfg['output']).mkdir(parents=True, exist_ok=True)


## Utils

In [4]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode so repeated runs give
    the same results.

    Args:
        seed: Integer seed applied to every generator.
    '''
    # Python's own RNG was previously left unseeded even though `random` is imported.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Exported for child processes; the hash seed of the already-running
    # interpreter cannot be changed from here.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the configured seed once, right after the helper is defined.
set_seed(cfg['seed'])

In [5]:
def optimizer_scheduler(model):
    """Create the AdamW optimizer for ``model``.

    Trainable parameters are split into two groups: parameters whose name
    contains "bias" or "LayerNorm.weight" get no weight decay, all others use
    a weight decay of 0.003. Parameters with ``requires_grad == False`` are
    left out. Despite the name, only the optimizer is returned.

    Returns:
        An ``AdamW`` instance using ``cfg['learning_rate']``.
    """
    no_decay_markers = ("bias", "LayerNorm.weight")
    decayed, undecayed = [], []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        is_exempt = any(marker in name for marker in no_decay_markers)
        (undecayed if is_exempt else decayed).append(param)

    param_groups = [
        {"params": decayed, "weight_decay": 0.003},
        {"params": undecayed, "weight_decay": 0.0},
    ]
    return AdamW(param_groups, lr=cfg['learning_rate'])

In [6]:
import matplotlib.pyplot as plt
def plot_history(history):
    """Draw a two-panel figure from a training history mapping.

    The upper panel shows ``history["Train Loss"]`` and
    ``history["Valid Loss"]``; the lower panel shows ``history["Macro F1"]``
    and ``history["Accuracy"]``. Each value is plotted against its position
    in the sequence (one point per recorded epoch).
    """
    # (subplot position, history keys, title, y-label, legend labels, legend location)
    panels = (
        (1, ("Train Loss", "Valid Loss"), 'Loss', 'loss', ['train', 'valid'], 'upper left'),
        (2, ("Macro F1", "Accuracy"), 'Metrics', 'score', ['Macro F1', 'Accuracy'], 'lower right'),
    )

    plt.figure(figsize=(20, 12))
    for position, keys, title, y_label, legend_labels, legend_loc in panels:
        plt.subplot(2, 1, position)
        for key in keys:
            plt.plot(history[key])
        plt.title(title)
        plt.xlabel('epochs')
        plt.ylabel(y_label)
        plt.legend(legend_labels, loc=legend_loc)

    plt.tight_layout()
    plt.show()

In [7]:
# Tutor identities to predict; the position in this list is the class id.
TUTOR_CLASSES = [
    "Expert", "Novice", "Gemini", "GPT4", "Llama31405B",
    "Llama318B", "Mistral", "Phi3", "Sonnet",
]

# Two-way lookup between class ids and class names.
id2label = dict(enumerate(TUTOR_CLASSES))
label2id = {label: idx for idx, label in enumerate(TUTOR_CLASSES)}
print(id2label, label2id)

{0: 'Expert', 1: 'Novice', 2: 'Gemini', 3: 'GPT4', 4: 'Llama31405B', 5: 'Llama318B', 6: 'Mistral', 7: 'Phi3', 8: 'Sonnet'} {'Expert': 0, 'Novice': 1, 'Gemini': 2, 'GPT4': 3, 'Llama31405B': 4, 'Llama318B': 5, 'Mistral': 6, 'Phi3': 7, 'Sonnet': 8}


In [8]:
def load_data(dev_data_path='///mnt/c/Personal/Competitions/BEA_2025/data/mrbench_v3_devset.json'):
    """
    Load the development set and flatten it to one example per tutor response.

    Args:
        dev_data_path: Path to the development JSON file. It is read as a list
            of dialogues, each with "conversation_id", "conversation_history"
            and a "tutor_responses" mapping of tutor id -> {"response": ...}.

    Returns:
        A list of dicts with keys "conversation_id", "conversation_history",
        "tutor_response" and "tutor_class". Responses whose tutor id contains
        none of the names in TUTOR_CLASSES are skipped.
    """
    # Explicit encoding, consistent with load_test_data.
    with open(dev_data_path, 'r', encoding="utf-8") as f:
        dev_data = json.load(f)

    dev_examples = []
    for dialogue in dev_data:
        conversation_id = dialogue["conversation_id"]
        conversation_history = dialogue["conversation_history"]

        for tutor_id, tutor_data in dialogue["tutor_responses"].items():
            # First class name contained in the tutor id (an exact match is
            # also a substring match, so one test covers both cases).
            tutor_class = next((cls for cls in TUTOR_CLASSES if cls in tutor_id), None)
            if tutor_class is None:
                continue

            dev_examples.append({
                "conversation_id": conversation_id,
                "conversation_history": conversation_history,
                "tutor_response": tutor_data["response"],
                "tutor_class": tutor_class
            })
    return dev_examples

## Dataset

In [9]:
# Build the training frame: add the integer class id as `target`, then rename the
# text columns to the `Question` / `Response` names the Dataset expects.
train = (
    pd.DataFrame(load_data())
    .assign(target=lambda frame: frame['tutor_class'].map(label2id))
    .rename(columns={'conversation_history': 'Question', 'tutor_response': 'Response'})
)

In [10]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6


In [11]:
# For every row, build `context`: the other responses given in the same
# conversation, joined with ' [SEP] '. Responses whose text equals the row's
# own response are left out. Working per conversation group avoids filtering
# the whole frame once per row.
context_by_row = {}
for _, conversation in train.groupby('conversation_id', sort=False):
    responses = conversation['Response']
    for row_label, response in responses.items():
        others = responses[responses != response].values
        context_by_row[row_label] = ' [SEP] '.join(others)

# Assign the whole column at once; values align on the frame's index labels.
train['context'] = pd.Series(context_by_row, dtype=object)

In [12]:
train['Question'].nunique()

294

In [13]:
# Name of the label column.
# NOTE(review): not referenced by the cells visible in this section — confirm it is still needed.
TARGETS = ["target"]

In [14]:
class Dataset(torch.utils.data.Dataset):
    """Tokenised view of a frame with Question / Response / context columns.

    Each sample's text is
    ``Question + sep + '[R_STRAT]' + Response + '[R_END]' + sep + context``.
    Items are dicts of plain Python lists (no padding); batching and padding
    are left to the collate function.

    The base class is spelled ``torch.utils.data.Dataset`` on purpose: this
    class reuses the imported name, so ``class Dataset(Dataset)`` would inherit
    from the previous definition every time the cell is re-run.

    Args:
        df: Frame with 'Question', 'Response', 'context' (and 'target' when
            ``is_train`` is true).
        is_train: When true, every item also carries its 'target'.
        use_aug: Stored only; no augmentation is applied by this class.
    """

    def __init__(self, df, is_train=True, use_aug=False):
        self.use_aug = use_aug
        self.df = df
        self.tokenizer = cfg['tokenizer']
        self.sep_token = self.tokenizer.sep_token
        self.max_len = cfg["max_len"]
        # Looked up once here instead of once per token in __getitem__.
        self.r_end_id = self.tokenizer.convert_tokens_to_ids('[R_END]')
        self.text = (
            self.df['Question'] + self.sep_token
            + '[R_STRAT]' + self.df['Response'] + '[R_END]'
            + self.sep_token + self.df['context']
        ).values

        self.is_train = is_train
        if is_train:
            self.targets = df['target'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return input_ids, attention_mask, R_mask (and target when training)."""
        inputs = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len
        )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        feature_dict = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        if self.is_train:
            feature_dict['target'] = self.targets[index]
        feature_dict['R_mask'] = self._response_mask(input_ids, attention_mask)
        return feature_dict

    def _response_mask(self, input_ids, attention_mask):
        """Copy of the attention mask with everything after '[R_END]' zeroed.

        If '[R_END]' is absent (e.g. truncated away) the full attention mask is
        returned; an all-zero mask would leave nothing to pool over.
        """
        try:
            stop_idx = list(input_ids).index(self.r_end_id) + 1
        except ValueError:
            return list(attention_mask)
        return list(attention_mask[:stop_idx]) + [0] * (len(attention_mask) - stop_idx)

In [15]:
# Register the response-delimiter tags with the tokenizer as special tokens;
# Dataset.__getitem__ looks up the id of '[R_END]' to build R_mask.
# The spelling '[R_STRAT]' must stay as is: Dataset builds its text with exactly this string.
cfg["tokenizer"].add_tokens(['[R_STRAT]', '[R_END]'], special_tokens=True)

2

## Model

In [16]:
def _freeze_encoder_layers(module, layer_indices):
    """Turn off gradients for the given entries of ``module.encoder.layer``."""
    for i in layer_indices:
        for p in module.encoder.layer[i].parameters():
            p.requires_grad = False


def _half_split(module):
    """Index of the first layer of the 'bottom half' (13 for a 24-layer encoder)."""
    return len(module.encoder.layer) // 2 + 1


def odd_layer_freeze(module):
    """Freeze encoder layers 1, 3, 5, ... for an encoder of any depth."""
    _freeze_encoder_layers(module, range(1, len(module.encoder.layer), 2))


def even_layer_freeze(module):
    """Freeze encoder layers 0, 2, 4, ... for an encoder of any depth."""
    _freeze_encoder_layers(module, range(0, len(module.encoder.layer), 2))


def top_half_layer_freeze(module):
    """Freeze the first layers up to the split point (layers 0-12 of 24)."""
    _freeze_encoder_layers(module, range(0, _half_split(module)))


def bottom_half_layer_freeze(module):
    """Freeze every layer from the split point to the last one (13-23 of 24).

    The previous range(13, 14) froze layer 13 only.
    """
    _freeze_encoder_layers(module, range(_half_split(module), len(module.encoder.layer)))
    

In [17]:
class MeanPooling(nn.Module):
    """Masked mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the positions where ``attention_mask`` is non-zero.

        The mask gains a trailing dimension and is expanded to the shape of
        ``last_hidden_state``; the denominator is clamped to 1e-9 so a fully
        masked row yields zeros rather than a division by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_total = (last_hidden_state * weights).sum(1)
        weight_total = weights.sum(1).clamp(min=1e-9)
        return weighted_total / weight_total
    
class GeMText(nn.Module):
    """Generalised-mean pooling with a learnable exponent ``p``.

    Args:
        dim: Dimension that is reduced (1 by default).
        p: Initial value of the learnable exponent.
        eps: Lower clamp for the inputs and for the mask count.
    """

    def __init__(self, dim=1, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, x, attention_mask):
        """Return ``(sum((clamp(x) * mask) ** p) / sum(mask)) ** (1 / p)`` over ``dim``."""
        mask = attention_mask.unsqueeze(-1).expand(x.shape)
        # Inputs are clamped to eps before masking, so masked positions contribute 0 ** p.
        powered_sum = (x.clamp(min=self.eps) * mask).pow(self.p).sum(self.dim)
        kept_count = mask.sum(self.dim).clip(min=self.eps)
        pooled = powered_sum / kept_count
        return pooled.pow(1 / self.p)
    
class MultiSampleDropout(nn.Module):
    """Run one classifier under several dropout masks and average the outputs.

    Args:
        classifier: Module applied after each dropout layer.
        start_prob: Dropout probability of the first layer.
        num_samples: Number of dropout layers (and classifier passes).
        increment: Amount added to the probability for each further layer.
    """

    def __init__(self, classifier, start_prob=0.2, num_samples=8, increment=0.01):
        super().__init__()
        # Plain nn.Dropout layers with probabilities start_prob, start_prob + increment, ...
        rates = [start_prob + (increment*i) for i in range(num_samples)]
        self.dropouts = nn.ModuleList([nn.Dropout(rate) for rate in rates])
        self.classifier = classifier

    def forward(self, out):
        """Return the element-wise mean of the classifier outputs, one per dropout layer."""
        per_mask_outputs = []
        for dropout in self.dropouts:
            per_mask_outputs.append(self.classifier(dropout(out)))
        return torch.stack(per_mask_outputs, dim=0).mean(dim=0)
    
class Model(nn.Module):
    """Transformer backbone + GeM pooling + multi-sample-dropout classifier.

    Reads ``cfg["dropout"]``, ``cfg["model_name"]``, ``cfg["tokenizer"]``,
    ``cfg["gradient_checkpoint"]`` and ``cfg["freeze"]`` when instantiated.
    Submodule names (drop, model, pool, fc_base, fc) are part of the saved
    state_dict layout and must not change.
    """

    def __init__(self):
        super().__init__()
        self.drop = nn.Dropout(p=cfg["dropout"])

        self.config = AutoConfig.from_pretrained(cfg["model_name"])
        self.config.hidden_dropout_prob = 0.007
        self.config.attention_probs_dropout_prob = 0.008

        self.model = AutoModel.from_pretrained(cfg["model_name"], config=self.config)

        # The tokenizer gets extra special tokens elsewhere in the notebook.
        # Grow the embedding matrix only when those ids would fall outside it;
        # when they already fit, nothing changes and existing checkpoints
        # keep loading with the same tensor shapes.
        required_vocab = len(cfg["tokenizer"])
        if required_vocab > self.model.get_input_embeddings().num_embeddings:
            self.model.resize_token_embeddings(required_vocab)

        if cfg["gradient_checkpoint"]:
            print('Enabling Grad Checkpointing')
            self.model.gradient_checkpointing_enable()
        if cfg["freeze"]:
            print('freezing params')
            for parameter in self.model.parameters():
                parameter.requires_grad = False
        self.pool = GeMText()

        # Single linear layer mapping the pooled vector to one logit per class.
        self.fc_base = nn.Linear(self.config.hidden_size, len(id2label))

        # The same linear layer evaluated under 8 dropout masks and averaged.
        self.fc = MultiSampleDropout(self.fc_base,
                                    start_prob=0.2,
                                    num_samples=8,
                                    increment=0.01)

    def forward(self, ids, mask, rhead):
        """Return class logits.

        Args:
            ids: Token ids passed to the backbone as ``input_ids``.
            mask: Attention mask passed to the backbone.
            rhead: Mask selecting the positions that are pooled (the callers
                in this notebook pass the batch's R_mask).
        """
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        out = self.pool(out.last_hidden_state, rhead)
        out = self.drop(out)
        out = self.fc(out)
        return out

In [18]:
class Collate:
    """Pad a list of Dataset items into a batch of ``torch.long`` tensors.

    Args:
        tokenizer: Only ``tokenizer.pad_token_id`` is used (padding value for
            input_ids).
        max_len: Upper bound on the padded width; longer samples are cut on
            the right.
            NOTE(review): the default of 512 is lower than cfg['max_len']
            (2048), and the training cell builds Collate without passing
            max_len, so training batches are cut to 512 tokens — confirm this
            is intended.
    """

    def __init__(self, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        """Return input_ids, attention_mask and R_mask, plus target when present.

        Every sequence is padded to the longest sample in the batch, capped at
        ``max_len``. 'target' is included only when every sample carries one,
        so the same class also collates unlabeled (inference) batches.
        """
        # Width of this batch: longest sample, capped at max_len.
        longest = max(len(sample["input_ids"]) for sample in batch)
        width = min(longest, self.max_len)

        def padded(key, fill):
            # Cut each sequence to `width`, then right-pad it with `fill`.
            rows = []
            for sample in batch:
                seq = list(sample[key][:width])
                rows.append(seq + [fill] * (width - len(seq)))
            return torch.tensor(rows, dtype=torch.long)

        output = {
            "input_ids": padded("input_ids", self.tokenizer.pad_token_id),
            "attention_mask": padded("attention_mask", 0),
            "R_mask": padded("R_mask", 0),
        }
        if all("target" in sample for sample in batch):
            output["target"] = torch.tensor(
                [sample["target"] for sample in batch], dtype=torch.long
            )
        return output

## Engine

In [19]:
class AverageMeter:
    """Running weighted average that also remembers the latest value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the average, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [20]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Mixed precision (autocast and the GradScaler) is switched on together by
    ``cfg["apex"]``. Gradients are accumulated over ``cfg['n_accumulate']``
    micro-batches and clipped to ``cfg["grad_norm"]`` before each optimizer
    step.

    Args:
        model: Called as ``model(input_ids, attention_mask, R_mask)``.
        optimizer: Optimizer stepped through the GradScaler.
        scheduler: Stepped once per complete accumulation window; may be falsy.
        dataloader: Yields dicts with "input_ids", "attention_mask", "target"
            and "R_mask" tensors.
        device: Device the batch tensors are moved to.

    Returns:
        The sample-weighted mean of the loss over the epoch. This is the true
        per-batch loss, not the value divided by ``n_accumulate`` that is used
        for the backward pass, so it is comparable with the validation loss.
    """
    model.train()

    losses = AverageMeter()
    use_amp = cfg["apex"]
    n_accumulate = cfg['n_accumulate']
    scaler = torch.amp.GradScaler(enabled=use_amp)
    steps = len(dataloader)
    bar = tqdm(dataloader, total=steps)

    for step, data in enumerate(bar):
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        targets = data["target"].to(device, dtype=torch.long)
        r_mask = data["R_mask"].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        # autocast follows the same switch as the scaler.
        with torch.amp.autocast(device_type="cuda", enabled=use_amp):
            outputs = model(ids, mask, r_mask)
            loss = criterion(outputs, targets)

        # Log the real loss; only the backward pass uses the accumulation-scaled one.
        losses.update(loss.item(), batch_size)
        scaler.scale(loss / n_accumulate).backward()

        window_complete = (step + 1) % n_accumulate == 0
        # `step` never reaches `steps`, so the last batch is detected with step + 1.
        last_batch = (step + 1) == steps
        if window_complete or last_batch:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            # A trailing partial window is applied (so its gradients do not leak
            # into the next epoch) but does not advance the scheduler: the
            # caller sizes the schedule as total_batches // n_accumulate, and a
            # step-limited scheduler would overrun with an extra step per epoch.
            if scheduler and window_complete:
                scheduler.step()

        bar.set_postfix(
            Loss=losses.avg, LR=optimizer.param_groups[0]['lr'])

    return losses.avg

from sklearn.metrics import f1_score, accuracy_score

@torch.no_grad()
def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        A tuple ``(mean_loss, logits, targets, macro_f1, accuracy)`` where
        ``mean_loss`` is the sample-weighted average of ``criterion``,
        ``logits`` and ``targets`` are NumPy arrays concatenated over all
        batches, and the two metrics compare ``targets`` with the argmax of
        ``logits`` along axis 1.
    """
    model.eval()

    loss_meter = AverageMeter()
    logit_chunks, target_chunks = [], []

    for batch in dataloader:
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)
        targets = batch["target"].to(device, dtype=torch.long)
        r_mask = batch["R_mask"].to(device, dtype=torch.long)

        logits = model(ids, mask, r_mask)
        batch_loss = criterion(logits, targets)

        loss_meter.update(batch_loss.item(), ids.size(0))
        logit_chunks.append(logits.detach().cpu().numpy())
        target_chunks.append(targets.detach().cpu().numpy())

    preds = np.concatenate(logit_chunks)
    y_test = np.concatenate(target_chunks)

    # Predicted class = index of the largest logit.
    pred_labels = preds.argmax(axis=1)

    accuracy = accuracy_score(y_test, pred_labels)
    macro_f1 = f1_score(y_test, pred_labels, average='macro')

    return loss_meter.avg, preds, y_test, macro_f1, accuracy

In [21]:
def start_training(model, optimizer, scheduler, device, num_epochs, train_loader, valid_loader, fold=0):
    """Train for ``num_epochs`` epochs and keep the best checkpoint by Macro F1.

    After every epoch the model is evaluated on ``valid_loader``. Whenever the
    Macro F1 is at least as high as the best so far (ties also count), the
    weights are saved to ``<cfg['output']>/fold_<fold>.bin`` and a line is
    appended to ``<cfg['output']>/log.txt``.

    Args:
        model, optimizer, scheduler: Passed through to ``train_one_epoch``.
        device: Device used for both training and evaluation.
        num_epochs: Number of epochs to run.
        train_loader, valid_loader: Training and validation data loaders.
        fold: Fold id used in the checkpoint file name.

    Returns:
        ``(history, best_y)``: ``history`` maps "Train Loss", "Valid Loss",
        "Macro F1" and "Accuracy" to per-epoch lists; ``best_y`` holds the
        validation logits of the last saved checkpoint, or ``None`` when no
        epoch was run.
    """
    import time
    start = time.time()
    best_score = 0  # higher Macro F1 is better
    best_y = None   # stays None only when num_epochs < 1
    history = {"Train Loss": [], "Valid Loss": [], "Macro F1": [], "Accuracy": []}

    for epoch in range(1, num_epochs + 1):
        print("Epoch: ", epoch)
        # Honour the `device` argument instead of reaching into cfg.
        train_epoch_loss = train_one_epoch(
            model, optimizer, scheduler, dataloader=train_loader, device=device
        )

        val_epoch_loss, preds, y_test, macro_f1, accuracy = evaluate(
            model, valid_loader, device=device
        )

        print(f"Epoch {epoch}: Train Loss={train_epoch_loss:.4f}, Valid Loss={val_epoch_loss:.4f}, Macro F1={macro_f1:.4f}, Accuracy={accuracy:.4f}")

        # Macro F1 is the model-selection score.
        score = macro_f1

        history["Train Loss"].append(train_epoch_loss)
        history["Valid Loss"].append(val_epoch_loss)
        history["Macro F1"].append(macro_f1)
        history["Accuracy"].append(accuracy)

        if score >= best_score:
            print(
                f"Score Improved ({best_score:.4f} ---> {score:.4f})"
            )
            with open(f"{cfg['output']}/log.txt", 'a') as f:
                f.write(f'Epoch {epoch}: Train Loss={train_epoch_loss:.4f}, Valid Loss={val_epoch_loss:.4f}, Macro F1={macro_f1:.4f}, Accuracy={accuracy:.4f}\n')

            best_score = score
            PATH = os.path.join(cfg['output'],f'fold_{fold}.bin')
            torch.save(model.state_dict(), PATH)

            print(f"Model Saved")
            best_y = preds

        print()

    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print(
        "Best Macro F1 Score: {:.4f}".format(
            best_score
        )
    )

    return history, best_y

In [22]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target,context
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of..."
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of..."
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of..."
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of..."


In [23]:
from sklearn.model_selection import StratifiedGroupKFold
# Assign each row a fold id. Splits are stratified on `target` and grouped by
# `conversation_id`, so all responses of one conversation share a fold.
folds = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train['fold'] = -1
fold_col = train.columns.get_loc('fold')
for i, (_, test_index) in enumerate(folds.split(train, train['target'], groups=train['conversation_id'])):
    # split() yields positional indices, so write by position rather than by label.
    train.iloc[test_index, fold_col] = i

In [24]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
cfg["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
# Settings added to cfg after the config cell; Model and the training loop read them.
# cfg['n_accumulate'] = 4#5   (disabled override: the value from the config cell stays in effect)
cfg['dropout'] = 0.4
cfg['apex'] = True
cfg["grad_norm"] = 20
cfg["gradient_checkpoint"] = False
# - 'dropout': probability of the nn.Dropout applied to the pooled vector in Model.
# - 'apex': enables the torch.amp GradScaler in train_one_epoch.
# - 'grad_norm': max norm passed to clip_grad_norm_ in train_one_epoch.
# - 'gradient_checkpoint': same value as already set in the config cell.

In [26]:
cfg['output']

'///mnt/c/Personal/Competitions/BEA_2025/debetav3_base_context_multisampleDropout'

In [27]:
from torch.optim import lr_scheduler
from sklearn.metrics import mean_squared_error
from torch.nn import Parameter
import time
# Loss shared by train_one_epoch and evaluate (read as a module-level global).
criterion = nn.CrossEntropyLoss()

def run_folds():
    """Train one model per fold (folds 0-4 of ``train.fold``).

    For each fold the rows with ``fold != k`` are used for training and the
    rows with ``fold == k`` for validation. A fresh Model, optimizer and
    OneCycleLR schedule are created per fold, ``start_training`` is run, and
    the fold's objects are released before the next fold starts. Progress is
    appended to ``<cfg['output']>/log.txt``.
    """
    import gc

    for fold in range(5):

        print(f'--------------------------------Training Fold {fold+1}/5---------------------------------')
        with open(f"{cfg['output']}/log.txt", 'a') as f:
            f.write(f'fold {fold+1}/5 \n')
        train_ = train[train.fold != fold].reset_index(drop=True)
        valid_ = train[train.fold == fold].reset_index(drop=True)

        print(f'train shape : {len(train_)}')
        print(f'valid shape : {len(valid_)}')

        train_dataset = Dataset(train_, True, True)
        # Validation items keep their targets so evaluate() can compute the loss.
        valid_dataset = Dataset(valid_, True)
        collate_fn = Collate(tokenizer=cfg['tokenizer'])
        train_loader = DataLoader(
            train_dataset,
            batch_size=cfg["train_batch_size"],
            collate_fn=collate_fn,
            num_workers=8,
            shuffle=True,
            pin_memory=True,
            drop_last=True
        )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=cfg["valid_batch_size"],
            collate_fn=collate_fn,
            num_workers=8,
            shuffle=False,
            pin_memory=True,
        )

        model = Model()
        model.to(cfg['device'])

        steps = len(train_loader)
        total_steps = steps * cfg['epochs']
        optimizer = optimizer_scheduler(model)
        # One scheduler step per accumulation window, hence the division.
        scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=cfg['learning_rate'],
                                            total_steps=total_steps//cfg['n_accumulate'])
        # start_training returns (history, best validation logits).
        history, _best_valid_logits = start_training(
            model, optimizer, scheduler, cfg['device'], cfg['epochs'],
            train_loader=train_loader, valid_loader=valid_loader, fold=fold)

        # Drop the references first; the CUDA cache can only give back memory
        # that no live tensor is using any more.
        del model, optimizer, scheduler, train_loader, valid_loader, train_dataset, valid_dataset, collate_fn
        del _best_valid_logits
        gc.collect()
        torch.cuda.empty_cache()
        #plot_history(history)
        



In [28]:
# run_folds()

## Get OOF

In [29]:
# One column name per class id ('target_0' ... ); get_oof writes the per-class
# probabilities returned by infer into these columns.
target_cols = [f'target_{i}' for i in range(len(id2label))]

In [30]:
target_cols

['target_0',
 'target_1',
 'target_2',
 'target_3',
 'target_4',
 'target_5',
 'target_6',
 'target_7',
 'target_8']

In [31]:
@torch.inference_mode()
def infer(model, dataloader, device):
    """Return class probabilities for every sample in ``dataloader``.

    Batches need "input_ids", "attention_mask" and "R_mask"; targets are not
    read. The model output is passed through a softmax over dimension 1.

    Returns:
        A NumPy array with one row per sample, concatenated in loader order.
    """
    model.eval()

    preds = []
    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        rmask = data["R_mask"].to(device, dtype=torch.long)

        outputs = model(ids, mask, rmask)
        preds.append(outputs.softmax(dim=1).detach().cpu().numpy())

    return np.concatenate(preds)

In [32]:
import gc

# Free memory before inference. The previous loop called the misspelled
# `gc.collct()` inside a bare `except`, so the collector never actually ran.
# Collect unreachable Python objects first, then hand PyTorch's cached GPU
# blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()


In [33]:
def get_oof():
    """Collect out-of-fold class probabilities for the training frame.

    For each fold 0-4 the checkpoint ``<cfg['output']>/fold_<k>.bin`` (the
    path start_training saves to) is loaded and run on the rows with
    ``fold == k``; the probabilities are written into the ``target_cols``
    columns of those rows.

    Returns:
        A deep copy of ``train`` with the ``target_cols`` columns added.
    """
    oof = train.copy(deep=True)
    # Create the probability columns up front instead of relying on .loc enlargement.
    for col in target_cols:
        oof[col] = np.nan

    for fold in range(5):

        print(f'--------------------------------Inferring Fold {fold+1}/5---------------------------------')
        fold_rows = train[train.fold == fold]
        idxs = fold_rows.index  # original labels, used to write results back
        valid_ = fold_rows.reset_index(drop=True)

        print(f'valid shape : {len(valid_)}')

        valid_dataset = Dataset(valid_, True)
        collate_fn = Collate(tokenizer=cfg['tokenizer'])
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=cfg["valid_batch_size"],
            collate_fn=collate_fn,
            num_workers=8,
            shuffle=False,
            pin_memory=True,
        )

        model = Model()
        model.to(cfg['device'])
        checkpoint_path = os.path.join(cfg['output'], f'fold_{fold}.bin')
        # map_location lets checkpoints saved on a GPU load on whatever device is in use.
        model.load_state_dict(
            torch.load(checkpoint_path, map_location=cfg['device'])
        )
        fold_probs = infer(
            model, valid_loader, device=cfg["device"]
        )
        oof.loc[idxs, target_cols] = fold_probs

        # Release this fold's model before the next one is created.
        del model, valid_loader, valid_dataset
        torch.cuda.empty_cache()
    return oof


In [34]:
# oof = get_oof()

In [35]:
# oof.head(1)

In [36]:
# from sklearn.metrics import f1_score, accuracy_score

# oof['pred'] = oof[target_cols].idxmax(axis=1).apply(lambda x: x.split("_")[1])
# oof['pred'] = oof['pred'].astype(int)

# print(f1_score(oof['target'], oof['pred'], average='macro'))
# print(accuracy_score(oof['target'], oof['pred']))

- OOF macro F1: 0.8971366197782591
- OOF accuracy: 0.8901453957996769

In [37]:
# oof.to_csv('///mnt/c/Personal/Competitions/BEA_2025/debetav3_base_context_multisampleDropout/oofs.csv', index=False)

## Inference

In [38]:

# Data Processing Functions
def load_test_data(test_data_path='///mnt/c/Personal/Competitions/BEA_2025/data/mrbench_v3_testset.json'):
    """
    Read the test set and flatten it to one example per tutor response.

    Args:
        test_data_path: Path to the test JSON file (read as UTF-8). A falsy
            value skips loading.

    Returns:
        ``(test_examples, test_data)``. ``test_examples`` is a list of dicts
        with keys "conversation_id", "conversation_history", "tutor_response"
        and "tutor_id"; ``test_data`` is the parsed JSON. When no path is
        given the result is ``([], None)``.
    """
    if not test_data_path:
        return [], None

    with open(test_data_path, 'r', encoding="utf-8") as handle:
        test_data = json.load(handle)

    test_examples = []
    for dialogue in test_data:
        # Fields shared by every response of this dialogue.
        shared = {
            "conversation_id": dialogue["conversation_id"],
            "conversation_history": dialogue["conversation_history"],
        }
        test_examples.extend(
            {**shared, "tutor_response": payload["response"], "tutor_id": tutor_id}
            for tutor_id, payload in dialogue["tutor_responses"].items()
        )

    return test_examples, test_data

In [39]:
test_examples,test_data = load_test_data()

In [40]:
test_examples

[{'conversation_id': '1030-adb61831-0383-4e51-a673-ab978590f69b',
  'conversation_history': 'Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Tyson decided to make muffaletta sandwiches for the big game.  Each sandwich required 1 pound each of meat and cheese and would serve 4 people.  There would be 20 people in total watching the game.  The meat cost $7.00 per pound and the cheese cost $3.00 per pound.  How much money would he spend on the meat and cheese to make enough sandwiches to serve 20 people? \n Student: To serve 20 people, Tyson needs to make 20/4 = 5 sandwiches.\nEach sandwich requires 1+1 = 2 pounds of meat and cheese.\nFor 5 sandwiches, he needs a total of 2 x 5 = 10 pounds of meat and cheese.\nThe cost of 10 pounds of meat is 10 x $7.00 = $70.\nThe cost of 10 pounds of cheese is 10 x $3.00 = $30.\nThe total cost of meat and cheese is $70 + $30 = $100.\n 100 \n Tutor: do you want to talk me through your solution \n Stude

In [41]:
# Test frame with the same `Question` / `Response` column names as the training frame.
test = pd.DataFrame(test_examples).rename(
    columns={'conversation_history': 'Question', 'tutor_response': 'Response'}
)
test

Unnamed: 0,conversation_id,Question,Response,tutor_id
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5
...,...,...,...,...
1542,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","Great progress, but let's double-check the cal...",Tutor_4
1543,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","Great job! Now, let's move on to the next prob...",Tutor_5
1544,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","Actually, you're almost there but remember to ...",Tutor_6
1545,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","You're very close! Remember, she only needs t...",Tutor_7


In [42]:
# Build `context` for the test frame exactly as for training: the other
# responses of the same conversation joined with ' [SEP] ', leaving out
# responses whose text equals the row's own. Working per conversation group
# avoids filtering the whole frame once per row.
context_by_row = {}
for _, conversation in test.groupby('conversation_id', sort=False):
    responses = conversation['Response']
    for row_label, response in responses.items():
        others = responses[responses != response].values
        context_by_row[row_label] = ' [SEP] '.join(others)

# Assign the whole column at once; values align on the frame's index labels.
test['context'] = pd.Series(context_by_row, dtype=object)
test.head()

Unnamed: 0,conversation_id,Question,Response,tutor_id,context
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1,"You've done a great job, but there's a small m..."
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2,It looks like you've done a great job figuring...
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3,It looks like you've done a great job figuring...
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4,It looks like you've done a great job figuring...
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5,It looks like you've done a great job figuring...


In [43]:
class Collate:
    """Collate function that truncates and right-pads a batch to a common width.

    Each sample in the batch is a mapping with the keys ``input_ids``,
    ``attention_mask`` and ``R_mask``. The sequences are presumably plain
    Python lists, since padding is done by list concatenation.

    Args:
        tokenizer: object exposing ``pad_token_id``, used to pad ``input_ids``.
        max_len: upper bound on the padded width of a batch.
    """

    def __init__(self, tokenizer, max_len=cfg['max_len']):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        """Return a dict of ``torch.long`` tensors, one per key, all of equal width."""
        # Common width: the longest input_ids sequence in this batch, capped at max_len.
        longest = max([len(sample["input_ids"]) for sample in batch])
        width = min(longest, self.max_len)

        # Value used to right-pad each field up to `width`.
        pad_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
            "R_mask": 0,
        }

        output = dict()
        for key, pad_value in pad_values.items():
            padded_rows = []
            for sample in batch:
                # Cut sequences that exceed the width, then fill the remainder.
                seq = sample[key][:width]
                padded_rows.append(seq + (width - len(seq)) * [pad_value])
            output[key] = torch.tensor(padded_rows, dtype=torch.long)

        return output

In [44]:
# Run every fold checkpoint over the test set and average the predictions.

# NOTE(review): the second positional argument (False) is presumably a
# "training / has labels" switch of the project Dataset class — confirm.
valid_dataset = Dataset(test,False )
collate_fn = Collate(tokenizer=cfg['tokenizer'])
test_loader = DataLoader(
    valid_dataset,
    batch_size=cfg["valid_batch_size"],
    collate_fn=collate_fn,
    num_workers=8,
    shuffle=False,  # keep row order so predictions line up with the rows of `test`
    pin_memory=True,
)

fold_preds = []
# Use the configured fold count and device instead of hard-coded 5 / 'cuda',
# so the loop stays consistent with cfg and with model.to(cfg['device']).
for fold in range(cfg['fold_num']):
    model = Model()
    model.to(cfg['device'])
    model.load_state_dict(
        torch.load(
            f'///mnt/c/Personal/Competitions/BEA_2025/debetav3_base_context_multisampleDropout/fold_{fold}.bin',
            # Load the weights onto the target device regardless of where they were saved from.
            map_location=cfg['device'],
        )
    )
    # Make sure dropout is disabled for inference; a no-op if infer() already does this.
    model.eval()
    model = torch.compile(model)
    preds = infer(model, test_loader, cfg['device'])
    fold_preds.append(preds)

    # Drop the model and collect garbage first, then hand the cached GPU memory back.
    del model
    gc.collect()
    torch.cuda.empty_cache()

# Average over the fold axis -> one prediction row per test sample.
final_preds = np.mean(fold_preds, 0)

100%|██████████| 49/49 [00:55<00:00,  1.13s/it]
100%|██████████| 49/49 [00:14<00:00,  3.32it/s]
100%|██████████| 49/49 [00:14<00:00,  3.33it/s]
100%|██████████| 49/49 [00:14<00:00,  3.30it/s]
100%|██████████| 49/49 [00:14<00:00,  3.34it/s]


In [45]:
# One score column per class: target_0 ... target_{K-1}, where K = len(id2label).
target_cols = ['target_' + str(class_idx) for class_idx in range(len(id2label))]

In [46]:
# Attach the fold-averaged prediction scores to `test`, one target_<i> column per class.
test[target_cols] = final_preds

In [47]:
# Preview the first rows to check that the target_* score columns were attached.
test.head()

Unnamed: 0,conversation_id,Question,Response,tutor_id,context,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1,"You've done a great job, but there's a small m...",0.005174,0.001483,0.925089,0.008375,0.006387,0.013104,0.021824,0.011598,0.006965
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2,It looks like you've done a great job figuring...,0.005118,0.001437,0.005278,0.958946,0.004202,0.009362,0.005535,0.005198,0.004924
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3,It looks like you've done a great job figuring...,0.963293,0.000846,0.004049,0.003732,0.003101,0.007229,0.006564,0.005012,0.006174
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4,It looks like you've done a great job figuring...,0.003281,0.002466,0.004598,0.004965,0.955541,0.00921,0.004848,0.007917,0.007174
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5,It looks like you've done a great job figuring...,0.004487,0.001878,0.011213,0.006288,0.008378,0.014966,0.009138,0.935173,0.008479


In [48]:
# Predicted class id per row: take the target_<i> column with the highest
# score and keep the integer suffix <i>.
test['pred'] = (
    test[target_cols]
    .idxmax(axis=1)
    .map(lambda col_name: col_name.split("_")[1])
    .astype(int)
)
test['pred']

0       2
1       3
2       0
3       4
4       7
       ..
1542    3
1543    7
1544    3
1545    2
1546    6
Name: pred, Length: 1547, dtype: int64

In [49]:
# Save the test frame (including the target_* score columns and `pred`) in the
# same directory the fold checkpoints were loaded from; the row index is not written.
test.to_csv('///mnt/c/Personal/Competitions/BEA_2025/debetav3_base_context_multisampleDropout/test_probas.csv',index=False)

In [50]:
# Predicted class ids as an array, indexed by position when the submission is built.
pred_labels = test['pred'].values

In [51]:
# Create submission file
# One entry per conversation, holding every tutor response together with its
# predicted "Tutor_Identification" label.

# NOTE(review): pred_labels is indexed by position in test_examples, so the rows
# of `test` are assumed to be in the same order as test_examples — confirm.
assert len(pred_labels) == len(test_examples), "pred_labels and test_examples differ in length"

# De-duplicate while keeping first-seen order: test_examples is looked up by
# (conversation_id, tutor_id) below, so a conversation id can appear several
# times and would otherwise be emitted once per tutor response.
unique_conversation_ids = list(dict.fromkeys(ex["conversation_id"] for ex in test_examples))

# Build both lookup tables once instead of rescanning the lists for every item.
# setdefault keeps the first occurrence if a key repeats.
conversation_by_id = {}
for conversation in test_data:
    conversation_by_id.setdefault(conversation["conversation_id"], conversation)

example_position = {}
for position, ex in enumerate(test_examples):
    example_position.setdefault((ex["conversation_id"], ex["tutor_id"]), position)

submission = []
for conversation_id in unique_conversation_ids:
    conversation_data = conversation_by_id[conversation_id]
    submission_item = {
        "conversation_id": conversation_id,
        "conversation_history": conversation_data["conversation_history"],
        "tutor_responses": {}
    }

    for tutor_id, tutor_data in conversation_data["tutor_responses"].items():
        # Find the corresponding prediction
        idx = example_position[(conversation_id, tutor_id)]

        predicted_class = id2label[pred_labels[idx]]

        submission_item["tutor_responses"][tutor_id] = {
            "response": tutor_data["response"],
            "annotation": {
                "Tutor_Identification": predicted_class
            }
        }

    submission.append(submission_item)

In [None]:
# Preview the first entries only; displaying the whole list would flood the cell output.
submission[:2]

In [53]:
# Write the submission list as indented JSON into the experiment directory.
predictions_path = os.path.join(
    "///mnt/c/Personal/Competitions/BEA_2025/debetav3_base_context_multisampleDropout",
    "predictions.json",
)
with open(predictions_path, "w") as fout:
    json.dump(submission, fout, indent=2)