In [1]:
!nvidia-smi

Wed Apr 23 01:50:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:01:00.0  On |                  N/A |
| 30%   45C    P8             38W /  370W |     425MiB /  24576MiB |     31%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Imports

In [2]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from torch.utils.data import Dataset,DataLoader
from torch.optim import AdamW
import json
from sklearn import metrics
from torch.nn import functional as F
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)

from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Configs

In [3]:
cfg = {"model_name": "google/bigbird-roberta-large",
    "max_len": 1024,
    "freeze" : False,
    # Train Configs
    "fold_num": 5,
    "val_fold": 0,
    "learning_rate": 2e-05,
    "min_lr": 1e-7,
    "T_max": 500,
    "valid_batch_size": 2,
    'train_batch_size' : 2,
 
    "epochs": 25, # Set to 1 because it is a demo
    "n_accumulate":8,
    
    # GPU Optimize Settings
    "scheduler" : 'cosine',
    "warmup_epochs": 1,

    "gradient_checkpoint" : False,
    'tokenizer' : AutoTokenizer.from_pretrained("google/bigbird-roberta-large"),
    
    # Path
    "output": f"bigbird-roberta-large",
    "seed":42,
}

Path(cfg['output']).mkdir(exist_ok=True)


## Utils

In [4]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(cfg['seed'])

In [5]:
def optimizer_scheduler(model):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and p.requires_grad],
                "weight_decay": 0.003,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=cfg['learning_rate'])
        return opt

In [6]:
import matplotlib.pyplot as plt
def plot_history(history):
    plt.figure(figsize=(20, 12))
    
    # Plot Loss
    plt.subplot(2, 1, 1)
    for k in ["Train Loss", "Valid Loss"]:
        plt.plot(history[k])
    plt.title('Loss')
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.legend(['train', 'valid'], loc='upper left')
    
    # Plot Metrics
    plt.subplot(2, 1, 2)
    for k in ["Macro F1", "Accuracy"]:
        plt.plot(history[k])
    plt.title('Metrics')
    plt.xlabel('epochs')
    plt.ylabel('score')
    plt.legend(['Macro F1', 'Accuracy'], loc='lower right')
    
    plt.tight_layout()
    plt.show()

In [7]:
# Define the tutor classes
TUTOR_CLASSES = [
    "Expert",
    "Novice",
    "Gemini",
    "GPT4",
    "Llama31405B",
    "Llama318B",
    "Mistral",
    "Phi3",
    "Sonnet"
]

# Create label mappings
id2label = {i: label for i, label in enumerate(TUTOR_CLASSES)}
label2id = {v: k for k, v in id2label.items()}
print(id2label, label2id)

{0: 'Expert', 1: 'Novice', 2: 'Gemini', 3: 'GPT4', 4: 'Llama31405B', 5: 'Llama318B', 6: 'Mistral', 7: 'Phi3', 8: 'Sonnet'} {'Expert': 0, 'Novice': 1, 'Gemini': 2, 'GPT4': 3, 'Llama31405B': 4, 'Llama318B': 5, 'Mistral': 6, 'Phi3': 7, 'Sonnet': 8}


In [8]:
def load_data(dev_data_path='mrbench_v3_devset.json'):
    """
    Load development and (optionally) test datasets
    """
    # Load development data
    with open(dev_data_path, 'r') as f:
        dev_data = json.load(f)
    
    # Process development data
    dev_examples = []
    for dialogue in dev_data:
        conversation_id = dialogue["conversation_id"]
        conversation_history = dialogue["conversation_history"]
        
        for tutor_id, tutor_data in dialogue["tutor_responses"].items():
            if tutor_id in TUTOR_CLASSES or any(cls_name in tutor_id for cls_name in TUTOR_CLASSES):
                # Map the tutor_id to one of our classes
                tutor_class = next((cls for cls in TUTOR_CLASSES if cls in tutor_id), tutor_id)
                
                dev_examples.append({
                    "conversation_id": conversation_id,
                    "conversation_history": conversation_history,
                    "tutor_response": tutor_data["response"],
                    "tutor_class": tutor_class
                })
    return dev_examples

## Dataset

In [9]:
train = pd.DataFrame(load_data())
train['target'] = train['tutor_class'].map(label2id)
train.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)

In [10]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6


In [11]:
rows = []
for i, row in train.iterrows():
    question = row.Question
    response = row.Response
    convid = row.conversation_id
    context = train[(train.conversation_id==convid)&(train.Response!=response)].Response.values
    context = ' [SEP] '.join(context)
    train.loc[i, 'context'] = context

In [12]:
train['Question'].nunique()

294

In [13]:
TARGETS = ["target"]

In [14]:
class Dataset(Dataset) :
    def __init__(self,df,is_train = True, use_aug=False) :
        self.use_aug = use_aug
        self.df = df
        self.tokenizer = cfg['tokenizer']
        self.sep_token = self.tokenizer.sep_token
        self.text = (self.df['Question'] + self.sep_token + '[R_STRAT]' + self.df['Response']+ '[R_END]'+ self.sep_token + self.df['context']).values

        if is_train==True :
            self.targets = df['target'].values
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        
        text = self.text[index]
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=cfg["max_len"]
        )
        feature_dict = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        if self.is_train==True :
            feature_dict['target'] =  self.targets[index]
            
        custom_mask = inputs['attention_mask'].copy()
        stop_idx = 0
        for idx, token in enumerate(inputs['input_ids']):
                if token == cfg["tokenizer"].convert_tokens_to_ids('[R_END]'):
                    stop_idx = idx + 1
                    break
        for idx in range(stop_idx, len(inputs['attention_mask'])):
            custom_mask[idx] = 0
        feature_dict['R_mask'] = custom_mask

        return feature_dict

In [15]:
cfg["tokenizer"].add_tokens(['[R_STRAT]', '[R_END]'], special_tokens=True)


2

## Model

In [16]:
def odd_layer_freeze(module):
    for i in range(1,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def even_layer_freeze(module):
    for i in range(0,24,2):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
            
def top_half_layer_freeze(module):
    for i in range(0,13,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False

def bottom_half_layer_freeze(module):
    for i in range(13,14,1):
        for n,p in module.encoder.layer[i].named_parameters():
            p.requires_grad = False
    

In [17]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
class GeMText(nn.Module):
    def __init__(self, dim=1, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, x, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(x.shape)
        x = (x.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret
    
class MultiSampleDropout(nn.Module):
    def __init__(self, classifier, start_prob=0.2, num_samples=8, increment=0.01):
        super(MultiSampleDropout, self).__init__()
        # Use standard nn.Dropout since we're integrating with the first notebook
        self.dropouts = nn.ModuleList([
            nn.Dropout(start_prob + (increment*i)) for i in range(num_samples)
        ])
        self.classifier = classifier
        
    def forward(self, out):
        # Apply multiple dropouts and average results
        return torch.mean(torch.stack([
            self.classifier(dropout(out)) for dropout in self.dropouts
        ], dim=0), dim=0)
    
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.drop = nn.Dropout(p=cfg["dropout"])
        
        self.config = AutoConfig.from_pretrained(cfg["model_name"])
        #self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.007
        #self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.008

        self.model = AutoModel.from_pretrained(cfg["model_name"], config=self.config)
        self.model.resize_token_embeddings(len(cfg["tokenizer"]))
        #odd_layer_freeze(self.model)
        if cfg["gradient_checkpoint"]:
            print('Enabling Grad Checkpointing')
            self.model.gradient_checkpointing_enable()  
        if cfg["freeze"]:
            print('freezing params')
            for parameter in self.model.parameters():
                parameter.requires_grad = False
        self.pool = GeMText()

        # Create a classifier (single linear layer)
        self.fc_base = nn.Linear(self.config.hidden_size, len(id2label))

        # Wrap it with Multi-Sample Dropout
        self.fc = MultiSampleDropout(self.fc_base, 
                                    start_prob=0.2, 
                                    num_samples=8, 
                                    increment=0.01)
        
    def forward(self, ids, mask, rhead):        
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.pool(out.last_hidden_state, rhead)
        out = self.drop(out)
        out = self.fc(out)
        return out

In [18]:
class Collate:
    def __init__(self, tokenizer, max_len=cfg['max_len']):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        output["R_mask"] = [sample["R_mask"] for sample in batch]
        output["target"] = [sample["target"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]]) 

        batch_max = min(batch_max, self.max_len)
        
        output["input_ids"] = [s[:batch_max] for s in output["input_ids"]]
        output["attention_mask"] = [s[:batch_max] for s in output["attention_mask"]] 
        output["R_mask"] = [s[:batch_max] for s in output["R_mask"]] 
        #output["target"] = [s[:batch_max] for s in output["target"]]


        output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
        output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        output["R_mask"] = [s + (batch_max - len(s)) * [0] for s in output["R_mask"]]
        #output["target"] = [s + (batch_max - len(s)) * [0] for s in output["target"]]
        

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        output["target"] = torch.tensor(output["target"], dtype=torch.long)
        output["R_mask"] = torch.tensor(output["R_mask"], dtype=torch.long)

        return output

## Engine

In [19]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [20]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device):
    model.train()

    total = 0
    running_loss = 0.0
    losses = AverageMeter()
    scaler = torch.amp.GradScaler(enabled=cfg["apex"])
    lr = []
    bar = tqdm(dataloader, total=len(dataloader))
    steps = len(dataloader)
    
    all_preds = np.array([])
    all_groud_truth = np.array([])
    
    for step, data in enumerate(bar):
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        targets = data["target"].to(device, dtype=torch.long)
        r_mask = data["R_mask"].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        with torch.amp.autocast(device_type="cuda"):
        # with torch.amp.autocast(enabled=cfg["apex"],device_type="cuda"):
            outputs = model(ids, mask, r_mask)
            loss = criterion(outputs, targets)
        loss = loss / cfg['n_accumulate']
        
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        
        if (step + 1) % cfg['n_accumulate'] == 0 or step == steps:
            scaler.unscale_(optimizer)
            # grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["grad_norm"])
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler:
                scheduler.step()


        epoch_loss = losses.avg
        #acc = correct / total

        bar.set_postfix(
            Loss=epoch_loss, LR=optimizer.param_groups[0]['lr'])
        
        lr.append(optimizer.param_groups[0]['lr'])

    return losses.avg

from sklearn.metrics import f1_score, accuracy_score

@torch.no_grad()
def evaluate(model, dataloader, device):
    model.eval()

    losses = AverageMeter()
    preds = []
    y_test = []
    for data in dataloader:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        targets = data["target"].to(device, dtype=torch.long)
        r_mask = data["R_mask"].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask, r_mask)

        loss = criterion(outputs, targets)

        losses.update(loss.item(), batch_size)
        preds.append(outputs.detach().cpu().numpy())
        y_test.append(targets.detach().cpu().numpy())
    
    preds = np.concatenate(preds)
    y_test = np.concatenate(y_test)
    
    # Get the predicted class (argmax)
    pred_labels = np.argmax(preds, axis=1)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, pred_labels)
    
    # Calculate macro F1 score
    macro_f1 = f1_score(y_test, pred_labels, average='macro')

    return losses.avg, preds, y_test, macro_f1, accuracy

In [21]:
def start_training(model, optimizer, scheduler, device, num_epochs, train_loader, valid_loader, fold=0):
    import time
    start = time.time()
    best_score = 0  # Changed to 0 since higher F1 is better
    history = {"Train Loss": [], "Valid Loss": [], "Macro F1": [], "Accuracy": []}
    
    for epoch in range(1, num_epochs + 1):
        print("Epoch: ", epoch)
        train_epoch_loss = train_one_epoch(
            model, optimizer, scheduler, dataloader=train_loader, device=cfg["device"]
        )

        val_epoch_loss, preds, y_test, macro_f1, accuracy = evaluate(
            model, valid_loader, device=cfg["device"]
        )
        
        # print(f"Epoch {epoch}: Loss={val_epoch_loss:.4f}, Macro F1={macro_f1:.4f}, Accuracy={accuracy:.4f}")
        print(f"Epoch {epoch}: Train Loss={train_epoch_loss:.4f}, Valid Loss={val_epoch_loss:.4f}, Macro F1={macro_f1:.4f}, Accuracy={accuracy:.4f}")
        
        # Use Macro F1 as the primary score for model selection
        score = macro_f1

        history["Train Loss"].append(train_epoch_loss)
        history["Valid Loss"].append(val_epoch_loss)
        history["Macro F1"].append(macro_f1)
        history["Accuracy"].append(accuracy)

        # For Macro F1, higher is better, so we change the comparison
        if score >= best_score:
            print(
                f"Score Improved ({best_score:.4f} ---> {score:.4f})"
            )
            with open(f"{cfg['output']}/log.txt", 'a') as f:
                # f.write(f'Epoch {epoch}: Loss={val_epoch_loss:.4f}, Macro F1={macro_f1:.4f}, Accuracy={accuracy:.4f}\n')
                f.write(f'Epoch {epoch}: Train Loss={train_epoch_loss:.4f}, Valid Loss={val_epoch_loss:.4f}, Macro F1={macro_f1:.4f}, Accuracy={accuracy:.4f}\n')

            best_score = score
            PATH = os.path.join(cfg['output'],f'fold_{fold}.bin')
            torch.save(model.state_dict(), PATH)
            
            print(f"Model Saved")
            best_y = preds

        print()

    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print(
        "Best Macro F1 Score: {:.4f}".format(
            best_score
        )
    )

    return history, best_y

In [22]:
train.head()

Unnamed: 0,conversation_id,Question,Response,tutor_class,target,context
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...
1,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",Now that we know the cost of 1 pound of meat i...,Llama318B,5,"Great, you've correctly identified the cost of..."
2,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","You're close, but I notice that you calculated...",Llama31405B,4,"Great, you've correctly identified the cost of..."
3,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","That's correct. So, if 1 pound of meat costs $...",GPT4,3,"Great, you've correctly identified the cost of..."
4,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...",It seems like you've calculated the cost as if...,Mistral,6,"Great, you've correctly identified the cost of..."


In [23]:
fold_map = pd.read_csv('oofs.csv')
train = pd.merge(train,fold_map[['conversation_id','fold','tutor_class']],on=['conversation_id','tutor_class'], how='left')


In [24]:
cfg["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
# cfg['n_accumulate'] = 4#5
cfg['dropout'] = 0.4
cfg['apex'] = True
cfg["grad_norm"] = 20
cfg["gradient_checkpoint"] = False

In [26]:
cfg['output']

'bigbird-roberta-large'

In [27]:
from torch.optim import lr_scheduler
from sklearn.metrics import mean_squared_error
from torch.nn import Parameter
import time
criterion = nn.CrossEntropyLoss()

def run_folds() :

    for fold in range(5) :

        print(f'--------------------------------Training Fold {fold+1}/5---------------------------------')
        with open(f"{cfg['output']}/log.txt", 'a') as f:
                f.write(f'fold {fold+1}/5 \n')
        train_ = train[train.fold!=fold].reset_index(drop=True)
        valid_ = train[train.fold==fold].reset_index(drop=True)
        
        print(f'train shape : {len(train_)}')
        print(f'valid shape : {len(valid_)}')
        
        train_dataset = Dataset(
                                    train_, True, True
                            )
        valid_dataset = Dataset(
                                    valid_,True
        )
        collate_fn = Collate(tokenizer=cfg['tokenizer'])
        #Collate(tokenizer=cfg['tokenizer'])#DataCollatorWithPadding(tokenizer=cfg['tokenizer'])
        train_loader = DataLoader(
                train_dataset,
                batch_size=cfg["train_batch_size"],
                collate_fn=collate_fn,
                num_workers=8,
                shuffle=True,
                pin_memory=True,
                drop_last=True
                    )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=cfg["valid_batch_size"],
            collate_fn=collate_fn,
            num_workers=8,
            shuffle=False,
            pin_memory=True,
        )
        
        model = Model()
        model.to(cfg['device'])

        steps = len(train_loader)
        total_steps = steps * cfg['epochs']
        optimizer = optimizer_scheduler(model)
        # scheduler = lr_scheduler.CosineAnnealingLR(
        #                 optimizer, T_max=cfg['T_max'], eta_min=cfg['min_lr'])
        scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=cfg['learning_rate'], 
                                            total_steps=total_steps//cfg['n_accumulate'])
        history = start_training(
                        model, optimizer, scheduler, cfg['device'], cfg['epochs'] ,train_loader=train_loader,valid_loader=valid_loader,fold=fold)
        torch.cuda.empty_cache()
        del model, optimizer, scheduler, train_loader, valid_loader, train_dataset, valid_dataset, collate_fn
        #plot_history(history)
        



In [28]:
run_folds()

--------------------------------Training Fold 1/5---------------------------------
train shape : 1982
valid shape : 494


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Epoch:  1


  0%|          | 0/991 [00:00<?, ?it/s]Input ids are automatically padded from 846 to 896 to be a multiple of `config.block_size`: 64
  0%|          | 1/991 [00:00<14:15,  1.16it/s, LR=8e-7, Loss=0.242]Attention type 'block_sparse' is not possible if sequence_length: 511 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 991/991 [02:57<00:00,  5.60it/s, LR=1.62e-6, Loss=0.277]


Epoch 1: Train Loss=0.2775, Valid Loss=2.1754, Macro F1=0.0581, Accuracy=0.1235
Score Improved (0.0000 ---> 0.0581)
Model Saved

Epoch:  2


100%|██████████| 991/991 [02:57<00:00,  5.60it/s, LR=3.94e-6, Loss=0.271]


Epoch 2: Train Loss=0.2710, Valid Loss=2.1429, Macro F1=0.0741, Accuracy=0.1417
Score Improved (0.0581 ---> 0.0741)
Model Saved

Epoch:  3


100%|██████████| 991/991 [02:58<00:00,  5.55it/s, LR=7.37e-6, Loss=0.269]


Epoch 3: Train Loss=0.2694, Valid Loss=2.1378, Macro F1=0.0605, Accuracy=0.1356

Epoch:  4


100%|██████████| 991/991 [02:57<00:00,  5.58it/s, LR=1.13e-5, Loss=0.268]


Epoch 4: Train Loss=0.2681, Valid Loss=2.1122, Macro F1=0.1118, Accuracy=0.1903
Score Improved (0.0741 ---> 0.1118)
Model Saved

Epoch:  5


100%|██████████| 991/991 [02:58<00:00,  5.55it/s, LR=1.51e-5, Loss=0.253]


Epoch 5: Train Loss=0.2529, Valid Loss=1.8832, Macro F1=0.3812, Accuracy=0.4069
Score Improved (0.1118 ---> 0.3812)
Model Saved

Epoch:  6


100%|██████████| 991/991 [02:55<00:00,  5.64it/s, LR=1.81e-5, Loss=0.209]


Epoch 6: Train Loss=0.2086, Valid Loss=1.4667, Macro F1=0.6040, Accuracy=0.5789
Score Improved (0.3812 ---> 0.6040)
Model Saved

Epoch:  7


100%|██████████| 991/991 [02:56<00:00,  5.63it/s, LR=1.98e-5, Loss=0.126]


Epoch 7: Train Loss=0.1255, Valid Loss=0.7726, Macro F1=0.7340, Accuracy=0.7206
Score Improved (0.6040 ---> 0.7340)
Model Saved

Epoch:  8


100%|██████████| 991/991 [02:56<00:00,  5.61it/s, LR=2e-5, Loss=0.0602]   


Epoch 8: Train Loss=0.0602, Valid Loss=0.5822, Macro F1=0.8120, Accuracy=0.8057
Score Improved (0.7340 ---> 0.8120)
Model Saved

Epoch:  9


100%|██████████| 991/991 [02:55<00:00,  5.64it/s, LR=1.97e-5, Loss=0.0466]


Epoch 9: Train Loss=0.0466, Valid Loss=0.5683, Macro F1=0.8221, Accuracy=0.8178
Score Improved (0.8120 ---> 0.8221)
Model Saved

Epoch:  10


100%|██████████| 991/991 [02:56<00:00,  5.60it/s, LR=1.91e-5, Loss=0.0233]


Epoch 10: Train Loss=0.0233, Valid Loss=0.6324, Macro F1=0.8053, Accuracy=0.7996

Epoch:  11


100%|██████████| 991/991 [02:54<00:00,  5.69it/s, LR=1.82e-5, Loss=0.0139]


Epoch 11: Train Loss=0.0139, Valid Loss=0.5221, Macro F1=0.8437, Accuracy=0.8381
Score Improved (0.8221 ---> 0.8437)
Model Saved

Epoch:  12


100%|██████████| 991/991 [02:45<00:00,  5.99it/s, LR=1.7e-5, Loss=0.00654] 


Epoch 12: Train Loss=0.0065, Valid Loss=0.6083, Macro F1=0.8292, Accuracy=0.8219

Epoch:  13


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=1.56e-5, Loss=0.00805]


Epoch 13: Train Loss=0.0081, Valid Loss=0.5914, Macro F1=0.8300, Accuracy=0.8239

Epoch:  14


100%|██████████| 991/991 [02:44<00:00,  6.03it/s, LR=1.41e-5, Loss=0.00536]


Epoch 14: Train Loss=0.0054, Valid Loss=0.6388, Macro F1=0.8313, Accuracy=0.8239

Epoch:  15


100%|██████████| 991/991 [02:44<00:00,  6.02it/s, LR=1.24e-5, Loss=0.00273]


Epoch 15: Train Loss=0.0027, Valid Loss=0.5681, Macro F1=0.8620, Accuracy=0.8583
Score Improved (0.8437 ---> 0.8620)
Model Saved

Epoch:  16


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=1.06e-5, Loss=0.00271]


Epoch 16: Train Loss=0.0027, Valid Loss=0.5754, Macro F1=0.8553, Accuracy=0.8502

Epoch:  17


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=8.85e-6, Loss=0.00231]


Epoch 17: Train Loss=0.0023, Valid Loss=0.5899, Macro F1=0.8584, Accuracy=0.8543

Epoch:  18


100%|██████████| 991/991 [02:45<00:00,  6.00it/s, LR=7.11e-6, Loss=0.00211]


Epoch 18: Train Loss=0.0021, Valid Loss=0.5947, Macro F1=0.8624, Accuracy=0.8583
Score Improved (0.8620 ---> 0.8624)
Model Saved

Epoch:  19


100%|██████████| 991/991 [02:44<00:00,  6.02it/s, LR=5.45e-6, Loss=0.00214] 


Epoch 19: Train Loss=0.0021, Valid Loss=0.6303, Macro F1=0.8525, Accuracy=0.8482

Epoch:  20


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=3.95e-6, Loss=0.00225]


Epoch 20: Train Loss=0.0022, Valid Loss=0.6034, Macro F1=0.8606, Accuracy=0.8563

Epoch:  21


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=2.63e-6, Loss=0.00193]


Epoch 21: Train Loss=0.0019, Valid Loss=0.6010, Macro F1=0.8553, Accuracy=0.8502

Epoch:  22


100%|██████████| 991/991 [02:45<00:00,  5.99it/s, LR=1.55e-6, Loss=0.00189]


Epoch 22: Train Loss=0.0019, Valid Loss=0.6017, Macro F1=0.8570, Accuracy=0.8522

Epoch:  23


100%|██████████| 991/991 [02:44<00:00,  6.03it/s, LR=7.34e-7, Loss=0.00187] 


Epoch 23: Train Loss=0.0019, Valid Loss=0.6020, Macro F1=0.8552, Accuracy=0.8502

Epoch:  24


100%|██████████| 991/991 [02:42<00:00,  6.08it/s, LR=2.14e-7, Loss=0.00188] 


Epoch 24: Train Loss=0.0019, Valid Loss=0.6023, Macro F1=0.8552, Accuracy=0.8502

Epoch:  25


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=4.28e-9, Loss=0.00186]


Epoch 25: Train Loss=0.0019, Valid Loss=0.6023, Macro F1=0.8552, Accuracy=0.8502

Training complete in 1h 19m 53s
Best Macro F1 Score: 0.8624
--------------------------------Training Fold 2/5---------------------------------
train shape : 1982
valid shape : 494
Epoch:  1


  0%|          | 0/991 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 637 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 991/991 [02:43<00:00,  6.05it/s, LR=1.62e-6, Loss=0.276]


Epoch 1: Train Loss=0.2762, Valid Loss=2.1743, Macro F1=0.0484, Accuracy=0.1215
Score Improved (0.0000 ---> 0.0484)
Model Saved

Epoch:  2


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=3.94e-6, Loss=0.272]


Epoch 2: Train Loss=0.2719, Valid Loss=2.1475, Macro F1=0.0475, Accuracy=0.1215

Epoch:  3


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=7.37e-6, Loss=0.27] 


Epoch 3: Train Loss=0.2695, Valid Loss=2.1237, Macro F1=0.0885, Accuracy=0.1538
Score Improved (0.0484 ---> 0.0885)
Model Saved

Epoch:  4


100%|██████████| 991/991 [02:43<00:00,  6.05it/s, LR=1.13e-5, Loss=0.264]


Epoch 4: Train Loss=0.2636, Valid Loss=1.9931, Macro F1=0.2699, Accuracy=0.3441
Score Improved (0.0885 ---> 0.2699)
Model Saved

Epoch:  5


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=1.51e-5, Loss=0.229]


Epoch 5: Train Loss=0.2290, Valid Loss=1.5870, Macro F1=0.6100, Accuracy=0.5850
Score Improved (0.2699 ---> 0.6100)
Model Saved

Epoch:  6


100%|██████████| 991/991 [02:45<00:00,  6.00it/s, LR=1.81e-5, Loss=0.169]


Epoch 6: Train Loss=0.1686, Valid Loss=1.1589, Macro F1=0.6206, Accuracy=0.5931
Score Improved (0.6100 ---> 0.6206)
Model Saved

Epoch:  7


100%|██████████| 991/991 [02:44<00:00,  6.03it/s, LR=1.98e-5, Loss=0.102]


Epoch 7: Train Loss=0.1019, Valid Loss=0.6821, Macro F1=0.7802, Accuracy=0.7591
Score Improved (0.6206 ---> 0.7802)
Model Saved

Epoch:  8


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=2e-5, Loss=0.061]    


Epoch 8: Train Loss=0.0610, Valid Loss=0.5585, Macro F1=0.8118, Accuracy=0.8016
Score Improved (0.7802 ---> 0.8118)
Model Saved

Epoch:  9


100%|██████████| 991/991 [02:45<00:00,  6.00it/s, LR=1.97e-5, Loss=0.0374]


Epoch 9: Train Loss=0.0374, Valid Loss=0.5953, Macro F1=0.8099, Accuracy=0.8016

Epoch:  10


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=1.91e-5, Loss=0.0234]


Epoch 10: Train Loss=0.0234, Valid Loss=0.5157, Macro F1=0.8356, Accuracy=0.8279
Score Improved (0.8118 ---> 0.8356)
Model Saved

Epoch:  11


100%|██████████| 991/991 [02:45<00:00,  6.00it/s, LR=1.82e-5, Loss=0.0122]


Epoch 11: Train Loss=0.0122, Valid Loss=0.4766, Macro F1=0.8687, Accuracy=0.8563
Score Improved (0.8356 ---> 0.8687)
Model Saved

Epoch:  12


100%|██████████| 991/991 [02:44<00:00,  6.01it/s, LR=1.7e-5, Loss=0.00753] 


Epoch 12: Train Loss=0.0075, Valid Loss=0.5112, Macro F1=0.8639, Accuracy=0.8522

Epoch:  13


100%|██████████| 991/991 [02:43<00:00,  6.06it/s, LR=1.56e-5, Loss=0.00485]


Epoch 13: Train Loss=0.0048, Valid Loss=0.4546, Macro F1=0.8820, Accuracy=0.8745
Score Improved (0.8687 ---> 0.8820)
Model Saved

Epoch:  14


100%|██████████| 991/991 [02:44<00:00,  6.03it/s, LR=1.41e-5, Loss=0.00365]


Epoch 14: Train Loss=0.0036, Valid Loss=0.5480, Macro F1=0.8663, Accuracy=0.8603

Epoch:  15


100%|██████████| 991/991 [02:44<00:00,  6.02it/s, LR=1.24e-5, Loss=0.00375]


Epoch 15: Train Loss=0.0038, Valid Loss=0.4863, Macro F1=0.8742, Accuracy=0.8644

Epoch:  16


100%|██████████| 991/991 [02:43<00:00,  6.05it/s, LR=1.06e-5, Loss=0.00275]


Epoch 16: Train Loss=0.0027, Valid Loss=0.4862, Macro F1=0.8849, Accuracy=0.8765
Score Improved (0.8820 ---> 0.8849)
Model Saved

Epoch:  17


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=8.85e-6, Loss=0.0024] 


Epoch 17: Train Loss=0.0024, Valid Loss=0.5151, Macro F1=0.8754, Accuracy=0.8664

Epoch:  18


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=7.11e-6, Loss=0.00233]


Epoch 18: Train Loss=0.0023, Valid Loss=0.5192, Macro F1=0.8756, Accuracy=0.8664

Epoch:  19


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=5.45e-6, Loss=0.0021]  


Epoch 19: Train Loss=0.0021, Valid Loss=0.5242, Macro F1=0.8718, Accuracy=0.8623

Epoch:  20


100%|██████████| 991/991 [02:43<00:00,  6.06it/s, LR=3.95e-6, Loss=0.00204] 


Epoch 20: Train Loss=0.0020, Valid Loss=0.5190, Macro F1=0.8756, Accuracy=0.8664

Epoch:  21


100%|██████████| 991/991 [02:44<00:00,  6.04it/s, LR=2.63e-6, Loss=0.00202]


Epoch 21: Train Loss=0.0020, Valid Loss=0.5182, Macro F1=0.8739, Accuracy=0.8644

Epoch:  22


100%|██████████| 991/991 [02:44<00:00,  6.03it/s, LR=1.55e-6, Loss=0.00199] 


Epoch 22: Train Loss=0.0020, Valid Loss=0.5181, Macro F1=0.8740, Accuracy=0.8644

Epoch:  23


100%|██████████| 991/991 [02:43<00:00,  6.04it/s, LR=7.34e-7, Loss=0.002]  


Epoch 23: Train Loss=0.0020, Valid Loss=0.5178, Macro F1=0.8740, Accuracy=0.8644

Epoch:  24


100%|██████████| 991/991 [02:45<00:00,  6.00it/s, LR=2.14e-7, Loss=0.00199] 


Epoch 24: Train Loss=0.0020, Valid Loss=0.5178, Macro F1=0.8740, Accuracy=0.8644

Epoch:  25


100%|██████████| 991/991 [02:42<00:00,  6.09it/s, LR=4.28e-9, Loss=0.00198]


Epoch 25: Train Loss=0.0020, Valid Loss=0.5178, Macro F1=0.8740, Accuracy=0.8644

Training complete in 1h 17m 13s
Best Macro F1 Score: 0.8849
--------------------------------Training Fold 3/5---------------------------------
train shape : 1979
valid shape : 497
Epoch:  1


  0%|          | 0/989 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 302 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 989/989 [02:43<00:00,  6.04it/s, LR=1.62e-6, Loss=0.278]


Epoch 1: Train Loss=0.2780, Valid Loss=2.1783, Macro F1=0.0458, Accuracy=0.1247
Score Improved (0.0000 ---> 0.0458)
Model Saved

Epoch:  2


100%|██████████| 989/989 [02:43<00:00,  6.06it/s, LR=3.95e-6, Loss=0.271]


Epoch 2: Train Loss=0.2706, Valid Loss=2.1494, Macro F1=0.0624, Accuracy=0.1469
Score Improved (0.0458 ---> 0.0624)
Model Saved

Epoch:  3


100%|██████████| 989/989 [02:43<00:00,  6.06it/s, LR=7.39e-6, Loss=0.27] 


Epoch 3: Train Loss=0.2697, Valid Loss=2.1303, Macro F1=0.0634, Accuracy=0.1288
Score Improved (0.0624 ---> 0.0634)
Model Saved

Epoch:  4


100%|██████████| 989/989 [02:43<00:00,  6.06it/s, LR=1.13e-5, Loss=0.267]


Epoch 4: Train Loss=0.2669, Valid Loss=2.1037, Macro F1=0.1331, Accuracy=0.1932
Score Improved (0.0634 ---> 0.1331)
Model Saved

Epoch:  5


100%|██████████| 989/989 [02:44<00:00,  6.01it/s, LR=1.51e-5, Loss=0.249]


Epoch 5: Train Loss=0.2493, Valid Loss=1.7859, Macro F1=0.4851, Accuracy=0.4849
Score Improved (0.1331 ---> 0.4851)
Model Saved

Epoch:  6


100%|██████████| 989/989 [02:43<00:00,  6.04it/s, LR=1.81e-5, Loss=0.186]


Epoch 6: Train Loss=0.1862, Valid Loss=1.2149, Macro F1=0.6283, Accuracy=0.6036
Score Improved (0.4851 ---> 0.6283)
Model Saved

Epoch:  7


100%|██████████| 989/989 [02:43<00:00,  6.03it/s, LR=1.98e-5, Loss=0.117]


Epoch 7: Train Loss=0.1174, Valid Loss=0.8163, Macro F1=0.7453, Accuracy=0.7203
Score Improved (0.6283 ---> 0.7453)
Model Saved

Epoch:  8


100%|██████████| 989/989 [02:43<00:00,  6.05it/s, LR=2e-5, Loss=0.0633]   


Epoch 8: Train Loss=0.0633, Valid Loss=0.7133, Macro F1=0.7733, Accuracy=0.7606
Score Improved (0.7453 ---> 0.7733)
Model Saved

Epoch:  9


100%|██████████| 989/989 [02:43<00:00,  6.04it/s, LR=1.97e-5, Loss=0.0362]


Epoch 9: Train Loss=0.0362, Valid Loss=0.5440, Macro F1=0.8361, Accuracy=0.8209
Score Improved (0.7733 ---> 0.8361)
Model Saved

Epoch:  10


100%|██████████| 989/989 [02:43<00:00,  6.04it/s, LR=1.9e-5, Loss=0.0208] 


Epoch 10: Train Loss=0.0208, Valid Loss=0.6641, Macro F1=0.8140, Accuracy=0.8008

Epoch:  11


100%|██████████| 989/989 [02:43<00:00,  6.05it/s, LR=1.81e-5, Loss=0.0119] 


Epoch 11: Train Loss=0.0119, Valid Loss=0.5941, Macro F1=0.8161, Accuracy=0.8048

Epoch:  12


100%|██████████| 989/989 [02:43<00:00,  6.04it/s, LR=1.7e-5, Loss=0.00603] 


Epoch 12: Train Loss=0.0060, Valid Loss=0.6036, Macro F1=0.8415, Accuracy=0.8270
Score Improved (0.8361 ---> 0.8415)
Model Saved

Epoch:  13


100%|██████████| 989/989 [02:42<00:00,  6.08it/s, LR=1.56e-5, Loss=0.00391]


Epoch 13: Train Loss=0.0039, Valid Loss=0.7113, Macro F1=0.8216, Accuracy=0.8048

Epoch:  14


100%|██████████| 989/989 [02:43<00:00,  6.05it/s, LR=1.4e-5, Loss=0.00435] 


Epoch 14: Train Loss=0.0043, Valid Loss=0.6584, Macro F1=0.8363, Accuracy=0.8209

Epoch:  15


100%|██████████| 989/989 [02:42<00:00,  6.08it/s, LR=1.23e-5, Loss=0.00348]


Epoch 15: Train Loss=0.0035, Valid Loss=0.6621, Macro F1=0.8355, Accuracy=0.8229

Epoch:  16


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=1.06e-5, Loss=0.00228]


Epoch 16: Train Loss=0.0023, Valid Loss=0.7649, Macro F1=0.8247, Accuracy=0.8048

Epoch:  17


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=8.79e-6, Loss=0.00207]


Epoch 17: Train Loss=0.0021, Valid Loss=0.6852, Macro F1=0.8468, Accuracy=0.8330
Score Improved (0.8415 ---> 0.8468)
Model Saved

Epoch:  18


100%|██████████| 989/989 [02:43<00:00,  6.06it/s, LR=7.05e-6, Loss=0.00186]


Epoch 18: Train Loss=0.0019, Valid Loss=0.6942, Macro F1=0.8485, Accuracy=0.8350
Score Improved (0.8468 ---> 0.8485)
Model Saved

Epoch:  19


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=5.39e-6, Loss=0.00178] 


Epoch 19: Train Loss=0.0018, Valid Loss=0.6978, Macro F1=0.8465, Accuracy=0.8330

Epoch:  20


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=3.89e-6, Loss=0.00171]


Epoch 20: Train Loss=0.0017, Valid Loss=0.7004, Macro F1=0.8465, Accuracy=0.8330

Epoch:  21


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=2.58e-6, Loss=0.00174] 


Epoch 21: Train Loss=0.0017, Valid Loss=0.7028, Macro F1=0.8448, Accuracy=0.8310

Epoch:  22


100%|██████████| 989/989 [02:42<00:00,  6.10it/s, LR=1.51e-6, Loss=0.00146] 


Epoch 22: Train Loss=0.0015, Valid Loss=0.7032, Macro F1=0.8448, Accuracy=0.8310

Epoch:  23


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=7.05e-7, Loss=0.00163] 


Epoch 23: Train Loss=0.0016, Valid Loss=0.7038, Macro F1=0.8465, Accuracy=0.8330

Epoch:  24


100%|██████████| 989/989 [02:42<00:00,  6.08it/s, LR=1.97e-7, Loss=0.00157]


Epoch 24: Train Loss=0.0016, Valid Loss=0.7040, Macro F1=0.8465, Accuracy=0.8330

Epoch:  25


100%|██████████| 989/989 [02:42<00:00,  6.07it/s, LR=2.15e-9, Loss=0.00169] 


Epoch 25: Train Loss=0.0017, Valid Loss=0.7039, Macro F1=0.8465, Accuracy=0.8330

Training complete in 1h 16m 50s
Best Macro F1 Score: 0.8485
--------------------------------Training Fold 4/5---------------------------------
train shape : 1981
valid shape : 495
Epoch:  1


  0%|          | 0/990 [00:00<?, ?it/s]Input ids are automatically padded from 733 to 768 to be a multiple of `config.block_size`: 64
  0%|          | 1/990 [00:00<07:15,  2.27it/s, LR=8e-7, Loss=0.217]Attention type 'block_sparse' is not possible if sequence_length: 664 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 990/990 [02:45<00:00,  5.97it/s, LR=1.62e-6, Loss=0.275]


Epoch 1: Train Loss=0.2750, Valid Loss=2.1670, Macro F1=0.0278, Accuracy=0.1232
Score Improved (0.0000 ---> 0.0278)
Model Saved

Epoch:  2


100%|██████████| 990/990 [02:45<00:00,  5.98it/s, LR=3.95e-6, Loss=0.271]


Epoch 2: Train Loss=0.2710, Valid Loss=2.1422, Macro F1=0.0808, Accuracy=0.1354
Score Improved (0.0278 ---> 0.0808)
Model Saved

Epoch:  3


100%|██████████| 990/990 [02:46<00:00,  5.94it/s, LR=7.38e-6, Loss=0.269]


Epoch 3: Train Loss=0.2690, Valid Loss=2.1249, Macro F1=0.0631, Accuracy=0.1475

Epoch:  4


100%|██████████| 990/990 [02:45<00:00,  5.96it/s, LR=1.13e-5, Loss=0.264]


Epoch 4: Train Loss=0.2636, Valid Loss=2.0544, Macro F1=0.2525, Accuracy=0.3131
Score Improved (0.0808 ---> 0.2525)
Model Saved

Epoch:  5


100%|██████████| 990/990 [02:46<00:00,  5.94it/s, LR=1.51e-5, Loss=0.24] 


Epoch 5: Train Loss=0.2396, Valid Loss=1.6946, Macro F1=0.5182, Accuracy=0.4970
Score Improved (0.2525 ---> 0.5182)
Model Saved

Epoch:  6


100%|██████████| 990/990 [02:46<00:00,  5.93it/s, LR=1.81e-5, Loss=0.184]


Epoch 6: Train Loss=0.1838, Valid Loss=1.3196, Macro F1=0.5556, Accuracy=0.5374
Score Improved (0.5182 ---> 0.5556)
Model Saved

Epoch:  7


100%|██████████| 990/990 [02:45<00:00,  5.97it/s, LR=1.98e-5, Loss=0.112]


Epoch 7: Train Loss=0.1123, Valid Loss=0.6727, Macro F1=0.7856, Accuracy=0.7717
Score Improved (0.5556 ---> 0.7856)
Model Saved

Epoch:  8


100%|██████████| 990/990 [02:45<00:00,  5.98it/s, LR=2e-5, Loss=0.0572]   


Epoch 8: Train Loss=0.0572, Valid Loss=0.5226, Macro F1=0.8476, Accuracy=0.8384
Score Improved (0.7856 ---> 0.8476)
Model Saved

Epoch:  9


100%|██████████| 990/990 [02:45<00:00,  5.97it/s, LR=1.97e-5, Loss=0.0353]


Epoch 9: Train Loss=0.0353, Valid Loss=0.5003, Macro F1=0.8441, Accuracy=0.8323

Epoch:  10


100%|██████████| 990/990 [02:46<00:00,  5.95it/s, LR=1.9e-5, Loss=0.0205] 


Epoch 10: Train Loss=0.0205, Valid Loss=0.4411, Macro F1=0.8759, Accuracy=0.8667
Score Improved (0.8476 ---> 0.8759)
Model Saved

Epoch:  11


100%|██████████| 990/990 [02:47<00:00,  5.92it/s, LR=1.81e-5, Loss=0.0113]


Epoch 11: Train Loss=0.0113, Valid Loss=0.5219, Macro F1=0.8665, Accuracy=0.8586

Epoch:  12


100%|██████████| 990/990 [02:45<00:00,  5.97it/s, LR=1.7e-5, Loss=0.00689] 


Epoch 12: Train Loss=0.0069, Valid Loss=0.5142, Macro F1=0.8753, Accuracy=0.8667

Epoch:  13


100%|██████████| 990/990 [02:45<00:00,  5.99it/s, LR=1.56e-5, Loss=0.00591]


Epoch 13: Train Loss=0.0059, Valid Loss=0.4859, Macro F1=0.8829, Accuracy=0.8727
Score Improved (0.8759 ---> 0.8829)
Model Saved

Epoch:  14


100%|██████████| 990/990 [02:46<00:00,  5.95it/s, LR=1.41e-5, Loss=0.00507]


Epoch 14: Train Loss=0.0051, Valid Loss=0.4619, Macro F1=0.9071, Accuracy=0.8990
Score Improved (0.8829 ---> 0.9071)
Model Saved

Epoch:  15


100%|██████████| 990/990 [02:46<00:00,  5.95it/s, LR=1.24e-5, Loss=0.00434]


Epoch 15: Train Loss=0.0043, Valid Loss=0.4581, Macro F1=0.9036, Accuracy=0.8949

Epoch:  16


100%|██████████| 990/990 [02:45<00:00,  5.99it/s, LR=1.06e-5, Loss=0.00423]


Epoch 16: Train Loss=0.0042, Valid Loss=0.4568, Macro F1=0.9056, Accuracy=0.8970

Epoch:  17


100%|██████████| 990/990 [02:44<00:00,  6.00it/s, LR=8.82e-6, Loss=0.0041] 


Epoch 17: Train Loss=0.0041, Valid Loss=0.4589, Macro F1=0.9056, Accuracy=0.8970

Epoch:  18


100%|██████████| 990/990 [02:45<00:00,  6.00it/s, LR=7.08e-6, Loss=0.00409]


Epoch 18: Train Loss=0.0041, Valid Loss=0.4590, Macro F1=0.9037, Accuracy=0.8949

Epoch:  19


100%|██████████| 990/990 [02:45<00:00,  5.99it/s, LR=5.42e-6, Loss=0.00402]


Epoch 19: Train Loss=0.0040, Valid Loss=0.4618, Macro F1=0.9038, Accuracy=0.8949

Epoch:  20


100%|██████████| 990/990 [02:45<00:00,  5.97it/s, LR=3.92e-6, Loss=0.004]  


Epoch 20: Train Loss=0.0040, Valid Loss=0.4630, Macro F1=0.9017, Accuracy=0.8929

Epoch:  21


100%|██████████| 990/990 [02:45<00:00,  5.98it/s, LR=2.61e-6, Loss=0.00397]


Epoch 21: Train Loss=0.0040, Valid Loss=0.4648, Macro F1=0.9017, Accuracy=0.8929

Epoch:  22


100%|██████████| 990/990 [02:46<00:00,  5.93it/s, LR=1.53e-6, Loss=0.00395]


Epoch 22: Train Loss=0.0040, Valid Loss=0.4658, Macro F1=0.9017, Accuracy=0.8929

Epoch:  23


100%|██████████| 990/990 [02:46<00:00,  5.96it/s, LR=7.19e-7, Loss=0.00398]


Epoch 23: Train Loss=0.0040, Valid Loss=0.4663, Macro F1=0.9017, Accuracy=0.8929

Epoch:  24


100%|██████████| 990/990 [02:47<00:00,  5.92it/s, LR=2.06e-7, Loss=0.00393]


Epoch 24: Train Loss=0.0039, Valid Loss=0.4665, Macro F1=0.9017, Accuracy=0.8929

Epoch:  25


100%|██████████| 990/990 [02:45<00:00,  5.98it/s, LR=3.12e-9, Loss=0.00397]


Epoch 25: Train Loss=0.0040, Valid Loss=0.4665, Macro F1=0.9017, Accuracy=0.8929

Training complete in 1h 17m 32s
Best Macro F1 Score: 0.9071
--------------------------------Training Fold 5/5---------------------------------
train shape : 1980
valid shape : 496
Epoch:  1


  0%|          | 0/990 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 578 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 990/990 [02:44<00:00,  6.03it/s, LR=1.62e-6, Loss=0.28] 


Epoch 1: Train Loss=0.2797, Valid Loss=2.1866, Macro F1=0.0485, Accuracy=0.1290
Score Improved (0.0000 ---> 0.0485)
Model Saved

Epoch:  2


100%|██████████| 990/990 [02:42<00:00,  6.11it/s, LR=3.95e-6, Loss=0.271]


Epoch 2: Train Loss=0.2712, Valid Loss=2.1457, Macro F1=0.0506, Accuracy=0.1290
Score Improved (0.0485 ---> 0.0506)
Model Saved

Epoch:  3


100%|██████████| 990/990 [02:43<00:00,  6.06it/s, LR=7.38e-6, Loss=0.269]


Epoch 3: Train Loss=0.2693, Valid Loss=2.1280, Macro F1=0.0525, Accuracy=0.1371
Score Improved (0.0506 ---> 0.0525)
Model Saved

Epoch:  4


100%|██████████| 990/990 [02:43<00:00,  6.06it/s, LR=1.13e-5, Loss=0.264]


Epoch 4: Train Loss=0.2640, Valid Loss=2.0292, Macro F1=0.3173, Accuracy=0.3246
Score Improved (0.0525 ---> 0.3173)
Model Saved

Epoch:  5


100%|██████████| 990/990 [02:44<00:00,  6.03it/s, LR=1.51e-5, Loss=0.228]


Epoch 5: Train Loss=0.2277, Valid Loss=1.5566, Macro F1=0.5625, Accuracy=0.5423
Score Improved (0.3173 ---> 0.5625)
Model Saved

Epoch:  6


100%|██████████| 990/990 [02:43<00:00,  6.04it/s, LR=1.81e-5, Loss=0.146]


Epoch 6: Train Loss=0.1456, Valid Loss=0.9880, Macro F1=0.6659, Accuracy=0.6371
Score Improved (0.5625 ---> 0.6659)
Model Saved

Epoch:  7


100%|██████████| 990/990 [02:43<00:00,  6.04it/s, LR=1.98e-5, Loss=0.0755]


Epoch 7: Train Loss=0.0755, Valid Loss=0.7116, Macro F1=0.7722, Accuracy=0.7560
Score Improved (0.6659 ---> 0.7722)
Model Saved

Epoch:  8


100%|██████████| 990/990 [02:43<00:00,  6.04it/s, LR=2e-5, Loss=0.0439]   


Epoch 8: Train Loss=0.0439, Valid Loss=0.6048, Macro F1=0.8019, Accuracy=0.7903
Score Improved (0.7722 ---> 0.8019)
Model Saved

Epoch:  9


100%|██████████| 990/990 [02:43<00:00,  6.07it/s, LR=1.97e-5, Loss=0.0286]


Epoch 9: Train Loss=0.0286, Valid Loss=0.5067, Macro F1=0.8500, Accuracy=0.8367
Score Improved (0.8019 ---> 0.8500)
Model Saved

Epoch:  10


100%|██████████| 990/990 [02:42<00:00,  6.08it/s, LR=1.9e-5, Loss=0.0126] 


Epoch 10: Train Loss=0.0126, Valid Loss=0.5350, Macro F1=0.8391, Accuracy=0.8226

Epoch:  11


100%|██████████| 990/990 [02:42<00:00,  6.11it/s, LR=1.81e-5, Loss=0.00907]


Epoch 11: Train Loss=0.0091, Valid Loss=0.5078, Macro F1=0.8489, Accuracy=0.8387

Epoch:  12


100%|██████████| 990/990 [02:42<00:00,  6.08it/s, LR=1.7e-5, Loss=0.00456] 


Epoch 12: Train Loss=0.0046, Valid Loss=0.5531, Macro F1=0.8431, Accuracy=0.8306

Epoch:  13


100%|██████████| 990/990 [02:43<00:00,  6.06it/s, LR=1.56e-5, Loss=0.00478]


Epoch 13: Train Loss=0.0048, Valid Loss=0.5844, Macro F1=0.8464, Accuracy=0.8367

Epoch:  14


100%|██████████| 990/990 [02:43<00:00,  6.07it/s, LR=1.41e-5, Loss=0.0042] 


Epoch 14: Train Loss=0.0042, Valid Loss=0.5765, Macro F1=0.8582, Accuracy=0.8468
Score Improved (0.8500 ---> 0.8582)
Model Saved

Epoch:  15


100%|██████████| 990/990 [02:43<00:00,  6.07it/s, LR=1.24e-5, Loss=0.00317]


Epoch 15: Train Loss=0.0032, Valid Loss=0.4955, Macro F1=0.8772, Accuracy=0.8669
Score Improved (0.8582 ---> 0.8772)
Model Saved

Epoch:  16


100%|██████████| 990/990 [02:43<00:00,  6.05it/s, LR=1.06e-5, Loss=0.00225]


Epoch 16: Train Loss=0.0023, Valid Loss=0.5076, Macro F1=0.8710, Accuracy=0.8629

Epoch:  17


100%|██████████| 990/990 [02:42<00:00,  6.09it/s, LR=8.82e-6, Loss=0.00192]


Epoch 17: Train Loss=0.0019, Valid Loss=0.5119, Macro F1=0.8595, Accuracy=0.8508

Epoch:  18


100%|██████████| 990/990 [02:42<00:00,  6.07it/s, LR=7.08e-6, Loss=0.00153] 


Epoch 18: Train Loss=0.0015, Valid Loss=0.5146, Macro F1=0.8633, Accuracy=0.8548

Epoch:  19


100%|██████████| 990/990 [02:43<00:00,  6.07it/s, LR=5.42e-6, Loss=0.00152]


Epoch 19: Train Loss=0.0015, Valid Loss=0.5240, Macro F1=0.8689, Accuracy=0.8609

Epoch:  20


100%|██████████| 990/990 [02:43<00:00,  6.07it/s, LR=3.92e-6, Loss=0.00145]


Epoch 20: Train Loss=0.0014, Valid Loss=0.5276, Macro F1=0.8709, Accuracy=0.8629

Epoch:  21


100%|██████████| 990/990 [02:42<00:00,  6.09it/s, LR=2.61e-6, Loss=0.00142] 


Epoch 21: Train Loss=0.0014, Valid Loss=0.5279, Macro F1=0.8709, Accuracy=0.8629

Epoch:  22


100%|██████████| 990/990 [02:42<00:00,  6.10it/s, LR=1.53e-6, Loss=0.00142] 


Epoch 22: Train Loss=0.0014, Valid Loss=0.5283, Macro F1=0.8709, Accuracy=0.8629

Epoch:  23


100%|██████████| 990/990 [02:42<00:00,  6.10it/s, LR=7.19e-7, Loss=0.00138]


Epoch 23: Train Loss=0.0014, Valid Loss=0.5288, Macro F1=0.8709, Accuracy=0.8629

Epoch:  24


100%|██████████| 990/990 [02:42<00:00,  6.08it/s, LR=2.06e-7, Loss=0.00139] 


Epoch 24: Train Loss=0.0014, Valid Loss=0.5290, Macro F1=0.8709, Accuracy=0.8629

Epoch:  25


100%|██████████| 990/990 [02:43<00:00,  6.04it/s, LR=3.12e-9, Loss=0.00139]


Epoch 25: Train Loss=0.0014, Valid Loss=0.5290, Macro F1=0.8709, Accuracy=0.8629

Training complete in 1h 16m 53s
Best Macro F1 Score: 0.8772


## Get OOF

In [29]:
import gc
for i in range(10):
    gc.collect()
    torch.cuda.empty_cache()

In [30]:
target_cols = [f'target_{i}' for i in range(len(id2label))]

In [31]:
target_cols

['target_0',
 'target_1',
 'target_2',
 'target_3',
 'target_4',
 'target_5',
 'target_6',
 'target_7',
 'target_8']

In [32]:
@torch.inference_mode()
def infer(model, dataloader, device):
    model.eval()

    total = 0
    losses = AverageMeter()
    correct = 0
    preds = []
    y_test = []
    bar = tqdm(dataloader, total=len(dataloader))
    for data in bar:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        rmask = data["R_mask"].to(device, dtype=torch.long)
        batch_size = ids.size(0)

        outputs = model(ids, mask, rmask)
        preds.append(outputs.softmax(dim=1).detach().cpu().numpy())
        
    
    preds = np.concatenate(preds)

    return preds

In [33]:
import gc
for i in range(10):
    torch.cuda.empty_cache()
    try:
        gc.collct()
    except:
        pass


In [34]:
def get_oof() :
    oof = train.copy(deep=True)
    for fold in range(5) :

        print(f'--------------------------------Inferring Fold {fold+1}/5---------------------------------')
        valid_ = train[train.fold==fold]
        idxs = valid_.index
        valid_ = valid_.reset_index()
        
        print(f'valid shape : {len(valid_)}')
        
        valid_dataset = Dataset(valid_,True)
        collate_fn = Collate(tokenizer=cfg['tokenizer'])#DataCollatorWithPadding(tokenizer=cfg['tokenizer'])
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=cfg["valid_batch_size"],
            collate_fn=collate_fn,
            num_workers=8,
            shuffle=False,
            pin_memory=True,
        )
        
        model = Model()
        model.to(cfg['device'])
        model.load_state_dict(
            torch.load(f"bigbird-roberta-large/fold_{fold}.bin")
        )
        y_test = infer(
            model, valid_loader, device=cfg["device"]
        )
        oof.loc[idxs, target_cols] = y_test
        torch.cuda.empty_cache()
    return oof


In [35]:
oof = get_oof()

--------------------------------Inferring Fold 1/5---------------------------------
valid shape : 494


  0%|          | 0/247 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 587 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 247/247 [00:20<00:00, 11.84it/s]


--------------------------------Inferring Fold 2/5---------------------------------
valid shape : 494


  0%|          | 0/247 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 344 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 247/247 [00:21<00:00, 11.75it/s]


--------------------------------Inferring Fold 3/5---------------------------------
valid shape : 497


  0%|          | 0/249 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 251 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 249/249 [00:21<00:00, 11.80it/s]


--------------------------------Inferring Fold 4/5---------------------------------
valid shape : 495


  0%|          | 0/248 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 676 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 248/248 [00:20<00:00, 12.28it/s]


--------------------------------Inferring Fold 5/5---------------------------------
valid shape : 496


  0%|          | 0/248 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 594 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 248/248 [00:21<00:00, 11.51it/s]


In [36]:
oof.head(1)

Unnamed: 0,conversation_id,Question,Response,tutor_class,target,context,fold,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8
0,221-362eb11a-f190-42a6-b2a4-985fafdcfa9e,"Tutor: Hi, could you please provide a step-by-...","Great, you've correctly identified the cost of...",Sonnet,8,Now that we know the cost of 1 pound of meat i...,0,0.001539,0.002152,0.008789,0.001866,0.005211,0.897244,0.007055,0.004902,0.071242


In [37]:
from sklearn.metrics import f1_score, accuracy_score

oof['pred'] = oof[target_cols].idxmax(axis=1).apply(lambda x: x.split("_")[1])
oof['pred'] = oof['pred'].astype(int)

print(f1_score(oof['target'], oof['pred'], average='macro'))
print(accuracy_score(oof['target'], oof['pred']))

0.8760601318948884
0.867124394184168


In [38]:
oof.to_csv('bigbird-roberta-large/oofs.csv', index=False)

## Inference

In [39]:

# Data Processing Functions
def load_test_data(test_data_path='mrbench_v3_testset.json'):
    """
    Load development and (optionally) test datasets
    """    
    # Load test data if provided
    test_examples = []
    test_data = None
    if test_data_path:
        with open(test_data_path, 'r',encoding="utf-8") as f:
            test_data = json.load(f)
        
        for dialogue in test_data:
            conversation_id = dialogue["conversation_id"]
            conversation_history = dialogue["conversation_history"]
            
            for tutor_id, tutor_data in dialogue["tutor_responses"].items():
                test_examples.append({
                    "conversation_id": conversation_id,
                    "conversation_history": conversation_history,
                    "tutor_response": tutor_data["response"],
                    "tutor_id": tutor_id
                })
    
    return test_examples, test_data

In [40]:
test_examples,test_data = load_test_data()

In [41]:
test_examples

[{'conversation_id': '1030-adb61831-0383-4e51-a673-ab978590f69b',
  'conversation_history': 'Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Tyson decided to make muffaletta sandwiches for the big game.  Each sandwich required 1 pound each of meat and cheese and would serve 4 people.  There would be 20 people in total watching the game.  The meat cost $7.00 per pound and the cheese cost $3.00 per pound.  How much money would he spend on the meat and cheese to make enough sandwiches to serve 20 people? \n Student: To serve 20 people, Tyson needs to make 20/4 = 5 sandwiches.\nEach sandwich requires 1+1 = 2 pounds of meat and cheese.\nFor 5 sandwiches, he needs a total of 2 x 5 = 10 pounds of meat and cheese.\nThe cost of 10 pounds of meat is 10 x $7.00 = $70.\nThe cost of 10 pounds of cheese is 10 x $3.00 = $30.\nThe total cost of meat and cheese is $70 + $30 = $100.\n 100 \n Tutor: do you want to talk me through your solution \n Stude

In [42]:
test = pd.DataFrame(test_examples)
test.rename(columns={'conversation_history':'Question','tutor_response':'Response'},inplace=True)
test

Unnamed: 0,conversation_id,Question,Response,tutor_id
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5
...,...,...,...,...
1542,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","Great progress, but let's double-check the cal...",Tutor_4
1543,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","Great job! Now, let's move on to the next prob...",Tutor_5
1544,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","Actually, you're almost there but remember to ...",Tutor_6
1545,3512-aa670406-dbb2-4993-865e-54f2d2b30d56,"Tutor: Hi, could you please provide a step-by-...","You're very close! Remember, she only needs t...",Tutor_7


In [43]:
rows = []
for i, row in test.iterrows():
    question = row.Question
    response = row.Response
    convid = row.conversation_id
    context = test[(test.conversation_id==convid)&(test.Response!=response)].Response.values
    context = ' [SEP] '.join(context)
    test.loc[i, 'context'] = context
test.head()

Unnamed: 0,conversation_id,Question,Response,tutor_id,context
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1,"You've done a great job, but there's a small m..."
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2,It looks like you've done a great job figuring...
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3,It looks like you've done a great job figuring...
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4,It looks like you've done a great job figuring...
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5,It looks like you've done a great job figuring...


In [44]:
class Collate:
    def __init__(self, tokenizer, max_len=cfg['max_len']):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        output["R_mask"] = [sample["R_mask"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]]) 

        batch_max = min(batch_max, self.max_len)
        
        output["input_ids"] = [s[:batch_max] for s in output["input_ids"]]
        output["attention_mask"] = [s[:batch_max] for s in output["attention_mask"]] 
        output["R_mask"] = [s[:batch_max] for s in output["R_mask"]] 
        #output["target"] = [s[:batch_max] for s in output["target"]]


        output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
        output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        output["R_mask"] = [s + (batch_max - len(s)) * [0] for s in output["R_mask"]]
        #output["target"] = [s + (batch_max - len(s)) * [0] for s in output["target"]]
        

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        output["R_mask"] = torch.tensor(output["R_mask"], dtype=torch.long)

        return output

In [45]:
for i in range(10):
    gc.collect()
    torch.cuda.empty_cache()

In [46]:
valid_dataset = Dataset(test,False )
collate_fn = Collate(tokenizer=cfg['tokenizer'])
test_loader = DataLoader(
                valid_dataset,
                batch_size=cfg["valid_batch_size"],
                collate_fn=collate_fn,
                num_workers=8,
                shuffle=False,
                pin_memory=True,
            )

final_preds = []
for fold in range(5):
    model = Model()
    model.to(cfg['device'])
    model.load_state_dict(
        torch.load(f'bigbird-roberta-large/fold_{fold}.bin')
    )
    model = torch.compile(model)
    preds = infer(model, test_loader, 'cuda')
    final_preds.append(preds)
    del model
    torch.cuda.empty_cache()
    gc.collect()

final_preds = np.mean(final_preds, 0) 

  0%|          | 0/774 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 611 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 774/774 [02:36<00:00,  4.94it/s] 
  0%|          | 0/774 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 611 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
100%|██████████| 774/774 [01:00<00:00, 12.75it/s]
  0%|          | 0/774 [00:00<?, ?it/s]Attentio

In [47]:
target_cols = [f'target_{i}' for i in range(len(id2label))]

In [48]:
test[target_cols] = final_preds

In [49]:
test.head()

Unnamed: 0,conversation_id,Question,Response,tutor_id,context,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1,"You've done a great job, but there's a small m...",0.000561,0.000664,0.994406,0.000659,0.000687,0.000429,0.00125,0.000615,0.000729
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2,It looks like you've done a great job figuring...,0.000658,0.000719,0.000796,0.994683,0.000457,0.000718,0.00059,0.000772,0.000608
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3,It looks like you've done a great job figuring...,0.721512,0.003925,0.004162,0.013158,0.007993,0.005634,0.212709,0.027977,0.00293
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4,It looks like you've done a great job figuring...,0.000471,0.000636,0.000625,0.000586,0.994395,0.001138,0.000949,0.000558,0.000642
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5,It looks like you've done a great job figuring...,0.000573,0.000712,0.00062,0.000883,0.000579,0.000505,0.001204,0.9943,0.000624


In [52]:
test[target_cols].idxmax(axis=1).value_counts()

target_2    205
target_0    194
target_6    194
target_3    192
target_4    187
target_5    187
target_7    185
target_8    181
target_1     15
Name: count, dtype: int64

In [53]:
test['pred'] = test[target_cols].astype(str).idxmax(axis=1).apply(lambda x: x.split("_")[1])
test['pred'] = test['pred'].astype(int)
test['pred']

0       2
1       3
2       0
3       4
4       7
       ..
1542    8
1543    7
1544    3
1545    2
1546    6
Name: pred, Length: 1547, dtype: int64

In [54]:
pred_labels = test['pred'].values

In [55]:
# Create submission file
submission = []
unique_conversation_ids = list(ex["conversation_id"] for ex in test_examples)

for conversation_id in unique_conversation_ids:
    conversation_data = next(d for d in test_data if d["conversation_id"] == conversation_id)
    submission_item = {
        "conversation_id": conversation_id,
        "conversation_history": conversation_data["conversation_history"],
        "tutor_responses": {}
    }
        
    for tutor_id, tutor_data in conversation_data["tutor_responses"].items():
        # Find the corresponding prediction
        idx = next(i for i, ex in enumerate(test_examples) 
                    if ex["conversation_id"] == conversation_id and ex["tutor_id"] == tutor_id)
        
        predicted_class = id2label[pred_labels[idx]]
        
        submission_item["tutor_responses"][tutor_id] = {
            "response": tutor_data["response"],
            "annotation": {
                "Tutor_Identification": predicted_class
            }
        }
    
    submission.append(submission_item)

In [56]:
submission

[{'conversation_id': '1030-adb61831-0383-4e51-a673-ab978590f69b',
  'conversation_history': 'Tutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Tyson decided to make muffaletta sandwiches for the big game.  Each sandwich required 1 pound each of meat and cheese and would serve 4 people.  There would be 20 people in total watching the game.  The meat cost $7.00 per pound and the cheese cost $3.00 per pound.  How much money would he spend on the meat and cheese to make enough sandwiches to serve 20 people? \n Student: To serve 20 people, Tyson needs to make 20/4 = 5 sandwiches.\nEach sandwich requires 1+1 = 2 pounds of meat and cheese.\nFor 5 sandwiches, he needs a total of 2 x 5 = 10 pounds of meat and cheese.\nThe cost of 10 pounds of meat is 10 x $7.00 = $70.\nThe cost of 10 pounds of cheese is 10 x $3.00 = $30.\nThe total cost of meat and cheese is $70 + $30 = $100.\n 100 \n Tutor: do you want to talk me through your solution \n Stude

In [57]:
with open(os.path.join("bigbird-roberta-large", "predictions.json"), "w") as f:
    json.dump(submission, f, indent=2)

In [58]:
test.head()

Unnamed: 0,conversation_id,Question,Response,tutor_id,context,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,pred
0,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",It looks like you've done a great job figuring...,Tutor_1,"You've done a great job, but there's a small m...",0.000561,0.000664,0.994406,0.000659,0.000687,0.000429,0.00125,0.000615,0.000729,2
1,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","You've done a great job, but there's a small m...",Tutor_2,It looks like you've done a great job figuring...,0.000658,0.000719,0.000796,0.994683,0.000457,0.000718,0.00059,0.000772,0.000608,3
2,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","OK, read the question again, and answer these ...",Tutor_3,It looks like you've done a great job figuring...,0.721512,0.003925,0.004162,0.013158,0.007993,0.005634,0.212709,0.027977,0.00293,0
3,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...","Tutor: I see where you're coming from, but I t...",Tutor_4,It looks like you've done a great job figuring...,0.000471,0.000636,0.000625,0.000586,0.994395,0.001138,0.000949,0.000558,0.000642,4
4,1030-adb61831-0383-4e51-a673-ab978590f69b,"Tutor: Hi, could you please provide a step-by-...",Great job! Can you explain how you arrived at ...,Tutor_5,It looks like you've done a great job figuring...,0.000573,0.000712,0.00062,0.000883,0.000579,0.000505,0.001204,0.9943,0.000624,7


In [59]:
test.to_csv('bigbird-roberta-large/test_probas.csv', index=False)