## Testing a single-head model on a custom dataset

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from ignite.metrics import Accuracy, Loss, Fbeta, recall, precision
from transformers import AdamW, AutoModelForSequenceClassification, AutoTokenizer
from tqdm.notebook import trange, tqdm
from utils import load_dataset, tokenize_dataset, create_dataloader
import os
import glob

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Emotion class names handed to load_dataset when reading the test CSVs.
# NOTE(review): presumably the order must match the label columns/indices the
# models were trained with -- confirm against utils.load_dataset.
ARTEMIS_EMOTIONS = [
    'amusement',
    'awe',
    'contentment',
    'excitement',
    'anger',
    'disgust',
    'fear',
    'sadness',
    'something else',
]

In [None]:
# Filesystem locations: the test CSVs are looked up under data_root_path and
# checkpoint paths are joined onto model_root_path by the evaluation loop.
model_root_path = './'
data_root_path = 'dataset'

# Tokenizer of the base checkpoint, configured to pad on the right.
bert_version = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(
    bert_version,
    padding_side='right',
)

In [None]:
# Discover the checkpoints to evaluate. glob returns matches in arbitrary
# order, so sort them to keep the evaluation/report order reproducible.
all_models = sorted(glob.glob('single_models/xlm*'))
print(all_models)

# Test-set variants to evaluate; each one selects the file
# test_<lang>/test_<lang>.csv under the data root.
langs = ['english', 'arabic', 'chinese', 'all_langs']

In [None]:
# Evaluate every discovered checkpoint on the test split of every language and
# print accuracy, precision, recall, F1 and the average cross-entropy loss for
# each (model, language) pair.
for lang in langs:
    # The test split depends only on the language, so load and tokenize it
    # once here instead of repeating the work for every model.
    data_path = os.path.join(data_root_path, f'test_{lang}/test_{lang}.csv')
    sentences, labels = load_dataset(data_path, ARTEMIS_EMOTIONS, split='test')
    tokens, masks = tokenize_dataset(tokenizer, sentences)

    for model_path in all_models:
        # loading the model
        model_path = os.path.join(model_root_path, model_path)
        # Display name = first two '_'-separated fields of the checkpoint's
        # final path component. basename/normpath (rather than split('/'))
        # keeps this working regardless of the platform's path separator.
        checkpoint_name = os.path.basename(os.path.normpath(model_path))
        model_name = '_'.join(checkpoint_name.split('_')[:2])
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.to(device)
        model.eval()  # inference mode: disables dropout etc.

        dataloader = create_dataloader(tokens, masks, labels, batch_size=128, mode='test')

        # evaluation loop
        print(f'========= {model_name} :: {lang} =========')
        t = trange(len(dataloader), desc='ML')
        # Fresh metric objects are built for every (model, language) pair, so
        # no explicit reset is needed before accumulating.
        metrics = {
            'Accuracy': Accuracy(),
            'Precision': precision.Precision(average=True),
            'Recall': recall.Recall(average=True),
            'F1': Fbeta(1),
        }
        loss_avg = Loss(F.cross_entropy)
        for step, batch in zip(t, dataloader):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            # Distinct name so the dataset-level `labels` is not overwritten.
            batch_labels = batch[2].to(device)
            with torch.no_grad():
                outputs = model(input_ids,
                                token_type_ids=None,
                                attention_mask=input_mask)
            # Reduce each label row to a class index once per batch.
            # NOTE(review): presumably labels are one-hot / per-class score
            # rows -- confirm against utils.load_dataset.
            targets = batch_labels.argmax(dim=1)
            for metric in metrics.values():
                metric.update((outputs.logits, targets))
            loss_avg.update((outputs.logits, targets))
            # Show the running average loss on the progress bar.
            t.set_description(f'ML (loss={loss_avg.compute():.5f})')
        for n, metric in metrics.items():
            print(f'   {n}: {metric.compute():.5f}')
        print(f'   Loss:     {loss_avg.compute():.5f}')
        print('==========================================')