# One Shot Learning

Here we are checking the performance of the model trained on the English Dataset on other Datasets and their translated versions without any finetuning

## Imports

In [1]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

from data_cleaning import Data_Preprocessing
from arabert.preprocess import ArabertPreprocessor

## Model

In [2]:
# Core
import random

# Basics
import numpy as np
import pandas as pd
import torch

# Metrics
from sklearn.metrics import *

# Tokeniser
from transformers import XLMRobertaTokenizer

# Utility
from tqdm import tqdm

# Dataloader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Scheduler
from transformers import get_linear_schedule_with_warmup

# Optimiser
from transformers import AdamW

# Model

import torch.nn as nn
from models import weighted_Roberta


class XLM_Roberta:
    def __init__(self,args):
        # fix the random
        random.seed(args['seed_val'])
        np.random.seed(args['seed_val'])
        torch.manual_seed(args['seed_val'])
        torch.cuda.manual_seed_all(args['seed_val'])
        
        # set device
        self.device = torch.device(args['device'])

        self.weights=args['weights']
        
        # initiliase tokeniser
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case = True)

        self.model_save_path = args['model_save_path']
        self.name = args['name']
        
    ##-----------------------------------------------------------##
    ##----------------- Utility Functions -----------------------##
    ##-----------------------------------------------------------##
    def encode(self,data,max_len):
        input_ids = []
        attention_masks = []
        for sent in tqdm(data):
            # use in-built tokeniser of Bert
            encoded_dict = self.tokenizer.encode_plus(
                            sent,
                            add_special_tokens =True, # for [CLS] and [SEP]
                            max_length = max_len,
                            truncation = True,
                            padding = 'max_length',
                            return_attention_mask = True,
#                             return_tensors = 'pt', # return pytorch tensors
            )
            input_ids.append(encoded_dict['input_ids'])
            # attention masks notify where padding has been added 
            # and where is the sentence
            attention_masks.append(encoded_dict['attention_mask'])
            X_data = torch.tensor(input_ids)
            attention_masks_data = torch.tensor(attention_masks)
            
        return [X_data,attention_masks_data]
    
    ##-----------------------------------------------------------##
    ##------------------ Dataloader -----------------------------##
    ##-----------------------------------------------------------##
    def get_dataloader(self,samples, batch_size,is_train=False):
        inputs,masks,labels = samples

        # Convert the lists into tensors.
#         inputs = torch.cat(inputs, dim=0)
#         masks = torch.cat(masks, dim=0)
        labels = torch.tensor(labels)

        # convert to dataset
        data = TensorDataset(inputs,masks,labels)

        if(is_train==False):
            # use random sampler for training to shuffle
            # train data
            sampler = SequentialSampler(data)
        else:
            # order does not matter for validation as we just 
            # need the metrics
            sampler = RandomSampler(data)  

        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size,drop_last=True)

        return dataloader
    
    ##-----------------------------------------------------------##
    ##----------------- Training Utilities ----------------------##
    ##-----------------------------------------------------------## 
    def get_optimiser(self,learning_rate,model):
        # using AdamW optimiser from transformers library
        return AdamW(model.parameters(),
                  lr = learning_rate, 
                  eps = 1e-8
                )
    
    def get_scheduler(self,epochs,optimiser,train_dl):
        total_steps = len(train_dl) * epochs
        return get_linear_schedule_with_warmup(optimiser, 
                num_warmup_steps = 0, 
                num_training_steps = total_steps)
    
    def evalMetric(self, y_true, y_pred, prefix):
        # calculate all the metrics and add prefix to them
        # before saving in dictionary
        accuracy = accuracy_score(y_true, y_pred)
        mf1Score = f1_score(y_true, y_pred, average='macro')
        f1Score = f1_score(y_true, y_pred)
        area_under_c = roc_auc_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)

        nonhate_f1Score = f1_score(y_true, y_pred, pos_label=0)
        non_recallScore = recall_score(y_true, y_pred, pos_label=0)
        non_precisionScore = precision_score(y_true, y_pred, pos_label=0)
        return {prefix+"accuracy": accuracy, prefix+'mF1Score': mf1Score, 
            prefix+'f1Score': f1Score, prefix+'auc': area_under_c,
            prefix+'precision': precisionScore, 
            prefix+'recall': recallScore, 
            prefix+'non_hatef1Score': nonhate_f1Score, 
            prefix+'non_recallScore': non_recallScore, 
            prefix+'non_precisionScore': non_precisionScore}
    
    ##-----------------------------------------------------------##
    ##---------------- Different Train Loops --------------------##
    ##-----------------------------------------------------------## 
    def evaluate(self,model,loader,which):
        # to evaluate model on test and validation set

        model.eval() # put model in eval mode

        # maintain total loss to save in metrics
        total_eval_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(loader):
            # separate input, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            with torch.no_grad(): # do not construct compute graph
                outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
            # output is always a tuple, thus we have to 
            # separate it manually
            #loss = outputs[0]
            logits = outputs[0]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float).to(self.device))
            
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))

            # add the current loss
            # loss.item() extracts loss value as a float
            total_eval_loss += loss.item()

            # calculate true labels and convert it into numpy array
            b_y_true = b_labels.cpu().data.squeeze().numpy()
            
            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

        # calculate metrics
        metrics = self.evalMetric(y_true,y_pred,which+"_")

        # Calculate the average loss over all of the batches.
        avg_loss = total_eval_loss / len(loader)
        # add it to the metric
        metrics[which+'_avg_loss'] = avg_loss

        return metrics
    
    
    def run_train_loop(self,model,train_loader,optimiser,scheduler):

        model.train() # put model in train mode

        # maintain total loss to add to metric
        total_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(train_loader):
            # separate inputs, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            # Ref: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch#:~:text=In%20PyTorch%20%2C%20we%20need%20to,backward()%20call.
            model.zero_grad()                

            outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

            # outputs is always returned as tuple
            # Separate it manually
            logits = outputs[0]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float).to(self.device))
            
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
            
            # calculate current loss
            # loss.item() extracts loss value as a float
            total_loss += loss.item()

            # Back-propagation
            loss.backward()

            # calculate true labels
            b_y_true = b_labels.cpu().data.squeeze().numpy()

            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

            # clip gradient to prevent exploding gradient
            # problems
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # gradient descent
            optimiser.step()
            
            # schedule learning rate accordingly
            scheduler.step()

        # calculate avg loss 
        avg_train_loss = total_loss / len(train_loader)

        # calculate metrics
        train_metrics = self.evalMetric(y_true,y_pred,"Train_")
        
        # print results
        print('avg_train_loss',avg_train_loss)
        print('train_f1Score',train_metrics['Train_f1Score'])
        print('train_accuracy',train_metrics['Train_accuracy'])

        # add loss to metrics
        train_metrics['Train_avg_loss'] = avg_train_loss

        return train_metrics
    
    
    ##------------------------------------------------------------##
    ##----------------- Main Train Loop --------------------------##
    ##------------------------------------------------------------##
    def train(self,model,data_loaders,optimiser,scheduler,epochs,save_model):
        # save train stats per epoch
        train_stats = []
        train_loader,val_loader,test_loader = data_loaders
        # maintain best mF1 Score to save best model
        best_mf1Score=-1.0
        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            
            print("")
            print('Training...')
            # run trian loop
            train_metrics = self.run_train_loop(model,train_loader,
                                            optimiser,scheduler)

            print("")
            print("Running Validation...") 
            # test on validation set
            val_metrics = self.evaluate(model,val_loader,"Val")
            
            print("Validation Loss: ",val_metrics['Val_avg_loss'])
            print("Validation Accuracy: ",val_metrics['Val_accuracy'])
            
            stats = {}

            # save model where validation mF1Score is best
            if(val_metrics['Val_mF1Score']>best_mf1Score):
                best_mf1Score=val_metrics['Val_mF1Score']
                if(save_model):
                    torch.save(model.state_dict(), self.model_save_path+
                        '/best_bert_'+self.name+'.pt')
                # evaluate best model on test set
                test_metrics = self.evaluate(model,test_loader,"Test")

            stats['epoch']=epoch_i+1

            # add train and val metrics of the epoch to 
            # same dictionary
            stats.update(train_metrics)
            stats.update(val_metrics)

            train_stats.append(stats)

        return train_stats,test_metrics
    
    ##-----------------------------------------------------------##
    ##----------------------- Main Pipeline ---------------------##
    ##-----------------------------------------------------------##
    def run(self,args,df_train,df_val,df_test):
        # get X and Y data points 
        X_train = df_train['Text'].values
        Y_train = df_train['Label'].values
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values
        X_val = df_val['Text'].values
        Y_val = df_val['Label'].values
        
        # encode data
        # returns list of data and attention masks
        train_data = self.encode(X_train,args['max_len'])
        val_data = self.encode(X_val,args['max_len'])
        test_data = self.encode(X_test,args['max_len'])
        
        # add labels to data so that we can send them to
        # dataloader function together
        train_data.append(Y_train)
        val_data.append(Y_val)
        test_data.append(Y_test)
        
        # convert to dataloader
        train_dl =self.get_dataloader(train_data,args['batch_size'],True)
        val_dl =self.get_dataloader(val_data,args['batch_size'])                          
        test_dl =self.get_dataloader(test_data,args['batch_size'])
        
        # intialise model
        model = weighted_Roberta.from_pretrained(
            'xlm-roberta-base', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            params=args['params'],
        )
        model.to(self.device)
        
        optimiser = self.get_optimiser(args['learning_rate'],model)
        
        scheduler = self.get_scheduler(args['epochs'],optimiser,train_dl)
        
        # Run train loop and evaluate on validation data set
        # on each epoch. Store best model from all epochs 
        # (best mF1 Score on Val set) and evaluate it on
        # test set
        train_stats,train_metrics = self.train(model,[train_dl,val_dl,test_dl],
                                optimiser,scheduler,args['epochs'],args['save_model'])
        
        return train_stats,train_metrics
        
    ##-----------------------------------------------------------##
    ##-------------------- Other Utilities ----------------------##
    ##-----------------------------------------------------------##
    def run_test(self,model,df_test,args):
        # to evaluate test set on the final saved model
        # to retrieve results if necessary
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values

        test_data = self.encode(X_test,args['max_len'])

        test_data.append(Y_test)

        test_dl =self.get_dataloader(test_data,32)

        metrics = self.evaluate(model,test_dl,"Test")

        return metrics
    
    def load_model(self,path,args):
        # load saved best model
        saved_model = weighted_Roberta.from_pretrained(
            'xlm-roberta-base', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            params=args['params'],
        )
        
        saved_model.load_state_dict(torch.load(path))
        
        return saved_model

## Utility Functions

In [3]:
def preprocess(df,isArabic):
    
    X = df['Text']
    X_new=[]
    if(isArabic):
        prep = ArabertPreprocessor('bert-base-arabertv02')
        for text in tqdm(X):
            text = prep.preprocess(text)
            X_new.append(text)
    else:
        processer = Data_Preprocessing()
        for text in tqdm(X):
            text= processer.removeEmojis(text)
            text = processer.removeUrls(text)
            text=processer.removeSpecialChar(text)
            X_new.append(text)

    df['Text']=X_new
    return df 

In [4]:
def one_shot_output(model_path,data_path,obj,args):
    saved_model=obj.load_model(model_path,args)
    device = torch.device(args['device'])
    saved_model=saved_model.to(device)
    
    df = pd.read_csv(data_path)
    
    # preprocessing
    df = preprocess(df,args['isArabic'])
    
    metrics = obj.run_test(saved_model,df,args)
    
    return metrics

### Arabic

In [12]:
DATA_PATH = "Data_Processed/Let-Mi/all.csv"
MODEL_PATH = "Saved_Models/Let-Mi/all_but_one/best_bert_xlm_roberta_3_all.pt"


model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda:1',
        'weights': [1.0, 1.0],
        'save_model': True,
        'model_save_path': 'Saved_Models/Let-Mi/all_but_one/',
        'isArabic': True,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }

model = XLM_Roberta(model_args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,model_args)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [13]:
metrics

{'Test_accuracy': 0.6278757668711656,
 'Test_mF1Score': 0.6103902191176347,
 'Test_f1Score': 0.527852104110922,
 'Test_auc': 0.6251453031194015,
 'Test_precision': 0.7054616384915474,
 'Test_recall': 0.42168674698795183,
 'Test_non_hatef1Score': 0.6929283341243474,
 'Test_non_recallScore': 0.8286038592508513,
 'Test_non_precisionScore': 0.5954323001631321,
 'Test_avg_loss': 2.7577537766263527}

### Italian

In [14]:
DATA_PATH = "Data_Processed/AMI-2020/all.csv"
MODEL_PATH = "Saved_Models/AMI-2020/all_but_one/best_bert_xlm_roberta_2_all.pt"


model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': True,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }

model = XLM_Roberta(model_args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,model_args)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [15]:
metrics

{'Test_accuracy': 0.5432459677419355,
 'Test_mF1Score': 0.5355248490400887,
 'Test_f1Score': 0.475639393588705,
 'Test_auc': 0.5395916489383066,
 'Test_precision': 0.5345993756503642,
 'Test_recall': 0.4283927454659162,
 'Test_non_hatef1Score': 0.5954103044914724,
 'Test_non_recallScore': 0.6507905524106968,
 'Test_non_precisionScore': 0.5487162606978275,
 'Test_avg_loss': 2.45584655346409}

### Hindi 

In [13]:
DATA_PATH = "Data_Processed/Shared_Task_hin/all.csv"
MODEL_PATH = "Saved_Models/Shared_Task_hin/all_but_one/best_bert_xlm_roberta_1_all.pt"


model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 4.5],
        'save_model': True,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,4.5],
            'frac':0.8
        }
    }

model = XLM_Roberta(model_args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,model_args)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [14]:
metrics

{'Test_accuracy': 0.8110427461139896,
 'Test_mF1Score': 0.7074607574491267,
 'Test_f1Score': 0.5333866453418632,
 'Test_auc': 0.6947429134081914,
 'Test_precision': 0.5923623445825933,
 'Test_recall': 0.4850909090909091,
 'Test_non_hatef1Score': 0.8815348695563903,
 'Test_non_recallScore': 0.9043949177254739,
 'Test_non_precisionScore': 0.8598019801980198,
 'Test_avg_loss': 1.4547508923346515}

### Bengali 

In [5]:
DATA_PATH = "Data_Processed/Shared_Task_iben/all.csv"
MODEL_PATH = "Saved_Models/Shared_Task_iben/all_but_one/best_bert_xlm_roberta_5_all.pt"


model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 6.0],
        'save_model': True,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,6.0],
            'frac':0.8
        }
    }

model = XLM_Roberta(model_args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,model_args)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [7]:
metrics

{'Test_accuracy': 0.8220766129032258,
 'Test_mF1Score': 0.6104474913839375,
 'Test_f1Score': 0.3233226837060703,
 'Test_auc': 0.5931262235258643,
 'Test_precision': 0.55,
 'Test_recall': 0.22895927601809954,
 'Test_non_hatef1Score': 0.8975722990618048,
 'Test_non_recallScore': 0.9572931710336291,
 'Test_non_precisionScore': 0.8448652585579024,
 'Test_avg_loss': 1.9111749583915356}

### Spanish

In [15]:
DATA_PATH = "Data_Processed/AMI-Spanish/all.csv"
MODEL_PATH = "Saved_Models/AMI-Spanish/all_but_one/best_bert_xlm_roberta_1_all.pt"


model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': True,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }

model = XLM_Roberta(model_args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,model_args)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [16]:
metrics

{'Test_accuracy': 0.6119538834951457,
 'Test_mF1Score': 0.6046830179878536,
 'Test_f1Score': 0.551070551070551,
 'Test_auc': 0.6111534558458562,
 'Test_precision': 0.64822460776218,
 'Test_recall': 0.47924297924297926,
 'Test_non_hatef1Score': 0.6582954849051562,
 'Test_non_recallScore': 0.7430639324487334,
 'Test_non_precisionScore': 0.5908872901678657,
 'Test_avg_loss': 1.2461398206289531}

## English

In [8]:
DATA_PATH = "Data_Processed/Shared_Task_eng/all.csv"
MODEL_PATH = "Saved_Models/Shared_Task_eng/all_but_one/best_bert_xlm_roberta_4_all.pt"


model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 8.0],
        'save_model': True,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,8.0],
            'frac':0.8
        }
    }

model = XLM_Roberta(model_args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,model_args)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [10]:
metrics

{'Test_accuracy': 0.8345588235294118,
 'Test_mF1Score': 0.648073605520414,
 'Test_f1Score': 0.39189189189189194,
 'Test_auc': 0.6264972844101598,
 'Test_precision': 0.5367057371992597,
 'Test_recall': 0.3086200780418588,
 'Test_non_hatef1Score': 0.9042553191489361,
 'Test_non_recallScore': 0.9443744907784608,
 'Test_non_precisionScore': 0.8674059459827199,
 'Test_avg_loss': 1.5595395457109107}

# Few Shot Learning

Here we are checking the performance of the model trained on the English Dataset on other Datasets and their translated versions with finetuning

## XLM RoBERTa Model

### Main Class

In [5]:
# Core
import random

# Basics
import numpy as np
import pandas as pd
import torch

# Metrics
from sklearn.metrics import *

# Tokeniser
from transformers import XLMRobertaTokenizer

# Utility
from tqdm import tqdm

# Dataloader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Scheduler
from transformers import get_linear_schedule_with_warmup

# Optimiser
from transformers import AdamW

# Model

import torch.nn as nn
from models import weighted_Roberta


class XLM_Roberta_fewShot:
    def __init__(self,args):
        # fix the random
        random.seed(args['seed_val'])
        np.random.seed(args['seed_val'])
        torch.manual_seed(args['seed_val'])
        torch.cuda.manual_seed_all(args['seed_val'])
        
        # set device
        self.device = torch.device(args['device'])

        self.weights=args['weights']
        
        # initiliase tokeniser
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case = True)

        self.model_save_path = args['model_save_path']
        self.name = args['name']
        
    ##-----------------------------------------------------------##
    ##----------------- Utility Functions -----------------------##
    ##-----------------------------------------------------------##
    def encode(self,data,max_len):
        input_ids = []
        attention_masks = []
        for sent in tqdm(data):
            # use in-built tokeniser of Bert
            encoded_dict = self.tokenizer.encode_plus(
                            sent,
                            add_special_tokens =True, # for [CLS] and [SEP]
                            max_length = max_len,
                            truncation = True,
                            padding = 'max_length',
                            return_attention_mask = True,
#                             return_tensors = 'pt', # return pytorch tensors
            )
            input_ids.append(encoded_dict['input_ids'])
            # attention masks notify where padding has been added 
            # and where is the sentence
            attention_masks.append(encoded_dict['attention_mask'])
            X_data = torch.tensor(input_ids)
            attention_masks_data = torch.tensor(attention_masks)
            
        return [X_data,attention_masks_data]
    
    ##-----------------------------------------------------------##
    ##------------------ Dataloader -----------------------------##
    ##-----------------------------------------------------------##
    def get_dataloader(self,samples, batch_size,is_train=False):
        inputs,masks,labels = samples

        # Convert the lists into tensors.
#         inputs = torch.cat(inputs, dim=0)
#         masks = torch.cat(masks, dim=0)
        labels = torch.tensor(labels)

        # convert to dataset
        data = TensorDataset(inputs,masks,labels)

        if(is_train==False):
            # use random sampler for training to shuffle
            # train data
            sampler = SequentialSampler(data)
        else:
            # order does not matter for validation as we just 
            # need the metrics
            sampler = RandomSampler(data)  

        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size,drop_last=True)

        return dataloader
    
    ##-----------------------------------------------------------##
    ##----------------- Training Utilities ----------------------##
    ##-----------------------------------------------------------## 
    def get_optimiser(self,learning_rate,model):
        # using AdamW optimiser from transformers library
        return AdamW(model.parameters(),
                  lr = learning_rate, 
                  eps = 1e-8
                )
    
    def get_scheduler(self,epochs,optimiser,train_dl):
        total_steps = len(train_dl) * epochs
        return get_linear_schedule_with_warmup(optimiser, 
                num_warmup_steps = 0, 
                num_training_steps = total_steps)
    
    def evalMetric(self, y_true, y_pred, prefix):
        # calculate all the metrics and add prefix to them
        # before saving in dictionary
        accuracy = accuracy_score(y_true, y_pred)
        mf1Score = f1_score(y_true, y_pred, average='macro')
        f1Score = f1_score(y_true, y_pred)
        area_under_c = roc_auc_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)

        nonhate_f1Score = f1_score(y_true, y_pred, pos_label=0)
        non_recallScore = recall_score(y_true, y_pred, pos_label=0)
        non_precisionScore = precision_score(y_true, y_pred, pos_label=0)
        return {prefix+"accuracy": accuracy, prefix+'mF1Score': mf1Score, 
            prefix+'f1Score': f1Score, prefix+'auc': area_under_c,
            prefix+'precision': precisionScore, 
            prefix+'recall': recallScore, 
            prefix+'non_hatef1Score': nonhate_f1Score, 
            prefix+'non_recallScore': non_recallScore, 
            prefix+'non_precisionScore': non_precisionScore}
    
    ##-----------------------------------------------------------##
    ##---------------- Different Train Loops --------------------##
    ##-----------------------------------------------------------## 
    def evaluate(self,model,loader,which):
        # to evaluate model on test and validation set

        model.eval() # put model in eval mode

        # maintain total loss to save in metrics
        total_eval_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(loader):
            # separate input, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            with torch.no_grad(): # do not construct compute graph
                outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
            # output is always a tuple, thus we have to 
            # separate it manually
            #loss = outputs[0]
            logits = outputs[0]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float).to(self.device))
            
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))

            # add the current loss
            # loss.item() extracts loss value as a float
            total_eval_loss += loss.item()

            # calculate true labels and convert it into numpy array
            b_y_true = b_labels.cpu().data.squeeze().numpy()
            
            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

        # calculate metrics
        metrics = self.evalMetric(y_true,y_pred,which+"_")

        # Calculate the average loss over all of the batches.
        avg_loss = total_eval_loss / len(loader)
        # add it to the metric
        metrics[which+'_avg_loss'] = avg_loss

        return metrics
    
    
    def run_train_loop(self,model,train_loader,optimiser,scheduler):

        model.train() # put model in train mode

        # maintain total loss to add to metric
        total_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(train_loader):
            # separate inputs, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            # Ref: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch#:~:text=In%20PyTorch%20%2C%20we%20need%20to,backward()%20call.
            model.zero_grad()                

            outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

            # outputs is always returned as tuple
            # Separate it manually
            logits = outputs[0]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float).to(self.device))
            
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
            
            # calculate current loss
            # loss.item() extracts loss value as a float
            total_loss += loss.item()

            # Back-propagation
            loss.backward()

            # calculate true labels
            b_y_true = b_labels.cpu().data.squeeze().numpy()

            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

            # clip gradient to prevent exploding gradient
            # problems
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # gradient descent
            optimiser.step()
            
            # schedule learning rate accordingly
            scheduler.step()

        # calculate avg loss 
        avg_train_loss = total_loss / len(train_loader)

        # calculate metrics
        train_metrics = self.evalMetric(y_true,y_pred,"Train_")
        
        # print results
        print('avg_train_loss',avg_train_loss)
        print('train_f1Score',train_metrics['Train_f1Score'])
        print('train_accuracy',train_metrics['Train_accuracy'])

        # add loss to metrics
        train_metrics['Train_avg_loss'] = avg_train_loss

        return train_metrics
    
    
    ##------------------------------------------------------------##
    ##----------------- Main Train Loop --------------------------##
    ##------------------------------------------------------------##
    def train(self,model,data_loaders,optimiser,scheduler,epochs,save_model):
        # save train stats per epoch
        train_stats = []
        train_loader,val_loader,test_loader = data_loaders
        # maintain best mF1 Score to save best model
        best_mf1Score=-1.0
        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            
            print("")
            print('Training...')
            # run trian loop
            train_metrics = self.run_train_loop(model,train_loader,
                                            optimiser,scheduler)

            print("")
            print("Running Validation...") 
            # test on validation set
            val_metrics = self.evaluate(model,val_loader,"Val")
            
            print("Validation Loss: ",val_metrics['Val_avg_loss'])
            print("Validation Accuracy: ",val_metrics['Val_accuracy'])
            
            stats = {}

            # save model where validation mF1Score is best
            if(val_metrics['Val_mF1Score']>best_mf1Score):
                best_mf1Score=val_metrics['Val_mF1Score']
                if(save_model):
                    torch.save(model.state_dict(), self.model_save_path+
                        '/best_bert_'+self.name+'.pt')
                # evaluate best model on test set
                test_metrics = self.evaluate(model,test_loader,"Test")

            stats['epoch']=epoch_i+1

            # add train and val metrics of the epoch to 
            # same dictionary
            stats.update(train_metrics)
            stats.update(val_metrics)

            train_stats.append(stats)

        return train_stats,test_metrics
    
    ##-----------------------------------------------------------##
    ##----------------------- Main Pipeline ---------------------##
    ##-----------------------------------------------------------##
    def run(self,args,df_train,df_val,df_test):
        # get X and Y data points 
        X_train = df_train['Text'].values
        Y_train = df_train['Label'].values
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values
        X_val = df_val['Text'].values
        Y_val = df_val['Label'].values
        
        # encode data
        # returns list of data and attention masks
        train_data = self.encode(X_train,args['max_len'])
        val_data = self.encode(X_val,args['max_len'])
        test_data = self.encode(X_test,args['max_len'])
        
        # add labels to data so that we can send them to
        # dataloader function together
        train_data.append(Y_train)
        val_data.append(Y_val)
        test_data.append(Y_test)
        
        # convert to dataloader
        train_dl =self.get_dataloader(train_data,args['batch_size'],True)
        val_dl =self.get_dataloader(val_data,args['batch_size'])                          
        test_dl =self.get_dataloader(test_data,args['batch_size'])
        
        # intialise model
#         model = weighted_Roberta.from_pretrained(
#             'xlm-roberta-base', # Use the 12-layer BERT model, with an uncased vocab.
#             num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
#             params=args['params'],
#         )
        model = self.load_model(args['model_path'],args)
        model.to(self.device)
        
        optimiser = self.get_optimiser(args['learning_rate'],model)
        
        scheduler = self.get_scheduler(args['epochs'],optimiser,train_dl)
        
        # Run train loop and evaluate on validation data set
        # on each epoch. Store best model from all epochs 
        # (best mF1 Score on Val set) and evaluate it on
        # test set
        train_stats,train_metrics = self.train(model,[train_dl,val_dl,test_dl],
                                optimiser,scheduler,args['epochs'],args['save_model'])
        
        return train_stats,train_metrics
        
    ##-----------------------------------------------------------##
    ##-------------------- Other Utilities ----------------------##
    ##-----------------------------------------------------------##
    def run_test(self,model,df_test,args):
        # to evaluate test set on the final saved model
        # to retrieve results if necessary
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values

        test_data = self.encode(X_test,args['max_len'])

        test_data.append(Y_test)

        test_dl =self.get_dataloader(test_data,32)

        metrics = self.evaluate(model,test_dl,"Test")

        return metrics
    
    def load_model(self,path,args):
        # load saved best model
        saved_model = weighted_Roberta.from_pretrained(
            'xlm-roberta-base', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            params=args['params'],
        )
        
        saved_model.load_state_dict(torch.load(path))
        
        return saved_model

### Utility Functions

In [6]:
def load_dataset(args,index):
    # initialise constants 
    path = args['data_path']
    # read dataframes
    df_train = pd.read_csv(path+'train_'+str(index)+'.csv')
    df_val = pd.read_csv(path+'val_'+str(index)+'.csv')
    df_test = pd.read_csv(path+'test_'+str(index)+'.csv')

    # clean data
    df_train=preprocess(df_train,args['isArabic'])
    df_val=preprocess(df_val,args['isArabic'])
    df_test=preprocess(df_test,args['isArabic'])

    return df_train, df_val, df_test

In [7]:
def preprocess(df,isArabic):
    
    X = df['Text']
    X_new=[]
    if(isArabic):
        prep = ArabertPreprocessor('bert-base-arabertv02')
        for text in tqdm(X):
            text = prep.preprocess(text)
            X_new.append(text)
    else:
        processer = Data_Preprocessing()
        for text in tqdm(X):
            text= processer.removeEmojis(text)
            text = processer.removeUrls(text)
            text=processer.removeSpecialChar(text)
            X_new.append(text)

    df['Text']=X_new
    return df 

In [8]:
def save_metrics(path,metrics,which):
    df = pd.DataFrame(metrics)
    df.to_csv(path+"_"+which+".csv")

In [9]:
def fix_random(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

### Main Train Function

In [10]:
def train(args, index,all_test_metrics,model_args):
    model_name = args['model_name']
    model_args['name']=model_name+'_'+str(index)+'_all'
    print("\tInitialising Model....")
    model = XLM_Roberta_fewShot(model_args)
    print("\tLoading Dataset....")
    df_train, df_val, df_test = load_dataset(args,index)
    print("\tTraining Starts....")
    train_metrics, test_metrics = model.run(model_args, 
                    df_train, df_val, df_test)

    # Save train metrics after generating path
    res_path=args['res_base_path']+model_name+'_'+model_args['name']
    save_metrics(res_path,train_metrics,"train")
    
    all_test_metrics.append(test_metrics)

### Main Run Function

In [11]:
def run(args,model_args):
    all_test_metrics=[]
    
    for fold in [1, 2, 3, 4, 5]:
        print("Fold: ",fold)
        fix_random()
        train(args,fold,all_test_metrics,model_args)
        print("Saving Test Metrics....")
        save_metrics(args['res_base_path']+args['model_name']+'_all',
                     all_test_metrics,"test")

## Arabic

In [12]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/Let-Mi/',
    'train_cnt':'all',
    'res_base_path': 'Results/Let-Mi/fewShot/',
    'model_save_path': 'Saved_Models/Let-Mi/',
    'isArabic': True,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_path': 'Saved_Models/Let-Mi/all_but_one/best_bert_xlm_roberta_3_all.pt',
        'isArabic': True,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }
run(run_args,model_args)

Fold:  1
	Initialising Model....


KeyboardInterrupt: 

## Spanish

In [None]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/AMI-Spanish/',
    'train_cnt':'all',
    'res_base_path': 'Results/AMI-Spanish/fewShot/',
    'model_save_path': 'Saved_Models/AMI-Spanish/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_path': 'Saved_Models/AMI-Spanish/all_but_one/best_bert_xlm_roberta_1_all.pt',
        'isArabic': False,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':False,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':False,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }
run(run_args,model_args)

## Hindi

In [None]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/Shared_Task_hin/',
    'train_cnt':'all',
    'res_base_path': 'Results/Shared_Task_hin/fewShot/',
    'model_save_path': 'Saved_Models/Shared_Task_hin/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 4.5],
        'save_model': False,
        'model_path': 'Saved_Models/Shared_Task_hin/all_but_one/best_bert_xlm_roberta_1_all.pt',
        'isArabic': False,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':False,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':False,
            'weights':[1.0,4.5],
            'frac':0.8
        }
    }
run(run_args,model_args)

## Italian

In [None]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/AMI-2020/',
    'train_cnt':'all',
    'res_base_path': 'Results/AMI-2020/fewShot/',
    'model_save_path': 'Saved_Models/AMI-2020/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda:1',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_path': 'Saved_Models/AMI-2020/all_but_one/best_bert_xlm_roberta_2_all.pt',
        'isArabic': False,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':False,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':False,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }
run(run_args,model_args)

Fold:  1
	Initialising Model....




	Loading Dataset....


100%|██████████| 6948/6948 [00:02<00:00, 2663.31it/s]
100%|██████████| 991/991 [00:00<00:00, 2988.85it/s]
100%|██████████| 1983/1983 [00:00<00:00, 3069.24it/s]


	Training Starts....


 59%|█████▉    | 4111/6948 [01:24<01:53, 24.90it/s] 

## English

In [None]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/Shared_Task_eng/',
    'train_cnt':'all',
    'res_base_path': 'Results/Shared_Task_eng/fewShot/',
    'model_save_path': 'Saved_Models/Shared_Task_eng/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda:1',
        'weights': [1.0, 8.0],
        'save_model': False,
        'model_path': 'Saved_Models/Shared_Task_eng/all_but_one/best_bert_xlm_roberta_4_all.pt',
        'isArabic': False,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':False,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':False,
            'weights':[1.0,8.0],
            'frac':0.8
        }
    }
run(run_args,model_args)

Fold:  1
	Initialising Model....




	Loading Dataset....


100%|██████████| 11436/11436 [00:10<00:00, 1066.93it/s]
100%|██████████| 1633/1633 [00:01<00:00, 1265.37it/s]
100%|██████████| 3266/3266 [00:02<00:00, 1173.55it/s]


	Training Starts....


100%|██████████| 11436/11436 [10:58<00:00, 17.38it/s]
100%|██████████| 1633/1633 [00:13<00:00, 120.44it/s]
100%|██████████| 3266/3266 [00:54<00:00, 59.84it/s] 
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 



Training...


100%|██████████| 1429/1429 [03:53<00:00,  6.11it/s]


avg_train_loss 0.6132239549609124
train_f1Score 0.5451943715716671
train_accuracy 0.833187543736879

Running Validation...


100%|██████████| 204/204 [00:07<00:00, 28.40it/s]


Validation Loss:  0.7002191140553823
Validation Accuracy:  0.8774509803921569


100%|██████████| 408/408 [00:14<00:00, 28.63it/s]




Training...


100%|██████████| 1429/1429 [03:54<00:00,  6.10it/s]


avg_train_loss 0.573414663825265
train_f1Score 0.5913583733408642
train_accuracy 0.8734254723582925

Running Validation...


100%|██████████| 204/204 [00:07<00:00, 28.48it/s]


Validation Loss:  0.5353236360219764
Validation Accuracy:  0.8774509803921569


100%|██████████| 408/408 [00:14<00:00, 28.54it/s]




Training...


 49%|████▉     | 706/1429 [01:55<01:59,  6.05it/s]

## Bengali

In [None]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/Shared_Task_iben/',
    'train_cnt':'all',
    'res_base_path': 'Results/Shared_Task_iben/fewShot/',
    'model_save_path': 'Saved_Models/Shared_Task_iben/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda:1',
        'weights': [1.0, 6.0],
        'save_model': False,
        'model_path': 'Saved_Models/Shared_Task_iben/all_but_one/best_bert_xlm_roberta_5_all.pt',
        'isArabic': False,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':False,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':False,
            'weights':[1.0,6.0],
            'frac':0.8
        }
    }
run(run_args,model_args)

## Less data points few Shot

In [12]:
def load_dataset_part(train_cnt,args,index,seed):
    # initialise constants 
    path = args['data_path']
    # read dataframes
    df_train = pd.read_csv(path+'train_'+str(index)+'.csv')
    df_val = pd.read_csv(path+'val_'+str(index)+'.csv')
    df_test = pd.read_csv(path+'test_'+str(index)+'.csv')
    
    # split train into hate and non-hate and take train_cnt
    # samples of each
    df_train_hate = df_train[df_train['Label'] == 1].sample(train_cnt,random_state=seed)
    df_train_non_hate = df_train[df_train['Label'] == 0].sample(train_cnt,random_state=seed)
    # concatenate hate and non_hate
    df_train = pd.concat([df_train_hate, df_train_non_hate])
    # shuffle the train data
    df_train = df_train.sample(frac=1).reset_index(drop=True)

    # clean data
    df_train=preprocess(df_train,args['isArabic'])
    df_val=preprocess(df_val,args['isArabic'])
    df_test=preprocess(df_test,args['isArabic'])

    return df_train, df_val, df_test

In [13]:
def train_part(args,train_cnt,run,index,all_test_metrics,model_args,seed):
    model_name = args['model_name']
    model_args['name']=model_name+'_'+str(index)+'_'+str(train_cnt)+'_'+str(run)
    print("\tInitialising Model....")
    model = XLM_Roberta_fewShot(model_args)
    print("\tLoading Dataset....")
    df_train, df_val, df_test = load_dataset_part(train_cnt,args,index,seed)
    print("\tTraining Starts....")
    train_metrics, test_metrics = model.run(model_args, 
                    df_train, df_val, df_test)

    # Save train metrics after generating path
    res_path=args['res_base_path']+model_name+'_'+model_args['name']
    save_metrics(res_path,train_metrics,"train")
    
    all_test_metrics.append(test_metrics)

In [14]:
def run_part(run_args,model_args,train_cnt):
    all_test_metrics=[]
    seeds = [42,43,44]
    for fold in [1, 2, 3, 4, 5]:
        print("Fold: ",fold)
        for run in [1,2,3]:
            print("Run: ",run)
            fix_random()
            train_part(run_args,train_cnt,run,fold,all_test_metrics,model_args,seeds[run-1])
            print("Saving Test Metrics....")
            save_metrics(run_args['res_base_path']+run_args['model_name']+
                         '_'+str(train_cnt),all_test_metrics,"test")

## Arabic few data

In [16]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/Let-Mi/',
    'train_cnt':256,
    'res_base_path': 'Results/Let-Mi/all_but_one/',
    'model_save_path': 'Saved_Models/Let-Mi/',
    'isArabic': True,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_path': 'Saved_Models/Let-Mi/all_but_one/best_bert_xlm_roberta_3_all.pt',
        'isArabic': True,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }
run_part(run_args,model_args,32)

Fold:  1
Run:  1
	Initialising Model....




	Loading Dataset....


100%|██████████| 64/64 [00:00<00:00, 4878.34it/s]
100%|██████████| 523/523 [00:00<00:00, 5487.62it/s]
100%|██████████| 1047/1047 [00:00<00:00, 7058.41it/s]


	Training Starts....


100%|██████████| 64/64 [00:00<00:00, 861.19it/s]
100%|██████████| 523/523 [00:02<00:00, 203.89it/s]
100%|██████████| 1047/1047 [00:10<00:00, 103.16it/s]
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

RuntimeError: CUDA error: out of memory

## Italian few Data

In [None]:
run_args={
    'model_name':'few_shot',
    'data_path':'Data_Processed/AMI-2020/',
    'train_cnt':256,
    'res_base_path': 'Results/AMI-2020/fewData_fewShot/',
    'model_save_path': 'Saved_Models/AMI-2020/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'batch_size': 8,
        'bert_model': "bert-base-multilingual-cased",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "Saved_Models/Shared_Task_eng_1/best_bert_3_all.pt",
    }
run_part(run_args,model_args)

Fold:  1
Run:  1
	Initialising Model....




100%|██████████| 64/64 [00:00<00:00, 2419.84it/s]


  0%|          | 0/991 [00:00<?, ?it/s][A[A

	Loading Dataset....




 34%|███▎      | 334/991 [00:00<00:00, 3332.80it/s][A[A

100%|██████████| 991/991 [00:00<00:00, 3406.50it/s][A[A


  0%|          | 0/1983 [00:00<?, ?it/s][A[A

 18%|█▊        | 348/1983 [00:00<00:00, 3479.52it/s][A[A

 36%|███▌      | 715/1983 [00:00<00:00, 3533.14it/s][A[A

 54%|█████▍    | 1080/1983 [00:00<00:00, 3563.82it/s][A[A

 73%|███████▎  | 1450/1983 [00:00<00:00, 3603.45it/s][A[A

100%|██████████| 1983/1983 [00:00<00:00, 3575.19it/s][A[A


100%|██████████| 64/64 [00:00<00:00, 4486.41it/s]


  0%|          | 0/991 [00:00<?, ?it/s][A[A

 39%|███▉      | 385/991 [00:00<00:00, 3848.00it/s][A[A



	Training Starts....


100%|██████████| 991/991 [00:00<00:00, 3894.98it/s][A[A


  0%|          | 0/1983 [00:00<?, ?it/s][A[A

 21%|██        | 407/1983 [00:00<00:00, 4069.45it/s][A[A

 41%|████▏     | 822/1983 [00:00<00:00, 4092.20it/s][A[A

 63%|██████▎   | 1242/1983 [00:00<00:00, 4123.88it/s][A[A

100%|██████████| 1983/1983 [00:00<00:00, 4077.94it/s][A[A
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertFor



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.60it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.59it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.55it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 48.88it/s][A[A

avg_train_loss 1.1001893728971481
train_f1Score 0.05714285714285714
train_accuracy 0.484375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 48.46it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 48.13it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 47.98it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 47.87it/s][A[A

 24%|██▍       | 30/123 [00:00<00:01, 47.80it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 47.77it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 47.66it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 47.37it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 47.42it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 47.49it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 47.40it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 47.46it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 47.52it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 47.55it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 47.59it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 47.48it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 47.35it/s][A[A

 77%|███████▋  | 95/123 [00:01<00:00, 47.31it/

Validation Loss:  0.8366757845733224
Validation Accuracy:  0.5304878048780488
Best mF1Score....




  4%|▍         | 10/247 [00:00<00:04, 47.84it/s][A[A

  6%|▌         | 15/247 [00:00<00:04, 47.71it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 47.57it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 47.51it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 47.45it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 47.45it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 47.33it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 47.31it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 47.30it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 47.31it/s][A[A

 24%|██▍       | 60/247 [00:01<00:03, 47.27it/s][A[A

 26%|██▋       | 65/247 [00:01<00:03, 47.27it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 47.27it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 47.29it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 47.29it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 47.30it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 47.26it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 47.23it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.77it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.69it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.58it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 48.19it/s][A[A

avg_train_loss 0.7565140500664711
train_f1Score 0.3255813953488372
train_accuracy 0.546875

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 47.87it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 47.68it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 47.43it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 47.34it/s][A[A

 24%|██▍       | 30/123 [00:00<00:01, 47.23it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 47.20it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 47.33it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 46.79it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 46.74it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 46.84it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 46.84it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 46.85it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 46.89it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 46.87it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 46.86it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 46.91it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 46.90it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 46.85it/

Validation Loss:  0.6856275331683275
Validation Accuracy:  0.5894308943089431
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 47.29it/s][A[A

  6%|▌         | 15/247 [00:00<00:04, 47.14it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 47.07it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 46.97it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 47.00it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 46.82it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 46.81it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 46.67it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 46.63it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 46.69it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 46.69it/s][A[A

 26%|██▋       | 65/247 [00:01<00:03, 46.68it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 46.66it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 46.66it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 46.64it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 46.62it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 46.51it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 46.50it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 47.02it/s][A[A

  6%|▌         | 15/247 [00:00<00:04, 46.92it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 46.81it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 46.71it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 46.58it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 46.52it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 46.47it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 46.37it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 46.37it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 46.37it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 46.40it/s][A[A

 26%|██▋       | 65/247 [00:01<00:03, 46.38it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 46.37it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 46.43it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 46.32it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 46.35it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 46.28it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 46.25it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.65it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.57it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.46it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 47.33it/s][A[A

avg_train_loss 0.6849025785923004
train_f1Score 0.53125
train_accuracy 0.53125

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 47.04it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 46.85it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 46.67it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 46.58it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 46.47it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 46.45it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 46.25it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 46.01it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 46.04it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 46.12it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 46.19it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 46.19it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 46.18it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 46.15it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 46.18it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 46.09it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 46.00it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 46.04it/

Validation Loss:  0.6618457746699573
Validation Accuracy:  0.5985772357723578


Training...




 50%|█████     | 4/8 [00:00<00:00, 10.64it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.56it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.44it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 47.25it/s][A[A

avg_train_loss 0.552492044866085
train_f1Score 0.7936507936507936
train_accuracy 0.796875

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 46.87it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 46.66it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 46.47it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 46.33it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 46.30it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 46.26it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 46.11it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 45.79it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 45.86it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 45.86it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 45.89it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 45.90it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 45.82it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 45.85it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 45.84it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 45.82it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 45.75it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 45.70it/

Validation Loss:  0.6551991340106096
Validation Accuracy:  0.6361788617886179
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 46.35it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 46.13it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 46.00it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.90it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 45.85it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 45.83it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 45.77it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 45.74it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 45.68it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 45.71it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 45.66it/s][A[A

 26%|██▋       | 65/247 [00:01<00:03, 45.65it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 45.64it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 45.62it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 45.64it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 45.58it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 45.60it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 45.54it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 46.15it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.99it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 45.86it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.60it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 45.54it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 45.45it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 45.48it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 45.42it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 45.48it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 45.38it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 45.42it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 45.34it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 45.40it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 45.42it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 45.40it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 45.33it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 45.36it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 45.39it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.39it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.29it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.18it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.78it/s][A[A

avg_train_loss 0.4307372123003006
train_f1Score 0.911764705882353
train_accuracy 0.90625

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.57it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.45it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.34it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.30it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.19it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.16it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 45.13it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 45.10it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 45.10it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 45.16it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 45.09it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 45.03it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 45.06it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 45.00it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 45.07it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 45.05it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 45.10it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 45.06it/

Validation Loss:  0.6636197000015073
Validation Accuracy:  0.6453252032520326
Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 45.35it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.30it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.20it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.10it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 45.15it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 45.08it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 45.09it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 45.09it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 45.02it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.99it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.97it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.95it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.95it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.99it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 45.03it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 45.02it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 45.03it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 45.01it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.47it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.40it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.29it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 46.10it/s][A[A

avg_train_loss 0.33883108012378216
train_f1Score 0.9375
train_accuracy 0.9375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.79it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.62it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.42it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.27it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.12it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.08it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.96it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 44.98it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.90it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.99it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.97it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.92it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.95it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.89it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.91it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.85it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.88it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.81it/

Validation Loss:  0.6833020134912273
Validation Accuracy:  0.649390243902439
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.16it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.11it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.06it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.96it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.95it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.88it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.92it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.84it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.88it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.84it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.88it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.89it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.90it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.91it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.89it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.90it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.93it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.94it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 45.35it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.25it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.08it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.05it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.96it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.97it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.96it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.98it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.96it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.98it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.99it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.93it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.93it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.87it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.90it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.85it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.87it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.82it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.30it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.28it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.24it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 46.08it/s][A[A

avg_train_loss 0.25378212332725525
train_f1Score 0.9523809523809523
train_accuracy 0.953125

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.77it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.57it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.42it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.21it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.16it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.08it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.96it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 44.93it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.91it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.94it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.90it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.92it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.85it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.88it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.82it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.85it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.80it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.85it/

Validation Loss:  0.7585998669387849
Validation Accuracy:  0.6351626016260162


Training...




 50%|█████     | 4/8 [00:00<00:00, 10.41it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.32it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.19it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.60it/s][A[A

avg_train_loss 0.2678301874548197
train_f1Score 0.9411764705882353
train_accuracy 0.9375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.46it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.32it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.24it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.12it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.09it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.02it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 45.05it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 44.90it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.91it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.85it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.88it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.83it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.88it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.85it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.86it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.83it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.87it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.81it/

Validation Loss:  0.8025166382634543
Validation Accuracy:  0.6361788617886179
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.20it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.05it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.03it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.93it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.95it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.95it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.96it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.93it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.93it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.93it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.87it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.88it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.84it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.89it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.83it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.87it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.82it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.88it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.43it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.37it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.28it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 46.14it/s][A[A

avg_train_loss 0.19670867547392845
train_f1Score 0.9846153846153847
train_accuracy 0.984375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.82it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.66it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.41it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.33it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.22it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.07it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.97it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 44.99it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.95it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.95it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.95it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.97it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.95it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.97it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.90it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.95it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 44.92it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.95it/

Validation Loss:  0.7877049620558576
Validation Accuracy:  0.6453252032520326


Training...




 50%|█████     | 4/8 [00:00<00:00, 10.36it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.26it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.16it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.82it/s][A[A

avg_train_loss 0.19381932727992535
train_f1Score 0.9846153846153847
train_accuracy 0.984375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.61it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.48it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.34it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.21it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.14it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.04it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.92it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 44.87it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.84it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.86it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.82it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.86it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.87it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.89it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.91it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.85it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.85it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.81it/

Validation Loss:  0.7876894455857393
Validation Accuracy:  0.641260162601626
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.10it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.00it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.97it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.90it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.91it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.84it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.86it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.83it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.85it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.83it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.87it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.89it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.92it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.92it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.95it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.90it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.94it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.86it/

Saving Test Metrics....
Run:  2
	Initialising Model....




100%|██████████| 64/64 [00:00<00:00, 4022.83it/s]


  0%|          | 0/991 [00:00<?, ?it/s][A[A

 34%|███▍      | 335/991 [00:00<00:00, 3335.47it/s][A[A

	Loading Dataset....




100%|██████████| 991/991 [00:00<00:00, 3404.33it/s][A[A


  0%|          | 0/1983 [00:00<?, ?it/s][A[A

 17%|█▋        | 347/1983 [00:00<00:00, 3462.20it/s][A[A

 36%|███▌      | 715/1983 [00:00<00:00, 3523.44it/s][A[A

 55%|█████▍    | 1087/1983 [00:00<00:00, 3578.96it/s][A[A

 73%|███████▎  | 1451/1983 [00:00<00:00, 3590.84it/s][A[A

100%|██████████| 1983/1983 [00:00<00:00, 3572.49it/s][A[A


100%|██████████| 64/64 [00:00<00:00, 4149.50it/s]


  0%|          | 0/991 [00:00<?, ?it/s][A[A

 38%|███▊      | 375/991 [00:00<00:00, 3744.79it/s][A[A



	Training Starts....


100%|██████████| 991/991 [00:00<00:00, 3861.74it/s][A[A


  0%|          | 0/1983 [00:00<?, ?it/s][A[A

 20%|██        | 405/1983 [00:00<00:00, 4042.02it/s][A[A

 41%|████      | 814/1983 [00:00<00:00, 4055.91it/s][A[A

 62%|██████▏   | 1225/1983 [00:00<00:00, 4068.00it/s][A[A

100%|██████████| 1983/1983 [00:00<00:00, 4050.04it/s][A[A
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertFor



Training...


[A[A

 50%|█████     | 4/8 [00:00<00:00, 10.17it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.24it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.28it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 47.26it/s][A[A

avg_train_loss 1.0545953065156937
train_f1Score 0.25
train_accuracy 0.4375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 46.92it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 46.71it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 46.56it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 46.38it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 46.30it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 46.26it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 46.12it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 45.82it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 45.84it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 45.88it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 45.99it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 45.92it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 46.00it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 46.02it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 46.02it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 45.91it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 45.72it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 45.76it/

Validation Loss:  0.7324313281512842
Validation Accuracy:  0.5223577235772358
Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 46.29it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 46.10it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 46.00it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.93it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 45.89it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 45.83it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 45.76it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 45.71it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 45.71it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 45.74it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 45.69it/s][A[A

 26%|██▋       | 65/247 [00:01<00:03, 45.70it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 45.71it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 45.68it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 45.71it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 45.68it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 45.68it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 45.62it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.58it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.49it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.38it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 46.89it/s][A[A

avg_train_loss 0.6408836767077446
train_f1Score 0.6857142857142857
train_accuracy 0.65625

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 46.46it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 46.24it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 46.05it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.93it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.85it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 45.79it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 45.59it/s][A[A

 37%|███▋      | 45/123 [00:00<00:01, 45.31it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 45.39it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 45.45it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 45.53it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 45.51it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 45.56it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 45.46it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 45.47it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 45.40it/s][A[A

 73%|███████▎  | 90/123 [00:01<00:00, 45.30it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 45.35it/

Validation Loss:  0.6708171692320971
Validation Accuracy:  0.5589430894308943
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.85it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.73it/s][A[A

  8%|▊         | 20/247 [00:00<00:04, 45.68it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.51it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 45.47it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 45.39it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 45.40it/s][A[A

 18%|█▊        | 45/247 [00:00<00:04, 45.38it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 45.39it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 45.40it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 45.37it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 45.41it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 45.30it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 45.33it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 45.22it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 45.27it/s][A[A

 36%|███▋      | 90/247 [00:01<00:03, 45.25it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 45.30it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 45.34it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.28it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.17it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.14it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 45.05it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 45.02it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.96it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.90it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.96it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.91it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.91it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.85it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.91it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.91it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.92it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.91it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.97it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.94it/



Training...




 38%|███▊      | 3/8 [00:00<00:00, 10.27it/s][A[A

 50%|█████     | 4/8 [00:00<00:00, 10.14it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.14it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.17it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 46.06it/s][A[A

avg_train_loss 0.5591603629291058
train_f1Score 0.7692307692307692
train_accuracy 0.765625

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.59it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.13it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.02it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 44.97it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.85it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.93it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.60it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.42it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.62it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.64it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.60it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.64it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.72it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.64it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.68it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.76it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.83it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.88it/

Validation Loss:  0.6712457622454419
Validation Accuracy:  0.6067073170731707
Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 45.06it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 44.88it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.83it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.74it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.71it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.54it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.68it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.78it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.76it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.57it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.52it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.54it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.68it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.77it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.58it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.54it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.44it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.56it/



Training...




 38%|███▊      | 3/8 [00:00<00:00, 10.20it/s][A[A

 62%|██████▎   | 5/8 [00:00<00:00, 10.14it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.14it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.75it/s][A[A

avg_train_loss 0.4099707920104265
train_f1Score 0.8615384615384615
train_accuracy 0.859375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.52it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.26it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.05it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.02it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.99it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.64it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.61it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.55it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.62it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.57it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.68it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.76it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.83it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.89it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.88it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.74it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.63it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.61it/

Validation Loss:  0.6965522813360866
Validation Accuracy:  0.6178861788617886
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 44.46it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 44.59it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.54it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.59it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.55it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.70it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.47it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.50it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.46it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.56it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.50it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.64it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.74it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.80it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.83it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.52it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.54it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.48it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 44.67it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 44.73it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.63it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.69it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.45it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.30it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.51it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.55it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.70it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.67it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.42it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.47it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.57it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.67it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.67it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.80it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.71it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.73it/



Training...




 50%|█████     | 4/8 [00:00<00:00, 10.42it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.35it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.27it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.57it/s][A[A

avg_train_loss 0.27938369009643793
train_f1Score 0.9180327868852458
train_accuracy 0.921875

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.25it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.15it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 44.96it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 44.75it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.77it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.72it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.77it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.75it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.83it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.81it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.61it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.51it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.64it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.65it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.80it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.71it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.84it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.68it/

Validation Loss:  0.7642469804097967
Validation Accuracy:  0.616869918699187


Training...




 50%|█████     | 4/8 [00:00<00:00, 10.29it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.23it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.13it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.33it/s][A[A

avg_train_loss 0.2416942873969674
train_f1Score 0.9333333333333333
train_accuracy 0.9375

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.18it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.11it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.05it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 44.95it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.95it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.86it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.91it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.76it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.87it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.82it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.83it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.87it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.83it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.84it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.78it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.83it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.80it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.84it/

Validation Loss:  0.9330049436387977
Validation Accuracy:  0.6280487804878049
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.15it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.10it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.98it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.95it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.87it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.90it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.90it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.83it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.87it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.81it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.84it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.79it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.84it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.73it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.75it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.80it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.75it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.79it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 45.24it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.03it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.08it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.92it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.89it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.88it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.83it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.88it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.81it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.61it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.56it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.73it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.67it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.85it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.75it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.76it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.57it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.49it/



Training...




 38%|███▊      | 3/8 [00:00<00:00, 10.22it/s][A[A

 50%|█████     | 4/8 [00:00<00:00, 10.14it/s][A[A

 62%|██████▎   | 5/8 [00:00<00:00, 10.08it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.04it/s][A[A

 88%|████████▊ | 7/8 [00:00<00:00, 10.01it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.03it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.07it/s][A[A

avg_train_loss 0.23479019198566675
train_f1Score 0.9206349206349206
train_accuracy 0.921875

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.06it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.08it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 44.99it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 44.99it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.90it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.91it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.92it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.85it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.88it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.83it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.88it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.83it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.86it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.83it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.86it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.82it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.87it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.82it/

Validation Loss:  1.0178559758071977
Validation Accuracy:  0.6239837398373984


Training...




 50%|█████     | 4/8 [00:00<00:00, 10.37it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.30it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.19it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.51it/s][A[A

avg_train_loss 0.22964584175497293
train_f1Score 0.9206349206349206
train_accuracy 0.921875

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.35it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.27it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.14it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.06it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.96it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.96it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.88it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.89it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.83it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.86it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.80it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.83it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.79it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.84it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.83it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.77it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.81it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.78it/

Validation Loss:  1.0223667839678323
Validation Accuracy:  0.6300813008130082
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.06it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 44.96it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.94it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.86it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.89it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.82it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.83it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.79it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.85it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.86it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.90it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.89it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.84it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.86it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.82it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.84it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.78it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.81it/

Best mF1Score....




  4%|▍         | 10/247 [00:00<00:05, 45.29it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.23it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 45.12it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 45.08it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.99it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.96it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.86it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.82it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.83it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.85it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.80it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.84it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.80it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.83it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.79it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.83it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.80it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.72it/



Training...




 38%|███▊      | 3/8 [00:00<00:00, 10.26it/s][A[A

 50%|█████     | 4/8 [00:00<00:00, 10.15it/s][A[A

 62%|██████▎   | 5/8 [00:00<00:00, 10.08it/s][A[A

 75%|███████▌  | 6/8 [00:00<00:00, 10.04it/s][A[A

 88%|████████▊ | 7/8 [00:00<00:00, 10.01it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.03it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.06it/s][A[A

avg_train_loss 0.1877595642581582
train_f1Score 0.9508196721311475
train_accuracy 0.953125

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.00it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 44.97it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 44.90it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 44.87it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 44.80it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.83it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.79it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.82it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.78it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.82it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.85it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.80it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.82it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.79it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.83it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.77it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.78it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.76it/

Validation Loss:  1.016498558404969
Validation Accuracy:  0.6270325203252033


Training...




 38%|███▊      | 3/8 [00:00<00:00, 10.23it/s][A[A

 62%|██████▎   | 5/8 [00:00<00:00, 10.23it/s][A[A

100%|██████████| 8/8 [00:00<00:00, 10.23it/s][A[A


  0%|          | 0/123 [00:00<?, ?it/s][A[A

  4%|▍         | 5/123 [00:00<00:02, 45.56it/s][A[A

avg_train_loss 0.1775457877665758
train_f1Score 0.9508196721311475
train_accuracy 0.953125

Running Validation...




  8%|▊         | 10/123 [00:00<00:02, 45.32it/s][A[A

 12%|█▏        | 15/123 [00:00<00:02, 45.26it/s][A[A

 16%|█▋        | 20/123 [00:00<00:02, 45.20it/s][A[A

 20%|██        | 25/123 [00:00<00:02, 45.04it/s][A[A

 24%|██▍       | 30/123 [00:00<00:02, 45.03it/s][A[A

 28%|██▊       | 35/123 [00:00<00:01, 44.92it/s][A[A

 33%|███▎      | 40/123 [00:00<00:01, 44.84it/s][A[A

 37%|███▋      | 45/123 [00:01<00:01, 44.77it/s][A[A

 41%|████      | 50/123 [00:01<00:01, 44.82it/s][A[A

 45%|████▍     | 55/123 [00:01<00:01, 44.87it/s][A[A

 49%|████▉     | 60/123 [00:01<00:01, 44.85it/s][A[A

 53%|█████▎    | 65/123 [00:01<00:01, 44.85it/s][A[A

 57%|█████▋    | 70/123 [00:01<00:01, 44.78it/s][A[A

 61%|██████    | 75/123 [00:01<00:01, 44.80it/s][A[A

 65%|██████▌   | 80/123 [00:01<00:00, 44.76it/s][A[A

 69%|██████▉   | 85/123 [00:01<00:00, 44.82it/s][A[A

 73%|███████▎  | 90/123 [00:02<00:00, 44.78it/s][A[A

 77%|███████▋  | 95/123 [00:02<00:00, 44.81it/

Validation Loss:  1.0110588555898123
Validation Accuracy:  0.6290650406504065
Testing Model....




  4%|▍         | 10/247 [00:00<00:05, 45.27it/s][A[A

  6%|▌         | 15/247 [00:00<00:05, 45.13it/s][A[A

  8%|▊         | 20/247 [00:00<00:05, 44.99it/s][A[A

 10%|█         | 25/247 [00:00<00:04, 44.98it/s][A[A

 12%|█▏        | 30/247 [00:00<00:04, 44.88it/s][A[A

 14%|█▍        | 35/247 [00:00<00:04, 44.90it/s][A[A

 16%|█▌        | 40/247 [00:00<00:04, 44.81it/s][A[A

 18%|█▊        | 45/247 [00:01<00:04, 44.85it/s][A[A

 20%|██        | 50/247 [00:01<00:04, 44.86it/s][A[A

 22%|██▏       | 55/247 [00:01<00:04, 44.81it/s][A[A

 24%|██▍       | 60/247 [00:01<00:04, 44.84it/s][A[A

 26%|██▋       | 65/247 [00:01<00:04, 44.80it/s][A[A

 28%|██▊       | 70/247 [00:01<00:03, 44.82it/s][A[A

 30%|███       | 75/247 [00:01<00:03, 44.77it/s][A[A

 32%|███▏      | 80/247 [00:01<00:03, 44.82it/s][A[A

 34%|███▍      | 85/247 [00:01<00:03, 44.78it/s][A[A

 36%|███▋      | 90/247 [00:02<00:03, 44.82it/s][A[A

 38%|███▊      | 95/247 [00:02<00:03, 44.78it/

Saving Test Metrics....
Run:  3
	Initialising Model....




100%|██████████| 64/64 [00:00<00:00, 3644.99it/s]


  0%|          | 0/991 [00:00<?, ?it/s][A[A

 34%|███▍      | 337/991 [00:00<00:00, 3367.08it/s][A[A

	Loading Dataset....




100%|██████████| 991/991 [00:00<00:00, 3413.28it/s][A[A


  0%|          | 0/1983 [00:00<?, ?it/s][A[A

 17%|█▋        | 347/1983 [00:00<00:00, 3460.42it/s][A[A

 36%|███▌      | 711/1983 [00:00<00:00, 3506.29it/s][A[A

 54%|█████▍    | 1080/1983 [00:00<00:00, 3556.59it/s][A[A

 73%|███████▎  | 1450/1983 [00:00<00:00, 3591.04it/s][A[A

100%|██████████| 1983/1983 [00:00<00:00, 3562.60it/s][A[A


100%|██████████| 64/64 [00:00<00:00, 3661.65it/s]


  0%|          | 0/991 [00:00<?, ?it/s][A[A

 37%|███▋      | 368/991 [00:00<00:00, 3678.24it/s][A[A



	Training Starts....


100%|██████████| 991/991 [00:00<00:00, 3810.67it/s][A[A


  0%|          | 0/1983 [00:00<?, ?it/s][A[A

 21%|██        | 407/1983 [00:00<00:00, 4065.02it/s][A[A

 41%|████      | 817/1983 [00:00<00:00, 4075.38it/s][A[A

 62%|██████▏   | 1225/1983 [00:00<00:00, 4075.64it/s][A[A

100%|██████████| 1983/1983 [00:00<00:00, 4021.30it/s][A[A
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertFor

## Spanish Few Data

In [None]:
run_args={
    'model_name':'few_shot',
    'data_path':'Data_Processed/AMI-Spanish/',
    'train_cnt':256,
    'res_base_path': 'Results/AMI-Spanish/fewData_fewShot/',
    'model_save_path': 'Saved_Models/AMI-Spanish/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'batch_size': 8,
        'bert_model': "bert-base-multilingual-cased",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "Saved_Models/Shared_Task_eng_1/best_bert_3_all.pt",
    }
run_part(run_args,model_args)

## Hindi Few Data

In [None]:
run_args={
    'model_name':'few_shot',
    'data_path':'Data_Processed/Shared_Task_hin/',
    'train_cnt':256,
    'res_base_path': 'Results/Shared_Task_hin/fewData_fewShot/',
    'model_save_path': 'Saved_Models/Shared_Task_hin/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'batch_size': 8,
        'bert_model': "bert-base-multilingual-cased",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "Saved_Models/Shared_Task_eng_1/best_bert_3_all.pt",
    }
run_part(run_args,model_args)

## Bengali Few Data

In [None]:
run_args={
    'model_name':'few_shot',
    'data_path':'Data_Processed/Shared_Task_iben/',
    'train_cnt':256,
    'res_base_path': 'Results/Shared_Task_iben/fewData_fewShot/',
    'model_save_path': 'Saved_Models/Shared_Task_iben/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'batch_size': 8,
        'bert_model': "bert-base-multilingual-cased",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "Saved_Models/Shared_Task_eng_1/best_bert_3_all.pt",
    }
run_part(run_args,model_args)

## English

In [None]:
run_args={
    'model_name':'few_shot_xlm',
    'data_path':'Data_Processed/Shared_Task_eng/',
    'train_cnt':256,
    'res_base_path': 'Results/Shared_Task_eng/all_but_one/',
    'model_save_path': 'Saved_Models/Shared_Task_eng/',
    'isArabic': False,
}

model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 8.0],
        'save_model': False,
        'model_path': 'Saved_Models/Shared_Task_eng/all_but_one/best_bert_xlm_roberta_4_all.pt',
        'isArabic': False,
        'model_save_path': '',
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':False,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':False,
            'weights':[1.0,8.0],
            'frac':0.8
        }
    }

for train_cnt in [32,64,128,256,512]:
    print("Train cnt: ",train_cnt)
    run_args['train_cnt']=train_cnt
    run_part(run_args,model_args,train_cnt)