In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from data_cleaning import Data_Preprocessing
from arabert.preprocess import ArabertPreprocessor

In [2]:
# Core
import random

# Basics
import numpy as np
import pandas as pd
import torch

# Metrics
from sklearn.metrics import *

# Tokeniser
from transformers import BertTokenizer

# Utility
from tqdm import tqdm

# Dataloader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Scheduler
from transformers import get_linear_schedule_with_warmup

# Optimiser
from transformers import AdamW

# Model
from transformers import BertForSequenceClassification
import torch.nn as nn

class BERT:
    def __init__(self,args):
        # fix the random
        random.seed(args['seed_val'])
        np.random.seed(args['seed_val'])
        torch.manual_seed(args['seed_val'])
        torch.cuda.manual_seed_all(args['seed_val'])
        
        # set device
        self.device = torch.device(args['device'])

        self.weights=args['weights']
        
        # initiliase tokeniser
        self.tokenizer = BertTokenizer.from_pretrained(args['bert_model'])

        self.model_save_path = args['model_save_path']
        self.name = args['name']
        
    ##-----------------------------------------------------------##
    ##----------------- Utility Functions -----------------------##
    ##-----------------------------------------------------------##
    def encode(self,data,max_len):
        input_ids = []
        attention_masks = []
        for sent in tqdm(data):
            # use in-built tokeniser of Bert
            encoded_dict = self.tokenizer.encode_plus(
                            sent,
                            add_special_tokens =True, # for [CLS] and [SEP]
                            max_length = max_len,
                            truncation = True,
                            padding = 'max_length',
                            return_attention_mask = True,
                            return_tensors = 'pt', # return pytorch tensors
            )
            input_ids.append(encoded_dict['input_ids'])
            # attention masks notify where padding has been added 
            # and where is the sentence
            attention_masks.append(encoded_dict['attention_mask'])
        
        return [input_ids,attention_masks]
    
    ##-----------------------------------------------------------##
    ##------------------ Dataloader -----------------------------##
    ##-----------------------------------------------------------##
    def get_dataloader(self,samples, batch_size,is_train=False):
        inputs,masks,labels = samples

        # Convert the lists into tensors.
        inputs = torch.cat(inputs, dim=0)
        masks = torch.cat(masks, dim=0)
        labels = torch.tensor(labels)

        # convert to dataset
        data = TensorDataset(inputs,masks,labels)

        if(is_train==False):
            # use random sampler for training to shuffle
            # train data
            sampler = SequentialSampler(data)
        else:
            # order does not matter for validation as we just 
            # need the metrics
            sampler = RandomSampler(data)  

        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size,drop_last=True)

        return dataloader
    
    ##-----------------------------------------------------------##
    ##----------------- Training Utilities ----------------------##
    ##-----------------------------------------------------------## 
    def get_optimiser(self,learning_rate,model):
        # using AdamW optimiser from transformers library
        return AdamW(model.parameters(),
                  lr = learning_rate, 
                  eps = 1e-8
                )
    
    def get_scheduler(self,epochs,optimiser,train_dl):
        total_steps = len(train_dl) * epochs
        return get_linear_schedule_with_warmup(optimiser, 
                num_warmup_steps = 0, 
                num_training_steps = total_steps)
    
    def evalMetric(self, y_true, y_pred, prefix):
        # calculate all the metrics and add prefix to them
        # before saving in dictionary
        accuracy = accuracy_score(y_true, y_pred)
        mf1Score = f1_score(y_true, y_pred, average='macro')
        f1Score = f1_score(y_true, y_pred)
        area_under_c = roc_auc_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)

        nonhate_f1Score = f1_score(y_true, y_pred, pos_label=0)
        non_recallScore = recall_score(y_true, y_pred, pos_label=0)
        non_precisionScore = precision_score(y_true, y_pred, pos_label=0)
        return {prefix+"accuracy": accuracy, prefix+'mF1Score': mf1Score, 
            prefix+'f1Score': f1Score, prefix+'auc': area_under_c,
            prefix+'precision': precisionScore, 
            prefix+'recall': recallScore, 
            prefix+'non_hatef1Score': nonhate_f1Score, 
            prefix+'non_recallScore': non_recallScore, 
            prefix+'non_precisionScore': non_precisionScore}
    
    ##-----------------------------------------------------------##
    ##---------------- Different Train Loops --------------------##
    ##-----------------------------------------------------------## 
    def evaluate(self,model,loader,which):
        # to evaluate model on test and validation set

        model.eval() # put model in eval mode

        # maintain total loss to save in metrics
        total_eval_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(loader):
            # separate input, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            with torch.no_grad(): # do not construct compute graph
                outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
            # output is always a tuple, thus we have to 
            # separate it manually
            loss = outputs[0]
            logits = outputs[1]

            # add the current loss
            # loss.item() extracts loss value as a float
            total_eval_loss += loss.item()

            # calculate true labels and convert it into numpy array
            b_y_true = b_labels.cpu().data.squeeze().numpy()
            
            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

        # calculate metrics
        metrics = self.evalMetric(y_true,y_pred,which+"_")

        # Calculate the average loss over all of the batches.
        avg_loss = total_eval_loss / len(loader)
        # add it to the metric
        metrics[which+'_avg_loss'] = avg_loss

        return metrics
    
    
    def run_train_loop(self,model,train_loader,optimiser,scheduler):

        model.train() # put model in train mode

        # maintain total loss to add to metric
        total_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(train_loader):
            # separate inputs, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            # Ref: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch#:~:text=In%20PyTorch%20%2C%20we%20need%20to,backward()%20call.
            model.zero_grad()                

            outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

            # outputs is always returned as tuple
            # Separate it manually
            logits = outputs[1]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float)).to(self.device)
            
            loss = loss_fct(logits,b_labels)
            
            # calculate current loss
            # loss.item() extracts loss value as a float
            total_loss += loss.item()

            # Back-propagation
            loss.backward()

            # calculate true labels
            b_y_true = b_labels.cpu().data.squeeze().numpy()

            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

            # clip gradient to prevent exploding gradient
            # problems
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # gradient descent
            optimiser.step()
            
            # schedule learning rate accordingly
            scheduler.step()

        # calculate avg loss 
        avg_train_loss = total_loss / len(train_loader)

        # calculate metrics
        train_metrics = self.evalMetric(y_true,y_pred,"Train_")
        
        # print results
        print('avg_train_loss',avg_train_loss)
        print('train_f1Score',train_metrics['Train_f1Score'])
        print('train_accuracy',train_metrics['Train_accuracy'])

        # add loss to metrics
        train_metrics['Train_avg_loss'] = avg_train_loss

        return train_metrics
    
    
    ##------------------------------------------------------------##
    ##----------------- Main Train Loop --------------------------##
    ##------------------------------------------------------------##
    def train(self,model,data_loaders,optimiser,scheduler,epochs,save_model):
        # save train stats per epoch
        train_stats = []
        train_loader,val_loader,test_loader = data_loaders
        # maintain best mF1 Score to save best model
        best_mf1Score=-1.0
        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            
            print("")
            print('Training...')
            # run trian loop
            train_metrics = self.run_train_loop(model,train_loader,
                                            optimiser,scheduler)

            print("")
            print("Running Validation...") 
            # test on validation set
            val_metrics = self.evaluate(model,val_loader,"Val")
            
            print("Validation Loss: ",val_metrics['Val_avg_loss'])
            print("Validation Accuracy: ",val_metrics['Val_accuracy'])
            
            stats = {}

            # save model where validation mF1Score is best
            if(val_metrics['Val_mF1Score']>best_mf1Score):
                best_mf1Score=val_metrics['Val_mF1Score']
                if(save_model):
                    torch.save(model.state_dict(), self.model_save_path+
                        '/best_bert_'+self.name+'.pt')
                # evaluate best model on test set
                test_metrics = self.evaluate(model,test_loader,"Test")

            stats['epoch']=epoch_i+1

            # add train and val metrics of the epoch to 
            # same dictionary
            stats.update(train_metrics)
            stats.update(val_metrics)

            train_stats.append(stats)

        return train_stats,test_metrics
    
    ##-----------------------------------------------------------##
    ##----------------------- Main Pipeline ---------------------##
    ##-----------------------------------------------------------##
    def run(self,args,df_train,df_val,df_test):
        # get X and Y data points 
        X_train = df_train['Text'].values
        Y_train = df_train['Label'].values
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values
        X_val = df_val['Text'].values
        Y_val = df_val['Label'].values
        
        # encode data
        # returns list of data and attention masks
        train_data = self.encode(X_train,args['max_len'])
        val_data = self.encode(X_val,args['max_len'])
        test_data = self.encode(X_test,args['max_len'])
        
        # add labels to data so that we can send them to
        # dataloader function together
        train_data.append(Y_train)
        val_data.append(Y_val)
        test_data.append(Y_test)
        
        # convert to dataloader
        train_dl =self.get_dataloader(train_data,args['batch_size'],True)
        val_dl =self.get_dataloader(val_data,args['batch_size'])                          
        test_dl =self.get_dataloader(test_data,args['batch_size'])
        
        # intialise model
        model = BertForSequenceClassification.from_pretrained(
                args['bert_model'], 
                num_labels = 2, 
                output_attentions = False, # Whether the model returns attentions weights.
                output_hidden_states = False, # Whether the model returns all hidden-states.
            )
        model.to(self.device)
        
        optimiser = self.get_optimiser(args['learning_rate'],model)
        
        scheduler = self.get_scheduler(args['epochs'],optimiser,train_dl)
        
        # Run train loop and evaluate on validation data set
        # on each epoch. Store best model from all epochs 
        # (best mF1 Score on Val set) and evaluate it on
        # test set
        train_stats,train_metrics = self.train(model,[train_dl,val_dl,test_dl],
                                optimiser,scheduler,args['epochs'],args['save_model'])
        
        return train_stats,train_metrics
        
    ##-----------------------------------------------------------##
    ##-------------------- Other Utilities ----------------------##
    ##-----------------------------------------------------------##
    def run_test(self,model,df_test,args):
        # to evaluate test set on the final saved model
        # to retrieve results if necessary
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values

        test_data = self.encode(X_test,args['max_len'])

        test_data.append(Y_test)

        test_dl =self.get_dataloader(test_data,32)

        metrics = self.evaluate(model,test_dl,"Test")

        return metrics
    
    def load_model(self,path,args):
        # load saved best model
#         config = BertConfig.from_pretrained(args['bert_model'])
        saved_model = BertForSequenceClassification.from_pretrained(
                args['bert_model'], 
                num_labels = 2, 
                output_attentions = False, # Whether the model returns attentions weights.
                output_hidden_states = False, # Whether the model returns all hidden-states.
            )
#         saved_model.bert.embeddings.word_embeddings=torch.nn.Embedding(64000,768,padding_idx=0)
        
        saved_model.load_state_dict(torch.load(path))
        
        return saved_model

In [3]:
def preprocess(df,isArabic):
    
    X = df['Text']
    X_new=[]
    if(isArabic):
        prep = ArabertPreprocessor('bert-base-arabertv02')
        for text in tqdm(X):
            text = prep.preprocess(text)
            X_new.append(text)
    else:
        processer = Data_Preprocessing()
        for text in tqdm(X):
            text= processer.removeEmojis(text)
            text = processer.removeUrls(text)
            text=processer.removeSpecialChar(text)
            X_new.append(text)

    df['Text']=X_new
    return df 

In [4]:
def load_dataset(args,data_path,index):
    # read dataframes
    df_test = pd.read_csv(data_path+'test_'+str(index)+'.csv')

    # clean data
    df_test=preprocess(df_test,args['isArabic'])

    return df_test

In [5]:
def one_shot_output(model_path,data_path,obj,args):
    saved_model=obj.load_model(model_path,args)
    device = torch.device(args['device'])
    saved_model=saved_model.to(device)
    
    all_metrics=[]
    avg_metrics={}
    
    # preprocessing
    for fold in [1,2,3,4,5]:
        df = load_dataset(args,data_path,fold)

        metrics = obj.run_test(saved_model,df,args)
        
        for key,value in metrics.items():
            if(key not in avg_metrics):
                avg_metrics[key]=value
            else:
                avg_metrics[key]+=value
        
        all_metrics.append(metrics)
            
    
    for key,value in avg_metrics.items():
        avg_metrics[key]/=5
    
    return all_metrics,avg_metrics

In [23]:
DATA_PATH = "Data_Processed/Let-Mi/all.csv"
MODEL_PATH = "Saved_Models/Let-Mi/best_bert_xlm_roberta_1_all.pt"

args={
        'seed_val': 42,
        'batch_size': 8,
        'bert_model': "bert-base-multilingual-cased",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'name': 'bert_one_shot',
        'isArabic': True,
    }

model = BERT(args)

metrics = one_shot_output(MODEL_PATH,DATA_PATH,model,args)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

RuntimeError: Error(s) in loading state_dict for BertForSequenceClassification:
	Missing key(s) in state_dict: "bert.embeddings.position_ids", "bert.embeddings.word_embeddings.weight", "bert.embeddings.position_embeddings.weight", "bert.embeddings.token_type_embeddings.weight", "bert.embeddings.LayerNorm.weight", "bert.embeddings.LayerNorm.bias", "bert.encoder.layer.0.attention.self.query.weight", "bert.encoder.layer.0.attention.self.query.bias", "bert.encoder.layer.0.attention.self.key.weight", "bert.encoder.layer.0.attention.self.key.bias", "bert.encoder.layer.0.attention.self.value.weight", "bert.encoder.layer.0.attention.self.value.bias", "bert.encoder.layer.0.attention.output.dense.weight", "bert.encoder.layer.0.attention.output.dense.bias", "bert.encoder.layer.0.attention.output.LayerNorm.weight", "bert.encoder.layer.0.attention.output.LayerNorm.bias", "bert.encoder.layer.0.intermediate.dense.weight", "bert.encoder.layer.0.intermediate.dense.bias", "bert.encoder.layer.0.output.dense.weight", "bert.encoder.layer.0.output.dense.bias", "bert.encoder.layer.0.output.LayerNorm.weight", "bert.encoder.layer.0.output.LayerNorm.bias", "bert.encoder.layer.1.attention.self.query.weight", "bert.encoder.layer.1.attention.self.query.bias", "bert.encoder.layer.1.attention.self.key.weight", "bert.encoder.layer.1.attention.self.key.bias", "bert.encoder.layer.1.attention.self.value.weight", "bert.encoder.layer.1.attention.self.value.bias", "bert.encoder.layer.1.attention.output.dense.weight", "bert.encoder.layer.1.attention.output.dense.bias", "bert.encoder.layer.1.attention.output.LayerNorm.weight", "bert.encoder.layer.1.attention.output.LayerNorm.bias", "bert.encoder.layer.1.intermediate.dense.weight", "bert.encoder.layer.1.intermediate.dense.bias", "bert.encoder.layer.1.output.dense.weight", "bert.encoder.layer.1.output.dense.bias", "bert.encoder.layer.1.output.LayerNorm.weight", "bert.encoder.layer.1.output.LayerNorm.bias", "bert.encoder.layer.2.attention.self.query.weight", "bert.encoder.layer.2.attention.self.query.bias", "bert.encoder.layer.2.attention.self.key.weight", "bert.encoder.layer.2.attention.self.key.bias", "bert.encoder.layer.2.attention.self.value.weight", "bert.encoder.layer.2.attention.self.value.bias", "bert.encoder.layer.2.attention.output.dense.weight", "bert.encoder.layer.2.attention.output.dense.bias", "bert.encoder.layer.2.attention.output.LayerNorm.weight", "bert.encoder.layer.2.attention.output.LayerNorm.bias", "bert.encoder.layer.2.intermediate.dense.weight", "bert.encoder.layer.2.intermediate.dense.bias", "bert.encoder.layer.2.output.dense.weight", "bert.encoder.layer.2.output.dense.bias", "bert.encoder.layer.2.output.LayerNorm.weight", "bert.encoder.layer.2.output.LayerNorm.bias", "bert.encoder.layer.3.attention.self.query.weight", "bert.encoder.layer.3.attention.self.query.bias", "bert.encoder.layer.3.attention.self.key.weight", "bert.encoder.layer.3.attention.self.key.bias", "bert.encoder.layer.3.attention.self.value.weight", "bert.encoder.layer.3.attention.self.value.bias", "bert.encoder.layer.3.attention.output.dense.weight", "bert.encoder.layer.3.attention.output.dense.bias", "bert.encoder.layer.3.attention.output.LayerNorm.weight", "bert.encoder.layer.3.attention.output.LayerNorm.bias", "bert.encoder.layer.3.intermediate.dense.weight", "bert.encoder.layer.3.intermediate.dense.bias", "bert.encoder.layer.3.output.dense.weight", "bert.encoder.layer.3.output.dense.bias", "bert.encoder.layer.3.output.LayerNorm.weight", "bert.encoder.layer.3.output.LayerNorm.bias", "bert.encoder.layer.4.attention.self.query.weight", "bert.encoder.layer.4.attention.self.query.bias", "bert.encoder.layer.4.attention.self.key.weight", "bert.encoder.layer.4.attention.self.key.bias", "bert.encoder.layer.4.attention.self.value.weight", "bert.encoder.layer.4.attention.self.value.bias", "bert.encoder.layer.4.attention.output.dense.weight", "bert.encoder.layer.4.attention.output.dense.bias", "bert.encoder.layer.4.attention.output.LayerNorm.weight", "bert.encoder.layer.4.attention.output.LayerNorm.bias", "bert.encoder.layer.4.intermediate.dense.weight", "bert.encoder.layer.4.intermediate.dense.bias", "bert.encoder.layer.4.output.dense.weight", "bert.encoder.layer.4.output.dense.bias", "bert.encoder.layer.4.output.LayerNorm.weight", "bert.encoder.layer.4.output.LayerNorm.bias", "bert.encoder.layer.5.attention.self.query.weight", "bert.encoder.layer.5.attention.self.query.bias", "bert.encoder.layer.5.attention.self.key.weight", "bert.encoder.layer.5.attention.self.key.bias", "bert.encoder.layer.5.attention.self.value.weight", "bert.encoder.layer.5.attention.self.value.bias", "bert.encoder.layer.5.attention.output.dense.weight", "bert.encoder.layer.5.attention.output.dense.bias", "bert.encoder.layer.5.attention.output.LayerNorm.weight", "bert.encoder.layer.5.attention.output.LayerNorm.bias", "bert.encoder.layer.5.intermediate.dense.weight", "bert.encoder.layer.5.intermediate.dense.bias", "bert.encoder.layer.5.output.dense.weight", "bert.encoder.layer.5.output.dense.bias", "bert.encoder.layer.5.output.LayerNorm.weight", "bert.encoder.layer.5.output.LayerNorm.bias", "bert.encoder.layer.6.attention.self.query.weight", "bert.encoder.layer.6.attention.self.query.bias", "bert.encoder.layer.6.attention.self.key.weight", "bert.encoder.layer.6.attention.self.key.bias", "bert.encoder.layer.6.attention.self.value.weight", "bert.encoder.layer.6.attention.self.value.bias", "bert.encoder.layer.6.attention.output.dense.weight", "bert.encoder.layer.6.attention.output.dense.bias", "bert.encoder.layer.6.attention.output.LayerNorm.weight", "bert.encoder.layer.6.attention.output.LayerNorm.bias", "bert.encoder.layer.6.intermediate.dense.weight", "bert.encoder.layer.6.intermediate.dense.bias", "bert.encoder.layer.6.output.dense.weight", "bert.encoder.layer.6.output.dense.bias", "bert.encoder.layer.6.output.LayerNorm.weight", "bert.encoder.layer.6.output.LayerNorm.bias", "bert.encoder.layer.7.attention.self.query.weight", "bert.encoder.layer.7.attention.self.query.bias", "bert.encoder.layer.7.attention.self.key.weight", "bert.encoder.layer.7.attention.self.key.bias", "bert.encoder.layer.7.attention.self.value.weight", "bert.encoder.layer.7.attention.self.value.bias", "bert.encoder.layer.7.attention.output.dense.weight", "bert.encoder.layer.7.attention.output.dense.bias", "bert.encoder.layer.7.attention.output.LayerNorm.weight", "bert.encoder.layer.7.attention.output.LayerNorm.bias", "bert.encoder.layer.7.intermediate.dense.weight", "bert.encoder.layer.7.intermediate.dense.bias", "bert.encoder.layer.7.output.dense.weight", "bert.encoder.layer.7.output.dense.bias", "bert.encoder.layer.7.output.LayerNorm.weight", "bert.encoder.layer.7.output.LayerNorm.bias", "bert.encoder.layer.8.attention.self.query.weight", "bert.encoder.layer.8.attention.self.query.bias", "bert.encoder.layer.8.attention.self.key.weight", "bert.encoder.layer.8.attention.self.key.bias", "bert.encoder.layer.8.attention.self.value.weight", "bert.encoder.layer.8.attention.self.value.bias", "bert.encoder.layer.8.attention.output.dense.weight", "bert.encoder.layer.8.attention.output.dense.bias", "bert.encoder.layer.8.attention.output.LayerNorm.weight", "bert.encoder.layer.8.attention.output.LayerNorm.bias", "bert.encoder.layer.8.intermediate.dense.weight", "bert.encoder.layer.8.intermediate.dense.bias", "bert.encoder.layer.8.output.dense.weight", "bert.encoder.layer.8.output.dense.bias", "bert.encoder.layer.8.output.LayerNorm.weight", "bert.encoder.layer.8.output.LayerNorm.bias", "bert.encoder.layer.9.attention.self.query.weight", "bert.encoder.layer.9.attention.self.query.bias", "bert.encoder.layer.9.attention.self.key.weight", "bert.encoder.layer.9.attention.self.key.bias", "bert.encoder.layer.9.attention.self.value.weight", "bert.encoder.layer.9.attention.self.value.bias", "bert.encoder.layer.9.attention.output.dense.weight", "bert.encoder.layer.9.attention.output.dense.bias", "bert.encoder.layer.9.attention.output.LayerNorm.weight", "bert.encoder.layer.9.attention.output.LayerNorm.bias", "bert.encoder.layer.9.intermediate.dense.weight", "bert.encoder.layer.9.intermediate.dense.bias", "bert.encoder.layer.9.output.dense.weight", "bert.encoder.layer.9.output.dense.bias", "bert.encoder.layer.9.output.LayerNorm.weight", "bert.encoder.layer.9.output.LayerNorm.bias", "bert.encoder.layer.10.attention.self.query.weight", "bert.encoder.layer.10.attention.self.query.bias", "bert.encoder.layer.10.attention.self.key.weight", "bert.encoder.layer.10.attention.self.key.bias", "bert.encoder.layer.10.attention.self.value.weight", "bert.encoder.layer.10.attention.self.value.bias", "bert.encoder.layer.10.attention.output.dense.weight", "bert.encoder.layer.10.attention.output.dense.bias", "bert.encoder.layer.10.attention.output.LayerNorm.weight", "bert.encoder.layer.10.attention.output.LayerNorm.bias", "bert.encoder.layer.10.intermediate.dense.weight", "bert.encoder.layer.10.intermediate.dense.bias", "bert.encoder.layer.10.output.dense.weight", "bert.encoder.layer.10.output.dense.bias", "bert.encoder.layer.10.output.LayerNorm.weight", "bert.encoder.layer.10.output.LayerNorm.bias", "bert.encoder.layer.11.attention.self.query.weight", "bert.encoder.layer.11.attention.self.query.bias", "bert.encoder.layer.11.attention.self.key.weight", "bert.encoder.layer.11.attention.self.key.bias", "bert.encoder.layer.11.attention.self.value.weight", "bert.encoder.layer.11.attention.self.value.bias", "bert.encoder.layer.11.attention.output.dense.weight", "bert.encoder.layer.11.attention.output.dense.bias", "bert.encoder.layer.11.attention.output.LayerNorm.weight", "bert.encoder.layer.11.attention.output.LayerNorm.bias", "bert.encoder.layer.11.intermediate.dense.weight", "bert.encoder.layer.11.intermediate.dense.bias", "bert.encoder.layer.11.output.dense.weight", "bert.encoder.layer.11.output.dense.bias", "bert.encoder.layer.11.output.LayerNorm.weight", "bert.encoder.layer.11.output.LayerNorm.bias", "bert.pooler.dense.weight", "bert.pooler.dense.bias". 
	Unexpected key(s) in state_dict: "roberta.embeddings.position_ids", "roberta.embeddings.word_embeddings.weight", "roberta.embeddings.position_embeddings.weight", "roberta.embeddings.token_type_embeddings.weight", "roberta.embeddings.LayerNorm.weight", "roberta.embeddings.LayerNorm.bias", "roberta.encoder.layer.0.attention.self.query.weight", "roberta.encoder.layer.0.attention.self.query.bias", "roberta.encoder.layer.0.attention.self.key.weight", "roberta.encoder.layer.0.attention.self.key.bias", "roberta.encoder.layer.0.attention.self.value.weight", "roberta.encoder.layer.0.attention.self.value.bias", "roberta.encoder.layer.0.attention.output.dense.weight", "roberta.encoder.layer.0.attention.output.dense.bias", "roberta.encoder.layer.0.attention.output.LayerNorm.weight", "roberta.encoder.layer.0.attention.output.LayerNorm.bias", "roberta.encoder.layer.0.intermediate.dense.weight", "roberta.encoder.layer.0.intermediate.dense.bias", "roberta.encoder.layer.0.output.dense.weight", "roberta.encoder.layer.0.output.dense.bias", "roberta.encoder.layer.0.output.LayerNorm.weight", "roberta.encoder.layer.0.output.LayerNorm.bias", "roberta.encoder.layer.1.attention.self.query.weight", "roberta.encoder.layer.1.attention.self.query.bias", "roberta.encoder.layer.1.attention.self.key.weight", "roberta.encoder.layer.1.attention.self.key.bias", "roberta.encoder.layer.1.attention.self.value.weight", "roberta.encoder.layer.1.attention.self.value.bias", "roberta.encoder.layer.1.attention.output.dense.weight", "roberta.encoder.layer.1.attention.output.dense.bias", "roberta.encoder.layer.1.attention.output.LayerNorm.weight", "roberta.encoder.layer.1.attention.output.LayerNorm.bias", "roberta.encoder.layer.1.intermediate.dense.weight", "roberta.encoder.layer.1.intermediate.dense.bias", "roberta.encoder.layer.1.output.dense.weight", "roberta.encoder.layer.1.output.dense.bias", "roberta.encoder.layer.1.output.LayerNorm.weight", "roberta.encoder.layer.1.output.LayerNorm.bias", "roberta.encoder.layer.2.attention.self.query.weight", "roberta.encoder.layer.2.attention.self.query.bias", "roberta.encoder.layer.2.attention.self.key.weight", "roberta.encoder.layer.2.attention.self.key.bias", "roberta.encoder.layer.2.attention.self.value.weight", "roberta.encoder.layer.2.attention.self.value.bias", "roberta.encoder.layer.2.attention.output.dense.weight", "roberta.encoder.layer.2.attention.output.dense.bias", "roberta.encoder.layer.2.attention.output.LayerNorm.weight", "roberta.encoder.layer.2.attention.output.LayerNorm.bias", "roberta.encoder.layer.2.intermediate.dense.weight", "roberta.encoder.layer.2.intermediate.dense.bias", "roberta.encoder.layer.2.output.dense.weight", "roberta.encoder.layer.2.output.dense.bias", "roberta.encoder.layer.2.output.LayerNorm.weight", "roberta.encoder.layer.2.output.LayerNorm.bias", "roberta.encoder.layer.3.attention.self.query.weight", "roberta.encoder.layer.3.attention.self.query.bias", "roberta.encoder.layer.3.attention.self.key.weight", "roberta.encoder.layer.3.attention.self.key.bias", "roberta.encoder.layer.3.attention.self.value.weight", "roberta.encoder.layer.3.attention.self.value.bias", "roberta.encoder.layer.3.attention.output.dense.weight", "roberta.encoder.layer.3.attention.output.dense.bias", "roberta.encoder.layer.3.attention.output.LayerNorm.weight", "roberta.encoder.layer.3.attention.output.LayerNorm.bias", "roberta.encoder.layer.3.intermediate.dense.weight", "roberta.encoder.layer.3.intermediate.dense.bias", "roberta.encoder.layer.3.output.dense.weight", "roberta.encoder.layer.3.output.dense.bias", "roberta.encoder.layer.3.output.LayerNorm.weight", "roberta.encoder.layer.3.output.LayerNorm.bias", "roberta.encoder.layer.4.attention.self.query.weight", "roberta.encoder.layer.4.attention.self.query.bias", "roberta.encoder.layer.4.attention.self.key.weight", "roberta.encoder.layer.4.attention.self.key.bias", "roberta.encoder.layer.4.attention.self.value.weight", "roberta.encoder.layer.4.attention.self.value.bias", "roberta.encoder.layer.4.attention.output.dense.weight", "roberta.encoder.layer.4.attention.output.dense.bias", "roberta.encoder.layer.4.attention.output.LayerNorm.weight", "roberta.encoder.layer.4.attention.output.LayerNorm.bias", "roberta.encoder.layer.4.intermediate.dense.weight", "roberta.encoder.layer.4.intermediate.dense.bias", "roberta.encoder.layer.4.output.dense.weight", "roberta.encoder.layer.4.output.dense.bias", "roberta.encoder.layer.4.output.LayerNorm.weight", "roberta.encoder.layer.4.output.LayerNorm.bias", "roberta.encoder.layer.5.attention.self.query.weight", "roberta.encoder.layer.5.attention.self.query.bias", "roberta.encoder.layer.5.attention.self.key.weight", "roberta.encoder.layer.5.attention.self.key.bias", "roberta.encoder.layer.5.attention.self.value.weight", "roberta.encoder.layer.5.attention.self.value.bias", "roberta.encoder.layer.5.attention.output.dense.weight", "roberta.encoder.layer.5.attention.output.dense.bias", "roberta.encoder.layer.5.attention.output.LayerNorm.weight", "roberta.encoder.layer.5.attention.output.LayerNorm.bias", "roberta.encoder.layer.5.intermediate.dense.weight", "roberta.encoder.layer.5.intermediate.dense.bias", "roberta.encoder.layer.5.output.dense.weight", "roberta.encoder.layer.5.output.dense.bias", "roberta.encoder.layer.5.output.LayerNorm.weight", "roberta.encoder.layer.5.output.LayerNorm.bias", "roberta.encoder.layer.6.attention.self.query.weight", "roberta.encoder.layer.6.attention.self.query.bias", "roberta.encoder.layer.6.attention.self.key.weight", "roberta.encoder.layer.6.attention.self.key.bias", "roberta.encoder.layer.6.attention.self.value.weight", "roberta.encoder.layer.6.attention.self.value.bias", "roberta.encoder.layer.6.attention.output.dense.weight", "roberta.encoder.layer.6.attention.output.dense.bias", "roberta.encoder.layer.6.attention.output.LayerNorm.weight", "roberta.encoder.layer.6.attention.output.LayerNorm.bias", "roberta.encoder.layer.6.intermediate.dense.weight", "roberta.encoder.layer.6.intermediate.dense.bias", "roberta.encoder.layer.6.output.dense.weight", "roberta.encoder.layer.6.output.dense.bias", "roberta.encoder.layer.6.output.LayerNorm.weight", "roberta.encoder.layer.6.output.LayerNorm.bias", "roberta.encoder.layer.7.attention.self.query.weight", "roberta.encoder.layer.7.attention.self.query.bias", "roberta.encoder.layer.7.attention.self.key.weight", "roberta.encoder.layer.7.attention.self.key.bias", "roberta.encoder.layer.7.attention.self.value.weight", "roberta.encoder.layer.7.attention.self.value.bias", "roberta.encoder.layer.7.attention.output.dense.weight", "roberta.encoder.layer.7.attention.output.dense.bias", "roberta.encoder.layer.7.attention.output.LayerNorm.weight", "roberta.encoder.layer.7.attention.output.LayerNorm.bias", "roberta.encoder.layer.7.intermediate.dense.weight", "roberta.encoder.layer.7.intermediate.dense.bias", "roberta.encoder.layer.7.output.dense.weight", "roberta.encoder.layer.7.output.dense.bias", "roberta.encoder.layer.7.output.LayerNorm.weight", "roberta.encoder.layer.7.output.LayerNorm.bias", "roberta.encoder.layer.8.attention.self.query.weight", "roberta.encoder.layer.8.attention.self.query.bias", "roberta.encoder.layer.8.attention.self.key.weight", "roberta.encoder.layer.8.attention.self.key.bias", "roberta.encoder.layer.8.attention.self.value.weight", "roberta.encoder.layer.8.attention.self.value.bias", "roberta.encoder.layer.8.attention.output.dense.weight", "roberta.encoder.layer.8.attention.output.dense.bias", "roberta.encoder.layer.8.attention.output.LayerNorm.weight", "roberta.encoder.layer.8.attention.output.LayerNorm.bias", "roberta.encoder.layer.8.intermediate.dense.weight", "roberta.encoder.layer.8.intermediate.dense.bias", "roberta.encoder.layer.8.output.dense.weight", "roberta.encoder.layer.8.output.dense.bias", "roberta.encoder.layer.8.output.LayerNorm.weight", "roberta.encoder.layer.8.output.LayerNorm.bias", "roberta.encoder.layer.9.attention.self.query.weight", "roberta.encoder.layer.9.attention.self.query.bias", "roberta.encoder.layer.9.attention.self.key.weight", "roberta.encoder.layer.9.attention.self.key.bias", "roberta.encoder.layer.9.attention.self.value.weight", "roberta.encoder.layer.9.attention.self.value.bias", "roberta.encoder.layer.9.attention.output.dense.weight", "roberta.encoder.layer.9.attention.output.dense.bias", "roberta.encoder.layer.9.attention.output.LayerNorm.weight", "roberta.encoder.layer.9.attention.output.LayerNorm.bias", "roberta.encoder.layer.9.intermediate.dense.weight", "roberta.encoder.layer.9.intermediate.dense.bias", "roberta.encoder.layer.9.output.dense.weight", "roberta.encoder.layer.9.output.dense.bias", "roberta.encoder.layer.9.output.LayerNorm.weight", "roberta.encoder.layer.9.output.LayerNorm.bias", "roberta.encoder.layer.10.attention.self.query.weight", "roberta.encoder.layer.10.attention.self.query.bias", "roberta.encoder.layer.10.attention.self.key.weight", "roberta.encoder.layer.10.attention.self.key.bias", "roberta.encoder.layer.10.attention.self.value.weight", "roberta.encoder.layer.10.attention.self.value.bias", "roberta.encoder.layer.10.attention.output.dense.weight", "roberta.encoder.layer.10.attention.output.dense.bias", "roberta.encoder.layer.10.attention.output.LayerNorm.weight", "roberta.encoder.layer.10.attention.output.LayerNorm.bias", "roberta.encoder.layer.10.intermediate.dense.weight", "roberta.encoder.layer.10.intermediate.dense.bias", "roberta.encoder.layer.10.output.dense.weight", "roberta.encoder.layer.10.output.dense.bias", "roberta.encoder.layer.10.output.LayerNorm.weight", "roberta.encoder.layer.10.output.LayerNorm.bias", "roberta.encoder.layer.11.attention.self.query.weight", "roberta.encoder.layer.11.attention.self.query.bias", "roberta.encoder.layer.11.attention.self.key.weight", "roberta.encoder.layer.11.attention.self.key.bias", "roberta.encoder.layer.11.attention.self.value.weight", "roberta.encoder.layer.11.attention.self.value.bias", "roberta.encoder.layer.11.attention.output.dense.weight", "roberta.encoder.layer.11.attention.output.dense.bias", "roberta.encoder.layer.11.attention.output.LayerNorm.weight", "roberta.encoder.layer.11.attention.output.LayerNorm.bias", "roberta.encoder.layer.11.intermediate.dense.weight", "roberta.encoder.layer.11.intermediate.dense.bias", "roberta.encoder.layer.11.output.dense.weight", "roberta.encoder.layer.11.output.dense.bias", "roberta.encoder.layer.11.output.LayerNorm.weight", "roberta.encoder.layer.11.output.LayerNorm.bias", "roberta.pooler.dense.weight", "roberta.pooler.dense.bias". 

In [19]:
metrics

{'Test_accuracy': 0.5839723926380368,
 'Test_mF1Score': 0.54390418648208,
 'Test_f1Score': 0.4087193460490463,
 'Test_auc': 0.5800991671272986,
 'Test_precision': 0.6836827711941659,
 'Test_recall': 0.2914885347842985,
 'Test_non_hatef1Score': 0.6790890269151139,
 'Test_non_recallScore': 0.8687097994702989,
 'Test_non_precisionScore': 0.5574168487496965,
 'Test_avg_loss': 1.530707430620135}

In [6]:
# base_model_path = "../../HULK/HateModels/Shrinivas/Misogyny-Analysis/"
base_model_path=""

In [7]:
def getOneShotOutput(model_path,data_path):
    args={
        'seed_val': 42,
        'batch_size': 8,
        'bert_model': "bert-base-multilingual-cased",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'name': 'bert_one_shot',
        'isArabic': False,
    }
    
    if('Let-Mi' in data_path):
        args['isArabic']=True

    model = BERT(args)

    all_metrics, metrics = one_shot_output(base_model_path+model_path,data_path,model,args)
    
    return metrics,all_metrics

In [8]:
datasets=['Let-Mi','AMI-2020','AMI-Spanish','Shared_Task_eng',
          'Shared_Task_hin','Shared_Task_iben']

In [9]:
paths={
    'Let-Mi':{
        'model_path':'Saved_Models/Let-Mi/best_bert_1_all.pt',
        'data_path':'Data_Processed/Let-Mi/',
    },
    'AMI-2020':{
        'model_path': 'Saved_Models/AMI-2020/best_bert_1_all.pt',
        'data_path': 'Data_Processed/AMI-2020/',
    },
    'AMI-Spanish':{
        'model_path': 'Saved_Models/AMI-Spanish/best_bert_1_all.pt',
        'data_path': 'Data_Processed/AMI-Spanish/',
    },
    'Shared_Task_eng':{
        'model_path': 'Saved_Models/Shared_Task_eng/best_bert_1_allnew.pt',
        'data_path': 'Data_Processed/Shared_Task_eng/',
    },
    'Shared_Task_iben':{
        'model_path': 'Saved_Models/Shared_Task_iben/best_bert_4_all.pt',
        'data_path': 'Data_Processed/Shared_Task_iben/',
    },
    'Shared_Task_hin':{
        'model_path': 'Saved_Models/Shared_Task_hin/best_bert_5_all.pt',
        'data_path': 'Data_Processed/Shared_Task_hin/',
    },
}

In [10]:
res = []

In [11]:
for dataset1 in datasets:
    for dataset2 in datasets:
        if(dataset1!=dataset2):
            name = dataset1+'_'+dataset2
            metrics = getOneShotOutput(paths[dataset1]['model_path'],
                                      paths[dataset2]['data_path'])
            metrics['Name']=dataset1+'_'+dataset2
            res.append(metrics)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

RuntimeError: Error(s) in loading state_dict for BertForSequenceClassification:
	size mismatch for bert.embeddings.word_embeddings.weight: copying a param with shape torch.Size([64000, 768]) from checkpoint, the shape in current model is torch.Size([119547, 768]).

In [15]:
metrics,all_metrics = getOneShotOutput(paths['Shared_Task_iben']['model_path'],
                                      paths['AMI-Spanish']['data_path'])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [16]:
metrics

{'Test_accuracy': 0.44781250000000006,
 'Test_mF1Score': 0.4303362876251269,
 'Test_f1Score': 0.33092505200430394,
 'Test_auc': 0.44708845384478246,
 'Test_precision': 0.4158416696905797,
 'Test_recall': 0.27506946436065394,
 'Test_non_hatef1Score': 0.5297475232459498,
 'Test_non_recallScore': 0.6191074433289109,
 'Test_non_precisionScore': 0.4630569261776423,
 'Test_avg_loss': 2.5630367112159727}

In [17]:
metrics,all_metrics = getOneShotOutput(paths['Shared_Task_iben']['model_path'],
                                      paths['AMI-2020']['data_path'])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [18]:
metrics

{'Test_accuracy': 0.505288207297726,
 'Test_mF1Score': 0.4593800165680129,
 'Test_f1Score': 0.30191044485540497,
 'Test_auc': 0.49617032307761966,
 'Test_precision': 0.47504922850222375,
 'Test_recall': 0.2214316539800881,
 'Test_non_hatef1Score': 0.6168495882806211,
 'Test_non_recallScore': 0.7709089921751511,
 'Test_non_precisionScore': 0.5141624654457451,
 'Test_avg_loss': 2.4195593771450357}

In [19]:
metrics,all_metrics = getOneShotOutput(paths['Shared_Task_iben']['model_path'],
                                      paths['Let-Mi']['data_path'])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [20]:
metrics

{'Test_accuracy': 0.5048828125,
 'Test_mF1Score': 0.34809105849848926,
 'Test_f1Score': 0.028388843360249594,
 'Test_auc': 0.4986508606676024,
 'Test_precision': 0.46743330199212557,
 'Test_recall': 0.014652260431328057,
 'Test_non_hatef1Score': 0.6677932736367289,
 'Test_non_recallScore': 0.9826494609038768,
 'Test_non_precisionScore': 0.5057544093033222,
 'Test_avg_loss': 2.514556273072958}

In [16]:
from transformers import BertConfig

In [17]:
config = BertConfig.from_pretrained('bert-base-multilingual-cased')

In [18]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

In [24]:
config.vocab_size=64000

In [43]:
        saved_model = BertForSequenceClassification.from_pretrained(
                args['bert_model'], 
                num_labels = 2, 
                output_attentions = False, # Whether the model returns attentions weights.
                output_hidden_states = False, # Whether the model returns all hidden-states.
#                 config=config
            )

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [44]:
saved_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [56]:
saved_model.bert.embeddings.word_embeddings=torch.nn.Embedding(64000,768,padding_idx=0)

In [55]:
torch.nn.Embedding(64000,768,padding_idx=0)

Embedding(119547, 768, padding_idx=0)

## Translated

### Arabic Translated

## XLM Roberta

In [6]:
from data_cleaning import Data_Preprocessing
from arabert.preprocess import ArabertPreprocessor
import pandas as pd
import numpy as np
import torch
import random
from tqdm import tqdm
from transformers import RobertaPreTrainedModel,RobertaModel

In [2]:
!nvidia-smi

Sun Dec 19 20:53:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 470.63.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:61:00.0 Off |                    0 |
| N/A   54C    P0   102W / 250W |   2177MiB / 12198MiB |     80%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:DB:00.0 Off |                    0 |
| N/A   66C    P0   187W / 250W |   9733MiB / 16280MiB |     99%      Default |
|       

In [7]:
# Core
import random

# Basics
import numpy as np
import pandas as pd
import torch

# Metrics
from sklearn.metrics import *

# Tokeniser
from transformers import XLMRobertaTokenizer

# Utility
from tqdm import tqdm

# Dataloader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Scheduler
from transformers import get_linear_schedule_with_warmup

# Optimiser
from transformers import AdamW

# Model

import torch.nn as nn
from models import weighted_Roberta


class XLM_Roberta:
    def __init__(self,args):
        # fix the random
        random.seed(args['seed_val'])
        np.random.seed(args['seed_val'])
        torch.manual_seed(args['seed_val'])
        torch.cuda.manual_seed_all(args['seed_val'])
        
        # set device
        self.device = torch.device(args['device'])

        self.weights=args['weights']
        
        # initiliase tokeniser
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case = True)

        self.model_save_path = args['model_save_path']
        self.name = args['name']
        
    ##-----------------------------------------------------------##
    ##----------------- Utility Functions -----------------------##
    ##-----------------------------------------------------------##
    def encode(self,data,max_len):
        input_ids = []
        attention_masks = []
        for sent in tqdm(data):
            # use in-built tokeniser of Bert
            encoded_dict = self.tokenizer.encode_plus(
                            sent,
                            add_special_tokens =True, # for [CLS] and [SEP]
                            max_length = max_len,
                            truncation = True,
                            padding = 'max_length',
                            return_attention_mask = True,
#                             return_tensors = 'pt', # return pytorch tensors
            )
            input_ids.append(encoded_dict['input_ids'])
            # attention masks notify where padding has been added 
            # and where is the sentence
            attention_masks.append(encoded_dict['attention_mask'])
            X_data = torch.tensor(input_ids)
            attention_masks_data = torch.tensor(attention_masks)
            
        return [X_data,attention_masks_data]
    
    ##-----------------------------------------------------------##
    ##------------------ Dataloader -----------------------------##
    ##-----------------------------------------------------------##
    def get_dataloader(self,samples, batch_size,is_train=False):
        inputs,masks,labels = samples

        # Convert the lists into tensors.
#         inputs = torch.cat(inputs, dim=0)
#         masks = torch.cat(masks, dim=0)
        labels = torch.tensor(labels)

        # convert to dataset
        data = TensorDataset(inputs,masks,labels)

        if(is_train==False):
            # use random sampler for training to shuffle
            # train data
            sampler = SequentialSampler(data)
        else:
            # order does not matter for validation as we just 
            # need the metrics
            sampler = RandomSampler(data)  

        dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size,drop_last=True)

        return dataloader
    
    ##-----------------------------------------------------------##
    ##----------------- Training Utilities ----------------------##
    ##-----------------------------------------------------------## 
    def get_optimiser(self,learning_rate,model):
        # using AdamW optimiser from transformers library
        return AdamW(model.parameters(),
                  lr = learning_rate, 
                  eps = 1e-8
                )
    
    def get_scheduler(self,epochs,optimiser,train_dl):
        total_steps = len(train_dl) * epochs
        return get_linear_schedule_with_warmup(optimiser, 
                num_warmup_steps = 0, 
                num_training_steps = total_steps)
    
    def evalMetric(self, y_true, y_pred, prefix):
        # calculate all the metrics and add prefix to them
        # before saving in dictionary
        accuracy = accuracy_score(y_true, y_pred)
        mf1Score = f1_score(y_true, y_pred, average='macro')
        f1Score = f1_score(y_true, y_pred)
        area_under_c = roc_auc_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)

        nonhate_f1Score = f1_score(y_true, y_pred, pos_label=0)
        non_recallScore = recall_score(y_true, y_pred, pos_label=0)
        non_precisionScore = precision_score(y_true, y_pred, pos_label=0)
        return {prefix+"accuracy": accuracy, prefix+'mF1Score': mf1Score, 
            prefix+'f1Score': f1Score, prefix+'auc': area_under_c,
            prefix+'precision': precisionScore, 
            prefix+'recall': recallScore, 
            prefix+'non_hatef1Score': nonhate_f1Score, 
            prefix+'non_recallScore': non_recallScore, 
            prefix+'non_precisionScore': non_precisionScore}
    
    ##-----------------------------------------------------------##
    ##---------------- Different Train Loops --------------------##
    ##-----------------------------------------------------------## 
    def evaluate(self,model,loader,which):
        # to evaluate model on test and validation set

        model.eval() # put model in eval mode

        # maintain total loss to save in metrics
        total_eval_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(loader):
            # separate input, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            with torch.no_grad(): # do not construct compute graph
                outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
            # output is always a tuple, thus we have to 
            # separate it manually
            #loss = outputs[0]
            logits = outputs[0]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float).to(self.device))
            
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))

            # add the current loss
            # loss.item() extracts loss value as a float
            total_eval_loss += loss.item()

            # calculate true labels and convert it into numpy array
            b_y_true = b_labels.cpu().data.squeeze().numpy()
            
            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

        # calculate metrics
        metrics = self.evalMetric(y_true,y_pred,which+"_")

        # Calculate the average loss over all of the batches.
        avg_loss = total_eval_loss / len(loader)
        # add it to the metric
        metrics[which+'_avg_loss'] = avg_loss

        return metrics
    
    
    def run_train_loop(self,model,train_loader,optimiser,scheduler):

        model.train() # put model in train mode

        # maintain total loss to add to metric
        total_loss = 0

        # maintain predictions for each batch and calculate metrics
        # at the end of the epoch
        y_pred = np.zeros(shape=(0),dtype='int')
        y_true = np.empty(shape=(0),dtype='int')

        for batch in tqdm(train_loader):
            # separate inputs, labels and attention mask
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)

            # Ref: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch#:~:text=In%20PyTorch%20%2C%20we%20need%20to,backward()%20call.
            model.zero_grad()                

            outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

            # outputs is always returned as tuple
            # Separate it manually
            logits = outputs[0]

            # define new loss function so that we can include
            # weights
            loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
                        self.weights,dtype=torch.float).to(self.device))
            
            loss = loss_fct(logits.view(-1, 2), b_labels.view(-1))
            
            # calculate current loss
            # loss.item() extracts loss value as a float
            total_loss += loss.item()

            # Back-propagation
            loss.backward()

            # calculate true labels
            b_y_true = b_labels.cpu().data.squeeze().numpy()

            # calculate predicted labels by taking max of 
            # prediction scores
            b_y_pred = torch.max(logits,1)[1]
            b_y_pred = b_y_pred.cpu().data.squeeze().numpy()

            y_pred = np.concatenate((y_pred,b_y_pred))
            y_true = np.concatenate((y_true,b_y_true))

            # clip gradient to prevent exploding gradient
            # problems
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # gradient descent
            optimiser.step()
            
            # schedule learning rate accordingly
            scheduler.step()

        # calculate avg loss 
        avg_train_loss = total_loss / len(train_loader)

        # calculate metrics
        train_metrics = self.evalMetric(y_true,y_pred,"Train_")
        
        # print results
        print('avg_train_loss',avg_train_loss)
        print('train_f1Score',train_metrics['Train_f1Score'])
        print('train_accuracy',train_metrics['Train_accuracy'])

        # add loss to metrics
        train_metrics['Train_avg_loss'] = avg_train_loss

        return train_metrics
    
    
    ##------------------------------------------------------------##
    ##----------------- Main Train Loop --------------------------##
    ##------------------------------------------------------------##
    def train(self,model,data_loaders,optimiser,scheduler,epochs,save_model):
        # save train stats per epoch
        train_stats = []
        train_loader,val_loader,test_loader = data_loaders
        # maintain best mF1 Score to save best model
        best_mf1Score=-1.0
        for epoch_i in range(0, epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            
            print("")
            print('Training...')
            # run trian loop
            train_metrics = self.run_train_loop(model,train_loader,
                                            optimiser,scheduler)

            print("")
            print("Running Validation...") 
            # test on validation set
            val_metrics = self.evaluate(model,val_loader,"Val")
            
            print("Validation Loss: ",val_metrics['Val_avg_loss'])
            print("Validation Accuracy: ",val_metrics['Val_accuracy'])
            
            stats = {}

            # save model where validation mF1Score is best
            if(val_metrics['Val_mF1Score']>best_mf1Score):
                best_mf1Score=val_metrics['Val_mF1Score']
                if(save_model):
                    torch.save(model.state_dict(), self.model_save_path+
                        '/best_bert_'+self.name+'.pt')
                # evaluate best model on test set
                test_metrics = self.evaluate(model,test_loader,"Test")

            stats['epoch']=epoch_i+1

            # add train and val metrics of the epoch to 
            # same dictionary
            stats.update(train_metrics)
            stats.update(val_metrics)

            train_stats.append(stats)

        return train_stats,test_metrics
    
    ##-----------------------------------------------------------##
    ##----------------------- Main Pipeline ---------------------##
    ##-----------------------------------------------------------##
    def run(self,args,df_train,df_val,df_test):
        # get X and Y data points 
        X_train = df_train['Text'].values
        Y_train = df_train['Label'].values
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values
        X_val = df_val['Text'].values
        Y_val = df_val['Label'].values
        
        # encode data
        # returns list of data and attention masks
        train_data = self.encode(X_train,args['max_len'])
        val_data = self.encode(X_val,args['max_len'])
        test_data = self.encode(X_test,args['max_len'])
        
        # add labels to data so that we can send them to
        # dataloader function together
        train_data.append(Y_train)
        val_data.append(Y_val)
        test_data.append(Y_test)
        
        # convert to dataloader
        train_dl =self.get_dataloader(train_data,args['batch_size'],True)
        val_dl =self.get_dataloader(val_data,args['batch_size'])                          
        test_dl =self.get_dataloader(test_data,args['batch_size'])
        
        # intialise model
        model = weighted_Roberta.from_pretrained(
            'xlm-roberta-base', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            params=args['params'],
        )
        model.to(self.device)
        
        optimiser = self.get_optimiser(args['learning_rate'],model)
        
        scheduler = self.get_scheduler(args['epochs'],optimiser,train_dl)
        
        # Run train loop and evaluate on validation data set
        # on each epoch. Store best model from all epochs 
        # (best mF1 Score on Val set) and evaluate it on
        # test set
        train_stats,train_metrics = self.train(model,[train_dl,val_dl,test_dl],
                                optimiser,scheduler,args['epochs'],args['save_model'])
        
        return train_stats,train_metrics
        
    ##-----------------------------------------------------------##
    ##-------------------- Other Utilities ----------------------##
    ##-----------------------------------------------------------##
    def run_test(self,model,df_test,args):
        # to evaluate test set on the final saved model
        # to retrieve results if necessary
        X_test = df_test['Text'].values
        Y_test = df_test['Label'].values

        test_data = self.encode(X_test,args['max_len'])

        test_data.append(Y_test)

        test_dl =self.get_dataloader(test_data,32)

        metrics = self.evaluate(model,test_dl,"Test")

        return metrics
    
    def load_model(self,path,args):
        # load saved best model
        saved_model = weighted_Roberta.from_pretrained(
            'xlm-roberta-base', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification             # You can increase this for multi-class tasks.   
            params=args['params'],
        )
        
        saved_model.load_state_dict(torch.load(path))
        
        return saved_model

In [8]:
def preprocess(df,isArabic):
    
    X = df['Text']
    X_new=[]
    if(isArabic):
        prep = ArabertPreprocessor('bert-base-arabertv02')
        for text in tqdm(X):
            text = prep.preprocess(text)
            X_new.append(text)
    else:
        processer = Data_Preprocessing()
        for text in tqdm(X):
            text= processer.removeEmojis(text)
            text = processer.removeUrls(text)
            text=processer.removeSpecialChar(text)
            X_new.append(text)

    df['Text']=X_new
    return df 

In [9]:
def load_dataset(args,data_path,index):
    # read dataframes
    df_test = pd.read_csv(data_path+'test_'+str(index)+'.csv')

    # clean data
    df_test=preprocess(df_test,args['isArabic'])

    return df_test

In [10]:
def one_shot_output(model_path,data_path,obj,args):
    saved_model=obj.load_model(model_path,args)
    device = torch.device(args['device'])
    saved_model=saved_model.to(device)
    
    all_metrics=[]
    avg_metrics={}
    
    # preprocessing
    for fold in [1,2,3,4,5]:
        df = load_dataset(args,data_path,fold)

        metrics = obj.run_test(saved_model,df,args)
        
        for key,value in metrics.items():
            if(key not in avg_metrics):
                avg_metrics[key]=value
            else:
                avg_metrics[key]+=value
        
        all_metrics.append(metrics)
    
    for key,value in avg_metrics.items():
        avg_metrics[key]/=5
    
    return avg_metrics,all_metrics

In [11]:
def getOneShotOutput(model_path,data_path):
    model_args={
        'seed_val': 42,
        'name': 'xlm_roberta',
        'batch_size': 8,
        'bert_model': "xlm-roberta-base",
        'learning_rate': 2e-5,
        'epochs': 10,
        'max_len': 128,
        'device': 'cuda',
        'weights': [1.0, 1.0],
        'save_model': False,
        'model_save_path': '',
        'isArabic': False,
        'model_path': "",
        'max_length':128,
        'is_train':True,
        'epsilon':1e-8,
        'random_seed':30,
        'to_save':True,
        'frac':0.8,
        'params':{
            'max_length':128,
            'path_files': 'xlm-roberta-base',
            'what_bert':'weighted',
            'batch_size':8,
            'is_train':True,
            'learning_rate':2e-5,
            'epsilon':1e-8,
            'random_seed':30,
            'epochs':10,
            'to_save':True,
            'weights':[1.0,1.0],
            'frac':0.8
        }
    }
    
    if('Let-Mi' in data_path):
        model_args['isArabic']=True

    model = XLM_Roberta(model_args)

    avg_metrics,all_metrics = one_shot_output(model_path,data_path,model,model_args)
    
    return avg_metrics,all_metrics

In [12]:
datasets=['Let-Mi','AMI-2020','AMI-Spanish','Shared_Task_eng',
          'Shared_Task_hin','Shared_Task_iben']

In [18]:
paths={
    'Let-Mi':{
        'model_path':'Saved_Models/Let-Mi/best_bert_xlm_roberta_3_all.pt',
        'data_path':'Data_Processed/Let-Mi/',
    },
    'AMI-2020':{
        'model_path': 'Saved_Models/AMI-2020/best_bert_xlm_roberta_2_all.pt',
        'data_path': 'Data_Processed/AMI-2020/',
    },
    'AMI-Spanish':{
        'model_path': 'Saved_Models/AMI-Spanish/best_bert_xlm_roberta_1_all.pt',
        'data_path': 'Data_Processed/AMI-Spanish/',
    },
    'Shared_Task_eng':{
        'model_path': 'Saved_Models/Shared_Task_eng/best_bert_xlm_roberta_4_all.pt',
        'data_path': 'Data_Processed/Shared_Task_eng/',
    },
    'Shared_Task_iben':{
        'model_path': 'Saved_Models/Shared_Task_iben/best_bert_xlm_roberta_4_all.pt',
        'data_path': 'Data_Processed/Shared_Task_iben/',
    },
    'Shared_Task_hin':{
        'model_path': 'Saved_Models/Shared_Task_hin/best_bert_xlm_roberta_1_all.pt',
        'data_path': 'Data_Processed/Shared_Task_hin/',
    },
}

In [10]:
res=[]

In [11]:
for dataset1 in datasets:
    for dataset2 in datasets:
        if(dataset1!=dataset2):
            name = dataset1+'_'+dataset2
            avg_metrics,all_metrics = getOneShotOutput(paths[dataset1]['model_path'],
                                      paths[dataset2]['data_path'])
            metrics['Name']=dataset1+'_'+dataset2
            res.append(metrics)
            df = pd.DataFrame(res)
            df.to_csv('Results_Processed/one_shot_matrix_xlm.csv')

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1236/1236 [00:00<00:00, 2478.67it/s]
100%|██████████| 1236/1236 [00:07<00:00, 163.17it/s]
100%|██████████| 38/38 [00:06<00:00,  5.86it/s]
100%|██████████| 1236/1236 [00:00<00:00, 2373.72it/s]
100%|██████████| 1236/1236 [00:07<00:00, 167.50it/s]
100%|██████████| 38/38 [00:06<00:00,  6.01it/s]
100%|██████████| 1236/1236 [00:00<00:00, 2513.72it/s]
100%|██████████| 1236/1236 [00:07<00:00, 163.70it/s]
100%|██████████| 38/38 [00:06<00:00,  6.10it/s]
100%|██████████| 1236/1236 [00:00<00:00, 2658.37it/s]
100%|██████████| 1236/1236 [00:07<00:00, 166.63it/s]
100%|██████████| 38/38 [00:06<00:00,  6.04it/s]
100%|██████████| 1237/1237 [00:00<00:00, 2488.02it/s]
100%|███████

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

100%|██████████| 1047/1047 [00:00<00:00, 8937.65it/s]
100%|██████████| 1047/1047 [00:05<00:00, 189.09it/s]
100%|██████████| 32/32 [00:05<00:00,  6.12it/s]
100%|██████████| 1047/1047 [00:00<00:00, 10307.45it/s]
100%|██████████| 1047/1047 [00:05<00:00, 196.22it/s]
100%|██████████| 32/32 [00:05<00:00,  5.92it/s]
100%|██████████| 1047/1047 [00:00<00:00, 10491.02it/s]
100%|██████████| 1047/1047 [00:05<00:00, 195.92it/s]
100%|██████████| 32/32 [00:05<00:00,  5.99it/s]
100%|██████████| 1047/1047 [00:00<00:00, 10540.07it/s]
100%|██████████| 1047/1047 [00:05<00:00, 194.73it/s]
100%|██████████| 32/32 [00:05<00:00,  5.87it/s]
100%|██████████| 1052/1052 [00:00<00:00, 10290.63it/s]
100%|██████████| 1052/1052 [00:05<00:00, 192.20it/s]
100%|██████████| 32/32 [00:05<00:00,  5.89it/s]
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were 

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

100%|██████████| 1047/1047 [00:00<00:00, 10210.15it/s]
100%|██████████| 1047/1047 [00:05<00:00, 176.43it/s]
100%|██████████| 32/32 [00:05<00:00,  5.96it/s]
100%|██████████| 1047/1047 [00:00<00:00, 10239.57it/s]
100%|██████████| 1047/1047 [00:05<00:00, 196.93it/s]
100%|██████████| 32/32 [00:05<00:00,  6.07it/s]
100%|██████████| 1047/1047 [00:00<00:00, 10512.90it/s]
100%|██████████| 1047/1047 [00:05<00:00, 196.18it/s]
100%|██████████| 32/32 [00:05<00:00,  5.97it/s]
100%|██████████| 1047/1047 [00:00<00:00, 10948.26it/s]
100%|██████████| 1047/1047 [00:05<00:00, 195.25it/s]
100%|██████████| 32/32 [00:05<00:00,  5.99it/s]
100%|██████████| 1052/1052 [00:00<00:00, 10796.49it/s]
100%|██████████| 1052/1052 [00:05<00:00, 198.53it/s]
100%|██████████| 32/32 [00:05<00:00,  5.90it/s]
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

100%|██████████| 660/660 [00:00<00:00, 2268.84it/s]
100%|██████████| 660/660 [00:02<00:00, 307.76it/s] 
100%|██████████| 20/20 [00:03<00:00,  5.84it/s]
100%|██████████| 660/660 [00:00<00:00, 2328.74it/s]
100%|██████████| 660/660 [00:02<00:00, 311.94it/s] 
100%|██████████| 20/20 [00:03<00:00,  5.90it/s]
100%|██████████| 660/660 [00:00<00:00, 2251.50it/s]
100%|██████████| 660/660 [00:02<00:00, 307.68it/s] 
100%|██████████| 20/20 [00:03<00:00,  6.02it/s]
100%|██████████| 660/660 [00:00<00:00, 2162.65it/s]
100%|██████████| 660/660 [00:02<00:00, 303.09it/s] 
100%|██████████| 20/20 [00:03<00:00,  5.90it/s]
100%|██████████| 667/667 [00:00<00:00, 2125.11it/s]
100%|██████████| 667/667 [00:02<00:00, 291.19it/s] 
100%|██████████| 20/20 [00:03<00:00,  6.14it/s]
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initi

In [19]:
avg_metrics,all_metrics = getOneShotOutput(paths['Shared_Task_iben']['model_path'],
                                      paths['Shared_Task_hin']['data_path'])

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing weighted_Roberta: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing weighted_Roberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing weighted_Roberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of weighted_Roberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initiali

In [20]:
all_metrics

[{'Test_accuracy': 0.6973684210526315,
  'Test_mF1Score': 0.5339909357504665,
  'Test_f1Score': 0.25806451612903225,
  'Test_auc': 0.5328978617665647,
  'Test_precision': 0.28193832599118945,
  'Test_recall': 0.2379182156133829,
  'Test_non_hatef1Score': 0.8099173553719008,
  'Test_non_recallScore': 0.8278775079197466,
  'Test_non_precisionScore': 0.7927199191102123,
  'Test_avg_loss': 0.6579169536891737},
 {'Test_accuracy': 0.6743421052631579,
  'Test_mF1Score': 0.49396162172386976,
  'Test_f1Score': 0.19183673469387755,
  'Test_auc': 0.4960636632727386,
  'Test_precision': 0.2175925925925926,
  'Test_recall': 0.17153284671532848,
  'Test_non_hatef1Score': 0.796086508753862,
  'Test_non_recallScore': 0.8205944798301487,
  'Test_non_precisionScore': 0.773,
  'Test_avg_loss': 0.6599660327560023},
 {'Test_accuracy': 0.6949013157894737,
  'Test_mF1Score': 0.5131312411842345,
  'Test_f1Score': 0.2156448202959831,
  'Test_auc': 0.5143008474576272,
  'Test_precision': 0.2537313432835821,
  '