In [1]:
import os
import shutil
from collections import Counter
import numpy as np
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, ElectraForQuestionAnswering, DataCollatorWithPadding,BertModel, ElectraForSequenceClassification, ElectraModel
from Preprocess.arabertpreprocess import ArabertPreprocessor
import matplotlib.pyplot as plt
import seaborn as sns
import csv
torch.manual_seed(3407)

<torch._C.Generator at 0x7ff3556e6290>

## Preprocessing

In [2]:
def add_end_index(answer, context):
  ## 1 if span match the context 0 otherwise
  text = answer['text']
  start_idx = answer['answer_start']
  end_idx = start_idx + len(text)
  answer['answer_end'] = end_idx
  if text == context[start_idx:end_idx]:
    answer['answer_end'] = end_idx
    return False
  for i in range(1,3):
    if text == context[start_idx-i:end_idx-i]:
      answer['answer_end']= end_idx-1
      answer['answer_start'] = start_idx-1
      return False
  return True

In [3]:
def arabert_preprocess(context,question, answer, arabert_prep):
    answer['text'] = arabert_prep.preprocess(answer['text'])
    context = arabert_prep.preprocess(context)
    question = arabert_prep.preprocess(question)
    res = context.find(answer['text'])
    if res !=-1:
        answer['answer_start'] = res
    return context, question, answer, res

In [4]:
def Read_AAQAD(path,arabert_prep):
  contexts =[]
  answers =[]
  questions =[]
  IDs= []
  plausible = []
  with open(path) as f:
    aaqad_dict = json.load(f)
    for article in aaqad_dict['data']:
      for passage in article['paragraphs']:
        context = passage['context']
        for qa in passage['qas']:
          question = qa['question']
          if 'plausible_answers' in qa.keys():# there is two cases if the question have no answer then use plausible answer
            access = 'plausible_answers'
            plausible.append(True)
          else:
            access = 'answers'
            plausible.append(False)
          for answer in qa[access]:
            contexts.append(context)
            answers.append(answer)
            questions.append(question)
            IDs.append(int(qa['id']))
  return contexts,questions,answers,plausible,IDs

In [5]:
def fix_ids_span(path):
  #IDs need to be fixed for evaluating purposes
    a_file = open(path, "r")
    json_object = json.load(a_file)
    a_file.close()
    newIDCnt = 0
    for article in json_object['data']:
      for passage in article['paragraphs']:
        context = passage['context']
        for qa in passage['qas']:
            qa['id'] = str(newIDCnt)
            newIDCnt = newIDCnt +1
            if 'plausible_answers' in qa.keys():
              qa['plausible_answers'][0]['answer_start'] = 0
    a_file = open(path, "w")
    json.dump(json_object, a_file)
    a_file.close()

In [6]:
model_name = "araelectra-base-discriminator"
arabert_prep = ArabertPreprocessor(model_name=model_name)
fix_ids_span('Data/asquadv2-train.json')
fix_ids_span('Data/asquadv2-val.json')
fix_ids_span('Data/asquadv2-test.json')
aqad_train_contexts, aqad_train_questions, aqad_train_answers,aqad_train_plausible, aqad_train_ids = Read_AAQAD('Data/asquadv2-train.json', arabert_prep)
aqad_val_contexts, aqad_val_questions, aqad_val_answers,aqad_val_plausible, aqad_val_ids = Read_AAQAD('Data/asquadv2-val.json', arabert_prep)
aqad_test_contexts, aqad_test_questions, aqad_test_answers,aqad_test_plausible, aqad_test_ids = Read_AAQAD('Data/asquadv2-test.json', arabert_prep)


In [11]:
for i in range(len(aqad_train_contexts)):
    if aqad_train_answers[i]['answer_start']==-2:
        print(True, aqad_train_plausible[i])
    else:
        print(False,aqad_train_plausible[i])

False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True
False True

In [8]:
def get_answered_feat(contexts, questions, answers, plausible):
    new_contexts, new_questions, new_answers = [], [], []
    for i in range(len(answers)):
        if plausible[i] == False:
            new_contexts.append(contexts[i])
            new_questions.append(questions[i])
            new_answers.append(answers[i])
    return new_contexts, new_questions, new_answers
span_train_contexts, span_train_questions, span_train_answers = get_answered_feat(aqad_train_contexts, aqad_train_questions, aqad_train_answers, aqad_train_plausible)   

In [9]:
print(len(span_train_contexts), len(span_train_questions))

34798 34798


## Tokenization

In [10]:
#Creating the tokenizer
model_name = model_name = "aubmindlab/araelectra-base-discriminator"
araelectra_tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False)

In [11]:
train_encodings = araelectra_tokenizer(aqad_train_questions, aqad_train_contexts, truncation=True)
span_train_encodings = araelectra_tokenizer(span_train_questions, span_train_contexts, truncation=True)
val_encodings = araelectra_tokenizer(aqad_val_questions, aqad_val_contexts, truncation=True)
test_encodings = araelectra_tokenizer(aqad_test_questions, aqad_test_contexts,truncation= True)

In [12]:
def index_to_token_position(encodings , answers):
  start_positions = list()
  end_positions = list()
  for i in range(len(answers)):
    start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'], 1))
    end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'], 1))
    #if context truncated
    if start_positions[-1] is None: 
      start_positions[-1] = araelectra_tokenizer.model_max_length
    #if end index is space
    itt = 1
    while end_positions[-1] is None: 
      end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-itt, 1)
      itt = itt + 1 
  encodings.update({'start_positions': torch.tensor(start_positions), 'end_positions': torch.tensor(end_positions)})
  encodings['start_positions'] = encodings['start_positions'].view(len(answers), 1)
  encodings['end_positions'] = encodings['end_positions'].view(len(answers), 1)

In [14]:
for i in range(len(span_train_answers)):
    print(span_train_answers[i])

{'text': 'إسبانيا', 'answer_start': -2}
{'text': 'الحكومة الماركسية اللينينية', 'answer_start': -2}
{'text': 'كوبا', 'answer_start': -2}
{'text': '1959', 'answer_start': -2}
{'text': 'العقود القانونية والزيجات والأحكام الجنائية', 'answer_start': -2}
{'text': 'العلاقة بين الدول', 'answer_start': -2}
{'text': 'الدفاع العسكري والمدني', 'answer_start': -2}
{'text': 'القدرة على قبول دول جديدة في الاتحاد', 'answer_start': -2}
{'text': 'البند الإقليمي من الدستور الأمريكي', 'answer_start': -2}
{'text': 'قانون بورتوريكو للديمقراطية ( H . R . 2499 )', 'answer_start': -2}
{'text': 'كومنولث بويرتو', 'answer_start': -2}
{'text': 'ألفريد دي غراتسيا', 'answer_start': -2}
{'text': 'التمويل السنوي والدعم الدفاعي الذي تتلقاه من الولايات المتحدة', 'answer_start': -2}
{'text': '2003', 'answer_start': -2}
{'text': 'نجمتان من أجل السلام', 'answer_start': -2}
{'text': 'التدخل الأمريكي الإيجابي في الحرب العالمية الأولى', 'answer_start': -2}
{'text': 'جيمس بافيت', 'answer_start': -2}
{'text': 'الحرب الباردة', 

In [13]:
index_to_token_position(span_train_encodings, span_train_answers)
index_to_token_position(val_encodings, aqad_val_answers)
index_to_token_position(test_encodings, aqad_test_answers)

OverflowError: can't convert negative int to unsigned

In [None]:
def add_weights_labels_tensors(encodings, plausible):
  plausible = torch.tensor(plausible)
  weights = torch.zeros(plausible.shape)
  no_ans = torch.ones(plausible.shape)
  weights[plausible==False]=1.0
  no_ans[plausible==False]=0.0
  weights = weights.view(-1,1)
  no_ans = no_ans.view(-1,1)
  encodings.update({'weights':weights, 'no_ans':no_ans})

In [None]:
add_weights_labels_tensors(train_encodings, aqad_train_plausible)
add_weights_labels_tensors(val_encodings, aqad_val_plausible)
add_weights_labels_tensors(test_encodings, aqad_test_plausible)

In [None]:
val_encodings['IDs'] = aqad_val_ids
test_encodings['IDs'] = aqad_test_ids

In [None]:
print(train_encodings.keys())
print(val_encodings.keys())
print(test_encodings.keys())
print(span_train_encodings.keys())

## Dataset and DataLoader

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
class AqadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

cls_train_dataset = AqadDataset(train_encodings)
span_train_dataset = AqadDataset(span_train_encodings)
val_dataset = AqadDataset(val_encodings)
test_dataset = AqadDataset(test_encodings)

In [None]:
data_collator = DataCollatorWithPadding(araelectra_tokenizer)

In [None]:
cls_train_loader = DataLoader(cls_train_dataset, batch_size=8, shuffle= True, collate_fn= data_collator)
span_train_loader = DataLoader(span_train_dataset, batch_size=8, shuffle= True, collate_fn = data_collator)
val_loader = DataLoader(val_dataset, batch_size = 8, shuffle = True, collate_fn = data_collator)
test_loader = DataLoader(test_dataset, batch_size = 8, shuffle = True, collate_fn = data_collator)


## Checkpoints

In [None]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best checkpoint
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to saved checkpoint
    model: model to load checkpoint parameters into       
    optimizer: optimizer defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    results = checkpoint['result_dict']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], results

In [None]:
def order_exp(base_path, exp_name):
  exp_path = os.path.join(base_path, exp_name)
  if not os.path.exists(exp_path):
    os.mkdir(exp_path)
  curr_ckp_path = os.path.join(exp_path,'curr.pt')
  best_ckp_path = os.path.join(exp_path, 'best.pt')
  return curr_ckp_path, best_ckp_path, exp_path

## Classification train and evaluation 

In [None]:
def cls_eval(model, data_loader, exp_path, train_loss):
    model.eval()
    total_acc = 0
    for batch in data_loader:
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_no_ans = batch['no_ans'].to(device)
      output = model(tokens, masks, tokens_type)
      pred = output.logits.view(masks.shape[0],2,)
      pred = torch.argmax(pred, dim=1)
      target = batch['no_ans'].to(device).view(masks.shape[0],)
      total_acc += torch.sum(target==pred)
    total_acc = total_acc/ val_dataset.__len__()
    res_dict = {'acc':total_acc.item()*100, 'train_loss':train_loss}
    if exp_path:
        log_path = os.path.join(exp_path,'res.csv')
        if not os.path.exists(log_path):
            with open(log_path,'w') as f:
                writer = csv.DictWriter(f, fieldnames=res_dict.keys())
                writer.writeheader()
        with open(log_path, 'a') as f:
            writer = csv.DictWriter(f, fieldnames=res_dict.keys())
            #writer.writeheader()
            writer.writerow(res_dict)
    return res_dict


In [None]:
def cls_train(model,start_epoch, num_epochs, optimizer,max_acc, train_loader, val_loader, log, exp_name):
  curr_ckp_path, best_ckp_path, exp_path = order_exp('Runs/AraElectraDecoupledAsquadv2/train/cls', exp_name)
  model.train()
  for epoch in range(start_epoch,num_epochs):
    total_loss = 0.0
    loop = tqdm(train_loader, leave=True)
    for batch_idx, batch in enumerate(loop):
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      output = model(tokens, masks, tokens_type)
      pred = output.logits.view(masks.shape[0],2,)
      target = batch['no_ans'].type(torch.LongTensor)
      target = target.to(device)
      loss = cls_criterion(pred, target.view(masks.shape[0],))
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (loss.item() - total_loss)) 
      loop.set_description(f'Epoch {epoch}')
      loop.set_postfix(loss=loss.item())

    result_dict = cls_eval(model, val_loader,exp_path,total_loss )
    checkpoint = {
            'epoch': epoch + 1,
            'result_dict':result_dict,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
    curr_acc = result_dict['acc']
    if curr_acc>=max_acc:
      max_acc = curr_acc
      save_ckp(checkpoint, True, curr_ckp_path, best_ckp_path)
    else:
      save_ckp(checkpoint, False, curr_ckp_path, best_ckp_path)
    print(result_dict)
  return model


## Modeling

In [None]:
Cls_AraElectra = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)
QA_AraElectra = ElectraForQuestionAnswering.from_pretrained(model_name)

In [None]:
def freeze(Electra, count=None):
    if count is not None:
	      # We freeze here the embeddings of the model
        for param in Electra.embeddings.parameters():
            param.requires_grad = False

        if count != -1:
	          # if freeze_layer_count == -1, we only freeze the embedding layer
	          # otherwise we freeze the first `freeze_layer_count` encoder layers
            for layer in Electra.encoder.layer[:count]:
                for param in layer.parameters():
                    param.requires_grad = False
    print(sum(p.numel() for p in Electra.parameters()), sum(p.numel() for p in Electra.parameters() if p.requires_grad))

In [None]:
freeze(Cls_AraElectra.electra,4)
freeze(QA_AraElectra.electra, 4)

## Classification Training

In [None]:
num_epochs = 2
learning_rate = 3e-5
optimizer = torch.optim.Adam(Cls_AraElectra.parameters(), lr=learning_rate)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#criterion_span = nn.CrossEntropyLoss(reduction='none')
cls_criterion = nn.CrossEntropyLoss()
Cls_AraElectra.to(device)

In [None]:
cls_trained_model = cls_train(Cls_AraElectra, 0, 2, optimizer, 0, cls_train_loader, val_loader , True, 'first')

## Load Cls Model if needed

In [None]:
cls_model = ElectraForSequenceClassification.from_pretrained(model_name)

In [None]:
learning_rate = 3e-5
optimizer = torch.optim.AdamW(cls_model.parameters(), lr=learning_rate)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#criterion_span = nn.CrossEntropyLoss(reduction='none')
#cls_criterion = nn.CrossEntropyLoss()
cls_model.to(device)

In [None]:
cls_model, optimizer, start_epoch, result_dict = load_ckp('Runs/AraElectraDecoupledAsquadv2/train/cls/first/best.pt', cls_model, optimizer)


## Span Training

In [None]:
def get_raw_preds(data_loader, model, cls_model): 
  model.eval()
  with torch.no_grad():
    #F1 = EM = Total = 0
    total_loss = 0.0
    total_predictions = dict()
    no_probs_pred = dict()
    soft = torch.nn.Softmax(dim=1)
    #loop = tqdm(data_loader)
    #loop = tqdm(data_loader, leave=True)
    for batch_idx, batch in enumerate(data_loader):
      #moving tensors to gpu    
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_start = batch['start_positions'].to(device)
      gt_end = batch['end_positions'].to(device)
      weights = batch['weights'].view(-1,1).to(device)
      gt_no_ans = batch['no_ans'].to(device)
      IDs = batch['IDs'].to(device)
      outputs = model(tokens, masks, tokens_type, start_positions=gt_start, end_positions=gt_end)
      no_probs = cls_model(tokens, masks, tokens_type)
      no_probs = soft(no_probs.logits)
      #calculating loss
      loss = outputs.loss
      #update average total loss 
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (loss.item() - total_loss)) 
      #calculating f1 score and EM
      curr_batch_size = tokens.shape[0]
      #print(curr_batch_size)
      #print(outputs.start_logits.shape)
      #print(no_probs.logits.shape)
      for i in range(curr_batch_size):
        #print(f"this is tensor index {i}")
        start_pred, end_pred= torch.argmax(outputs.start_logits[i],dim=0), torch.argmax(outputs.end_logits[i],dim =0)
        #print(start_pred.shape, end_pred.shape)
        #print(start_pred)
        #print(start_pred, end_pred)
        total_predictions[str(IDs[i].item())] = araelectra_tokenizer.decode(tokens[i][start_pred.item():end_pred.item()], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        no_probs_pred[str(IDs[i].item())] = no_probs[i][1].item()
    #saving evaluation results
    #evaluation

    model.train()
    return total_predictions, no_probs_pred

In [None]:
def get_preds(total_preds, no_probs_preds,data_path, log_path):
    preds_path = os.path.join(log_path, 'preds')
    if not os.path.exists(preds_path):
        os.mkdir(preds_path)
    no_probs_path = os.path.join(preds_path, 'na_probs.json')
    text_preds_path = os.path.join(preds_path, 'preds.json')
    jsonString = json.dumps(total_preds)
    jsonFile = open(text_preds_path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()
    jsonString = json.dumps(no_probs_preds)
    jsonFile = open(no_probs_path, "w")
    jsonFile.write(jsonString)
    jsonFile.close()
    #!python evaluatev2.py data_path text_preds_path electra --na-prob-file no_probs_path --na-prob-thresh 0.4 --out-file log_path
    #os.system(f"python evaluatev2.py {data_path} {text_preds_path} electra --na-prob-file {no_probs_path} --na-prob-thresh 0.5 --out-file {log_path}")
    !/anaconda/envs/azureml_py38/bin/python3 evaluatev2.py Data/AAQAD-dev.json Runs/AraElectraDecoupledAsquadv2/train/span/first/preds/preds.json electra --na-prob-file Runs/AraElectraDecoupledAsquadv2/train/span/first/preds/na_probs.json --na-prob-thresh 0.5 --out-file Runs/AraElectraDecoupled/train/span/first 
    with open(os.path.join(log_path, 'res.csv')) as f:
        DictReader_obj = csv.DictReader(f)
        lastrow = None
        for item in DictReader_obj:
            lastrow = dict(item)
    #print(lastrow)
    return lastrow


In [None]:
def span_train(model,start_epoch, num_epochs, optimizer,max_compined_metric, train_loader, val_loader, log, exp_name):
  curr_ckp_path, best_ckp_path, exp_path = order_exp('Runs/AraElectraDecoupledAsquadv2/train/span', exp_name)
  model.train()
  for epoch in range(start_epoch,num_epochs):
    total_loss = 0.0
    loop = tqdm(train_loader, leave=True)
    for batch_idx, batch in enumerate(loop):
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_start = batch['start_positions'].to(device)
      gt_end = batch['end_positions'].to(device)
      outputs = model(tokens, masks, tokens_type, start_positions=gt_start, end_positions=gt_end)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (loss.item() - total_loss)) 
      loop.set_description(f'Epoch {epoch}')
      loop.set_postfix(loss=loss.item())

    total_preds, no_probs_preds = get_raw_preds(val_loader, model, cls_model)
    result_dict = get_preds(total_preds, no_probs_preds,'Data/asquadv2-val.json',exp_path )
    checkpoint = {
            'epoch': epoch + 1,
            'result_dict':result_dict,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
    curr_compined_metric = float(result_dict['exact'])+1.5*float(result_dict['f1'])
    if curr_compined_metric>=max_compined_metric:
      max_compined_metric = curr_compined_metric
      save_ckp(checkpoint, True, curr_ckp_path, best_ckp_path)
    else:
      save_ckp(checkpoint, False, curr_ckp_path, best_ckp_path)
    print(result_dict)
  return model


In [None]:
span_num_epochs = 2
span_learning_rate = 3e-5
span_optimizer = torch.optim.AdamW(QA_AraElectra.parameters(), lr=span_learning_rate)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#criterion_span = nn.CrossEntropyLoss(reduction='none')
#cls_criterion = nn.CrossEntropyLoss()
QA_AraElectra.to(device)

In [None]:
span_trained_model = span_train(QA_AraElectra,0, num_epochs, optimizer,0.0, span_train_loader, val_loader, True, 'first')