In [1]:
''' Decaying learning rate model with bert-base-uncased and accelerator amd docstride'''
import json
def read_squad(path):
    # open JSON file and load intro dictionary
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []
    # iterate through all data in squad data
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # check if we need to be extracting from 'answers' or 'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # return formatted data lists
    return contexts, questions, answers

# execute our read SQuAD function for training and validation sets
train_contexts, train_questions, train_answers = read_squad('SpokenData/spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = read_squad('SpokenData/spoken_test-v1.1.json')

In [2]:
#from transformers import BertTokenizer

# Load fast tokenizer
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

In [3]:
#print(type(tokenizer))


In [4]:
from transformers import BertTokenizerFast, BertForQuestionAnswering

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", use_fast=True)
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

In [5]:
'''def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
            
# and apply the function to our two answer lists
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)
'''

"def add_end_idx(answers, contexts):\n    # loop through each answer-context pair\n    for answer, context in zip(answers, contexts):\n        # gold_text refers to the answer we are expecting to find in context\n        gold_text = answer['text']\n        # we already know the start index\n        start_idx = answer['answer_start']\n        # and ideally this would be the end index...\n        end_idx = start_idx + len(gold_text)\n\n        # ...however, sometimes squad answers are off by a character or two\n        if context[start_idx:end_idx] == gold_text:\n            # if the answer is not off :)\n            answer['answer_end'] = end_idx\n        else:\n            # this means the answer is off by 1-2 tokens\n            for n in [1, 2]:\n                if context[start_idx-n:end_idx-n] == gold_text:\n                    answer['answer_start'] = start_idx - n\n                    answer['answer_end'] = end_idx - n\n            \n# and apply the function to our two answer list

In [6]:
import re

def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Check if exact match found at initial start and end
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            # Search for exact match within a window of characters around start_idx
            found = False
            for offset in range(-5, 6):  # Check a window of offsets [-5, +5]
                new_start = start_idx + offset
                new_end = new_start + len(gold_text)

                # Ensure indices are within context bounds
                if 0 <= new_start < len(context) and context[new_start:new_end] == gold_text:
                    answer['answer_start'] = new_start
                    answer['answer_end'] = new_end
                    found = True
                    break

            # If no exact match, use regex as a fallback to find the gold_text span
            if not found:
                match = re.search(re.escape(gold_text), context)
                if match:
                    answer['answer_start'] = match.start()
                    answer['answer_end'] = match.end()

# Apply the function to your answer and context lists
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)


In [7]:
import transformers
from transformers import AutoTokenizer

max_length=512


train_encodings = tokenizer(train_contexts, train_questions, truncation=True, max_length=512,padding=True,
                            stride=102)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True,
                          max_length=512,stride=102)

In [8]:
#train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
#val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [9]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        
  
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

       
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
     
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
   
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [10]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training and validation sets
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [11]:
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer, IBertModel
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

#model = AutoModelForQuestionAnswering.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
#model = RobertaForQuestionAnswering.from_pretrained("roberta-base")
#model = AutoModelForQuestionAnswering.from_pretrained("google/electra-base-discriminator")
#model = IBertModel.from_pretrained("kssteven/ibert-roberta-base")

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [13]:
from torch.utils.data import DataLoader
from transformers import AdamW , get_linear_schedule_with_warmup
from tqdm import tqdm
from accelerate import Accelerator
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

learning_rate=2e-6
optim = AdamW(model.parameters(), lr=learning_rate)

total_step=900
for i in range(total_step):
    optim.param_groups[0]["lr"] -= learning_rate/total_step
# initialize data loader for training data

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# initialize scheduler
num_training_steps = len(train_loader) * 30
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=0, num_training_steps=num_training_steps)
accelerator = Accelerator()
model, optimizer, training_dataloader, scheduler = accelerator.prepare(model, optim, train_loader, scheduler)


for epoch in range(7):
    model.train()
    
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
       
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
       
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
       
        loss = outputs[0]
        
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item(),lr=optim.param_groups[0]['lr'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 0: 100%|██████████| 1160/1160 [17:51<00:00,  1.08it/s, loss=6.2, lr=2.51e-20] 
Epoch 1: 100%|██████████| 1160/1160 [17:51<00:00,  1.08it/s, loss=6.24, lr=2.51e-20]
Epoch 2: 100%|██████████| 1160/1160 [17:54<00:00,  1.08it/s, loss=6.22, lr=2.51e-20]
Epoch 3: 100%|██████████| 1160/1160 [17:52<00:00,  1.08it/s, loss=6.28, lr=2.51e-20]
Epoch 4: 100%|██████████| 1160/1160 [17:51<00:00,  1.08it/s, loss=6.2, lr=2.51e-20] 
Epoch 5: 100%|██████████| 1160/1160 [17:50<00:00,  1.08it/s, loss=6.22, lr=2.51e-20]
Epoch 6: 100%|██████████| 1160/1160 [17:50<00:00,  1.08it/s, loss=6.28, lr=2.51e-20]


In [14]:
model.eval()
val_loader = DataLoader(val_dataset, batch_size=32)

acc = []

# initialize loop for progress bar
loop = tqdm(val_loader)
# loop through batches
answers = []
references = []
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            answer = ' '.join(all_tokens[start_pred[i] : end_pred[i]+1])
            ref = ' '.join(all_tokens[start_true[i] : end_true[i]+1])
            ans_ids = tokenizer.convert_tokens_to_ids(answer.split())
            answer = tokenizer.decode(ans_ids)
            answers.append(answer)
            references.append(ref)

100%|██████████| 497/497 [02:43<00:00,  3.05it/s]


In [26]:
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import sys


def normalize_answer(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    if len(scores_for_ground_truths)==0: return 0
    return max(scores_for_ground_truths)

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
          f1_score, prediction, [ground_truths])
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return  f1

val=evaluate(references,answers)

     


In [27]:
print(val)

14.780107125767412
