In [None]:
import requests
import json
import torch
import os
from tqdm import tqdm

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the raw training file once for interactive inspection in the cells below.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform default (which is not UTF-8 on every OS).
with open('spoken_train-v1.1.json', 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [None]:
# Show which fields the first top-level entry of the dataset carries.
squad['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [None]:
# Locate the index of the entry titled 'Greece' and stop at the first hit.
# If no entry matches, `gr` is never assigned and nothing is printed.
for idx, group in enumerate(squad['data']):
    if group['title'] == 'Greece':
        gr = idx
        print(group['title'])
        print(gr)
        break


Greece
202


In [None]:
# Peek at the context of the first paragraph of entry 186 (index is hard-coded).
squad['data'][186]['paragraphs'][0]['context']

'napoleon bonaparte nato mean l l j e n french nap led the napa tea or nepali on deep wanna parte the fifteenth of august seventeen sixty nine to the fifth of may eighteen twenty one was a french military and political leader who rose to prominence during the french revolution and led several successful campaigns during the revolutionary war sir. as napoleon i he was emperor of the french from eighty know for intel eighteen fourteen and again in eighteen fifteen. napoleon dominated european in global affairs for more than a decade while leaving france against a series of coalitions in the napoleonic wars. he won most of these wars and the vast majority of his battles building a large empire that ruled over continental europe before its final collapse in eighteen fifteen. often considered one of the greatest commanders in history his wars in campaigns are studied at military schools worldwide. he also remains one of the most celebrated and controversial political figures in western hist

In [None]:
def read_data(path):
    """Flatten a SQuAD-style JSON file into parallel lists.

    One entry is emitted per (paragraph, question, answer) triple, so a question
    with several answers contributes several entries that share the same
    context and question text.

    Args:
        path: Path to a JSON file with the layout
            data -> paragraphs -> (context, qas -> (question, answers)).

    Returns:
        A tuple ``(contexts, questions, answers)`` of equal-length lists. Each
        answer is a dict with keys ``'text'``, ``'answer_start'`` and
        ``'answer_end'`` (``answer_start + len(text)``, i.e. exclusive).
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in data['data']:
        for paragraph in group['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answer_start = answer['answer_start']
                    # Exclusive character offset of the end of the answer.
                    answer_end = answer_start + len(answer_text)
                    contexts.append(context)
                    questions.append(question)
                    answers.append({
                        'text': answer_text,
                        'answer_start': answer_start,
                        'answer_end': answer_end
                    })

    return contexts, questions, answers


In [None]:
# Flatten the train and test files into parallel context/question/answer lists.
# The test split is used as the validation set in the rest of the notebook.
train_contexts, train_questions, train_answers = read_data('spoken_train-v1.1.json')
valid_contexts, valid_questions, valid_answers = read_data('spoken_test-v1.1.json')
     


In [None]:
# Sanity check: number of flattened training entries and one sample (index 986).
print(f'There are {len(train_questions)} questions')
print(train_questions[986])
print(train_answers[986])

There are 37111 questions
What country borders south Estonia?
{'text': 'latvia', 'answer_start': 262, 'answer_end': 268}


In [None]:
def add_end_idx(answers, contexts):
    """Align answer character spans with their contexts, tolerating small offsets.

    For every (answer, context) pair the answer text is looked up at the
    recorded start offset and, failing that, one and then two characters
    earlier. On the first match ``'answer_end'`` is written (exclusive offset)
    and, when the match was shifted, ``'answer_start'`` is corrected as well.
    Answers that match at none of the three offsets are left untouched.

    Args:
        answers: List of dicts with at least ``'text'`` and ``'answer_start'``;
            modified in place.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        The same ``answers`` list that was passed in.
    """
    for record, passage in zip(answers, contexts):
        target = record['text']
        begin = record['answer_start']
        finish = begin + len(target)

        # Try the recorded position first, then 1 and 2 characters to the left.
        for shift in (0, 1, 2):
            if passage[begin - shift:finish - shift] != target:
                continue
            if shift:
                # The gold label was off by `shift` characters; fix the start too.
                record['answer_start'] = begin - shift
            record['answer_end'] = finish - shift
            break

    return answers


In [None]:
from transformers import BertTokenizerFast

# Fast tokenizer for the uncased BERT base checkpoint; the model that is
# trained later must use the same vocabulary.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Forwarded as `stride` to the tokenizer by tokenize_examples.
# NOTE(review): the tokenizer call does not request overflowing tokens, so this
# value presumably has no effect on the encodings - confirm against the
# tokenizer documentation.
doc_stride = 128  # Set the doc stride value as per your requirements

def tokenize_examples(contexts, questions, max_length):
    """Tokenize (context, question) pairs with the notebook-level tokenizer.

    The context is passed as the first sequence and the question as the second.
    Pairs are truncated to ``max_length`` and padded, and the module-level
    ``doc_stride`` is forwarded as ``stride``.

    Args:
        contexts: List of context strings.
        questions: List of question strings, parallel to ``contexts``.
        max_length: Maximum length passed to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        stride=doc_stride,
    )
    return tokenizer(contexts, questions, **tokenizer_options)

# Encode both splits with a 512-token limit.
train_encodings = tokenize_examples(train_contexts, train_questions, max_length=512)
valid_encodings = tokenize_examples(valid_contexts, valid_questions, max_length=512)


In [None]:
# List the fields produced by the tokenizer.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Number of training pairs, taken from the length of the flattened context list.
no_of_encodings = len(train_contexts)
print(f'We have {no_of_encodings} context-question pairs')


We have 37111 context-question pairs


In [None]:
# Decode the first encoded training pair back to text to inspect its layout
# (special tokens and padding included).
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] architecturally the school has a catholic character. atop the main building school dome is the golden statue of the virgin mary. immediately in front of the main building in facing it is a copper statue of christ with arms appraised with the legend and the bad meow names. next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto im mary in place of prayer and reflection. it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to st bernadette still burning eighteen fifty eight. at the end of the main drive and in a direct line that connects through three statues in the gold dome is as simple modern stone statue of mary. [SEP] what is in front of the notre dame main building? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [None]:
def add_token_positions(encodings, answers):
    """Attach token-level answer boundaries to ``encodings`` in place.

    For example ``i`` the first answer character (``answer_start``) and the
    last answer character (``answer_end - 1``), each clamped to be
    non-negative, are mapped to token indices with
    ``encodings.char_to_token``. When the lookup returns ``None`` the
    module-level ``tokenizer.model_max_length`` is stored instead.

    Args:
        encodings: Tokenizer output for the batch; receives the new
            ``'start_positions'`` and ``'end_positions'`` entries via ``update``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``,
            parallel to the encoded examples.
    """
    def resolve(example_idx, char_pos):
        # None means the character has no token in this encoding (the answer
        # passage has been truncated); fall back to the sentinel value.
        token_idx = encodings.char_to_token(example_idx, char_pos)
        return tokenizer.model_max_length if token_idx is None else token_idx

    start_positions, end_positions = [], []
    for example_idx, answer in enumerate(answers):
        first_char = max(0, answer['answer_start'])
        last_char = max(0, answer['answer_end'] - 1)
        start_positions.append(resolve(example_idx, first_char))
        end_positions.append(resolve(example_idx, last_char))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Add start/end token positions to both encodings (mutates them in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

In [None]:
# Spot-check the first ten start token positions.
train_encodings['start_positions'][:10]

[36, 11, 55, 107, 25, 21, 43, 67, 28, 22]

In [None]:
class SQuAD_Dataset(torch.utils.data.Dataset):
  """Map-style dataset of tokenized (context, question) pairs with answer positions.

  All examples are tokenized once at construction time. Each item is a dict
  with tensor fields ``input_ids``, ``attention_mask`` and ``token_type_ids``
  plus plain-int ``start_positions`` and ``end_positions``.
  """

  def __init__(self, contexts, questions, answers, tokenizer, max_length=512, doc_stride=128):
    """Store the raw examples and tokenizer settings, then build the encodings."""
    self.contexts, self.questions, self.answers = contexts, questions, answers
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.doc_stride = doc_stride
    self.encodings = self._get_encodings()

  def _token_index(self, encodings, example_idx, char_pos):
    """Map a character offset to a token index, or to the tokenizer's
    ``model_max_length`` when the lookup returns ``None``."""
    token_idx = encodings.char_to_token(example_idx, char_pos)
    if token_idx is None:
      # The answer passage has been truncated out of this encoding.
      return self.tokenizer.model_max_length
    return token_idx

  def _get_encodings(self):
    """Tokenize every pair (context first, question second) and attach the
    token-level start/end positions of each answer."""
    encodings = self.tokenizer(
      self.contexts,
      self.questions,
      truncation=True,
      padding=True,
      max_length=self.max_length,
      stride=self.doc_stride,
    )

    start_positions, end_positions = [], []
    for example_idx, answer in enumerate(self.answers):
      # answer_end is exclusive, so the last answer character is at answer_end - 1.
      first_char = max(0, answer['answer_start'])
      last_char = max(0, answer['answer_end'] - 1)
      start_positions.append(self._token_index(encodings, example_idx, first_char))
      end_positions.append(self._token_index(encodings, example_idx, last_char))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors (sequence fields) and ints (positions)."""
    item = {}
    for key, val in self.encodings.items():
      item[key] = torch.tensor(val[idx])

    for key in ('input_ids', 'attention_mask', 'token_type_ids'):
      item[key] = item[key].squeeze()
    for key in ('start_positions', 'end_positions'):
      item[key] = item[key].item()

    return item

  def __len__(self):
    """Number of examples, taken from the list of contexts."""
    return len(self.contexts)

# Build the datasets; each constructor call tokenizes its whole split.
train_dataset = SQuAD_Dataset(train_contexts, train_questions, train_answers, tokenizer)
valid_dataset = SQuAD_Dataset(valid_contexts, valid_questions, valid_answers, tokenizer)


In [None]:
from torch.utils.data import DataLoader

# Batch both splits in groups of 16; only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

In [None]:
from transformers import BertForQuestionAnswering

# Load a BERT checkpoint into the BERT QA architecture. The checkpoint must match
# both the model class and the 'bert-base-uncased' tokenizer used to build the
# encodings; a RoBERTa/ALBERT checkpoint has a different architecture and vocabulary.
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at albert-base-v2 were not used when initializing BertForQuestionAnswering: ['predictions.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight', 'albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias', 'albert.embeddings.token_type_embeddings.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.dense.weight', 'albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight', 'albert.encoder.embedding_hidden_mapping_in.weight', 'predictions.LayerNorm.bias', 'albert.enc

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
# Fine-tune BERT for extractive question answering on the training loader.
from tqdm.auto import tqdm

model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)

N_EPOCHS = 1
# `transformers.AdamW` is deprecated and is not importable from recent
# transformers releases, so the PyTorch optimizer is used instead. `eps` and
# `weight_decay` are set to the values the transformers implementation used
# by default so the optimisation behaviour stays the same.
optim = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

model.to(device)

for epoch in range(N_EPOCHS):
    # Make sure training mode is active at the start of every epoch.
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # NOTE: token_type_ids are not passed to the model here; code that
        # runs inference with this model should feed inputs the same way.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # With start/end positions supplied, the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss.item())


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

  0%|          | 0/2320 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
model_path = 'DeepLearning'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('DeepLearning/tokenizer_config.json',
 'DeepLearning/special_tokens_map.json',
 'DeepLearning/vocab.txt',
 'DeepLearning/added_tokens.json',
 'DeepLearning/tokenizer.json')

In [None]:
model.eval()

# Count correct boundary predictions over the whole validation set. Averaging
# per-batch accuracies instead would give a short final batch the same weight
# as a full one.
n_correct = 0
n_scored = 0

for batch in tqdm(valid_loader):
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)

    # Highest-scoring token index per example for each boundary.
    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    n_correct += (start_pred == start_true).sum().item()
    n_correct += (end_pred == end_true).sum().item()
    n_scored += start_true.numel() + end_true.numel()

# Fraction of start and end positions predicted exactly.
acc = n_correct / n_scored if n_scored else 0.0

# Show true vs. predicted positions for the last batch only.
print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
  print(f"true\t{start_true[i]}\t{end_true[i]}\n"
        f"pred\t{start_pred[i]}\t{end_pred[i]}\n")
     

  0%|          | 0/993 [00:00<?, ?it/s]



T/P	answer_start	answer_end

true	59	59
pred	51	54

true	59	60
pred	51	54

true	59	59
pred	51	54



In [None]:
pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jiwer
  Downloading jiwer-3.0.1-py3-none-any.whl (21 kB)
Collecting rapidfuzz==2.13.7
  Downloading rapidfuzz-2.13.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.1 rapidfuzz-2.13.7


In [None]:
import jiwer

model.eval()

# Totals over the whole validation set, so a short final batch is not
# over-weighted the way a mean of per-batch accuracies would be.
n_correct = 0
n_scored = 0
wer_scores = []

for batch in tqdm(valid_loader):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Calculate accuracy
        n_correct += (start_pred == start_true).sum().item()
        n_correct += (end_pred == end_true).sum().item()
        n_scored += start_true.numel() + end_true.numel()

        # Calculate WER
        for i in range(len(start_true)):
            # End positions are inclusive token indices, so the slice must run
            # to end + 1; otherwise the last answer token is dropped and every
            # single-token answer decodes to an empty string.
            true_text = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1])
            pred_text = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1])
            if true_text.strip() == "":
                # No reference text to compare against (e.g. the gold position
                # lies outside the encoded sequence).
                continue
            wer_scores.append(jiwer.wer(true_text, pred_text))

acc = n_correct / n_scored if n_scored else 0.0
# Guard against an empty list when every reference was skipped.
wer = sum(wer_scores) / len(wer_scores) if wer_scores else float('nan')

print(f'Accuracy: {acc:.4f}')
print(f'WER: {wer:.4f}')

# Show true vs. predicted positions for the last batch only.
print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}")

  0%|          | 0/993 [00:00<?, ?it/s]

Accuracy: 0.5956
WER: 1.8495


T/P	answer_start	answer_end

true	59	59
pred	51	54
true	59	60
pred	51	54
true	59	59
pred	51	54


In [None]:
def get_prediction(context, question):
  """Return the answer span the notebook's model predicts for one question.

  The input is built the same way as the training data in this notebook: the
  context is the first sequence and the question the second, the pair is
  truncated to the tokenizer's maximum length, and only ``input_ids`` and
  ``attention_mask`` are fed to the model (no ``token_type_ids``). Feeding the
  pair in a different order or with segment ids gives the model an input
  layout it was not fine-tuned on.

  Args:
    context: Passage to extract the answer from.
    question: Question about the passage.

  Returns:
    The decoded text between the highest-scoring start token and the
    highest-scoring end token (inclusive); an empty string when the predicted
    end precedes the predicted start.
  """
  inputs = tokenizer(context, question, truncation=True, return_tensors='pt').to(device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

  answer_start = torch.argmax(outputs['start_logits'])
  # +1 because the predicted end index is inclusive and slicing is exclusive.
  answer_end = torch.argmax(outputs['end_logits']) + 1

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer

def normalize_text(s):
  """Canonicalize answer text before comparison.

  Steps, in order: lowercase, drop ASCII punctuation characters, replace the
  standalone words "a", "an" and "the" with a space, then collapse all runs
  of whitespace into single spaces (also trimming both ends).
  """
  import string, re

  lowered = s.lower()

  punctuation = set(string.punctuation)
  without_punc = "".join(ch for ch in lowered if ch not in punctuation)

  article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
  without_articles = re.sub(article_pattern, " ", without_punc)

  return " ".join(without_articles.split())

def exact_match(prediction, truth):
    """Return True when prediction and truth are equal after normalize_text."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return bool(normalized_prediction == normalized_truth)

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted and a reference answer.

  Both strings are normalized with normalize_text and split on whitespace.
  The overlap is counted as a multiset, so a token that occurs several times
  is matched as many times as it appears in both answers. With a plain set
  intersection, identical answers containing a repeated word would score
  below 1.0.

  Args:
    prediction: Predicted answer text.
    truth: Reference answer text.

  Returns:
    ``1``/``0`` when either side has no tokens (1 only if both are empty),
    ``0`` when no token is shared, otherwise the F1 score rounded to two
    decimals.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Multiset intersection: each token counts min(occurrences in pred, in truth) times.
  overlap = Counter(pred_tokens) & Counter(truth_tokens)
  num_same = sum(overlap.values())

  # if there are no common tokens then f1 = 0
  if num_same == 0:
    return 0

  prec = num_same / len(pred_tokens)
  rec = num_same / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)
  
def question_answer(context, question,answer):
  """Predict an answer for ``question`` over ``context`` and print a short report.

  The report lists the question, the prediction, the reference ``answer``,
  the exact-match flag and the F1 score, followed by a blank line.
  """
  prediction = get_prediction(context,question)
  em_score = exact_match(prediction, answer)
  f1_score = compute_f1(prediction, answer)

  report_lines = [
    f'Question: {question}',
    f'Prediction: {prediction}',
    f'True Answer: {answer}',
    f'Exact match: {em_score}',
    f'F1 score: {f1_score}\n',
  ]
  for line in report_lines:
    print(line)

In [None]:
context = """Tetris (Russian: Тетрис)[a] is a puzzle video game created by the Soviet software engineer Alexey Pajitnov in 1984. 
It has been published by several companies for multiple platforms, most prominently during a dispute over the appropriation of the rights 
in the late 1980s. After a significant period of publication by Nintendo, the rights reverted to Pajitnov in 1996, who co-founded 
the Tetris Company with Henk Rogers to manage licensing. In Tetris, players complete lines by moving differently shaped pieces (tetrominoes), 
which descend onto the playing field. The completed lines disappear and grant the player points, and the player can proceed to fill 
the vacated spaces. The game ends when the uncleared lines reach the top of the playing field. The longer the player can delay this 
outcome, the higher their score will be. In multiplayer games, players must last longer than their opponents; in certain versions, 
players can inflict penalties on opponents by completing a significant number of lines. Some versions add variations on the rules, 
such as three-dimensional displays or a system for reserving pieces."""


# Hand-written questions about the passage above, with the expected answers
# in the same order.
questions = ["What is the paragraph talking about?",
             "What is Tetris?",
             "When was it created?",
             "Who created it?",
             "How the game is played?",
             "When the game ends?"]

answers = ["Tetris", "puzzle video game", "1984", 
           "Alexey Pajitnov", "complete lines by moving differently shaped pieces", 
           "uncleared lines reach the top of the playing field"]

# Print prediction, exact match and F1 for every question/answer pair.
for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: What is the paragraph talking about?
Prediction: puzzle video game created by the soviet software engineer alexey pajitnov in 1984. it has been published by several companies for multiple platforms, most prominently during a dispute over the appropriation of the rights in the late 1980s. after a significant period of publication by nintendo, the rights reverted to pajitnov in 1996, who co - founded the tetris company with henk rogers to manage licensing. in tetris, players complete lines by moving differently shaped pieces ( tetrominoes ), which descend onto the playing field
True Answer: Tetris
Exact match: False
F1 score: 0.03

Question: What is Tetris?
Prediction: puzzle video game created by the soviet software engineer alexey pajitnov in 1984. it has been published by several companies for multiple platforms, most prominently during a dispute over the appropriation of the rights in the late 1980s. after a significant period of publication by nintendo, the rights reverted