In [28]:
!pip install transformers
from torchvision.datasets.utils import download_url



In [29]:
# Dowload the dataset
dataset_url_train = " https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
dataset_url_test = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
download_url(dataset_url_train, '.')
download_url(dataset_url_test, '.')

Using downloaded and verified file: ./train-v2.0.json
Using downloaded and verified file: ./dev-v2.0.json


In [30]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

# Read all data
train_contexts, train_questions, train_answers = read_squad('train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('dev-v2.0.json')

# Limit the datasets to the first 10,000 and 3,000 samples, respectively
train_contexts = train_contexts[:10000]
train_questions = train_questions[:10000]
train_answers = train_answers[:10000]

val_contexts = val_contexts[:3000]
val_questions = val_questions[:3000]
val_answers = val_answers[:3000]


In [31]:
print(len(train_contexts))
print(len(train_questions))
print(len(train_answers))

10000
10000
10000


In [32]:
print(len(val_contexts))
print(len(val_questions))
print(len(val_answers))

3000
3000
3000


In [33]:
print(f'{train_contexts[0]} \n')
print(f'{train_questions[0]} \n')
print(train_answers[0])

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy". 

When did Beyonce start becoming popular? 

{'text': 'in the late 1990s', 'answer_start': 269}


In [34]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [35]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [36]:
train_encodings[0:5]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [37]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [38]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [39]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
from tqdm.notebook import tqdm

In [27]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [41]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=0708f5f5a3fdc792d90afbd71cb565d560f170dc8a50c6c34ec5cd95d71e67c0
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [42]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
import numpy as np

# Set device and model to evaluation mode
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Training loop
for epoch in range(3):
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

# Evaluation loop
model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Process predictions and references
        for i in range(len(start_logits)):
            # Get predicted and true answer spans
            pred_start = torch.argmax(start_logits[i])
            pred_end = torch.argmax(end_logits[i])
            pred_answer = tokenizer.decode(input_ids[i][pred_start:pred_end+1], skip_special_tokens=True)
            true_answer = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i]+1], skip_special_tokens=True)

            predictions.append(pred_answer)
            references.append(true_answer)

# Calculate metrics
def compute_metrics(predictions, references):
    # Calculate Exact Match and Partial Match
    exact_matches = [1 if pred == ref else 0 for pred, ref in zip(predictions, references)]
    partial_matches = [1 if ref in pred or pred in ref else 0 for pred, ref in zip(predictions, references)]
    exact_match_score = np.mean(exact_matches)
    partial_match_score = np.mean(partial_matches)

    # Calculate Accuracy, Precision, Recall, and F1
    binary_predictions = [1 if pred == ref else 0 for pred, ref in zip(predictions, references)]
    accuracy = accuracy_score(binary_predictions, [1] * len(binary_predictions))
    precision, recall, f1, _ = precision_recall_fscore_support(binary_predictions, [1] * len(binary_predictions), average='binary')

    # Calculate BLEU Score
    bleu_scores = [sentence_bleu([ref.split()], pred.split()) for ref, pred in zip(references, predictions)]
    bleu_score_avg = np.mean(bleu_scores)

    # Calculate ROUGE Scores
    rouge_1, rouge_2, rouge_L = [], [], []
    for ref, pred in zip(references, predictions):
        rouge_scores = rouge.score(ref, pred)
        rouge_1.append(rouge_scores['rouge1'].fmeasure)
        rouge_2.append(rouge_scores['rouge2'].fmeasure)
        rouge_L.append(rouge_scores['rougeL'].fmeasure)

    rouge_1_avg = np.mean(rouge_1)
    rouge_2_avg = np.mean(rouge_2)
    rouge_L_avg = np.mean(rouge_L)

    # Combine all metrics in a dictionary
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "exact_match": exact_match_score,
        "partial_match": partial_match_score,
        "bleu": bleu_score_avg,
        "rouge_1": rouge_1_avg,
        "rouge_2": rouge_2_avg,
        "rouge_L": rouge_L_avg,
    }
    return metrics

# Compute and print evaluation metrics
metrics = compute_metrics(predictions, references)
print("Evaluation Metrics:", metrics)


100%|██████████| 625/625 [07:38<00:00,  1.36it/s]
100%|██████████| 625/625 [07:37<00:00,  1.37it/s]
100%|██████████| 625/625 [07:43<00:00,  1.35it/s]
100%|██████████| 188/188 [00:37<00:00,  4.98it/s]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Evaluation Metrics: {'accuracy': 0.43733333333333335, 'precision': 0.43733333333333335, 'recall': 1.0, 'f1_score': 0.608534322820037, 'exact_match': 0.43733333333333335, 'partial_match': 0.773, 'bleu': 0.0634879028266244, 'rouge_1': 0.602600263216409, 'rouge_2': 0.3368969725949209, 'rouge_L': 0.6022126864723895}
