# Answer Extraction Pipeline for TriviaQA

Contains dataset preprocessing, training and evaluation code for the question answering task using DistilBERT.


## Installs and Imports

In [13]:
!pip install transformers
!pip install datasets



## Load Data

In [14]:
from datasets import load_dataset
import random

dataset = load_dataset("trivia_qa", "unfiltered")

# Select 8k samples for training and 2k for testing with random seed
train_dataset = dataset["train"].shuffle(seed=42).select(range(8000))
test_dataset = dataset["validation"].shuffle(seed=42).select(range(2000))

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/36 [00:00<?, ?it/s]

In [15]:
# Function to construct context, question, and answer data with structured search results and answer fields
def prepare_data(data):
    contexts = []
    questions = [item['question'] for item in data]

    # Build contexts by combining fields within 'search_results'
    for item in data:
        # Extract subfields in search_results and join relevant fields


        # Combine fields into a single context
        context = item['search_results']['description']
        contexts.append(context)

    # Extract answers prioritizing 'value', then 'aliases' or 'normalized_value' if needed
    answers = []
    for item in data:
        answer_data = item.get('answer', {})
        if 'value' in answer_data and answer_data['value']:
            answer = answer_data['value']
        elif 'aliases' in answer_data and answer_data['aliases']:
            answer = answer_data['aliases'][0]
        elif 'normalized_value' in answer_data and answer_data['normalized_value']:
            answer = answer_data['normalized_value']
        else:
            answer = ''  # Handle missing answers if necessary
        answers.append(answer)

    return contexts, questions, answers

# Apply prepare_data to both train and test data
train_contexts, train_questions, train_answers = prepare_data(train_dataset)
test_contexts, test_questions, test_answers = prepare_data(test_dataset)

# Verify a few samples
print("Training Contexts:", train_contexts[:3])
print("Training Questions:", train_questions[:3])
print("Training Answers:", train_answers[:3])


Training Contexts: [[], ['The Two Double Landlocked Countries ... There are only two such countries in the world. Liechtenstein in Europe is surrounded by two landlocked countries; ...', 'Learn about how many countries are landlocked, which is the largest landlocked country, and which two countries are double landlocked.', "Which Countries are Landlocked? ... Two countries in the world are double landlocked, ... are the world's only double land locked countries and that Kazakhstan is the ...", 'Double Landlocked Countries. ... Someone beginning a journey from a double landlocked country would have to cross two international borders before reaching a coastline.', 'Map of Landlocked Countries. ... two double-landlocked counties, which are bordered only by sovereign states that are themselves landlocked. The other country in ...', "Approximately one-fifth of the world's countries are landlocked and have no access to ... learn all about landlocked countries from your About.com ...", 'Which

In [16]:
# Verify a few samples
print("Training Contexts:", train_contexts[:3])
print("Training Questions:", train_questions[:3])
print("Training Answers:", train_answers[:3])

Training Contexts: [[], ['The Two Double Landlocked Countries ... There are only two such countries in the world. Liechtenstein in Europe is surrounded by two landlocked countries; ...', 'Learn about how many countries are landlocked, which is the largest landlocked country, and which two countries are double landlocked.', "Which Countries are Landlocked? ... Two countries in the world are double landlocked, ... are the world's only double land locked countries and that Kazakhstan is the ...", 'Double Landlocked Countries. ... Someone beginning a journey from a double landlocked country would have to cross two international borders before reaching a coastline.', 'Map of Landlocked Countries. ... two double-landlocked counties, which are bordered only by sovereign states that are themselves landlocked. The other country in ...', "Approximately one-fifth of the world's countries are landlocked and have no access to ... learn all about landlocked countries from your About.com ...", 'Which

In [17]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
# Step 2: Load the DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Flatten function to turn each list of strings into a single string
def flatten_contexts(nested_contexts):
    flat_contexts = []
    for context in nested_contexts:
        # Join inner lists into a single string, or handle empty lists as ""
        flat_contexts.append(" ".join(context) if isinstance(context, list) else context)
    return flat_contexts

# Apply flattening to train and test contexts
train_contexts = flatten_contexts(train_contexts)
test_contexts = flatten_contexts(test_contexts)

# Updated tokenize_data function
def tokenize_data(contexts, questions, answers, tokenizer, max_length=512):
    encodings = tokenizer(contexts, questions, truncation=True, padding=True, max_length=max_length)

    # Create labels (start and end positions for answers)
    start_positions = []
    end_positions = []
    for i in range(len(contexts)):
        context = contexts[i]
        answer = answers[i]

        # Find answer positions
        start_pos = context.find(answer)
        end_pos = start_pos + len(answer)

        # Append start and end positions
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    encodings['start_positions'] = torch.tensor(start_positions)
    encodings['end_positions'] = torch.tensor(end_positions)

    return encodings




In [19]:
# Step 3: Tokenize the data
from tqdm import trange
def tokenize_data(contexts, questions, answers, tokenizer, max_length=512):

    encodings = tokenizer(contexts, questions, truncation=True, padding=True, max_length=max_length)

    # Create the labels (start and end positions for answers)
    start_positions = []
    end_positions = []
    for i in trange(len(contexts)):
        answer = answers[i]
        start_pos = contexts[i].find(answer)  # Find the start position of the answer
        end_pos = start_pos + len(answer)  # Find the end position of the answer
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    encodings['start_positions'] = torch.tensor(start_positions)
    encodings['end_positions'] = torch.tensor(end_positions)

    return encodings

In [20]:
train_encodings = tokenize_data(train_contexts, train_questions, train_answers, tokenizer)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [21]:
test_encodings = tokenize_data(test_contexts, test_questions, test_answers, tokenizer)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [22]:
# Step 4: Fine-tune the Model
train_dataset = Dataset.from_dict(train_encodings)
test_dataset = Dataset.from_dict(test_encodings)

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    evaluation_strategy="epoch",     # evaluation strategy to use
    learning_rate=2e-5,              # learning rate for training
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,
    report_to= "none"# strength of weight decay
)

trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,

    # evaluation dataset
)



In [25]:
trainer.train()

# Step 5: Evaluate the Model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Epoch,Training Loss,Validation Loss
1,3.6069,3.479216
2,3.5469,3.469323
3,3.4309,3.477627


Evaluation Results: {'eval_loss': 3.4776270389556885, 'eval_runtime': 28.3409, 'eval_samples_per_second': 70.569, 'eval_steps_per_second': 8.821, 'epoch': 3.0}


In [28]:
save_path = "/content/drive/My Drive/finetuned_model"
import os
os.makedirs(save_path, exist_ok=True)


In [29]:
import torch

# Save model weights
torch.save(model.state_dict(), os.path.join(save_path, "model_weights.pth"))

# Save tokenizer
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to Google Drive at {save_path}")


Model and tokenizer saved to Google Drive at /content/drive/My Drive/finetuned_model


In [30]:
torch.save(model, os.path.join(save_path, "model.pth"))


In [None]:
# Save model weights
torch.save(model.state_dict(), save_path)

# Save tokenizer
tokenizer.save_pretrained("fine_tuned_tokenizer")

In [None]:
import torch

# Set up device (either 'cuda' if available or 'cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the correct device
model.to(device)

# Ensure the dataset encodings are PyTorch tensors and move them to the device
train_encodings['input_ids'] = torch.tensor(train_encodings['input_ids']).to(device)
train_encodings['attention_mask'] = torch.tensor(train_encodings['attention_mask']).to(device)
test_encodings['input_ids'] = torch.tensor(test_encodings['input_ids']).to(device)
test_encodings['attention_mask'] = torch.tensor(test_encodings['attention_mask']).to(device)

# Function to get predictions
def get_predictions(dataset, model, tokenizer):
    predictions = []
    for i in range(len(dataset['input_ids'])):
        # Convert each instance to a tensor and add batch dimension
        input_ids = dataset['input_ids'][i].unsqueeze(0).to(device)
        attention_mask = dataset['attention_mask'][i].unsqueeze(0).to(device)

        # Get model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits

        # Get the most probable start and end token positions
        start_pos = torch.argmax(start_logits)
        end_pos = torch.argmax(end_logits)

        # Decode the answer
        answer = tokenizer.decode(input_ids[0][start_pos:end_pos + 1])
        predictions.append(answer)

    return predictions

# Example usage
train_predictions = get_predictions(train_encodings, model, tokenizer)
test_predictions = get_predictions(test_encodings, model, tokenizer)

# Print a few predictions for verification
print("Train Predictions:", train_predictions[:3])
print("Test Predictions:", test_predictions[:3])


In [None]:

with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Process predictions and references
        for i in range(len(start_logits)):
            # Get predicted and true answer spans
            pred_start = torch.argmax(start_logits[i])
            pred_end = torch.argmax(end_logits[i])
            pred_answer = tokenizer.decode(input_ids[i][pred_start:pred_end+1], skip_special_tokens=True)
            true_answer = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i]+1], skip_special_tokens=True)

            predictions.append(pred_answer)
            references.append(true_answer)

In [None]:
import sys
import re
import string
from collections import Counter
import pickle

def preprocessed_answer(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [None]:
from collections import Counter
import pickle
from copy import deepcopy
def evaluation_metric(predictions, references):

    predictions_=[]
    ground_truths_=[]
    precisions_=[]
    recalls_=[]
    f1s_=[]
    accuracy_=[]
    ZERO_METRIC = (0, 0, 0)
    for x,y in list(zip(predictions, references)):


        prediction=x
        ground_truth=y
        predictions_.append(x)
        ground_truths_.append(y)

        normalized_prediction = preprocessed_answer(prediction)
        normalized_ground_truth = preprocessed_answer(ground_truth)





        prediction_tokens = normalized_prediction.split()
        ground_truth_tokens = normalized_ground_truth.split()
        common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
        num_same = sum(common.values())
        if num_same == 0:
          precision,recall,f1=(0,0,0)
          continue
        precision = 1.0 * num_same / len(prediction_tokens) if len(prediction_tokens) > 0 else None
        recall = 1.0 * num_same / len(ground_truth_tokens) if len(ground_truth_tokens)>0 else None
        f1 = (2 * precision * recall) / (precision + recall)
        accuracy = num_same / (len(prediction_tokens) + len(ground_truth_tokens) - num_same) if (len(prediction_tokens) + len(ground_truth_tokens) - num_same) > 0 else None
        precisions_.append(deepcopy(precision))
        recalls_.append(deepcopy(recall))
        f1s_.append(deepcopy(f1))
        accuracy_.append(deepcopy(accuracy))
    print(precisions_)
    print(recalls_)
    print(f1s_)
    precision=sum(precisions_)/len(precisions_)
    recall=sum(recalls_)/len(recalls_)
    f1=sum(f1s_)/len(f1s_)
    accuracy=sum(accuracy_)/len(accuracy_)


    return f1, precision, recall,accuracy

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom Dataset class to wrap your encodings
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

# Convert your encodings into Dataset instances
train_dataset = QADataset(train_encodings)
test_dataset = QADataset(test_encodings)

# Prepare DataLoader for batched evaluation
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Evaluation function with DataLoader and ground truth extraction
def evaluate_predictions(data_loader, model, tokenizer):
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Ensure ground truth start and end positions exist in the batch for reference extraction
            if 'start_positions' in batch and 'end_positions' in batch:
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)

            # Model predictions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            # Process predictions and references for each item in the batch
            for i in range(len(start_logits)):
                # Get predicted start and end positions
                pred_start = torch.argmax(start_logits[i])
                pred_end = torch.argmax(end_logits[i])
                pred_answer = tokenizer.decode(input_ids[i][pred_start:pred_end + 1], skip_special_tokens=True)

                # Collect predicted answers
                predictions.append(pred_answer)

                # Decode the ground truth answer if available
                if 'start_positions' in batch and 'end_positions' in batch:
                    true_answer = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i] + 1], skip_special_tokens=True)
                    references.append(true_answer)

    return predictions, references

# Example usage with training and test datasets
train_predictions, train_references = evaluate_predictions(train_loader, model, tokenizer)
test_predictions, test_references = evaluate_predictions(test_loader, model, tokenizer)

# Print a few examples to verify
print("Train Predictions:", train_predictions[:3])
print("Train References:", train_references[:3])
print("Test Predictions:", test_predictions[:3])
print("Test References:", test_references[:3])


In [None]:
f1, precision, recall, accuracy= evaluation_metric(test_predictions, test_references)
print(f1)
print(precision)
print(recall)
print(accuracy)

In [None]:
def exact_match_score(prediction, ground_truth):
    return (preprocessed_answer(prediction) == preprocessed_answer(ground_truth))

In [None]:
exact_matches=[]
for x,y in list(zip(test_predictions, test_references)):
  prediction=x
  ground_truth=y
  exact_matches.append(exact_match_score(prediction, ground_truth))

exact_match=sum(exact_matches)/len(exact_matches)
print(exact_match)

In [None]:
pip install rouge_score

In [None]:
from rouge_score import rouge_scorer

def calculate_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    # Average scores across all samples
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

    return {'ROUGE-1': avg_rouge1, 'ROUGE-2': avg_rouge2, 'ROUGE-L': avg_rougeL}

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu(predictions, references):
    bleu_scores = []

    for pred, ref in zip(predictions, references):
        ref_tokens = ref.split()
        pred_tokens = pred.split()
        bleu_score = sentence_bleu([ref_tokens], pred_tokens)
        bleu_scores.append(bleu_score)

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    return avg_bleu

In [None]:
from collections import Counter

def calculate_partial_match(predictions, references):
    partial_match_scores = []

    for pred, ref in zip(predictions, references):
        pred_tokens = set(pred.split())
        ref_tokens = set(ref.split())
        common_tokens = pred_tokens & ref_tokens
        partial_match_score = len(common_tokens) / len(pred_tokens | ref_tokens) if (pred_tokens | ref_tokens) else 0
        partial_match_scores.append(partial_match_score)

    avg_partial_match = sum(partial_match_scores) / len(partial_match_scores)
    return avg_partial_match

In [None]:
# Calculate ROUGE
rouge_scores = calculate_rouge(test_predictions, test_references)
print("ROUGE Scores:", rouge_scores)

# Calculate BLEU
bleu_score = calculate_bleu(test_predictions, test_references)
print("BLEU Score:", bleu_score)

# Calculate Partial Match
partial_match_score = calculate_partial_match(test_predictions, test_references)
print("Partial Match Score:", partial_match_score)

# Training & Evaluation

### Run Training

### Run Evaluation