<a href="https://colab.research.google.com/github/SppEric/qa-system/blob/main/Question_Answering_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 14.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 68.5 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 45.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 6.8 MB/s 
Installing colle

In [None]:
import torch
import numpy as np
import random
from tqdm.auto import tqdm

# Fix the seed of every random number generator this notebook touches so
# that repeated runs produce the same numbers.
seed = 123
for _seed_rng in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
):
  _seed_rng(seed)

# Disable cuDNN autotuning and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic = False, True

# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model Definition


## Load the Model


In [None]:
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

'\nSome options for BERT model that can be run in colab:\n\n"distilbert-base-uncased",\n"distilbert-base-uncased-distilled-squad",\n"distilbert-base-cased",\n"distilbert-base-cased-distilled-squad",\n\n'

Choosing here to go with `distilbert-base-cased-distilled-squad`

In [None]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast
def load_model(model_name="distilbert-base-cased-distilled-squad"):
  """Load a DistilBERT question-answering model and its matching fast tokenizer.

  Both objects are created from the same checkpoint name so that the
  tokenizer's vocabulary always matches the model's embeddings.

  :param str model_name: checkpoint identifier passed to `from_pretrained`;
    defaults to the cased DistilBERT checkpoint distilled on SQuAD.

  :return: (model, tokenizer) — the tokenizer is used later in preprocessing
  """
  model = DistilBertForQuestionAnswering.from_pretrained(model_name)
  tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
  return model, tokenizer

model, tokenizer = load_model()

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

# Dataset Loading


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path handed to
# load_data() further down lives under this mount point.
# NOTE(review): the google.colab import presumably only resolves inside a
# Colab runtime — confirm before running this notebook elsewhere.
drive.mount('/content/drive')

Mounted at /content/drive


## Define function `load_data()` for `main()`


In [None]:
from datasets import load_dataset
def load_data(filepath):
  '''Load the train and validation splits of the Natural Questions data.

  :param String filepath: path to the folder that contains train.json and
    dev.json (a trailing slash is accepted but not required)

  :return Dataset train, Dataset validation: train.json and dev.json
    respectively
  '''
  import os

  # os.path.join copes with a folder path given with or without a trailing
  # separator, unlike plain string concatenation.
  data_files = {
    "train": os.path.join(filepath, "train.json"),
    # dev.json is registered under the "test" split name but serves as the
    # validation set for the rest of the notebook.
    "test": os.path.join(filepath, "dev.json"),
  }
  dataset = load_dataset('json', data_files=data_files)
  return dataset['train'], dataset['test']

train, validation = load_data("/content/drive/MyDrive/data/")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-37f24819b7460793/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-37f24819b7460793/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
print(len(validation))

871


### Examine the Dataset

In [None]:
print(validation[10])
print(type(train))

{'name': 'Cup (unit)', 'id': '7369861182720043754', 'questions': [{'input_text': 'what is a cup measurement in american recipes'}], 'answers': [{'candidate_id': 0, 'input_text': 'short', 'span_end': 144, 'span_start': 117, 'span_text': 'equal to half a liquid pint'}], 'has_correct_context': True, 'contexts': 'The cup is an English unit of volume , most commonly associated with cooking and serving sizes . It is traditionally equal to half a liquid pint in either US customary units or the British imperial system but is now separately defined in terms of the metric system at values between ⁄ and ⁄ of a liter . Because actual drinking cups may differ greatly from the size of this unit , standard measuring cups are usually used instead .'}
<class 'datasets.arrow_dataset.Dataset'>


# Preprocessing and Tokenizing

### Dataset Processing and Tokenizing

In [None]:
batch_size = 32
def preprocess(data):
  """Tokenize a batch of question/context pairs and label the answer span.

  Uses the notebook-level `tokenizer`. Every example is encoded as
  question + context, truncated on the context side only and padded to
  `tokenizer.model_max_length`.

  :param data: a batch (dict of columns) with 'questions', 'contexts' and
    'answers'; only the first question and the first answer of each example
    are used. 'span_start'/'span_end' are character offsets into the context.

  :return: the tokenizer output with two extra entries:
    'start_positions' — token index of the first answer token, and
    'end_positions' — token index one past the last answer token, so that
    input_ids[start:end] slices out the answer.
  """
  # Each example stores its question as a one-element list of dicts
  questions = [question[0]['input_text'] for question in data['questions']]

  # Tokenize input
  inputs = tokenizer(
    # Concatenate question with context
    questions,
    data['contexts'],

    # Additional Arguments
    max_length              = tokenizer.model_max_length,
    truncation              = 'only_second',
    return_offsets_mapping  = True,
    padding                 = 'max_length',
    add_special_tokens      = True,
    return_tensors          = 'pt',
  )
  offset_mapping = inputs["offset_mapping"]
  answers = data['answers']
  # Ask the tokenizer for the separator id instead of hard-coding 102
  sep_token_id = tokenizer.sep_token_id
  start_positions = []
  end_positions = []

  # For each context insert target tokens
  for i, offset in enumerate(offset_mapping):
    # Grab answer info for this context
    answer = answers[i][0]
    span_start = answer['span_start']
    span_end = answer['span_end']
    ids = inputs['input_ids'][i]
    seq_len = len(ids)

    # The first separator token ends the question; the context follows it
    idx = 0
    while idx < seq_len and ids[idx] != sep_token_id:
      idx += 1

    # Walk forward to the first token that starts after span_start; the
    # token just before it is the one containing the answer's first character
    while idx < seq_len and offset[idx][0] <= span_start:
      idx += 1
    start_token = idx - 1
    start_positions.append(start_token)

    # Find the first token that reaches span_end. The search begins AT the
    # start token (not one past it) so that a single-token answer does not
    # pick up the following token as well.
    idx = start_token
    while idx < seq_len and offset[idx][1] < span_end:
      idx += 1
    end_positions.append(idx + 1) # Exclusive end, to account for the dropped index when slicing

    # NOTE(review): when truncation cuts the answer out of the context, both
    # loops run to the end of the sequence and the labels point at/after the
    # last position instead of at an answer. Decide whether such examples
    # should be dropped or labelled differently.

  # Add start_positions and end_positions to be accessed later and then return
  inputs['start_positions'] = start_positions
  inputs['end_positions'] = end_positions
  return inputs

# Both splits go through the same preprocessing with the same options; the
# raw columns consumed by preprocess() are dropped from the result.
_map_kwargs = dict(
  batched=True,
  batch_size=batch_size,
  remove_columns=['has_correct_context', 'name', 'questions', 'answers'],
)
training_data = train.map(preprocess, **_map_kwargs)
validation_data = validation.map(preprocess, **_map_kwargs)

  0%|          | 0/436 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

In [None]:
print(validation_data)
print(validation[0]['answers'][0]['span_text'])
print(tokenizer.decode(validation_data[0]['input_ids'][validation_data[0]['start_positions'] : validation_data[0]['end_positions']]))
print(validation_data[0]['contexts'])

Dataset({
    features: ['id', 'contexts', 'input_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
    num_rows: 871
})
MGM Resorts International
MGM Resorts International
Mandalay Bay Location Paradise , Nevada , U.S. Address 3950 South Las Vegas Boulevard Opening date March 2 , 1999 ; 18 years ago ( March 2 , 1999 ) Theme Tropical No. of rooms 3,309 Total gaming space 135,000 sq ft ( 12,500 m ) Permanent shows Michael Jackson : One Signature attractions Mandalay Bay Convention Center Mandalay Bay Events Center Shark Reef House of Blues Mandalay Beach Notable restaurants Aureole Alain Ducasse Rivea Charlie Palmer Steak Fleur by Hubert Keller Kumi Lupo Red Square RM Seafood Stripsteak Border Grill Las Vegas Casino type Land - based Owner MGM Resorts International Renovated in 2002 , 2007 Coordinates 36 ° 5 ′ 30 '' N 115 ° 10 ′ 29 '' W ﻿ / ﻿ 36.09167 ° N 115.17472 ° W ﻿ / 36.09167 ; - 115.17472 Coordinates : 36 ° 5 ′ 30 '' N 115 ° 10 ′ 29 '' W ﻿ / ﻿ 36.0916

### Dataloader Definition

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
# Collate function shared by both dataloaders.
data_collator = default_data_collator
# Save offsets for later: eval_loop hands validation_offsets to
# compute_metrics, while the dataloaders below no longer carry that column.
validation_offsets = validation_data["offset_mapping"]

# Construct dataloaders
# offset_mapping is removed before batching; the training and evaluation
# loops call model(**batch), so whatever ends up in the batch is passed to
# the model as keyword arguments.
# Only the training loader shuffles. The validation loader keeps dataset
# order, which compute_metrics relies on when it pairs the i-th prediction
# with the i-th row of validation_data.
train_dataloader = DataLoader(training_data.remove_columns('offset_mapping'), batch_size=batch_size, shuffle=True, collate_fn=data_collator)
validation_dataloader = DataLoader(validation_data.remove_columns('offset_mapping'), batch_size=batch_size, collate_fn=data_collator)
print(validation_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7f02d7fce2e0>


# Define Metrics


In [None]:
from collections import Counter

def _argmax_per_example(batched_logits):
  """Return the highest-scoring token index of every example as a list of ints.

  Accepts either one (num_examples, seq_len) tensor or a sequence of
  per-batch (batch, seq_len) tensors in dataset order; batches may have
  different sizes.
  """
  if not torch.is_tensor(batched_logits):
    batched_logits = torch.cat(list(batched_logits), dim=0)
  return batched_logits.argmax(dim=-1).tolist()


def compute_metrics(validation_data, start_logits, end_logits, raw_data, offsets):
  """Compute token-overlap precision, recall and F1, averaged over examples.

  For every example the predicted span is input_ids[start_guess:end_guess],
  where the guesses are the argmax of the start/end logits, and the reference
  span is input_ids[start_positions:end_positions]. Overlap is counted as a
  multiset intersection of token ids.

  :param validation_data: iterable of examples (with a length), each providing
    'input_ids', 'start_positions' and 'end_positions'
  :param start_logits: start logits, as a list of per-batch tensors in the
    same order as validation_data (or a single stacked tensor)
  :param end_logits: end logits, same layout as start_logits
  :param raw_data: unused; kept so existing callers keep working
  :param offsets: unused; kept so existing callers keep working

  :return dict: {'precision', 'recall', 'f1'} as plain Python floats; all 0.0
    when validation_data is empty
  """
  num_examples = len(validation_data)
  if num_examples == 0:
    return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

  # Flatten the per-batch logits once, so indexing does not depend on any
  # particular batch size, and convert to Python ints so the metrics come out
  # as floats rather than (possibly GPU-resident) tensors.
  start_guesses = _argmax_per_example(start_logits)
  end_guesses = _argmax_per_example(end_logits)

  total_precision = 0.0
  total_recall = 0.0
  total_f1 = 0.0

  for i, example in enumerate(validation_data):
    ids = example['input_ids']
    start_guess = start_guesses[i]
    end_guess = end_guesses[i]

    # Find matching tokens
    s = example['start_positions']
    e = example['end_positions']
    prediction = Counter(ids[start_guess : end_guess])
    ground_truth = Counter(ids[s : e])
    true_positives = sum((prediction & ground_truth).values())

    # An empty or inverted span scores zero instead of dividing by zero
    predicted_len = end_guess - start_guess
    truth_len = e - s
    precision = true_positives / predicted_len if predicted_len > 0 else 0.0  # TP / (TP + FP)
    recall = true_positives / truth_len if truth_len > 0 else 0.0  # TP / (TP + FN)
    if precision > 0 and recall > 0:
      f1 = 2 * precision * recall / (precision + recall)
    else:
      f1 = 0.0

    # Sum to the total
    total_precision += precision
    total_recall += recall
    total_f1 += f1

  return {
      "precision": total_precision / num_examples,
      "recall": total_recall / num_examples,
      "f1": total_f1 / num_examples
  }

# Train the Model

## Define model loops


In [None]:
def eval_loop(model, validation_dataloader, device):
  """Run the model over the validation set and score its predictions.

  NOTE: the metrics are computed by compute_metrics against the
  notebook-level `validation_data`, `validation` and `validation_offsets`,
  so validation_dataloader must iterate over those same examples, unshuffled.

  :param torch.nn.Module model: the model to be evaluated
  :param torch.utils.data.DataLoader validation_dataloader: DataLoader containing the validation set
  :param torch.device device: the device to run the evaluation on

  :return float loss, dict metrics: the validation loss averaged over all
    examples, and the dict returned by compute_metrics
  :raises ValueError: if validation_dataloader contains no batches
  """
  # Put model into evaluation mode
  model.eval()
  model.to(device)

  if len(validation_dataloader) == 0:
    raise ValueError("validation_dataloader contains no batches; nothing to evaluate")

  # Store logit results to calculate metrics once every batch has been seen
  start_logits = []
  end_logits = []
  weighted_loss = 0.0
  num_examples = 0

  # we like progress bars :)
  for batch in tqdm(validation_dataloader):
    # Send the batch to the device the model lives on
    for key in ("input_ids", "start_positions", "end_positions", "attention_mask"):
      batch[key] = batch[key].to(device)

    # Calculate the predictions
    with torch.no_grad():
      outputs = model(**batch)
    start_logits.append(outputs.start_logits)
    end_logits.append(outputs.end_logits)

    # Weight each batch loss by its size so that a smaller final batch does
    # not skew the average
    examples_in_batch = outputs.start_logits.shape[0]
    weighted_loss += outputs.loss.item() * examples_in_batch
    num_examples += examples_in_batch

  # Update the metrics
  metrics = compute_metrics(
      validation_data, start_logits, end_logits, validation, validation_offsets
  )

  print(metrics)
  return weighted_loss / num_examples, metrics

In [None]:
from datasets import load_metric
def train_loop(model, optimizer, num_epochs, train_dataloader, validation_dataloader, lr_scheduler, device):
  """Fine-tune the model, validating after every epoch.

  :param torch.nn.Module model: the model to be trained
  :param torch.optim.Optimizer optimizer: the training optimizer
  :param int num_epochs: number of epochs to train for
  :param torch.utils.data.DataLoader train_dataloader: DataLoader containing training examples
  :param torch.utils.data.DataLoader validation_dataloader: DataLoader containing validation examples
  :param _ lr_scheduler: learning rate scheduler, stepped once per batch
  :param torch.device device: the device that we'll be training on

  :return float train_loss, val_loss: the mean training loss of the last
    epoch and the validation loss reported by eval_loop for the last epoch;
    both are NaN when no epoch was run
  """
  model.to(device)

  # Defined up front so the return statement is valid even with zero epochs
  train_loss = float("nan")
  val_loss = float("nan")

  for epoch in range(num_epochs):
    ## Training
    # put the model in training mode (important that this is done each epoch,
    # since we put the model into eval mode during validation)
    model.train()

    print(f"Epoch {epoch + 1} training:")
    running_loss = 0.0
    num_batches = 0

    for batch in tqdm(train_dataloader):
      for key in ("input_ids", "start_positions", "end_positions", "attention_mask"):
        batch[key] = batch[key].to(device)

      # Calculate the predictions
      predictions = model(**batch)

      # Back propagate
      loss = predictions.loss
      loss.backward()

      # Apply the gradients, then advance the learning-rate schedule
      optimizer.step()
      lr_scheduler.step()

      # Zero the optimizer
      optimizer.zero_grad()

      # Accumulate a detached copy so the epoch loss is an average over all
      # batches rather than whatever the last batch happened to be
      running_loss = running_loss + loss.detach()
      num_batches += 1

    if num_batches:
      train_loss = float(running_loss) / num_batches

    ## Validation
    print("Running validation:")
    val_loss, val_metrics = eval_loop(model, validation_dataloader, device)
    print(f"Epoch {epoch+1} validation: {val_metrics}")
    print(f"Epoch {epoch+1} losses; Train: {train_loss}, Validation: {val_loss}")

  return train_loss, val_loss

## Define inputs to the model

In [None]:
from transformers import get_scheduler

# Number of full passes over the training set made by train_loop.
num_epochs = 5

# AdamW over every model parameter with a fixed base learning rate of 3e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
# train_loop calls lr_scheduler.step() once per training batch, so the total
# number of scheduler steps is batches-per-epoch times the number of epochs.
# NOTE(review): num_warmup_steps=5 is an unexplained constant — confirm it is
# intentional given the total step count.
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=5,
  num_training_steps=len(train_dataloader) * num_epochs
)

## Run the Model!!

In [None]:
def main():
  '''Fine-tune the model, evaluate it once more and print the final scores.

  Works on the notebook-level objects built in the cells above (model,
  optimizer, num_epochs, both dataloaders, lr_scheduler and device) and
  prints precision, recall and F1 from the final evaluation pass.
  '''
  # The losses returned by training are not needed for the final report
  _train_loss, _val_loss = train_loop(model, optimizer, num_epochs, train_dataloader, validation_dataloader, lr_scheduler, device)
  _eval_loss, metrics = eval_loop(model, validation_dataloader, device)

  report = (
    ("PRECISION: ", 'precision'),
    ("RECALL: ", 'recall'),
    ("F1-SCORE: ", 'f1'),
  )
  for label, key in report:
    print(label, metrics[key])

if __name__ == "__main__":
  main()

Epoch 1 training:


  0%|          | 0/436 [00:00<?, ?it/s]

Running validation:


  0%|          | 0/28 [00:00<?, ?it/s]

{'precision': tensor(0.7305, device='cuda:0'), 'recall': 0.7331207697013703, 'f1': tensor(0.6865, device='cuda:0')}
Epoch 1 validation: {'precision': tensor(0.7305, device='cuda:0'), 'recall': 0.7331207697013703, 'f1': tensor(0.6865, device='cuda:0')}
Epoch 1 losses; Train: 0.3862578272819519, Validation: 1.2947524785995483
Epoch 2 training:


  0%|          | 0/436 [00:00<?, ?it/s]

Running validation:


  0%|          | 0/28 [00:00<?, ?it/s]

{'precision': tensor(0.7271, device='cuda:0'), 'recall': 0.7399323836866709, 'f1': tensor(0.6854, device='cuda:0')}
Epoch 2 validation: {'precision': tensor(0.7271, device='cuda:0'), 'recall': 0.7399323836866709, 'f1': tensor(0.6854, device='cuda:0')}
Epoch 2 losses; Train: 0.3054884076118469, Validation: 1.5163551568984985
Epoch 3 training:


  0%|          | 0/436 [00:00<?, ?it/s]

Running validation:


  0%|          | 0/28 [00:00<?, ?it/s]

{'precision': tensor(0.7253, device='cuda:0'), 'recall': 0.7508683574345051, 'f1': tensor(0.6877, device='cuda:0')}
Epoch 3 validation: {'precision': tensor(0.7253, device='cuda:0'), 'recall': 0.7508683574345051, 'f1': tensor(0.6877, device='cuda:0')}
Epoch 3 losses; Train: 0.7705205678939819, Validation: 1.6806044578552246
Epoch 4 training:


  0%|          | 0/436 [00:00<?, ?it/s]

Running validation:


  0%|          | 0/28 [00:00<?, ?it/s]

{'precision': tensor(0.7175, device='cuda:0'), 'recall': 0.7323248465698347, 'f1': tensor(0.6763, device='cuda:0')}
Epoch 4 validation: {'precision': tensor(0.7175, device='cuda:0'), 'recall': 0.7323248465698347, 'f1': tensor(0.6763, device='cuda:0')}
Epoch 4 losses; Train: 0.2446780502796173, Validation: 1.6581765413284302
Epoch 5 training:


  0%|          | 0/436 [00:00<?, ?it/s]

Running validation:


  0%|          | 0/28 [00:00<?, ?it/s]

{'precision': tensor(0.7204, device='cuda:0'), 'recall': 0.7369075148395703, 'f1': tensor(0.6788, device='cuda:0')}
Epoch 5 validation: {'precision': tensor(0.7204, device='cuda:0'), 'recall': 0.7369075148395703, 'f1': tensor(0.6788, device='cuda:0')}
Epoch 5 losses; Train: 0.38927242159843445, Validation: 1.6376657485961914


  0%|          | 0/28 [00:00<?, ?it/s]

{'precision': tensor(0.7204, device='cuda:0'), 'recall': 0.7369075148395703, 'f1': tensor(0.6788, device='cuda:0')}
PRECISION:  tensor(0.7204, device='cuda:0')
RECALL:  0.7369075148395703
F1-SCORE:  tensor(0.6788, device='cuda:0')
