In [1]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 5.2 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 83.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 78.1 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 53.7 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 90.3 MB/s 
Installing collected p

# Preprocessing

In [2]:
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
from datasets import load_dataset

import torch
import numpy as np
import random

# we set up some seeds so that we can reproduce results
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

'\nSome options for BERT model that can be run in colab:\n\n"distilbert-base-uncased",\n"distilbert-base-uncased-distilled-squad",\n"distilbert-base-cased",\n"distilbert-base-cased-distilled-squad",\n\n'

In [4]:
from transformers import AutoTokenizer

def load_data():
  dataset = load_dataset('cjlovering/natural-questions-short')

  dataset = dataset.remove_columns(["name", "id", "has_correct_context"])

  return dataset['train'], dataset['validation']

def load_model():
  model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
  tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

  return model, tokenizer

In [5]:
def train_eval(model, validation_dataloader, device):
  model.eval()

  progress_bar = tqdm(range(len(validation_dataloader)))
  valid_loss_batch = []
  with torch.no_grad():
    for batch in validation_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)

      predictions = model(
          input_ids=input_ids, 
          attention_mask=attention_mask, 
          start_positions=start_positions, 
          end_positions=end_positions
      )

      validation_loss = predictions.loss

      valid_loss_batch.append(validation_loss)
      progress_bar.update(1)

  return torch.mean(torch.tensor([valid_loss_batch]))

In [6]:
from datasets import load_metric

def train_loop(model, optimizer, num_epochs, train_dataloader, validation_dataloader, lr_scheduler, device):
  """ Train a PyTorch Module
  
  :param torch.nn.Module model: the model to be trained
  :param torch.optim.Optimizer optimizer: the training optimizer
  :param int num_epochs: number of epochs to train for
  :param torch.utils.data.DataLoader train_dataloader: DataLoader containing training examples
  :param torch.utils.data.DataLoader validation_dataloader: DataLoader containing validation examples
  :param _ lr_scheduler: learning rate scheduler
  :param torch.device device: the device that we'll be training on
  
  :return training_losses, validation_losses
  """

  training_loss_epoch = []
  validation_loss_epoch = []

  for epoch in range(num_epochs):
    # put the model in training mode
    model.train()

    print(f"Epoch {epoch + 1} training:")
    train_loss_batch = []
    progress_bar = tqdm(range(len(train_dataloader)))

    for batch in train_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)

      optimizer.zero_grad()

      predictions = model(
          input_ids=input_ids, 
          attention_mask=attention_mask, 
          start_positions=start_positions, 
          end_positions=end_positions
      )

      training_loss = predictions.loss
      training_loss.backward()

      optimizer.step()
      lr_scheduler.step()
      
      train_loss_batch.append(training_loss)
      progress_bar.update(1)

    train_loss_for_epoch = torch.mean(torch.tensor([train_loss_batch]))
    training_loss_epoch.append(train_loss_for_epoch)
    # print the epoch's average metrics
    print(f"Epoch {epoch+1} average training loss:{train_loss_for_epoch}")

    print("Running validation:")

    # evaluate model on validation dataset
    valid_loss_for_epoch = train_eval(model, validation_dataloader, device)
    validation_loss_epoch.append(valid_loss_for_epoch)
    print(f"Epoch {epoch+1} average evaluation loss: {valid_loss_for_epoch}")

  return training_loss_epoch, validation_loss_epoch

In [7]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import DefaultDataCollator

def tokenize(dataset, tokenizer):
  questions = [ques.strip() for ques in dataset["question"]]

  inputs = tokenizer(
      questions,
      dataset["context"],
      max_length=512,
      truncation="only_second",
      return_offsets_mapping=True,
      padding="max_length",
      return_tensors='pt'
  )

  offset_mapping = inputs.pop("offset_mapping")
  answers = dataset["answers"]
  start_positions = []
  end_positions = []

  for i, offset in enumerate(offset_mapping):
    answer = answers[i]
    start_char = answer["span_start"]
    end_char = answer["span_end"]
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
      idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
      idx += 1
    context_end = idx - 1

    # This is not required but per paper if the answer is not fully inside the context, label it (0, 0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
    # Otherwise it's the start and end token positions
      idx = context_start
      while idx <= context_end and offset[idx][0] <= start_char:
          idx += 1
      start_positions.append(idx - 1)

      idx = context_end
      while idx >= context_start and offset[idx][1] >= end_char:
          idx -= 1
      end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  return inputs

def preprocess(dataset):
  results = []
  for data in dataset:
    map = {}
    questions = data['questions']
    answers = data['answers']
    contexts = data['contexts']

    for question in questions:
      for answer in answers:
        map['question'] = question['input_text']
        map['context'] = contexts
        map['answers'] = {"span_start": int(answer['span_start']), "span_end": int(answer['span_end'])}

        results.append(map)
  return results

def preprocess_and_tokenize(dataset, batch_size, tokenizer):
  preprocessed_data = preprocess(dataset) # expand data to match {questions, context, span_start, spand_end} format per instance
  data = Dataset.from_list(preprocessed_data) # convert to Dataset
  tokenized_data = data.map(lambda x: tokenize(x, tokenizer), batched=True, remove_columns=data.column_names) # Tokenize each batched data
  return DataLoader(tokenized_data, batch_size=batch_size, collate_fn=DefaultDataCollator(), shuffle=True) # use collate fn to create batches of examples

In [8]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# batch_size = 16

# model, tokenizer = load_model()
# train, validation = load_data()

In [9]:
# train_data_loader = preprocess_and_tokenize(train, batch_size, tokenizer)

In [10]:
# validation_data_loader = preprocess_and_tokenize(validation, batch_size, tokenizer)

In [11]:
# torch.cuda.empty_cache() 

In [12]:
# from transformers import get_scheduler
# num_epochs = 5

# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# lr_scheduler = get_scheduler(
#     "linear", 
#     optimizer=optimizer, 
#     num_warmup_steps=0,
#     num_training_steps=len(train_data_loader) * num_epochs
#   )
# model.to(device)
# train_losses, val_losses = train_loop(model, 
#                                       optimizer, 
#                                       num_epochs, 
#                                       train_data_loader, 
#                                       validation_data_loader, 
#                                       lr_scheduler,
#                                       device)

In [13]:
def compute_metrics(input_ids, start_positions, end_positions, pred_start, pred_end):
  precisions = []
  recalls = []
  f1s = []
  for idx, input in enumerate(input_ids):
    labels = input[start_positions[idx] : end_positions[idx] + 1]
    predicted = input[pred_start[idx].argmax() : pred_end[idx].argmax() + 1]

    matched_tokens = np.intersect1d(labels.cpu(),predicted.cpu())

    precision, recall, F1 = 0, 0, 0

    if len(predicted) > 0:
      precision = len(matched_tokens) / len(predicted)
    
    if len(labels) > 0:
      recall = len(matched_tokens) / len(labels)

    if precision > 0 and recall > 0:
      F1 = 2/(1/precision+1/recall)
      
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(F1)

  return np.mean(precisions), np.mean(recalls), np.mean(f1s)

In [14]:
# trained model
def eval_loop(model, validation_dataloader, device):
  model.eval()

  progress_bar = tqdm(range(len(validation_dataloader)))
  f1_batches, precision_batches, recall_batches = [],[],[]
  with torch.no_grad():
    for i, batch in enumerate(validation_dataloader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)

      predictions = model(
          input_ids=input_ids, 
          attention_mask=attention_mask, 
          start_positions=start_positions, 
          end_positions=end_positions
      )

      pred_start = predictions.start_logits
      pred_end = predictions.end_logits

      precision, recall, f1 = compute_metrics(input_ids, start_positions, end_positions, pred_start, pred_end)
      f1_batches.append(f1)
      precision_batches.append(precision)
      recall_batches.append(recall)

      progress_bar.update(1)

  return np.mean(precision_batches), np.mean(recall_batches), np.mean(f1_batches)

In [15]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [16]:
path = f'/content/gdrive/My Drive/Colab Notebooks/finals.pth'
# print(f'Saving model...')
# torch.save(model.state_dict(), path)
# print(f'saved successfully.')

In [17]:
# model.load_state_dict(torch.load(path))

In [18]:
from transformers import get_scheduler

def main():
  '''Here's the basic structure of the main block -- feel free to add or
  remove parameters/helper functions as you see fit, but all steps here are 
  needed and we expect to see precision, recall, and f1 scores printed out'''
  device = "cuda" if torch.cuda.is_available() else "cpu"
  batch_size = 16

  model, tokenizer = load_model()
  train, validation = load_data()

  train_data_loader = preprocess_and_tokenize(train, batch_size, tokenizer)
  validation_data_loader = preprocess_and_tokenize(validation, batch_size, tokenizer)

  num_epochs = 5

  model.to(device)
  optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
  lr_scheduler = get_scheduler(
      "linear", 
      optimizer=optimizer, 
      num_warmup_steps=0,
      num_training_steps=len(train_data_loader) * num_epochs
    )
  train_losses, val_losses = train_loop(model, 
                                        optimizer, 
                                        num_epochs, 
                                        train_data_loader, 
                                        validation_data_loader, 
                                        lr_scheduler,
                                        device)

  print(train_losses[-1], val_losses[-1])
  precision, recall, f1_score  = eval_loop(model, validation_data_loader, device)
  
  print("PRECISION: ", precision)
  print("RECALL: ", recall)
  print("F1-SCORE: ", f1_score)

if __name__ == "__main__":
  main()

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



Downloading and preparing dataset json/cjlovering--natural-questions-short to /root/.cache/huggingface/datasets/cjlovering___json/cjlovering--natural-questions-short-63df990b626b5a72/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/889k [00:00<?, ?B/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/cjlovering___json/cjlovering--natural-questions-short-63df990b626b5a72/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Epoch 1 training:


  0%|          | 0/871 [00:00<?, ?it/s]

Epoch 1 average training loss:2.480591058731079
Running validation:


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch 1 average evaluation loss: 1.8987867832183838
Epoch 2 training:


  0%|          | 0/871 [00:00<?, ?it/s]

Epoch 2 average training loss:1.4881919622421265
Running validation:


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch 2 average evaluation loss: 1.760800838470459
Epoch 3 training:


  0%|          | 0/871 [00:00<?, ?it/s]

Epoch 3 average training loss:0.7993131875991821
Running validation:


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch 3 average evaluation loss: 2.089176893234253
Epoch 4 training:


  0%|          | 0/871 [00:00<?, ?it/s]

Epoch 4 average training loss:0.38306111097335815
Running validation:


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch 4 average evaluation loss: 2.539820432662964
Epoch 5 training:


  0%|          | 0/871 [00:00<?, ?it/s]

Epoch 5 average training loss:0.17185398936271667
Running validation:


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch 5 average evaluation loss: 2.9517858028411865
tensor(0.1719) tensor(2.9518)


  0%|          | 0/55 [00:00<?, ?it/s]

PRECISION:  0.6012549668558393
RECALL:  0.6546201800414038
F1-SCORE:  0.5717480545159076
