In [1]:
import torch
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [2]:
!pip install transformers



In [3]:
!pip install wget



In [4]:
import wget
import os

print('Downloading dataset...')

local_dir = './squad_dataset/'

# The filenames and URLs for the dataset files.
files = [('train-v1.1.json', 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'), 
         ('dev-v1.1.json', 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'),
         ('evaluate-v1.1.py', 'https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py')]

# Create the local folder for the files.
if not os.path.exists(local_dir):
    os.mkdir(local_dir)

# Download each of the files.
for (filename, url) in files:

    # Construct the local file path.
    file_path = local_dir + filename

    # Download the file (if we haven't already)
    if not os.path.exists(file_path):
        print('  ' + file_path)
        wget.download(url, local_dir + filename)

print('Done.')

Downloading dataset...
Done.


In [5]:
data_dir = './squad_dataset/'

# Check out the sizes on the saved files.
files = list(os.listdir(data_dir))

print(data_dir)

# For each file in the directory...
for f in files:
    # Get the file size, in MB
    f_size = float(os.stat(data_dir + '/' + f).st_size) / 2**20
    
    # Print the filename and its size.
    print("     {:25s}    {:>6.2f} MB".format(f, f_size))

./squad_dataset/
     dev-v1.1.json                  4.63 MB
     evaluate-v1.1.py               0.00 MB
     train-v1.1.json               28.89 MB


In [6]:
import json

# Open the training dataset file.
with open(os.path.join('./squad_dataset/train-v1.1.json'), "r", encoding="utf-8") as reader:
    input_data = json.load(reader)["data"]

print('Unpacking SQuAD Examples...')

#print('Articles:')

# We'll unpack all of the 
examples = []

# For each Wikipedia article in the dataset...
for entry in input_data:

    # The Wikipedia Article title.
    title = entry["title"]
    #print('  ', title)

    # For each paragraph in the article...
    for paragraph in entry["paragraphs"]:
        
        # The paragraph, where the answer is found, is referred to as the
        # "context".
        context_text = paragraph["context"]
        
        # There can be multiple questions per paragraph.
        for qa in paragraph["qas"]:
            
            # Define a dictionary to store the properties.
            ex = {}

            # The unique ID of this question.
            ex['qas_id'] = qa["id"]

            # The question.
            ex['question_text'] = qa["question"]

            # In the training data, there is only one answer per question.
            answer = qa["answers"][0]

            # The answer string.
            ex['answer_text'] = answer["text"]

            # The character index of the answer in the context.
            ex['start_position_character'] = answer["answer_start"]                

            # Store the title and paragraph text.
            ex['title'] = title
            ex['context_text'] = context_text

            examples.append(ex)

print('DONE!')
print('There are {:,} training examples.'.format(len(examples)))

Unpacking SQuAD Examples...
DONE!
There are 87,599 training examples.


In [7]:
import textwrap

# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=80) 

# Select an example to check out.
ex = examples[1200]

print('Title:', ex['title'])
print('ID:', ex['qas_id'])

print('\n======== Question =========')
print(ex['question_text'])

print('\n======== Context =========')
print(wrapper.fill(ex['context_text']))

print('\n======== Answer =========')
print(ex['answer_text'])

Title: Genocide
ID: 5733963c4776f41900660df8

What form of destruction was considered too limited by a smaller group of experts?

In 2007 the European Court of Human Rights (ECHR), noted in its judgement on
Jorgic v. Germany case that in 1992 the majority of legal scholars took the
narrow view that "intent to destroy" in the CPPCG meant the intended physical-
biological destruction of the protected group and that this was still the
majority opinion. But the ECHR also noted that a minority took a broader view
and did not consider biological-physical destruction was necessary as the intent
to destroy a national, racial, religious or ethnic group was enough to qualify
as genocide.

biological-physical


In [8]:
import time
import datetime
import os
import pandas as pd
import csv

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


def good_update_interval(total_iters, num_desired_updates):
    '''
    This function will try to pick an intelligent progress update interval 
    based on the magnitude of the total iterations.

    Parameters:
      `total_iters` - The number of iterations in the for-loop.
      `num_desired_updates` - How many times we want to see an update over the 
                              course of the for-loop.
    '''
    # Divide the total iterations by the desired number of updates. Most likely
    # this will be some ugly number.
    exact_interval = total_iters / num_desired_updates

    # The `round` function has the ability to round down a number to, e.g., the
    # nearest thousandth: round(exact_interval, -3)
    #
    # To determine the magnitude to round to, find the magnitude of the total,
    # and then go one magnitude below that.

    # Get the order of magnitude of the total.
    order_of_mag = len(str(total_iters)) - 1

    # Our update interval should be rounded to an order of magnitude smaller. 
    round_mag = order_of_mag - 1

    # Round down and cast to an int.
    update_interval = int(round(exact_interval, -round_mag))

    # Don't allow the interval to be zero!
    if update_interval == 0:
        update_interval = 1

    return update_interval


def check_gpu_mem():
    '''
    Uses Nvidia's SMI tool to check the current GPU memory usage.
    Reported values are in "MiB". 1 MiB = 2^20 bytes = 1,048,576 bytes.
    '''
    
    # Run the command line tool and get the results.
    buf = os.popen('nvidia-smi --query-gpu=memory.total,memory.used --format=csv')

    # Use csv module to read and parse the result.
    reader = csv.reader(buf, delimiter=',')

    # Use a pandas table just for nice formatting.
    df = pd.DataFrame(reader)

    # Use the first row as the column headers.
    new_header = df.iloc[0] #grab the first row for the header
    df = df[1:] #take the data less the header row
    df.columns = new_header #set the header row as the df header

    # Display the formatted table.
    #display(df)

    return df


In [10]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering

# Set this flag to `True` to load a version of BERT-large which has already been
# fine-tuned on SQuAD.
pre_tuned = True

# If using the pre-fine-tuned BERT-large model...
if pre_tuned:

    # Load the tokenizer.
    tokenizer = RobertaTokenizer.from_pretrained(
        'csarron/roberta-base-squad-v1',
        do_lower_case=True
    )

    # Create the model and initialize the weights.
    model = RobertaForQuestionAnswering.from_pretrained(
        'csarron/roberta-base-squad-v1', 
    )

    # Tell pytorch to run this model on the GPU.
    desc = model.cuda()


In [11]:
import json

# Open the training dataset file.
with open(os.path.join('./squad_dataset/dev-v1.1.json'), "r", encoding="utf-8") as reader:
    input_data = json.load(reader)["data"]


print_count = 0

print('Unpacking SQuAD Examples...')

print('Articles:')

# We'll unpack all of the 
examples = []

# For each Wikipedia article in the dataset...
for entry in input_data:

    # The Wikipedia Article title.
    title = entry["title"]
    print('  ', title)

    # The article contains multiple paragraphs...
    for paragraph in entry["paragraphs"]:
        
        # The paragraph, where the answer is found, is referred to as the
        # "context".
        context_text = paragraph["context"]
        
        # There can be multiple questions per paragraph.
        for qa in paragraph["qas"]:
            
            # Define a dictionary to store the properties.
            ex = {}

            # The unique ID of this question.
            ex['qas_id'] = qa["id"]

            # The question.
            ex['question_text'] = qa["question"]

            # In the test data, there are three answers per question, so we'll
            # store all three. 
            # Each answer has two fields: `answer_start` and `text`.
            ex['answers'] = qa["answers"]

            # Store the title and paragraph text.
            ex['title'] = title
            ex['context_text'] = context_text

            examples.append(ex)

print('DONE!')
print('There are {:,} test examples.'.format(len(examples)))

Unpacking SQuAD Examples...
Articles:
   Super_Bowl_50
   Warsaw
   Normans
   Nikola_Tesla
   Computational_complexity_theory
   Teacher
   Martin_Luther
   Southern_California
   Sky_(United_Kingdom)
   Victoria_(Australia)
   Huguenot
   Steam_engine
   Oxygen
   1973_oil_crisis
   Apollo_program
   European_Union_law
   Amazon_rainforest
   Ctenophora
   Fresno,_California
   Packet_switching
   Black_Death
   Geology
   Newcastle_upon_Tyne
   Victoria_and_Albert_Museum
   American_Broadcasting_Company
   Genghis_Khan
   Pharmacy
   Immune_system
   Civil_disobedience
   Construction
   Private_school
   Harvard_University
   Jacksonville,_Florida
   Economic_inequality
   Doctor_Who
   University_of_Chicago
   Yuan_dynasty
   Kenya
   Intergovernmental_Panel_on_Climate_Change
   Chloroplast
   Prime_number
   Rhine
   Scottish_Parliament
   Islamism
   Imperialism
   United_Methodist_Church
   French_and_Indian_War
   Force
DONE!
There are 10,570 test examples.


In [12]:
max_len = 384

In [21]:
import time
import torch
import numpy as np
import logging

# By default, the tokenizer will spit out a warning whenever we tokenize a 
# sample which ends up being more than 512 tokens. We don't care about that for
# now, though, and this cell will produce a lot of those warnings! So we'll 
# adjust the logging settings to suppress those warnings and keep the output
# cell cleaner.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# Track the time. Tokenizing all training examples takes around 3 minutes.
t0 = time.time()

# Lists to store locations
start_positions = []
end_positions = []

# We'll count up the number of answers which are truncated, as well as the
# number of test samples for which all three answers were truncated (it's 
# impossible for us to answer these).
num_clipped_answers = 0
num_impossible = 0

# Pick an interval on which to print progress updates.
update_interval = good_update_interval(
            total_iters = len(examples), 
            num_desired_updates = 15
        )

print('Processing {:,} examples...'.format(len(examples)))

# For each of the training examples...
for (ex_num, ex) in enumerate(examples):

    # =====================
    #   Progress Update
    # =====================

    # Progress update every, e.g., 10k samples.
    if (ex_num % update_interval) == 0 and not (ex_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))

    # To store the start and end indeces of the three possible answers.
    start_options = []
    end_options = []

    # Flag to indicate whether we've saved the encoded form of the input yet.
    # We'll tokenize the input three times, but only need to store it once!
    encoded_stored = False

    # For each of the three possible answers...
    for answer in ex['answers']:

        # =============================
        #     Add Sentinel String
        # =============================
        # To help us determine which of the BERT tokens correspond to the answer,
        # we'll replace the answer with, e.g., "[MASK] [MASK] [MASK]" (based on 
        # the number of tokens in the answer).

        # Tokenize the answer--it may be broken into multiple words and/or subwords.
        answer_tokens = tokenizer.tokenize(answer['text'])

        # Create our sentinel string, e.g., "[MASK] [MASK] [MASK]"
        sentinel_str = ' '.join(['<mask>']*len(answer_tokens))

        # Within the "context" string, replace the answer with our sentinel.
        # Python doesn't appear to have a built-in function for replacing a 
        # substring *starting at a specific index*, so we'll implement it in a 
        # more manual way.

        # Locate the exact start and end of the answer text within the "context"
        # string. The dataset gives us this information because the answer text
        # may occur more than once in the context!
        start_char_i = answer['answer_start']
        end_char_i = start_char_i + len(answer['text'])

        # To make the replacement, we use slicing and string concatenation.
        context_w_sentinel = ex['context_text'][:start_char_i] + \
                            sentinel_str + \
                            ex['context_text'][end_char_i:]

        # =============================
        #      Tokenize & Encode
        # =============================
        # Combine the question and the context strings and encode them.
        input_ids = tokenizer.encode(
            ex['question_text'], 
            context_w_sentinel,
            add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
            #max_length = max_len,       # Pad & truncate all sentences.
            pad_to_max_length = False,
            truncation = False,
        )

        # =============================
        #     Locate Answer Tokens
        # =============================
        # Locate all of the instances of the '[MASK]' token. 
        
        # Find all indeces of the [MASK] token.
        mask_token_indeces = np.where(np.array(input_ids) == tokenizer.mask_token_id)[0]

        # Note: You can use the alternate code below if the input_ids are in a 
        #       PyTorch tensor
        # First, compare all of the tokens to the mask token. 
        #is_mask_token = (input_ids[0] == tokenizer.mask_token_id)
        # Then get the indeces of the '1's using the `nonzero` function.
        #mask_token_indeces = is_mask_token.nonzero(as_tuple=False)[:, 0]

        # As a sanity check, make sure the number of MASK tokens we found is the
        # same as the number of answer tokens.
        assert(len(mask_token_indeces) == len(answer_tokens))           

        # `mask_token_indeces` is the range of indeces (e.g., [68, 69, 70, 71]), 
        # but we really just want the start and end indeces (e.g., 68 and 71).
        start_index = mask_token_indeces[0]
        end_index = mask_token_indeces[-1]

        # Store these indeces in our lists.
        start_options.append(start_index)
        end_options.append(end_index)
    
    # Store the start and end indeces of the three possible correct answers.
    start_positions.append(start_options)
    end_positions.append(end_options)
    
    # Continue looping through all of the test samples.

# =========================
#        Wrap-Up
# =========================

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Processing 10,570 examples...
  Example   1,000  of   10,570.    Elapsed: 0:00:02. Remaining: 0:00:17
  Example   2,000  of   10,570.    Elapsed: 0:00:03. Remaining: 0:00:14
  Example   3,000  of   10,570.    Elapsed: 0:00:05. Remaining: 0:00:13
  Example   4,000  of   10,570.    Elapsed: 0:00:07. Remaining: 0:00:12
  Example   5,000  of   10,570.    Elapsed: 0:00:10. Remaining: 0:00:11
  Example   6,000  of   10,570.    Elapsed: 0:00:12. Remaining: 0:00:09
  Example   7,000  of   10,570.    Elapsed: 0:00:14. Remaining: 0:00:07
  Example   8,000  of   10,570.    Elapsed: 0:00:16. Remaining: 0:00:05
  Example   9,000  of   10,570.    Elapsed: 0:00:18. Remaining: 0:00:03
  Example  10,000  of   10,570.    Elapsed: 0:00:20. Remaining: 0:00:01
DONE.  Tokenization took 0:00:21


In [22]:
import time
import torch

# Track the time. Tokenizing all training examples takes around 3 minutes.
t0 = time.time()

# Lists to store the encoded samples.
all_input_ids = []
attention_masks = []
segment_ids = [] 

# Pick an interval on which to print progress updates.
update_interval = good_update_interval(
            total_iters = len(examples), 
            num_desired_updates = 15
        )

print('Tokenizing {:,} examples...'.format(len(examples)))

# For each of the training examples...
for (ex_num, ex) in enumerate(examples):

    # =====================
    #   Progress Update
    # =====================

    # Progress update every, e.g., 10k samples.
    if (ex_num % update_interval) == 0 and not (ex_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))

    # =============================
    #      Tokenize & Encode
    # =============================
    # Combine the question and the context strings, and tokenize them all 
    # together.
    # `encode_plus` will:    
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Place an `[SEP]` token between the question and reference text, and 
    #       and at the end of the reference text.
    #   (4) Map tokens to their IDs ("encode" the text)
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    #   (7) Create the list of segment IDs, indicating which tokens belong
    #       to the question vs. the context.
    #   (8) Casts everything as PyTorch tensors.

    encoded_dict = tokenizer.encode_plus(
        ex['question_text'], 
        ex['context_text'],
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        max_length = max_len,       # Pad & truncate all sentences.
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True, # Construct attention masks.
        return_tensors = 'pt',        # Return pytorch tensors.
    )

    # Retrieve the encoded sequence.
    input_ids = encoded_dict['input_ids']

    # =============================
    #     Store Encoded Sample
    # =============================

    # Add the encoded sentence to the list.    
    all_input_ids.append(input_ids)

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])    

    # ^^^ Continue looping through all of the test samples. ^^^

# =========================
#        Wrap-Up
# =========================

# Convert the lists of tensors into 2D tensors.
all_input_ids = torch.cat(all_input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# We don't need the indeces to be tensors, since we're not doing training here.
# Convert the "labels" (the start and end indeces) into tensors.
#start_positions = torch.tensor(start_positions)
#end_positions = torch.tensor(end_positions)

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Tokenizing 10,570 examples...




  Example   1,000  of   10,570.    Elapsed: 0:00:01. Remaining: 0:00:05
  Example   2,000  of   10,570.    Elapsed: 0:00:01. Remaining: 0:00:04
  Example   3,000  of   10,570.    Elapsed: 0:00:02. Remaining: 0:00:04
  Example   4,000  of   10,570.    Elapsed: 0:00:02. Remaining: 0:00:03
  Example   5,000  of   10,570.    Elapsed: 0:00:03. Remaining: 0:00:03
  Example   6,000  of   10,570.    Elapsed: 0:00:03. Remaining: 0:00:03
  Example   7,000  of   10,570.    Elapsed: 0:00:04. Remaining: 0:00:02
  Example   8,000  of   10,570.    Elapsed: 0:00:05. Remaining: 0:00:01
  Example   9,000  of   10,570.    Elapsed: 0:00:05. Remaining: 0:00:01
  Example  10,000  of   10,570.    Elapsed: 0:00:06. Remaining: 0:00:00
DONE.  Tokenization took 0:00:06


In [23]:
import time
import numpy as np

t0 = time.time()

# Tracking variables 
pred_start = []
pred_end = []

# Get the total number of test samples (not answers).
num_test_samples = all_input_ids.shape[0]

# We'll batch the samples to speed up processing. 
batch_size = 16

num_batches = int(np.ceil(num_test_samples / batch_size))

print('Training on {:,} test batches...'.format(num_batches))

batch_num = 0

# Train
for start_i in range(0, num_test_samples, batch_size):
    # Report progress.
    if ((batch_num % 50) == 0) and not (batch_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        batches_per_sec = (time.time() - t0) / batch_num
        remaining_sec = batches_per_sec * (num_batches - batch_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(batch_num, num_batches, elapsed, remaining))

    # Calculate the ending index for this batch.
    # `end_i` is equal to the index of the last sample in the batch, +1.
    end_i = min(start_i + batch_size, num_test_samples)

    # Select our batch inputs (`b` stands for batch here).
    b_input_ids = all_input_ids[start_i:end_i, :]
    b_attn_masks = attention_masks[start_i:end_i, :]

    # Copy these to the GPU.
    b_input_ids = b_input_ids.to(device)
    b_attn_masks = b_attn_masks.to(device)
    
    # Telling the model not to compute or store the compute graph, saving memory 
    # and speeding up prediction
    
    with torch.no_grad():
        
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, attention_mask=b_attn_masks)

    # Move logits and labels to CPU
    start_logits = outputs.start_logits.detach().cpu().numpy()
    end_logits = outputs.end_logits.detach().cpu().numpy()
    
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = np.argmax(start_logits, axis=1)
    answer_end = np.argmax(end_logits, axis=1)

    # Store predictions and true labels
    pred_start.append(answer_start)
    pred_end.append(answer_end)

    batch_num += 1

    # ^^^ Continue looping through the batches. ^^^


Training on 661 test batches...
  Batch      50  of      661.    Elapsed: 0:00:07. Remaining: 0:01:20
  Batch     100  of      661.    Elapsed: 0:00:13. Remaining: 0:01:12
  Batch     150  of      661.    Elapsed: 0:00:19. Remaining: 0:01:05
  Batch     200  of      661.    Elapsed: 0:00:25. Remaining: 0:00:58
  Batch     250  of      661.    Elapsed: 0:00:32. Remaining: 0:00:52
  Batch     300  of      661.    Elapsed: 0:00:38. Remaining: 0:00:46
  Batch     350  of      661.    Elapsed: 0:00:44. Remaining: 0:00:39
  Batch     400  of      661.    Elapsed: 0:00:51. Remaining: 0:00:33
  Batch     450  of      661.    Elapsed: 0:00:57. Remaining: 0:00:27
  Batch     500  of      661.    Elapsed: 0:01:03. Remaining: 0:00:20
  Batch     550  of      661.    Elapsed: 0:01:10. Remaining: 0:00:14
  Batch     600  of      661.    Elapsed: 0:01:16. Remaining: 0:00:08
  Batch     650  of      661.    Elapsed: 0:01:22. Remaining: 0:00:01


In [24]:
import time
import numpy as np

# Prediction on test set

# Put model in evaluation mode
model.eval()

t0 = time.time()

# Tracking variables 
pred_start = []
pred_end = []

# Get the total number of test samples (not answers).
num_test_samples = all_input_ids.shape[0]

# We'll batch the samples to speed up processing. 
batch_size = 16

num_batches = int(np.ceil(num_test_samples / batch_size))

print('Evaluating on {:,} test batches...'.format(num_batches))

batch_num = 0

# Predict 
for start_i in range(0, num_test_samples, batch_size):
    
    # Report progress.
    if ((batch_num % 50) == 0) and not (batch_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        batches_per_sec = (time.time() - t0) / batch_num
        remaining_sec = batches_per_sec * (num_batches - batch_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(batch_num, num_batches, elapsed, remaining))

    # Calculate the ending index for this batch.
    # `end_i` is equal to the index of the last sample in the batch, +1.
    end_i = min(start_i + batch_size, num_test_samples)

    # Select our batch inputs (`b` stands for batch here).
    b_input_ids = all_input_ids[start_i:end_i, :]
    b_attn_masks = attention_masks[start_i:end_i, :]

    # Copy these to the GPU.
    b_input_ids = b_input_ids.to(device)
    b_attn_masks = b_attn_masks.to(device)
    
    # Telling the model not to compute or store the compute graph, saving memory 
    # and speeding up prediction
    with torch.no_grad():
        
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, attention_mask=b_attn_masks)
                        

    # Move logits and labels to CPU
    start_logits = outputs.start_logits.detach().cpu().numpy()
    end_logits = outputs.end_logits.detach().cpu().numpy()
    
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = np.argmax(start_logits, axis=1)
    answer_end = np.argmax(end_logits, axis=1)

    # Store predictions and true labels
    pred_start.append(answer_start)
    pred_end.append(answer_end)

    batch_num += 1

    # ^^^ Continue looping through the batches. ^^^

# Combine the results across the batches.
pred_start = np.concatenate(pred_start, axis=0)
pred_end = np.concatenate(pred_end, axis=0)

print('    DONE.')

print('\nEvaluation took {:.0f} seconds.'.format(time.time() - t0))

Evaluating on 661 test batches...
  Batch      50  of      661.    Elapsed: 0:00:06. Remaining: 0:01:19
  Batch     100  of      661.    Elapsed: 0:00:13. Remaining: 0:01:12
  Batch     150  of      661.    Elapsed: 0:00:19. Remaining: 0:01:05
  Batch     200  of      661.    Elapsed: 0:00:26. Remaining: 0:00:59
  Batch     250  of      661.    Elapsed: 0:00:32. Remaining: 0:00:52
  Batch     300  of      661.    Elapsed: 0:00:38. Remaining: 0:00:46
  Batch     350  of      661.    Elapsed: 0:00:45. Remaining: 0:00:40
  Batch     400  of      661.    Elapsed: 0:00:51. Remaining: 0:00:33
  Batch     450  of      661.    Elapsed: 0:00:58. Remaining: 0:00:27
  Batch     500  of      661.    Elapsed: 0:01:04. Remaining: 0:00:21
  Batch     550  of      661.    Elapsed: 0:01:10. Remaining: 0:00:14
  Batch     600  of      661.    Elapsed: 0:01:17. Remaining: 0:00:08
  Batch     650  of      661.    Elapsed: 0:01:23. Remaining: 0:00:01
    DONE.

Evaluation took 85 seconds.


In [25]:
total_correct = 0

# For each test sample...
for i in range(0, len(pred_start)):

    match_options = []

    # For each of the three possible answers...
    for j in range (0, len(start_positions[i])):
    
        matches = 0

        # Add a point if the start indeces match.
        if pred_start[i] == start_positions[i][j]:
            matches += 1

        # Add a point if the end indeces match.
        if pred_end[i] == end_positions[i][j]:
            matches += 1

        # Store the total.
        match_options.append(matches)

    # Between the three possible answers, pick the one with the highest "score".
    total_correct += (max(match_options))

    # ^^^ Continue looping through test samples ^^^

total_indeces = len(pred_start) + len(pred_end)

print('Correctly predicted indices: {:,} of {:,} ({:.2%})'.format(
    total_correct,
    total_indeces,
    float(total_correct) / float(total_indeces)
))


Correctly predicted indices: 14,348 of 21,140 (67.87%)


In [26]:
# The final F1 score for each sample.
f1s = []

# For each test sample...
for i in range(0, len(pred_start)):

    # Expand the start and end indeces into sequences of indeces stored as sets.
    # For example, if pred_start = 137 and pred_end = 140, then
    #   pred_span = {137, 138, 139, 140}
    pred_span = set(range(pred_start[i], pred_end[i] + 1))


    f1_options = []

    # For each of the three possible answers...
    for j in range (0, len(start_positions[i])):
    
        # Expand this answer into a range, as above.
        true_span = set(range(start_positions[i][j], end_positions[i][j] + 1))    

        # Use the `intersection` function from Python `set` to get the set of 
        # indeces occurring in both spans. Take the length of this resulting set
        # as the number of overlapping indeces between the two spans.
        num_same = len(pred_span.intersection(true_span))    

        # If there's no overlap, then the F1 score is 0 for this sample.
        if num_same == 0:
            f1_options.append(0)
            continue

        # Precision - How many tokens overlap relative to the total number of tokens
        #             in the predicted span? If the model predicts too large of a 
        #             span, it has bad precision.      
        precision = float(num_same) / float(len(pred_span))
    
        # Recall - How many of the correct tokens made it into the predicted span?
        #          A model could have perfect recall if it just predicted the entire
        #          paragraph as the answer :).    
        recall = float(num_same) / float(len(true_span))

        # F1 - Does the model have both good precision and good recall?
        f1 = (2 * precision * recall) / (precision + recall)

        # Store the score.
        f1_options.append(f1)

        # ^^^ Continue looping through possible answers ^^^

    # Take the highest of the three F1 scores as our score for this sample.
    f1s.append(max(f1_options))

    # ^^^ Continue looping through test samples ^^^


print('Average F1 Score: {:.3f}'.format(np.mean(f1s)))

Average F1 Score: 0.800
