In [85]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn.functional as F
import os
import json
from torch.utils.data import DataLoader
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    BertTokenizer,
    BertForQuestionAnswering
)
from utils import find_subsequences
from question_generator_utils import SyntheticAnswersDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [86]:
def rate_questions(BERT_tokenizer, BERT_model, context, questions):
    """Score a list of questions against one context with a QA model.

    Each question is paired with the same ``context`` (context is passed as
    the first segment, the question as the second) and the whole batch is run
    through ``BERT_model`` once, without gradient tracking.

    Logits at positions where ``token_type_ids`` is non-zero or where
    ``attention_mask`` is zero are replaced by ``-inf`` so they cannot be
    selected and receive no softmax mass.

    Args:
        BERT_tokenizer: callable invoked as
            ``BERT_tokenizer(contexts, questions, padding=True, return_tensors='pt')``;
            its result must contain ``token_type_ids`` and ``attention_mask``
            and is forwarded to the model as keyword arguments.
        BERT_model: callable whose output's first two items are the start and
            end logits.
        context: the context shared by all questions.
        questions: sequence of questions; may be empty.

    Returns:
        A tuple ``(ratings, is_answerable, (start_logits, end_logits))``:

        * ``ratings`` - per question, softmax probability of position 0 as
          start multiplied by softmax probability of position 0 as end.
          NOTE(review): position 0 is presumably the [CLS] / "no answer"
          slot, so a higher rating would mean *less* answerable - confirm.
        * ``is_answerable`` - bool per question; ``False`` only when position
          0 holds the maximum of both the start and the end logits.
        * ``(start_logits, end_logits)`` - the masked logits.

        For an empty ``questions`` the three results are empty tensors and
        neither the tokenizer nor the model is called.
    """
    # Empty batch: there is nothing to tokenize or score, so return empty
    # results instead of failing inside the tokenizer/model.
    if len(questions) == 0:
        return (
            torch.empty(0),
            torch.empty(0, dtype=torch.bool),
            (torch.empty((0, 0)), torch.empty((0, 0))),
        )

    batch_contexts = [context] * len(questions)
    inputs = BERT_tokenizer(batch_contexts, questions, padding=True, return_tensors='pt')
    with torch.no_grad():
        start_logits, end_logits = BERT_model(**inputs)[:2]

    # Positions to exclude: second-segment tokens and padding. Computed once
    # and applied out-of-place so the model's own output tensors are not
    # modified.
    ignored_positions = torch.logical_or(
        inputs['token_type_ids'].bool(),
        ~inputs['attention_mask'].bool(),
    )
    start_logits = start_logits.masked_fill(ignored_positions, -float('inf'))
    end_logits = end_logits.masked_fill(ignored_positions, -float('inf'))

    # A question counts as unanswerable only when position 0 is the best
    # start AND the best end.
    is_answerable = ~ torch.logical_and(
        (start_logits[:, 0].unsqueeze(-1) >= start_logits).all(dim=-1),
        (end_logits[:, 0].unsqueeze(-1) >= end_logits).all(dim=-1)
    )

    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)

    ratings = start_probs[:, 0] * end_probs[:, 0]

    return ratings, is_answerable, (start_logits, end_logits)

In [4]:
# Checkpoint identifiers passed to `from_pretrained` below.
QUESTION_GENERATOR_CHECKPOINT = 'question_generation_model_q-loss'
QA_CHECKPOINT = 'deepset/bert-large-uncased-whole-word-masking-squad2'

# GPT-2 tokenizer/model pair used to generate questions.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(QUESTION_GENERATOR_CHECKPOINT, do_lower_case=True)
gpt2_model = GPT2LMHeadModel.from_pretrained(QUESTION_GENERATOR_CHECKPOINT)
gpt2_model.eval();

# BERT question-answering tokenizer/model pair (later handed to rate_questions).
bert_tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT, do_lower_case=True)
bert_model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)
bert_model.eval();

def prepare_inputs_for_generation(input_ids, past, **kwargs):
    """Assemble the model inputs for one generation step.

    When ``past`` is truthy, only the last position of ``input_ids`` and of
    ``kwargs['token_type_ids']`` is kept (as a trailing dimension of size 1);
    otherwise both are passed through unchanged.

    Args:
        input_ids: token id tensor indexed as ``[:, -1]`` when ``past`` is set.
        past: cached state from previous steps, or a falsy value.
        **kwargs: must contain ``token_type_ids`` and ``use_cache``.

    Returns:
        dict with keys ``input_ids``, ``past``, ``use_cache`` and
        ``token_type_ids``.
    """
    def keep_last_position(tensor):
        # Select the final column and restore a length-1 trailing dimension.
        return tensor[:, -1].unsqueeze(-1)

    if not past:
        segment_ids = kwargs['token_type_ids']
    else:
        input_ids = keep_last_position(input_ids)
        segment_ids = keep_last_position(kwargs['token_type_ids'])

    model_inputs = dict(input_ids=input_ids, past=past)
    model_inputs['use_cache'] = kwargs['use_cache']
    model_inputs['token_type_ids'] = segment_ids
    return model_inputs
# Replace the model's `prepare_inputs_for_generation` attribute with the
# custom function defined in this cell, whose returned inputs include
# `token_type_ids`.
gpt2_model.prepare_inputs_for_generation = prepare_inputs_for_generation

In [5]:
# Build the dataset from 'generated_answers' and keep only an iterator over a
# batch-size-1 DataLoader, so batches can be pulled one at a time with next().
ds = SyntheticAnswersDataset('generated_answers', gpt2_tokenizer)
dl = iter(DataLoader(ds, batch_size=1))

In [36]:
# Pull the next batch from the iterator; every re-run of this cell advances
# the iterator, so `batch` changes each time.
batch = next(dl)

In [37]:
# Sample several continuations for the current batch.
# batch[1] supplies the input ids and batch[2] the matching token type ids.
prompt_ids, prompt_type_ids = batch[1], batch[2]

sampling_options = dict(
    do_sample=True,
    temperature=1.0,
    top_k=30,
    top_p=0.9,
    num_return_sequences=5,
)

output_sequences = gpt2_model.generate(
    input_ids=prompt_ids,
    token_type_ids=prompt_type_ids,
    # max_length is the prompt length plus 64.
    max_length=64 + prompt_ids.shape[-1],
    **sampling_options,
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [38]:
# Number of sequences returned by generate().
len(output_sequences)

5

In [39]:
# Marker tokens delimiting the question inside each generated sequence.
# NOTE(review): the equality tests below assume each marker encodes to a
# single token id - confirm against the tokenizer's vocabulary.
q_start_token = torch.tensor(gpt2_tokenizer.encode('question:'),dtype=torch.long)
q_end_token = torch.tensor(gpt2_tokenizer.encode(':question'), dtype=torch.long)

# Decode questions
questions = []
for output in output_sequences:
    token_ids = output.squeeze()

    # Skip sequences without an opening marker instead of raising IndexError,
    # mirroring how a missing closing marker is handled.
    q_start_indices = (token_ids == q_start_token).nonzero().flatten()
    if not len(q_start_indices):
        continue
    q_start_idx = q_start_indices[0]

    # Only closing markers located after the opening marker can end the
    # question; an earlier one would yield an empty or meaningless slice.
    q_end_indices = (token_ids == q_end_token).nonzero().flatten()
    q_end_indices = q_end_indices[q_end_indices > q_start_idx]
    if not len(q_end_indices):
        continue
    q_end_idx = q_end_indices[0]

    question = gpt2_tokenizer.decode(token_ids[q_start_idx + 1: q_end_idx], clean_up_tokenization_spaces=True)
    questions.append(question.strip())

In [40]:
# Show the decoded questions.
questions

['who replaced giovanni studdani after his injury?',
 'what position did doni hold at miafa?',
 'who did ku hire in the summer of 2012?',
 'how long was captain glover alderwood suspended for?',
 'who was named player of the year in 2012?']

In [87]:
# batch[0][0] is used as the context for all questions.
# NOTE(review): presumably the raw context text of the single example in the
# batch - confirm against SyntheticAnswersDataset.
context = batch[0][0]
# rate_questions returns (ratings, is_answerable, (start_logits, end_logits)).
ratings, is_answerable, logits= rate_questions(bert_tokenizer, bert_model, context, questions)

In [88]:
# Per-question answerability flags returned by rate_questions.
is_answerable

tensor([False, False, False, False, False])

In [84]:
# Masked start logits (first element of the logits tuple from rate_questions).
logits[0]

tensor([[ 5.7868, -3.7926, -6.0274, -7.4328, -8.3713, -5.6004, -2.0553, -7.4835,
         -6.3547, -7.3850, -7.6516, -5.3609, -3.0608, -5.8211, -4.3033, -1.8704,
         -5.4540, -1.6528, -4.8081, -5.6938, -4.6707, -4.4648, -6.1712, -6.9896,
         -5.1679, -5.2939, -5.4064, -2.8466, -6.1642, -6.1090, -5.5700, -6.2520,
         -5.5366, -3.8833, -6.5645, -5.0065, -4.8451, -6.9143, -5.6570, -6.1770,
         -5.3950, -5.9186, -4.3723, -5.7299, -3.5920, -4.8077, -3.3176, -2.9515,
         -5.7393, -5.3693, -3.2375, -7.1916, -5.8638, -4.7915, -7.4790, -6.7465,
         -6.6031, -5.9000, -6.9159, -7.4453, -5.6511, -5.4522, -4.6808, -6.2935,
         -3.9609, -4.2125, -5.7507, -6.7583, -5.6914, -4.2513, -6.1823, -2.2694,
         -6.6377, -6.4860, -6.8543, -7.5613, -6.8649, -6.5271, -7.5793, -7.7510,
         -7.2026, -6.8518, -8.2672, -6.5676, -6.7495, -7.9915, -6.5225, -6.6077,
         -7.6720, -5.8631, -6.3905, -4.1636, -5.5168, -7.2593, -6.3871, -5.2770,
         -6.8056, -7.2830, -

In [32]:
# Read the first file of generated answers into `examples`.
with open('generated_answers/answers_0.json', 'r') as answers_file:
    examples = json.load(answers_file)

In [33]:
# Context text of the first loaded example.
examples[0]['context']

'on 19 july 2011, he signed a new contract with liverpool and joined championship team hull city in a year - long loan move. he made his full debut for hull in a start of the season clash against recently relegated blackpool at the kc stadium. he received a knee injury in a 1 – 0 defeat at burnley on 31 december 2011 and was substituted by adriano basso on the 42nd minute mark, shortly after conceding a goal, scored by martin paterson, as a result of a defensive mix - up with jack hobbs. following the injury, gulacsi returned to liverpool for a scan on his knee. on 11 april 2012, liverpool contacted hull with a view to recalling gulacsi from his loan subject to premier league, football league and fa approval. liverpool were at that time suffering a goalkeeper crisis, with both pepe reina and doni serving suspensions, leaving brad jones as their only remaining senior'

In [15]:
# Generated answers stored with the first loaded example.
examples[0]['generated_answers']

[{'answer': '1888', 'seq_num': 1},
 {'answer': 'yves bouvier', 'seq_num': 2},
 {'answer': 'yves bouvier', 'seq_num': 1},
 {'answer': '5 %', 'seq_num': 1},
 {'answer': '1. 2 million works of art, allegedly including around 1000',
  'seq_num': 1}]

In [17]:
# Encode the first example's context with the GPT-2 tokenizer loaded earlier
# in this notebook (a bare `tokenizer` name is not defined in any visible
# cell and raises NameError on a fresh kernel).
context_encoded = torch.tensor(gpt2_tokenizer.encode(examples[0]['context']), dtype=torch.long)

In [26]:
# Encode the answer text with the GPT-2 tokenizer loaded earlier in this
# notebook (a bare `tokenizer` name is not defined in any visible cell and
# raises NameError on a fresh kernel).
answer_encoded = torch.tensor(gpt2_tokenizer.encode(' 1888,'), dtype=torch.long)

In [27]:
# Token ids of the encoded answer.
answer_encoded

tensor([49584,    11])

In [28]:
# Round-trip check: decode the answer ids back to text with the GPT-2
# tokenizer loaded earlier in this notebook (a bare `tokenizer` name is not
# defined in any visible cell and raises NameError on a fresh kernel).
gpt2_tokenizer.decode(answer_encoded)

' 1888,'

In [20]:
# Token ids of the encoded context.
context_encoded

tensor([ 1169, 15587,   286,   262,  2030, 45813,   460,   307, 23246,   736,
          284, 49584,    11,   475,   355,   340,  9902,   287,  2546,    11,
          340,  8197,   262,   564,   250, 32191, 15421,   286,  1509,   747,
        11754,   564,   251,    11,  1642,   340,   262,  9871,  6143,  6841,
          329,   262,  3230,  9085,    13,  1864,   284,   281,  2708,   287,
          262,   649,   331,   967,   263,    11,  1509,   747,  1242,  1730,
          331,  1158, 35833, 49663, 43185,   262,  2030, 45813,  3721, 10730,
          284,   262,  1242,  1910,  1642,   465,  8440,  1664,  3288,   443,
         2284,  2528,   260,   262,  4094, 18285,   379,   262,  2030, 45813,
          351,  6143,  2272, 26399,   287,  6992,   286,  1160,  7319,   285,
        31185,  1201,  2211,    13,  1509,   747, 19834,   331,  1158, 35833,
        49663,    11, 17494,   262,   366,  2030, 45813,  5822, 33172,   318,
          262,  3741, 15811,   287,   262,  1702, 11656,   290, 

In [25]:
# Search the context token ids for the answer token ids using the helper
# imported from utils.
# NOTE(review): presumably returns the match positions, with an empty list
# meaning no match - confirm against utils.find_subsequences.
find_subsequences(context_encoded, answer_encoded)

[]

In [22]:
# Context text of the first loaded example.
examples[0]['context']

'the origins of the freeport can be traced back to 1888, but as it expanded in size, it adopted the “ opaque traditions of swiss banking ”, making it the preferred storage facility for the international elite. according to an article in the new yorker, swiss art deal yves bouvier pioneered the freeport concept parallel to the art market making his shipping company natural le coultre the biggest tenant at the freeport with storage space rented in excess of 20 thousand m² since 2013. swiss businessman yves bouvier, dubbed the " freeport king ", is the majority investor in the singapore and luxembourg freeports and has been variously described as the owner of the geneva freeport, or its largest shareholder, though in an interview in october 2016 he said he owned only 5 % of it, with 85 % of it being owned by the swiss state. in 2013, the freeport held about 1. 2 million works of art, allegedly including around 1000 works by pablo picasso. as well as art and gold bars, the facility contain