In [7]:
import json
import os

with open( 'data/squad/dev.json', 'rb') as f:
    squad = json.load(f)

# Q&A with Pretrained Bert model

## Training model

In [43]:
from transformers import BertTokenizer, BertForQuestionAnswering

modelname = 'deepset/bert-base-cased-squad2'

tokenizer = BertTokenizer.from_pretrained(modelname)
model = BertForQuestionAnswering.from_pretrained(modelname)

In [44]:
from transformers import pipeline

qa = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [10]:
squad[:2]

[{'question': 'In what country is Normandy located?',
  'answer': 'France',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'},
 {'question': 'When were the Normans in Normandy?',
  'answer': '10th and 11th centuries',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: No

In [11]:
# we will intialize a list for answers
answers = []

for pair in squad[:5]:
    # pass in our question and context to return an answer
    ans = qa({
        'question': pair['question'],
        'context': pair['context']
    })
    # append predicted answer and real to answers list
    answers.append({
        'predicted': ans['answer'],
        'true': pair['answer']
    })

In [13]:
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]

## Similarity metrics

### Exact Match (Naive aproach)

In [16]:
em = []

for answer in answers:
    if answer['predicted'] == answer['true']:
        em.append(1)
    else:
        em.append(0)
print(sum(em)/len(em))

0.4


Same with preprocessing

In [17]:
import re

em = []

for answer in answers:
    pred = re.sub('[^0-9a-z ]', '', answer['predicted'].lower())
    true = re.sub('[^0-9a-z ]', '', answer['true'].lower())
    if pred == true:
        em.append(1)
    else:
        em.append(0)
print(sum(em)/len(em))

0.8


### Rouge (Recall Oriented Understudy for Gisting Evaluation)

In [22]:
from rouge import Rouge

In [23]:
rouge = Rouge()

In [24]:
model_out = [ans['predicted'] for ans in answers]
reference = [ans['true'] for ans in answers]

In [26]:
rouge.get_scores(model_out, reference)

[{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001},
  'rouge-2': {'r': 0.6, 'p': 1.0, 'f': 0.7499999953125},
  'rouge-l': {'r': 0.6666666666666666, 'p': 1.0, 'f': 0.7999999952000001}},
 {'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-2': {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
  'rouge-l': {'r': 1.0, 'p': 1.0, 'f': 0.999999995}},
 {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}]

In [27]:
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.7333333333333333, 'p': 0.8, 'f': 0.7599999960400001},
 'rouge-2': {'r': 0.52, 'p': 0.6, 'f': 0.5499999970625},
 'rouge-l': {'r': 0.7333333333333333, 'p': 0.8, 'f': 0.7599999960400001}}

In [45]:
from tqdm import tqdm

model_out = []
reference = []

for pair in tqdm(squad[:50], leave=True):
    ans = qa({
        'question': pair['question'],
        'context': pair['context']
    })
    # append predicted answer and real to answers list
    model_out.append(ans['answer'])
    reference.append(pair['answer'])

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:40<00:00,  2.01s/it]


In [39]:
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'r': 0.42857142857142855, 'p': 1.0, 'f': 0.5999999958},
 'rouge-2': {'r': 0.375, 'p': 1.0, 'f': 0.5454545414876033},
 'rouge-l': {'r': 0.42857142857142855, 'p': 1.0, 'f': 0.5999999958}}

In [41]:
reference

'the Pechenegs, the Bulgars, and especially the Seljuk Turks'

With preprocessing 

In [31]:
clean = re.compile('(?i)[^0-9a-z ]')

In [32]:
new_model_out = [clean.sub('', text) for text in model_out]
new_reference = [clean.sub('', text) for text in reference]

In [33]:
rouge.get_scores(new_model_out, new_reference, avg=True)

AssertionError: 