In [1]:
!pip install transformers

[0m

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.3.0
[0m

In [3]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.12-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.12
[0m

In [4]:
import torch, requests, json
from transformers import BertTokenizer, BertForQuestionAnswering
import numpy as np   
import pandas as pd
from tqdm.notebook import tqdm  
tqdm.pandas()
from evaluate import load

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [6]:
question = "Can anyone use this library?"
paragraph = "The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online. \n\nThe Vatican Secret Archives were separated from the library at the beginning of the 17th century; they contain another 150,000 items. \n\nScholars have traditionally divided the history of the library into five periods, Pre-Lateran, Lateran, Avignon, Pre-Vatican and Vatican. \n\nThe Pre-Lateran period, comprising the initial days of the library, dated from the earliest days of the Church. Only a handful of volumes survive from this period, though some are very significant."

encoded_text = tokenizer.encode_plus(text=question,text_pair=paragraph)

inputs = encoded_text['input_ids'] # Token embeddings
sent_embs = encoded_text['token_type_ids'] # Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(inputs) # input tokens

In [7]:
outputs = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sent_embs]))

In [8]:
start_idx = torch.argmax(outputs.start_logits)
end_idx = torch.argmax(outputs.end_logits)

In [9]:
start = start_idx.item()
end = end_idx.item()
ans = tokens[start:end+1]
ans = ' '.join(ans)
ans

'the vatican library is open to anyone who can document their qualifications and research needs'

In [10]:
corrected_answer = ''

for word in ans.split():
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word

print(corrected_answer)

 the vatican library is open to anyone who can document their qualifications and research needs


In [11]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'], verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [12]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [13]:
# training data
input_file_path = '/kaggle/input/squad-2/train-v2.0.json'
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

Reading the json file
processing...


  
  from ipykernel import kernelapp as app
  


shape of the dataframe is (130319, 6)
Done


In [14]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different levels in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    main = m[['id','question','context','answers']].set_index('id').reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [15]:
# dev data
input_file_path = '/kaggle/input/squad-2/dev-v2.0.json'
record_path = ['data','paragraphs','qas','answers']
dev = squad_json_to_dataframe_dev(input_file_path=input_file_path,record_path=record_path)

Reading the json file
processing...


  from ipykernel import kernelapp as app
  
  app.launch_new_instance()


shape of the dataframe is (11873, 5)
Done


In [16]:
train.head()

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0


In [17]:
dev.head()

Unnamed: 0,id,question,context,answers,c_id
0,56ddde6b9a695914005b9628,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'France', 'answer_start': 159}, {'te...",0
1,56ddde6b9a695914005b9629,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': '10th and 11th centuries', 'answer_s...",0
2,56ddde6b9a695914005b962a,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'Denmark, Iceland and Norway', 'answ...",0
3,56ddde6b9a695914005b962b,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'Rollo', 'answer_start': 308}, {'tex...",0
4,56ddde6b9a695914005b962c,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,"[{'text': '10th century', 'answer_start': 671}...",0


In [18]:
train.shape, dev.shape

((130319, 6), (11873, 5))

In [19]:
train = train.drop(['index','c_id','answer_start'],axis=1)
train.head()

Unnamed: 0,question,context,text
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s
1,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing
2,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003
3,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas"
4,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s


In [20]:
dev = dev.drop(['id','c_id'],axis=1)
dev.head()

Unnamed: 0,question,context,answers
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'France', 'answer_start': 159}, {'te..."
1,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': '10th and 11th centuries', 'answer_s..."
2,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'Denmark, Iceland and Norway', 'answ..."
3,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,"[{'text': 'Rollo', 'answer_start': 308}, {'tex..."
4,What century did the Normans first gain their ...,The Normans (Norman: Nourmands; French: Norman...,"[{'text': '10th century', 'answer_start': 671}..."


In [21]:
def generate_answers(question,context):
  encoded_text = tokenizer.encode_plus(text=question,text_pair=context)
  inputs = encoded_text['input_ids'] 
  sentence_embeddings = encoded_text['token_type_ids']
  tokens = tokenizer.convert_ids_to_tokens(inputs)
  outputs = model(input_ids=torch.tensor([inputs]),token_type_ids=torch.tensor([sentence_embeddings]))
  start_index = torch.argmax(outputs.start_logits).item()
  end_index = torch.argmax(outputs.end_logits).item()
  answer = ' '.join(tokens[start_index:end_index+1])
  corrected_ans = ''

  for word in answer.split():
    if word[:2] == '##':
      corrected_ans += word[2:]
    else:
      corrected_ans += ' ' + word
  return corrected_ans

In [22]:
train_sample = train.sample(n=90000).dropna()

In [23]:
predicted_answers = []

for i in tqdm(range(len(train_sample))):
  ques = train_sample.question.iloc[i]
  ans = train_sample.text.iloc[i]
  predicted_answers.append(generate_answers(ques,ans))

  0%|          | 0/59833 [00:00<?, ?it/s]

In [24]:
predicted_answers = [x.strip().capitalize() for x in predicted_answers]
predicted_answers[:5]

['1894',
 'Reduction in speed',
 'Faculty of arts and sciences',
 'Operation sea lion',
 'Michael buble']

In [25]:
len(predicted_answers), len(train_sample.text.iloc[:20])

(59833, 20)

In [26]:
train_sample.text.iloc[:20].values

array(['1894',
       'reduction in speed caused by friction of the valley walls',
       'Faculty of Arts and Sciences', 'Operation Sea Lion',
       'Michael Bublé', 'Presentational immediacy', 'Greek',
       'neurobiology', 'Second Opium War', 'Dutch East India Company',
       'Salafis and Zahiris', 'the Fed',
       'Many slaves were freed by the masters for services rendered',
       '3.7 billion', 'laminate', 'the 1640s',
       'Nigel Lythgoe and Ken Warwick', 'Pliny', 'Black Sea',
       'the conscription of every male'], dtype=object)

In [27]:
bert_score = load("bertscore")
results = bert_score.compute(predictions=predicted_answers,references=train_sample.text.values,lang='en')

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]



In [28]:
precision, recall, f1_scores, hashcode = results['precision'], results['recall'], results['f1'], results['hashcode']
f1_scores[:5]

[0.9999997615814209,
 0.9242556691169739,
 0.9728001356124878,
 0.9669426679611206,
 0.8094959259033203]

In [29]:
train_sample['answer'] = predicted_answers
train_sample['f1_score'] = f1_scores
train_sample.head()

Unnamed: 0,question,context,text,answer,f1_score
63769,When were the first campus police formed at Yale?,Several campus safety strategies have been pio...,1894,1894,1.0
72120,Why do marginal crevasses form on the edge of ...,Crevasses can form in several different ways. ...,reduction in speed caused by friction of the v...,Reduction in speed,0.924256
63821,Who teaches Yale's residential college's under...,Yale's residential college system was establis...,Faculty of Arts and Sciences,Faculty of arts and sciences,0.9728
104598,What was the name of the Luftwaffe plan to inv...,The bombing failed to demoralise the British i...,Operation Sea Lion,Operation sea lion,0.966943
18747,"Along with Josh Groban, what notable pop artis...",While most artists became established in other...,Michael Bublé,Michael buble,0.809496
