# Extrative Question Answering

From lecture slides
Given a query and a passage/document: Select the words in the passage that answer the query

Having a passage guaranteed to contain the answer is somewhat artificial

TASK: run the extractive QA pipeline on the top-1 neural re-ranking result of the MSMARCO FIRA + on the gold-label pairs of MSMARCO-FiRA-2021

In [1]:
## implement part 3 here 
from core_metrics import compute_f1, compute_exact
from tqdm import tqdm
import pandas as pd
import numpy as np
import csv
from transformers import pipeline
pd.set_option('display.max_colwidth', None)

In [2]:
# https://huggingface.co/deepset/tinyroberta-squad2
model_name = "deepset/tinyroberta-squad2"
eqa_model = pipeline('question-answering', model=model_name, tokenizer=model_name)



In [3]:
def read_file(filename, reranked=False):
    rows = []
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            sep = line.split("\t")
            if reranked:
                answers = [element.replace('\n', '') for element in sep[4:]] # handles variable ground truth amounts
                row = {'queryid':sep[0], 
                    'documentid':sep[1], 
                    'relevance-grade':sep[2], 
                    'text-selection':answers
                }
            else:
                answers = [element.replace('\n', '') for element in sep[6:]] # handles variable ground truth amounts
                row = {'queryid':sep[0], 
                        'documentid':sep[1], 
                        'relevance-grade':sep[2], 
                        'question':sep[3], 
                        'context':sep[4], 
                        'text-selection':answers
                    }
            rows.append(row)
    return pd.DataFrame(rows)

def prep_input(qatuples):
    '''
    Prepares tuples into dictionaries containing questions and contexts, a suitable format for the extractive qa model.
    '''
    return qatuples[['question', 'context']].to_dict(orient='records')

def gen_answer(inputs):
    '''
    Computes a list of answers for given questions and contexts (qa tuples).
    '''
    return [eqa_model(i) for i in tqdm(inputs)]

### gold-label pairs of MSMARCO-FiRA-2021

In [13]:
input = read_file('Part-3/msmarco-fira-21.qrels.qa-tuples.tsv')
model_input = prep_input(input)
answer_pred = gen_answer(model_input)

100%|██████████████████████████████████████████████████████████████████████████| 52606/52606 [1:59:46<00:00,  7.32it/s]


In [15]:
answer_pred_vec = [a['answer'] for a in answer_pred]

In [16]:
# Computing f-statistics with gold-label pairs
f1_tuples = []
for i in range(len(input)):
    res = compute_f1(str(input.loc[i, 'text-selection']), answer_pred_vec[i])
    f1_tuples.append(res)

print("f1 score")
print(f"MEAN:  {np.mean(f1_tuples):.4f}")
print(f"STD:  {np.std(f1_tuples):.4f}")

f1 score
MEAN:  0.3710
STD:  0.3063


In [17]:
# Computing exact matching with gold-label pairs
compute_exact_tuples = []
for i in range(len(input)):
    res = compute_exact(str(input.loc[i, 'text-selection']), answer_pred_vec[i])
    compute_exact_tuples.append(res)

print("Exact score")
print(f"MEAN:  {np.mean(compute_exact_tuples):.4f}")
print(f"STD:  {np.std(compute_exact_tuples):.4f}")

Exact score
MEAN:  0.0934
STD:  0.2910


In [5]:
answer = read_file('Part-3/msmarco-fira-21.qrels.qa-answers.tsv', reranked=True)

### Use here our top-1 MSMARCO passage results from the best re-ranking model

only evaluate the overlap of pairs that are in the result and the qrels, hence below data from part 2 and answers are merged. Only 889 from 2000 rows remain, i.e., 1111 have no reference.

In [6]:
input = pd.read_csv('../Project_Part_3/tk_dataset_for_part3.tsv', delimiter='\t', header=None, names=['queryid', 'documentid', 'question', 'context'])

In [7]:
answer['queryid'] = answer['queryid'].astype('int64')
answer['documentid'] = answer['documentid'].astype('int64')

In [8]:
input = input.merge(answer, on=['queryid', 'documentid'])

In [9]:
model_input = prep_input(input)
answer_pred = gen_answer(model_input)

100%|████████████████████████████████████████████████████████████████████████████████| 889/889 [02:31<00:00,  5.86it/s]


In [10]:
answer_pred_vec = [a['answer'] for a in answer_pred]

In [11]:
# Computing f-statistics with gold-label pairs
f1_tuples = []
for i in range(len(input)):
    res = compute_f1(str(input.loc[i, 'text-selection']), answer_pred_vec[i])
    f1_tuples.append(res)

print("f1 score")
print(f"MEAN:  {np.mean(f1_tuples):.4f}")
print(f"STD:  {np.std(f1_tuples):.4f}")

f1 score
MEAN:  0.4180
STD:  0.3087


In [12]:
# Computing exact matching with gold-label pairs
compute_exact_tuples = []
for i in range(len(input)):
    res = compute_exact(str(input.loc[i, 'text-selection']), answer_pred_vec[i])
    compute_exact_tuples.append(res)

print("Exact score")
print(f"MEAN:  {np.mean(compute_exact_tuples):.4f}")
print(f"STD:  {np.std(compute_exact_tuples):.4f}")

Exact score
MEAN:  0.1125
STD:  0.3160
