In [3]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from torch import nn

### Format Eli5 Dataset

In [5]:
# Load the train split of the LFQA support-docs dataset from the Hugging Face hub.
# If a 'file not found' error appears locally, the data can be loaded and
# formatted in Colab instead.
dataset = load_dataset("vblagoje/lfqa_support_docs", split='train') 
# Materialise the dataset as a DataFrame; the cells below read its
# 'meta', 'output' and 'input' columns.
eli5 = pd.DataFrame(dataset)

In [None]:
# Delete the 'meta' column.
# `columns=` is used instead of the positional axis (`drop('meta', 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
eli5 = eli5.drop(columns='meta')

# Create the answer column: every entry of an 'output' list except the last
# one is read for its 'answer' value, giving a list of answer strings per row.
answers = [
    [entry['answer'] for entry in output[:-1]]
    for output in eli5['output']
]

# add answer column to df
eli5['answer'] = answers

# Extract the related wiki passages: the last entry of each 'output' list is
# read for its 'provenance' value.
outputs = [output[-1]['provenance'] for output in eli5['output']]

eli5['passages'] = outputs
eli5 = eli5.drop(columns='output')  # drop original outputs column

### Re-Rank Passages & Answers 
The original dataset has 7 Wikipedia passages per query, with no indication of which passages are most relevant. Since the passage-retrieval input requires 1 passage per query, a cross-encoder re-ranker is used to rank the 7 passages, and the top (most relevant) passage is then selected for the input pairs. The answers for each query are re-ranked against the query in the same way. An additional field is added to hold the top passage from each set.

In [None]:
# Pull the query, passage and answer columns out of the frame as plain lists,
# in that order.
queries, passage_list, answers = (
    list(eli5[column]) for column in ('input', 'passages', 'answer')
)

In [None]:
# load pre-trained cross-encoder from sentence transformers library 
# nn.Sigmoid() is passed as the default activation function for the model output.
# NOTE(review): the model path is a hard-coded absolute path — presumably a local
# copy of the ms-marco-MiniLM-L-6-v2 checkpoint; confirm it exists on the machine
# running this notebook.
cross_encoder = CrossEncoder('/contextretrieval/cross-encoder/ms-marco-MiniLM-L-6-v2',default_activation_function=nn.Sigmoid())

In [None]:
def re_rank(list_to_rank, query_list=None, encoder=None):
    """Sort each query's candidates by cross-encoder score, highest first.

    Parameters
    ----------
    list_to_rank : list of list of dict
        One inner list of candidates per query. Every candidate dict must
        have a 'text' key.
    query_list : list of str, optional
        The query for each inner list, aligned by position. Defaults to the
        notebook-level ``queries`` list.
    encoder : object with a ``predict(pairs)`` method, optional
        Called with a list of ``[query, text]`` pairs; must return one score
        per pair. Defaults to the notebook-level ``cross_encoder``.

    Returns
    -------
    list of list of dict
        For each query, the same candidate dicts sorted by descending
        'cross-score'. An empty candidate list yields an empty list.

    Notes
    -----
    The candidate dicts are modified in place: each one gains a
    'cross-score' key holding the score returned by the encoder.
    """
    # Fall back to the notebook globals so existing single-argument calls
    # keep working.
    if query_list is None:
        query_list = queries
    if encoder is None:
        encoder = cross_encoder

    ranked_list = []
    for i, candidates in enumerate(list_to_rank):
        if len(candidates) == 0:
            # Nothing to score: do not call the model with an empty batch.
            ranked_list.append([])
            continue

        query = query_list[i]
        cross_inp = [[query, candidate['text']] for candidate in candidates]
        cross_scores = encoder.predict(cross_inp)

        # Attach each score to its candidate so it travels with the dict.
        for candidate, score in zip(candidates, cross_scores):
            candidate['cross-score'] = score

        hits = sorted(candidates, key=lambda x: x['cross-score'], reverse=True)
        ranked_list.append(hits)
    return ranked_list

In [None]:
# Wrap every answer string in a {'text': ...} dict so the answers have the
# same shape as the passages that re_rank() reads.
answers_list = [
    [{'text': answer_text} for answer_text in answer_group]
    for answer_group in answers
]

In [None]:
# Score and sort the passages and the answers for every query.
# re_rank() adds a 'cross-score' key to each dict it is given, so the dicts
# inside passage_list and answers_list are modified by these calls.
ranked_passages = re_rank(passage_list)
ranked_answers = re_rank(answers_list)

In [None]:
# Overwrite the answer and passage columns with their re-ranked versions
# (lists of dicts sorted by descending 'cross-score').
eli5['answer'] = ranked_answers
eli5['passages'] = ranked_passages

In [None]:
# Add a column holding the text of the top-ranked passage for each query.
# The original loop appended to an undefined name (`passages`) instead of
# `top_passages`, which raised NameError and left `top_passages` empty.
top_passages = [ranked[0]['text'] for ranked in eli5['passages']]

eli5['passages_text'] = top_passages

### Save Dataset 

In [None]:
# Serialise the frame to a JSON string with one object per row.
eli5_reranked = eli5.to_json(orient='records')

# NOTE(review): `eli5_reranked` is already a JSON string, so json.dump() below
# encodes it a second time and the file holds a single quoted JSON string
# rather than an array of records. Readers have to decode twice (json.load,
# then json.loads / pd.read_json). Left unchanged in case existing loaders
# rely on this — confirm before switching to fp.write(eli5_reranked).
# NOTE(review): hard-coded absolute output directory; it must already exist.
output_path = '/data/Eli5/Eli5_reranked/'
with open(output_path + 'eli5_reranked.json', 'w') as fp:
    json.dump(eli5_reranked, fp)

## Merge with Categories Dataset

The version of Eli5 that contains categories (used for response generation) is much smaller than the Eli5 train dataset above. However, the two share the same query IDs, so the overlapping records can be merged to form a new dataset that includes the categories.

In [None]:
# Load the train split of the categorised Eli5 dataset from the Hugging Face hub.
# Note: this rebinds `dataset`, which earlier held the LFQA support-docs split.
dataset = load_dataset("eli5_category", split='train')
eli5_categories = pd.DataFrame(dataset)

In [None]:
# Left-join the categories onto the re-ranked frame. The categories frame
# names its key 'q_id', so it is renamed to 'id' to match before joining.
eli5_merged = eli5.merge(
    eli5_categories.rename(columns={'q_id': 'id'}),
    how='left',
    on='id',
)

In [None]:
# Drop rows with any missing value (after the left join these include rows
# with no matching category record), renumber the index, and give the ranked
# columns explicit names.
eli5_merged = (
    eli5_merged
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'answer': 'answers_ranked', 'passages': 'passages_ranked'})
)

### Save Dataset

In [None]:
# Serialise the merged frame to a JSON string and write it next to the
# re-ranked dataset; `output_path` is defined in the earlier "Save Dataset" cell.
# NOTE(review): to_json() already returns a JSON string, so json.dump() encodes
# it a second time and the file holds one quoted JSON string. Left unchanged in
# case existing loaders rely on this — confirm before writing the string directly.
eli5_categories_reranked = eli5_merged.to_json(orient="records")
with open(output_path + 'eli5_categories_reranked.json', 'w') as fp:
    json.dump(eli5_categories_reranked, fp)