The Wizard of Wikipedia dataset contains user inputs, answers, and a relevant Wikipedia ID per input. The ID references the relevant passage in a Wikipedia knowledge base. The following script maps the IDs to their passages and appends them to the main dataset. The data is further formatted as required by both the bi-encoder and cross-encoder models.

In [3]:
!pip install transformers
!pip install datasets
from datasets import load_dataset
import json 
import pandas as pd
import random 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 4.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 45.6 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 44.8 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.6 MB/s 
Collecting urllib3!=1.25.0,!=1.25.

In [None]:
# NOTE(review): this looks like an unfilled template call -- "namespace/your_dataset_name"
# is a placeholder repo id and `data_files` is not defined in any visible cell.
# `dataset` is not referenced by the visible cells below -- TODO confirm whether
# this cell was meant to load the Wikipedia knowledge base (`wiki_knowledge`).
dataset = load_dataset("namespace/your_dataset_name", data_files=data_files)


In [None]:
# fetch the train and validation splits of the "wow" config of kilt_tasks
wow_train, wow_val = (
    load_dataset("kilt_tasks", "wow", split=split_name)
    for split_name in ('train', 'validation')
)

In [None]:
# merge the train and validation records into one frame (splits generated later)
split_frames = [pd.DataFrame(split) for split in (wow_train, wow_val)]
wow = pd.concat(split_frames, axis=0)
# shuffle every row (frac=1), then renumber the index from 0
shuffled = wow.sample(frac=1)
wow = shuffled.reset_index(drop=True)

In [None]:
# Walk the 'output' column once and record, per row:
#   - the wikipedia_id of the first provenance entry of the first output, or
#   - the row position when that provenance list is empty (no passage).
needed_ids = []
irrelevant_inputs = []
for position, outputs in enumerate(wow['output']):
    provenance = outputs[0]['provenance']
    if provenance == []:
        irrelevant_inputs.append(position)
        continue
    needed_ids.append(provenance[0]['wikipedia_id'])

# collapse repeated IDs so each article is requested only once
needed_ids = list(set(needed_ids))

In [None]:
# drop records with no passage, then renumber the remaining rows from 0
no_passage_idx = wow.index[irrelevant_inputs]
wow = wow.drop(index=no_passage_idx)
wow = wow.reset_index(drop=True)

In [None]:
# filter knowledge base to only keep needed IDs
#
# The predicate runs once per knowledge-base record, so membership is tested
# against a set built once up front instead of scanning the needed_ids list
# for every record.
# NOTE(review): `.filter` with a per-record callable matches the Hugging Face
# `Dataset.filter` API -- presumably wiki_knowledge is such a Dataset here; the
# cell that loads it is not visible, so confirm.
needed_id_lookup = set(needed_ids)
wiki_knowledge = wiki_knowledge.filter(lambda x: x['wikipedia_id'] in needed_id_lookup)

In [None]:
# For every record that has provenance, remember which article and which
# subsection its passage comes from (both are given in the first provenance
# entry of the first output) as a [wikipedia_id, section] pair.
needed_sections = []

for outputs in wow['output']:
    provenance = outputs[0]['provenance']
    if provenance == []:
        continue
    source = provenance[0]
    needed_sections.append([source['wikipedia_id'], source['section']])

In [None]:
# using the ID and needed subsection, collect all passages from knowledge base
#
# Index the knowledge base once (wikipedia_id -> paragraph list) instead of
# scanning every knowledge-base row with a boolean mask for each record.
# setdefault keeps the first article seen per ID, which is the same row the
# previous `.values[0]` lookup selected.
# Only column access and iteration are used on wiki_knowledge here.
paragraphs_by_id = {}
for wiki_id, text in zip(wiki_knowledge['wikipedia_id'], wiki_knowledge['text']):
    paragraphs_by_id.setdefault(wiki_id, text['paragraph'])

passages = []

for wiki_id, section in needed_sections:
    # raises KeyError (carrying the ID) if the article is missing from the knowledge base
    section_list = paragraphs_by_id[wiki_id]

    if section == 'Section::::Abstract.':
        # index 0 is skipped -- presumably it holds the article title; TODO confirm
        passage = section_list[1]
    else:
        # the passage is the paragraph right after its section heading;
        # list.index raises ValueError if the heading is not in the article
        passage = section_list[section_list.index(section) + 1]

    passages.append(passage)

In [None]:
# pull the answer of the first output entry out of every record
answers = [outputs[0]['answer'] for outputs in wow['output']]

In [None]:
# attach the extracted answers and passages to the dataset as new columns
for column_name, column_values in (('answer', answers), ('passages_text', passages)):
    wow[column_name] = column_values

In [None]:
# save dataset
# NOTE(review): to_json() already returns a JSON string, and that string is
# JSON-encoded a second time below, so the file holds a single quoted string
# that readers have to decode twice. Kept as-is because the consumers of this
# file are not visible here -- confirm before changing the on-disk format.
# `wow` is also rebound from the DataFrame to that string.
wow = wow.to_json()

output_folder = usr_path+ '/data/Wizard_of_Wikipedia/'
output_file = output_folder+'wizard_of_wikipedia.json'

with open(output_file, 'w') as fp:
    fp.write(json.dumps(wow))