In [15]:
# Dataset configuration: the root data directory (with trailing slash) and the
# name of the dataset sub-folder that is appended to it when building paths
data_path = '/ivi/ilps/personal/svakule/spoken_qa/'
dataset_name = 'WD18'

# Load Corpus

In [16]:
# load entities and relations
import os
import json

def store_beir_corpus(split, source_dir='../data', corpus_dir=None):
    """Convert an ``{id: label}`` JSON dictionary into a BEIR-style corpus file.

    Reads ``<source_dir>/<split>.json`` and writes one JSON object per line to
    ``<corpus_dir>/<split>.jsonl``. Each object has the keys ``_id``, ``title``
    (always empty), ``text`` (the label) and ``metadata`` (always empty).

    Args:
        split: Base name of the input and output files, e.g. ``'entities'``.
        source_dir: Directory containing the input JSON file.
        corpus_dir: Output directory. Defaults to
            ``data_path + dataset_name + "/corpus"`` (module-level settings).

    Returns:
        The loaded ``{id: label}`` dictionary, unchanged.
    """
    if corpus_dir is None:
        corpus_dir = data_path + dataset_name + "/corpus"

    # Labels contain non-ASCII characters, so do not rely on the locale default.
    with open(os.path.join(source_dir, '%s.json' % split), 'r', encoding='utf-8') as fin:
        items = json.load(fin)

    corpus = [
        {"_id": _id, "title": "", "text": label, "metadata": {}}
        for _id, label in items.items()
    ]

    print(len(corpus), split)
    if corpus:  # an empty dictionary has no sample document to show
        print(corpus[0])

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(corpus_dir, exist_ok=True)

    with open(os.path.join(corpus_dir, "%s.jsonl" % split), 'w', encoding='utf-8') as out_file:
        for doc in corpus:
            out_file.write(json.dumps(doc))
            out_file.write("\n")

    return items
            
# Export both label dictionaries as BEIR corpora and keep them in memory;
# they are used later for membership checks when building the qrels
entities = store_beir_corpus(split='entities')
relations = store_beir_corpus(split='relations')

28497 entities
{'_id': 'Q1938494', 'title': '', 'text': 'Mirosław Bork', 'metadata': {}}
8913 relations
{'_id': '9591', 'title': '', 'text': '#SOSBrutalism ID', 'metadata': {}}


# Load Original Queries with Qrels

In [21]:
# Prerequisite: download the original dataset from https://github.com/askplatypus/wikidata-simplequestions
# and place its annotated_wd_data_<split>.txt files directly under data_path

def store_beir_qrels(split, qrels, qrels_dir=None, write_header=True):
    """Write relevance judgements to ``<qrels_dir>/<split>.tsv``.

    Args:
        split: Base name of the output file, e.g. ``'valid_entities'``.
        qrels: Iterable of ``[query_id, corpus_id, score]`` triples.
        qrels_dir: Output directory. Defaults to
            ``data_path + dataset_name + "/qrels"`` (module-level settings).
        write_header: Write the ``query-id / corpus-id / score`` header row
            first. BEIR's ``GenericDataLoader`` skips the first line of a
            qrels file, so without the header the first judgement is lost
            when the file is loaded. Pass ``False`` for the header-less layout.
    """
    if qrels_dir is None:
        qrels_dir = data_path + dataset_name + "/qrels"

    # Create the directory on demand, like store_beir_corpus does for the corpus.
    os.makedirs(qrels_dir, exist_ok=True)

    qrels_path = os.path.join(qrels_dir, "%s.tsv" % split)
    with open(qrels_path, 'w', encoding='utf-8') as out_file:
        if write_header:
            out_file.write('\t'.join(("query-id", "corpus-id", "score")) + '\n')
        for qrel in qrels:
            # str() lets callers pass integer scores as well as strings
            out_file.write('\t'.join(str(field) for field in qrel) + '\n')


def process_wd_questions(split):
    """Turn one split of the annotated questions into BEIR queries and qrels.

    Reads ``data_path + 'annotated_wd_data_<split>.txt'``, where every
    non-blank line holds four tab-separated fields: subject, predicate,
    object and question text. Every question becomes a query. Entity and
    relation qrels are written through ``store_beir_qrels`` as
    ``<split>_entities`` and ``<split>_relations``, but only for ids present
    in the module-level ``entities`` / ``relations`` dictionaries.

    Args:
        split: Split name, e.g. ``'train'`` or ``'valid'``.

    Returns:
        List of ``{"_id", "text", "metadata"}`` query dictionaries.

    Raises:
        ValueError: If a non-blank line has fewer than four fields.
    """
    path_to_questions = data_path + 'annotated_wd_data_%s.txt' % split

    queries, rqrels, eqrels = [], [], []
    with open(path_to_questions, encoding='utf-8') as fin:
        for i, line in enumerate(fin):
            line = line.strip('\n')
            if not line.strip():
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            # maxsplit keeps a question that itself contains a tab in one piece
            fields = line.split('\t', 3)
            if len(fields) != 4:
                raise ValueError(
                    "%s, line %d: expected 4 tab-separated fields, got %d"
                    % (path_to_questions, i + 1, len(fields))
                )
            s, p, o, q = fields
            # t0 for train split, v0 for validation split.
            # NOTE(review): only the first letter of the split is used, so
            # e.g. 'train' and 'test' would produce clashing ids.
            _id = split[0] + str(i)
            # store all queries
            queries.append({"_id": _id, "text": q, "metadata": {}})
            # filter questions with entities for which we have labels
            if s in entities:
                eqrels.append([_id, s, '1'])
            # filter questions with relations for which we have labels;
            # the first character of the predicate is dropped before the lookup
            # (presumably its letter prefix -- confirm against the data)
            relation_id = p[1:]
            if relation_id in relations:
                rqrels.append([_id, relation_id, '1'])

    # store entities and relations qrels separately for each split
    store_beir_qrels("%s_entities" % split, eqrels)
    store_beir_qrels("%s_relations" % split, rqrels)

    print(len(queries), split)
    # Show the first item of each list; any of them may be empty (not every
    # question has a labelled entity or relation), so never index blindly.
    print(*(sample[0] if sample else None for sample in (queries, eqrels, rqrels)))
    return queries


# Collect the queries of both splits; ids carry a per-split prefix (t… / v…)
queries = process_wd_questions(split='train')
queries.extend(process_wd_questions(split='valid'))
print(len(queries), 'questions in total')

# save queries
query_path = data_path + dataset_name + "/queries/original.jsonl"
# the queries folder is not created anywhere else, so make sure it exists
os.makedirs(os.path.dirname(query_path), exist_ok=True)
with open(query_path, 'w', encoding='utf-8') as out_file:
    for d in queries:
        out_file.write(json.dumps(d))
        out_file.write("\n")

34374 train
{'_id': 't0', 'text': 'what movie is produced by warner bros.', 'metadata': {}} ['t0', 'Q126399', '1'] ['t0', '272', '1']
4867 valid
{'_id': 'v0', 'text': 'Who was the trump ocean club international hotel and tower named after', 'metadata': {}} ['v1', 'Q318926', '1'] ['v0', '138', '1']
39241 questions in total


In [23]:
from beir.datasets.data_loader import GenericDataLoader

# Example: load the exported files with BEIR to test entity retrieval.
# NOTE(review): this rebinds `data_path` to the dataset folder itself, while the
# earlier cells build their paths as data_path + dataset_name; re-running those
# cells after this one would therefore point at a nested .../WD18/WD18/ folder.
data_path = '/ivi/ilps/personal/svakule/spoken_qa/WD18/'

# alternative corpus: relations.jsonl
corpus_path = data_path + 'corpus/entities.jsonl'
# alternative queries: wav2vec2-base-960h.jsonl
query_path = data_path + 'queries/original.jsonl'
# alternative qrels: valid_relations.tsv train_entities.tsv train_relations.tsv
qrels_path = data_path + 'qrels/valid_entities.tsv'

loader = GenericDataLoader(
    corpus_file=corpus_path,
    query_file=query_path,
    qrels_file=qrels_path,
)
corpus, queries, qrels = loader.load_custom()