# Load Corpus

In [1]:
# load entities and relations
import os
import json

from utils import data_path, dataset_name


def store_beir_corpus(split):
    """Convert ``../data/<split>.json`` into a BEIR-style JSONL corpus file.

    The input JSON must be an object mapping ids to labels. Every pair becomes
    one line of ``<data_path>/<dataset_name>/corpus/<split>.jsonl`` with the id
    as ``_id``, the label as ``text`` and an empty ``title`` / ``metadata``.

    Args:
        split: Base name shared by the input JSON file and the output JSONL
            file, e.g. ``'entities'`` or ``'relations'``.

    Returns:
        The dict loaded from the input JSON file (id -> label).
    """
    path = '../data/%s.json' % split
    # os.path.join gives the same directory whether or not data_path ends with
    # a separator, and matches how the other cells build this path.
    corpus_path = os.path.join(data_path, dataset_name, "corpus")

    # Read as UTF-8 explicitly so non-ASCII labels do not depend on the locale.
    with open(path, 'r', encoding='utf-8') as fin:
        items = json.load(fin)

    corpus = [
        {"_id": _id, "title": "", "text": label, "metadata": {}}
        for _id, label in items.items()
    ]

    print(len(corpus), split)
    if corpus:  # an empty input must not fail with IndexError
        print(corpus[0])

    os.makedirs(corpus_path, exist_ok=True)

    out_path = os.path.join(corpus_path, "%s.jsonl" % split)
    with open(out_path, 'w', encoding='utf-8') as out_file:
        for d in corpus:
            out_file.write(json.dumps(d))
            out_file.write("\n")

    return items
            
# Write both corpora and keep the id -> label mappings in memory; the qrels
# cell uses them for membership checks.
entities = store_beir_corpus(split='entities')
relations = store_beir_corpus(split='relations')

28497 entities
{'_id': 'Q1938494', 'title': '', 'text': 'Mirosław Bork', 'metadata': {}}
8913 relations
{'_id': '9591', 'title': '', 'text': '#SOSBrutalism ID', 'metadata': {}}


In [2]:
# store an extended corpus for relations with alternative aliases and descriptions
split = 'relations-extra'

path = '../data/%s.json' % split
corpus_path = os.path.join(data_path, dataset_name, "corpus")
print(corpus_path)
# The directory may not exist yet when this cell is run on its own.
os.makedirs(corpus_path, exist_ok=True)

# Parse the input completely before opening the output, so a bad input file
# does not leave a truncated corpus behind.
with open(path, 'r', encoding='utf-8') as fin:
    items = json.load(fin)

with open(os.path.join(corpus_path, "%s.jsonl" % split), 'w', encoding='utf-8') as out_file:
    for e in items:
        # Drop the first character of the id, as is done for predicates when
        # the relation qrels are built (p[1:]).
        _id = e['id'][1:]

        label = e['label']
        parts = [label]
        parts.extend(e['aliases'] or [])
        description = e['description']
        # A missing description must not end up as the literal word "None"
        # in the indexed text.
        if description is not None:
            parts.append(str(description))
        # Skip empty pieces so the text has no doubled or trailing spaces.
        text = ' '.join(part for part in parts if part)

        d = {"_id": _id, "title": label, "text": text, "metadata": {}}
        out_file.write(json.dumps(d))
        out_file.write("\n")

/ivi/ilps/personal/svakule/spoken_qa/datasets/WD18/corpus


# Load Original Queries with Qrels

In [3]:
# load KG to check triples are there and add alternative answers with the same s, p
from kgqa import load_kg, check_triple

# Load the knowledge graph; `kg` is passed to check_triple when the answer
# qrels are built. The recorded output reports ~2.9 billion triples, so this
# call is presumably expensive -- avoid re-running it needlessly.
kg = load_kg()

nb triples: 2935160017
nb subjects: 760717318
nb predicates: 23387


In [5]:
# first load original dataset from https://github.com/askplatypus/wikidata-simplequestions

def store_beir_qrels(split, qrels, header=("query-id", "corpus-id", "score")):
    """Write relevance judgements to ``<data_path>/<dataset_name>/qrels/<split>.tsv``.

    Args:
        split: Base name of the output file, e.g. ``'valid_entities'``.
        qrels: Iterable of ``[query_id, corpus_id, score]`` rows; every field
            must already be a string because rows are joined with tabs.
        header: Column names written as the first row. BEIR's
            ``GenericDataLoader`` skips the first row of a qrels file, so
            without a header the first judgement is silently dropped on load.
            Pass ``None`` to write the rows only (the previous behaviour).
    """
    qrels_dir = os.path.join(data_path, dataset_name, "qrels")
    os.makedirs(qrels_dir, exist_ok=True)
    qrels_path = os.path.join(qrels_dir, "%s.tsv" % split)
    with open(qrels_path, 'w', encoding='utf-8') as out_file:
        if header is not None:
            out_file.write('\t'.join(header) + '\n')
        for qrel in qrels:
            out_file.write('\t'.join(qrel) + '\n')


def process_wd_questions(split, path_to_questions=None):
    """Turn one split of the annotated questions into BEIR queries and qrels.

    Each non-empty input line holds four tab-separated fields: subject,
    predicate, object and question text. Line ``i`` gets the query id
    ``split[0] + str(i)`` (``t0`` for train, ``v0`` for valid).

    Four qrels files are written through ``store_beir_qrels``:

    * ``<split>_entities``: the subject, if it is a key of ``entities``.
    * ``<split>_relations``: the predicate without its first character, if it
      is a key of ``relations``.
    * ``<split>_answers``: every entity returned by ``check_triple`` that is
      a key of ``entities``.
    * ``<split>_answers-all``: the same rows, plus the annotated object for
      questions whose triple was not found in the KG.

    Args:
        split: Split name, e.g. ``'valid'`` or ``'train'``.
        path_to_questions: Optional path of the input file. Defaults to the
            original hard-coded location for ``split``.

    Returns:
        List of query dicts (``_id``, ``text``, ``metadata``), one per
        non-empty input line.
    """
    if path_to_questions is None:
        path_to_questions = '/ivi/ilps/personal/svakule/spoken_qa/annotated_wd_data_%s.txt' % split

    queries, rqrels, eqrels, aqrels, aqrels_all = [], [], [], [], []
    answers_extended, answers_removed = 0, 0
    with open(path_to_questions, encoding='utf-8') as fin:
        for i, line in enumerate(fin):
            line = line.strip('\n')
            if not line:
                # Tolerate blank lines; `i` still counts them, so the ids of
                # all other lines are unchanged.
                continue
            s, p, o, q = line.split('\t')
            _id = split[0] + str(i)  # t0 for train split v0 for validation split
            p_id = p[1:]  # predicate id without its first character

            # store all queries
            queries.append({"_id": _id, "text": q, "metadata": {}})
            # filter questions with entities for which we have labels
            if s in entities:
                eqrels.append([_id, s, '1'])
            # filter questions with relations for which we have labels
            if p_id in relations:
                rqrels.append([_id, p_id, '1'])

            # extend the ground-truth answers by getting the set of all answers
            # from KG using the s, p pattern
            e_set = check_triple(kg, (s, p_id, o)) or ()
            if not e_set:
                answers_removed += 1
                # NOTE(review): the original looped over the (empty) e_set
                # here, so nothing was ever added and answers-all was always
                # identical to answers. Falling back to the annotated object
                # matches the intent "add only to aqrels_all" -- confirm.
                if o in entities:
                    aqrels_all.append([_id, o, '1'])
            elif len(e_set) > 1:
                answers_extended += 1

            # keep only the answers that are in our entity corpus
            for e_id in e_set:
                if e_id in entities:
                    # these qrels are filtered by the triples found in this KG
                    aqrels.append([_id, e_id, '1'])
                    aqrels_all.append([_id, e_id, '1'])

    # store entities and relations qrels separately for each split
    store_beir_qrels("%s_entities" % split, eqrels)
    store_beir_qrels("%s_relations" % split, rqrels)
    store_beir_qrels("%s_answers" % split, aqrels)
    store_beir_qrels("%s_answers-all" % split, aqrels_all)

    print(len(queries), split)
    # Any of these lists can be empty for a small or fully filtered split.
    print(*(rows[0] if rows else None for rows in (queries, eqrels, rqrels, aqrels)))
    print("%d questions with more than one correct answer" % answers_extended)
    print("%d questions without answer found in the KG" % answers_removed)
    return queries


# Validation questions come first, then training questions; their ids start
# with 'v' and 't' respectively.
queries = process_wd_questions(split='valid')
queries.extend(process_wd_questions(split='train'))
print(len(queries), 'questions in total')

# save queries
query_path = os.path.join(data_path, dataset_name, "queries/original.jsonl")
# Nothing else in the notebook creates the queries directory.
os.makedirs(os.path.dirname(query_path), exist_ok=True)
with open(query_path, 'w', encoding='utf-8') as out_file:
    for d in queries:
        out_file.write(json.dumps(d))
        out_file.write("\n")

4867 valid
{'_id': 'v0', 'text': 'Who was the trump ocean club international hotel and tower named after', 'metadata': {}} ['v1', 'Q318926', '1'] ['v0', '138', '1'] ['v1', 'Q1010', '1']
2 questions with more than one correct answer
3534 questions without answer found in the KG
34374 train
{'_id': 't0', 'text': 'what movie is produced by warner bros.', 'metadata': {}} ['t0', 'Q126399', '1'] ['t0', '272', '1'] ['t4', 'Q2888523', '1']
29 questions with more than one correct answer
25082 questions without answer found in the KG
39241 questions in total


In [None]:
from beir.datasets.data_loader import GenericDataLoader

# e.g. to test entity retrieval
# Read from the same <data_path>/<dataset_name> tree the cells above write to.
# The previous version overwrote the shared `data_path` with a second
# hard-coded root ('.../spoken_qa/WD18/') that does not match the location
# shown in the recorded output ('.../spoken_qa/datasets/WD18/').
# NOTE: this cell rebinds `queries` to the mapping returned by the loader.
beir_root = os.path.join(data_path, dataset_name)
corpus_path = os.path.join(beir_root, 'corpus', 'entities.jsonl')  # relations.jsonl
query_path = os.path.join(beir_root, 'queries', 'original.jsonl')  # wav2vec2-base-960h.jsonl
qrels_path = os.path.join(beir_root, 'qrels', 'valid_entities.tsv')  # valid_relations.tsv train_entities.tsv train_relations.tsv

corpus, queries, qrels = GenericDataLoader(corpus_file=corpus_path,
                                           query_file=query_path,
                                           qrels_file=qrels_path).load_custom()

# Generate Transcripts with ASR

In [None]:
import os
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import soundfile as sf

# Checkpoint name under the "facebook/" namespace; it is also used to name the
# transcript file. Alternative: "wav2vec2-base-960h"
model_name = "wav2vec2-large-960h-lv60-self"

# Use the GPU when one is visible, otherwise fall back to CPU so the cell
# still runs (slowly) on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = Wav2Vec2ForCTC.from_pretrained("facebook/" + model_name)
processor = Wav2Vec2Processor.from_pretrained("facebook/" + model_name)
# Assigning the result keeps the cell from printing the whole model repr.
model = model.to(device)

In [None]:
# generate transcripts
import json
from utils import dataset_name, data_path

def generate_transcripts(split, query_path, wav_path=None):
    """Transcribe every audio file of a split and append the results as queries.

    Each file in ``wav_path`` is read with ``soundfile``, run through the
    global ``processor`` / ``model`` pair, greedily decoded (argmax over the
    logits) and lower-cased. One JSON line per file is appended to
    ``query_path``; the query id is the file name up to its first dot.

    Args:
        split: Split name used to build the default audio directory,
            e.g. ``'valid'`` or ``'train'``.
        query_path: JSONL file to append to. It is opened in append mode, so
            several splits can share one file; re-running a split adds
            duplicate lines.
        wav_path: Optional directory holding the audio files. Defaults to the
            original hard-coded location for ``split``.
    """
    if wav_path is None:
        wav_path = "/ivi/ilps/personal/svakule/spoken_qa/gtts/annotated_wd_data_%s/wav16000/" % split

    with open(query_path, 'a') as out_file:
        # sorted() makes the order of the output lines reproducible
        for file in sorted(os.listdir(wav_path)):
            _id = file.split('.')[0]  # t0 for train split v0 for validation split

            speech, samplerate = sf.read(os.path.join(wav_path, file))

            input_values = processor(speech, return_tensors="pt", padding="longest",
                                     sampling_rate=samplerate).input_values

            # Follow the model to whatever device it lives on instead of
            # assuming CUDA.
            input_values = input_values.to(model.device)

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            # `tokenizer` was never defined in this notebook; the processor
            # exposes batch_decode itself.
            transcription = processor.batch_decode(predicted_ids)[0].lower()

            # save
            q = {"_id": _id, "text": transcription, "metadata": {}}
            out_file.write(json.dumps(q) + "\n")

# Transcripts go into the queries directory, in a file named after the ASR model.
query_path = os.path.join(data_path, dataset_name, "queries", "{}.jsonl".format(model_name))
print(query_path)

# Both calls append to the same file; uncomment to (re)generate the transcripts.
# generate_transcripts('valid', query_path)
# generate_transcripts('train', query_path)