In [1]:
import pandas as pd
import re
import json
import jsonlines
import gzip
from tqdm import tqdm
from transformers import BertTokenizer



## Original data transformation

###### Download NQ Train and Dev dataset from https://ai.google.com/research/NaturalQuestions/download
###### NQ Train: https://storage.cloud.google.com/natural_questions/v1.0-simplified/simplified-nq-train.jsonl.gz
###### NQ Dev: https://storage.cloud.google.com/natural_questions/v1.0-simplified/nq-dev-all.jsonl.gz

In [2]:
# Flatten the NQ dev dump into one row per example:
# query, id, long/short answer text, url, title, abstract, content, concatenated doc text, language.
nq_dev_rows = []

# Open read-only: the original "r+" mode asked for write access to the input archive.
with gzip.open("v1.0-simplified_nq-dev-all.jsonl.gz", "rb") as f:
    for item in tqdm(jsonlines.Reader(f)):
        # Annotation offsets are token indices, so keep the token list itself
        # instead of re-splitting the joined text.
        tokens = [tok['token'] for tok in item['document_tokens']]
        document_text = ' '.join(tokens)

        # Only the first annotation of each example is used.
        annotation = item['annotations'][0]

        ## long_answer
        long_span = annotation['long_answer']
        if long_span['start_token'] >= 0:
            long_answer = ' '.join(tokens[long_span['start_token']:long_span['end_token']])
            long_answer = re.sub('<[^<]+?>', '', long_answer).replace('\n', '').strip()
        else:
            long_answer = ''

        ## short_answer
        # Computed independently of the long answer; previously the token list only
        # existed when a long answer was present, so spans were cut from stale tokens.
        if annotation['short_answers']:
            spans = [
                ' '.join(tokens[span['start_token']:span['end_token']])
                for span in annotation['short_answers']
            ]
            short_answer = '|'.join(spans)
            short_answer = re.sub('<[^<]+?>', '', short_answer).replace('\n', '').strip()
        elif annotation['yes_no_answer'] != 'NONE':
            # Yes/no answers have no span; store the label rather than leaking the
            # previous example's short answer.
            short_answer = annotation['yes_no_answer']
        else:
            short_answer = ''

        ## abs: text of the first <P>...</P> element (inner markup is kept as-is)
        abs_start = document_text.find('<P>')
        abs_end = document_text.find('</P>', abs_start) if abs_start != -1 else -1
        if abs_end != -1:
            abstract = document_text[abs_start + len('<P>'):abs_end]
            content_start = abs_end + len('</P>')
        else:
            # No paragraph: the content starts at the top of the page instead of at a
            # stale offset carried over from the previous example.
            abstract = ''
            content_start = 0

        ## content: everything after the abstract, cut at the second-to-last </Ul>
        ## when there is one (presumably to drop trailing list sections -- TODO confirm)
        last_ul = document_text.rfind('</Ul>')
        if last_ul != -1:
            document_text = document_text[:last_ul]
            previous_ul = document_text.rfind('</Ul>')
            content_end = previous_ul if previous_ul != -1 else last_ul
        else:
            content_end = len(document_text)
        content = document_text[content_start:content_end]
        content = re.sub('<[^<]+?>', '', content).replace('\n', '').strip()
        content = re.sub(' +', ' ', content)

        nq_dev_rows.append({
            'query': item['question_text'],
            'id': str(item['example_id']),
            'long_answer': long_answer,
            'short_answer': short_answer,
            'url': item['document_url'],
            'title': item['document_title'],
            'abs': abstract,
            'content': content,
            'doc_tac': item['document_title'] + abstract + content,
            'language': 'en',
        })

# Name the columns explicitly: later cells index the frame by
# 'query', 'id', 'url', 'title' and 'doc_tac'.
nq_dev = pd.DataFrame(
    nq_dev_rows,
    columns=['query', 'id', 'long_answer', 'short_answer', 'url',
             'title', 'abs', 'content', 'doc_tac', 'language'],
)

7830it [02:34, 50.81it/s]


In [3]:
# Flatten the simplified NQ train dump into one row per example:
# query, id, long/short answer text, url, title, abstract, content, concatenated doc text, language.
nq_train_rows = []

# Open read-only: the original "r+" mode asked for write access to the input archive.
with gzip.open("v1.0-simplified_simplified-nq-train.jsonl.gz", "rb") as f:
    for item in tqdm(jsonlines.Reader(f)):
        document_text = item['document_text']
        # Annotation offsets are indices into the space-separated token stream.
        # Split once per example so both answer types slice the current page.
        tokens = document_text.split(' ')

        # Only the first annotation of each example is used.
        annotation = item['annotations'][0]

        ## long_answer
        long_span = annotation['long_answer']
        if long_span['start_token'] >= 0:
            long_answer = ' '.join(tokens[long_span['start_token']:long_span['end_token']])
            long_answer = re.sub('<[^<]+?>', '', long_answer).replace('\n', '').strip()
        else:
            long_answer = ''

        ## short_answer
        # Computed independently of the long answer; previously the token list only
        # existed when a long answer was present, so spans were cut from stale tokens.
        if annotation['short_answers']:
            spans = [
                ' '.join(tokens[span['start_token']:span['end_token']])
                for span in annotation['short_answers']
            ]
            short_answer = '|'.join(spans)
            short_answer = re.sub('<[^<]+?>', '', short_answer).replace('\n', '').strip()
        elif annotation['yes_no_answer'] != 'NONE':
            # Yes/no answers have no span; store the label rather than leaking the
            # previous example's short answer.
            short_answer = annotation['yes_no_answer']
        else:
            short_answer = ''

        ## title: text of the first <H1>...</H1> element, '' when absent or unclosed
        title_start = document_text.find('<H1>')
        title_end = document_text.find('</H1>', title_start) if title_start != -1 else -1
        if title_end != -1:
            title = document_text[title_start + len('<H1>'):title_end]
        else:
            title = ''

        ## abs: text of the first <P>...</P> element (inner markup is kept as-is)
        abs_start = document_text.find('<P>')
        abs_end = document_text.find('</P>', abs_start) if abs_start != -1 else -1
        if abs_end != -1:
            abstract = document_text[abs_start + len('<P>'):abs_end]
            content_start = abs_end + len('</P>')
        else:
            # No paragraph: the content starts at the top of the page instead of at a
            # stale offset carried over from the previous example.
            abstract = ''
            content_start = 0

        ## content: everything after the abstract, cut at the second-to-last </Ul>
        ## when there is one (presumably to drop trailing list sections -- TODO confirm)
        last_ul = document_text.rfind('</Ul>')
        if last_ul != -1:
            document_text = document_text[:last_ul]
            previous_ul = document_text.rfind('</Ul>')
            content_end = previous_ul if previous_ul != -1 else last_ul
        else:
            content_end = len(document_text)
        content = document_text[content_start:content_end]
        content = re.sub('<[^<]+?>', '', content).replace('\n', '').strip()
        content = re.sub(' +', ' ', content)

        nq_train_rows.append({
            'query': item['question_text'],
            'id': str(item['example_id']),
            'long_answer': long_answer,
            'short_answer': short_answer,
            'url': item['document_url'],
            'title': title,
            'abs': abstract,
            'content': content,
            'doc_tac': title + abstract + content,
            'language': 'en',
        })

# Name the columns explicitly: later cells index the frame by
# 'query', 'id', 'url', 'title' and 'doc_tac'.
nq_train = pd.DataFrame(
    nq_train_rows,
    columns=['query', 'id', 'long_answer', 'short_answer', 'url',
             'title', 'abs', 'content', 'doc_tac', 'language'],
)

307373it [19:11, 266.87it/s]


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def lower(x: str) -> str:
    """Normalize a title by round-tripping it through the BERT tokenizer.

    The text is tokenized, mapped to vocabulary ids and decoded back to a string,
    so titles that differ only in ways the uncased tokenizer ignores end up equal.
    """
    text = tokenizer.tokenize(x)
    id_ = tokenizer.convert_tokens_to_ids(text)
    return tokenizer.decode(id_)


# Titles are normalized in place because later cells look documents up by the
# normalized title of each query row.
nq_dev['title'] = nq_dev['title'].map(lower)
nq_train['title'] = nq_train['title'].map(lower)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# One document per distinct title is kept (first occurrence, train rows first).
# The two reset_index() calls mirror the original cell: the first stores the
# per-split row number in an 'index' column, the second stores the position in
# the concatenated frame in 'level_0' and leaves a clean 0..n-1 index.
nq_all_doc = (
    pd.concat([nq_train, nq_dev])
    .reset_index()
    .drop_duplicates('title')
    .reset_index()
)
len(nq_all_doc)

109739

In [7]:
# Lookup tables over the de-duplicated corpus. Each row of nq_all_doc gets a
# sequential integer doc id (its row position).
title_doc = {}       # title  -> document text
title_doc_id = {}    # title  -> doc id
id_doc = {}          # doc id -> document text
id_url = {}          # doc id -> source url
id_title = {}        # doc id -> title
ran_id_old_id = {}   # doc id -> value of the 'id' column for that row

idx = 0
corpus_rows = zip(
    nq_all_doc['title'],
    nq_all_doc['doc_tac'],
    nq_all_doc['url'],
    nq_all_doc['id'],
)
# Walk the four columns in lockstep, in frame order.
for doc_title, doc_text, doc_url, old_id in corpus_rows:
    title_doc[doc_title] = doc_text
    title_doc_id[doc_title] = idx
    id_url[idx] = doc_url
    id_doc[idx] = doc_text
    id_title[idx] = doc_title
    ran_id_old_id[idx] = old_id
    idx += 1

In [8]:
# Dump the corpus as JSON lines: one object per document, in doc-id insertion order.
with open('dataset/corpus.json', 'w') as fw:
    for docid, doc_text in id_doc.items():
        record = {
            "docid": str(docid),
            # Every literal "&amp" is removed from the url.
            "url": id_url[docid].replace("&amp", ""),
            "title": id_title[docid],
            # Body is capped at the first 512 whitespace-separated tokens.
            "body": " ".join(doc_text.split()[:512]),
        }
        fw.write(json.dumps(record) + '\n')

In [22]:
print(len(nq_train))
print(len(nq_dev))

# One (frame, qrels file, queries file) triple per split; train is written before dev.
splits = [
    (nq_train, "dataset/nq-doctrain-qrels.tsv", "dataset/nq-doctrain-queries.tsv"),
    (nq_dev, "dataset/nq-docdev-qrels.tsv", "dataset/nq-docdev-queries.tsv"),
]

for frame, qrels_path, queries_path in splits:
    with open(qrels_path, "w") as fw1, open(queries_path, "w") as fw2:
        for i in tqdm(range(len(frame))):
            # The query id is the row number within its split.
            qid = str(i)
            # The relevant document is the corpus entry sharing the row's title.
            doc_id = str(title_doc_id[frame['title'][i]])
            # queries file: "<qid>\t<query text>"
            fw2.write("\t".join((qid, frame['query'][i])) + "\n")
            # qrels file: "<qid>\t0\t<doc id>\t1"
            fw1.write("\t".join((qid, "0", doc_id, "1")) + "\n")


307373
7830


100%|██████████| 307373/307373 [00:04<00:00, 73133.60it/s]
100%|██████████| 7830/7830 [00:00<00:00, 78363.22it/s]
