In [2]:
import argparse
import json
import os
import random
from argparse import Namespace
from collections import defaultdict
from itertools import chain

import ijson
import torch
from tqdm.notebook import tqdm
from transformers import GPT2Tokenizer

In [6]:
# Directory holding the raw MS MARCO `train.json` / `valid.json` files.
# Set the MSMARCO_PATH environment variable to point somewhere else; the value
# below is only the default. Later cells build file names with
# `msmarco_path + "<file>"`, so os.path.join(..., '') is used to guarantee the
# path always ends with a separator.
msmarco_path = os.path.join(
    os.environ.get('MSMARCO_PATH', '/ssd003/projects/aieng/conversational_ai/data/MSMARCO/'),
    '',
)

## Create Dataset

In [8]:
def _load_msmarco_split(file_name):
    """Read one raw MS MARCO JSON file located under `msmarco_path`.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the locale's default encoding.
    """
    with open(msmarco_path + file_name, encoding="utf-8") as f:
        return json.load(f)


train_stuff = _load_msmarco_split("train.json")
valid_stuff = _load_msmarco_split("valid.json")

In [4]:
def get_distractor(data=None, exclude=None, max_tries=1000):
    """Sample a random first answer to serve as a wrong ("distractor") candidate.

    Args:
        data: Mapping with an 'answers' entry of the form {key: [answer, ...]}.
            Defaults to the module-level `valid_stuff`, which is what a call
            with no arguments has always used.
        exclude: Optional answer string the distractor must differ from
            (typically the gold answer of the example being built).
        max_tries: Upper bound on the number of random draws.

    Returns:
        The first answer of a randomly chosen entry; never
        'No Answer Present.' and never equal to `exclude`.

    Raises:
        IndexError: if `data['answers']` has no entries (from random.choice).
        RuntimeError: if no acceptable answer turns up within `max_tries` draws.
    """
    if data is None:
        data = valid_stuff
    answers_by_key = data['answers']
    # Build the key list once per call; rejected draws are retried in a loop
    # rather than by recursion, so a long run of rejections cannot overflow
    # the call stack.
    all_keys = list(answers_by_key)
    for _ in range(max_tries):
        answers = answers_by_key[random.choice(all_keys)]
        if not answers:
            continue
        candidate = answers[0]
        if candidate == 'No Answer Present.':
            continue
        if exclude is not None and candidate == exclude:
            continue
        return candidate
    raise RuntimeError(
        'get_distractor: no usable distractor found in %d draws' % max_tries
    )
    
def create_marco_dataset(data):
    """Convert one raw MS MARCO split into a list of candidate-ranking examples.

    Args:
        data: Mapping with the entries 'answers', 'query', 'wellFormedAnswers',
            'passages' and 'query_type', each indexed by the same keys.

    Returns:
        A list of dicts, one per kept question, each with:
          - 'context': the question's entry from data['passages']
          - 'query_type': the question's entry from data['query_type']
          - 'utterances': a single-item list holding {'history': [question],
            'candidates': [wrong, wrong, gold]} (the gold answer is last).

    Questions that have several answers but no well-formed answer list are
    skipped. Indexing `[0]` on an empty answer list raises IndexError.
    """
    no_answer_marker = 'No Answer Present.'
    fallback_reply = "I'm sorry, I don't know."
    max_redraws = 20

    def draw_distractor(taken):
        # get_distractor() samples blindly, so re-draw while the sample collides
        # with a candidate already in use. The number of re-draws is bounded;
        # after that the last sample is accepted so this can never loop forever.
        distractor = get_distractor()
        for _ in range(max_redraws):
            if distractor not in taken:
                break
            distractor = get_distractor()
        return distractor

    new_data = []
    for key in tqdm(data['answers']):
        answers = data['answers'][key]
        better_answers = data['wellFormedAnswers'][key]
        has_well_formed = isinstance(better_answers, list)

        # if there's multiple answers and a person has not provided a correct good response, skip this question
        if len(answers) > 1 and not has_well_formed:
            continue

        # if there exists a well-formed answer, use that one. Otherwise just use the answers provided.
        new_answer = better_answers[0] if has_well_formed else answers[0]

        # if the model can't extract the answer, at least be nice.
        if new_answer == no_answer_marker:
            new_answer = fallback_reply

        if new_answer == fallback_reply:
            first = draw_distractor((new_answer,))
            second = draw_distractor((new_answer, first))
            candidates = [first, second, new_answer]
        else:
            candidates = [fallback_reply, draw_distractor((new_answer,)), new_answer]

        new_data.append({
            'context': data['passages'][key],
            'query_type': data['query_type'][key],
            'utterances': [{
                'history': [data['query'][key]],
                'candidates': candidates
            }],
        })
    return new_data

In [None]:
# Distractors are drawn with the `random` module, so seed it to make the
# generated files reproducible from one run to the next.
random.seed(0)

# open(..., 'w') does not create missing parent directories.
os.makedirs('data', exist_ok=True)

marco_train = create_marco_dataset(train_stuff)
with open('data/marco_train.json', 'w') as json_file:
    json.dump(marco_train, json_file, indent=2)

marco_valid = create_marco_dataset(valid_stuff)
with open('data/marco_valid.json', 'w') as json_file:
    json.dump(marco_valid, json_file, indent=2)

In [7]:
# NOTE(review): `marco_data` was not assigned in any visible cell, so this cell
# raised NameError on a fresh kernel. It is rebuilt here from the two datasets
# produced by create_marco_dataset() in the previous cell -- confirm that this
# matches what `marco_data` originally held.
marco_data = {'train': marco_train, 'valid': marco_valid}

# open(..., 'w') does not create missing parent directories.
os.makedirs('data/MSMARCO', exist_ok=True)

with open('data/MSMARCO/marco_train_data.json', 'w') as json_file:
    json.dump(marco_data['train'], json_file, indent=2)

with open('data/MSMARCO/marco_valid_data.json', 'w') as json_file:
    json.dump(marco_data['valid'], json_file, indent=2)

## Tokenize Dataset

In [7]:
def tokenize(obj, tokenizer):
    """Replace the text fields of one example with `tokenizer.encode` output, in place.

    Args:
        obj: Example dict with a 'context' list (items carrying 'passage_text')
            and an 'utterances' list (items carrying 'history' and 'candidates').
        tokenizer: Any object exposing `encode(text)`.

    Returns:
        None. `obj` itself is modified.
    """
    for passage in obj['context']:
        passage['passage_text'] = tokenizer.encode(passage['passage_text'])

    for utterance in obj['utterances']:
        # History turns lose any leading spaces, parentheses and underscores
        # before being encoded.
        utterance['history'] = [
            tokenizer.encode(turn.lstrip(' ()_')) for turn in utterance['history']
        ]
        # Candidates are overwritten slot by slot, keeping the same list object.
        candidate_list = utterance['candidates']
        for position, text in enumerate(candidate_list):
            candidate_list[position] = tokenizer.encode(text)

In [8]:
# NOTE(review): `train_data` and `tokenizer` are not assigned in any cell
# visible in this notebook -- confirm where they are created.
# Encode every training example; tokenize() rewrites each example in place.
for train_example in tqdm(train_data):
    tokenize(train_example, tokenizer)

  0%|          | 0/799698 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1137 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
# NOTE(review): `valid_data` and `tokenizer` are not assigned in any cell
# visible in this notebook -- confirm where they are created.
# Encode every validation example; tokenize() rewrites each example in place.
for valid_example in tqdm(valid_data):
    tokenize(valid_example, tokenizer)

  0%|          | 0/100661 [00:00<?, ?it/s]

In [10]:
# Write the tokenized splits to disk, train first and then valid.
tokenized_outputs = (
    ('data/MSMARCO/marco_train_tokenized.json', train_data),
    ('data/MSMARCO/marco_valid_tokenized.json', valid_data),
)
for out_path, records in tokenized_outputs:
    with open(out_path, 'w') as sink:
        json.dump(records, sink, indent=2)