# code to create HotpotQA_cite

In [None]:
import json
import re

import pandas as pd
import torch
from transformers import pipeline
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# train
# JSON text is expected to be UTF-8; the encoding is passed explicitly so that
# loading does not depend on the platform's default locale encoding.
with open('datasets/hotpot_train_v1.1.json', encoding='utf-8') as f:
    hotpotqa_train = json.load(f)

# val
# NOTE: this dev set is split into two halves (val and test) further below.
with open('datasets/hotpot_dev_distractor_v1.json', encoding='utf-8') as f:
    hotpotqa_val = json.load(f)

In [3]:
# train
# Flatten every raw training entry into parallel lists: question, gold answer,
# the full context with "Document N:" labels, the supporting ("oracle")
# documents, the distractor documents and a gold answer that cites the
# supporting documents.
train_questions = []
train_original_contexts = []
train_original_answers = []
train_cited_original_answers = []
train_title_to_contexts = []
train_cited_oracle_contexts = []
train_number_of_documents_list = []
train_distractor_contexts = []

for entry in hotpotqa_train:
    # extract
    question = entry['question']
    original_answer = entry['answer']
    context = entry['context']  # each item is indexed as [title, list of sentences]

    title_to_citation_dict = {}  # title -> 'Document N' label
    title_to_position = {}       # title -> position of the document in `context`
    title_to_context = {}        # title -> paragraph text
    titles = []
    cited_paragraphs = []
    for idx, sentences_list in enumerate(context):
        doc_id = f'Document {idx+1}'
        document_title = sentences_list[0]
        paragraph_text = ' '.join(sentences_list[1])
        titles.append(document_title)
        title_to_citation_dict[document_title] = doc_id
        title_to_position[document_title] = idx
        title_to_context[document_title] = paragraph_text
        cited_paragraphs.append(f'{doc_id}: {paragraph_text}')
    # Every labelled paragraph is followed by one space (including the last one).
    full_cited_context = ''.join(f'{paragraph} ' for paragraph in cited_paragraphs)

    relevant_title_set = {fact[0] for fact in entry['supporting_facts']}
    # Order the supporting titles by document position.  Iterating the set
    # directly would give an order that changes between interpreter runs, and
    # sorting the 'Document N' strings would put 'Document 10' before
    # 'Document 2'.  A supporting title missing from the context still raises
    # KeyError here.
    relevant_titles = sorted(relevant_title_set, key=lambda title: title_to_position[title])
    irrelevant_titles = [title for title in titles if title not in relevant_title_set]

    # Ascending document number, because relevant_titles is in document order.
    relevant_citations = [title_to_citation_dict[title] for title in relevant_titles]
    relevant_cited_context = " ".join(
        f'{title_to_citation_dict[title]}: {title_to_context[title]}' for title in relevant_titles
    )
    irrelevant_cited_context = " ".join(
        f'{title_to_citation_dict[title]}: {title_to_context[title]}' for title in irrelevant_titles
    )

    cited_original_answer = f'According to Documents {relevant_citations} the answer is {original_answer}'
    number_of_documents = len(context)

    # append
    train_questions.append(question)
    train_original_contexts.append(full_cited_context)
    train_original_answers.append(original_answer)
    train_cited_original_answers.append(cited_original_answer)
    train_title_to_contexts.append(title_to_context)
    train_cited_oracle_contexts.append(relevant_cited_context)
    train_number_of_documents_list.append(number_of_documents)
    train_distractor_contexts.append(irrelevant_cited_context)

In [5]:
# val
# Flatten every raw dev entry into parallel lists: question, gold answer, the
# full context with "Document N:" labels, the supporting ("oracle") documents,
# the distractor documents and a gold answer that cites the supporting
# documents.
val_questions = []
val_original_contexts = []
val_original_answers = []
val_cited_original_answers = []
val_title_to_contexts = []
val_cited_oracle_contexts = []
val_number_of_documents_list = []
val_distractor_contexts = []

for entry in hotpotqa_val:
    # extract
    question = entry['question']
    original_answer = entry['answer']
    context = entry['context']  # each item is indexed as [title, list of sentences]

    title_to_citation_dict = {}  # title -> 'Document N' label
    title_to_position = {}       # title -> position of the document in `context`
    title_to_context = {}        # title -> paragraph text
    titles = []
    cited_paragraphs = []
    for idx, sentences_list in enumerate(context):
        doc_id = f'Document {idx+1}'
        document_title = sentences_list[0]
        paragraph_text = ' '.join(sentences_list[1])
        titles.append(document_title)
        title_to_citation_dict[document_title] = doc_id
        title_to_position[document_title] = idx
        title_to_context[document_title] = paragraph_text
        cited_paragraphs.append(f'{doc_id}: {paragraph_text}')
    # Every labelled paragraph is followed by one space (including the last one).
    full_cited_context = ''.join(f'{paragraph} ' for paragraph in cited_paragraphs)

    relevant_title_set = {fact[0] for fact in entry['supporting_facts']}
    # Order the supporting titles by document position.  Iterating the set
    # directly would give an order that changes between interpreter runs, and
    # sorting the 'Document N' strings would put 'Document 10' before
    # 'Document 2'.  A supporting title missing from the context still raises
    # KeyError here.
    relevant_titles = sorted(relevant_title_set, key=lambda title: title_to_position[title])
    irrelevant_titles = [title for title in titles if title not in relevant_title_set]

    # Ascending document number, because relevant_titles is in document order.
    relevant_citations = [title_to_citation_dict[title] for title in relevant_titles]
    relevant_cited_context = " ".join(
        f'{title_to_citation_dict[title]}: {title_to_context[title]}' for title in relevant_titles
    )
    irrelevant_cited_context = " ".join(
        f'{title_to_citation_dict[title]}: {title_to_context[title]}' for title in irrelevant_titles
    )

    cited_original_answer = f'According to Documents {relevant_citations} the answer is {original_answer}'
    number_of_documents = len(context)

    # append
    val_questions.append(question)
    val_original_contexts.append(full_cited_context)
    val_original_answers.append(original_answer)
    val_cited_original_answers.append(cited_original_answer)
    val_title_to_contexts.append(title_to_context)
    val_cited_oracle_contexts.append(relevant_cited_context)
    val_number_of_documents_list.append(number_of_documents)
    val_distractor_contexts.append(irrelevant_cited_context)

In [6]:
# Gather the per-entry training lists into one table, one column per list.
train_df = pd.DataFrame(data=dict(
    questions=train_questions,
    original_contexts=train_original_contexts,
    original_answers=train_original_answers,
    cited_original_answers=train_cited_original_answers,
    title_to_contexts=train_title_to_contexts,
    cited_oracle_contexts=train_cited_oracle_contexts,
    number_of_documents_list=train_number_of_documents_list,
    distractor_contexts=train_distractor_contexts,
))

In [7]:
# Gather the per-entry validation lists into one table, one column per list.
val_df = pd.DataFrame(data=dict(
    questions=val_questions,
    original_contexts=val_original_contexts,
    original_answers=val_original_answers,
    cited_original_answers=val_cited_original_answers,
    title_to_contexts=val_title_to_contexts,
    cited_oracle_contexts=val_cited_oracle_contexts,
    number_of_documents_list=val_number_of_documents_list,
    distractor_contexts=val_distractor_contexts,
))

In [None]:
# Display the validation frame as the cell output (inspection only).
val_df

# Create conflicting information using a masked language model (MLM)

In [None]:
# to create conflicting answers use RoBERTa (distilroberta-base)
# The checkpoint is named explicitly so the run does not depend on whichever
# default model the installed transformers version selects for "fill-mask".
# top_k=5 gives up to 5 candidate strings per mask; some of them may equal the
# existing answer and are filtered out later.
classifier = pipeline("fill-mask", model="distilroberta-base", top_k=5, device=device)

In [None]:
# train
# For every training entry: mask the gold answer inside its oracle context,
# let the fill-mask model propose substitutes, and use the first two distinct
# substitutes as "conflicting" answers.  Entries that cannot be processed get
# 'NA' in all six lists, so the lists stay index-aligned with train_questions.
train_conflicting_contexts_1 = []
train_conflicting_answers_1 = []
train_conflicting_contexts_2 = []
train_conflicting_answers_2 = []
train_cited_conflicting_answers_1 = []
train_cited_conflicting_answers_2 = []

for entry_idx in range(len(train_original_contexts)):
    if entry_idx % 100 == 0:
        print(f'entry_idx: {entry_idx} out of {len(train_original_contexts)}')

    original_answer = train_original_answers[entry_idx]
    original_context = train_cited_oracle_contexts[entry_idx]
    cited_original_answer = train_cited_original_answers[entry_idx]

    fill_mask_answers = []
    # An empty answer would be "found" at every position by str.replace, so it
    # is treated like an answer that does not appear in the context.
    if original_answer and original_answer in original_context:
        # mask -- only the first occurrence, because the fill-mask pipeline
        # returns a different result layout when there are several masks
        masked_context = original_context.replace(original_answer, '<mask>', 1)
        try:
            # predict
            fill_mask_result = classifier(masked_context) # Note that token_str may contain spaces
        except (RuntimeError, IndexError): # context is too long
            fill_mask_result = []
        # extract -- strip the spaces the pipeline may add, and drop empty
        # candidates, repeats and every candidate equal to the gold answer
        for prediction in fill_mask_result:
            candidate = prediction['token_str'].strip()
            if candidate and candidate != original_answer and candidate not in fill_mask_answers:
                fill_mask_answers.append(candidate)

    if len(fill_mask_answers) < 2:
        # no usable pair of conflicting answers (still need to append something)
        for target in (
            train_conflicting_contexts_1, train_conflicting_answers_1,
            train_conflicting_contexts_2, train_conflicting_answers_2,
            train_cited_conflicting_answers_1, train_cited_conflicting_answers_2,
        ):
            target.append('NA')
        continue

    # The cited answer is expected to end with the gold answer.  Swapping only
    # that tail keeps the "According to Documents [...]" prefix intact even when
    # the answer text (e.g. a bare digit) also occurs inside the prefix.
    if cited_original_answer.endswith(original_answer):
        cited_prefix = cited_original_answer[:len(cited_original_answer) - len(original_answer)]
    else:
        cited_prefix = None

    # NOTE(review): the context replacement below still rewrites matches inside
    # the 'Document N:' labels when the answer is e.g. a bare digit.
    outputs = (
        (train_conflicting_contexts_1, train_conflicting_answers_1, train_cited_conflicting_answers_1),
        (train_conflicting_contexts_2, train_conflicting_answers_2, train_cited_conflicting_answers_2),
    )
    # only the first 2 answers are used, for 2 conflicting contexts
    for (contexts_out, answers_out, cited_out), candidate in zip(outputs, fill_mask_answers):
        if cited_prefix is None:
            cited_conflicting_answer = cited_original_answer.replace(original_answer, candidate)
        else:
            cited_conflicting_answer = cited_prefix + candidate
        # collapse the double spaces the substitution may leave behind
        contexts_out.append(original_context.replace(original_answer, candidate).replace('  ', ' '))
        answers_out.append(candidate)
        cited_out.append(cited_conflicting_answer.replace('  ', ' '))

In [None]:
# val
# For every validation entry: mask the gold answer inside its oracle context,
# let the fill-mask model propose substitutes, and use the first two distinct
# substitutes as "conflicting" answers.  Entries that cannot be processed get
# 'NA' in all six lists, so the lists stay index-aligned with val_questions.
val_conflicting_contexts_1 = []
val_conflicting_answers_1 = []
val_conflicting_contexts_2 = []
val_conflicting_answers_2 = []
val_cited_conflicting_answers_1 = []
val_cited_conflicting_answers_2 = []

for entry_idx in range(len(val_original_contexts)):
    if entry_idx % 100 == 0:
        print(f'entry_idx: {entry_idx} out of {len(val_original_contexts)}')

    original_answer = val_original_answers[entry_idx]
    original_context = val_cited_oracle_contexts[entry_idx]
    cited_original_answer = val_cited_original_answers[entry_idx]

    fill_mask_answers = []
    # An empty answer would be "found" at every position by str.replace, so it
    # is treated like an answer that does not appear in the context.
    if original_answer and original_answer in original_context:
        # mask -- only the first occurrence, because the fill-mask pipeline
        # returns a different result layout when there are several masks
        masked_context = original_context.replace(original_answer, '<mask>', 1)
        try:
            # predict
            fill_mask_result = classifier(masked_context) # Note that token_str may contain spaces
        except (RuntimeError, IndexError): # context is too long
            fill_mask_result = []
        # extract -- strip the spaces the pipeline may add, and drop empty
        # candidates, repeats and every candidate equal to the gold answer
        for prediction in fill_mask_result:
            candidate = prediction['token_str'].strip()
            if candidate and candidate != original_answer and candidate not in fill_mask_answers:
                fill_mask_answers.append(candidate)

    if len(fill_mask_answers) < 2:
        # no usable pair of conflicting answers (still need to append something)
        for target in (
            val_conflicting_contexts_1, val_conflicting_answers_1,
            val_conflicting_contexts_2, val_conflicting_answers_2,
            val_cited_conflicting_answers_1, val_cited_conflicting_answers_2,
        ):
            target.append('NA')
        continue

    # The cited answer is expected to end with the gold answer.  Swapping only
    # that tail keeps the "According to Documents [...]" prefix intact even when
    # the answer text (e.g. a bare digit) also occurs inside the prefix.
    if cited_original_answer.endswith(original_answer):
        cited_prefix = cited_original_answer[:len(cited_original_answer) - len(original_answer)]
    else:
        cited_prefix = None

    # NOTE(review): the context replacement below still rewrites matches inside
    # the 'Document N:' labels when the answer is e.g. a bare digit.
    outputs = (
        (val_conflicting_contexts_1, val_conflicting_answers_1, val_cited_conflicting_answers_1),
        (val_conflicting_contexts_2, val_conflicting_answers_2, val_cited_conflicting_answers_2),
    )
    # only the first 2 answers are used, for 2 conflicting contexts
    for (contexts_out, answers_out, cited_out), candidate in zip(outputs, fill_mask_answers):
        if cited_prefix is None:
            cited_conflicting_answer = cited_original_answer.replace(original_answer, candidate)
        else:
            cited_conflicting_answer = cited_prefix + candidate
        # collapse the double spaces the substitution may leave behind
        contexts_out.append(original_context.replace(original_answer, candidate).replace('  ', ' '))
        answers_out.append(candidate)
        cited_out.append(cited_conflicting_answer.replace('  ', ' '))

In [None]:
# Append each entry's distractor documents to its conflicting context,
# separated by a single space.

# train
train_full_conflicting_contexts_1 = [
    '{} {}'.format(conflicting, distractors)
    for conflicting, distractors in zip(train_conflicting_contexts_1, train_distractor_contexts)
]
train_full_conflicting_contexts_2 = [
    '{} {}'.format(conflicting, distractors)
    for conflicting, distractors in zip(train_conflicting_contexts_2, train_distractor_contexts)
]

# val
val_full_conflicting_contexts_1 = [
    '{} {}'.format(conflicting, distractors)
    for conflicting, distractors in zip(val_conflicting_contexts_1, val_distractor_contexts)
]
val_full_conflicting_contexts_2 = [
    '{} {}'.format(conflicting, distractors)
    for conflicting, distractors in zip(val_conflicting_contexts_2, val_distractor_contexts)
]

In [None]:
# Rebuild the training table, now including the conflicting contexts/answers
# and the conflicting contexts with distractors appended.
train_df = pd.DataFrame(data=dict(
    questions=train_questions,
    original_contexts=train_original_contexts,
    original_answers=train_original_answers,
    conflicting_contexts_1=train_conflicting_contexts_1,
    conflicting_answers_1=train_conflicting_answers_1,
    conflicting_contexts_2=train_conflicting_contexts_2,
    conflicting_answers_2=train_conflicting_answers_2,
    cited_original_answers=train_cited_original_answers,
    cited_conflicting_answers_1=train_cited_conflicting_answers_1,
    cited_conflicting_answers_2=train_cited_conflicting_answers_2,
    title_to_contexts=train_title_to_contexts,
    cited_oracle_contexts=train_cited_oracle_contexts,
    distractor_contexts=train_distractor_contexts,
    number_of_documents_list=train_number_of_documents_list,
    full_conflicting_contexts_1=train_full_conflicting_contexts_1,
    full_conflicting_contexts_2=train_full_conflicting_contexts_2,
))

In [None]:
# Rebuild the validation table, now including the conflicting contexts/answers
# and the conflicting contexts with distractors appended.
val_df = pd.DataFrame(data=dict(
    questions=val_questions,
    original_contexts=val_original_contexts,
    original_answers=val_original_answers,
    conflicting_contexts_1=val_conflicting_contexts_1,
    conflicting_answers_1=val_conflicting_answers_1,
    conflicting_contexts_2=val_conflicting_contexts_2,
    conflicting_answers_2=val_conflicting_answers_2,
    cited_original_answers=val_cited_original_answers,
    cited_conflicting_answers_1=val_cited_conflicting_answers_1,
    cited_conflicting_answers_2=val_cited_conflicting_answers_2,
    title_to_contexts=val_title_to_contexts,
    cited_oracle_contexts=val_cited_oracle_contexts,
    distractor_contexts=val_distractor_contexts,
    number_of_documents_list=val_number_of_documents_list,
    full_conflicting_contexts_1=val_full_conflicting_contexts_1,
    full_conflicting_contexts_2=val_full_conflicting_contexts_2,
))

In [None]:
# Keep only the rows whose first conflicting context is not the 'NA'
# placeholder, and renumber the remaining rows from 0.
clean_train_df = train_df.loc[train_df['conflicting_contexts_1'] != 'NA'].reset_index(drop=True)
clean_train_df

In [None]:
# Keep only the rows whose first conflicting context is not the 'NA'
# placeholder, and renumber the remaining rows from 0.
clean_val_df = val_df.loc[val_df['conflicting_contexts_1'] != 'NA'].reset_index(drop=True)
clean_val_df

In [None]:
# Persist the cleaned training split as a {column: list of values} mapping.
train_dict = clean_train_df.to_dict(orient='list')
with open('datasets/hotpotqa_train.json', 'w') as fp:
    json.dump(train_dict, fp)

# The cleaned dev set is cut at its midpoint: first half -> val, rest -> test.
midpoint = len(clean_val_df) // 2
val_merged_df_1 = clean_val_df.iloc[:midpoint]
val_merged_df_2 = clean_val_df.iloc[midpoint:]

val_dict = val_merged_df_1.to_dict(orient='list')
test_dict = val_merged_df_2.to_dict(orient='list')

for path, payload in (
    ('datasets/hotpotqa_val.json', val_dict),
    ('datasets/hotpotqa_test.json', test_dict),
):
    with open(path, 'w') as fp:
        json.dump(payload, fp)

In [None]:
# load
# Read the three intermediate splits back.  This rebinds hotpotqa_train and
# hotpotqa_val to the saved {column: list} dictionaries.
with open('datasets/hotpotqa_train.json') as fp:
    hotpotqa_train = json.loads(fp.read())
with open('datasets/hotpotqa_val.json') as fp:
    hotpotqa_val = json.loads(fp.read())
with open('datasets/hotpotqa_test.json') as fp:
    hotpotqa_test = json.loads(fp.read())

# Optional size checks:
# print(len(hotpotqa_train['questions']))
# print(len(hotpotqa_val['questions']))
# print(len(hotpotqa_test['questions']))

# organize the final dataset by combining all the conflicting contexts and answers

In [None]:
# train
# Give the documents cited by the two conflicting variants fresh ids that
# continue after the ids of the original context (number_of_documents + 1,
# + 2, ...), in both the cited answers and the conflicting contexts.  Then
# concatenate the original context with both renumbered conflicting contexts.
train_new_cited_conflicting_answers_1 = []
train_new_conflicting_contexts_1 = []
train_new_cited_conflicting_answers_2 = []
train_new_conflicting_contexts_2 = []

# (cited-answer column, context column, output answers, output contexts)
train_variants = (
    ('cited_conflicting_answers_1', 'conflicting_contexts_1',
     train_new_cited_conflicting_answers_1, train_new_conflicting_contexts_1),
    ('cited_conflicting_answers_2', 'conflicting_contexts_2',
     train_new_cited_conflicting_answers_2, train_new_conflicting_contexts_2),
)

for entry_idx, number_of_documents in enumerate(hotpotqa_train['number_of_documents_list']):
    counter = 1 # by how much to increase the document citation count (shared by both variants)

    for answers_key, contexts_key, new_answers, new_contexts in train_variants:
        cited_answer = hotpotqa_train[answers_key][entry_idx]
        conflicting_context = hotpotqa_train[contexts_key][entry_idx]

        # The citations are the quote-delimited pieces that contain 'Document '.
        old_to_new = {}
        for possible_doc_string in cited_answer.split("'"):
            if 'Document ' in possible_doc_string:
                old_to_new.setdefault(possible_doc_string, f'Document {counter + number_of_documents}')
                counter += 1 # increase citation counter

        if old_to_new:
            # Replace all ids in one pass, longest id first, and refuse a match
            # that is directly followed by another digit.  Chained str.replace
            # calls would turn 'Document 10' into 'Document 110' while
            # renumbering 'Document 1' to 'Document 11'.
            citation_re = re.compile(
                '(?:'
                + '|'.join(re.escape(old) for old in sorted(old_to_new, key=len, reverse=True))
                + r')(?!\d)'
            )
            renumber = lambda match, mapping=old_to_new: mapping[match.group(0)]
            cited_answer = citation_re.sub(renumber, cited_answer)
            conflicting_context = citation_re.sub(renumber, conflicting_context)

        new_answers.append(cited_answer)
        new_contexts.append(conflicting_context)

train_cited_context = [
    f'{original} {conflicting_1} {conflicting_2}'
    for original, conflicting_1, conflicting_2 in zip(
        hotpotqa_train['original_contexts'],
        train_new_conflicting_contexts_1,
        train_new_conflicting_contexts_2,
    )
]

In [None]:
# val
# Give the documents cited by the two conflicting variants fresh ids that
# continue after the ids of the original context (number_of_documents + 1,
# + 2, ...), in both the cited answers and the conflicting contexts.  Then
# concatenate the original context with both renumbered conflicting contexts.
val_new_cited_conflicting_answers_1 = []
val_new_conflicting_contexts_1 = []
val_new_cited_conflicting_answers_2 = []
val_new_conflicting_contexts_2 = []

# (cited-answer column, context column, output answers, output contexts)
val_variants = (
    ('cited_conflicting_answers_1', 'conflicting_contexts_1',
     val_new_cited_conflicting_answers_1, val_new_conflicting_contexts_1),
    ('cited_conflicting_answers_2', 'conflicting_contexts_2',
     val_new_cited_conflicting_answers_2, val_new_conflicting_contexts_2),
)

for entry_idx, number_of_documents in enumerate(hotpotqa_val['number_of_documents_list']):
    counter = 1 # by how much to increase the document citation count (shared by both variants)

    for answers_key, contexts_key, new_answers, new_contexts in val_variants:
        cited_answer = hotpotqa_val[answers_key][entry_idx]
        conflicting_context = hotpotqa_val[contexts_key][entry_idx]

        # The citations are the quote-delimited pieces that contain 'Document '.
        old_to_new = {}
        for possible_doc_string in cited_answer.split("'"):
            if 'Document ' in possible_doc_string:
                old_to_new.setdefault(possible_doc_string, f'Document {counter + number_of_documents}')
                counter += 1 # increase citation counter

        if old_to_new:
            # Replace all ids in one pass, longest id first, and refuse a match
            # that is directly followed by another digit.  Chained str.replace
            # calls would turn 'Document 10' into 'Document 110' while
            # renumbering 'Document 1' to 'Document 11'.
            citation_re = re.compile(
                '(?:'
                + '|'.join(re.escape(old) for old in sorted(old_to_new, key=len, reverse=True))
                + r')(?!\d)'
            )
            renumber = lambda match, mapping=old_to_new: mapping[match.group(0)]
            cited_answer = citation_re.sub(renumber, cited_answer)
            conflicting_context = citation_re.sub(renumber, conflicting_context)

        new_answers.append(cited_answer)
        new_contexts.append(conflicting_context)

val_cited_context = [
    f'{original} {conflicting_1} {conflicting_2}'
    for original, conflicting_1, conflicting_2 in zip(
        hotpotqa_val['original_contexts'],
        val_new_conflicting_contexts_1,
        val_new_conflicting_contexts_2,
    )
]

In [None]:
# test
# Give the documents cited by the two conflicting variants fresh ids that
# continue after the ids of the original context (number_of_documents + 1,
# + 2, ...), in both the cited answers and the conflicting contexts.  Then
# concatenate the original context with both renumbered conflicting contexts.
test_new_cited_conflicting_answers_1 = []
test_new_conflicting_contexts_1 = []
test_new_cited_conflicting_answers_2 = []
test_new_conflicting_contexts_2 = []

# (cited-answer column, context column, output answers, output contexts)
test_variants = (
    ('cited_conflicting_answers_1', 'conflicting_contexts_1',
     test_new_cited_conflicting_answers_1, test_new_conflicting_contexts_1),
    ('cited_conflicting_answers_2', 'conflicting_contexts_2',
     test_new_cited_conflicting_answers_2, test_new_conflicting_contexts_2),
)

for entry_idx, number_of_documents in enumerate(hotpotqa_test['number_of_documents_list']):
    counter = 1 # by how much to increase the document citation count (shared by both variants)

    for answers_key, contexts_key, new_answers, new_contexts in test_variants:
        cited_answer = hotpotqa_test[answers_key][entry_idx]
        conflicting_context = hotpotqa_test[contexts_key][entry_idx]

        # The citations are the quote-delimited pieces that contain 'Document '.
        old_to_new = {}
        for possible_doc_string in cited_answer.split("'"):
            if 'Document ' in possible_doc_string:
                old_to_new.setdefault(possible_doc_string, f'Document {counter + number_of_documents}')
                counter += 1 # increase citation counter

        if old_to_new:
            # Replace all ids in one pass, longest id first, and refuse a match
            # that is directly followed by another digit.  Chained str.replace
            # calls would turn 'Document 10' into 'Document 110' while
            # renumbering 'Document 1' to 'Document 11'.
            citation_re = re.compile(
                '(?:'
                + '|'.join(re.escape(old) for old in sorted(old_to_new, key=len, reverse=True))
                + r')(?!\d)'
            )
            renumber = lambda match, mapping=old_to_new: mapping[match.group(0)]
            cited_answer = citation_re.sub(renumber, cited_answer)
            conflicting_context = citation_re.sub(renumber, conflicting_context)

        new_answers.append(cited_answer)
        new_contexts.append(conflicting_context)

test_cited_context = [
    f'{original} {conflicting_1} {conflicting_2}'
    for original, conflicting_1, conflicting_2 in zip(
        hotpotqa_test['original_contexts'],
        test_new_conflicting_contexts_1,
        test_new_conflicting_contexts_2,
    )
]

In [None]:
# Columns copied under the same name from an intermediate split dictionary,
# in the order they appear in the final tables.
_CARRIED_OVER_COLUMNS = (
    'conflicting_contexts_1',
    'conflicting_answers_1',
    'conflicting_contexts_2',
    'conflicting_answers_2',
    'cited_original_answers',
    'cited_conflicting_answers_1',
    'cited_conflicting_answers_2',
    'title_to_contexts',
    'cited_oracle_contexts',
    'distractor_contexts',
    'number_of_documents_list',
    'full_conflicting_contexts_1',
    'full_conflicting_contexts_2',
)


def _assemble_final_frame(split, cited_context, new_contexts_1, new_contexts_2,
                          new_cited_answers_1, new_cited_answers_2):
    """Build the final table for one split.

    `split` is the loaded {column: list} dictionary.  Its first three columns
    are stored under singular names, the carried-over columns keep their
    names, and the combined context plus the renumbered conflicting
    contexts/answers are appended last.
    """
    columns = {
        'question': split['questions'],
        'original_context': split['original_contexts'],
        'original_answer': split['original_answers'],
    }
    for column_name in _CARRIED_OVER_COLUMNS:
        columns[column_name] = split[column_name]
    columns['cited_context'] = cited_context
    columns['new_conflicting_contexts_1'] = new_contexts_1
    columns['new_conflicting_contexts_2'] = new_contexts_2
    columns['new_cited_conflicting_answers_1'] = new_cited_answers_1
    columns['new_cited_conflicting_answers_2'] = new_cited_answers_2
    return pd.DataFrame(columns)


# train
train_df = _assemble_final_frame(
    hotpotqa_train,
    train_cited_context,
    train_new_conflicting_contexts_1,
    train_new_conflicting_contexts_2,
    train_new_cited_conflicting_answers_1,
    train_new_cited_conflicting_answers_2,
)

# val
val_df = _assemble_final_frame(
    hotpotqa_val,
    val_cited_context,
    val_new_conflicting_contexts_1,
    val_new_conflicting_contexts_2,
    val_new_cited_conflicting_answers_1,
    val_new_cited_conflicting_answers_2,
)

# test
test_df = _assemble_final_frame(
    hotpotqa_test,
    test_cited_context,
    test_new_conflicting_contexts_1,
    test_new_conflicting_contexts_2,
    test_new_cited_conflicting_answers_1,
    test_new_cited_conflicting_answers_2,
)


In [None]:
# Print the number of rows in the final training frame; the val/test checks
# are left commented out.
print(len(train_df))
# print(len(val_df))
# print(len(test_df))

In [None]:
# Preview the first rows of the final test frame (inspection only).
test_df.head()

In [None]:
# Split the combined context of the test row labelled 1 at every 'Document '
# marker so the document numbering can be checked by eye.
test_df['cited_context'][1].split('Document ')

In [None]:
# Print both renumbered conflicting answers and the cited gold answer of the
# test row labelled 1, one per line.
for column_name in (
    'new_cited_conflicting_answers_1',
    'new_cited_conflicting_answers_2',
    'cited_original_answers',
):
    print(test_df[column_name][1])

In [None]:
# save final datasets
# Each frame becomes a {column: list of values} dictionary ...
train_dict = train_df.to_dict(orient='list')
val_dict = val_df.to_dict(orient='list')
test_dict = test_df.to_dict(orient='list')

# ... and is written to its own JSON file.
for path, payload in (
    ('datasets/hotpotqa_train_final.json', train_dict),
    ('datasets/hotpotqa_val_final.json', val_dict),
    ('datasets/hotpotqa_test_final.json', test_dict),
):
    with open(path, 'w') as fp:
        json.dump(payload, fp)

In [None]:
# load
def _load_split(path):
    """Return the parsed JSON content of the file at `path`."""
    with open(path, 'r') as fp:
        return json.load(fp)


hotpotqa_train = _load_split('datasets/hotpotqa_train_final.json')
hotpotqa_val = _load_split('datasets/hotpotqa_val_final.json')
hotpotqa_test = _load_split('datasets/hotpotqa_test_final.json')

# Number of entries in the val and test splits (the train count is left disabled).
# print(len(hotpotqa_train['question']))
for split in (hotpotqa_val, hotpotqa_test):
    print(len(split['question']))

In [None]:
# pd.DataFrame(hotpotqa_train).head()
# Dump entry 2 of every training column: column name, value, then a blank line.
for k, column_values in hotpotqa_train.items():
    print(k, column_values[2], '', sep='\n')