# code to create AmbigQA_cite

In [None]:
# AmbigQA (AmbigNQ)
# paper
# https://arxiv.org/pdf/2004.10645.pdf
# github
# https://github.com/shmsw25/AmbigQA/?tab=readme-ov-file#dataset-contents
# download the data (from their github -- "AmbigNQ with evidence articles") and put it in the "datasets" folder
# https://github.com/shmsw25/AmbigQA/?tab=readme-ov-file#dataset-contents

# note that there is also data under "AmbigNQ", but its text "snippet" is poor quality (not full sentences, lots of "...", and it contains HTML markup such as <b>...</b>)
# so I'm using the "evidence articles" version and will truncate it

In [1]:
import pandas as pd
import json

In [None]:
# Load the raw AmbigNQ "with evidence articles" splits.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default.

# train
with open('datasets/train_with_evidence_articles.json', encoding='utf-8') as f:
    ambigqa_train_dict = json.load(f)

# val (the official dev split; it is later split in 2: val and test)
with open('datasets/dev_with_evidence_articles.json', encoding='utf-8') as f:
    ambigqa_val_dict = json.load(f)

In [3]:
# Wrap both raw splits (train first, then val) in DataFrames.
ambigqa_train_pandas, ambigqa_val_pandas = (
    pd.DataFrame.from_dict(raw_split)
    for raw_split in (ambigqa_train_dict, ambigqa_val_dict)
)

In [None]:
# Row counts of the raw train and val frames, one per line.
print(len(ambigqa_train_pandas), len(ambigqa_val_pandas), sep='\n')

In [None]:
# Notebook display: preview the first rows of the raw val frame.
ambigqa_val_pandas.head()

In [None]:
# what we want is:
# question - original_context - conflicting_context - original_answer - conflicting_answer

In [6]:
def _collect_ambiguous_examples(frame):
    """Gather question, evidence articles and distinct answers of ambiguous rows.

    A row is kept when at least one of its annotations has type
    ``'multipleQAs'``. The disambiguated sub-questions are ignored because
    only the main, ambiguous question is wanted; for every QA pair just the
    first listed answer is used.

    Args:
        frame: DataFrame with ``question``, ``articles_plain_text`` and
            ``annotations`` columns.

    Returns:
        Three parallel lists ``(questions, answers, texts)``. ``answers[i]``
        holds the de-duplicated answers of ``questions[i]`` in first-seen
        order and ``texts[i]`` is the row's ``articles_plain_text`` value.
    """
    questions, answers, texts = [], [], []
    for _, row in frame.iterrows():
        annotations = row['annotations']
        if not any(entry['type'] == 'multipleQAs' for entry in annotations):
            continue

        first_answers = []
        for entry in annotations:
            # Annotations that carry no 'qaPairs' key (presumably the
            # non-'multipleQAs' ones) simply contribute no answers.
            for qa_pair in entry.get('qaPairs', []):
                candidates = qa_pair.get('answer')
                if candidates:  # skip pairs with a missing/empty answer list
                    first_answers.append(candidates[0])

        questions.append(row['question'])
        texts.append(row['articles_plain_text'])
        # dict.fromkeys removes duplicates while keeping first-seen order.
        # A set would give a different string order from run to run, which
        # changes which answer ends up as "original" vs. "conflicting".
        answers.append(list(dict.fromkeys(first_answers)))
    return questions, answers, texts


# train
train_questions, train_answers, train_texts = _collect_ambiguous_examples(ambigqa_train_pandas)

# val
val_questions, val_answers, val_texts = _collect_ambiguous_examples(ambigqa_val_pandas)

In [7]:
# https://stackoverflow.com/questions/77929243/find-unique-answers-from-list-of-strings
def find_all_answer_combinations(texts, answers, used_texts=()):
    """Lazily enumerate every valid assignment of answers to texts.

    Each yielded combination is a tuple of ``[answer, text]`` pairs. An answer
    may be paired with a text only if it occurs in that text and in none of
    the texts already chosen earlier in the same combination; once an answer
    is paired, all remaining texts containing it are excluded from the rest
    of that combination.

    Args:
        texts: candidate texts still available for pairing.
        answers: answers still to be considered, processed front to back.
        used_texts: texts already chosen higher up in the recursion.

    Yields:
        Tuples of ``[answer, text]`` pairs; the empty tuple is always among
        them.
    """
    if not (texts and answers):
        # Nothing left to pair up: the only combination is the empty one.
        yield ()
        return

    head, *tail = answers

    # First, every combination that leaves `head` out entirely.
    yield from find_all_answer_combinations(texts, tail, used_texts)

    # `head` must not already occur in a text chosen earlier.
    for seen in used_texts:
        if head in seen:
            return

    # Partition the available texts by whether they mention `head`.
    containing, lacking = [], []
    for candidate in texts:
        if head in candidate:
            containing.append(candidate)
        else:
            lacking.append(candidate)

    # Pair `head` with each text that contains it and recurse on the texts
    # that do not, so `head` never matches a second text.
    for chosen in containing:
        extended = used_texts + (chosen,)
        for remainder in find_all_answer_combinations(lacking, tail, extended):
            yield ([head, chosen], *remainder)

def find_best_answers(texts, answers):
    """Return the combination from ``find_all_answer_combinations`` with the most pairs.

    The result is a list of ``[answer, text]`` pairs. When several
    combinations tie for the largest size, the first one generated wins.
    """
    candidates = find_all_answer_combinations(texts, answers)
    largest = max(candidates, key=len)
    return list(largest)

In [None]:
# texts = ["this is string 1 this is string 3", "this is string 2", "this is string 3"]
# texts = ["this is string 1", "this is string 3", "this is string 1 this is string 3"]
# answers = ["string 1","this is", "string 3"]

# find_best_answers(texts, answers)

In [8]:
def _select_two_answer_examples(questions, answers, texts):
    """Keep the questions whose best answer/document matching has exactly two pairs.

    Only exact two-way matches are used. Some questions yield 3 or 4 pairs
    and could be expanded into every 2-combination for a bit more data, but
    that is deliberately avoided for now.

    Args:
        questions: ambiguous questions.
        answers: per-question list of candidate answers.
        texts: per-question collection of evidence texts.

    Returns:
        Five parallel lists: ``(final_questions, original_full_context,
        conflicting_full_context, original_answer, conflicting_answer)``.
    """
    final_questions = []
    original_full_context = []
    conflicting_full_context = []
    original_answer = []
    conflicting_answer = []

    for qst, ans, txt in zip(questions, answers, texts):
        output = find_best_answers(txt, ans)
        if len(output) != 2:
            continue
        # The first matched pair serves as the "original" answer/document,
        # the second one as the "conflicting" answer/document.
        (first_answer, first_text), (second_answer, second_text) = output
        final_questions.append(qst)
        original_full_context.append(first_text)
        original_answer.append(first_answer)
        conflicting_full_context.append(second_text)
        conflicting_answer.append(second_answer)

    return (
        final_questions,
        original_full_context,
        conflicting_full_context,
        original_answer,
        conflicting_answer,
    )


# train
(
    train_final_questions,
    train_original_full_context,
    train_conflicting_full_context,
    train_original_answer,
    train_conflicting_answer,
) = _select_two_answer_examples(train_questions, train_answers, train_texts)

# val
(
    val_final_questions,
    val_original_full_context,
    val_conflicting_full_context,
    val_original_answer,
    val_conflicting_answer,
) = _select_two_answer_examples(val_questions, val_answers, val_texts)

In [9]:
# the full context is too long, so let's take a snippet of k characters around the answer. This is like the top-k method, but with characters
def extract_substring_with_keywords(text, keyword, k=300):
    """Return a word-aligned snippet of ``text`` around the first ``keyword`` hit.

    The character window runs from ``k`` characters before the first
    occurrence of ``keyword`` to ``k`` characters after the start of that
    occurrence, clamped to the bounds of ``text``. The word at a window edge
    is dropped only when that edge actually cuts into ``text`` (and may
    therefore have split a word). When the window reaches the very start or
    end of ``text`` the edge word is complete and is kept, so a keyword
    located in the first or last word is not trimmed away.

    Args:
        text: full document to cut the snippet from.
        keyword: string to centre the snippet on; must occur in ``text``.
        k: number of characters to take on each side of the keyword start.

    Returns:
        The snippet, with its words joined by single spaces.

    Raises:
        ValueError: if ``keyword`` does not occur in ``text``.
    """
    position = text.index(keyword)  # looked up once and reused for both bounds
    start = max(0, position - k)
    end = min(len(text), position + k)

    words = text[start:end].split(' ')
    if start > 0:
        words = words[1:]  # left edge cut into the text: word may be partial
    if end < len(text):
        words = words[:-1]  # right edge cut into the text: word may be partial
    return ' '.join(words)


# Cut every full article down to a snippet centred on its own answer.

# train
train_original_contexts = [
    extract_substring_with_keywords(full_context, answer)
    for full_context, answer in zip(train_original_full_context, train_original_answer)
]
train_conflicting_contexts = [
    extract_substring_with_keywords(full_context, answer)
    for full_context, answer in zip(train_conflicting_full_context, train_conflicting_answer)
]

# val
val_original_contexts = [
    extract_substring_with_keywords(full_context, answer)
    for full_context, answer in zip(val_original_full_context, val_original_answer)
]
val_conflicting_contexts = [
    extract_substring_with_keywords(full_context, answer)
    for full_context, answer in zip(val_conflicting_full_context, val_conflicting_answer)
]


In [None]:
# Assemble the final tables:
# question - original_context - conflicting_context - original_answer - conflicting_answer

# train
train_df = pd.DataFrame(dict(
    question=train_final_questions,
    original_context=train_original_contexts,
    conflicting_context=train_conflicting_contexts,
    original_answer=train_original_answer,
    conflicting_answer=train_conflicting_answer,
))

# val
val_df = pd.DataFrame(dict(
    question=val_final_questions,
    original_context=val_original_contexts,
    conflicting_context=val_conflicting_contexts,
    original_answer=val_original_answer,
    conflicting_answer=val_conflicting_answer,
))

# Notebook display of the val table.
val_df

In [None]:
# Notebook display of the assembled train table.
train_df

In [12]:
# Add the "cited" columns: both snippets merged into one numbered context and
# a reference answer that attributes each answer to its document.

# train
original_contexts = list(train_df["original_context"])
conflicting_context = list(train_df["conflicting_context"])
train_cited_context = [
    f'Document 1: {first_doc}. Document 2: {second_doc}'
    for first_doc, second_doc in zip(original_contexts, conflicting_context)
]

original_answer = list(train_df["original_answer"])
conflicting_answer = list(train_df["conflicting_answer"])
train_cited_answer = [
    f'According to Document 1 the answer is: {first_ans}. According to Document 2 the answer is: {second_ans}'
    for first_ans, second_ans in zip(original_answer, conflicting_answer)
]

train_df['cited_context'] = train_cited_context
train_df['cited_answer'] = train_cited_answer

# val (the temporary lists above are reused for the val table)
original_contexts = list(val_df["original_context"])
conflicting_context = list(val_df["conflicting_context"])
val_cited_context = [
    f'Document 1: {first_doc}. Document 2: {second_doc}'
    for first_doc, second_doc in zip(original_contexts, conflicting_context)
]

original_answer = list(val_df["original_answer"])
conflicting_answer = list(val_df["conflicting_answer"])
val_cited_answer = [
    f'According to Document 1 the answer is: {first_ans}. According to Document 2 the answer is: {second_ans}'
    for first_ans, second_ans in zip(original_answer, conflicting_answer)
]

val_df['cited_context'] = val_cited_context
val_df['cited_answer'] = val_cited_answer

In [None]:
# Notebook display of the val table, now including the cited columns.
val_df

In [None]:
# Serialise the train table column-wise ({column: [values, ...]}) and save it.
train_dict = train_df.to_dict(orient='list')
with open('datasets/ambig_qa_train.json', 'w') as fp:
    json.dump(train_dict, fp)

# The dev-derived table is cut in half: first half -> val, second half -> test.
midpoint = len(val_df) // 2
val_df_1 = val_df[:midpoint]
val_df_2 = val_df[midpoint:]

val_dict = val_df_1.to_dict(orient='list')
test_dict = val_df_2.to_dict(orient='list')

with open('datasets/ambig_qa_val.json', 'w') as fp:
    json.dump(val_dict, fp)

with open('datasets/ambig_qa_test.json', 'w') as fp:
    json.dump(test_dict, fp)

# To reload later, json.load the three files written above:
# datasets/ambig_qa_train.json, datasets/ambig_qa_val.json and
# datasets/ambig_qa_test.json.

In [4]:
# Read the three saved splits back in (read mode is open()'s default).
with open('datasets/ambig_qa_train.json') as fp:
    ambig_qa_train = json.load(fp)

with open('datasets/ambig_qa_val.json') as fp:
    ambig_qa_val = json.load(fp)

with open('datasets/ambig_qa_test.json') as fp:
    ambig_qa_test = json.load(fp)

In [None]:
# Each prompt is the cited two-document context of one test example.
# Iterate the column directly: zip() over a single iterable yields 1-tuples,
# which would be formatted as their repr "('...',)" instead of the text itself.
prompts = [f'{cited_context}' for cited_context in ambig_qa_test['cited_context']]

# Average prompt length in space-separated words (0.0 for an empty split
# rather than a ZeroDivisionError).
words_per_prompt = [len(prompt.split(' ')) for prompt in prompts]
avg_word_count = sum(words_per_prompt) / len(words_per_prompt) if words_per_prompt else 0.0
avg_word_count

In [None]:
# Notebook display: list the column names stored in the test split.
ambig_qa_test.keys()

In [None]:
# Spot check: print both contexts and answers of one specific test question.
q = "Who voices rocket raccoon in guardians of the galaxy?"
matching_rows = [
    position
    for position, candidate in enumerate(ambig_qa_test['question'])
    if candidate == q
]
for entry_idx in matching_rows:
    print(q)
    print(ambig_qa_test['original_context'][entry_idx])
    print(ambig_qa_test['original_answer'][entry_idx])
    print()
    print(ambig_qa_test['conflicting_context'][entry_idx])
    print(ambig_qa_test['conflicting_answer'][entry_idx])
