# code to create the two DisentQA_cite datasets (DisentQA_DupliCite and DisentQA_ParaCite)

In [4]:
# DisentQA
# paper
# https://aclanthology.org/2023.acl-long.559.pdf
# github
# https://github.com/ellaneeman/disent_qa?tab=readme-ov-file
# download the data (from their github) -- "more_baselines/v10-simplified_simplified-nq-train_factual_counterfactual_answerabilty_contextual_baseline_{}_split.csv.zip" and place in the "datasets" folder
# https://docs.google.com/document/d/1Z4vA7ifMQTk5YBF3BEYCFSnIvXCYaznLP_7VBcXPEeU/edit?pli=1

In [1]:
import pandas as pd
import json
import re
import openai
from openai import OpenAI
import time
import ast
from datasets import Dataset 

# Pattern for markup to strip: any <...> tag, plus named (&amp;), decimal (&#39;)
# and hexadecimal (&#x1f;) character entities.
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
def cleanhtml(raw_html):
  """Return `raw_html` with every tag and character entity deleted.

  Matches are removed outright rather than decoded, so "&amp;" vanishes
  instead of turning into "&".
  """
  return CLEANR.sub('', raw_html)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the train and validation splits of the DisentQA baseline data.
# NOTE: the paths below name plain .csv files inside "datasets/", while the
# download instructions mention .csv.zip archives -- presumably the archives
# have to be extracted first (TODO confirm).
disentqa_train_pandas = pd.read_csv('datasets/v10-simplified_simplified-nq-train_factual_counterfactual_answerabilty_contextual_baseline_train_split.csv')
disentqa_val_pandas = pd.read_csv('datasets/v10-simplified_simplified-nq-train_factual_counterfactual_answerabilty_contextual_baseline_val_split.csv')

In [None]:
print(len(disentqa_train_pandas))
print(len(disentqa_val_pandas))

In [None]:
disentqa_val_pandas.head()

In [None]:
# what we want is:
# question - original_context - conflicting_context - original_answer - conflicting_answer

In [None]:
# their data is messed up -- similar IDs map to different questions / context, so I need to match info based on the question 
# (which is the same for different contexts)

In [9]:
def _split_by_type(frame):
    """Separate DisentQA rows into counterfactual and factual records.

    Args:
        frame: DataFrame providing the 'question', 'context',
            'parametric_answer', 'contextual_answer' and 'type' columns.

    Returns:
        A ``(counterfactual, factual)`` tuple of lists of dicts.
        Counterfactual dicts hold 'question', 'conflicting_context',
        'conflicting_answer' and 'original_answer'; factual dicts hold
        'question' and 'original_context'. Contexts are passed through
        ``cleanhtml``. Rows of any other type are ignored.
    """
    counterfactual = []
    factual = []
    for _, row in frame.iterrows():
        # Deliberately not called `type`: at notebook scope that name would
        # shadow the builtin for every cell executed afterwards.
        row_type = row['type']
        if row_type == 'counterfactual':
            counterfactual.append({
                'question': row['question'],
                'conflicting_context': cleanhtml(row['context']),
                'conflicting_answer': row['contextual_answer'],
                'original_answer': row['parametric_answer'],
            })
        elif row_type == 'factual':
            factual.append({
                'question': row['question'],
                'original_context': cleanhtml(row['context']),
            })
    return counterfactual, factual


# train
train_counterfactual, train_factual = _split_by_type(disentqa_train_pandas)

# validation
val_counterfactual, val_factual = _split_by_type(disentqa_val_pandas)

In [10]:
# Turn each list of per-row dicts into a DataFrame (one column per dict key).

# train
train_factual_df = pd.DataFrame(train_factual)
train_counterfactual_df = pd.DataFrame(train_counterfactual)

# validation
val_factual_df = pd.DataFrame(val_factual)
val_counterfactual_df = pd.DataFrame(val_counterfactual)

In [11]:
# Pair factual rows with counterfactual rows by joining on the question string.
# This performs an inner join, so questions that appear in only one of the two
# frames are dropped and no row is padded with NaN for a missing side.
# NOTE(review): a question that occurs several times on either side yields one
# merged row per combination -- confirm that this is intended.

# train
train_merged_df = pd.merge(
    train_factual_df,
    train_counterfactual_df,
    on='question',
    how='inner',
)

# validation
val_merged_df = pd.merge(
    val_factual_df,
    val_counterfactual_df,
    on='question',
    how='inner',
)

In [None]:
train_merged_df.head()

In [None]:
val_merged_df.head()   

In [None]:
print(len(train_merged_df))
print(len(val_merged_df))

In [None]:
# Build one two-document context string per training example:
# Document 1 is the factual context, Document 2 the counterfactual one.
original_contexts = list(train_merged_df["original_context"])
conflicting_context = list(train_merged_df["conflicting_context"])
train_cited_context = [
    f'Document 1: {factual_text}. Document 2: {counterfactual_text}'
    for factual_text, counterfactual_text in zip(original_contexts, conflicting_context)
]
train_cited_context[0]

In [None]:
# Build the target string that attributes each answer to its document.
original_answer = list(train_merged_df["original_answer"])
conflicting_answer = list(train_merged_df["conflicting_answer"])
train_cited_answer = [
    f'According to Document 1 the answer is: {factual_answer}. According to Document 2 the answer is: {counterfactual_answer}'
    for factual_answer, counterfactual_answer in zip(original_answer, conflicting_answer)
]
train_cited_answer[0]

In [None]:
# Same two-document context string, now for the validation examples.
original_contexts = list(val_merged_df["original_context"])
conflicting_context = list(val_merged_df["conflicting_context"])
val_cited_context = [
    f'Document 1: {factual_text}. Document 2: {counterfactual_text}'
    for factual_text, counterfactual_text in zip(original_contexts, conflicting_context)
]
val_cited_context[0]

In [None]:
# Same per-document answer string, now for the validation examples.
original_answer = list(val_merged_df["original_answer"])
conflicting_answer = list(val_merged_df["conflicting_answer"])
val_cited_answer = [
    f'According to Document 1 the answer is: {factual_answer}. According to Document 2 the answer is: {counterfactual_answer}'
    for factual_answer, counterfactual_answer in zip(original_answer, conflicting_answer)
]
val_cited_answer[0]

In [None]:
# Attach the combined two-document context and the cited answer string as new
# columns of the validation frame, then preview the result.
val_merged_df['cited_context'] = val_cited_context
val_merged_df['cited_answer'] = val_cited_answer
val_merged_df.head()

In [None]:
# Attach the combined two-document context and the cited answer string as new
# columns of the training frame, then preview the result.
train_merged_df['cited_context'] = train_cited_context
train_merged_df['cited_answer'] = train_cited_answer
train_merged_df.head()

In [None]:
print(len(val_merged_df))
print(len(train_merged_df))

In [None]:
# filter df if the context is too long (which will cause an error / the model won't generate anything)
# max_length = 3000
# val_merged_df = val_merged_df[val_merged_df['cited_context'].str.split(' ').str.len() < max_length]

In [None]:
# Column-oriented dict ({column name: [values, ...]}) of the training frame.
train_dict = train_merged_df.to_dict(orient='list')
# save final dataset to json
# with open('datasets/disent_qa_train.json', 'w') as fp:
#     json.dump(train_dict, fp)

# Halve the validation frame: the first half stays validation, the second
# half becomes the test set.
midpoint = len(val_merged_df) // 2
val_merged_df_1 = val_merged_df[:midpoint]
val_merged_df_2 = val_merged_df[midpoint:]

val_dict = val_merged_df_1.to_dict(orient='list')
test_dict = val_merged_df_2.to_dict(orient='list')

# with open('datasets/disent_qa_val.json', 'w') as fp:
#     json.dump(val_dict, fp)

# with open('datasets/disent_qa_test.json', 'w') as fp:
#     json.dump(test_dict, fp)

# load
# with open('datasets/disent_qa_train.json', 'r') as fp:
#     disent_qa_train = json.load(fp)
# with open('datasets/disent_qa_val.json', 'r') as fp:
#     disent_qa_val = json.load(fp)

In [2]:
# Load the validation and test splits from their JSON files.
# (The validation file used to be opened and parsed twice in a row; a single
# load yields the same `disent_qa_val`.)
with open('datasets/disent_qa_val.json', 'r') as fp:
    disent_qa_val = json.load(fp)
with open('datasets/disent_qa_test.json', 'r') as fp:
    disent_qa_test = json.load(fp)

In [3]:
prompts = [f'{cited_context}' for cited_context in\
         zip(disent_qa_test['cited_context'])]

words_per_prompt = [len(i.split(' ')) for i in prompts]
avg_word_count = sum(words_per_prompt)/len(words_per_prompt)
avg_word_count

629.8752922837101

In [4]:
# Print every column name of the test split followed by its first entry.
for column_name, column_values in disent_qa_test.items():
    print(column_name)
    print(column_values[0])

question
who gave the first in person state of the union
original_context
 The address fulfills rules in Article II , Section 3 of the U.S. Constitution , requiring the President to periodically `` give to the Congress Information of the State of the Union , and recommend to their Consideration such measures as he shall judge necessary and expedient . '' During most of the country 's first century , the President primarily only submitted a written report to Congress . After 1913 , Woodrow Wilson , the 28th U.S. President , began the regular practice of delivering the address to Congress in person as a way to rally support for his agenda . With the advent of radio and television , the address is now broadcast live across the country on many networks . 
conflicting_context
 The address fulfills rules in Article II , Section 3 of the U.S. Constitution , requiring the President to periodically `` give to the Congress Information of the State of the Union , and recommend to their Considerat

# create paraphrased DisentQA (DisentQA_ParaCite)

# I'm using ChatGPT to paraphrase the context (the original dataset has context duplications)

In [5]:
disent_qa_test_hf = Dataset.from_dict(disent_qa_test) 
disent_qa_test_hf

Dataset({
    features: ['question', 'original_context', 'conflicting_context', 'conflicting_answer', 'original_answer', 'cited_context', 'cited_answer'],
    num_rows: 3849
})

In [5]:
import os

# Take the API key from the OPENAI_API_KEY environment variable so that no
# secret has to be pasted into (and possibly committed with) the notebook.
# The empty-string fallback keeps this cell runnable when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

client = OpenAI(
    api_key=openai.api_key,
)

# single string
def chat_gpt(prompt):
    """Send `prompt` as one user message to gpt-3.5-turbo and return the reply.

    The text of the first returned choice is stripped of leading and trailing
    whitespace before being returned.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# for batches
# def chat_gpt(list_of_contexts, list_of_answers):
#     response = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages = [
#         {
#             "role": "user",
#             "content": f"{list_of_contexts}"
#         },
#         {
#             "role": "system",
#             "content": f"Paraphrase every element of the array. Ensure that these corresponding answers: {list_of_answers}, are still in the paraphrased output. Reply with an array of all completions."
#         }
#     ]
#     )
#     print(f"{list_of_contexts}")
#     response_string = response.choices[0].message.content.strip()
#     print(response_string)
#     response_list = ast.literal_eval(response_string)
#     return response_list

In [None]:
# # if starting from scratch
# paraphrased_context = [] 

# while len(paraphrased_context) < len(disent_qa_test_hf): # until we paraphrase all text (in case it gives an error)
#     for entry_idx in range(len(paraphrased_context), len(disent_qa_test_hf)):
#         try:
#             if entry_idx % 20 == 0:
#                 print(f'Counter: {entry_idx} out of {len(disent_qa_test_hf)}')
#                 # save summarized dataset
#                 with open('paraphrased_disentqa.json', 'w') as fp:
#                     json.dump(paraphrased_context, fp)     

#             # extract
#             conflicting_context = disent_qa_test_hf[entry_idx]['conflicting_context'] # text to be paraphrased
#             conflicting_label = disent_qa_test_hf[entry_idx]['conflicting_answer'] 

#             # Give the text to the model and ask for a paraphrase using the GPT-3.5-turbo model (costs money)
            
#             paraphrase = chat_gpt(f"Paraphrase this: {conflicting_context}. Ensure that '{conflicting_label}' is still in the paraphrased output")

#             paraphrased_context.append(paraphrase)
#         except: # 'RateLimitError' -- cant use it as an exception so leaving this blank
#             print('sleeping...')
#             time.sleep(10) # Delay for 10 seconds before retrying.
        
# # Make sure to save the final batch of paraphrased data
# with open('paraphrased_disentqa.json', 'w') as fp:
#     json.dump(paraphrased_context, fp)

In [None]:
# with open('paraphrased_disentqa.json', 'r') as fp:
#     paraphrased_disentqa = json.load(fp)    
# print(len(paraphrased_disentqa))
# print(len(disent_qa_test_hf))

In [None]:
# check if the lists are the same -- they are not in 2 places because the API broke
# idx = 660
# print(paraphrased_disentqa[idx])
# print()
# print(disent_qa_test_hf[idx]['conflicting_context'])
# print()
# print(disent_qa_test_hf[idx]['conflicting_answer'])

In [128]:
# new_paraphrase = paraphrased_disentqa.copy()
# new_paraphrase.insert(661, 'NA')
# new_paraphrase.insert(769, 'NA')

# new_paraphrase = new_paraphrase[:len(disent_qa_test_hf)] # added 2 NA, need to remove the last 2 

In [122]:
# idx = 765
# for i in range(20):
#     new_idx = idx + i
#     print('new_idx', new_idx)
#     print(disent_qa_test_hf[new_idx]['conflicting_context'])
#     print()
#     print(new_paraphrase[new_idx])
#     print()
#     print(disent_qa_test_hf[new_idx]['conflicting_answer'])
#     print()
#     print()

In [136]:
# with open('datasets/clean_paraphrased_disentqa.json', 'w') as fp:
#     json.dump(new_paraphrase, fp)

In [3]:
# Load the cleaned paraphrases of the conflicting contexts from JSON.
# NOTE(review): presumably a list with one string per test example, with 'NA'
# placeholders where the API call failed -- confirm against the file.
with open('datasets/clean_paraphrased_disentqa.json', 'r') as fp:
    clean_paraphrased_disentqa = json.load(fp)    

In [4]:
clean_paraphrased_disentqa[0]

In [None]:
disent_qa_test['conflicting_context'][0]

In [None]:
disent_qa_test['cited_context'][0]

In [None]:
# replace the conflicting context in disentqa with the new paraphrased conflicting context

# The paraphrases must line up one-to-one with the test examples; otherwise
# the dataset would end up with columns of different lengths (and any later
# zip over them would silently truncate).
expected_rows = len(disent_qa_test['conflicting_context'])
if len(clean_paraphrased_disentqa) != expected_rows:
    raise ValueError(
        f'Expected {expected_rows} paraphrased contexts, '
        f'got {len(clean_paraphrased_disentqa)}'
    )

# Shallow copy: the column lists are shared with disent_qa_test, but adding or
# removing keys here leaves the original dict untouched.
paraphrased_disentqa_test = disent_qa_test.copy()
print(paraphrased_disentqa_test.keys())
paraphrased_disentqa_test.pop('conflicting_context', None) # remove old one
paraphrased_disentqa_test['conflicting_context'] = clean_paraphrased_disentqa # add new one

In [None]:
# Rebuild the two-document context: Document 1 is the untouched factual
# context, Document 2 is the paraphrased conflicting context.
new_cited_context = [
    f'Document 1: {factual_text}. Document 2: {paraphrased_text}'
    for factual_text, paraphrased_text in zip(
        disent_qa_test['original_context'],
        clean_paraphrased_disentqa,
    )
]
new_cited_context[0]

In [15]:
# Swap the old combined context for the one built from the paraphrases.
# Popping before re-inserting moves the 'cited_context' key to the end of the
# dict; the None default avoids a KeyError if the key is already gone.
paraphrased_disentqa_test.pop('cited_context', None) # remove old one
paraphrased_disentqa_test['cited_context'] = new_cited_context # add new one

In [None]:
paraphrased_disentqa_test['cited_context'][0]

In [17]:
# Write the paraphrased test set to disk as JSON, overwriting any existing file
# at this path.
with open('datasets/paraphrased_disentqa_test.json', 'w') as fp:
    json.dump(paraphrased_disentqa_test, fp)

In [None]:
# print(disent_qa_test['conflicting_context'][0])
# print(paraphrased_disentqa_test['conflicting_context'][0])

In [None]:
# print(len(disent_qa_test['conflicting_context']))
# print(len(paraphrased_disentqa_test['conflicting_context']))