In [None]:
%pip install openai

In [None]:
#TODO merge feedback with retrieval results

import json

def load_json_file(filepath):
    """Read the JSON document stored at ``filepath`` and return the parsed object.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default text encoding, so non-ASCII text in the data is read
    the same way on every machine.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json_file(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by 4 spaces, overwriting the file."""
    with open(filepath, 'w') as out:
        # Stream the encoder's chunks straight into the file.
        out.writelines(json.JSONEncoder(indent=4).iterencode(data))

def find_corresponding_question(feedback_questions, question_id):
    """Return the first question whose 'id' equals ``question_id``, or None if none matches."""
    matching = (
        candidate
        for candidate in feedback_questions
        if candidate.get('id') == question_id
    )
    return next(matching, None)

def merge_golden_snippets(run_questions, feedback_questions):
    """Append golden feedback snippets to the matching run questions, in place.

    Each run question is paired with the feedback question that has the same
    'id'. Feedback snippets with a truthy 'golden' value are appended to the
    run question's 'snippets' list unless ``is_duplicate`` reports that an
    equal snippet is already there. Every added snippet is printed.
    """
    for run_question in run_questions:
        feedback_match = find_corresponding_question(feedback_questions, run_question.get('id'))
        if not feedback_match:
            continue

        golden = [s for s in feedback_match.get('snippets', []) if s.get('golden')]
        merged = run_question.get('snippets', [])

        for candidate in golden:
            if is_duplicate(candidate, merged):
                continue
            print("added snippet to run file:")
            print(candidate)
            merged.append(candidate)

        # Re-assign so questions that had no 'snippets' key get one.
        run_question['snippets'] = merged

def is_duplicate(snippet, existing_snippets):
    """Return True if ``existing_snippets`` holds a snippet at the same location.

    Two snippets count as the same when their document, begin/end section and
    both offsets are all equal; any other fields are ignored.
    """
    location_keys = (
        'document',
        'beginSection',
        'endSection',
        'offsetInBeginSection',
        'offsetInEndSection',
    )
    # Lazy any()/all() compares the fields in order and stops at the first mismatch.
    return any(
        all(snippet[key] == other[key] for key in location_keys)
        for other in existing_snippets
    )

# Load the feedback and run files
# feedback_file: feedback JSON whose snippets may carry a 'golden' flag.
feedback_file = './Round4/BioASQ-taskSynergy_2024-feedback_round4.json' 
# Previous-round run file, kept for reference:
#run_file = './Round3/Result/2024-02-07_18-32-56_gpt-3.5-turbo-0125_PhaseA_NoExpansion_output_file.json' 
# run_file: retrieval output that receives the golden snippets.
run_file = './Round4/Result/2024-02-22_10-49-26_gpt-4-0125-preview_Retrieval_BioASQ-Run.json'
# new_file: destination of the merged result (the input run file is not overwritten).
new_file = './Round4/gpt-4-turbo-merged.json'  

feedback_data = load_json_file(feedback_file)
run_data = load_json_file(run_file)

# Merge golden snippets into the run file questions
# (mutates run_data['questions'] in place and prints every snippet it adds)
merge_golden_snippets(run_data['questions'], feedback_data['questions'])

# Save the updated run file data with added golden snippets
save_json_file(run_data, new_file)



In [2]:
# Previous-round merged file, kept for reference:
#merged_file = './Round3/gpt-4-turbo-merged.json'
# merged_file: run file with golden snippets already merged in.
merged_file = './Round4/gpt-4-turbo-merged.json'
# task_file: test set whose questions supply the 'answerReady' flag.
task_file = './Round4/BioASQ-taskSynergy_v2024-testset4.json'

# NOTE: relies on load_json_file defined in an earlier cell.
task_data = load_json_file(task_file)
merged_data = load_json_file(merged_file)


def merge_ready_flag(merged_data, task_data):
    """Copy each task question's 'answerReady' flag onto the matching merged question.

    Questions are matched on their 'id'. When several task questions share an
    id, the first one is used.

    Args:
        merged_data: List of question dicts; each receives an 'answerReady'
            key (the dicts are modified in place).
        task_data: List of question dicts that provide 'answerReady'.

    Returns:
        The same ``merged_data`` list that was passed in.

    Raises:
        ValueError: If a merged question has no task question with the same
            id (previously this surfaced as an opaque ``TypeError`` from
            subscripting ``None``).
        KeyError: If the matching task question has no 'answerReady' key.
    """
    # Index the task questions once instead of scanning the list per question.
    # setdefault keeps the first occurrence of an id.
    # Assumes ids are hashable (e.g. strings) — TODO confirm against the data files.
    task_by_id = {}
    for task_question in task_data:
        task_by_id.setdefault(task_question.get('id'), task_question)

    for question in merged_data:
        question_id = question.get('id')
        corresponding_question = task_by_id.get(question_id)
        if corresponding_question is None:
            raise ValueError(
                f"No task question found for merged question id {question_id!r}"
            )
        question['answerReady'] = corresponding_question['answerReady']
    return merged_data

# Output path for the merged questions enriched with the 'answerReady' flag.
# NOTE: rebinds new_file, which an earlier cell used for the merged file.
new_file = './Round4/gpt-4-turbo-merged_new.json' 

result = merge_ready_flag(merged_data['questions'], task_data['questions'])
# Re-wrap the list under the top-level "questions" key.
result = {"questions": result}

# Save the merged questions, now carrying the 'answerReady' flag
save_json_file(result, new_file)
    

In [None]:
import json
import datetime
import time  # Import the time module
import string
from openai import OpenAI
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
import traceback

# Shared OpenAI client used by ask_openai.
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment — confirm before running.
client = OpenAI()

# Model used for every request in this notebook; also embedded in the
# log and output file names.
model_name = "gpt-4-0125-preview"
# Alternative model, kept for reference:
#model_name = "gpt-3.5-turbo-0125"

def ask_openai(messages, model_name, json_response=True):
    """Send ``messages`` to the chat-completions endpoint and return the reply text.

    Every request uses temperature 0.0 and a fixed seed. When
    ``json_response`` is true, a JSON-object response format is requested too.

    Returns:
        The ``content`` of the first choice's message.
    """
    request_kwargs = dict(
        model=model_name,
        messages=messages,
        temperature=0.0,  # randomness of completion
        seed=90128538,
    )

    # Only ask for JSON output when the caller wants it.
    if json_response:
        request_kwargs["response_format"] = {"type": "json_object"}

    response = client.chat.completions.create(**request_kwargs)
    first_choice = response.choices[0]
    return first_choice.message.content

def append_to_logfile(logfile_name, text):
    """Append ``text`` followed by a newline to the file ``logfile_name``.

    The file is created if it does not exist. It is written as UTF-8
    explicitly so that non-ASCII text is stored the same way regardless of
    the platform's default encoding.

    Args:
        logfile_name: Path of the log file to append to.
        text: String to write as one line.
    """
    with open(logfile_name, 'a', encoding='utf-8') as logfile:
        logfile.write(text + "\n")


def remove_punctuation_and_lowercase(text):
    """Return ``text`` lowercased with every ``string.punctuation`` character removed.

    Whitespace and all other characters are kept as they are.
    """
    punctuation_chars = set(string.punctuation)
    return "".join(ch for ch in text.lower() if ch not in punctuation_chars)

def generate_exact_answer(question, snippets):
    """Ask the model for the BioASQ "exact" answer to ``question``.

    The prompt and the post-processing depend on ``question["type"]``:

    * ``"yesno"``   - returns the reply lowercased, without punctuation and
      without surrounding whitespace (expected to be "yes" or "no").
    * ``"factoid"`` - returns up to 5 entities, each wrapped in its own list.
    * ``"list"``    - returns up to 100 entities, each wrapped in its own list.
    * anything else - returns an empty list without calling the model.

    Args:
        question: Question dict; 'type' is always read, 'body' is read for
            the three handled types.
        snippets: Context that is interpolated verbatim at the start of the
            user prompt.

    Returns:
        A string for yes/no questions, otherwise a list of one-element lists.

    Raises:
        json.JSONDecodeError, KeyError, ValueError: If a factoid/list reply is
            not JSON of the form ``{"entities": [...]}``. Errors raised by
            ``ask_openai`` propagate unchanged.
    """
    exact_answer = []
    if question["type"] == "yesno":
        # Generate yes/no answer
        # the exact answer of each participating system will have to be either "yes" or "no".
        messages = [
                {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, \
                 research, and information retrieval in the biomedical domain."},
                {"role": "user", "content": f" {snippets}\n\n\
                 '{question['body']}'. \
                 You *must answer* only with lowercase 'yes' or 'no' even if you are not sure about the answer."}
            ]

        print(messages)
        answer = ask_openai(messages, model_name, json_response=False)
        print("\ngpt response yesno:")
        print(answer)
        exact_answer = _normalize_yesno_answer(answer)

    elif question["type"] == "factoid":
        # Generate factoid answer
        # each participating system will have to return a json string array of up to 5 entity names (e.g., up to 5 names of drugs), numbers, or similar short expressions, ordered by decreasing confidence.
        # NOTE(review): the opening quote in 'entities below is never closed; the
        # prompt text is left untouched so results stay comparable with earlier runs.
        messages = [
                {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, \
                 research, and information retrieval in the biomedical domain."},
                {"role": "user", "content": f""" {snippets}\n\n
                 '{question['body']}'. 
                 Answer this question by returning a JSON string array called 'entities of entity names, numbers, or similar short expressions that are an answer to the question, 
                 ordered by decreasing confidence. The array should contain at max 5 elements but can contain less. If you don't know any answer return an empty array. 
                 Return only this array, it must not contain phrases and **must be valid JSON**. Example: {{"entities": ["entity1", "entity2"]}}"""}
            ]

        print(messages)
        answer = ask_openai(messages, model_name)
        print("\ngpt response factoid:")
        print(answer)
        # The prompt only *asks* for at most 5 entities; enforce it here.
        exact_answer = _parse_entity_answer(answer, max_entries=5)

    elif question["type"] == "list":
        # Generate list answer
        # each participating system will have to return a single JSON string array of entity names, numbers, or similar short expressions, jointly taken to constitute a single answer (e.g., the most common symptoms of a disease). 
        # The returned list will have to contain no more than 100 entries of no more than 100 characters each.
        # NOTE(review): same unclosed 'entities quote as in the factoid prompt; left as is.
        messages = [
                {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, \
                 research, and information retrieval in the biomedical domain."},
                {"role": "user", "content": f""" {snippets}\n\n
                 '{question['body']}'. 
                 Answer this question by only returning a JSON string array called 'entities of entity names, numbers, or similar short expressions that are an answer to the question 
                 (e.g., the most common symptoms of a disease). The returned array will have to contain no more than 100 entries of no more than 100 characters each. If you don't know any answer return an empty array. 
                 Return only this array, it must not contain phrases and **must be valid JSON**. Example: {{"entities": ["entity1", "entity2"]}}"""}
            ]

        print(messages)
        answer = ask_openai(messages, model_name)
        print("\ngpt response list:")
        print(answer)
        # Enforce the 100-entry cap; the per-entry length limit is not enforced here.
        exact_answer = _parse_entity_answer(answer, max_entries=100)
    return exact_answer


def _normalize_yesno_answer(answer):
    """Lowercase a yes/no reply, drop punctuation and trim surrounding whitespace."""
    return remove_punctuation_and_lowercase(answer).strip()


def _parse_entity_answer(answer, max_entries):
    """Parse a ``{"entities": [...]}`` reply into a list of one-element lists, capped at ``max_entries``."""
    entities = json.loads(answer)['entities']
    if not isinstance(entities, list):
        # Iterating a string here would silently yield one "entity" per character.
        raise ValueError(f"'entities' must be a JSON array, got {type(entities).__name__}")
    return [[item] for item in entities[:max_entries]]

def generate_ideal_answer(question, snippets):
    """Ask the model for a short free-text ("ideal") answer to ``question``.

    The prompt consists of ``snippets`` interpolated verbatim, the question
    body, and an instruction to answer concisely in at most 200 words. The
    word limit is only requested in the prompt; the reply is returned as
    received. The prompt and the reply are both printed.
    """
    # a single paragraph-sized text ideally summarizing the most relevant information from articles and snippets
    # Each returned "ideal" answer is intended to approximate a short text that a biomedical expert would write
    # to answer the corresponding question (e.g., including prominent supportive information)
    chat_messages = [
            {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, \
             research, and information retrieval in the biomedical domain."},
            {"role": "user", "content": f""" {snippets}\n\n
             '{question['body']}'.
             You are a biomedical expert, write a concise and clear answer to the above question.
             It is very important that the answer is correct.
             The maximum allowed length of the answer is 200 words, but try to keep it short and concise."""}
        ]
    print(chat_messages)

    # Plain text is wanted here, so no JSON response format is requested.
    ideal_text = ask_openai(chat_messages, model_name, json_response=False)
    print("\ngpt response ideal:")
    print(ideal_text)
    return ideal_text



# Get the current timestamp in a sortable format
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Per-run log file. The processing loop appends one JSON object per line to it,
# so despite the .json extension the file as a whole is not one JSON document.
logfile_name = f"{timestamp}_{model_name}_Grounded_PhaseB_log_file.json"

# Load the input file in JSON format
# (the merged file with 'answerReady' flags written by an earlier cell)
with open('./Round4/gpt-4-turbo-merged_new.json', encoding='utf-8') as input_file:
    data = json.loads(input_file.read())


def save_state(data, file_path='state.pkl'):
    """Save the current state to a pickle file.

    The pickle is first written to ``<file_path>.tmp`` and then moved over
    ``file_path`` with ``os.replace``. An interruption while writing therefore
    leaves the previous checkpoint intact instead of a truncated file.

    Args:
        data: Any picklable object.
        file_path: Destination of the checkpoint.

    Raises:
        OSError: If the temporary file cannot be written or moved.
        pickle.PicklingError: If ``data`` cannot be pickled.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(data, f)
    # Swap the complete new checkpoint in only after it was written successfully.
    os.replace(tmp_path, file_path)

def load_state(file_path='state.pkl'):
    """Load the state from a pickle file if it exists, otherwise return None.

    An empty or otherwise unreadable pickle (for example a checkpoint that was
    cut off while being written) is treated like a missing one and yields
    None, so processing starts over instead of aborting.

    Args:
        file_path: Location of the checkpoint.

    Returns:
        The unpickled object, or None if there is no usable checkpoint.
    """
    if not os.path.exists(file_path):
        return None
    try:
        with open(file_path, 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # checkpoint files this notebook wrote itself.
            return pickle.load(f)
    except (EOFError, pickle.UnpicklingError):
        # EOFError: empty file; UnpicklingError: corrupt or truncated content.
        return None

# Resume from a saved checkpoint when there is one; otherwise start empty.
saved_state = load_state()
results = saved_state if saved_state else []
# Number of questions already answered, i.e. the index to continue from.
offset = len(results)


# Iterate over all questions
for idx, question in enumerate(data["questions"]):
    print(f"\n\n{idx}")
    if idx < offset:
        # Already answered in an earlier, interrupted run (restored from the checkpoint).
        continue
    # Determine the type of question
    question_type = question["type"]
    print(f"{question['body']}\n")

    # Get the relevant articles and snippets.
    # Both are treated as optional, in line with how the merge step reads 'snippets'.
    relevant_articles = question.get("documents", [])
    relevant_snippets = question.get("snippets", [])
    # Snippets carrying a 'golden' key are shown to the model (below) but left out of the output.
    filtered_snippets = [snippet for snippet in relevant_snippets if 'golden' not in snippet]

    # Generate the exact answer and ideal answer.
    # Each call has its own try block so that a failure in one of them
    # (e.g. an unparsable JSON reply) does not throw away the other answer.
    try:
        exact_answer = generate_exact_answer(question, relevant_snippets)
    except Exception as e:
        print(f"Error generating exact answer for question {idx}: {e}")
        traceback.print_exc()
        exact_answer = []

    try:
        ideal_answer = generate_ideal_answer(question, relevant_snippets)
    except Exception as e:
        print(f"Error generating ideal answer for question {idx}: {e}")
        traceback.print_exc()
        ideal_answer = []

    # Create a dictionary to store the results for this question
    question_results = {
        "id": question["id"],
        "type": question_type,
        "body": question["body"],
        "ideal_answer": ideal_answer,
        "exact_answer": exact_answer,
        "documents": relevant_articles,
        "snippets": filtered_snippets,
        "answer_ready": question['answerReady'],
    }

    # Append this question's result to the log file as one JSON line
    append_to_logfile(logfile_name, json.dumps(question_results))

    # Add the results for this question to the list of all results and
    # checkpoint them, so an interrupted run can resume at the next question.
    # NOTE(review): failed questions are checkpointed too and are not retried on resume.
    results.append(question_results)
    save_state(results)



# Create a dictionary to store the results for all questions
output = {
    "questions": results
}

# Get the current timestamp in a sortable format
# (taken again here, so it reflects when the run finished rather than when it started)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Prefix the output file name with the timestamp
output_file_name = f"{timestamp}_{model_name}_Synergy_qa_output_file.json"

# Save the output to a file in pretty-formatted JSON format
with open(f"./Round4/Result/{output_file_name}", "w") as f:
    json.dump(output, f, indent=4)


# After processing all questions and saving the final output:
# remove the checkpoint so that the next run starts from the first question.
try:
    # Check if the pickle file exists before attempting to delete it
    if os.path.exists('state.pkl'):
        os.remove('state.pkl')
        print("Intermediate state pickle file deleted successfully.")
except Exception as e:
    # Best effort: a failed cleanup is reported but does not fail the cell.
    print(f"Error deleting pickle file: {e}")