In [None]:
%pip install Biopython openai "elasticsearch<8" python-dotenv mistralai fireworks-ai

## Retrieval

In [7]:
import os
import json
from elasticsearch import Elasticsearch
from dotenv import load_dotenv

# Load environment variables from .env file; run_elasticsearch_query below
# reads its ELASTICSEARCH_* settings through os.getenv.
load_dotenv()

# Suppress urllib3's InsecureRequestWarning: the Elasticsearch client in this
# notebook is created with verify_certs=False.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def run_elasticsearch_query(query, index=["pubmed", "pubmed_update"]):
    """Run a JSON-encoded search against Elasticsearch and return its hits in compact form.

    Args:
        query: JSON text holding the complete search request body.
        index: Index name(s) to search.

    Returns:
        A list with one dict per hit, each carrying the keys "id", "title"
        and "abstract"; an empty list when nothing matched.
    """
    # Connection settings are read from environment variables on every call.
    connection = Elasticsearch(
        [os.getenv('ELASTICSEARCH_HOST')],
        http_auth=(os.getenv('ELASTICSEARCH_USER'), os.getenv('ELASTICSEARCH_PASSWORD')),
        verify_certs=False,  # SSL certificate validation is skipped
        timeout=60
    )

    # The query arrives as text; the client is handed the parsed request body.
    search_body = json.loads(query)
    hits = connection.search(search_body, index=index)['hits']['hits']
    if not hits:
        return []

    # Keep only the fields the rest of the pipeline works with.
    return [
        {
            "id": hit['_id'],
            "title": hit['_source'].get('title', 'No title available'),
            "abstract": hit['_source'].get('abstract', 'No abstract available')
        }
        for hit in hits
    ]

## Query Expansion

In [None]:
from openai import OpenAI
import os
import json
import re
import traceback
import fireworks.client
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# OpenAI client shared by the *_openai helpers; constructed without explicit
# arguments (presumably it picks up its API key from the environment — confirm).
client = OpenAI()

# os.environ[...] raises KeyError right here if the Fireworks key is not set.
fireworks_api_key = os.environ["FIREWORKS_API_KEY"]

#mistral_api_key = os.environ["MISTRAL_API_KEY"]

#mistral_client = MistralClient(api_key=mistral_api_key)

# Register the key on the fireworks client module that ask_mixtral calls into.
fireworks.client.api_key = fireworks_api_key

def ask_mixtral(messages):
    """Send a chat conversation to the Fireworks-hosted Mixtral model.

    Args:
        messages: Chat messages, forwarded to the API unchanged.

    Returns:
        The object returned by fireworks.client.ChatCompletion.create for a
        non-streaming request with a single choice.
    """
    # Fixed request settings: temperature 0, one choice, no streaming.
    request_options = dict(
        model="accounts/fireworks/models/mixtral-8x7b-instruct",
        stream=False,
        n=1,
        messages=messages,
        stop=["<|im_start|>","<|im_end|>","<|endoftext|>"],
        top_p=1,
        top_k=40,
        presence_penalty=0,
        frequency_penalty=0,
        context_length_exceeded_behavior="truncate",
        temperature=0,
        max_tokens=32768,
    )
    return fireworks.client.ChatCompletion.create(**request_options)

def expand_query(question:str, model:str) -> str:
    """Turn ``question`` into an Elasticsearch query using the backend implied by ``model``.

    Model names beginning with 'mistral' are routed to expand_query_mistral;
    every other name is handled by expand_query_openai.
    """
    backend = expand_query_mistral if model.startswith('mistral') else expand_query_openai
    return backend(question, model)


def expand_query_openai(question:str, model:str) -> str:
    """Ask an OpenAI chat model to turn a question into an Elasticsearch query.

    The prompt consists of two worked examples (question -> query_string
    request) followed by the actual question; the model is called in JSON mode
    with temperature 0 and a fixed seed.

    Args:
        question: The biomedical question in natural language.
        model: Name of the OpenAI chat model to call.

    Returns:
        The content of the first completion choice, i.e. the search request
        as JSON text.
    """
    completion = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."},
             {"role": "user", "content": """Turn the following biomedical question into an effective elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json: 
            'Is CircRNA produced by back splicing of exon, intron or both, forming exon or intron circRNA?'"""},
            # The phrase quotes below are written as \\\" so the model sees \" and the
            # example is valid JSON. A plain \" is consumed by Python's string literal
            # and would leave bare quotes inside the JSON string value.
            {"role": "assistant", "content": """
            {
                "query": {
                    "query_string": {
                    "query": "(CircRNA OR \\\"circular RNA\\\") \\\"back splicing\\\" exon OR intron",
                    "fields": [
                        "title^10",
                        "abstract"
                    ],
                    "default_operator": "and"
                    }
                },
                "size": 50
                }
            """},
            {"role": "user", "content": """Turn the following biomedical question into an effective elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json:
            'Which factor is inhibited by Milvexian?'"""},
            {"role": "assistant", "content": """
             {
                "query": {
                    "query_string": {
                    "query": "Milvexian inhibitor (factor OR XIa OR FXIa)",
                    "fields": [
                        "title^10",
                        "abstract"
                    ],
                    "default_operator": "and"
                    }
                },
                "size": 50
                }
            """},
            {"role": "user", "content": f"""Turn the following biomedical question into an effective elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json:
             '{question}'"""},
        ],
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    print(completion)
    return completion.choices[0].message.content
    
#query = expand_query_openai("Can losartan reduce brain atrophy in Alzheimer\u0027s disease?", "gpt-3.5-turbo-0125")
#json.dumps(json.loads(query))



def expand_query_mistral(question: str, model:str) -> str:
    """Ask Mixtral to turn a question into an Elasticsearch query.

    The user message holds two worked examples followed by the actual
    question. Both examples keep "size" at the top level of the request body,
    next to "query", so the model is shown one consistent, well-formed shape.

    Args:
        question: The biomedical question in natural language.
        model: Not used in this function; the request is sent through
            ask_mixtral, which fixes the model itself.

    Returns:
        Whatever extract_json_response returns for the completion: the JSON
        text of the query, or an error message string.
    """

    messages = [
    {"role":"system", "content":"You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."},
    {"role":"user", "content":"""
             Turn this question into an effective elasticsearch query using the simple_query_string query type, return valid json: 
             'Is CircRNA produced by back splicing of exon, intron or both, forming exon or intron circRNA?'
             response:
             {
                "query": {
                    "bool": {
                    "must": [
                        {
                        "simple_query_string": {
                            "query": "CircRNA back splicing exon intron circular RNA",
                            "fields": ["title^10", "abstract"],
                            "default_operator": "and"
                        }
                        }
                    ],
                    "should": [
                        {
                        "match": {
                            "title": {
                            "query": "CircRNA",
                            "boost": 3
                            }
                        }
                        },
                        {
                        "match": {
                            "abstract": {
                            "query": "back splicing",
                            "boost": 2
                            }
                        }
                        },
                        {
                        "match_phrase": {
                            "abstract": {
                            "query": "exon intron circular RNA",
                            "boost": 2
                            }
                        }
                        }
                    ],
                    "minimum_should_match": 1
                    }
                },
                "size": 100
                }
             Turn this question into an effective elasticsearch query using the simple_query_string query type, return valid json: 
             'Which factor is inhibited by Milvexian?'
             response:
             {
                "query": {
                    "bool": {
                        "must": [{
                                "simple_query_string": {
                                    "query": "Milvexian inhibitor factor XIa FXIa",
                                    "fields": ["title^10", "abstract"],
                                    "default_operator": "and"
                                }
                            }
                        ],
                        "should": [{
                                "match": {
                                    "title": {
                                        "query": "Milvexian",
                                        "boost": 3
                                    }
                                }
                            }, {
                                "match": {
                                    "abstract": {
                                        "query": "inhibitor",
                                        "boost": 2
                                    }
                                }
                            }, {
                                "match": {
                                    "abstract": {
                                        "query": "factor XIa FXIa",
                                        "boost": 2
                                    }
                                }
                            }
                        ],
                        "minimum_should_match": 1
                    }
                },
                "size": 100
            }
            Turn this question into an effective elasticsearch query using the simple_query_string query type, return valid json: '"""+question+"' \nresponse:"}
    ]

    # No streaming
    chat_response = ask_mixtral(messages)
    return extract_json_response(chat_response)

def extract_json_response(chat_completion_response):
    """Pull the JSON object out of a chat completion and return it as text.

    The first choice's message content is searched for the span from the
    first '{' to the last '}'. Doubled backslashes are collapsed and '\\_' is
    turned into '_' before the text is validated with json.loads (presumably
    to undo markdown-style escaping produced by the model).

    Args:
        chat_completion_response: Completion object exposing
            ``choices[0].message.content``.

    Returns:
        The repaired JSON text when it parses. Otherwise one of the error
        message strings below is returned instead of raising, so callers only
        notice the failure when they try to parse the result.
    """
    # Check if the response is in the expected format
    if not getattr(chat_completion_response, 'choices', None):
        return "Invalid chat completion response format."

    # Extract the content from the first choice in the response
    content = chat_completion_response.choices[0].message.content
    print(content)

    # A completion without text would make re.search raise TypeError; report
    # it the same way as text that contains no JSON object.
    if not isinstance(content, str):
        return "No valid JSON object found in the content."

    # Greedy match with DOTALL: everything from the first '{' to the last '}'
    match = re.search(r"\{.*\}", content, re.DOTALL)
    if not match:
        return "No valid JSON object found in the content."

    json_string = match.group()

    # "\\_" is a backslash followed by an underscore, spelled with an escaped
    # backslash so the literal contains no invalid escape sequence.
    corrected_json_string = json_string.replace("\\\\", "\\").replace("\\_", "_")

    print("corrected string:"+corrected_json_string)
    try:
        # Parse only to validate; the text itself is what gets returned
        json.loads(corrected_json_string)
        return corrected_json_string
    except Exception as e:
        print("Error with corrected json string:")
        print(corrected_json_string)
        traceback.print_exc()
        return f"Error decoding JSON: {e}"
# Disabled usage example: it is wrapped in a bare string literal, so nothing
# inside it is executed when the cell runs.
"""
# Example usage
question = " Provide a list of compounds with well-characterized senomorphic activity."
model = "mistral-small"  # Replace with the actual model name
result = expand_query_mistral(question, model)
print(result)
"""

## Snippet Extraction

In [9]:
def extract_relevant_snippets(article:dict, question:str, model:str) -> list:
    """Extract the snippets of ``article`` that help answer ``question``.

    Args:
        article: Article record; the backends read its 'id', 'title' and
            'abstract' entries.
        question: The question text.
        model: Model name; names beginning with 'mistral' select the Mixtral
            backend, everything else the OpenAI backend.

    Returns:
        A list of snippet dicts (possibly empty).
    """
    if model.startswith('mistral'):
        return extract_relevant_snippets_mistral(article, question, model)
    return extract_relevant_snippets_openai(article, question, model)



def extract_relevant_snippets_openai(article, question, model):
    """Ask an OpenAI chat model for the passages of ``article`` relevant to ``question``.

    The model is called in JSON mode and asked for a string array named
    'snippets'; the returned passages are then located in the article by
    generate_snippets_from_sentences.

    Args:
        article: Article record, embedded into the prompt as-is.
        question: The question text.
        model: Name of the OpenAI chat model to call.

    Returns:
        A list of snippet dicts; empty when the reply contains no usable
        'snippets' array.
    """
    completion = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."},
            {"role": "user", "content": f"""Given this question: '{question}' extract relevant sentences or longer snippets from the following article that help answer the question. 
             If no relevant information is present, return an empty array. Return the extracted snippets as a json string array called 'snippets'. ```{article}```"""}
        ],
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    print(completion)
    reply = json.loads(completion.choices[0].message.content)

    # The requested key is not guaranteed to be present. A missing or
    # malformed array means "no snippets for this article" rather than an
    # exception that would abort processing of the whole question.
    sentences = reply.get('snippets') if isinstance(reply, dict) else None
    if not isinstance(sentences, list):
        sentences = []

    return generate_snippets_from_sentences(article, sentences)

def extract_relevant_snippets_mistral(article, question, model):
    """Ask Mixtral for the sentences of ``article`` relevant to ``question``.

    The reply is reduced to its JSON object, the 'sentences' array is read
    from it and each sentence is located in the article. Any failure along
    the way is printed and results in an empty list.
    """
    system_message = {"role":"system", "content":"You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."}
    user_message = {"role":"user", "content":f"""Given this question: '{question}' extract relevant sentences from the following article that help answer the question. 
             If no relevant information is present, return an empty array. Return the extracted sentences as a valid json object containing a json string array called 'sentences'. 
         Example: {{"sentences":["sentence1", "sentence2"]}}```{article}```"""}
    messages = [system_message, user_message]

    print("\nchat question:")
    print(messages)
    print("\n")
    try:
        chat_response = ask_mixtral(messages)

        print("chat response extract snippets:")
        print(chat_response)
        sentences = json.loads(extract_json_response(chat_response))
        print("sentences extract snippets:")
        print(sentences)
        return generate_snippets_from_sentences(article, sentences['sentences'])
    except Exception:
        # Best effort: a failed extraction yields no snippets for this article.
        print("Error while extracting snippets")
        traceback.print_exc()
        return []


def find_offset_and_create_snippet(document_id, text, sentence, section):
    """Build a snippet record for ``sentence`` as found in ``text``.

    Args:
        document_id: Identifier stored under "document".
        text: The section text that is searched.
        sentence: The passage to locate; stored under "text".
        section: Section name stored as both begin and end section.

    Returns:
        A dict with the begin/end offsets of the first occurrence of
        ``sentence`` in ``text``. str.find semantics apply, so the begin
        offset is -1 when the sentence does not occur.
    """
    start = text.find(sentence)
    snippet = dict(
        document=document_id,
        offsetInBeginSection=start,
        offsetInEndSection=start + len(sentence),
        text=sentence,
        beginSection=section,
        endSection=section,
    )
    return snippet

def generate_snippets_from_sentences(article, sentences):
    """Turn model-returned sentences into snippet records with offsets.

    Each sentence is looked up verbatim, first in the article title and then
    in the abstract; sentences found in neither are dropped.

    Args:
        article: Article record with 'id' and, optionally, 'title' and
            'abstract' (either may be missing or None).
        sentences: Iterable of candidate passages as returned by the model.

    Returns:
        A list of snippet dicts, in the order of ``sentences``.
    """
    # Treat a missing or None field as empty text so the substring tests
    # below cannot fail with a TypeError/KeyError.
    article_title = article.get('title') or ''
    article_abstract = article.get('abstract') or ''

    snippets = []
    for sentence in sentences:
        # Models occasionally return blanks or non-string items. A blank
        # string is a substring of every text and would yield an empty snippet.
        if not isinstance(sentence, str) or not sentence.strip():
            continue

        if sentence in article_title:
            snippets.append(find_offset_and_create_snippet(article['id'], article_title, sentence, "title"))
        elif sentence in article_abstract:
            snippets.append(find_offset_and_create_snippet(article['id'], article_abstract, sentence, "abstract"))

    return snippets

## Reranking

In [10]:
def select_top_snippets(snippets:list, question:str, model:str) -> list:
    """Rerank ``snippets`` for ``question`` and keep the most helpful ones.

    Args:
        snippets: List of snippet dicts (each with a 'text' entry).
        question: The question text.
        model: Model name; names beginning with 'mistral' select the Mixtral
            backend, everything else the OpenAI backend.

    Returns:
        The selected snippet dicts in ranked order; an empty list when there
        is nothing to rank (no model call is made in that case).
    """
    if len(snippets) == 0:
        return []
    if model.startswith('mistral'):
        return select_top_snippets_mistral(snippets, question, model)
    return select_top_snippets_openai(snippets, question, model)


def select_top_snippets_openai(snippets, question, model):
    """Let an OpenAI chat model pick and rank the most helpful snippets.

    The snippets are numbered by position, the model returns the chosen
    positions under the key 'snippets', and those positions are mapped back
    to the snippet dicts.

    Args:
        snippets: List of snippet dicts (each with a 'text' entry).
        question: The question text.
        model: Name of the OpenAI chat model to call.

    Returns:
        The selected snippet dicts in the order given by the model. Ids that
        are not valid positions, and repeated ids, are ignored.
    """
    numbered_snippets = [{'id': idx, 'text': snippet['text']} for idx, snippet in enumerate(snippets)]
    completion = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."},
            {"role": "user", "content": f"""Given this question: '{question}' select the top 10 snippets that are most helpfull for answering this question from
             this list of snippets, rerank them by helpfullness: ```{numbered_snippets}``` return a json array of their ids called 'snippets'"""}
        ],
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    print(completion)
    snippets_idx = json.loads(completion.choices[0].message.content)['snippets']

    # The ids come from the model, so validate them before indexing: an
    # out-of-range id would raise IndexError, a negative one would silently
    # pick from the end of the list, and a repeated one would duplicate a snippet.
    filtered_array = []
    used_positions = set()
    for position in snippets_idx:
        if isinstance(position, bool) or not isinstance(position, int):
            continue
        if not 0 <= position < len(snippets) or position in used_positions:
            continue
        used_positions.add(position)
        filtered_array.append(snippets[position])

    return filtered_array
    
def select_top_snippets_mistral(snippets, question, model):
    """Let Mixtral pick and rank the most helpful snippets.

    The snippets are numbered by position, the model returns the chosen
    positions under the key 'snippets', and those positions are mapped back
    to the snippet dicts.

    Args:
        snippets: List of snippet dicts (each with a 'text' entry).
        question: The question text.
        model: Not used in this function; the request is sent through
            ask_mixtral, which fixes the model itself.

    Returns:
        The selected snippet dicts in the order given by the model. Ids that
        are not valid positions, and repeated ids, are ignored.
    """
    numbered_snippets = [{'id': idx, 'text': snippet['text']} for idx, snippet in enumerate(snippets)]
    messages = [
        {"role":"system", "content":"You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."},
        {"role":"user", "content":f"""Given this question: '{question}' select the top 10 snippets that are most helpfull for answering this question from
            this list of snippets, rerank them by helpfullness: ```{numbered_snippets}``` return a json array of their ids called 'snippets'"""}
    ]

    chat_response = ask_mixtral(messages)

    print("chat response select snippets:")
    print(chat_response)
    print("snippets idx:")
    # This request does not use a JSON response mode, so the reply may wrap
    # the object in prose. Reduce it to the JSON object first, as the other
    # Mixtral helpers do, instead of parsing the raw content.
    snippets_idx = json.loads(extract_json_response(chat_response))['snippets']
    print(snippets_idx)

    # The ids come from the model, so validate them before indexing: an
    # out-of-range id would raise IndexError, a negative one would silently
    # pick from the end of the list, and a repeated one would duplicate a snippet.
    filtered_array = []
    used_positions = set()
    for position in snippets_idx:
        if isinstance(position, bool) or not isinstance(position, int):
            continue
        if not 0 <= position < len(snippets) or position in used_positions:
            continue
        used_positions.add(position)
        filtered_array.append(snippets[position])

    return filtered_array
    

## Synergy Run

In [None]:
import json
import datetime
import time
import traceback
import pickle


# Model used for every LLM step of the run. The dispatch helpers route names
# that start with "mistral" to the Mixtral code path and all others to OpenAI.
model_name = "gpt-3.5-turbo-0125"
#model_name = "gpt-4-0125-preview"
#model_name = "mistral-small"

def append_to_logfile(logfile_name, text):
    """Append ``text`` plus a trailing newline to the UTF-8 file ``logfile_name``.

    The file is opened in append mode, so it is created on first use and
    earlier entries are kept.
    """
    with open(logfile_name, mode='a', encoding='utf-8') as handle:
        handle.write("".join((text, "\n")))

# Get the current timestamp in a sortable format
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Per-run log file names, prefixed with the timestamp and the model name.
# debug_logfile is written by get_relevant_articles_and_snippets; logfile_name
# is not referenced in the cells visible here.
logfile_name = f"{timestamp}_{model_name}_PhaseA_No_Expansion_log_file.json"
debug_logfile = f"{timestamp}_{model_name}_PhaseA_No_Expansion_Debug_log_file.json"

def reorder_articles_by_snippet_sequence(relevant_article_ids, snippets):
    """Order article ids by the first appearance of their snippets.

    Args:
        relevant_article_ids: Article ids in their current order.
        snippets: Snippet dicts, each naming its article under 'document'.

    Returns:
        A list that starts with the ids referenced by ``snippets`` (first
        mention wins, ids not in ``relevant_article_ids`` are ignored) and
        ends with the remaining ids in their original order.
    """
    # dict keys keep insertion order, which gives an ordered de-duplication.
    cited_first = dict.fromkeys(
        snippet['document']
        for snippet in snippets
        if snippet['document'] in relevant_article_ids
    )

    # Articles that no snippet refers to keep their relative order at the end.
    uncited = [article_id for article_id in relevant_article_ids if article_id not in cited_first]

    return list(cited_first) + uncited

def get_relevant_articles_and_snippets(question_id, question, exclude_documents_by_question):
    """Retrieve articles for a question and attach the snippets extracted from them.

    Steps: expand the question into an Elasticsearch query, run it, drop the
    articles excluded for this question, then extract snippets from at most
    the first 50 remaining articles.

    Args:
        question_id: Id used to look up the excluded documents.
        question: The question text.
        exclude_documents_by_question: Mapping of question id to the set of
            document ids to leave out.

    Returns:
        The articles for which snippets were found, each carrying them under
        'snippets'.
    """
    print("Question: "+question)
    # Debug entries are written with json.dumps, one object per line, so that
    # quotes and newlines in the question or query cannot break the log.
    append_to_logfile(debug_logfile, json.dumps({"question": question}, ensure_ascii=False))
    query = expand_query(question, model_name)
    append_to_logfile(debug_logfile, json.dumps({"query": query}, ensure_ascii=False))
    relevant_articles = run_elasticsearch_query(query)
    print("number of relevant articles before filter:")
    print(len(relevant_articles))
    exclude_documents = exclude_documents_by_question.get(question_id, set())
    filtered_articles = [article for article in relevant_articles if article['id'] not in exclude_documents]
    print("number of relevant articles after filter:")
    print(len(filtered_articles))

    # Snippet extraction costs one model call per article, hence the cap.
    filtered_articles = get_relevant_snippets(filtered_articles[:50], question)

    return filtered_articles

def read_feedback_file(feedback_file_path):
    """Read a feedback file and collect the document ids listed per question.

    Args:
        feedback_file_path: Path of a JSON file with a top-level 'questions'
            list; each question has an 'id' and, optionally, a 'documents'
            list whose items carry an 'id'.

    Returns:
        A dict mapping each question id to the set of its document ids. A
        question without documents maps to an empty set.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(feedback_file_path, 'r', encoding='utf-8') as file:
        feedback_data = json.load(file)

    return {
        question['id']: {document['id'] for document in question.get('documents') or []}
        for question in feedback_data['questions']
    }


def get_relevant_snippets(articles, question):
    """Extract snippets for every article and keep the articles that yielded some.

    Each kept article dict is updated in place with a 'snippets' entry.

    Args:
        articles: Article dicts to process.
        question: The question text.

    Returns:
        The articles with at least one snippet, in input order.
    """
    articles_with_snippets = []

    for candidate in articles:
        found = extract_relevant_snippets(candidate, question, model_name)
        if not found:
            continue
        candidate['snippets'] = found
        articles_with_snippets.append(candidate)

    return articles_with_snippets



def save_state(data, file_path='state.pkl'):
    """Save the current state to a pickle file.

    The data is first written to a temporary file next to ``file_path`` and
    then moved into place, so an interrupted or failed write never destroys
    the previously saved state.

    Args:
        data: Picklable object to store.
        file_path: Destination of the pickle file.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        # os.replace overwrites the destination in a single step.
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def load_state(file_path='state.pkl'):
    """Load the state from a pickle file.

    Args:
        file_path: Location of the pickle file.

    Returns:
        The unpickled object, or None when the file does not exist, is empty
        or holds a truncated/corrupt pickle.
    """
    if not os.path.exists(file_path):
        return None
    try:
        with open(file_path, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load state
            # files written by this notebook.
            return pickle.load(f)
    except (EOFError, pickle.UnpicklingError):
        # An empty file raises EOFError; a partially written one raises
        # UnpicklingError. Either way there is no usable state to resume from.
        print(f"Ignoring unreadable state file: {file_path}")
        return None


# Read excluded document IDs from the feedback file
exclude_documents_by_question = read_feedback_file("./Round4/BioASQ-taskSynergy_2024-feedback_round4.json")

# Load the input file in JSON format
input_file_name = './Round4/BioASQ-taskSynergy_v2024-testset4.json'
#input_file_name = './test_queries'
with open(input_file_name, encoding='utf-8') as input_file:
    data = json.loads(input_file.read())

# Try to load the saved state
saved_state = load_state()
results = saved_state if saved_state else []
offset = len(results)  # Number of questions restored from the saved state

# Resume by question id rather than by position: a question that failed all
# its retries leaves no entry in `results`, so a positional offset would skip
# the wrong questions and process others twice.
processed_question_ids = {entry["id"] for entry in results}

for idx, question in enumerate(data["questions"]):
    print(idx)
    if question["id"] in processed_question_ids:
        continue

    retry_count = 0  # Initialize a counter for retries
    while retry_count < 2:  # At most two attempts per question
        try:
            # Retrieve the articles and extract their snippets
            print("processing question "+str(idx))
            print(question)
            relevant_articles = get_relevant_articles_and_snippets(question["id"], question['body'], exclude_documents_by_question)
            relevant_articles_ids = [article['id'] for article in relevant_articles]
            relevant_snippets = [snippet for article in relevant_articles for snippet in article['snippets']]
            # Pass the question text, not the whole question record, so the
            # reranking prompt contains the question itself.
            relevant_snippets = select_top_snippets(relevant_snippets, question['body'], model_name)
            relevant_articles_ids = reorder_articles_by_snippet_sequence(relevant_articles_ids, relevant_snippets)

            # Create a dictionary to store the results for this question
            question_results = {
                "body": question["body"],
                "id": question["id"],
                "type": question["type"],
                "documents": relevant_articles_ids[:10],
                "snippets": relevant_snippets
            }


            # Add the results for this question to the list of all results
            results.append(question_results)
            save_state(results)

            break  # If no exception is thrown, break the loop
        except Exception as e:
            print(f"Error processing question {idx}: {e}")
            traceback.print_exc()
            retry_count += 1  # Increment the retry counter
            time.sleep(5)  # Sleep for 5 seconds before retrying

# Create a dictionary to store the results for all questions
output = {
    "questions": results
}

# Prefix the output file name with the timestamp
output_file_name = f"./Round4/Result/{timestamp}_{model_name}_missing_Synergy_output_file.json"

# Ensure the directory exists before saving
os.makedirs(os.path.dirname(output_file_name), exist_ok=True)

# Save the output to a file in pretty-formatted JSON format
with open(output_file_name, "w") as f:
    json.dump(output, f, indent=4)

# After processing all questions and saving the final output:
try:
    # Check if the pickle file exists before attempting to delete it
    if os.path.exists('state.pkl'):
        os.remove('state.pkl')
        print("Intermediate state pickle file deleted successfully.")
except Exception as e:
    print(f"Error deleting pickle file: {e}")