In [None]:
%pip install Biopython openai "elasticsearch<8" python-dotenv mistralai fireworks-ai
%pip install --upgrade pandas


## Retrieval

In [None]:
import os
import json
from elasticsearch import Elasticsearch
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

#Suppress warnings about elasticsearch certificates
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def run_elasticsearch_query(query, index=["pubmed", "pubmed_update"]):
    # Retrieve Elasticsearch details from environment variables
    es_host = os.getenv('ELASTICSEARCH_HOST')
    es_user = os.getenv('ELASTICSEARCH_USER')
    es_password = os.getenv('ELASTICSEARCH_PASSWORD')

    # Connect to Elasticsearch
    es = Elasticsearch(
        [es_host],
        http_auth=(es_user, es_password),
        verify_certs=False,  # This will ignore SSL certificate validation
        timeout=60  # Set the timeout to 60 seconds (adjust as needed)
    )

    # Convert the query string to a dictionary
    if isinstance(query, str) and not isinstance(query, dict):
        query_dict = json.loads(query)
    else:
        query_dict = query
    actual_query_string = query_dict["query"]["query_string"]["query"]
    query_dict["query"]["query_string"]["query"] = actual_query_string.replace("/", "\\/")

    # Execute the query
    response = es.search(query_dict, index=index)

    # Process the response to extract the required information
    results = []
    if response['hits']['hits']:
        for hit in response['hits']['hits']:
            result = {
                "id": hit['_id'],
                "title": hit['_source'].get('title', 'No title available'),
                "abstract": hit['_source'].get('abstract', 'No abstract available')
            }
            results.append(result)
    print(f"docs found: {len(results)}")
    return results

#run_elasticsearch_query(expand_query("*"))

## Query Expansion

In [3]:
from openai import OpenAI
import pandas as pd


client = OpenAI()

def expand_query_manual(question:str, model:str) -> str:
    completion = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."},
             {"role": "user", "content": f"""Turn the following biomedical question into an effective elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json: 
            'Is CircRNA produced by back splicing of exon, intron or both, forming exon or intron circRNA?'"""},
            {"role": "assistant", "content": """
            {
                "query": {
                    "query_string": {
                    "query": "(CircRNA OR \"circular RNA\") AND \"back splicing\" AND (exon OR intron)",
                    "fields": [
                        "title^10",
                        "abstract"
                    ],
                    "default_operator": "AND"
                    }
                },
                "size": 50
                }
            """},
            {"role": "user", "content": f"""Turn the following biomedical question into an effective elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json:
            'Which factor is inhibited by Milvexian?'"""},
            {"role": "assistant", "content": """
             {
                "query": {
                    "query_string": {
                    "query": "Milvexian AND inhibitor AND (factor OR XIa OR FXIa)",
                    "fields": [
                        "title^10",
                        "abstract"
                    ],
                    "default_operator": "AND"
                    }
                },
                "size": 50
                }
            """},
            {"role": "user", "content": f"""Turn the following biomedical question into an effective elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json:
             '{question}'"""},
        ],
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    return completion.choices[0].message.content

def expand_query_few_shot(df_prior, n, question:str, model:str):
    messages = generate_n_shot_examples(df_prior, n)
    # Add the user message
    user_message = {
        "role": "user",
        "content": f"""Turn the following biomedical question into an effective Elasticsearch query using the query_string query type 
        by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
        Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json:
        '{question}'"""
    }
    messages.append(user_message)
    completion = client.chat.completions.create(
        model=model,
        response_format={ "type": "json_object" },
        messages=messages,
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    print(completion)
    return completion.choices[0].message.content


def generate_n_shot_examples(df, n):
    # Sort the DataFrame by 'f1_score' in descending order and select the top n entries
    top_entries = df.sort_values(by='recall', ascending=False).head(n)
    
    # Initialize the system message
    system_message = {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, research, and information retrieval in the biomedical domain."}
    
    # Initialize the list of messages with the system message
    messages = [system_message]
    
    # Loop through each of the top n entries and add the user and assistant messages
    for _, row in top_entries.iterrows():
        question = row['question_body']
        query = row['query']
        
        # Replace problematic characters in question
        question = question.replace("/", "\\\\/")
        
        # Add the user message
        user_message = {
            "role": "user",
            "content": f"""Turn the following biomedical question into an effective Elasticsearch query using the query_string query type 
            by incorporating synonyms and additional terms that closely relate to the main topic and help reduce ambiguity. 
            Focus on maintaining the query's precision and relevance to the original question, the index contains the fields 'title' and 'abstract', return valid json:
            '{question}'"""
        }
        
        # Add the assistant message
        assistant_message = {
            "role": "assistant",
            "content": query  # Assuming 'query' is already in the desired JSON format
        }
        
        messages.extend([user_message, assistant_message])
    
    # Now, messages contains n-shot examples formatted as needed
    # Here you would make your call to the language model using the messages
    # For demonstration, we'll just return the messages
    return messages

# Assuming df is your DataFrame and n is the number of examples you want
# df = pd.read_csv('path_to_your_csv_file.csv')
# n = 5  # For example, taking top 5 entries
# n_shot_examples = generate_n_shot_examples(df, n)
# Now n_shot_examples can be used in your call to the language model


### Query Refinement

In [4]:
def refine_query_with_no_results(question, original_query, model):
    messages = [
        {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, \
            research, and information retrieval in the biomedical domain."},
        {"role": "user", "content": f"""Given that the following search query using the elasticsearch query_string query syntax has returned
        no documents, please generate a broader query that retains the original question's context and relevance.
        Return only the query that can directly be used without any explanation text. 
        Focus on maintaining the query's precision and relevance to the original question. 
        Original question: '{question}', Original query that returned no results: '{original_query}'."""}
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    return completion.choices[0].message.content

def refine_query_with_relevance_feedback(question, original_query, results, model):
    messages = [
        {"role": "system", "content": "You are BioASQ-GPT, an AI expert in question answering, \
            research, and information retrieval in the biomedical domain."},
        {"role": "user", "content": f"""This search query  ('{original_query}') has returned these results contained in tripple backtics:
         ```{results}```
        The goal is to find documents that help answer this question: '{question}' 
        Create a better query that finds more relevant documents to answer this question.
        Return only the query string that can directly be used without any explanation text."""}
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0, # randomness of completion
        logprobs=False,
        seed=90128538
    )
    return completion.choices[0].message.content


In [5]:
import json
import datetime
import pickle
import traceback
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
pickl_file = "state.pkl"
model_name = "gpt-3.5-turbo-0125"
#model_name = "gpt-4-0125-preview"
n_shots = 20



def process_question(df_prior, n, question):
    try:
        print(f"Processing question {question['id']}")

        #query = expand_query_few_shot(df_prior, n, question['body'],model_name)
        query = expand_query_manual(question['body'],model_name)
        query = json.loads(query)
        actual_query_string = query["query"]["query_string"]["query"]
        print(f"Original Query: {actual_query_string}")
        relevant_articles = run_elasticsearch_query(query)
        """
        if len(relevant_articles) == 0:
            actual_query_string = refine_query_with_no_results(question['body'], actual_query_string, model_name)
            query["query"]["query_string"]["query"] = actual_query_string
            print(f"Refined after 0 Query: {actual_query_string}")
            relevant_articles = run_elasticsearch_query(query)
        #actual_query_string = refine_query_with_relevance_feedback(question['body'], actual_query_string, relevant_articles[:5], model_name)
        #query["query"]["query_string"]["query"] = actual_query_string
        print(f"Refined Query: {actual_query_string}")
        relevant_articles = run_elasticsearch_query(query)
        """
            
        relevant_articles_urls = ["http://www.ncbi.nlm.nih.gov/pubmed/"+article['id'] for article in relevant_articles]

        return {
            "question_id": question["id"],
            "question_body": question["body"],
            "question_type": question["type"],
            "query": query,
            "documents": relevant_articles_urls,
            "snippets": []
        }
    except Exception as e:
        print(f"Error processing question {question['id']}: {e}")
        traceback.print_exc()
        return None
    

def save_state(data, file_path=pickl_file):
    """Save the current state to a pickle file."""
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)

def load_state(file_path=pickl_file):
    """Load the state from a pickle file if it exists, otherwise return None."""
    try:
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                return pickle.load(f)
    except EOFError:  # Handles empty pickle file scenario
        return None
    return None

# Define columns
columns = ['question_id', 'question_body', 'question_type', 'query', 'documents', 'snippets']

# Initialize empty DataFrame
questions_df = pd.DataFrame(columns=columns)

# Load the input file in JSON format
input_file_name = './11B2_golden.json'
#input_file_name = './test_queries'
with open(input_file_name) as input_file:
    data = json.loads(input_file.read())

# Assuming 'load_state' returns a DataFrame or None
saved_df = load_state(pickl_file)

if saved_df is not None and not saved_df.empty:
    processed_ids = set(saved_df['question_id'])  # Assuming 'question_id' is your identifier
    questions_df = saved_df
else:
    processed_ids = set()


csv_prior_file_path = './Results/2024-02-21_15-55-52_gpt-4-0125-preview_QueryExpansion_11B1.csv'
df_prior = pd.read_csv(csv_prior_file_path)

# Assuming `data["questions"]` is your list of questions to process
# Filter out questions that have already been processed
questions_to_process = [q for q in data["questions"] if q["id"] not in processed_ids]


# Use ThreadPoolExecutor to process questions in parallel
with ThreadPoolExecutor(max_workers=3) as executor:
    # Dictionary to keep track of question futures
    future_to_question = {executor.submit(process_question, df_prior, n_shots,  q): q for q in questions_to_process}
    
    for future in as_completed(future_to_question):
        question = future_to_question[future]
        try:
            result = future.result()
            if result:
                # Append result to the DataFrame
                result_df = pd.DataFrame([result])
                questions_df = pd.concat([questions_df, result_df], ignore_index=True)
                save_state(questions_df, pickl_file)
        except Exception as e:
            print(f"Error processing question {question['id']}: {e}")
            traceback.print_exc()


# Prefix the output file name with the timestamp
output_file_name = f"./Results/{timestamp}_{model_name}_QueryExpansion_Manual_Refine_Default_And_11B2.csv"

# Ensure the directory exists before saving
os.makedirs(os.path.dirname(output_file_name), exist_ok=True)

questions_df.to_csv(output_file_name, index=False)

# After processing all questions and saving the final output:
try:
    # Check if the pickle file exists before attempting to delete it
    if os.path.exists(pickl_file):
        os.remove(pickl_file)
        print("Intermediate state pickle file deleted successfully.")
except Exception as e:
    print(f"Error deleting pickle file: {e}")

Processing question 63f03006f36125a426000018
Processing question 641ad941690f196b5100003d
Processing question 642a029d57b1c7a315000011
Original Query: (LOQ OR limit of quantification OR LOD OR limit of detection) AND (same OR equal)
Original Query: ("Histone acetyltransferases" OR HATs OR "acetyltransferases") AND ("acetyl groups" OR acetylation) AND (histones OR lysine) AND purpose
Original Query: (DESTINY-Breast04 AND Trial) OR (trastuzumab deruxtecan AND breast cancer)


  response = es.search(query_dict, index=index)


docs found: 50
docs found: 35
docs found: 50
Refined Query: '(DESTINY-Breast04 AND Trial) OR (trastuzumab deruxtecan AND breast cancer AND results)'
docs found: 50
Processing question 6432fc0457b1c7a31500001f
Refined Query: '(LOQ OR limit of quantification) AND (LOD OR limit of detection) AND (same OR equal OR identical OR equivalent)'
Refined Query: ('Histone acetyltransferases' OR HATs OR 'acetyltransferases') AND ('acetyl groups' OR acetylation) AND (histones OR lysine) AND 'purpose of acetylation transfer'
docs found: 3
Processing question 64178edb690f196b51000025
docs found: 50
Processing question 6402bf2b201352f04a000007
Original Query: (hereditary angioedema) AND (mutated OR mutation) AND (gene OR genetic)
docs found: 50
Original Query: (casimersen OR SRP-4045) AND mechanism AND action
docs found: 1
Refined Query: '(hereditary angioedema) AND (mutated OR mutation) AND (gene OR genetic) AND (most frequently mutated)'
Original Query: (tiragolumab OR MTIG7192A) AND mechanism AND ac

Traceback (most recent call last):
  File "C:\Users\Samy Ateia\AppData\Local\Temp\ipykernel_73372\655031348.py", line 23, in process_question
    query = json.loads(query)
            ^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\json\decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
               ^^^^^^^^^^^^^^^^^^^^^^
json.decoder.JSONDecodeError: Expecting ',' delimiter: line 4 column 137 (char 179)


Original Query: (cells OR cell types) AND produce AND erythroferrone
docs found: 7
Refined Query: '(cells OR cell types) AND produce AND erythroferrone'
docs found: 7
Processing question 6429f86657b1c7a31500000d
Original Query: (Mediator OR transcriptional mediator) AND (gene expression OR transcription regulation)
docs found: 50
Refined Query: '(Mediator OR transcriptional mediator) AND (gene expression OR transcription regulation)'
docs found: 50
Processing question 6415c0df690f196b51000010
Original Query: (FBDD OR 'Fragment-Based Drug Discovery')
docs found: 50
Refined Query: '(FBDD OR "Fragment-Based Drug Discovery")'
docs found: 50
Processing question 64041dae201352f04a00001d
Original Query: (deucravacitinib OR BMS-986165) AND (efficacy OR effectiveness) AND psoriasis
docs found: 50
Refined Query: '(deucravacitinib OR BMS-986165) AND (efficacy OR effectiveness) AND (psoriasis AND clinical trial)'
docs found: 31
Processing question 6415b6eb690f196b5100000b
Original Query: (orexin O

Traceback (most recent call last):
  File "C:\Users\Samy Ateia\AppData\Local\Temp\ipykernel_73372\655031348.py", line 23, in process_question
    query = json.loads(query)
            ^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\json\decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
               ^^^^^^^^^^^^^^^^^^^^^^
json.decoder.JSONDecodeError: Expecting ',' delimiter: line 4 column 141 (char 183)


docs found: 50
Refined Query: ('(COVID-19 OR SARS-CoV-2 OR "coronavirus disease 2019") AND (respiratory illness OR respiratory infection OR influenza) AND (distinguish OR differentiate OR clinical)')
Original Query: (paclitaxel OR Taxol OR docetaxel OR Taxotere OR doxorubicin OR epirubicin) AND (cancer OR carcinoma)
docs found: 50
Processing question 63eeec79f36125a426000006
docs found: 50
Refined Query: '(paclitaxel OR Taxol OR docetaxel OR Taxotere OR doxorubicin OR epirubicin) AND (approved AND cancer)'
docs found: 50
Processing question 642321a5690f196b51000047
Original Query: (Talquetamab OR JNJ-64407564) AND (treatment OR therapy OR management) AND (disease OR condition)
docs found: 3
Refined Query: '(Talquetamab OR JNJ-64407564) AND (treatment OR therapy OR management) AND (multiple myeloma)'
docs found: 19
Processing question 6428d7da690f196b51000050
Original Query: (Feline Spongiform Encephalopathy OR FSE) AND summarize
docs found: 2
Refined Query: '(Indocyanine green OR ICG) 

Traceback (most recent call last):
  File "C:\Users\Samy Ateia\AppData\Local\Temp\ipykernel_73372\655031348.py", line 35, in process_question
    relevant_articles = run_elasticsearch_query(query)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Samy Ateia\AppData\Local\Temp\ipykernel_73372\3659210096.py", line 37, in run_elasticsearch_query
    response = es.search(query_dict, index=index)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Samy Ateia\AppData\Roaming\Python\Python312\site-packages\elasticsearch\client\utils.py", line 347, in _wrapped
    return func(*args, params=params, headers=headers, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Samy Ateia\AppData\Roaming\Python\Python312\site-packages\elasticsearch\client\__init__.py", line 1821, in search
    return self.transport.perform_request(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Samy Ateia\AppData\Roaming\Python\Python

docs found: 0
Processing question 61fa941ec9dfcb9c09000005
Original Query: (untranslated regions OR UTRs OR 5' UTR OR 3' UTR) AND regulate AND gene expression
docs found: 50
Original Query: (alternative splicing OR splicing variants) AND (heart disease OR cardiovascular disease)
Original Query: (Intepirdine OR RVT-101) AND (Alzheimer's disease OR AD) AND (use OR treatment)
docs found: 7
Refined Query: '(untranslated regions OR UTRs OR 5\' UTR OR 3\' UTR) AND regulate AND gene expression'
docs found: 50
docs found: 50
Processing question 6422e2f1690f196b51000043
Refined Query: '(alternative splicing AND heart disease)'
Refined Query: '(Intepirdine OR RVT-101) AND (Alzheimer\'s disease OR AD) AND (efficacy OR clinical trial OR review)'
docs found: 50
Processing question 64299fa957b1c7a315000002
docs found: 4
Processing question 6432fb1957b1c7a31500001e
Original Query: (Gal-Nac OR GalNAc OR Gal N-acetylglucosamine) AND (siRNA OR ASO OR antisense oligonucleotide) AND (human cells OR cells 

### Generate Run File

In [None]:
import pandas as pd
import json

def csv_to_json(csv_filepath, json_filepath):
    # Step 1: Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_filepath)
    
    # Transform the DataFrame into a list of dictionaries, one per question
    questions_list = df.to_dict(orient='records')
    
    # Initialize the structure of the JSON file
    json_structure = {"questions": []}
    
    # Step 2: Transform the DataFrame into the desired JSON structure
    for item in questions_list:
        question_dict = {
            "documents": item["documents"].strip("[]").replace("'", "").split(", "),
            "snippets": [],  # Assuming snippets are to be filled in manually or from another source
            "body": item["question_body"],
            "type": item["question_type"],
            "id": item["question_id"],
            # "ideal_answer" and "exact_answer" are placeholders; assuming they're to be added
            "ideal_answer": [],
            "exact_answer": ""
        }
        json_structure["questions"].append(question_dict)
    
    # Step 3: Write the JSON structure to a file
    with open(json_filepath, 'w', encoding='utf-8') as json_file:
        json.dump(json_structure, json_file, ensure_ascii=False, indent=4)

# Example usage
csv_filepath = './Results/2024-02-21_10-07-09_gpt-3.5-turbo-0125_QueryExpansion_11B1.csv'  # Update this path to your actual CSV file path
json_filepath = './Results/2024-02-21_10-07-09_gpt-3.5-turbo-0125_QueryExpansion_11B1_BioASQ-Run.json'  # Update this path to where you want to save the JSON file
csv_to_json(csv_filepath, json_filepath)
