In [3]:
import os
import requests
import json
from openai import OpenAI, AzureOpenAI

# pip install python-dotenv
from dotenv import load_dotenv

In [None]:
# Read the course-provided .env file so the Azure settings below can be fetched with os.getenv.
load_dotenv("/work/bioinformatics/s228627/module_3_materials/day3/resources/.env")

# --- Azure OpenAI Client Setup ---
# Endpoint, key and API version come from the environment; the deployment name is fixed here.
AZURE_API_ENDPOINT = os.getenv("AZURE_API_ENDPOINT")
AZURE_API_KEY = os.getenv("AZURE_API_KEY")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION")
AZURE_OPENAI_DEPLOYMENT_NAME = "gpt-4.1"

# Fail fast if any of the settings is missing or empty.
if not (
    AZURE_API_ENDPOINT
    and AZURE_API_KEY
    and AZURE_API_VERSION
    and AZURE_OPENAI_DEPLOYMENT_NAME
):
    raise ValueError("Azure OpenAI environment variables are not fully set. Please check your .env file.")

In [36]:
# Base URL handed to the OpenAI client when a locally served (Ollama) model is selected.
OLLAMA_ENDPOINT = "http://localhost:11434/v1"
# Model names that get_client routes to OLLAMA_ENDPOINT; any other name goes to Azure OpenAI.
OLLAMA_MODELS = ["qwen3:4b", "deepseek-r1:8b"]

In [9]:
def get_client(model_name: str):
    """Build the chat client that serves ``model_name``.

    Args:
        model_name: Name of the model the caller intends to query.

    Returns:
        An ``OpenAI`` client pointed at ``OLLAMA_ENDPOINT`` (with the fixed
        api_key string "ollama") when ``model_name`` is listed in
        ``OLLAMA_MODELS``; otherwise an ``AzureOpenAI`` client configured from
        the ``AZURE_API_*`` module-level settings.
    """
    # Anything that is not a known Ollama model is assumed to live on Azure.
    if model_name not in OLLAMA_MODELS:
        return AzureOpenAI(
            azure_endpoint=AZURE_API_ENDPOINT,
            api_key=AZURE_API_KEY,
            api_version=AZURE_API_VERSION,
        )

    return OpenAI(base_url=OLLAMA_ENDPOINT, api_key="ollama")

def create_messages(query):
    """Wrap ``query`` as a chat history containing a single user message."""
    user_turn = {"role": "user", "content": query}
    return [user_turn]

def ask_llm(query, model_name: str):
    """Send ``query`` as a single user message to ``model_name`` and return the reply text.

    A fresh client is obtained from ``get_client`` on every call; no tools or
    system prompt are attached to the request.

    Args:
        query: The user question to send.
        model_name: Model to query; also decides which backend ``get_client`` picks.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    completion = get_client(model_name).chat.completions.create(
        model=model_name,
        messages=create_messages(query),
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [13]:
# Model used by the plain ask_llm call below; swap which line is commented out
# to send the same question to the "gpt-4.1" model instead.
model_name = "qwen3:4b"
# model_name = "gpt-4.1"

In [None]:

# Baseline: ask the model directly (ask_llm attaches no tools) for the gene symbol.
print(ask_llm("Convert ENSG00000215251 to official gene symbol.", model_name))

In [None]:
# Manual check of the NCBI esearch endpoint: query the gene database for the Ensembl ID, JSON output.
!curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=ENSG00000215251&retmode=json"

In [None]:
# Manual check of the NCBI esummary endpoint for gene id 60493
# (presumably the UID returned by the esearch request above -- confirm against its output).
!curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id=60493&retmode=json"

In [15]:
# --- NCBI E-utils Tool Definitions ---
# Common URL prefix for the E-utilities endpoints. The tool functions append the
# endpoint file name (esearch.fcgi / esummary.fcgi) directly, so it must end with "/".
NCBI_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

def search_gene_id(query_id: str) -> str:
    """
    Searches for an NCBI gene ID (UID) given a gene identifier (e.g., Ensembl ID, gene symbol).

    The identifier is sent as a proper query parameter (URL-encoded by requests),
    so characters such as '&', '#' or spaces in model-supplied input cannot
    corrupt the request. The HTTP call is bounded by a timeout so a stalled
    connection cannot hang the tool loop.

    Args:
        query_id: Gene identifier to look up in the NCBI "gene" database.

    Returns:
        A JSON string: {"uid": <first UID found>} on success, otherwise
        {"error": ...} (plus "details" carrying NCBI's warning list when the
        search simply returned no hits). Never raises for network or parse errors.
    """
    print(f"TOOL: Called search_gene_id with query_id: {query_id}")

    # Network phase: only transport / HTTP-status failures are handled here.
    try:
        response = requests.get(
            f"{NCBI_BASE_URL}esearch.fcgi",
            params={"db": "gene", "term": query_id, "retmode": "json"},
            timeout=30,  # seconds; without it requests waits indefinitely
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"TOOL ERROR: search_gene_id failed: {e}")
        return json.dumps({"error": str(e)})

    # Parse phase: kept separate because newer requests versions raise a JSON
    # error that is also a RequestException, which would otherwise be reported
    # as a network failure. ValueError covers every JSON decode error variant.
    try:
        data = response.json()
    except ValueError as e:
        print(f"TOOL ERROR: search_gene_id JSON decode failed: {e}")
        return json.dumps({"error": "Failed to parse NCBI response."})

    search_result = data.get("esearchresult", {})
    id_list = search_result.get("idlist")
    if id_list:
        uid = id_list[0]
        print(f"TOOL: search_gene_id found UID: {uid}")
        return json.dumps({"uid": uid})

    print(f"TOOL: search_gene_id found no UID for {query_id}")
    return json.dumps({"error": f"No UID found for {query_id}", "details": search_result.get("warninglist", "")})


def summarize_gene_details(uid: str) -> str:
    """
    Retrieves a summary of gene information using esummary,
    including official symbol, name, and organism, for a given NCBI gene ID (UID).

    Args:
        uid: NCBI gene UID. Models frequently emit it as a JSON number, so the
            value is normalized to ``str`` before it is matched against the
            string keys of the esummary ``result`` object.

    Returns:
        A JSON string with the keys uid, official_symbol, official_full_name,
        description, organism and summary_text on success, otherwise a JSON
        string with an "error" key. Never raises for network or parse errors.
    """
    print(f"TOOL: Called summarize_gene_details with UID: {uid}")
    # The lookup below is by string key, so an int UID would never match.
    uid = str(uid).strip()

    # Network phase: only transport / HTTP-status failures are handled here.
    try:
        response = requests.get(
            f"{NCBI_BASE_URL}esummary.fcgi",
            params={"db": "gene", "id": uid, "retmode": "json"},
            timeout=30,  # seconds; without it requests waits indefinitely
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"TOOL ERROR: summarize_gene_details failed: {e}")
        return json.dumps({"error": str(e)})

    # Parse phase: kept separate because newer requests versions raise a JSON
    # error that is also a RequestException, which would otherwise be reported
    # as a network failure. ValueError covers every JSON decode error variant.
    try:
        data = response.json()
    except ValueError as e:
        print(f"TOOL ERROR: summarize_gene_details JSON decode failed: {e}")
        return json.dumps({"error": "Failed to parse NCBI response."})

    if "result" not in data or uid not in data["result"]:
        print(f"TOOL: summarize_gene_details found no summary for UID: {uid}")
        return json.dumps({"error": f"No summary found for UID {uid}", "original_response": data})

    summary_data = data["result"][uid]

    # NOTE(review): esummary appears to report unknown UIDs as a per-record
    # "error" field; surface it instead of returning a summary of all nulls.
    if summary_data.get("error"):
        print(f"TOOL: summarize_gene_details found no summary for UID: {uid}")
        return json.dumps({"error": f"No summary found for UID {uid}", "original_response": data})

    # Optional
    # Extract key information for easier use by LLM
    organism = summary_data.get("organism") or {}  # tolerate an explicit null
    extracted_summary = {
        "uid": summary_data.get("uid"),
        "official_symbol": summary_data.get("nomenclaturesymbol"),
        "official_full_name": summary_data.get("nomenclaturename"),
        "description": summary_data.get("description"),
        "organism": organism.get("scientificname"),
        "summary_text": summary_data.get("summary"),
    }
    print(f"TOOL: summarize_gene_details successful for UID: {uid}")
    return json.dumps(extracted_summary)


# Dispatch table from tool name to the Python callable implementing it.
# native_tool_loop looks up the function name of each model-requested tool call here,
# so the keys have to match the "name" fields declared in tools_definition.
AVAILABLE_FUNCTIONS = {
    "search_gene_id": search_gene_id,
    "summarize_gene_details": summarize_gene_details,
}

In [30]:
# --- Tool Schema Definition (function-calling format) ---
# Passed as `tools=` to client.chat.completions.create in native_tool_loop, for
# whichever client get_client returned (Ollama or Azure OpenAI).
# Each entry names one function from AVAILABLE_FUNCTIONS and describes its
# arguments as a JSON-Schema object; parameter names must match the Python
# keyword arguments because the loop calls the function with **arguments.
tools_definition = [
    {
        "type": "function",
        "function": {
            "name": "search_gene_id",
            "description": "Searches for an NCBI gene ID (UID) using a provided gene identifier (like an Ensembl ID or gene symbol).",
            "parameters": {
                "type": "object",
                "properties": {
                    "query_id": {
                        "type": "string",
                        "description": "The gene identifier to search for (e.g., 'ENSG00000215251', 'TP53')."
                    }
                },
                "required": ["query_id"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "summarize_gene_details",
            "description": "Retrieves a structured summary of gene information (official symbol, name, organism, etc.) for a given NCBI gene ID (UID).",
            "parameters": {
                "type": "object",
                "properties": {
                    "uid": {
                        "type": "string",
                        "description": "The NCBI gene ID (UID) obtained from search_gene_id."
                    }
                },
                "required": ["uid"]
            }
        }
    }
]


In [25]:
# System message placed first in every conversation started by native_tool_loop.
# It tells the model to chain tool calls and to give its final answer only once
# no further tool call is needed.
SYSTEM_PROMPT = """
You are a helpful AI assistant designed to retrieve biomedical information from NCBI using specific tools.
Use the provided tools sequentially to achieve the user's goal.
Your goal is to answer the user's question by calling a sequence of tools.

Workflow:
1.  Use tools as needed to get the information you need.
2.  After all tool calls are made and you have the information, provide a final answer to the user.

Remember: 
- Once you make a tool call, the response will be fed back to you.
- Once you do not need to make any more tool calls - you can output the final answer.
- Plan the sequence of your tool calls appropriately.
"""

In [31]:
def make_messages(query, system_prompt, existing_messages=None):
    """Return a chat history ending with ``query`` as a user message.

    Args:
        query: Text of the user message to append.
        system_prompt: Content of the system message that opens a new history.
            Ignored when ``existing_messages`` is supplied.
        existing_messages: Optional history to continue. It is extended
            IN PLACE and the very same list object is returned.

    Returns:
        The history list: either a new ``[system, user]`` list, or
        ``existing_messages`` itself with the user message appended.
    """
    history = (
        [{"role": "system", "content": system_prompt}]
        if existing_messages is None
        else existing_messages
    )
    history.append({"role": "user", "content": query})
    return history

In [44]:
# --- Main Loop using Native Tool Calling ---
# Upper bound on model round-trips per query; default for native_tool_loop's max_turns.
MAX_TURNS = 7
# NOTE: rebinds the notebook-level model_name that an earlier cell also sets.
model_name = "qwen3:4b"
# model_name = "gpt-4.1"
# Client shared by native_tool_loop, which reads `client` and `model_name` as globals;
# re-run this cell after changing model_name so the two stay in sync.
client = get_client(model_name)

def _execute_tool_call(tool_call) -> dict:
    """Run one tool call requested by the model and build its ``role="tool"`` reply.

    Every outcome -- success, unparseable arguments, unknown tool name, or an
    exception raised by the tool -- is converted into a tool message, so the
    history receives exactly one reply for ``tool_call.id``.

    Args:
        tool_call: Tool-call object from the chat completion response; only
            ``id``, ``function.name`` and ``function.arguments`` are read.

    Returns:
        A dict with the keys ``role``, ``tool_call_id``, ``name`` and ``content``.
    """
    function_name = tool_call.function.name
    print(f"  Function: {function_name}")

    def tool_message(content):
        return {
            "role": "tool",
            "tool_call_id": tool_call.id,
            "name": function_name,
            "content": content,
        }

    function_args_str = tool_call.function.arguments
    try:
        function_args = json.loads(function_args_str)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) arguments payload, which json.loads rejects.
        print(f"  Error: Could not parse arguments for {function_name}: {function_args_str}")
        return tool_message(json.dumps({"error": f"Invalid arguments format: {function_args_str}"}))
    print(f"  Arguments: {function_args}")

    function_to_call = AVAILABLE_FUNCTIONS.get(function_name)
    if function_to_call is None:
        print(f"  Error: LLM requested unknown function '{function_name}'")
        return tool_message(json.dumps({"error": f"Function '{function_name}' not found."}))

    try:
        function_response_content = function_to_call(**function_args)
    except TypeError as te:
        # Typically wrong/missing keyword arguments, or arguments that are not a JSON object.
        print(f"  TypeError calling tool {function_name}: {te}. Check parameters.")
        return tool_message(
            json.dumps({"error": f"TypeError in {function_name}: {str(te)}. Args received: {function_args}"})
        )
    except Exception as e:
        # Report tool failures to the model rather than aborting the whole loop.
        print(f"  Error executing tool {function_name}: {e}")
        return tool_message(json.dumps({"error": f"Exception in {function_name}: {str(e)}"}))

    print(f"  Tool {function_name} executed. Response snippet: {str(function_response_content)[:100]}...")
    return tool_message(function_response_content)


def native_tool_loop(initial_query: str, max_turns: int = MAX_TURNS) -> str:
    """Answer ``initial_query`` by letting the model call tools until it replies in text.

    Each turn sends the full history plus ``tools_definition`` to the model.
    Requested tool calls are executed and their results appended as tool
    messages; the first response without tool calls ends the loop.

    Relies on the module-level ``client``, ``model_name``, ``tools_definition``,
    ``SYSTEM_PROMPT``, ``AVAILABLE_FUNCTIONS`` and ``make_messages``.

    Args:
        initial_query: The user's question.
        max_turns: Maximum number of model round-trips before giving up.

    Returns:
        The model's final text answer; or an explanatory string when the model
        call fails, the model returns neither tools nor content, or
        ``max_turns`` is exhausted (then the last non-empty assistant text, if any).
    """
    messages = make_messages(initial_query, SYSTEM_PROMPT)

    for i in range(max_turns):
        print(f"\n--- Turn {i + 1} ---")
        print("Messages being sent to LLM:")
        print(messages)
        for m in messages:
            if m.get('tool_calls'):
                print(f"  Tool Calls: {m['tool_calls']}")

        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                tools=tools_definition,
                tool_choice="auto", # Let the model decide when to call tools
                temperature=0,
            )
            response_message = response.choices[0].message
        except Exception as e:
            # The client may be Ollama or Azure OpenAI, so name the model rather than a vendor.
            print(f"Error calling LLM ({model_name}): {e}")
            return f"Error communicating with AI model: {str(e)}"

        # Construct the assistant message for history from the ChatCompletionMessage object
        assistant_message = {
            "role": "assistant",
            "content": response_message.content, # Usually None if tool_calls are present
        }
        if response_message.tool_calls:
            assistant_message["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": tc.type,
                    "function": {
                        "name": tc.function.name,
                        "arguments": tc.function.arguments
                    }
                } for tc in response_message.tool_calls
            ]
        messages.append(assistant_message)

        if not response_message.tool_calls:
            # No tool_calls, LLM should have provided a direct answer
            print("\nLLM provided a direct answer (no tool calls).")
            final_answer = response_message.content
            if final_answer:
                return final_answer
            # Should not happen if no tool_calls, but as a safeguard
            return "LLM did not request tools and did not provide a content answer."

        print("LLM requested tool calls:")
        for tool_call in response_message.tool_calls:
            messages.append(_execute_tool_call(tool_call))

    print("\nMax turns reached.")
    # Try to return the last textual response from the assistant if available.
    # History entries are plain dicts, so they must be read by key, not attribute.
    for msg in reversed(messages):
        if msg.get("role") == "assistant" and msg.get("content"):
            return msg["content"]
    return "Max turns reached, and a final textual answer was not formulated by the assistant."

In [45]:
# End-to-end demo: run the tool-calling loop on the Ensembl-ID question and print
# the string it returns (the per-turn trace is printed by the loop itself).
query = "Convert ENSG00000215251 to official gene symbol."
final_response = native_tool_loop(query)
print("\n--- Final Response from AI ---")
print(final_response)


--- Turn 1 ---
Messages being sent to LLM:
[{'role': 'system', 'content': "\nYou are a helpful AI assistant designed to retrieve biomedical information from NCBI using specific tools.\nUse the provided tools sequentially to achieve the user's goal.\nYour goal is to answer the user's question by calling a sequence of tools.\n\nWorkflow:\n1.  Use tools as needed to get the information you need.\n2.  After all tool calls are made and you have the information, provide a final answer to the user.\n\nRemember: \n- Once you make a tool call, the response will be fed back to you.\n- Once you do not need to make any more tool calls - you can output the final answer.\n- Plan the sequence of your tool calls appropriately.\n"}, {'role': 'user', 'content': 'Convert ENSG00000215251 to official gene symbol.'}]
LLM requested tool calls:
  Function: search_gene_id
  Arguments: {'query_id': 'ENSG00000215251'}
TOOL: Called search_gene_id with query_id: ENSG00000215251
TOOL: search_gene_id found UID: 604