In [3]:
import os
from benchmark_datasets.data_loader import load_informbench_benchmark_data
from benchmark.rag_agent import RAGAgent
from benchmark.vectordb import create_vector_db, create_custom_retriever_tool
from langchain_core.documents import Document
from langchain_core.messages import AIMessage
from dotenv import load_dotenv

# Directory that both the .env file and the benchmark data are resolved against.
THIS_FILE_PATH = "./"
_PROJECT_ROOT = os.path.dirname(THIS_FILE_PATH)

# Configuration: which trial to process and which ICF sections to generate.
TARGET_NCTID = "NCT02788201"
TARGET_SECTIONS = ["Purpose of Research", "Duration of Study Involvement"]
DATA_PATH = os.path.join(_PROJECT_ROOT, "benchmark_datasets/data")

# Load the .env file so the Azure OpenAI settings read in main()
# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT) are present in os.environ.
load_dotenv(os.path.join(_PROJECT_ROOT, ".env"))

def main():
    """Generate and print ICF sections for the configured clinical trial.

    Loads the benchmark data for ``TARGET_NCTID``, builds a vector database
    and retriever tool from the trial's ``"protocol"`` entry, asks a
    ``RAGAgent`` to generate ``TARGET_SECTIONS``, and prints the content of
    the last ``AIMessage`` found in the result. Progress is reported on
    stdout; nothing is returned.
    """
    # 1. Load the benchmark data, restricted to the trial of interest.
    print("Loading data...")
    trials = load_informbench_benchmark_data(
        data_path=DATA_PATH,
        target_nctids=[TARGET_NCTID],
        debug=True,
    )

    # 2. Pick the first record whose id matches; bail out when none does.
    selected = next((t for t in trials if t["nctid"] == TARGET_NCTID), None)
    if selected is None:
        print(f"Trial {TARGET_NCTID} not found in the data")
        return

    # 3-4. Index the protocol documents and wrap the index as a retriever tool.
    print("Creating vector database...")
    store = create_vector_db(selected["protocol"])
    tool = create_custom_retriever_tool(store)

    # 5. Build the agent from the Azure OpenAI settings in the environment.
    print("Initializing RAG agent...")
    azure_key = os.environ.get("AZURE_OPENAI_API_KEY")
    azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
    agent = RAGAgent(
        api_type="azure",
        api_key=azure_key,
        model_name="gpt-4o-mini",
        endpoint=azure_endpoint,
    )

    # 6. Run the generation for the requested sections.
    print(f"Generating ICF content for sections: {TARGET_SECTIONS}")
    result = agent.generate(
        input_query=f"Generate ICF sections for trial {TARGET_NCTID}",
        retriever_tool=tool,
        target_icf_sections=TARGET_SECTIONS,
    )

    # 7. Report the outcome.
    print("\n--- GENERATED ICF CONTENT ---\n")

    if "messages" not in result:
        # Dump the raw result so an unexpected shape can be inspected.
        print("Unexpected result format")
        print(result)
        return

    # The generated text is taken from the last AI message in the result.
    ai_replies = list(
        filter(lambda msg: isinstance(msg, AIMessage), result["messages"])
    )
    print(
        ai_replies[-1].content
        if ai_replies
        else "No AI-generated content found in the result"
    )

In [4]:
# Run the ICF generation pipeline; its progress and generated text are printed below.
main()

Loading data...
Creating vector database...
Initializing RAG agent...
Generating ICF content for sections: ['Purpose of Research', 'Duration of Study Involvement']

--- GENERATED ICF CONTENT ---

Generated ICF Sections:

## Purpose of Research

### Purpose of Research

The purpose of this research study is to evaluate a new approach for choosing treatments for patients with advanced urothelial carcinoma, which is a type of bladder cancer that has not responded to previous chemotherapy. The study will use a method called the COXEN model to help doctors select the most suitable “next therapy” for each patient based on their individual tumor data. 

The COXEN model analyzes information from tumor biopsies to predict how different FDA-approved cancer drugs might work on a specific patient's cancer. This could provide doctors with valuable information about which treatments are likely to be most effective for each individual patient, potentially leading to better treatment outcomes[[citatio