In [1]:
import os
from benchmark_datasets.data_loader import load_informbench_benchmark_data
from benchmark.informgen.agent import InformGenAgent
from benchmark.vectordb import create_vector_db, create_custom_retriever_tool
from langchain_core.documents import Document
from langchain_core.messages import AIMessage
from dotenv import load_dotenv
import pandas as pd

# Base directory against which the .env file, the benchmark data folder and the
# ICF template are resolved. "./" makes every path below relative to the
# current working directory of the kernel.
THIS_FILE_PATH = "./"
_PROJECT_ROOT = os.path.dirname(THIS_FILE_PATH)

# Run configuration: which trial to process and which ICF sections to draft.
TARGET_NCTID = "NCT02788201"
TARGET_SECTIONS = [
    "Purpose of Research",
    "Duration of Study Involvement",
    "Procedures",
    "Possible Risks, Discomforts, and Inconveniences",
]

# Input locations (benchmark data and the ICF template).
DATA_PATH = os.path.join(_PROJECT_ROOT, "benchmark_datasets/data")
TEMPLATE_PATH = os.path.join(_PROJECT_ROOT, "benchmark/templates/stanford.json")

# Load the environment variables used for calling Azure OpenAI from the .env file.
load_dotenv(os.path.join(_PROJECT_ROOT, ".env"))

def _find_trial(trials, nctid):
    """Return the first item of *trials* whose ``"nctid"`` equals *nctid*, else ``None``."""
    return next((trial for trial in trials if trial["nctid"] == nctid), None)


def _build_mock_soa_table():
    """Build the placeholder Schedule of Assessments (SOA) table used by this demo.

    Returns:
        pandas.DataFrame: one row per visit; an ``'X'`` marks that the
        procedure named by the column is performed at that visit, an empty
        string that it is not. The values are hand-written test data, not
        extracted from the protocol.
    """
    return pd.DataFrame({
        'Visit': ['Screening', 'Baseline', 'Week 4', 'Week 8', 'Week 12', 'Follow-up'],
        'Blood Draw': ['X', 'X', 'X', 'X', 'X', ''],
        'Physical Exam': ['X', 'X', '', 'X', '', 'X'],
        'MRI': ['', 'X', '', '', 'X', ''],
        'ECG': ['X', 'X', '', '', 'X', ''],
        'Drug Administration': ['', 'X', 'X', 'X', 'X', ''],
    })


def _build_mock_procedure_risk_pairs():
    """Return the placeholder mapping of procedure name -> list of risk descriptions."""
    return {
        "Blood Draw": ["Pain at injection site", "Bruising", "Dizziness"],
        "MRI": ["Claustrophobia", "Loud noise", "Discomfort from lying still"],
        "Drug Administration": ["Allergic reaction", "Headache", "Nausea"],
        "Physical Exam": ["Discomfort", "Temporary pain"],
    }


def _print_generation_result(result):
    """Print the generated ICF text and any warnings contained in *result*.

    Args:
        result: value returned by the agent's ``generate`` call. It must
            support ``in`` and ``[]`` lookups for the ``"messages"`` and
            ``"warnings"`` keys; both keys are optional.
    """
    print("\n--- GENERATED ICF CONTENT ---\n")

    if "messages" in result:
        # The last AI message is the one treated as the generated content.
        ai_messages = [m for m in result["messages"] if isinstance(m, AIMessage)]
        if ai_messages:
            print(ai_messages[-1].content)
        else:
            print("No AI-generated content found in the result")
    else:
        print("Unexpected result format")
        print(result)

    # Only emit the warnings section when there is something to report, so an
    # empty (or None) "warnings" entry does not produce a bare header or a crash.
    warnings = result["warnings"] if "warnings" in result else None
    if warnings:
        print("\n--- WARNINGS ---\n")
        for warning in warnings:
            print(f"- {warning}")


def main():
    """Run the InformGen demo end to end for ``TARGET_NCTID`` and print the output.

    Steps: load the benchmark data, pick the target trial, build a vector
    database and retriever tool over its protocol documents, create mock SOA /
    procedure-risk inputs, run the agent for ``TARGET_SECTIONS`` and print the
    generated text plus any warnings.

    Returns:
        None. If the target trial is not present in the loaded data, a message
        is printed and the function returns early without calling the agent.
    """
    # 1. Load the data
    print("Loading data...")
    trials = load_informbench_benchmark_data(
        data_path=DATA_PATH,
        target_nctids=[TARGET_NCTID],
        debug=True,
    )

    # 2. Find the target trial
    target_trial = _find_trial(trials, TARGET_NCTID)
    if target_trial is None:
        print(f"Trial {TARGET_NCTID} not found in the data")
        return

    # 3. Extract protocol documents
    protocol_docs = target_trial["protocol"]

    # 4. Create vector database and custom retriever tool
    print("Creating vector database...")
    vectordb = create_vector_db(protocol_docs)
    retriever_tool = create_custom_retriever_tool(vectordb)

    # 5. Create mock SOA table and procedure-risk pairs for testing
    print("Creating mock SOA table and procedure-risk pairs...")
    soa_table = _build_mock_soa_table()
    procedure_risk_pairs = _build_mock_procedure_risk_pairs()

    # 6. Initialize the InformGen agent (credentials come from the environment)
    print("Initializing InformGen agent...")
    informgen_agent = InformGenAgent(
        template_path=TEMPLATE_PATH,
        api_type="azure",
        api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
        model_name="gpt-4o-mini",
        endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    )

    # 7. Generate ICF content
    print(f"Generating ICF content for sections: {TARGET_SECTIONS}")
    result = informgen_agent.generate(
        input_query=f"Generate ICF sections for clinical trial {TARGET_NCTID}",
        target_sections=TARGET_SECTIONS,
        protocol_docs=protocol_docs,
        retriever_tool=retriever_tool,
        soa_table=soa_table,
        procedure_risk_pairs=procedure_risk_pairs,
    )

    # 8-9. Print the generated content and any warnings
    _print_generation_result(result)

In [2]:
# Execute the end-to-end demo defined above; progress messages and the
# generated ICF text are printed to the cell output.
main()

Loading data...
Creating vector database...
Creating mock SOA table and procedure-risk pairs...
Initializing InformGen agent...
Generating ICF content for sections: ['Purpose of Research', 'Duration of Study Involvement', 'Procedures', 'Possible Risks, Discomforts, and Inconveniences']

--- GENERATED ICF CONTENT ---

Generated ICF Sections:

## Purpose of Research

You are invited to participate in a research study about using the COXEN score to choose the best next treatment for patients with metastatic bladder cancer who have not responded to previous treatments. We hope to learn whether this approach can help improve treatment outcomes compared to standard options. You were selected as a potential participant because you have a diagnosis of metastatic bladder cancer and have experienced progression despite prior chemotherapy.

This research study is looking for approximately 120 participants with metastatic bladder cancer. Enrollment will occur at sites throughout the United States,