In [23]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from token import OP
from langchain_huggingface import (
    HuggingFaceEndpointEmbeddings,
    HuggingFaceEndpoint,
    ChatHuggingFace
)
from typing import Optional


def get_hf_llm(model_name: str, model_kwargs: Optional[dict]) -> ChatHuggingFace:

    if not model_kwargs:
        model_kwargs = {
            "max_new_tokens": 1024,
        }
    model = HuggingFaceEndpoint(
        model=model_name,
        **model_kwargs
    )
    return ChatHuggingFace(llm=model)

# test 
llm = get_hf_llm("meta-llama/Llama-3.1-8B-Instruct", model_kwargs={})
llm.invoke("Hi").content

"It's nice to meet you. Is there something I can help you with or would you like to chat?"

### Initialize the models to use

**We need to define a configuration dictionary to hold the clients for each of our chosen models. This way we can easily swap models and centralizes our model management.**

In [8]:
# This dictionary will act as our central registry, or "foundry," for all LLM and embedding model clients.
llm_config = {
    # # For the 'planner', we use Llama 3.1 8B. It's a modern, 
    # highly capable model that excels at instruction-following.
    "planner": get_hf_llm(
        model_name="meta-llama/Llama-3.1-8B-Instruct",
        model_kwargs={"temperature": 0.0}
    ),
    # For the 'drafter' and 'sql_coder', we use Qwen2 7B. It's a nimble and fast model, perfect for
    # tasks like text generation and code completion where speed is valuable.
    "drafter": get_hf_llm(
        model_name="Qwen/Qwen2.5-7B-Instruct",
        model_kwargs={"temperature": 0.2}
    ),
    "sql_coder": get_hf_llm(
        model_name="Qwen/Qwen2.5-7B-Instruct",
        model_kwargs={"temperature": 0.0}
    ),
    # For the 'director', the highest-level strategic agent, we use the powerful Llama 3 70B model.
    # This high-stakes task of diagnosing performance and evolving the system's own procedures
    # justifies the use of a larger, more powerful model.
    "director": get_hf_llm(
        model_name="meta-llama/Meta-Llama-3-70B-Instruct",
        model_kwargs={"temperature": 0.0}
    ),
    # For embeddings, we use 'Qwen/Qwen3-Embedding-8B",', a top-tier, efficient open-source model.
    "embedding_model": HuggingFaceEndpointEmbeddings(
        model="Qwen/Qwen3-Embedding-8B",
        task="feature-extraction",
    )
}

In [None]:
# test embedding model
query = "Hey that's a great tutorial."
llm_config["embedding_model"].embed_query(query)

So we have just created our llm_config dictionary, which serves as a centralized hub for all our model initializations. By assigning different models to different roles, we are creating a cost-performance optimized hierarchy.

- **Fast & Nimble (7B-8B models)**: The planner, drafter, and sql_coder roles handle frequent, well-defined tasks. Using smaller models like Qwen2.5 7B and Llama 3.1 8B for these roles ensures low latency and efficient resource usage. They are perfectly capable of following instructions to generate plans, draft text, or write SQL.

- **Deep & Strategic (70B model)**: The director agent has the most complex job, it must analyze multi-dimensional performance data and rewrite the entire system operating procedure. This requires deep reasoning and a understanding of cause and effect. For this high-stakes, low-frequency task, we allocate our most powerful resource, the Llama 3 70B model.

In [21]:
# Print the configuration to confirm the clients are initialized and their parameters are set correctly.
print("LLM clients configured:")
print(f"Planner ({llm_config['planner'].model_id}): {llm_config['planner']}")
print(f"Drafter ({llm_config['drafter'].model_id}): {llm_config['drafter']}")
print(f"SQL Coder ({llm_config['sql_coder'].model_id}): {llm_config['sql_coder']}")
print(f"Director ({llm_config['director'].model_id}): {llm_config['director']}")
print(f"Embedding Model ({llm_config['embedding_model'].model}): {llm_config['embedding_model']}")

LLM clients configured:
Planner (meta-llama/Llama-3.1-8B-Instruct): llm=HuggingFaceEndpoint(temperature=0.0, stop_sequences=[], server_kwargs={}, model_kwargs={}, model='meta-llama/Llama-3.1-8B-Instruct', client=<InferenceClient(model='meta-llama/Llama-3.1-8B-Instruct', timeout=120)>, async_client=<InferenceClient(model='meta-llama/Llama-3.1-8B-Instruct', timeout=120)>) model_id='meta-llama/Llama-3.1-8B-Instruct' top_p=0.95 max_tokens=512 model_kwargs={}
Drafter (Qwen/Qwen2.5-7B-Instruct): llm=HuggingFaceEndpoint(temperature=0.2, stop_sequences=[], server_kwargs={}, model_kwargs={}, model='Qwen/Qwen2.5-7B-Instruct', client=<InferenceClient(model='Qwen/Qwen2.5-7B-Instruct', timeout=120)>, async_client=<InferenceClient(model='Qwen/Qwen2.5-7B-Instruct', timeout=120)>) model_id='Qwen/Qwen2.5-7B-Instruct' temperature=0.2 top_p=0.95 max_tokens=512 model_kwargs={}
SQL Coder (Qwen/Qwen2.5-7B-Instruct): llm=HuggingFaceEndpoint(temperature=0.0, stop_sequences=[], server_kwargs={}, model_kwargs={

### Preparing the Knowledge Stores

The most important part for a RAG pipeline is a rich multi-modal knowledge base to draw upon. A generic, web-based search is not enough for a specialized task like clinical trial design. We need to ground our agents in authoritative, domain-specific information.

In [22]:
import os

# A dictionary to hold the paths for our different data types. This keeps our file management clean and centralized.
data_paths = {
    "base": "./data",
    "pubmed": "./data/pubmed_articles",
    "fda": "./data/fda_guidelines",
    "ethics": "./data/ethical_guidelines",
    "mimic": "./data/mimic_db"
}
# This loop iterates through our defined paths and uses os.makedirs() to create any directories that don't already exist.
# This prevents errors in later steps when we try to save files to these locations.
for path in data_paths.values():
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Created directory: {path}")

Created directory: ./data
Created directory: ./data/pubmed_articles
Created directory: ./data/fda_guidelines
Created directory: ./data/ethical_guidelines
Created directory: ./data/mimic_db


Download Pubmed articles for our medical researcher

In [24]:
from Bio import Entrez
from Bio import Medline

def download_pubmed_articles(query, max_articles=20):
    """Fetches abstracts from PubMed for a given query and saves them as text files."""
    # The NCBI API requires an email address for identification. We fetch it from our environment variables.
    Entrez.email = os.environ.get("ENTREZ_EMAIL")
    print(f"Fetching PubMed articles for query: {query}")
    
    # Step 1: Use Entrez.esearch to find the PubMed IDs (PMIDs) for articles matching our query.
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_articles, sort="relevance")
    record = Entrez.read(handle)
    id_list = record["IdList"]
    print(f"Found {len(id_list)} article IDs.")
    
    print("Downloading articles...")
    # Step 2: Use Entrez.efetch to retrieve the full records (in MEDLINE format) for the list of PMIDs.
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    
    count = 0
    # Step 3: Iterate through the retrieved records, parse them, and save each abstract to a file.
    for i, record in enumerate(records):
        pmid = record.get("PMID", "")
        title = record.get("TI", "No Title")
        abstract = record.get("AB", "No Abstract")
        if pmid:
            # We name the file after the PMID for easy reference and to avoid duplicates.
            filepath = os.path.join(data_paths["pubmed"], f"{pmid}.txt")
            with open(filepath, "w") as f:
                f.write(f"Title: {title}\n\nAbstract: {abstract}")
            print(f"[{i+1}/{len(id_list)}] Fetching PMID: {pmid}... Saved to {filepath}")
            count += 1
    return count

In [25]:
# We define a specific, boolean query to find articles highly relevant to our trial concept.
pubmed_query = "(SGLT2 inhibitor) AND (type 2 diabetes) AND (renal impairment)"
num_downloaded = download_pubmed_articles(pubmed_query)
print(f"PubMed download complete. {num_downloaded} articles saved.")

Fetching PubMed articles for query: (SGLT2 inhibitor) AND (type 2 diabetes) AND (renal impairment)
Found 20 article IDs.
Downloading articles...
[1/20] Fetching PMID: 36945734... Saved to ./data/pubmed_articles/36945734.txt
[2/20] Fetching PMID: 40470996... Saved to ./data/pubmed_articles/40470996.txt
[3/20] Fetching PMID: 38914124... Saved to ./data/pubmed_articles/38914124.txt
[4/20] Fetching PMID: 30697905... Saved to ./data/pubmed_articles/30697905.txt
[5/20] Fetching PMID: 36335326... Saved to ./data/pubmed_articles/36335326.txt
[6/20] Fetching PMID: 36351458... Saved to ./data/pubmed_articles/36351458.txt
[7/20] Fetching PMID: 40327845... Saved to ./data/pubmed_articles/40327845.txt
[8/20] Fetching PMID: 35113333... Saved to ./data/pubmed_articles/35113333.txt
[9/20] Fetching PMID: 34619106... Saved to ./data/pubmed_articles/34619106.txt
[10/20] Fetching PMID: 33413348... Saved to ./data/pubmed_articles/33413348.txt
[11/20] Fetching PMID: 34272327... Saved to ./data/pubmed_articl

Now, let’s get the regulatory documents that our Regulatory Specialist agent will need. A key part of trial design is ensuring compliance with government guidelines.

In [82]:
from os import read
import requests
from pypdf import PdfReader
import io

def download_and_extract_text_from_pdf(url, output_path, download=False):
    """Downloads a PDF from a URL, saves it, and also extracts its text content to a separate .txt file."""
    print(f"Downloading FDA Guideline: {url}")
    try:
        # We use the 'requests' library to perform the HTTP GET request to download the file.
        if download:
            response = requests.get(url)
            response.raise_for_status() # This is a good practice that will raise an error if the download fails (e.g., a 404 error).
        
            # We save the raw PDF file, which is useful for archival purposes.
            with open(output_path, 'wb') as f:
                f.write(response.content)
            print(f"Successfully downloaded and saved to {output_path}")
        
            # We then use pypdf to read the PDF content directly from the in-memory response.
            reader = PdfReader(io.BytesIO(response.content))
        else:
            reader = PdfReader(output_path)
        text = ""
        # We loop through each page of the PDF and append its extracted text.
        for page in reader.pages:
            text += page.extract_text() + "\n\n"
        
        # Finally, we save the clean, extracted text to a .txt file. This is the file our RAG system will actually use.
        txt_output_path = os.path.splitext(output_path)[0] + '.txt'
        with open(txt_output_path, 'w') as f:
            f.write(text)
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return False

This function, download_and_extract_text_from_pdf, is our tool for handling PDF documents. It's a two-stage process.

- First, it downloads and saves the original PDF from the FDA website. Second, and more importantly, it immediately processes that PDF using pypdf to extract all the text content.

- It then saves this raw text to a .txt file. This pre-processing step is crucial because it converts the complex PDF format into simple text that our document loaders can easily ingest when we build our vector stores later on.

Let’s run the function to download our FDA guidance document.

In [None]:
# This URL points to a real FDA guidance document for developing drugs for diabetes.
fda_url = "https://www.fda.gov/media/71185/download"
fda_pdf_path = os.path.join(data_paths["fda"], "fda_diabetes_guidance_ocr.pdf")
download_and_extract_text_from_pdf(fda_url, fda_pdf_path)

Downloading FDA Guideline: https://www.fda.gov/media/71185/download


True