Clinical Trial Matchmaker Agent

In [1]:
from agents import Agent, WebSearchTool, trace, Runner, gen_trace_id, function_tool
from agents.model_settings import ModelSettings
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import asyncio
import sendgrid
import os
from sendgrid.helpers.mail import Mail, Email, To, Content
from typing import Dict
from IPython.display import display, Markdown

In [2]:
import requests
import json

In [3]:
load_dotenv(override=True)

True

In [None]:
# Goal: To match a patient's medical profile with a list of active clinical trials they may be eligible for.

In [None]:
# Agent 1 (Intake Agent): This agent interacts with the user (patient or doctor) to gather key information, such as disease name, stage, current treatments, and location.

In [14]:
class KeyFacts(BaseModel):
    condition: str = Field(description="Conditions or disease the person has (for example, breast cancer or high blood pressure).")
    intervention: str = Field(description="Intervention / Treatment (for example, radiation therapy or low fat diet).")
    age: int = Field(description="Age of the person in years.")
    gender: str = Field(description="Gender of the person (MALE or FEMALE).")
    location: str = Field(description="The location where the trial should be (search by address, city, state, or country).")


INSTRUCTIONS = """You are a medical assistant. Given a free text, you extract the key facts and fill them into the schema.
Return a json only - no explanations, no comments. The schema: condition, intervention, age, gender, location.
The output will be used to search clinicaltrials.gov via their API. 
Use Boolean operators (AND / OR / NOT) and Grouping operators ( () ) if necessary.
The location must be an address, city, state, or country.
The output must be in English, regardless of the input language.""" 

search_planner = Agent(
    name="Query builder helper",
    instructions=INSTRUCTIONS,
    model="gpt-4o-mini",
    output_type=KeyFacts
)


test:

In [None]:
#message = "55 year old man with nsclc and currently treated with Osimertinib" # -> condition='non-small cell lung cancer (NSCLC)' intervention='Osimertinib' age=55 gender='MALE'
message = "I have a 55-year-old female with metastatic triple-negative breast cancer. She has progressed after first-line carboplatin/paclitaxel and immunotherapy with pembrolizumab. Looking for trials with novel antibody-drug conjugates or PARP inhibitors, preferably in the Northeast US."

result = await Runner.run(search_planner, message)

In [40]:
scheme = result.final_output
print(scheme)

condition='metastatic triple-negative breast cancer' intervention='novel antibody-drug conjugates OR PARP inhibitors' age=55 gender='FEMALE' location='Northeast US'


In [41]:
# test look for trials:
trials = look_for_trials(query_codition=scheme.condition, gender=scheme.gender, age=scheme.age, query_intervention=scheme.intervention, query_location="")

(AREA[MinimumAge]RANGE[MIN, 55 years] AND AREA[MaximumAge]RANGE[55 years, MAX]) OR AREA[MinimumAge]RANGE[MIN, 55 years] AND AREA[Sex](FEMALE OR ALL)


In [None]:
trials

In [None]:
# Agent 2 (Search Agent): Once the Intake Agent has a de-identified, structured query, it performs a handoff to this agent. The Search Agent uses a tool to search a clinical trial database like ClinicalTrials.gov.

In [None]:
# Agent 3 (Analysis Agent): The Search Agent hands off its results to the Analysis Agent. This agent analyzes the eligibility criteria of the retrieved trials and compares them to the patient's profile to create a ranked list of potential matches.

In [None]:
# Output: The final output is a list of clinical trials with brief summaries of each and a note on why the patient may be a good fit, all presented in a clear, easy-to-read format.

In [None]:
""" imporvements:
Loction Filer:
    You can further refine this by letting the user input their specific city or zip code and then using a tool to calculate the distance to the geoPoint coordinates of each trial location.

"""

' imporvements:\nLoction Filer:\n    You can further refine this by letting the user input their specific city or zip code and then using a tool to calculate the distance to the geoPoint coordinates of each trial location.\n\nAge filter:\n'

In [16]:
def build_advanced_filters(age = None, sex = None):
    # Build the advanced filter string
    advanced_filters = []

    if age is not None:
        advanced_filters.append(f"(AREA[MinimumAge]RANGE[MIN, {age} years] AND AREA[MaximumAge]RANGE[{age} years, MAX]) OR AREA[MinimumAge]RANGE[MIN, {age} years]")
    
    if sex and sex.upper() in ["MALE", "FEMALE"]:
        advanced_filters.append(f"AREA[Sex]({sex.upper()} OR ALL)")

    return advanced_filters

In [51]:
def look_for_trials(
    query_codition: str = "",
    query_intervention: str = "",
    age = None,
    gender: str = "ALL",
    query_location: str = "Germany",
    overall_status: str = "RECRUITING"
):
    """ 
    query clinicaltrials.gov API to get a json af available trials
    query_condition: "Conditions or disease" query
    query_intervention: "Intervention / treatment" query
    overall_status:  ACTIVE_NOT_RECRUITING ┃ COMPLETED ┃ ENROLLING_BY_INVITATION ┃ NOT_YET_RECRUITING ┃ RECRUITING ┃ SUSPENDED ┃ TERMINATED ┃ WITHDRAWN ┃ AVAILABLE ┃ NO_LONGER_AVAILABLE ┃ TEMPORARILY_NOT_AVAILABLE ┃ APPROVED_FOR_MARKETING ┃ WITHHELD ┃ UNKNOWN
    query_location: "Location terms" query

    """
    # The search query
    query_params = {
        "query.titles": "",
        "query.term": "",
        "query.cond": query_codition,
        "query.intr": query_intervention,
        "query.locn": query_location,
        "filter.overallStatus": overall_status,
        #"filter.geo": distance(latitude,longitude,distance), #Examples: distance(39.0035707,-77.1013313,50mi)
        "pageSize": 10,
        "fields": "NCTId,BriefTitle,OfficialTitle,OverallStatus,BriefSummary,EligibilityModule,ContactsLocationsModule", #Phases?
        "sort": "@relevance",
        "countTotal": "true",
    }

    # Build the advanced filter string
    advanced_filters = build_advanced_filters(age, gender)#, gender)

    if advanced_filters:
        query_params["filter.advanced"] = " AND ".join(advanced_filters)
        print(query_params["filter.advanced"])

    # The API endpoint
    url = "https://clinicaltrials.gov/api/v2/studies"

    data = {}

    try:
        # Send the GET request
        response = requests.get(url, params=query_params)
        response.raise_for_status()

        data = response.json()

    except Exception as e:
        print(f"An error occurred: {e}")

    return data if data else {}

test:

In [21]:
# test look for trials:
trials = look_for_trials(query_codition="nsclc", gender="MALE", age=50, query_intervention="Osimertinib", query_location="Essen")

(AREA[MinimumAge]RANGE[MIN, 50 years] AND AREA[MaximumAge]RANGE[50 years, MAX]) OR AREA[MinimumAge]RANGE[MIN, 50 years] AND AREA[Sex](MALE OR ALL)


In [None]:
trials

In [34]:
def create_paylod(trials):
    lean_trials_data = []
    for trial in trials.get("studies", []):
        ps = trial.get("protocolSection", {})
        idm = ps.get("identificationModule", {})
        stm = ps.get("statusModule", {})
        dsm = ps.get("descriptionModule", {})
        elm = ps.get("eligibilityModule", {})

        record = {
            "nctId": idm.get("nctId"),
            "briefTitle": idm.get("briefTitle"),
            "officialTitle": idm.get("officialTitle"),

            "status": stm.get("overallStatus"),

            "summary": dsm.get("briefSummary"),

            "eligibilityCriteria": elm.get("eligibilityCriteria"),
            "age": {"min": elm.get("minimumAge", "None"), "max": elm.get("maximumAge", "None")},
            "sex": elm.get("sex")

        }

        lean_trials_data.append(record)

    return lean_trials_data

test

In [43]:
lean = create_paylod(trials)

In [44]:
lean

[{'nctId': 'NCT04837209',
  'briefTitle': 'Radiation, Immunotherapy and PARP Inhibitor in Triple Negative Breast Cancer',
  'officialTitle': 'A Phase II Study of NirAparib, Dostarlimab and Radiotherapy in Metastatic, PD-L1 Negative or Immunotherapy-Refractory Triple-Negative Breast Cancer (NADiR)',
  'status': 'RECRUITING',
  'summary': 'This research study is looking to see whether the combination of Dostarlimab and Niraparib plus Radiation Therapy (RT) is safe and effective in participants with metastatic triple negative breast cancer.\n\nThe names of the study treatment involved in this study are:\n\n* Dostarlimab\n* Niraparib\n* Radiation Therapy (RT), which is given per standard of care.',
  'eligibilityCriteria': "Inclusion Criteria:\n\n* Age ≥ 18 years.\n* ECOG performance status ≤ 1\n* Histologically or cytologically-confirmed TNBC (ER \\<1%, PR \\<1%, HER-2-neu 0-1+ by IHC or non-FISH-amplified63. ER-low, PR-low (defined as ER and/or PR 1-10%) and HER2-negative patients may al

In [37]:
class Trial_Format(BaseModel):
    rank: int = Field(description="The rank of the trial.")
    nctId: str = Field(description="The nctId of the trial")
    reason: str = Field("The reason for the decision")

class Rank_Format(BaseModel):
    ranks: list[Trial_Format] = Field(description="Ranked list of the trials")


INSTRUCTIONS = """You are a medical assistant. Given a free text description of a patient and a list of clinical trials, evaluate if the patient is an appropriate fit for the trials.
Go through the trials one by one and give a short reason why the patient might be a fit or why not. Then rank the trials in order, starting with the one that best fits the patient.
Return only the rank, the nctId and the reason for each trial:
1. nctId, reason
2. nctId, reason
..."""

ranker = Agent(
    name="Trial ranker",
    instructions=INSTRUCTIONS,
    model="gpt-4o-mini",
    output_type=Rank_Format
)

test:

In [45]:
message_desc =  message #"55 year old man with nsclc and currently treated with Osimertinib"

json_trial_payload = create_paylod(trials)

message = f"Description of the patient: {message_desc};\n\nTrials: {json_trial_payload}"

ranks = await Runner.run(ranker, message)

output = ranks.final_output
print(output)

ranks=[Trial_Format(rank=1, nctId='NCT04837209', reason='This trial is specifically focused on metastatic triple-negative breast cancer and includes patients who have progressed after immunotherapy, making it a strong fit for the patient. It also includes a PARP inhibitor (Niraparib) which the patient is seeking.'), Trial_Format(rank=2, nctId='NCT05933265', reason="While this trial addresses advanced solid tumors including triple-negative breast cancer, it does not specifically mention the use of a PARP inhibitor or antibody-drug conjugates, which are the primary interests of the patient. However, the patient's eligibility based on prior therapies fits this study.")]


In [None]:
# nctId -> output of the most relevant information
# nctId, locations, ,brief title, reason, contact info?

In [46]:
from geopy.distance import geodesic

def calculate_distance_to_point(point_a, point_constant):

    """the ouput of the 2 print statements looks like this:
        {'lat': 33.70918, 'lon': -117.95367}
        (33.70918, -117.95367)
        {'lat': 33.80307, 'lon': -118.07256}
        (33.80307, -118.07256)
        {'lat': 34.05223, 'lon': -118.24368}
        (34.05223, -118.24368)
    """
    #print(point_a)
    try:
        point_coords = (point_a["lat"], point_a["lon"])
        #print(point_coords)

        distance_km = geodesic(point_coords, point_constant).km
    except Exception as e:
        #print(f"Error when trying to cal the distance or extracting the point: {e}")
        distance_km = 999_999

    return distance_km
    

test:

In [91]:
# Define the two geopoints as tuples (latitude, longitude)
point_a ={"lat": 40, "lon": -74}  # New York City
point_b = (34.0522, -118.2437) # Los Angeles
print(calculate_distance_to_point(point_a, point_b))

3950.7715935815577


In [None]:
def find_trial(trials: dict, id: str) -> dict | None:
    target = id.strip().upper()
    for trial in trials.get("studies", []):
        ps = trial.get("protocolSection", {})
        idm = ps.get("identificationModule", {})

        nctId = idm.get("nctId").strip().upper()

        if nctId == target:
            return trial
    return None


def find_locations(trials: dict, id: str):
    trial_by_id = find_trial(trials, id)

    ps = trial_by_id.get("protocolSection", {})
    clm = ps.get("contactsLocationsModule")
    loc = clm.get("locations")

    locations = []

    for i in loc:
        location_dict = {}
        location_dict["facility"] = i.get("facility")
        location_dict["geoPoint"] = i.get("geoPoint")

        locations.append(location_dict)

    #print(locations)
    try:
        point = (51.4641, 6.8771) #choose a point lat | lon of your choice!!!
        sorted_locations = sorted(locations, key=lambda loc: calculate_distance_to_point(loc["geoPoint"], point))
    except Exception as e:
        #print(f"Error when calculatin distance: {e}")
        sorted_locations = locations

    return sorted_locations


test

In [48]:
def get_clean_trial(target_id: str, json_trial_payload):
    for trial in json_trial_payload:
        id = trial.get("nctId").strip().upper()
        if id == target_id.strip().upper():
            return(trial)
    return None


def format_output(trials: dict, target_id: str, clean_trials_payload: dict):
    trial = get_clean_trial(target_id, clean_trials_payload)

    trial_info = ""

    trial_info += f"nctId: {trial.get("nctId")}\n"
    trial_info += f"Title: {trial.get("briefTitle")}\n"
    trial_info += f"Status: {trial.get("status")}\n"

    locations = find_locations(trials, target_id)
    #print(locations)


    trial_info += f"Locations: "
    for l in locations[:5]:
        facility = l.get("facility")
        trial_info += f"{facility}, "

    print(trial_info)

test:

In [95]:
format_output(trials, "NCT05281406", json_trial_payload)

[{'facility': 'Charité Universitätsmedizin Berlin Campus Virchow Klinikum Klinik mit Schwerpunkt Infektiologie und Pneumologie', 'geoPoint': {'lat': 52.52437, 'lon': 13.41053}}, {'facility': 'Universitätsklinik Köln, Lung Cancer Group Cologne - Innere Medizin I', 'geoPoint': {'lat': 50.93333, 'lon': 6.95}}, {'facility': 'Technische Universität Dresden Medizinische Fakultät Carl Gustav Carus Medizinische Klinik und Poliklinik I', 'geoPoint': {'lat': 51.05089, 'lon': 13.73832}}, {'facility': 'Universitätsklinikum Essen, Westdeutsches Tumorzentrum - Innere Klinik', 'geoPoint': {'lat': 51.45657, 'lon': 7.01228}}, {'facility': 'University Hospital Frankfurt', 'geoPoint': {'lat': 49.68333, 'lon': 10.53333}}, {'facility': 'Asklepios Lungenklinik Gauting', 'geoPoint': {'lat': 48.06919, 'lon': 11.37703}}, {'facility': 'MVZ II der Niels Stensen Kliniken; Franziskus Hospital Harderberg', 'geoPoint': {'lat': 52.20296, 'lon': 8.0448}}, {'facility': 'Universitätsmedizin Göttingen, Klinik für Hämatol

In [118]:
message_desc = "55 year old man with nsclc and currently treated with Osimertinib"
message_desc = "40 year old male with diabets typ 2 and insulin"
print(message_desc)

result = await Runner.run(search_planner, message_desc)

facts = result.final_output
print(facts)

trials = look_for_trials(query_codition=facts.condition, query_intervention=facts.intervention, age=facts.age, gender=facts.gender)

count = trials.get("totalCount")
print(count)

if count <= 10 and count > 0:

    json_trial_payload = create_paylod(trials)

    message = f"Description of the patient: {message_desc};\n\nTrials: {json_trial_payload}"

    ranks = await Runner.run(ranker, message)

    output = ranks.final_output
    print(output)

40 year old male with diabets typ 2 and insulin
condition='Type 2 diabetes' intervention='Insulin' age=40 gender='MALE'
(AREA[MinimumAge]RANGE[MIN, 40 years] AND AREA[MaximumAge]RANGE[40 years, MAX]) OR AREA[MinimumAge]RANGE[MIN, 40 years] AND AREA[Sex](MALE OR ALL)
7
ranks=[Trial_Format(rank=1, nctId='NCT07068295', reason="The patient is a 40 year old male with Type 2 Diabetes and currently using insulin, fitting well within the trial's inclusion criteria for male participants aged 18-69 and treated with insulin."), Trial_Format(rank=2, nctId='NCT07146347', reason="The patient is a male diagnosed with Type 2 Diabetes, which aligns with the trial's inclusion criteria for diabetes patients aged 18-70 with a stable insulin regimen."), Trial_Format(rank=3, nctId='NCT06982859', reason='The patient has Type 2 Diabetes for at least 6 months and is on insulin treatment, meeting essential criteria for the trial.'), Trial_Format(rank=4, nctId='NCT05654805', reason='The patient has Type 2 Diabet

In [50]:
for t in output.ranks:
    print(f"Rank: {t.rank}\n")
    format_output(trials, t.nctId, json_trial_payload)
    print(f"Reason: {t.reason}\n\n")

Rank: 1

nctId: NCT04837209
Title: Radiation, Immunotherapy and PARP Inhibitor in Triple Negative Breast Cancer
Status: RECRUITING
Locations: Massachusetts General Hospital Cancer Center, University of Pennsylvania Abramson Cancer Center, Sibley Memorial Hospital, Duke University Medical Center, 
Reason: This trial is specifically focused on metastatic triple-negative breast cancer and includes patients who have progressed after immunotherapy, making it a strong fit for the patient. It also includes a PARP inhibitor (Niraparib) which the patient is seeking.


Rank: 2

nctId: NCT05933265
Title: Study of LP-184 in Patients With Advanced Solid Tumors
Status: RECRUITING
Locations: Fox Chase Cancer Center, John Hopkins - The Sidney Kimmel Comprehensive Cancer Center, Northwest Oncology & Hematology, Indiana University Melvin and Bren Simon Cancer Center, Norton Healthcare, Inc., 
Reason: While this trial addresses advanced solid tumors including triple-negative breast cancer, it does not sp