# Clinical Trials LLM

**Install necessary libraries**

In [None]:
# Upgrade the OpenAI SDK and the LangChain OpenAI integration used by the cells below.
!pip install openai --upgrade
!pip install langchain_openai --upgrade

Collecting openai
  Downloading openai-1.37.1-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.37.1-py3-none-any.whl (337 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-an

### Import and Pre-Process Clinical Trial Data from clinicaltrials.gov

In [None]:
import requests
import pandas as pd

# ClinicalTrials.gov API v2 endpoint and the query parameters for the first page
base_url = "https://clinicaltrials.gov/api/v2/studies"
params = {
    "query.titles": "Breast Cancer",
    "pageSize": 100
}

# Seconds to wait on each request; without a timeout a stalled connection
# would block the cell indefinitely.
request_timeout = 30

# One flat dictionary per study, accumulated across all pages
data_list = []

# Follow the nextPageToken chain until the API reports no further page
while True:
    # Print the current URL (for debugging purposes); values are shown un-encoded
    print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

    response = requests.get(base_url, params=params, timeout=request_timeout)

    if response.status_code != 200:
        # Best effort: stop paging but keep the studies collected so far
        print("Failed to fetch data. Status code:", response.status_code)
        break

    data = response.json()

    for study in data.get('studies', []):
        # Look every module up with .get so that a study lacking one falls back
        # to the placeholder values below instead of aborting with a KeyError.
        protocol = study.get('protocolSection', {})
        identification = protocol.get('identificationModule', {})
        status = protocol.get('statusModule', {})
        design = protocol.get('designModule', {})

        conditions = ', '.join(
            protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
        )

        interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
        if interventions_list:
            interventions = ', '.join(
                intervention.get('name', 'No intervention name listed')
                for intervention in interventions_list
            )
        else:
            interventions = "No interventions listed"

        locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
        if locations_list:
            locations = ', '.join(
                f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                for location in locations_list
            )
        else:
            locations = "No locations listed"

        # Eligibility criteria are kept as the single free-text string the API returns
        eligibility = protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown')

        data_list.append({
            "NCT ID": identification.get('nctId', 'Unknown'),
            "Acronym": identification.get('acronym', 'Unknown'),
            "Overall Status": status.get('overallStatus', 'Unknown'),
            "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
            "Conditions": conditions,
            "Interventions": interventions,
            "Locations": locations,
            "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
            "Study First Post Date": status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date'),
            "Last Update Post Date": status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date'),
            "Study Type": design.get('studyType', 'Unknown'),
            "Phases": ', '.join(design.get('phases', ['Not Available'])),
            "Eligibility": eligibility,
        })

    # Request the next page, or stop when the API returns no token
    nextPageToken = data.get('nextPageToken')
    if not nextPageToken:
        break
    params['pageToken'] = nextPageToken

# Create a DataFrame from the list of dictionaries
clinical_trials_data = pd.DataFrame(data_list)

# Save the unfiltered download next to the notebook
clinical_trials_data.to_csv("clinical_trials_data.csv", index=False)

Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JCFl_Ms
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JWCk_go
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JqClvgr
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JKFlPIhww
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JKClfIhwA
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JKPlPkqyA
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JGFlvgrxA
Fetching data f

In [None]:
# Keep only trials that are currently recruiting. The explicit copy makes the
# result an independent DataFrame, so new columns can be assigned to it later
# without pandas warning about writing to a slice of the unfiltered frame.
clinical_trials_data = clinical_trials_data[clinical_trials_data['Overall Status'] == 'RECRUITING'].copy()

In [None]:
# Report how many trials remain in clinical_trials_data
print(f'Number of Clinical Trials: {len(clinical_trials_data)}')

Number of Clinical Trials: 1756


### Import Patient Records

In [None]:
# Mount Google Drive in the Colab runtime; force_remount=True remounts even if
# a mount already exists.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
#!ls /content/drive/MyDrive/synthea_dataset

In [None]:
def flatten_dataframe(df):
    """Expand one level of list/dict-valued columns into flat columns.

    Each column that holds at least one list or dict is exploded, its elements
    are normalised with ``pd.json_normalize`` and the resulting columns, named
    ``<column>_<sub_column>``, replace the original column.

    The expanded values are joined back on the index of the row they came
    from. Rows whose value is null or an empty list keep NaN in the new
    columns, and a row whose list has several elements is repeated once per
    element. Columns created by the expansion are not themselves re-examined;
    call the function again to flatten a further level.

    Parameters:
        df: DataFrame to flatten. It is not modified.

    Returns:
        A DataFrame with the nested columns replaced. When at least one column
        was expanded the result carries a fresh positional index.
    """
    for column in df.columns:
        if not df[column].apply(lambda x: isinstance(x, (list, dict))).any():
            continue

        # A unique positional index lets every exploded element be traced back
        # to exactly one source row, even if the caller's index had duplicates.
        df = df.reset_index(drop=True)
        exploded = df[column].explode().dropna()

        normalized_df = pd.json_normalize(exploded.tolist(), sep='_')
        normalized_df.columns = [f"{column}_{sub_col}" for sub_col in normalized_df.columns]

        remaining = df.drop(columns=[column])
        if normalized_df.shape[1] == 0:
            # Nothing to attach; just drop the nested column.
            # NOTE(review): lists of plain scalars seem to end up here, so their
            # values are discarded with the column - confirm this is intended.
            df = remaining
            continue

        # json_normalize numbers its rows 0..n-1; restore the source row labels
        # so the join below pairs each element with the row it belongs to.
        normalized_df.index = exploded.index
        df = remaining.join(normalized_df).reset_index(drop=True)
    return df

In [None]:
# Synthea patient bundles (one JSON file per patient) stored on the mounted drive
patient_files = [
    "/content/drive/MyDrive/synthea_dataset/Paulita78_Watsica258_2dcefbbe-9f7c-4253-85f8-e1f0e74d90c1.json",
    "/content/drive/MyDrive/synthea_dataset/Grace552_Little434_4198b779-4200-467c-acc5-5819803189fe.json",
    "/content/drive/MyDrive/synthea_dataset/Dominique369_Daniel959_0e197e39-92cc-4621-b60b-0e0544fa7c49.json",
    "/content/drive/MyDrive/synthea_dataset/Jeri234_Koss676_a0437435-67d7-49f0-8a7d-a2344094a8f9.json",
    "/content/drive/MyDrive/synthea_dataset/Luis923_Cremin516_e06ba0d6-5338-4630-be9c-c7a7817fc890.json",
    "/content/drive/MyDrive/synthea_dataset/Ronnie7_Greenfelder433_fddf3bac-f14d-45a9-a0b0-690435ea799b.json",
    "/content/drive/MyDrive/synthea_dataset/Tambra47_Lang846_ae6b9280-f98d-455b-87fa-cc576ab01d2d.json",
    "/content/drive/MyDrive/synthea_dataset/Veronique514_Koepp521_874cbeae-3145-40a1-940a-b5fed8a60f99.json",
    "/content/drive/MyDrive/synthea_dataset/Landon622_Beier427_f49b5d35-10da-4fb7-80af-7bebe7047d9f.json",
    "/content/drive/MyDrive/synthea_dataset/Adrienne302_Zulauf375_1c83fb03-d139-4626-bb39-4a062c22b533.json",
]

# Filled by the loading loop: patient name -> per-resource-type DataFrames,
# and the list of patient names in load order.
patient_data = dict()
patient_names = list()

In [None]:
# Second mount request, this time without force_remount. The output recorded
# below shows the drive was already mounted when this cell ran.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
from collections import defaultdict
import pandas as pd

# Columns whose name contains any of these substrings are removed from every
# resource table before it is stored.
dropped_column_substrings = ['reference', 'system', 'telecom', 'identifier', 'Identifier', 'uid', 'Address', 'address']

for patient_file in patient_files:
    # ".../<First>_<Last>_<uuid>.json" -> "<First>_<Last>"
    patient_name = patient_file.split('/')[-1].split('.')[0]
    patient_name = '_'.join(patient_name.split('_')[:2])
    print(patient_name)
    # Re-running this cell must not list a patient twice, otherwise every later
    # loop over patient_names would query the LLM repeatedly for that patient.
    if patient_name not in patient_names:
        patient_names.append(patient_name)

    # Load the patient bundle
    with open(patient_file, encoding='utf-8') as f:
        data = json.load(f)

    # Group the bundle's resources by their resourceType
    resource_dict = defaultdict(list)
    for entry in data.get('entry', []):
        resource = entry.get('resource', {})
        resource_dict[resource.get('resourceType', 'Unknown')].append(resource)

    # One flattened DataFrame per resource type
    patient = {}
    for resource_type, resources in resource_dict.items():
        # Normalize the resources and separate nested keys with "_"
        df = pd.json_normalize(resources, sep='_')

        # Flatten the first layer of nested columns
        df = flatten_dataframe(df)

        df = df.drop(columns=['id'], errors='ignore')

        columns_to_drop = [
            col for col in df.columns
            if any(substring in col for substring in dropped_column_substrings)
        ]
        df = df.drop(columns=columns_to_drop)

        # Handle the second layer of nesting
        df = flatten_dataframe(df)

        patient[resource_type] = df

    patient_data[patient_name] = patient

Paulita78_Watsica258
Grace552_Little434
Dominique369_Daniel959
Jeri234_Koss676
Luis923_Cremin516
Ronnie7_Greenfelder433
Tambra47_Lang846
Veronique514_Koepp521
Landon622_Beier427
Adrienne302_Zulauf375


### **Part 1:** Query LLM with Full Patient Record and Full Clinical Trial Data

### **Part 2:** Prompt LLM to Create a Patient Summary from the Full Patient Record

### **Part 3:** Query LLM with Patient Summary and Full Clinical Trial Data

In [None]:
# Collects the LLM output per patient: patient name -> {result label: value}
llm_responses = {}

In [None]:
import os
from getpass import getpass
from langchain_openai import ChatOpenAI

# SECURITY: never store an API key in the notebook source. The key that used
# to be hard-coded here has been exposed and should be revoked. Take the key
# from the environment, or prompt for it without echoing it to the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model used for the full-record and patient-summary prompts
model = "gpt-4-turbo"
llm = ChatOpenAI(temperature=0.1, model=model)

In [None]:
from datetime import date
today_date = date.today()

# For every patient, run three prompts against `llm` and store each answer and
# its response metadata in llm_responses[patient].
for patient in patient_names:
  print(patient)

  responses = {}

  # Trial 1: Query LLM with Full Patient Data and Full Clinical Trial Data
  # patient_data[patient] is a dict of DataFrames keyed by resource type.
  patient_df = patient_data[patient]

  # NOTE(review): {patient_df} and {clinical_trials_data} are interpolated via
  # their default string form; pandas' printed form may abbreviate rows, columns
  # and long cell text - TODO confirm the full data actually reaches the model.
  # NOTE(review): the prompt says "today's data" (probably "date") and ends with
  # a stray "'" character; both are sent to the model as written.
  query1 = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history in the mCODE format. {patient_df}. Extract information about the patient's cancer prognosis and treatment. The patient's age can be inferred from today's data, {today_date} and the patient’s date of birth. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility (inclusion and exclusion) criteria. For each eligibility criterion for the trial, explain why the patient is eligible. If the patient does not match the criteria for any of the trials, explain why. {clinical_trials_data}'
"""

  response1 = llm.invoke(query1)

  responses['Full+Full'] = response1.content
  responses['Full+Full_Metadata'] = response1.response_metadata

  # Trial 2: Query LLM to Write a Patient Summary
  # NOTE(review): this prompt also ends with a stray "'" character.
  query2 = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following health record in the mCODE format. Write a detailed summary for this patient's cancer diagnosis and prognosis. Report the clinical stage type of the cancer diagnosis and other descriptive information about the cancer. Report the gender and the patient's age as of today's date, {today_date}. Report the patient's comorbidities and treatments. {patient_df}'
"""

  response2 = llm.invoke(query2)

  responses['Patient_Summary'] = response2.content
  responses['Patient_Summary_Metadata'] = response2.response_metadata

  # Trial 3: Query LLM with Patient Summary and Full Clinical Trial Data
  # Reuses the summary written in Trial 2 in place of the raw patient record.
  # NOTE(review): this prompt ends with a stray ".’" sequence.
  query3 = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history. {response2.content}. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility (inclusion and exclusion) criteria. For each eligibility criterion for the trial, explain why the patient is eligible. If the patient does not match the criteria for any of the trials, explain why. {clinical_trials_data}.’
"""

  response3 = llm.invoke(query3)

  responses['Summary+Full'] = response3.content
  responses['Summary+Full_Metadata'] = response3.response_metadata

  llm_responses[patient] = responses

Paulita78_Watsica258
Grace552_Little434
Dominique369_Daniel959
Jeri234_Koss676
Luis923_Cremin516
Ronnie7_Greenfelder433
Tambra47_Lang846
Veronique514_Koepp521
Landon622_Beier427
Adrienne302_Zulauf375


### **Part 4:** Query RAG-Enabled LLM with Patient Summary and Text-Based Clinical Trial Data




In [None]:
from openai import OpenAI

# Client used to request embeddings; the key is read from the environment
client = OpenAI(
  api_key=os.environ["OPENAI_API_KEY"],  # this is also the default, it can be omitted
)

# One descriptive sentence per trial; this is the text that gets embedded and
# later pasted into the prompt. ("eligibility" was previously misspelled here.)
clinical_trials_data['text'] = 'The trial with NCT ID number ' + clinical_trials_data['NCT ID'] + ' has the following eligibility criteria, for inclusion and exclusion ' + clinical_trials_data['Eligibility']
clinical_trials_data.head()['text']

Unnamed: 0,text
6,"The trial with NCT ID number NCT05939830 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n1. Female aged between 18 and 70 years;\n2. Pathologically confirmed invasive breast cancer (regardless of pathological type) with a clinical stage cT1-3;\n3. Pathologically confirmed positive axillary lymph nodes with a clinical stage of N1-3;\n4. Receiving a full course of neoadjuvant therapy (including neoadjuvant chemotherapy, neoadjuvant targeted therapy, neoadjuvant immunotherapy);\n5. Positive axillary lymph nodes successfully stained by carbon nanoparticles injection with/without titanium clip marking;\n6. All patients are required to undergo immunohistochemical staining for Estrogen Receptor (ER), Progesterone Receptor (PR), human epidermal growth factor receptor 2 (HER2), Ki- 67 proliferation index, and further fluorescence in situ hybridization (FISH) should be performed in HER2 2+ cases;\n7. Preoperative clinical assessment (including physical examination, imaging, with or without nomogram assessment) suggests positive axillary lymph nodes converted to negative (ycN0);\n8. ECOG score 0 - 1;\n9. Patients voluntarily participated in this study and signed the informed consent form\n\nExclusion Criteria:\n\n1. Bilateral breast cancer;\n2. Breast cancer during lactation period or pregnancy;\n3. Physical examination or imaging examination confirmed presence of distant metastases;\n4. Previous history of malignant tumor;\n5. History of previous surgery on the affected axilla; or history of surgery affecting the function of the upper extremity;\n6. History of radiation therapy to the breast or chest;\n7. Positive incision margins for breast-conserving surgery/mastectomy;\n8. Positive results of intraoperative rapid freeze pathology (including isolated tumor cells and micrometastases) for SrLNB (ypN+);\n9. Those who unable to complete the full course of follow-up adjuvant therapy as prescribed for various reasons;\n10. 
Aspartate transaminase (AST) and alanine transaminase (ALT) ≥ 1.5 times the upper limit of normal, alkaline phosphatase（ALP） ≥ 2.5 times the upper limit of normal, total bile ≥ 1.5 times the upper limit of normal, serum creatinine ≥ 1.5 times the upper limit of normal; Left Ventricular Ejection Fractions (LVEF) \< 50% in cardiac ultrasound;\n11. Severe coagulation dysfunction, serious systemic disease, or uncontrolled infection;\n12. Without personal freedom and independent civil capacity;\n13. Those with mental disorders, addictions, who were not eligible for enrollment in the judgment of the investigator."
8,"The trial with NCT ID number NCT03147430 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Women who receive a suspicious mammogram report or are scheduled to receive testing for suspect breast area, with a subsequent biopsy to confirm diagnosis\n* Willingness and ability to donate biospecimens for the purpose of propelling research.\n* Participants aged ≥ 18.\n\nExclusion Criteria:\n\n* Individuals under 18 years of age or over 89 years of age.\n* A known history of breast cancer.\n* A diagnosis or history of any other type of cancer.\n* Participants who are male."
10,"The trial with NCT ID number NCT04638725 has the following eligbility criteria, for inclusion and exclusion For inclusion in the study, patients must be affiliated to the national or local social security, and must meet all the following criteria:\n\nInclusion Criteria:\n\n* Age ≥ 18 years\n* Histological diagnosis of breast adenocarcinoma. Non-metastatic and operable.\n* Current or prior treatment with one therapy targeting HER2 in adjuvant or neoadjuvant phase for the current breast cancer\n* Given written informed consent\n\nExclusion Criteria:\n\n* Patients not able to comply to the protocol assessments for geographic, social or psychological reasons\n* Patients placed under judicial protection, guardianship, or supervision\n* History of cancer in the 5 years preceding anti-HER2 therapy initiation\n* Concomitant cancer (except for an other non metastatic cancer treated only with surgery)\n\nNote : Patients are eligible at any time of the follow-up if the adjuvant or neoadjuvant chemotherapy started after 01/01/2019. Patients treated with trastuzumab, pertuzumab, neratinib or T-DM1 in a clinical trial are eligible in the SIGHER study."
20,"The trial with NCT ID number NCT05989776 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Patients over 18 and under 40 years of age treated for newly diagnosed breast cancer and receiving chemotherapy.\n* People who are able to read the study information poster and the fertility preservation information brochure in French, either alone or with help from a caregiver or relative or an interpreter.\n* People with intellectual disability will be included as long as they are able able to read the study information poster and the fertility preservation information brochure in French alone or with help from a caregiver or relative or an interpreter..\n\nExclusion Criteria:\n\n* People whose mental health status precludes participation in the study, as determined by the clinical team."
21,"The trial with NCT ID number NCT04506476 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Capacity for consent\n* Minimum age 18\n* Presence of breast cancer\n* ECOG 0-2\n* Indication for adjuvant radiotherapy of breast cancer after breast-conserving surgery or Ablatio mammae\n\nExclusion Criteria:\n\n* Participation in any other interventional study\n* Pregnancy\n* Contraindication against physical activity/sport and others\n* Severe cardiovascular pre-existing conditions (after myocardial infarction, apoplexy in the last 6 months, congestive heart failure NYHA \> I°)\n* preexisting diseases with are relevantly accompanied by a limited mobility in patients (e.g. paraparesis of the lower limbs)\n* ECOG Status 3-4\n* prior use of activity trackers"


In [None]:
import openai
import tiktoken
from scipy import spatial
import pandas as pd
import ast
import openai

# Function to get text embeddings
def text_embedding(text):
    """Return the "text-embedding-ada-002" embedding of ``text``.

    Sends one request through the module-level ``client`` and returns the
    embedding of the first (only) item in the response.
    """
    embedding_result = client.embeddings.create(input=text, model="text-embedding-ada-002")
    first_item = embedding_result.data[0]
    return first_item.embedding

# Embed every trial description in the 'text' column (one call per row)
clinical_trials_data['embedding'] = clinical_trials_data['text'].apply(text_embedding)

#print(clinical_trials_data.head())

In [None]:
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn = lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 1000
):
    """Rank the rows of ``df`` by how related their embedding is to ``query``.

    Parameters:
        query: Text to embed and compare against every row.
        df: Frame with a "text" column and an "embedding" column.
        relatedness_fn: Called as ``relatedness_fn(query_embedding, row_embedding)``;
            larger values rank first. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two tuples ``(strings, relatednesses)`` sorted by descending
        relatedness. Both are empty when ``df`` has no rows.
    """
    EMBEDDING_MODEL = "text-embedding-ada-002"

    # Nothing to rank: return early instead of failing on zip(*[]) below,
    # and without spending an embedding request.
    if df.empty:
        return (), ()

    query_embedding_response = openai.embeddings.create(
        model = EMBEDDING_MODEL,
        input = query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key = lambda x: x[1], reverse = True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

def num_tokens(text: str) -> int:
    """Return how many tokens ``text`` has under the encoding tiktoken selects for "gpt-4o"."""
    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) :
    """Build the prompt: instructions, the best-ranked trial texts, then the question.

    Trial texts are taken in the order returned by
    ``strings_ranked_by_relatedness`` and appended until adding the next one
    would push the whole message past ``token_budget`` tokens.

    ``model`` is accepted but not used inside this function.
    """
    ranked_texts, _ = strings_ranked_by_relatedness(query, df)
    question = f"\n\nQuestion: {query}"
    message = 'Use the below clinical trial information to match patients to clinical trials based on their eligibility criteria. Explain why the patient does or does not match the eligibility criteria.'
    for trial_text in ranked_texts:
        section = f'\n\nClinical trial section:\n"""\n{trial_text}\n"""'
        # Stop at the first section that no longer fits the budget
        if num_tokens(message + section + question) > token_budget:
            break
        message += section
    return message + question

def ask(
    query: str,
    df: pd.DataFrame = None,
    model: str = "gpt-4o",
    token_budget: int = 25000,
    print_message: bool = False,
) :
    """Answer ``query`` with a chat model, grounded in the most related trials.

    Parameters:
        query: The question, including the patient description.
        df: Trials to retrieve from. When omitted, the current global
            ``clinical_trials_data`` is used. It is looked up at call time, so
            re-filtering or re-embedding that frame takes effect without
            redefining this function.
        model: Chat model name passed to the completion request.
        token_budget: Upper bound on the tokens of the assembled user message.
        print_message: Print the assembled message before sending it.

    Returns:
        The chat completion response object.
    """
    if df is None:
        df = clinical_trials_data
    message = query_message(query, df, model=model, token_budget = token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You match patients to clinical trials that they are eligible for."},
        {"role": "user", "content": message},
    ]
    response = openai.chat.completions.create(
        model = model,
        messages = messages,
        temperature = 0
    )

    return response

In [None]:
for patient in patient_names:

  patient_summary = llm_responses[patient]['Patient_Summary']

  # Trial 4: Query the RAG-enabled LLM with the Patient Summary.
  # (The accidental extra `query` alias of this prompt has been removed.)
  query4 = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history. {patient_summary}. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility (inclusion and exclusion) criteria. For each eligibility criterion for the trial, explain why the patient is eligible. If the patient does not match the criteria for any of the trials, explain why."""

  response4 = ask(query4)

  # The metadata entry holds the whole response object returned by ask()
  llm_responses[patient]['Summary+RAG'] = response4.choices[0].message.content
  llm_responses[patient]['Summary+RAG_Metadata'] = response4

### **Part 5:** Query RAG-Enabled LLM with Patient Summary and Text-Based Clinical Trial Data Using an Alternative Relevancy Metric

The retrieval step now ranks each trial by combining two scores: how related the patient summary is to the trial's inclusion criteria, and how unrelated it is to the trial's exclusion criteria.

In [None]:
# Independent copy of the trial table without the whole-text embedding column
clinical_trials_data2 = clinical_trials_data.copy().drop(columns=['embedding'])

In [None]:
def split_eligibility(row):
    """Split a trial's eligibility text into inclusion and exclusion parts.

    The text in ``row['Eligibility']`` is cut at the first "Exclusion
    Criteria:" heading, matched regardless of capitalisation. Later
    occurrences of the heading stay inside the exclusion part, so they no
    longer cause the whole text to be treated as inclusion criteria.

    Parameters:
        row: Mapping or Series with an 'Eligibility' string.

    Returns:
        ``(inclusion_criteria, exclusion_criteria)``, both stripped of
        surrounding whitespace. When no heading is found, the whole text is
        returned as inclusion criteria and the exclusion part is ''.
    """
    import re  # local import keeps this cell self-contained

    eligibility = row['Eligibility']
    heading = re.search(r'exclusion criteria:', eligibility, flags=re.IGNORECASE)
    if heading is None:
        return eligibility.strip(), ''
    inclusion_criteria = eligibility[:heading.start()].strip()
    exclusion_criteria = eligibility[heading.end():].strip()
    return inclusion_criteria, exclusion_criteria

# Split every trial's eligibility text into two new columns
split_columns = clinical_trials_data2.apply(split_eligibility, axis=1, result_type='expand')
clinical_trials_data2[['Inclusion_Criteria', 'Exclusion_Criteria']] = split_columns

# Descriptive sentences for each half; both start with the same NCT ID prefix
nct_prefix = 'The trial with NCT ID number ' + clinical_trials_data2['NCT ID']
clinical_trials_data2['text_inclusion'] = (
    nct_prefix
    + ' has the following eligibility inclusion criteria: '
    + clinical_trials_data2['Inclusion_Criteria']
)
clinical_trials_data2['text_exclusion'] = (
    nct_prefix
    + ' has the following eligibility exclusion criteria: Exclusion Criteria: '
    + clinical_trials_data2['Exclusion_Criteria']
)

# Display the head of the new columns
#print(clinical_trials_data2.head()[['text_inclusion', 'text_exclusion']])

In [None]:
# Embed the inclusion and exclusion sentences separately (one call per row and column)
for source_column, target_column in (
    ('text_inclusion', 'embedding_inclusion'),
    ('text_exclusion', 'embedding_exclusion'),
):
    clinical_trials_data2[target_column] = clinical_trials_data2[source_column].apply(text_embedding)

#print(clinical_trials_data2.head())

In [None]:
# Function to compute the combined score
def combined_score(query_embedding, row, relatedness_fn, unrelatedness_fn):
    """Score a trial for a query from its inclusion and exclusion embeddings.

    Parameters:
        query_embedding: Embedding of the query text.
        row: Mapping or Series with "embedding_inclusion" and
            "embedding_exclusion".
        relatedness_fn: ``f(query_embedding, embedding)``; larger means the
            query is closer to the inclusion criteria.
        unrelatedness_fn: ``f(query_embedding, embedding)``; larger means the
            query is further from the exclusion criteria.

    Returns:
        ``relatedness + unrelatedness``. A trial ranks higher the more the
        query resembles its inclusion criteria and the less it resembles its
        exclusion criteria. Subtracting the unrelatedness, as this function
        did before, rewarded trials whose exclusion criteria match the patient.
    """
    relatedness = relatedness_fn(query_embedding, row["embedding_inclusion"])
    unrelatedness = unrelatedness_fn(query_embedding, row["embedding_exclusion"])
    return relatedness + unrelatedness

# Function to rank strings by combined score
def strings_ranked_by_combined_score(
    query: str,
    df: pd.DataFrame,
    relatedness_fn = lambda x, y: 1 - spatial.distance.cosine(x, y),
    unrelatedness_fn = lambda x, y: spatial.distance.cosine(x, y),
    top_n: int = 1000
):
    """Rank the rows of ``df`` by ``combined_score`` against ``query``.

    Parameters:
        query: Text to embed and compare against every row.
        df: Frame with "text", "embedding_inclusion" and
            "embedding_exclusion" columns.
        relatedness_fn: Similarity applied to the inclusion embedding
            (default: cosine similarity).
        unrelatedness_fn: Dissimilarity applied to the exclusion embedding
            (default: cosine distance).
        top_n: Maximum number of results to return.

    Returns:
        Two tuples ``(strings, scores)`` sorted by descending score. Both are
        empty when ``df`` has no rows.
    """
    EMBEDDING_MODEL = "text-embedding-ada-002"

    # Nothing to rank: return early instead of failing on zip(*[]) below,
    # and without spending an embedding request.
    if df.empty:
        return (), ()

    query_embedding_response = openai.embeddings.create(
        model = EMBEDDING_MODEL,
        input = query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_scores = [
        (row["text"], combined_score(query_embedding, row, relatedness_fn, unrelatedness_fn))
        for _, row in df.iterrows()
    ]
    strings_and_scores.sort(key=lambda x: x[1], reverse=True)
    strings, scores = zip(*strings_and_scores)
    return strings[:top_n], scores[:top_n]


# Function to generate query message
def query_message2(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
):
    """Build the prompt from trials ranked by the combined inclusion/exclusion score.

    Trial texts are taken in the order returned by
    ``strings_ranked_by_combined_score`` and appended until adding the next
    one would push the whole message past ``token_budget`` tokens.

    ``model`` is accepted but not used inside this function.
    """
    ranked_texts, _ = strings_ranked_by_combined_score(query, df)
    question = f"\n\nQuestion: {query}"
    message = 'Use the below clinical trial information to match patients to clinical trials based on their eligibility criteria. Explain why the patient does or does not match the eligibility criteria.'
    for trial_text in ranked_texts:
        section = f'\n\nClinical trial section:\n"""\n{trial_text}\n"""'
        # Stop at the first section that no longer fits the budget
        if num_tokens(message + section + question) > token_budget:
            break
        message += section
    return message + question

# Function to ask the model
def ask2(
    query: str,
    df: pd.DataFrame = None,
    model: str = "gpt-4o",
    token_budget: int = 25000,
    print_message: bool = False,
):
    """Answer ``query`` with a chat model, using the combined-score retrieval.

    Parameters:
        query: The question, including the patient description.
        df: Trials to retrieve from. When omitted, the current global
            ``clinical_trials_data2`` is used. It is looked up at call time, so
            rebuilding that frame takes effect without redefining this function.
        model: Chat model name passed to the completion request.
        token_budget: Upper bound on the tokens of the assembled user message.
        print_message: Print the assembled message before sending it.

    Returns:
        The chat completion response object.
    """
    if df is None:
        df = clinical_trials_data2
    message = query_message2(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You match patients to clinical trials that they are eligible for."},
        {"role": "user", "content": message},
    ]
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )

    return response

In [None]:
for patient in patient_names:
  patient_results = llm_responses[patient]
  patient_summary = patient_results['Patient_Summary']

  # Trial 5: same prompt as the RAG run, answered through ask2 (combined-score ranking)
  query5 = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history. {patient_summary}. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility (inclusion and exclusion) criteria. For each eligibility criterion for the trial, explain why the patient is eligible. If the patient does not match the criteria for any of the trials, explain why."""

  response5 = ask2(query5)

  # Store the answer text and the whole response object for this patient
  patient_results.update({
      'Summary+AlternateRAG': response5.choices[0].message.content,
      'Summary+AlternateRAG_Metadata': response5,
  })

### **Save Files**

In [None]:
# One row per patient (the keys of llm_responses), one column per stored result
llm_responses_df = pd.DataFrame.from_dict(data=llm_responses, orient='index')
llm_responses_df.to_csv(path_or_buf='llm_responses.csv')

In [None]:
# Show the collected responses; the recorded output below abbreviates long cells with "..."
print(llm_responses_df)

                                                                Full+Full  \
Paulita78_Watsica258    Based on the provided data, the patient is a f...   
Grace552_Little434      To determine the eligibility of the patient fo...   
Dominique369_Daniel959  To determine the patient's eligibility for cli...   
Jeri234_Koss676         To determine the patient's eligibility for cli...   
Luis923_Cremin516       To determine the patient's eligibility for cli...   
Ronnie7_Greenfelder433  Based on the provided mCODE data for the patie...   
Tambra47_Lang846        Based on the provided data and the patient's m...   
Veronique514_Koepp521   To determine the patient's eligibility for cli...   
Landon622_Beier427      To determine the patient's eligibility for cli...   
Adrienne302_Zulauf375   To determine the patient's eligibility for cli...   

                                                       Full+Full_Metadata  \
Paulita78_Watsica258    {'token_usage': {'completion_tokens': 539, 'pr...  

References:

OpenAI et al. (2023). GPT-4 Technical Report. https://arxiv.org/abs/2303.08774

OpenAI. (2024). Hello GPT-4o. https://openai.com/index/hello-gpt-4o/
Luo et al. (2022). BioGPT: Generative Pre-trained Transformer for Biomedical Text Generation and Mining. https://arxiv.org/abs/2210.10341
Clinicaltrials.gov.

NIH: User Guide for Clinicaltrials.gov Website: https://clinicaltrials.gov/submit-studies/prs-help/user-guide#intro

Jin, Q., Wang, Z., Floudas, C. S., Chen, F., Gong, C., Bracken-Clarke, D., ... & Lu, Z. (2023). Matching patients to clinical trials with large language models. arXiv:2307.15051v4. Accessed on August 2, 2024. Available from: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10418514/
Koopman, B., & Zuccon, G. (2016, July). A test collection for matching patients to clinical trials. In Proceedings of the 39th International ACM SIGIR conference on Research and Development in Information Retrieval (pp. 669-672).

Msv, J. (2023, July 21). Tutorial: Build a Q&A bot for Academy Awards based on ChatGPT. The New Stack. https://thenewstack.io/tutorial-build-a-qa-bot-for-academy-awards-based-on-chatgpt/
Stack Overflow. Accessed on August 4, 2024. https://stackoverflow.com/questions/78415818/how-to-get-full-results-with-clinicaltrials-gov-api-in-python

Roberts, K., Demner-Fushman, D., Voorhees, E.M., Bedrick, S. & Hersh, W.R. Overview of the TREC 2021 Clinical Trials Track. in Proceedings of the Thirtieth Text REtrieval Conference (TREC 2021) (2021).

Walonoski, J., Kramer, M., Nichols, J., Quina, A., Moesel, C., Hall, D., ... & McLachlan, S. (2018). Synthea: An approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record. Journal of the American Medical Informatics Association, 25(3), 230-238.

Ceylan, B., & Özerdoğan, N. (2015). Factors affecting age of onset of menopause and determination of quality of life in menopause. Turkish journal of obstetrics and gynecology, 12(1), 43–49. https://doi.org/10.4274/tjod.79836
Swanner, K. D., & Richmond, L. B. (2023). A 65-Year-Old Woman With No Menopause History: A Case Report. Cureus, 15(9), e44792. https://doi.org/10.7759/cureus.44792
