# Synthea Analysis

In [44]:
# Mount Google Drive inside the Colab runtime so the dataset folder is reachable
# under /content/drive/. force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [45]:
# List the patient bundle files available in the mounted dataset folder.
!ls /content/drive/MyDrive/synthea_dataset

'Adrian111_Bartell116_fd25b51b-dc2a-4edb-b7c9-2c5d178a0ee7 (1).json'
 Adrian111_Bartell116_fd25b51b-dc2a-4edb-b7c9-2c5d178a0ee7.json
 Adrianna470_Boyer713_6fcf56ed-8ad8-4395-a966-9ebee3822656.json
 Adrienne302_Zulauf375_1c83fb03-d139-4626-bb39-4a062c22b533.json
 Alaine226_Kassulke119_5efa5654-8190-4f4d-9ba3-3b79448440b5.json
 Ali918_Larson43_896d3035-d036-4c8a-9901-367dc8814495.json
 Allegra202_Heller342_cc4c634f-c2ff-4963-b75a-fa2e33dc208f.json
 Allena79_Pacocha935_d11e7fb0-5853-4762-b8fa-6cd9008a790b.json
 Almeda560_Davis923_c762bbaa-b207-4b4d-8123-e5bc93e3b99b.json
 Ana_María762_Gamez720_de3666a6-b4e3-4ec8-9ae7-bde168e03465.json
 Anne636_Collins926_42c179ad-99bd-4ca4-861f-7151d774a243.json
 Annelle169_King743_ddd8d7a4-5992-4572-a231-80bc305298b1.json
 Annemarie794_Kihn564_907083ad-22c9-4067-8321-a6622ce9792c.json
 Aracely711_Carter549_31665402-bfc5-45e1-b12a-fdc42836f50a.json
 Arden380_Reichel38_9c6e8766-9da5-428e-be61-e81a78a920c2.json
 Arlean815_MacGyver246_f9e2ec3e-731f-4341-8a6

In [46]:
import json
from collections import defaultdict
import pandas as pd

# Load the patient bundle. JSON interchange text is UTF-8, and the dataset
# contains non-ASCII names, so decode explicitly instead of relying on the
# platform's default encoding.
with open("/content/drive/MyDrive/synthea_dataset/Paulita78_Watsica258_2dcefbbe-9f7c-4253-85f8-e1f0e74d90c1.json", encoding="utf-8") as f:
    data = json.load(f)

# Will hold one flattened DataFrame per resource type
patient={}

# Extract the entries
entries = data.get('entry', [])

# Group the resources by resourceType; entries without a resource or without a
# resourceType are collected under 'Unknown'.
resource_dict = {}
for entry in entries:
    resource = entry.get('resource', {})
    resource_type = resource.get('resourceType', 'Unknown')
    resource_dict.setdefault(resource_type, []).append(resource)


In [47]:
def flatten_dataframe(df):
    """Replace list/dict-valued columns with flat, prefixed columns.

    Every column present on entry is inspected once. If any of its cells is a
    list or dict, the column is exploded, missing entries are dropped, and the
    remaining items are passed through ``pd.json_normalize``. The resulting
    columns are renamed to ``<column>_<sub_col>`` and joined onto the frame in
    place of the nested column.

    The normalized frame is built from a plain list, so it carries a fresh
    0..n-1 index and is joined to ``df`` on that index. Columns created during
    this pass are not re-inspected; call the function again to flatten a
    further level of nesting.
    """
    columns_on_entry = list(df.columns)
    for column in columns_on_entry:
        holds_nested = df[column].apply(lambda x: isinstance(x, (list, dict))).any()
        if not holds_nested:
            continue

        # One item per exploded element, with empty/missing entries removed
        nested_items = df[column].explode().dropna().tolist()
        expanded = pd.json_normalize(nested_items, sep='_')
        expanded = expanded.rename(columns=lambda sub_col: f"{column}_{sub_col}")

        # Swap the nested column for its flattened counterpart
        df = df.drop(columns=[column]).join(expanded)
    return df


In [48]:
# Show which resource types were found in the bundle.
print(resource_dict.keys())

dict_keys(['Patient', 'Location', 'Organization', 'Practitioner', 'PractitionerRole', 'Encounter', 'Observation', 'Procedure', 'DiagnosticReport', 'DocumentReference', 'Immunization', 'Condition', 'MedicationRequest', 'CareTeam', 'CarePlan'])


In [49]:
for resource_type, resources in resource_dict.items():
    print(f"ResourceType: {resource_type}")

    # Turn nested dictionaries into underscore-joined columns, then expand the
    # list-valued columns that remain.
    df = flatten_dataframe(pd.json_normalize(resources, sep='_'))

    # Report the status values present; no rows are filtered on status.
    if 'status' in df.columns:
        print(df['status'].unique())

    # Drop the resource id when present
    df = df.drop(columns=['id'], errors='ignore')

    # Remove columns whose name contains any of these fragments
    columns_to_drop = [
        col
        for col in df.columns
        if any(
            substring in col
            for substring in ['reference', 'system', 'telecom', 'identifier', 'Identifier', 'uid', 'Address', 'address']
        )
    ]
    df = df.drop(columns=columns_to_drop)

    # Second pass to handle the next layer of nesting
    df = flatten_dataframe(df)

    print(df.columns)

    patient[resource_type] = df
    print('\n')

ResourceType: Patient
Index(['resourceType', 'gender', 'birthDate', 'multipleBirthBoolean',
       'text_status', 'text_div', 'maritalStatus_text', 'extension_url',
       'extension_valueString', 'extension_valueCode',
       'extension_valueDecimal', 'name_use', 'name_family',
       'communication_language_text', 'maritalStatus_coding_code',
       'maritalStatus_coding_display', 'extension_extension_url',
       'extension_extension_valueCoding_system',
       'extension_extension_valueCoding_code',
       'extension_extension_valueCoding_display',
       'extension_extension_valueString',
       'communication_language_coding_system',
       'communication_language_coding_code',
       'communication_language_coding_display'],
      dtype='object')


ResourceType: Location
['active']
Index(['resourceType', 'status', 'name', 'position_longitude',
       'position_latitude', 'managingOrganization_display'],
      dtype='object')


ResourceType: Organization
Index(['resourceType', 'a

In [50]:
# Preview the first two rows of each flattened resource table.
for resource_type, resources in resource_dict.items():
    preview = patient[resource_type].head(2)
    print(preview)

  resourceType  gender   birthDate  multipleBirthBoolean text_status  \
0      Patient  female  1968-03-10                 False   generated   

                                            text_div maritalStatus_text  \
0  <div xmlns="http://www.w3.org/1999/xhtml">Gene...                  M   

                                       extension_url extension_valueString  \
0  http://hl7.org/fhir/us/core/StructureDefinitio...                   NaN   

  extension_valueCode  ...  maritalStatus_coding_code  \
0                 NaN  ...                          M   

  maritalStatus_coding_display extension_extension_url  \
0                            M             ombCategory   

  extension_extension_valueCoding_system extension_extension_valueCoding_code  \
0        urn:oid:2.16.840.1.113883.6.238                               2106-3   

  extension_extension_valueCoding_display extension_extension_valueString  \
0                                   White                             NaN  

In [51]:
import requests
import pandas as pd

# Initial URL for the first API call
base_url = "https://clinicaltrials.gov/api/v2/studies"
params = {
    "query.titles": "Breast Cancer",
    "pageSize": 100
}

# Upper bound (in seconds) on each HTTP request so a stalled connection cannot
# hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Initialize an empty list to store the data (one dictionary per study)
data_list = []

# Loop until there is no nextPageToken
while True:
    # Print the current URL (for debugging purposes)
    print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

    # Send a GET request to the API
    try:
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as exc:
        # Network failure or timeout: stop paging but keep what was collected
        print("Failed to fetch data. Request error:", exc)
        break

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to fetch data. Status code:", response.status_code)
        break

    data = response.json()  # Parse JSON response
    studies = data.get('studies', [])  # Extract the list of studies

    # Loop through each study and extract specific information
    for study in studies:
        # Look every module up with a default so a study that lacks one of
        # them yields placeholder values instead of raising KeyError.
        protocol = study.get('protocolSection', {})
        identification = protocol.get('identificationModule', {})
        status = protocol.get('statusModule', {})
        design = protocol.get('designModule', {})

        nctId = identification.get('nctId', 'Unknown')
        acronym = identification.get('acronym', 'Unknown')
        overallStatus = status.get('overallStatus', 'Unknown')
        startDate = status.get('startDateStruct', {}).get('date', 'Unknown Date')
        conditions = ', '.join(protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed']))

        # Extract interventions safely
        interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
        interventions = ', '.join([intervention.get('name', 'No intervention name listed') for intervention in interventions_list]) if interventions_list else "No interventions listed"

        # Extract locations safely
        locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
        locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"

        # Extract dates and phases
        primaryCompletionDate = status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date')
        studyFirstPostDate = status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date')
        lastUpdatePostDate = status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date')
        studyType = design.get('studyType', 'Unknown')
        phases = ', '.join(design.get('phases', ['Not Available']))

        # Eligibility criteria are stored as one free-text field
        eligibility = protocol.get('eligibilityModule', {}).get('eligibilityCriteria', 'Unknown')

        # Append the data to the list as a dictionary
        data_list.append({
            "NCT ID": nctId,
            "Acronym": acronym,
            "Overall Status": overallStatus,
            "Start Date": startDate,
            "Conditions": conditions,
            "Interventions": interventions,
            "Locations": locations,
            "Primary Completion Date": primaryCompletionDate,
            "Study First Post Date": studyFirstPostDate,
            "Last Update Post Date": lastUpdatePostDate,
            "Study Type": studyType,
            "Phases": phases,
            "Eligibility": eligibility
        })

    # Check for nextPageToken and update the params or break the loop
    nextPageToken = data.get('nextPageToken')
    if not nextPageToken:
        break  # Exit the loop if no nextPageToken is present
    params['pageToken'] = nextPageToken  # Set the pageToken for the next request

# Create a DataFrame from the list of dictionaries
clinical_trials_data = pd.DataFrame(data_list)

# Save the DataFrame to a CSV file
clinical_trials_data.to_csv("clinical_trials_data.csv", index=False)

Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JeHkvIv
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JWPm_cp
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JKHlPQoyA
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JKDkPApxQ
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JKBlfEsww
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JGHk_MtwA
Fetching data from: https://clinicaltrials.gov/api/v2/studies?query.titles=Breast Cancer&pageSize=100&pageToken=NF0g5JGEl_cqwA
Fetching data

In [52]:
# Keep only the trials whose overall status is RECRUITING.
clinical_trials_data = clinical_trials_data.loc[
    clinical_trials_data['Overall Status'].eq('RECRUITING')
]

In [53]:
pip install openai --upgrade



In [54]:
!pip install langchain_openai --upgrade



In [55]:
import os
from getpass import getpass

from langchain_openai import ChatOpenAI

# SECURITY: never store an API key in the notebook. The key is taken from the
# OPENAI_API_KEY environment variable and, if that is not set, requested
# interactively without echoing it. Any key that was ever pasted into a
# notebook file should be treated as exposed and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model used for the free-text prompts in this notebook
model = "gpt-4-turbo"
llm = ChatOpenAI(temperature=0.1, model=model)

In [56]:
from openai import OpenAI

# OpenAI client used for direct API calls. The key is read from the
# environment; passing it explicitly matches the library default and could be omitted.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

### **Part 1:** Querying the LLM with the full health record and the full clinical trials dataframe.

In [57]:
from datetime import date
# Current date at execution time; interpolated into prompts so the model can
# work out the patient's age.
today_date = date.today()

In [58]:
# Part 1 prompt: the whole flattened health record plus the whole trials table.
# NOTE(review): `patient` is a dict of DataFrames and `clinical_trials_data` is
# a DataFrame, so the f-string embeds their printed representation. pandas
# abbreviates long cells and large frames when printing, so the model may not
# see the full eligibility text — confirm before relying on the answers.
query = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history. {patient}. Extract information about the patient's cancer prognosis and treatment. The patient's age can be inferred from today's date, {today_date}, and the patient’s date of birth. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility criteria. List four reasons why the patient is eligible for each trial. If the patient does not match the criteria for any of the trials, explain why. {clinical_trials_data}
"""

response = llm.invoke(query)

In [59]:
# Show the text of the model's reply.
print(response.content)

Based on the provided data, the patient is a female born on March 10, 1968, which makes her 56 years old as of today's date, July 30, 2024. She has been diagnosed with a type of breast cancer, specifically "Malignant neoplasm of breast (disorder)." To find suitable clinical trials for her, we need to consider her age, gender, cancer type, and stage, along with other specific eligibility criteria mentioned in the trials.

From the clinical trials data snippet you provided, I will analyze the eligibility criteria for each trial to determine if the patient qualifies. Here are four clinical trials that the patient might be eligible for, based on the information available:

### Clinical Trial 1: NCT06155331
- **Eligibility Criteria:**
  - Age ≥ 18 years old.
  - Diagnosed with stage 2 or 3 breast cancer.
  - Requires chemotherapy.
  - No prior treatment for the current cancer diagnosis.

**Reasons for Eligibility:**
1. The patient is 56 years old, meeting the age criterion.
2. She has breas

### **Part 2:** Prompt the LLM to write a patient summary, given a full health record.

#### Summary 1

In [60]:
# Summary 1: short prognosis summary generated from the full flattened record.
# NOTE(review): `patient` is interpolated via its printed representation, which
# pandas may abbreviate — confirm the model receives the fields it needs.
query = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following health record in the mCODE format. Write a summary for this patient's cancer prognosis. {patient}
"""

response = llm.invoke(query)

print(response.content)

The patient, Mrs. Paulita78 Watsica258, has been diagnosed with malignant neoplasm of the breast (breast cancer). The diagnosis was recorded on February 26, 2020. The clinical status of the condition is active, and it has been confirmed. The specific type of breast cancer, based on the SNOMED code provided (254837009), indicates a diagnosis of malignant neoplasm of the breast without further specification of the subtype.

The patient's care has been managed by various healthcare providers, including Dr. Anderson154 Cummerata161 and Dr. Dominique369 Crist667, with encounters primarily at Saint Anne's Hospital. The patient has a history of various conditions and treatments, including acute bronchitis, viral pharyngitis, and procedures like medication reconciliation and radiation therapy.

Given the active status of the breast cancer, ongoing treatment and monitoring are critical. The patient's prognosis will depend on several factors, including the stage of cancer at diagnosis, the speci

#### Summary 2

In [61]:
# Summary 2: detailed summary (stage, demographics, comorbidities, treatments)
# generated from the full flattened record.
# NOTE(review): `patient` is interpolated via its printed representation, which
# pandas may abbreviate — confirm the model receives the fields it needs.
query = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following health record in the mCODE format. Write a detailed summary for this patient's cancer diagnosis and prognosis. Report the clinical stage type of the cancer diagnosis and other descriptive information about the cancer. Report the gender and the patient's age as of today's date, {today_date}. Report the patient's comorbidities and treatments. {patient}
"""

response = llm.invoke(query)

print(response.content)

### Patient Summary

**Patient Details:**
- **Name:** Mrs. Paulita78 Watsica258
- **Gender:** Female
- **Date of Birth:** March 10, 1968
- **Age:** 56 years old (as of today's date, July 30, 2024)
- **Marital Status:** Married
- **Race:** White
- **Primary Language:** English

**Medical History and Comorbidities:**
- Mrs. Watsica has a history of various acute conditions such as acute viral pharyngitis, acute bronchitis, and otitis media. She has also experienced facial laceration, epidermal burn, and normal pregnancies. Notably, she has been diagnosed with a malignant neoplasm of the breast (breast cancer) as of February 26, 2020.

**Cancer Diagnosis:**
- **Type of Cancer:** Malignant neoplasm of the breast
- **Stage at Diagnosis:** Stage 1A
- **Date of Diagnosis:** February 26, 2020
- **Clinical Status:** Active
- **Verification Status:** Confirmed

**Treatment and Care:**
- Mrs. Watsica has undergone various treatments and procedures, including radiation therapy care as part of her 

In [62]:
# Keep the most recent model reply as the patient summary reused by later prompts.
patient_summary = response.content

In [63]:
# Display the stored summary.
print(patient_summary)

### Patient Summary

**Patient Details:**
- **Name:** Mrs. Paulita78 Watsica258
- **Gender:** Female
- **Date of Birth:** March 10, 1968
- **Age:** 56 years old (as of today's date, July 30, 2024)
- **Marital Status:** Married
- **Race:** White
- **Primary Language:** English

**Medical History and Comorbidities:**
- Mrs. Watsica has a history of various acute conditions such as acute viral pharyngitis, acute bronchitis, and otitis media. She has also experienced facial laceration, epidermal burn, and normal pregnancies. Notably, she has been diagnosed with a malignant neoplasm of the breast (breast cancer) as of February 26, 2020.

**Cancer Diagnosis:**
- **Type of Cancer:** Malignant neoplasm of the breast
- **Stage at Diagnosis:** Stage 1A
- **Date of Diagnosis:** February 26, 2020
- **Clinical Status:** Active
- **Verification Status:** Confirmed

**Treatment and Care:**
- Mrs. Watsica has undergone various treatments and procedures, including radiation therapy care as part of her 

### **Part 3:** Querying the LLM with the patient summary and the full clinical trials dataframe.

In [64]:
# Part 3 prompt: the generated patient summary plus the whole trials table.
# NOTE(review): `clinical_trials_data` is interpolated via its printed
# representation, which pandas abbreviates for long cells and large frames, so
# the model may not see the full eligibility text — confirm.
query = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history. {patient_summary}. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility (inclusion and exclusion) criteria. For each eligibility criterion for the trial, explain why the patient is eligible. If the patient does not match the criteria for any of the trials, explain why. {clinical_trials_data}.
"""

response = llm.invoke(query)

print(response.content)

Based on the patient's medical history and the clinical trial information provided, we can evaluate Mrs. Paulita78 Watsica258's eligibility for the listed clinical trials. Here are the assessments for four selected trials:

### Trial 1: NCT06155331
**Eligibility Criteria:**
- Age ≥ 18 years old.
- Diagnosed with breast cancer stage 2 or 3.
- Able to receive chemotherapy.
- No prior treatment with fenofibrate.

**Reasons for Ineligibility:**
1. **Stage of Cancer:** Mrs. Watsica was diagnosed with Stage 1A breast cancer, which does not meet the trial's requirement for stage 2 or 3 breast cancer.
2. **Other criteria match,** but the stage mismatch disqualifies her.

### Trial 2: NCT04603820
**Eligibility Criteria:**
- The patient should have breast cancer and be pregnant.

**Reasons for Ineligibility:**
1. **Pregnancy Status:** There is no information suggesting that Mrs. Watsica is currently pregnant.
2. **Specific Condition Requirement:** The trial specifically requires participants to 

### **Part 4:** Querying the LLM with the patient summary and the clinical trials data using a RAG-enabled approach

In [65]:
# One sentence per trial combining its NCT ID with the free-text eligibility
# criteria; this is the text that is embedded and later handed to the model.
clinical_trials_data['text'] = 'The trial with NCT ID number ' + clinical_trials_data['NCT ID'] + ' has the following eligibility criteria, for inclusion and exclusion ' + clinical_trials_data['Eligibility']
clinical_trials_data.head()['text']

Unnamed: 0,text
14,"The trial with NCT ID number NCT06155331 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Age ≥ 18 years old.\n* Patients with biopsy confirmed diagnosis breast cancer and with stage II and stage III breast cancer according to the American Joint Committee on Cancer (TNM staging system of breast cancer).\n* Patients with performance status \<2 according to Eastern Cooperative Oncology Group (ECOG) score.\n* Adequate baseline hematologic values (absolute neutrophilic count ≥ 1.5 × 109/L, platelet count ≥ 100 × 109/L and hemoglobin level ≥ 10 g/dl).\n* Patients with adequate liver function (serum bilirubin \< 1.2 mg/dl) and adequate renal function (serum creatinine \< 1.5 mg/d).\n\nExclusion Criteria:\n\n* Patients with prior exposure to anthracyclines in the last 6 months.\n* Patients with evidence of metastasis at the initial assessment.\n* Concomitant use of antioxidant vitamins (vitamin A, C, E).\n* Presence of clinical evidence for severe cardiac illness (angina pectoris, uncontrolled hypertension, arrhythmias and left ventricular ejection fraction \<50%).\n* Patients with inflammatory diseases (ulcerative colitis, rheumatoid arthritis).\n* Patients with conditions associated with oxidative stress (smoking, tuberculosis, comorbid obesity).\n* Patients who are candidates for monoclonal antibodies such as Trastuzumab and other targeted therapy (HER2 positive patients).\n* Patients with active liver disease (cirrhosis, fatty liver, hepatitis C, etc..).\n* Patients with myopathy.\n* Patients with renal impairment, including those with end-stage renal disease and those receiving dialysis.\n* Pregnant and breast feeding women.\n* Known allergy to the fenofibrates.\n* Concurrent use of statin, colchicine, ciprofibrate, idelalisib, ivacaftor, aspirin low strength, clopidogrel, warfarin, enzyme inducers (phenytoin, phenobarbitone, carbamazepine,...), enzyme inhibitors (allopurinol, MAOI, SSRI,...), drugs with high plasma 
protein binding capacity (sulfonamides, valproate, oral hypoglycemic, warfarin,...) in order to avoid potential pharmacodynamics and pharmacokinetic drug interactions."
34,"The trial with NCT ID number NCT04603820 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n1. The patient should have signed and dated the informed consent form (ICF). The enrollment of patients who have died is allowed.\n2. Women aged ≥ 18 years.\n3. Patients in one of the following situations:\n\n * Patients with breast cancer diagnosis during pregnancy, breastfeeding or within the year after delivery.\n * Patients with breast cancer who become pregnant after treatment.\n * Patients with breast cancer who were subjected to any fertility preservation method prior to the start of breast cancer treatment.\n4. The patients referred to in the previous section and the patients who meet these characteristics prospectively could be enrolled retrospectively upon registry opening.\n5. All cases diagnosed at the same site may be included. In order to prevent duplications, in case the patient followed her treatment and follow-up at another site, she will be enrolled as per the site where the diagnosis was made, requesting information of the treatment and progression, when possible.\n6. Availability of clinical, epidemiological and progress data.\n\nExclusion Criteria:\n\nPatients who do not wish to participate in the study for any reason could not be included in the study."
35,"The trial with NCT ID number NCT04074720 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Adults \>18 years old at time of consent\n* Subject must be capable to giving informed consent or have an acceptable surrogate capable of giving consent on the subject behalf.\n* Patients with carcinoma in situ or invasive breast cancer\n* Patient must be undergoing one of the following:\n\n * definitive surgical tumor resection for breast cancer OR\n * placement of a vascular access device as a prelude to neoadjuvant therapy for breast cancer OR\n * neurosurgical resection of a brain metastasis from primary breast cancer.\n\nExclusion Criteria:\n\n* \<18 years old\n* Not able to give informed consent and does not have acceptable surrogate capable to giving informed consent.\n* Active drug/alcohol dependence or abuse history"
38,"The trial with NCT ID number NCT06087120 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Female,18 years old and older,\n* Are diagnosed with stage II-III HER2+/Triple Negative breast cancer and indicated for neoadjuvant chemotherapy,\n* FFPE sample is available at the time of diagnosis and operation,\n* Are voluntary to participate in the study.\n\nExclusion Criteria:\n\n* Recurrent breast cancer,\n* Other cancer metastasis to the breast,\n* Have been or are being treated for cancer,\n* Patients did not agree to participate in the studies."
39,"The trial with NCT ID number NCT01785420 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:a.\n\n1. Female subjects aged 18 years or older.\n2. Histologically and/or cytologically confirmed diagnosis of breast cancer. Clinical stages breast cancer: HER2 positive, T1 or T2 or T3, N0 or N1, resectable T4, or resectable N2, (all M0)\n3. Documentation of erbB-2 gene amplification by FISH (as defined by a ratio \>2.2) or chromogenic in situ hybridization (CISH, as defined by the manufacturer's kit instruction) or documentation of erbB-2-overexpression by IHC (defined as IHC3+, or IHC2+ with FISH or CISH confirmation) based on local laboratory.\n4. LVEF within institutional range of normal as measured by MUGA or ECHO.\n5. Screening laboratory values within the following parameters:\n\n 1. Absolute neutrophil count (ANC) ≥1.5 x 109 /L (1500/mm3)\n 2. Platelet count ≥100 x 109/L (100,000/mm3)\n 3. Hemoglobin ≥9.0 g/dL (90 g/L)\n 4. Serum creatinine ≤1.5 x upper limit of normal (ULN)\n 5. Total bilirubin ≤1.5 x ULN (\<3 ULN if Gilbert's disease) 6Aspartate aminotransferase (AST) and/or alanine aminotransferase (ALT)\n\n * 2.5 x ULN\n\nExclusion Criteria:\n\n1. Bilateral breast cancer\n2. Active uncontrolled cardiac disease, including cardiomyopathy, CHF (New York Heart Association \[NYHA\] functional classification of ≥3), unstable angina, and myocardial infarction (within 12 months of study entry).\n3. Inadequately controlled hypertension (ie, systolic blood pressure \[BP\] \> 180 mm Hg or diastolic BP \> 100 mm Hg).\n4. Family history of congenital long or short QT syndrome, Brugada syndrome or QT/QTc interval \> 0.45 second or known history of QT/QTc prolongation or torsade de pointe (TdP).\n5. Significant chronic gastrointestinal disorder with diarrhea as a major symptom (eg, Crohn's disease, malabsorption, or grade ≥2 diarrhea of any etiology at baseline).\n6. Women who are pregnant, breast-feeding."


In [66]:
import openai
import tiktoken
from scipy import spatial
import pandas as pd
import ast
import openai

# Function to get text embeddings
def text_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the ``text-embedding-ada-002`` model through the
    notebook's ``client`` and returns the embedding of the first (only)
    result.
    """
    response = client.embeddings.create(model="text-embedding-ada-002", input=text)
    return response.data[0].embedding

# Embed every trial description exactly once: each row costs one API request,
# so computing the column twice would double the run time and the bill.
clinical_trials_data = clinical_trials_data.assign(
    embedding=clinical_trials_data["text"].apply(text_embedding)
)
print(clinical_trials_data.head())

         NCT ID   Acronym Overall Status  Start Date  \
14  NCT06155331   Unknown     RECRUITING     2023-12   
34  NCT04603820  EMBARCAM     RECRUITING  2019-11-18   
35  NCT04074720   Unknown     RECRUITING  2018-05-10   
38  NCT06087120   Unknown     RECRUITING  2023-09-16   
39  NCT01785420   Unknown     RECRUITING     2013-05   

                                           Conditions  \
14                        Breast Cancer Stage 2 and 3   
34                        Breast Cancer and Pregnancy   
35  Breast Cancer, Invasive Breast Cancer, Carcino...   
38  Breast Cancer Female, Stage II Breast Cancer, ...   
39  Carcinoma Breast Stage I, HER2 Positive Breast...   

                                        Interventions  \
14  Fenofibrate, Placebo, Doxorubicin, Cyclophosph...   
34                            No interventions listed   
35  Tissue Sample collection, Blood Sample Collect...   
38                            No interventions listed   
39                               Tr

In [82]:
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn = lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 1000
):
    """Rank the rows of ``df`` by how related their embedding is to ``query``.

    Args:
        query: Text to embed and compare against each row.
        df: Frame with a ``text`` column and an ``embedding`` column.
        relatedness_fn: Scoring function taking (query embedding, row
            embedding); higher means more related. Defaults to cosine
            similarity.
        top_n: Maximum number of results to return.

    Returns:
        A pair of tuples ``(strings, relatednesses)``, sorted from most to
        least related and truncated to ``top_n``. Both are empty when ``df``
        has no rows.
    """
    # Nothing to rank: skip the API call and avoid unpacking an empty zip.
    if len(df) == 0:
        return (), ()

    EMBEDDING_MODEL = "text-embedding-ada-002"
    query_embedding_response = openai.embeddings.create(
        model = EMBEDDING_MODEL,
        input = query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    strings_and_relatednesses.sort(key = lambda x: x[1], reverse = True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

# Quick check of the ranking helper: show the three trial texts most related
# to a generic query.
strings, relatednesses = strings_ranked_by_relatedness("Clinical Trials", clinical_trials_data, top_n = 3)
for string, relatedness in zip(strings, relatednesses):
    # The "=" specifier prints the expression text ("relatedness = ") before the value.
    print(f"{relatedness = :.3f}")
    display(string)

def num_tokens(text: str, model: str = "gpt-4o") -> int:
    """Count the tokens in ``text`` with the tokenizer tiktoken maps to ``model``.

    Falls back to the gpt-4o tokenizer when tiktoken does not recognise the
    model name.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.encoding_for_model("gpt-4o")
    return len(encoding.encode(text))

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user message: instructions, the most related trials, then the question.

    Trials are appended in descending order of relatedness to ``query`` until
    adding the next one would push the message past ``token_budget`` tokens,
    counted with the tokenizer for ``model``.
    """
    strings, _ = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below content to match patients to clinical trials. Explain why the patient does not match any of the eligibility criteria.'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_row = f'\n\nClinical trial section:\n"""\n{string}\n"""'
        # Stop as soon as the next trial no longer fits in the budget
        if num_tokens(message + next_row + question, model=model) > token_budget:
            break
        message += next_row
    return message + question

def ask(
    query: str,
    df: pd.DataFrame = None,
    model: str = "gpt-4o",
    token_budget: int = 25000,
    print_message: bool = False,
) :
    """Answer ``query`` using the most related clinical trials as context.

    Args:
        query: Question or patient description to match against trials.
        df: Frame with ``text`` and ``embedding`` columns. When omitted, the
            current ``clinical_trials_data`` is looked up at call time, so a
            frame rebuilt after this function was defined is still picked up.
        model: Chat model that receives the request.
        token_budget: Maximum number of tokens allowed in the assembled message.
        print_message: Print the assembled message before sending it.

    Returns:
        The text content of the model's first choice.
    """
    if df is None:
        df = clinical_trials_data
    message = query_message(query, df, model=model, token_budget = token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You match patients to clinical trials that they are eligible for."},
        {"role": "user", "content": message},
    ]
    response = openai.chat.completions.create(
        model = model,
        messages = messages,
        temperature = 0
    )
    response_message = response.choices[0].message.content
    return response_message

relatedness = 0.817


'The trial with NCT ID number NCT05044689 has the following eligbility criteria, for inclusion and exclusion Case Group - Patients must have histologically confirmed diagnosis of colorectal or breast cancer.\n\nControl Group - Patients must not have a diagnosis of cancer.'

relatedness = 0.816


'The trial with NCT ID number NCT05406531 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Women over the age of 18\n* Breast cancer patients at the end of adjuvant treatment\n\nExclusion Criteria:\n\n* Patients without a mobil device ("smart" mobile phone)\n* Patients without internet access\n* Patients with diagnosed: psychotic disorder; severe depressive episode; panic disorder; bipolar disorder; personality disorder associated with uncontrollable mood swings; PTSD; suicidal attempts and active suicidal ideation; disability pension due to non-specific mental or somatic problems; hospitalization due to a psychiatric disorder in the past year'

relatedness = 0.816


'The trial with NCT ID number NCT05704842 has the following eligbility criteria, for inclusion and exclusion Inclusion Criteria:\n\n* Women with curative-intent breast cancer who plan to undergo at least 4 cycles of chemotherapy.\n* Age \\> 18 years\n* ECOG performance score \\< 3\n* English-speaking\n* with sufficient vision/hearing or family support\n* Coronary artery disease, if cleared by cardiologist\n* Subject must have smart phone, computer or tablet.\n* Willingness to be randomized\n\nExclusion Criteria:\n\n* Medical or psychiatric conditions (beyond those related to breast cancer and its treatment) that would impair our ability to test study hypotheses (psychotic disorders, dementia, inability to give informed consent or follow instructions).\n* Patients with overt evidence of a psychiatric disorder.\n* Coronary artery disease, not cleared by cardiologist.\n* Contraindication to exercise.\n* Chronic fatigue syndrome.'

In [83]:
# Part 4 prompt: only the patient summary is interpolated here; the trial text
# is added later when the message is assembled by the retrieval helper.
query = f"""You are a doctor seeing a patient diagnosed with a type of breast cancer with the following medical history. {patient_summary}. Use this information and the clinical trial inclusion/exclusion criteria to identify up to 4 clinical trials that the patient is eligible for. The patient must meet all of the criteria listed for the clinical trial. Identify and report the NCT ID number and the clinical trial eligibility (inclusion and exclusion) criteria. For each eligibility criterion for the trial, explain why the patient is eligible. If the patient does not match the criteria for any of the trials, explain why."""

In [84]:
# Run the retrieval-backed query with the default model and token budget.
resp_0 = ask(query)

In [85]:
# Show the model's answer.
print(resp_0)

Based on the provided patient summary and the clinical trial eligibility criteria, Mrs. Paulita78 Watsica258 does not match the criteria for any of the clinical trials listed. Here is a detailed explanation of why she does not match the criteria for each trial:

1. **NCT04360330**
   - **Inclusion Criteria:**
     - Female, ≥ 50 years of age. (Eligible)
     - Oncotype or MammaPrint diagnosis results are required prior to the start of treatment. (Not mentioned)
     - Histologically confirmed invasive breast cancer. (Eligible)
     - Clinical stage T1N0M0. (Eligible)
     - Receptor status: ER/PR positive and HER2 negative. (Not mentioned)
     - Unifocal breast cancer. (Not mentioned)
     - ECOG 0, 1. (Not mentioned)
     - Ability to undergo MRI. (Not mentioned)
     - WOCBP must agree to use adequate contraception or agree to undergo sexual abstinence. (Not applicable)
     - Ability to understand the investigational nature, potential risks and benefits of the research study and wi