In [12]:
import pandas as pd

# Read the patient table and the trial table from their CSV exports
patient_df = pd.read_csv(filepath_or_buffer='/Users/prayashdas/Downloads/file_with_age.csv')
trial_df = pd.read_csv(filepath_or_buffer='/Users/prayashdas/Downloads/Main_file.csv')

# Preview the first rows of each table (patients first, then trials)
print(patient_df.head(), trial_df.head(), sep="\n")


                                PATIENT   BIRTHDATE DEATHDATE GENDER  \
0  e93300bf-3a53-55c0-bd38-2ede59462f21  2007-09-26       NaN      F   
1  e93300bf-3a53-55c0-bd38-2ede59462f21  2007-09-26       NaN      F   
2  e93300bf-3a53-55c0-bd38-2ede59462f21  2007-09-26       NaN      F   
3  e93300bf-3a53-55c0-bd38-2ede59462f21  2007-09-26       NaN      F   
4  e93300bf-3a53-55c0-bd38-2ede59462f21  2007-09-26       NaN      F   

                       DESCRIPTION_x  \
0         Seasonal allergic rhinitis   
1         Seasonal allergic rhinitis   
2         Seasonal allergic rhinitis   
3         Seasonal allergic rhinitis   
4  Medication review due (situation)   

                                       DESCRIPTION_y  AGE  
0       Fexofenadine hydrochloride 30 MG Oral Tablet   17  
1  NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Inje...   17  
2                   Acetaminophen 325 MG Oral Tablet   17  
3                             Seasonique 91 Day Pack   17  
4       Fexofenadine hydro

Core algorithm implementation

In [13]:
import pandas as pd
import json
import re

# Function to extract keywords from medical conditions or medications (simple tokenization by splitting words)
def extract_keywords(text):
    """
    Tokenize a free-text cell into a set of unique, lower-cased word tokens.

    Parameters:
    text: The value to tokenize. Missing values (None / NaN) produce an empty set.
          Non-string scalars (e.g. a numeric CSV cell) are converted with str()
          before tokenizing instead of failing on `.lower()`.

    Returns:
    Set of lower-cased tokens matched by the word pattern (letters, digits, underscore).
    """
    if not isinstance(text, str):
        # Missing cells carry no keywords at all.
        if pd.isna(text):
            return set()
        # A column parsed as numeric would otherwise raise AttributeError below.
        text = str(text)

    # Word-boundary tokenization; punctuation and whitespace act as separators.
    return set(re.findall(r'\b\w+\b', text.lower()))

# Function to check inclusion criteria (age range)
def check_inclusion_criteria(patient_age, min_age, max_age):
    """
    Check the patient's age against a trial's eligible age range (bounds inclusive).

    Parameters:
    patient_age: The patient's age; may be None / NaN when unknown.
    min_age: Lower age bound of the trial, or None when there is no lower bound.
    max_age: Upper age bound of the trial, or None when there is no upper bound.

    Returns:
    True when the trial has no age bounds at all, or when the age satisfies every
    bound that is present. False when the age violates a bound, or when the age is
    missing while the trial has at least one bound.
    """
    # A trial without any age restriction accepts everyone.
    if min_age is None and max_age is None:
        return True

    # A missing age cannot be shown to satisfy a bound. Without this guard NaN
    # would fail both comparisons below and be reported as eligible, and None
    # would raise TypeError.
    if pd.isna(patient_age):
        return False

    if min_age is not None and patient_age < min_age:
        return False
    if max_age is not None and patient_age > max_age:
        return False
    return True

# Function to check exclusion criteria
def check_exclusion_criteria(patient_conditions, patient_medications, trial_exclusion_criteria):
    """
    Decide whether a patient clears a trial's exclusion criteria.

    Parameters:
    patient_conditions: Set of keywords taken from the patient's medical conditions.
    patient_medications: Set of keywords taken from the patient's medications.
    trial_exclusion_criteria: Free text of the trial's exclusion criteria.

    Returns:
    True when neither keyword set shares a token with the exclusion text,
    False as soon as one shared token is found.
    """
    # Tokenize the exclusion text the same way the patient text was tokenized.
    excluded_terms = extract_keywords(trial_exclusion_criteria)

    # NOTE(review): matching is on single word tokens, so a common word present on
    # both sides (e.g. "no", "of", "with") counts as a hit - confirm this is intended.
    for patient_terms in (patient_conditions, patient_medications):
        if patient_terms.intersection(excluded_terms):
            return False  # Any overlap means the patient fails the exclusion check
    return True

# Function to safely convert the age value to an integer if in years
def extract_age_in_years(age_string):
    """
    Parse an age expressed in whole years (e.g. '18 Years') into an integer.

    Parameters:
    age_string: Raw age value, expected to look like '<number> Years'. The unit is
                matched case-insensitively and the singular 'Year' is accepted.

    Returns:
    Integer number of years, or None when the value is missing, is not a string,
    or is not a plain '<number> Year(s)' expression (months, days, extra text, ...).
    """
    # None, NaN and other non-string cells carry no unit, so they cannot be
    # interpreted as years; returning None also avoids a TypeError on `in`.
    if not isinstance(age_string, str):
        return None

    # The whole value must be "<digits> year(s)"; anything else is skipped.
    parsed = re.fullmatch(r'\s*(\d+)\s*years?\s*', age_string, flags=re.IGNORECASE)
    if parsed is None:
        return None
    return int(parsed.group(1))

# Function to summarize patient history
def summarize_patient_history(patient_data):
    """
    Build a three-line text summary of a patient's conditions and medications.

    Parameters:
    patient_data: Mapping-like record with 'PATIENT', 'DESCRIPTION_x' and
                  'DESCRIPTION_y' entries.

    Returns:
    Summary text: a header line naming the patient, then the conditions line,
    then the medications line.
    """
    # Format the two history lines first, then the header that precedes them.
    conditions_line = "Medical Conditions: {}".format(patient_data['DESCRIPTION_x'])
    medications_line = "Medications: {}".format(patient_data['DESCRIPTION_y'])
    header_line = "Patient {} Summary:".format(patient_data['PATIENT'])
    return "\n".join((header_line, conditions_line, medications_line))

# Function to generate an explanation for the match
def generate_explanation_for_match(patient_data, trial):
    """
    Compose the sentence explaining why a patient qualifies for a trial.

    Parameters:
    patient_data: Mapping-like record with 'PATIENT' and 'AGE' entries.
    trial: Mapping-like record with 'Study Title', 'NCT Number', 'minimum_age'
           and 'maximum_age' entries (the age bounds are echoed as stored).

    Returns:
    Explanation text naming the patient, the trial and the age range that was met.
    """
    template = (
        "Patient {patient_id} with age {age} is eligible for the trial '{title}' "
        "(NCT Number: {nct_number}). The patient's age satisfies the age criteria ({lower} to "
        "{upper}), and the patient's medical conditions and medications do not match any of the exclusion criteria."
    )
    return template.format(
        patient_id=patient_data['PATIENT'],
        age=patient_data['AGE'],
        title=trial['Study Title'],
        nct_number=trial['NCT Number'],
        lower=trial['minimum_age'],
        upper=trial['maximum_age'],
    )

# Function to match a patient to trials
def match_patient_to_trials(patient_data, trial_data):
    """
    Match a patient to eligible trials: the age range is checked first and, only
    when it is satisfied, the exclusion criteria are checked.

    Parameters:
    patient_data: Patient record with 'PATIENT', 'AGE', 'DESCRIPTION_x' and
                  'DESCRIPTION_y' entries.
    trial_data: DataFrame of trials with 'NCT Number', 'Study Title',
                'minimum_age', 'maximum_age' and 'exclusion_criteria' columns.

    Returns:
    List of dicts (trialId, trialName, eligibilityCriteriaMet, explanation), one per
    matched trial, in trial order. Empty when the patient's age is missing.
    """
    matched_trials = []

    # Without a usable age no trial's age requirement can be confirmed
    # (and int() would raise on NaN), so nothing can match.
    if pd.isna(patient_data['AGE']):
        return matched_trials
    patient_age = int(patient_data['AGE'])

    # The patient's keywords do not depend on the trial: compute them once
    # instead of once per trial row.
    patient_conditions = extract_keywords(patient_data['DESCRIPTION_x'])
    patient_medications = extract_keywords(patient_data['DESCRIPTION_y'])

    for _, trial in trial_data.iterrows():
        min_age = extract_age_in_years(trial['minimum_age'])
        max_age = extract_age_in_years(trial['maximum_age'])

        # Trials whose bounds are missing or not expressed in years are skipped.
        if min_age is None or max_age is None:
            continue

        # Inclusion: the age must fall inside the trial's range.
        if not check_inclusion_criteria(patient_age, min_age, max_age):
            continue

        # Exclusion: a missing criteria cell is treated as "no exclusions".
        trial_exclusion_criteria = trial['exclusion_criteria'] if pd.notna(trial['exclusion_criteria']) else ""
        if not check_exclusion_criteria(patient_conditions, patient_medications, trial_exclusion_criteria):
            continue

        # Both checks passed: record the trial together with its explanation.
        matched_trials.append({
            'trialId': trial['NCT Number'],
            'trialName': trial['Study Title'],
            'eligibilityCriteriaMet': ["Age requirement met", "Exclusion criteria cleared"],
            'explanation': generate_explanation_for_match(patient_data, trial)
        })

    return matched_trials

# Function to generate the output for a patient
def generate_output(patient_data, matched_trials):
    """
    Assemble the final, JSON-serializable result for one patient.

    Parameters:
    patient_data: Patient record with 'PATIENT', 'DESCRIPTION_x' and 'DESCRIPTION_y'.
    matched_trials: List of matched-trial dicts to report.

    Returns:
    Dict with 'patientId', 'patientSummary' and 'eligibleTrials' entries.
    """
    # The history summary is produced first, then packed with the id and trials.
    history_text = summarize_patient_history(patient_data)
    return dict(
        patientId=patient_data['PATIENT'],
        patientSummary=history_text,
        eligibleTrials=matched_trials,
    )

# Function to search for a patient by ID
def find_patient_by_id(patient_df, patient_id):
    """
    Look up a patient by ID and return a single record for that patient.

    The patient table can hold several rows for the same patient, one per
    condition/medication combination. Returning only the first row would drop the
    rest of the history, so when several rows match, the distinct non-missing
    'DESCRIPTION_x' and 'DESCRIPTION_y' values are joined with ', ' (first-seen
    order) into the returned record. Every other field comes from the first row.

    Parameters:
    patient_df: DataFrame of patient rows with a 'PATIENT' column.
    patient_id: The patient ID to search for.

    Returns:
    Series describing the patient. With exactly one matching row, that row is
    returned unchanged.

    Raises:
    IndexError: when no row matches the given ID.
    """
    patient_rows = patient_df[patient_df['PATIENT'] == patient_id]
    if patient_rows.empty:
        # Same exception type callers already handle for "not found".
        raise IndexError(f"No rows found for patient ID {patient_id!r}")

    record = patient_rows.iloc[0]
    if len(patient_rows) == 1:
        return record

    # Work on a copy so the source frame is never modified.
    record = record.copy()
    for column in ('DESCRIPTION_x', 'DESCRIPTION_y'):
        if column not in patient_rows.columns:
            continue
        distinct_values = patient_rows[column].dropna().astype(str).drop_duplicates()
        # Leave the first row's value in place when every entry is missing.
        if not distinct_values.empty:
            record[column] = ', '.join(distinct_values)
    return record

# Load the patient and trial data (replace with your file paths)
patient_df = pd.read_csv('/Users/prayashdas/Downloads/file_with_age.csv')
trial_df = pd.read_csv('/Users/prayashdas/Downloads/Main_file.csv')

# Get patient ID from user input; surrounding whitespace from typing or pasting
# would otherwise make the exact-match lookup fail.
input_patient_id = input("Enter the Patient ID: ").strip()

# Find the patient by ID. An unknown ID is reported and the rest of the cell is
# skipped; exit() is deliberately not used because inside a notebook it shuts the
# kernel down and discards all state.
try:
    patient_data = find_patient_by_id(patient_df, input_patient_id)
except IndexError:
    print(f"Patient with ID {input_patient_id} not found.")
else:
    # Match the patient to trials and generate output
    matched_trials = match_patient_to_trials(patient_data, trial_df)
    output = generate_output(patient_data, matched_trials)

    # Save the output to a JSON file named after the patient
    output_file = f"eligible_trials_output_{input_patient_id}.json"
    with open(output_file, 'w') as outfile:
        json.dump(output, outfile, indent=2)

    print(f"Output saved to {output_file}")


Enter the Patient ID:  c6047dfa-7b8c-3001-7ce0-09b17fc73339


Output saved to eligible_trials_output_c6047dfa-7b8c-3001-7ce0-09b17fc73339.json


Unit Testing

In [10]:
import unittest

# Define test cases
class TestTrialMatchingFunctions(unittest.TestCase):
    """Unit checks for the keyword, age, exclusion, summary and explanation helpers."""

    # Per-test notes are `#` comments on purpose: the verbose text runner echoes a
    # test method's docstring in its report, which would change the printed output.

    def test_extract_keywords(self):
        # Each case pairs an input text with the token set expected from it.
        cases = (
            ("Hypertension, Diabetes Type 2", {"hypertension", "diabetes", "type", "2"}),
            ("", set()),
            (None, set()),
        )
        for raw_text, expected_tokens in cases:
            self.assertEqual(extract_keywords(raw_text), expected_tokens)

    def test_check_inclusion_criteria(self):
        # Inside the range, above the range, and a trial with no bounds at all.
        self.assertTrue(check_inclusion_criteria(patient_age=30, min_age=20, max_age=40))
        self.assertFalse(check_inclusion_criteria(patient_age=45, min_age=20, max_age=40))
        self.assertTrue(check_inclusion_criteria(patient_age=30, min_age=None, max_age=None))

    def test_check_exclusion_criteria(self):
        condition_terms = {"hypertension"}
        medication_terms = {"lisinopril"}

        # No shared terms: the patient clears the exclusion check.
        unrelated_criteria = "diabetes, metformin"
        self.assertTrue(check_exclusion_criteria(condition_terms, medication_terms, unrelated_criteria))

        # Shared terms: the patient is excluded.
        overlapping_criteria = "hypertension, lisinopril"
        self.assertFalse(check_exclusion_criteria(condition_terms, medication_terms, overlapping_criteria))

    def test_summarize_patient_history(self):
        record = dict(
            PATIENT='12345',
            DESCRIPTION_x='Hypertension, Diabetes Type 2',
            DESCRIPTION_y='Lisinopril, Metformin',
        )
        expected_lines = [
            "Patient 12345 Summary:",
            "Medical Conditions: Hypertension, Diabetes Type 2",
            "Medications: Lisinopril, Metformin",
        ]
        self.assertEqual(summarize_patient_history(record), "\n".join(expected_lines))

    def test_generate_explanation_for_match(self):
        record = {'PATIENT': '12345', 'AGE': 45}
        study = {
            'NCT Number': 'NCT001',
            'Study Title': 'Hypertension Treatment Study',
            'minimum_age': '40 Years',
            'maximum_age': '65 Years',
        }
        expected_text = (
            "Patient 12345 with age 45 is eligible for the trial 'Hypertension Treatment Study' "
            "(NCT Number: NCT001). The patient's age satisfies the age criteria (40 Years to "
            "65 Years), and the patient's medical conditions and medications do not match any "
            "of the exclusion criteria."
        )
        self.assertEqual(generate_explanation_for_match(record, study), expected_text)

# unittest to work within Jupyter notebook
def run_tests():
    """Run the TestTrialMatchingFunctions suite with verbose output and return its result object."""
    # Build the suite explicitly from the test class; unittest.main() is avoided,
    # per the comment above, so the suite can run inside the notebook.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestTrialMatchingFunctions)
    return unittest.TextTestRunner(verbosity=2).run(suite)

run_tests()


test_check_exclusion_criteria (__main__.TestTrialMatchingFunctions.test_check_exclusion_criteria) ... ok
test_check_inclusion_criteria (__main__.TestTrialMatchingFunctions.test_check_inclusion_criteria) ... ok
test_extract_keywords (__main__.TestTrialMatchingFunctions.test_extract_keywords) ... ok
test_generate_explanation_for_match (__main__.TestTrialMatchingFunctions.test_generate_explanation_for_match) ... ok
test_summarize_patient_history (__main__.TestTrialMatchingFunctions.test_summarize_patient_history) ... ok

----------------------------------------------------------------------
Ran 5 tests in 0.011s

OK


<unittest.runner.TextTestResult run=5 errors=0 failures=0>

Integration Testing

In [9]:
import pandas as pd
import json

# Sample patient data (for testing): a single record holding the whole history
patient_data = pd.DataFrame([
    {
        'PATIENT': 'e93300bf-3a53-55c0-bd38-2ede59462f21',
        'AGE': 17,
        'DESCRIPTION_x': 'Seasonal allergic rhinitis, Medication review due (situation), Acute bronchitis (disorder), Concussion injury of brain (disorder), Concussion with no loss of consciousness, Risk activity involvement (finding), Gingivitis (disorder), Viral sinusitis (disorder), Acute viral pharyngitis (disorder)',
        'DESCRIPTION_y': 'Fexofenadine hydrochloride 30 MG Oral Tablet, NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Injector, Acetaminophen 325 MG Oral Tablet, Seasonique 91 Day Pack',
    },
])

# Sample trial data (for testing): one record per trial
trial_data = pd.DataFrame([
    {
        'NCT Number': 'NCT04062292',
        'Study Title': 'Gait Parameters and Balance in Patients With Obstructive Lung Diseases',
        'minimum_age': '18 Years',
        'maximum_age': '65 Years',
        'exclusion_criteria': 'No informed consent',
    },
    {
        'NCT Number': 'NCT04792892',
        'Study Title': 'ANCA II - Quality of Life and Functional Outcome in Patients With Anal Cancer',
        'minimum_age': '18 Years',
        'maximum_age': '70 Years',
        'exclusion_criteria': 'Severe cardiovascular disease',
    },
])

# Function to match a patient to trials and generate the output
def match_patient_to_trials(patient_data, trial_data):
    """
    Match one patient record against every trial using the age range and a
    comma-separated list of exclusion phrases.

    NOTE: this definition shadows the same-named function from the core-algorithm
    cell once this cell has been executed.

    Parameters:
    patient_data: Patient record with 'PATIENT', 'AGE', 'DESCRIPTION_x' and
                  'DESCRIPTION_y' entries.
    trial_data: DataFrame of trials with 'NCT Number', 'Study Title',
                'minimum_age', 'maximum_age' and 'exclusion_criteria' columns.

    Returns:
    List of dicts (trialId, trialName, eligibilityCriteriaMet, explanation), one per
    trial whose age range contains the patient's age and none of whose exclusion
    phrases occurs in the patient's conditions or medications.
    """
    def _years(raw_age):
        # 'NN Years' -> int; a missing or otherwise formatted value -> None.
        if pd.isna(raw_age):
            return None
        try:
            return int(str(raw_age).replace(' Years', ''))
        except ValueError:
            return None

    def _lowered(value):
        # Lower-case text for case-insensitive matching; missing cells become ''.
        return "" if pd.isna(value) else str(value).lower()

    # The patient's text does not change between trials, so lower-case it once.
    patient_conditions = _lowered(patient_data['DESCRIPTION_x'])
    patient_medications = _lowered(patient_data['DESCRIPTION_y'])

    matched_trials = []

    # Iterate over each trial
    for _, trial in trial_data.iterrows():
        min_age = _years(trial['minimum_age'])
        max_age = _years(trial['maximum_age'])

        # Skip trials without two usable bounds (the core matcher does the same);
        # comparing against None would raise TypeError.
        if min_age is None or max_age is None:
            continue

        # Check inclusion criteria (age)
        if not (min_age <= patient_data['AGE'] <= max_age):
            continue

        # Split the exclusion text into phrases. Each phrase is stripped so that
        # "a, b" yields "b" rather than " b", and empty phrases are dropped: the
        # empty string is a substring of every text and would exclude everyone.
        exclusion_terms = [
            term.strip()
            for term in _lowered(trial['exclusion_criteria']).split(',')
            if term.strip()
        ]

        # Check exclusion criteria
        if any(term in patient_conditions or term in patient_medications for term in exclusion_terms):
            continue  # Skip if any exclusion criteria match

        # If both criteria are satisfied, add the trial to matched_trials
        matched_trials.append({
            'trialId': trial['NCT Number'],
            'trialName': trial['Study Title'],
            'eligibilityCriteriaMet': ['Age requirement met', 'Exclusion criteria cleared'],
            'explanation': f"Patient {patient_data['PATIENT']} with age {patient_data['AGE']} is eligible for the trial '{trial['Study Title']}' (NCT Number: {trial['NCT Number']})."
        })

    return matched_trials

# Function to generate the output for a patient
def generate_output(patient_data, matched_trials):
    """
    Package a patient's id, a two-line history summary and the matched trials.

    Parameters:
    patient_data: Patient record with 'PATIENT', 'DESCRIPTION_x' and 'DESCRIPTION_y'.
    matched_trials: List of matched-trial dicts to report.

    Returns:
    Dict with 'patientId', 'patientSummary' and 'eligibleTrials' entries.
    """
    patient_id = patient_data['PATIENT']
    # Conditions line first, medications line second, joined by a newline.
    summary_lines = (
        f"Medical Conditions: {patient_data['DESCRIPTION_x']}",
        f"Medications: {patient_data['DESCRIPTION_y']}",
    )
    return {
        'patientId': patient_id,
        'patientSummary': "\n".join(summary_lines),
        'eligibleTrials': matched_trials,
    }

# Run the pipeline end to end on the first (and only) sample patient row
patient = patient_data.iloc[0]
matched_trials = match_patient_to_trials(patient_data=patient, trial_data=trial_data)
output = generate_output(patient_data=patient, matched_trials=matched_trials)

# Show the integration test result as indented JSON
print(json.dumps(obj=output, indent=2))


{
  "patientId": "e93300bf-3a53-55c0-bd38-2ede59462f21",
  "patientSummary": "Medical Conditions: Seasonal allergic rhinitis, Medication review due (situation), Acute bronchitis (disorder), Concussion injury of brain (disorder), Concussion with no loss of consciousness, Risk activity involvement (finding), Gingivitis (disorder), Viral sinusitis (disorder), Acute viral pharyngitis (disorder)\nMedications: Fexofenadine hydrochloride 30 MG Oral Tablet, NDA020800 0.3 ML Epinephrine 1 MG/ML Auto-Injector, Acetaminophen 325 MG Oral Tablet, Seasonique 91 Day Pack",
  "eligibleTrials": []
}
