# Systematic clinical trial selection

Author: Shemra Rizzo <br>

Date Created: October 30, 2019

Objective: Read file downloaded from clinicaltrials.gov and apply additional filters to obtain the final selection of lung clinical trials for the Trial Pathfinder paper.  


# Libraries

In [7]:
import pandas as pd
import numpy as np
import itertools
import re

# List of approved drugs

Drugs approved for non-small cell lung cancer, as listed by the [National Cancer Institute](https://www.cancer.gov/about-cancer/treatment/drugs/lung).

Generic names are given in parentheses.

In [21]:
# Lung approved drugs as listed in National Cancer Institute website
approved_drugs = '''
Abraxane (Paclitaxel Albumin-stabilized Nanoparticle Formulation)
Afatinib Dimaleate
Afinitor (Everolimus)
Afinitor Disperz (Everolimus)
Alecensa (Alectinib)
Alectinib
Alimta (Pemetrexed Disodium)
Alunbrig (Brigatinib)
Atezolizumab
Avastin (Bevacizumab)
Bevacizumab
Brigatinib
Carboplatin
Ceritinib
Crizotinib
Cyramza (Ramucirumab)
Dabrafenib Mesylate
Dacomitinib
Docetaxel
Doxorubicin Hydrochloride
Durvalumab
Entrectinib
Erlotinib Hydrochloride
Everolimus
Gefitinib
Gilotrif (Afatinib Dimaleate)
Gemcitabine Hydrochloride
Gemzar (Gemcitabine Hydrochloride)
Imfinzi (Durvalumab)
Iressa (Gefitinib)
Keytruda (Pembrolizumab)
Lorbrena (Lorlatinib)
Lorlatinib
Mechlorethamine Hydrochloride
Mekinist (Trametinib)
Methotrexate
Mustargen (Mechlorethamine Hydrochloride)
Mvasi (Bevacizumab)
Navelbine (Vinorelbine Tartrate)
Necitumumab
Nivolumab
Opdivo (Nivolumab)
Osimertinib Mesylate
Paclitaxel
Paclitaxel Albumin-stabilized Nanoparticle Formulation
Paraplat (Carboplatin)
Paraplatin (Carboplatin)
Pembrolizumab
Pemetrexed Disodium
Portrazza (Necitumumab)
Ramucirumab
Rozlytrek (Entrectinib)
Tafinlar (Dabrafenib Mesylate)
Tagrisso (Osimertinib Mesylate)
Tarceva (Erlotinib Hydrochloride)
Taxol (Paclitaxel)
Taxotere (Docetaxel)
Tecentriq (Atezolizumab)
Trametinib
Trexall (Methotrexate)
Vizimpro (Dacomitinib)
Vinorelbine Tartrate
Xalkori (Crizotinib)
Zykadia (Ceritinib)
'''

In [22]:
# Create a list of drugs, one entry per non-blank line of the raw listing.
# The isinstance guard keeps this cell safe to re-run: on a second run
# `approved_drugs` is already a list and must not be split again.
if isinstance(approved_drugs, str):
    approved_drugs = [line for line in approved_drugs.split('\n') if line.strip()]

# Make all lower case
approved_drugs = [x.lower() for x in approved_drugs]

# Take only the first word of every entry (brand name, or first word of the generic name)
drugs_original = [entry.split()[0] for entry in approved_drugs]

# Identify those that have a generic name (written in parentheses)
is_generic = ['(' in x for x in approved_drugs]

# Get the subset of drugs with a generic equivalent
drugs_generic = list(itertools.compress(approved_drugs, is_generic))

# Extract the generic names: the text between '(' and ')'.
# A raw string is used so that '\(' is a regex escape rather than an invalid string escape.
drugs_generic = [re.search(r'\(([^)]+)', x).group(1) for x in drugs_generic]

# Remove 'Paclitaxel Albumin-stabilized Nanoparticle Formulation' because it was already
# listed as non-generic. It is removed by name rather than by position, so the result
# does not depend on the ordering of the listing.
drugs_generic = [
    name for name in drugs_generic
    if name != 'paclitaxel albumin-stabilized nanoparticle formulation'
]

# List of all drugs, in original and generic names (duplicates removed; order is arbitrary)
approved_drugs_all = list(set(drugs_generic + drugs_original))


Functions
==

In [27]:
def count_arms(intervention):
    """Counts the arms listed in a trial's intervention description.

    The description is a single string whose entries are separated by a pipe
    character ('|'); each entry is treated as one arm.

    Parameters
    ----------
    intervention : str
        Pipe-separated intervention description of one trial.

    Returns
    -------
    int
        Number of pipe-separated entries. Returns 0 when `intervention` is not
        a string (for example a missing value read as NaN).
    """

    # A missing description has no arms to count
    if not isinstance(intervention, str):
        return 0

    # A plain string split avoids the regex (and its invalid '\|' escape) altogether
    return len(intervention.split('|'))

In [28]:
def separate_arms(intervention):
    """Returns the intervention descriptions of the first two arms.

    Parameters
    ----------
    intervention : str
        Pipe-separated intervention description of one trial.

    Returns
    -------
    tuple of (str, str)
        The first and second pipe-separated entries. Any further entries are ignored.

    Raises
    ------
    IndexError
        If the description contains fewer than two pipe-separated entries.
    """

    # A plain string split avoids the regex (and its invalid '\|' escape) altogether
    arms = intervention.split('|')

    return arms[0], arms[1]
    

In [29]:
def intervention_has_drug_or_biological(intervention):
    """Returns 1 when the intervention type is 'Drug' or 'Biological', otherwise 0.

    The intervention type is the text that precedes the first colon.
    """

    # Everything before the first ':' (the whole text if there is no colon)
    intervention_type, _, _ = intervention.partition(':')

    return 1 if intervention_type in ('Drug', 'Biological') else 0
    

In [30]:
def clean_intervention_text(arm_intervention):
    """
    Strips the type labels 'Drug: ' and 'Biological: ' from one arm's
    intervention text and returns the result in lower case
    """

    cleaned = arm_intervention
    # Labels are removed in this order, wherever they occur in the text
    for label in ('Drug: ', 'Biological: '):
        cleaned = cleaned.replace(label, '')

    return cleaned.lower()

In [31]:
def flags_intervention_text_with_approved_drugs(arm_intervention_text, approved_drugs):
    """Flags whether the intervention text has an approved drug

    The text is broken into words (maximal runs of word characters) and each
    word is compared, exactly and case-sensitively, against `approved_drugs`.

    Parameters
    ----------
    arm_intervention_text : str
        Intervention description of one arm.
    approved_drugs : iterable of str
        Drug names to look for; only single-word names can match.

    Returns
    -------
    int
        1 if at least one word of the text is in `approved_drugs`, otherwise 0.
    """

    # Raw string: '\w' must reach the regex engine instead of being treated
    # as an (invalid) string escape sequence
    one_trial_words = set(re.findall(r"\w+", arm_intervention_text))

    return 0 if one_trial_words.isdisjoint(approved_drugs) else 1

# Import and process clinical trials dataset

In [32]:
# Load the search results export; later cells rely on its 'Interventions' column
initial_selection = pd.read_csv('SearchResults_Nov8_2019.csv')

In [33]:
# Preview the first three rows of the loaded search results
initial_selection.head(3)

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL
0,1,NCT02551211,Association Between Circulating Immune Cells a...,LYMPHOLUNG,Recruiting,No Results Available,Resectable Non-small Lung Cancer,Procedure: blood drawn|Procedure: surgical res...,Level of expression of the T regulator over to...,"University Hospital, Rouen",...,2015/080/HP,"December 14, 2016",July 2019,July 2019,"September 16, 2015",,"May 28, 2018","Rouen University Hospital, Rouen, France",,https://ClinicalTrials.gov/show/NCT02551211
1,2,NCT01203735,Valproic Acid With Chemoradiotherapy for Non-S...,,Unknown status,No Results Available,Locally Advanced Inoperable Non-small-lung Cancer,Drug: Valproic acid,Toxicity|Survival,Soroka University Medical Center,...,SOR507910CTIL,February 2011,February 2013,February 2015,"September 16, 2010",,"March 29, 2011","Soroka University Medical Center, Beer Sheva, ...",,https://ClinicalTrials.gov/show/NCT01203735
2,3,NCT03944772,Phase 2 Platform Study in Patients With Advanc...,ORCHARD,Recruiting,No Results Available,Non-Small Cell Lung Cancer,Drug: Osimertinib|Drug: Savolitinib|Drug: Gefi...,Objective response rate (ORR)|Progression-free...,AstraZeneca,...,D6186C00001,"June 25, 2019","November 11, 2022","November 11, 2022","May 10, 2019",,"September 9, 2019","Research Site, Duarte, California, United Stat...",,https://ClinicalTrials.gov/show/NCT03944772


In [34]:
# Report the number of trials in the initial selection
summary = (
    "\n"
    "There are {:.0f} total lung trials in the initial selection coming from clinicaltrials.gov"
    "\n"
).format(len(initial_selection))
print(summary)


There are 3684 total lung trials in the initial selection coming from clinicaltrials.gov



# Filter 1: Has 2 arms


In [37]:
# Number of arms per trial, derived from the pipe-separated 'Interventions' text
initial_selection['arms'] = initial_selection['Interventions'].apply(count_arms)

# Keep the two-arm trials only, as an independent copy
updated_selection = initial_selection[initial_selection['arms'] == 2].copy()

# One column per arm, each holding that arm's intervention description
updated_selection['arm1'], updated_selection['arm2'] = zip(
    *map(separate_arms, updated_selection['Interventions'])
)


In [38]:
# Report the number of trials left after the first filter
summary = (
    "\n"
    "After the first filter (having 2 arms), there are {:.0f} lung trials"
    "\n"
).format(updated_selection.shape[0])
print(summary)


After the first filter (having 2 arms), there are 1172 lung trials



# Filter 2: Both arms have a drug or a biological 


We are excluding radiation therapies, procedures, dietary supplements, tests, behavioral, and other interventions.

In [42]:
# Flag arms whose intervention type is a drug or a biological
updated_selection['arm1_included'] = updated_selection['arm1'].apply(intervention_has_drug_or_biological)
updated_selection['arm2_included'] = updated_selection['arm2'].apply(intervention_has_drug_or_biological)

# Select only trials where both arms have a drug or biological.
# An explicit copy is taken so the column assignments below write to an independent
# frame instead of a slice of the previous one (avoids pandas' SettingWithCopyWarning).
updated_selection = updated_selection[
    (updated_selection['arm1_included'] == 1) & (updated_selection['arm2_included'] == 1)
].copy()

# Remove the words 'Drug:' and 'Biological:' (this also lower-cases the text)
updated_selection['arm1'] = updated_selection['arm1'].apply(clean_intervention_text)
updated_selection['arm2'] = updated_selection['arm2'].apply(clean_intervention_text)

# Reset index; as before, the previous index is kept as a new 'index' column
updated_selection = updated_selection.reset_index()

In [43]:
# Report the number of trials left after the second filter
summary = (
    "\n"
    "After the second filter (having drugs or biologicals only in both arms), there are {:.0f} lung trials"
    "\n"
).format(updated_selection.shape[0])
print(summary)


After the second filter (having drugs or biologicals only in both arms), there are 835 lung trials



# Replace numeric codes in intervention with drug name

In [44]:
def interventions_with_codes(df):
    '''Find all numeric entries and export them as csv to be manually analyzed

    Adds two 0/1 flag columns to `df` in place ('arm1_has_codes' and
    'arm2_has_codes') marking arms whose text contains at least one digit,
    then writes the 'arm1' and 'arm2' columns of every flagged trial to
    'both_arms.csv' in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns 'arm1' and 'arm2'.

    Returns
    -------
    None
    '''

    # Flag (0/1) arms containing a digit. A true flag is needed here: counting the
    # numeric groups would exclude arms with two or more groups from the export below.
    df['arm1_has_codes'] = df['arm1'].str.contains(r'\d', na=False).astype(int)
    df['arm2_has_codes'] = df['arm2'].str.contains(r'\d', na=False).astype(int)

    # export list with codes to generate dictionary
    has_codes = (df['arm1_has_codes'] == 1) | (df['arm2_has_codes'] == 1)
    df.loc[has_codes, ['arm1', 'arm2']].to_csv('both_arms.csv')

    return

In [45]:
def replace_codes_with_drug_names(intervention_arm, codes_for_drugs_dictionary):
    """Replaces codes with drug names as declared in csv file

    Parameters
    ----------
    intervention_arm : iterable of str
        Intervention texts, one per trial.
    codes_for_drugs_dictionary : dict
        Maps a code (substring to look for) to the drug name that replaces it.

    Returns
    -------
    list of str
        One lower-cased text per input text, with every code that occurs in
        the text replaced by its drug name.
    """

    new_interventions = []
    for intervention in intervention_arm:
        updated_intervention = intervention
        for code, drug in codes_for_drugs_dictionary.items():
            # Match against the original text, but substitute into the running result
            # so that all matching codes are replaced, not just the last one found.
            if code in intervention:
                updated_intervention = updated_intervention.replace(code, drug)

        new_interventions.append(updated_intervention.lower())

    return new_interventions

In [46]:
# Read in numeric characters file: each row pairs an 'original_text' with a
# 'replacement_text', where the value 'remove' marks text that disqualifies a trial
processed_numeric_characters = pd.read_csv('numeric_characters.csv')

# Numeric characters that indicate the trial needs to be removed
remove_if_found = processed_numeric_characters.loc[
    processed_numeric_characters['replacement_text'] == 'remove', 'original_text'
].reset_index(drop=True)

# Name of drugs for codes
codes_for_drugs = processed_numeric_characters.loc[
    processed_numeric_characters['replacement_text'] != 'remove',
    ['original_text', 'replacement_text']
].reset_index(drop=True)
codes_for_drugs_dict = dict(zip(codes_for_drugs['original_text'], codes_for_drugs['replacement_text']))

# Flag if trials need to be removed based on numeric codes that are not approved drugs
updated_selection['arm1_to_remove'] = updated_selection['arm1'].apply(lambda row: any(st in row for st in remove_if_found))
updated_selection['arm2_to_remove'] = updated_selection['arm2'].apply(lambda row: any(st in row for st in remove_if_found))

# Flag if trials need to be removed based on placebo or saline or vaccine treatments
other_removable_objects = ['placebo', 'saline', 'vaccine']
updated_selection['also_arm1_to_remove'] = updated_selection['arm1'].apply(lambda row: any(st in row for st in other_removable_objects))
updated_selection['also_arm2_to_remove'] = updated_selection['arm2'].apply(lambda row: any(st in row for st in other_removable_objects))

# Remove all trials flagged by either rule, in either arm. An explicit copy is taken so
# that the new columns added below are written to an independent frame, not to a slice.
updated_selection = updated_selection[
    ~(updated_selection['arm1_to_remove']
      | updated_selection['arm2_to_remove']
      | updated_selection['also_arm1_to_remove']
      | updated_selection['also_arm2_to_remove'])
].copy()

# Replace codes with drug names
updated_selection['new_arm1'] = replace_codes_with_drug_names(updated_selection['arm1'], codes_for_drugs_dict)
updated_selection['new_arm2'] = replace_codes_with_drug_names(updated_selection['arm2'], codes_for_drugs_dict)



# Filter 3: Both arms have approved drugs

Algorithm: 
1. Flag all the trials with arm 1 having at least one intervention drug using the list of approved drugs 
2. Flag all the trials with arm 2 having at least one intervention drug using the list of approved drugs
3. Select the trials where both arms have an approved drug

In [58]:
def interventions_wordy(df):
    '''Select trials where at least one arm's text has more than one word (to be analyzed manually)'''

    # Whitespace-separated word count of each arm's intervention text
    arm1_word_count = df['new_arm1'].str.split().apply(len)
    arm2_word_count = df['new_arm2'].str.split().apply(len)

    any_arm_multi_word = (arm1_word_count > 1) | (arm2_word_count > 1)

    return df[any_arm_multi_word]
                

In [59]:
def interventions_not_wordy(df):
    '''Select trials where the text of both arms is exactly one word (one drug in both arms)'''

    # Whitespace-separated word count of each arm's intervention text
    arm1_word_count = df['new_arm1'].str.split().apply(len)
    arm2_word_count = df['new_arm2'].str.split().apply(len)

    both_arms_single_word = (arm1_word_count == 1) & (arm2_word_count == 1)

    return df[both_arms_single_word]
                

In [61]:
# Mark each arm with 1 when its text contains at least one word from the approved-drug list
updated_selection['arm1_has_approved_drugs'] = updated_selection['new_arm1'].apply(
    flags_intervention_text_with_approved_drugs, args=(approved_drugs_all,))
updated_selection['arm2_has_approved_drugs'] = updated_selection['new_arm2'].apply(
    flags_intervention_text_with_approved_drugs, args=(approved_drugs_all,))

# Keep only the trials in which both arms carry the approved-drug mark
updated_selection = updated_selection[
    updated_selection[['arm1_has_approved_drugs', 'arm2_has_approved_drugs']].eq(1).all(axis=1)
]

# Split the remaining trials by description length: one word in both arms vs. multi-word
single_word_interventions = interventions_not_wordy(updated_selection).copy()
multi_word_interventions = interventions_wordy(updated_selection).copy()


In [62]:
# Report how the approved-drug trials split into single-word and multi-word descriptions
summary = (
    "\n"
    "Of the {:.0f} trials with at least one approved drug, \n"
    "{:.0f} are single word (most likely a single drug) in both arms trials, and the remaining \n"
    "{:.0f} have at least one arm with a multi-word intervention description \n"
    "that needs to be analyzed manually by subject matter expert.\n"
).format(
    updated_selection.shape[0],
    single_word_interventions.shape[0],
    multi_word_interventions.shape[0],
)

print(summary)


Of the 250 trials with at least one approved drug, 
137 are single word (most likely a single drug) in both arms trials, and the remaining 
113 have at least one arm with a multi-word intervention description 
that needs to be analyzed manually.



In [64]:
# Export selections of trials with single word drugs, and multi-word drugs, as separate files
# multi_word_interventions.to_csv('multi_word_interventions.csv')
# single_word_interventions.to_csv('one_word_interventions.csv')

In [66]:
# Read the files after being analyzed by subject-matter expert.
# Both tables are expected to contain an 'Included' column, which the next cell filters on.
selected_multi_word_interventions = pd.read_csv('processed_multi_word_interventions.csv')
selected_one_word_interventions = pd.read_csv('processed_one_word_interventions.csv')

In [68]:
# Filter those trials marked for inclusion.
# Depending on the file's contents, read_csv may load 'Included' either as booleans or
# as text, so the column is normalized to upper-case text before comparing. The same
# rule is applied to both tables, so a boolean True and the text 'TRUE' are both accepted.
selected_multi_word_interventions = selected_multi_word_interventions[
    selected_multi_word_interventions['Included'].astype(str).str.strip().str.upper() == 'TRUE'
]
selected_one_word_interventions = selected_one_word_interventions[
    selected_one_word_interventions['Included'].astype(str).str.strip().str.upper() == 'TRUE'
]

In [71]:
# Export the selected wordy interventions to be manually edited in their arm descriptions
# selected_multi_word_interventions.to_csv('selected_multi_word_interventions.csv')
# selected_one_word_interventions.to_csv('selected_one_word_interventions.csv')

In [73]:
# Read in the manually analyzed interventions.
# Later cells use the columns 'NCT Number', 'Phases', 'updated_new_arm1', 'updated_new_arm2',
# 'arm1_multidrug' and 'arm2_multidrug' of the combined table built from these two files.
multi_word_interventions_ready_to_merge = pd.read_csv('processed_selected_multi_word_interventions.csv')
single_word_interventions_ready_to_merge = pd.read_csv('processed_selected_one_word_interventions.csv')

In [81]:
# Align the columns of the two curated tables before concatenating.
# NOTE(review): the next line only computes the column names present in the multi-word
# table but not in the single-word table; its result is discarded, so it has no effect.
multi_word_interventions_ready_to_merge.columns.difference(single_word_interventions_ready_to_merge.columns)
# Drop the second column of the multi-word table, selected by position.
# NOTE(review): presumably this is the column reported by the difference above - confirm;
# a positional drop removes a different column if the CSV layout changes.
multi_word_interventions_ready_to_merge = multi_word_interventions_ready_to_merge.drop(multi_word_interventions_ready_to_merge.columns[1], axis=1)
# Concatenate single word interventions and multidrug interventions dataframes into one master dataframe
# (single-word rows first, with a fresh 0..n-1 index)
updated_selection2 = pd.concat([single_word_interventions_ready_to_merge, multi_word_interventions_ready_to_merge], 
                               sort=False, ignore_index =True)


In [82]:
# Cast the arm descriptions to str so that string methods apply to every row
# (a missing value becomes the text 'nan')
updated_selection2['updated_new_arm1'] = updated_selection2['updated_new_arm1'].astype(str)
updated_selection2['updated_new_arm2'] = updated_selection2['updated_new_arm2'].astype(str)

# Remove the registered-trademark symbol wherever it occurs in the text. A literal
# replace is used because strip() only removes the symbol at the very start or end.
updated_selection2['updated_new_arm1'] = updated_selection2['updated_new_arm1'].str.replace('®', '', regex=False)
updated_selection2['updated_new_arm2'] = updated_selection2['updated_new_arm2'].str.replace('®', '', regex=False)

In [83]:
# Report the number of trials left after the third filter
summary = (
    "\n"
    "After the third filter (having approved drugs in both arms), there are {:.0f} lung trials"
    "\n"
).format(updated_selection2.shape[0])
print(summary)


After the third filter (having approved drugs in both arms), there are 172 lung trials



# Filter 4: enough patients in Flatiron


In [84]:
# Simplify file: keep the trial identifier plus the curated arm columns,
# then give the arm columns more descriptive names
columns_to_keep = [
    'NCT Number',
    'updated_new_arm1',
    'updated_new_arm2',
    'arm1_multidrug',
    'arm2_multidrug',
]
basic_selection = updated_selection2.loc[:, columns_to_keep].rename(
    columns={
        'updated_new_arm1': 'drugs_in_arm1',
        'updated_new_arm2': 'drugs_in_arm2',
        'arm1_multidrug': 'arm1_has_drug_combo',
        'arm2_multidrug': 'arm2_has_drug_combo',
    }
)

In [87]:
# export file with selection up to this point (three filters)
# basic_selection.to_csv('updated_selection_with_three_filters.csv')

In [89]:
# Read in the file that now includes the corresponding sample size in Flatiron.
# The next cell filters on its 'Total drugs_in_arm1' and 'Total drugs_in_arm2' columns.
processed_selection = pd.read_csv('processed_updated_selection_with_three_filters.csv')

In [91]:
# Keep trials whose two arms each have a total of at least 250 patients
updated_selection = processed_selection[
    processed_selection['Total drugs_in_arm1'].ge(250)
    & processed_selection['Total drugs_in_arm2'].ge(250)
]

In [93]:
# Report the number of trials left after the fourth filter.
# The wording says "at least 250" because the filter keeps arms with >= 250 patients.
summary = """
After the fourth filter (having at least 250 patients in each arm), there are {:.0f} lung trials
""".format(len(updated_selection))
print(summary)


After the fourth filter (having more than 250 patients in each arm), there are 67 lung trials



# Filter 5: Trial is phase III

In [94]:
# Trial identifier and phase for the trials labelled Phase 3, Phase 4 or Phase 2|Phase 3
Phase3or4 = updated_selection2.loc[
    updated_selection2['Phases'].isin(['Phase 3', 'Phase 4', 'Phase 2|Phase 3']),
    ['NCT Number', 'Phases'],
]

In [95]:
# Distribution of trial phases among the trials that passed the third filter
updated_selection2['Phases'].value_counts()

Phase 2            93
Phase 3            37
Phase 1            17
Phase 1|Phase 2    17
Phase 4             5
Not Applicable      2
Phase 2|Phase 3     1
Name: Phases, dtype: int64

In [96]:
# Attach the phase to the trials that passed the sample-size filter; trials without a
# matching row in Phase3or4 are left out by the merge
final_selection = pd.merge(updated_selection, Phase3or4, on='NCT Number')

# Discard the first two columns (selected by position)
final_selection = final_selection.drop(columns=final_selection.columns[:2])

In [99]:
# Report the number of trials left after the fifth filter
summary = (
    "\n"
    "After fifth filter (trial is phase 3), there are {:.0f} lung trials"
    "\n"
).format(final_selection.shape[0])
print(summary)


After fifth filter (trial is phase 3), there are 20 lung trials



In [103]:
# Export final selection.
# NOTE(review): to_csv is called with its defaults, so the row index is written as an
# extra first column - presumably the origin of the 'Unnamed: 0' columns that appear when
# exported files are read back; confirm with downstream users before changing.
final_selection.to_csv('final_selection_lung_trials.csv')

In [104]:
# Display the final selection of trials
final_selection

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,NCT Number,drugs_in_arm1,drugs_in_arm2,arm1_has_drug_combo,arm2_has_drug_combo,Total drugs_in_arm1,1L drugs_in_arm1,2L drugs_in_arm1,Total drugs_in_arm2,1L drugs_in_arm2,2L drugs_in_arm2,Phases
0,10,10,NCT02613507,nivolumab,docetaxel,False,False,7554,1760,3906,2102,330,975,Phase 3
1,12,12,NCT02075840,alectinib,crizotinib,False,False,379,144,117,763,525,153,Phase 3
2,17,17,NCT00391274,pemetrexed,docetaxel,False,False,2670,1037,1126,2102,330,975,Phase 3
3,22,22,NCT00441922,docetaxel,vinorelbine,False,False,2102,330,975,1072,194,235,Phase 3
4,28,28,NCT00874419,erlotinib,"gemcitabine, carboplatin",False,True,4655,2945,1026,1860,1162,429,Phase 3
5,29,29,NCT01905657,pembrolizumab,docetaxel,False,False,3453,2259,836,2102,330,975,Phase 2|Phase 3
6,34,34,NCT01642004,nivolumab,docetaxel,False,False,7554,1760,3906,2102,330,975,Phase 3
7,35,35,NCT02008227,atezolizumab,docetaxel,False,False,623,142,295,2102,330,975,Phase 3
8,38,38,NCT02838420,alectinib,crizotinib,False,False,379,144,117,763,525,153,Phase 3
9,58,58,NCT00442026,docetaxel,gemcitabine,False,False,2102,330,975,2264,407,781,Phase 3
