# Systematic clinical trial selection

Author: Shemra Rizzo <br>

Date Created: October 30, 2019
> Objective: Read file downloaded from clinicaltrials.gov and apply additional filters to obtain the final selection of lung clinical trials for the Project Vector paper.  

Updated: January 9, 2020
> Cleaned up the code and redesigned the order of the filters. Finished Filters 1 and 2.

Updated: January 14-16, 2020
> Finished Filter 3.
> Exported file for Ruishan to get the sample sizes per trial.

Updated: January 20, 2020
> Finished final selection


# Libraries

In [2]:
import pandas as pd
import numpy as np
import itertools
import re

# Import and process approved drugs

Non-small lung cancer approved drugs found in [National Cancer Institute](https://www.cancer.gov/about-cancer/treatment/drugs/lung)

Generic names are given in parentheses.

In [3]:
# Lung approved drugs as listed in National Cancer Institute website
approved_drugs = '''
Abraxane (Paclitaxel Albumin-stabilized Nanoparticle Formulation)
Afatinib Dimaleate
Afinitor (Everolimus)
Afinitor Disperz (Everolimus)
Alecensa (Alectinib)
Alectinib
Alimta (Pemetrexed Disodium)
Alunbrig (Brigatinib)
Atezolizumab
Avastin (Bevacizumab)
Bevacizumab
Brigatinib
Carboplatin
Ceritinib
Crizotinib
Cyramza (Ramucirumab)
Dabrafenib Mesylate
Dacomitinib
Docetaxel
Doxorubicin Hydrochloride
Durvalumab
Entrectinib
Erlotinib Hydrochloride
Everolimus
Gefitinib
Gilotrif (Afatinib Dimaleate)
Gemcitabine Hydrochloride
Gemzar (Gemcitabine Hydrochloride)
Imfinzi (Durvalumab)
Iressa (Gefitinib)
Keytruda (Pembrolizumab)
Lorbrena (Lorlatinib)
Lorlatinib
Mechlorethamine Hydrochloride
Mekinist (Trametinib)
Methotrexate
Mustargen (Mechlorethamine Hydrochloride)
Mvasi (Bevacizumab)
Navelbine (Vinorelbine Tartrate)
Necitumumab
Nivolumab
Opdivo (Nivolumab)
Osimertinib Mesylate
Paclitaxel
Paclitaxel Albumin-stabilized Nanoparticle Formulation
Paraplat (Carboplatin)
Paraplatin (Carboplatin)
Pembrolizumab
Pemetrexed Disodium
Portrazza (Necitumumab)
Ramucirumab
Rozlytrek (Entrectinib)
Tafinlar (Dabrafenib Mesylate)
Tagrisso (Osimertinib Mesylate)
Tarceva (Erlotinib Hydrochloride)
Taxol (Paclitaxel)
Taxotere (Docetaxel)
Tecentriq (Atezolizumab)
Trametinib
Trexall (Methotrexate)
Vizimpro (Dacomitinib)
Vinorelbine Tartrate
Xalkori (Crizotinib)
Zykadia (Ceritinib)
'''

In [4]:
# Create a list of drugs: one entry per non-blank line of the listing, with
# surrounding whitespace removed. Filtering blank lines (instead of slicing off
# the first and last element) keeps empty entries out of the list even if the
# listing gains or loses a newline, so the later `split()[0]` cannot fail.
approved_drugs = [line.strip() for line in approved_drugs.splitlines() if line.strip()]

In [5]:
# Normalise every entry to lower case
approved_drugs = list(map(str.lower, approved_drugs))

In [6]:
# Keep only the first word of every entry in the drugs list
drugs_original = [entry.split(maxsplit=1)[0] for entry in approved_drugs]

In [7]:
# True for entries that carry a generic name, i.e. contain an opening parenthesis
is_generic = [entry.find('(') != -1 for entry in approved_drugs]

In [8]:
# Keep the entries flagged as having a generic equivalent
drugs_generic = [entry for entry, has_generic in zip(approved_drugs, is_generic) if has_generic]

In [9]:
# Extract the generic names: the text after '(' up to (not including) the next ')'.
# The pattern is a raw string so that '\(' reaches the regex engine as written
# instead of being an invalid string escape.
drugs_generic = [re.search(r'\(([^)]+)', x).group(1) for x in drugs_generic]

In [10]:
# Remove 'Paclitaxel Albumin-stabilized Nanoparticle Formulation' because it was already listed as non-generic.
# Removing by value (the list is lower-cased by now) instead of by position means
# re-running this cell cannot silently drop a different drug.
drugs_generic = [
    name for name in drugs_generic
    if name != 'paclitaxel albumin-stabilized nanoparticle formulation'
]

In [11]:
# De-duplicated list of every drug name: generic names plus first words
approved_drugs_all = list(set(drugs_generic).union(drugs_original))

In [12]:
sorted(approved_drugs_all)

['abraxane',
 'afatinib',
 'afatinib dimaleate',
 'afinitor',
 'alecensa',
 'alectinib',
 'alimta',
 'alunbrig',
 'atezolizumab',
 'avastin',
 'bevacizumab',
 'brigatinib',
 'carboplatin',
 'ceritinib',
 'crizotinib',
 'cyramza',
 'dabrafenib',
 'dabrafenib mesylate',
 'dacomitinib',
 'docetaxel',
 'doxorubicin',
 'durvalumab',
 'entrectinib',
 'erlotinib',
 'erlotinib hydrochloride',
 'everolimus',
 'gefitinib',
 'gemcitabine',
 'gemcitabine hydrochloride',
 'gemzar',
 'gilotrif',
 'imfinzi',
 'iressa',
 'keytruda',
 'lorbrena',
 'lorlatinib',
 'mechlorethamine',
 'mechlorethamine hydrochloride',
 'mekinist',
 'methotrexate',
 'mustargen',
 'mvasi',
 'navelbine',
 'necitumumab',
 'nivolumab',
 'opdivo',
 'osimertinib',
 'osimertinib mesylate',
 'paclitaxel',
 'paraplat',
 'paraplatin',
 'pembrolizumab',
 'pemetrexed',
 'pemetrexed disodium',
 'portrazza',
 'ramucirumab',
 'rozlytrek',
 'tafinlar',
 'tagrisso',
 'tarceva',
 'taxol',
 'taxotere',
 'tecentriq',
 'trametinib',
 'trexall',

Functions
==

In [13]:
# Count arms
def count_arms(intervention):
    """Count the arms listed in an intervention description.

    Arms are the pieces of the text separated by the '|' character.

    Parameters
    ----------
    intervention : str
        Intervention text of one trial, e.g. 'Drug: A|Drug: B'.

    Returns
    -------
    int
        Number of '|'-separated pieces (1 for a text without '|'), or 0 when
        ``intervention`` is not a string (e.g. a missing value such as NaN or None).
    """
    # Missing values are not strings; report zero arms instead of failing
    if not isinstance(intervention, str):
        return 0

    return len(intervention.split('|'))

In [14]:
# Generate one column per arm, for trials with 2 arms

def separate_arms(intervention):
    """Return the intervention descriptions of the first two arms.

    Parameters
    ----------
    intervention : str
        Intervention text with arms separated by '|'.

    Returns
    -------
    tuple of (str, str)
        Text of the first and of the second arm. Any further arms are ignored.

    Raises
    ------
    IndexError
        If the text contains fewer than two '|'-separated arms.
    """
    # Plain string split: '|' needs no regex (and no escaping)
    arms = intervention.split('|')

    return arms[0], arms[1]
    

In [15]:
# Ensures arm's intervention is Drug or Biological 

def intervention_has_drug_or_biological(intervention):
    """Return 1 when the intervention is labelled 'Drug' or 'Biological', else 0.

    The label is the text that precedes the first colon; it must match
    exactly (case-sensitive, no surrounding whitespace).
    """
    label, _, _ = intervention.partition(':')

    return 1 if label in ('Drug', 'Biological') else 0
    

In [16]:
# Removes the words 'Drug:' and 'Biological:'

def clean_intervention_text(arm_intervention):
    """
    Removes the labels 'Drug:' and 'Biological:' from the intervention text in one arm
    and returns the result in lower case.

    A label is removed together with the single space that follows it, and also
    when no space follows it (e.g. 'Drug:Docetaxel').
    """

    arm = arm_intervention
    for label in ('Drug:', 'Biological:'):
        # First the usual 'Label: ' form, then any leftover 'Label:' without a space
        arm = arm.replace(label + ' ', '').replace(label, '')

    return arm.lower()

In [17]:
# Flag arms whose intervention text mentions a drug from the approved drugs list
def flags_intervention_text_with_approved_drugs(arm_intervention_text, approved_drugs):
    """Flag whether the intervention text mentions an approved drug.

    Both the text and every drug name are reduced to their runs of word
    characters, so punctuation and spacing do not matter. A drug matches when
    its word(s) occur in the text as whole, consecutive words; this lets
    multi-word names such as 'erlotinib hydrochloride' match as well.
    Matching is case-sensitive.

    Parameters
    ----------
    arm_intervention_text : str
        Intervention description of one arm.
    approved_drugs : iterable of str
        Drug names to look for.

    Returns
    -------
    int
        1 if at least one drug name is found, otherwise 0.
    """

    def _as_words(text):
        # Same tokenisation as replacing every non-word character by a space
        return ' '.join(re.findall(r'\w+', text))

    # Pad with spaces so that only whole words can match
    padded_text = ' ' + _as_words(arm_intervention_text) + ' '

    for drug in approved_drugs:
        drug_words = _as_words(drug)
        # Skip names without any word character; they would match everything
        if drug_words and ' ' + drug_words + ' ' in padded_text:
            return 1

    return 0

# Import and process clinical trials dataset

In [18]:
path = '~/Documents/Projects/2_Project_Vector/'
initial_selection = pd.read_csv(path+'SearchResults_Nov8_2019.csv')

In [19]:
initial_selection.head(3)

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL
0,1,NCT02551211,Association Between Circulating Immune Cells a...,LYMPHOLUNG,Recruiting,No Results Available,Resectable Non-small Lung Cancer,Procedure: blood drawn|Procedure: surgical res...,Level of expression of the T regulator over to...,"University Hospital, Rouen",...,2015/080/HP,"December 14, 2016",July 2019,July 2019,"September 16, 2015",,"May 28, 2018","Rouen University Hospital, Rouen, France",,https://ClinicalTrials.gov/show/NCT02551211
1,2,NCT01203735,Valproic Acid With Chemoradiotherapy for Non-S...,,Unknown status,No Results Available,Locally Advanced Inoperable Non-small-lung Cancer,Drug: Valproic acid,Toxicity|Survival,Soroka University Medical Center,...,SOR507910CTIL,February 2011,February 2013,February 2015,"September 16, 2010",,"March 29, 2011","Soroka University Medical Center, Beer Sheva, ...",,https://ClinicalTrials.gov/show/NCT01203735
2,3,NCT03944772,Phase 2 Platform Study in Patients With Advanc...,ORCHARD,Recruiting,No Results Available,Non-Small Cell Lung Cancer,Drug: Osimertinib|Drug: Savolitinib|Drug: Gefi...,Objective response rate (ORR)|Progression-free...,AstraZeneca,...,D6186C00001,"June 25, 2019","November 11, 2022","November 11, 2022","May 10, 2019",,"September 9, 2019","Research Site, Duarte, California, United Stat...",,https://ClinicalTrials.gov/show/NCT03944772


In [20]:
summary = """
There are {:.0f} total lung trials in the initial selection coming from clinicaltrials.gov
""".format(initial_selection.shape[0])
print(summary)


There are 3684 total lung trials in the initial selection coming from clinicaltrials.gov



Filter 1: Has 2 arms
--

In [21]:
# Number of '|'-separated arms in each trial's intervention text
initial_selection['arms'] = initial_selection['Interventions'].map(count_arms)

In [22]:
# Keep the trials that have exactly 2 arms (as an independent copy)
updated_selection = initial_selection.loc[initial_selection['arms'].eq(2)].copy()

In [23]:
summary = """
After the first filter (having 2 arms), there are {:.0f} lung trials
""".format(len(updated_selection))
print(summary)


After the first filter (having 2 arms), there are 1172 lung trials



In [24]:
# Generate two columns, one for each arm, to list their intervention.
# separate_arms returns an (arm 1, arm 2) tuple per trial; zip(*...) regroups those
# tuples into one sequence with all first arms and one with all second arms.
updated_selection['arm1'], updated_selection['arm2'] = zip(*updated_selection['Interventions'].apply(lambda x: separate_arms(x)))


Filter 2: Both arms have a drug or a biological 
--

We are excluding radiation therapies, procedures, dietary supplements, tests, behavioral, and other interventions.

In [25]:
updated_selection1 = updated_selection.copy()

In [26]:
# Flag (1/0) whether each arm is labelled as a drug or a biological
updated_selection1['arm1_included'] = updated_selection1['arm1'].map(intervention_has_drug_or_biological)
updated_selection1['arm2_included'] = updated_selection1['arm2'].map(intervention_has_drug_or_biological)

In [27]:
# Keep the trials in which both arms were flagged as drug or biological
updated_selection1 = updated_selection1[
    updated_selection1[['arm1_included', 'arm2_included']].eq(1).all(axis=1)
]

In [28]:
summary = """
After the second filter (having drugs or biologicals only in both arms), there are {:.0f} lung trials
""".format(len(updated_selection1))
print(summary)


After the second filter (having drugs or biologicals only in both arms), there are 835 lung trials



In [29]:
# Strip the 'Drug:' / 'Biological:' labels from both arms and lower-case the text
updated_selection1['arm1'] = updated_selection1['arm1'].map(clean_intervention_text)
updated_selection1['arm2'] = updated_selection1['arm2'].map(clean_intervention_text)

In [30]:
# Reset index
updated_selection1.reset_index(inplace=True)

Filter 2: Data limitation
---

There are some entries in the interventions where there are two or three colons within an arm. This is relevant because, to identify the arms that have a drug or a biological, it is assumed that there is only one colon, as in "Drug:" and "Biological:".

Manually analyze these outliers and identify the ones that should be extracted. Then manually add them to the updated selection

In [31]:
ex_arm1 = updated_selection['arm1']
ex_arm2 = updated_selection['arm2']

In [32]:
# How many studies have an arm where the intervention has more than one colon.
# The columns are named explicitly (index name + value name) rather than by renaming
# the labels that reset_index() generates, because those labels differ between
# pandas versions.
(ex_arm1.apply(lambda x: x.count(':'))
        .value_counts()
        .rename_axis('number of colons')
        .reset_index(name='number of studies'))


Unnamed: 0,number of colons,number of studies
0,1,1159
1,2,11
2,3,2


In [33]:
# Display the intervention for these cases
ex_arm1[ex_arm1.apply(lambda x: x.count(':'))>1]

168     Other: Biological: MUC1 peptide specific immun...
236        Drug: Suramin Drug:Docetaxel Drug: Carboplatin
864     Drug: Group A:cytokine-induced killer cell +ge...
907                      Procedure: A: Radiotherapy alone
1157                             Drug: Control arm (SEQ):
1171         Radiation: Comparator: CT or MRI and FDG-PET
1619    Drug: Group 1: Radiation, Paclitaxel,Carbo, Da...
2037    Drug: Erlotinib (trade name: Tarceva®) or Icot...
2243                    Drug: Arm A: Pemetrexed Cisplatin
2310            Drug: Comparator: erlotinib + dalotuzumab
2506    Drug: Intervention A: Irinotecan; Oxaliplatin;...
2949    Other: Standard dosing method arm: Cisplatin (...
3132                           Radiation: Radiation: SBRT
Name: arm1, dtype: object

There are no trials to add from this manual search.

# Replace numeric codes in intervention with drug name

In [34]:
def interventions_with_codes(df):
    '''Flag arms whose text contains digits and export them as csv to be manually analyzed.

    Adds the columns 'arm1_has_codes' and 'arm2_has_codes' to ``df`` in place
    (1 when the arm text contains at least one digit, otherwise 0) and writes the
    'arm1'/'arm2' columns of every trial with a flagged arm to 'both_arms.csv'
    in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns 'arm1' and 'arm2'.

    Returns
    -------
    None
    '''

    digit = re.compile(r'\d')

    # A true 0/1 flag: an arm with several numbers is still flagged with 1,
    # so the `== 1` selection below cannot miss it
    for arm in ('arm1', 'arm2'):
        df[arm + '_has_codes'] = df[arm].apply(lambda text: int(digit.search(text) is not None))

    # export list with codes to generate dictionary
    with_codes = df[(df['arm1_has_codes'] == 1) | (df['arm2_has_codes'] == 1)]
    with_codes[['arm1', 'arm2']].to_csv('both_arms.csv')

    return

In [35]:
def replace_codes_with_drug_names(intervention_arm, codes_for_drugs_dictionary):
    """Replace drug codes with drug names in every intervention text.

    Parameters
    ----------
    intervention_arm : iterable of str
        Intervention texts of one arm, one per trial.
    codes_for_drugs_dictionary : dict of str to str
        Maps a code as it appears in the text to the drug name that replaces it.

    Returns
    -------
    list of str
        One lower-cased text per input text, in the same order, with every
        code found in the dictionary replaced by its drug name.
    """

    # Longest codes first, so a code that is the beginning of a longer code
    # cannot replace part of the longer one. Empty codes are ignored.
    ordered_codes = sorted(
        ((code, drug) for code, drug in codes_for_drugs_dictionary.items() if code),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    new_interventions = []
    for intervention in intervention_arm:
        # Accumulate the replacements so that several codes in the same text
        # are all replaced, not only the last one that matched
        updated_intervention = intervention
        for code, drug in ordered_codes:
            updated_intervention = updated_intervention.replace(code, drug)

        new_interventions.append(updated_intervention.lower())

    return new_interventions

In [36]:
# Read in numeric characters manually created file
processed_numeric_characters = pd.read_csv(path+'numeric_characters.csv')

In [37]:
# Numeric characters that indicate the trial needs to be removed
remove_if_found = processed_numeric_characters[processed_numeric_characters['replacement_text']=='remove']['original_text']
remove_if_found = remove_if_found.reset_index()['original_text']

In [38]:
# Code -> drug name pairs: every row whose replacement text is not 'remove'
codes_for_drugs = processed_numeric_characters[processed_numeric_characters['replacement_text']!='remove']
codes_for_drugs = codes_for_drugs.reset_index()[['original_text', 'replacement_text']]
# Dictionary keyed by the original text, with the replacement text as value
codes_for_drugs_dict = dict(zip(codes_for_drugs['original_text'], codes_for_drugs['replacement_text']))

In [39]:
# Flag if trials need to be removed based on numeric codes that are not approved drugs
updated_selection1['arm1_to_remove']= updated_selection1['arm1'].apply(lambda row: any(st in row for st in remove_if_found))
updated_selection1['arm2_to_remove']= updated_selection1['arm2'].apply(lambda row: any(st in row for st in remove_if_found))

In [40]:
# Flag if trials need to be removed based on placebo or saline or vaccine treatments
other_removable_objects = ['placebo', 'saline', 'vaccine']
updated_selection1['also_arm1_to_remove']= updated_selection1['arm1'].apply(lambda row: any(st in row for st in other_removable_objects))
updated_selection1['also_arm2_to_remove']= updated_selection1['arm2'].apply(lambda row: any(st in row for st in other_removable_objects))

In [41]:
# Keep only the trials for which none of the four removal flags is set
# (numeric codes marked for removal, or placebo / saline / vaccine, in either arm)
updated_selection1 = updated_selection1[
    ~updated_selection1[['arm1_to_remove', 'arm2_to_remove',
                         'also_arm1_to_remove', 'also_arm2_to_remove']].any(axis=1)
]

In [42]:
# Replace codes with drug names
updated_selection1['new_arm1'] = replace_codes_with_drug_names(updated_selection1['arm1'], codes_for_drugs_dict)
updated_selection1['new_arm2'] = replace_codes_with_drug_names(updated_selection1['arm2'], codes_for_drugs_dict)

In [43]:
updated_selection2 = updated_selection1.copy()

In [44]:
len(updated_selection2)

560

Filter 3: Both arms have approved drugs
--
Algorithm: 
1. Flag all the trials with arm 1 having at least one intervention drug using the list of approved drugs 
2. Flag all the trials with arm 2 having at least one intervention drug using the list of approved drugs
3. Select the trials where both arms have an approved drug

In [46]:
def interventions_wordy(df):
    '''Return the trials where at least one arm is described with more than one word.

    Words are the whitespace-separated pieces of the 'new_arm1' and 'new_arm2' columns.
    '''

    words_in_arm1 = df['new_arm1'].str.split().map(len)
    words_in_arm2 = df['new_arm2'].str.split().map(len)

    has_multiword_arm = (words_in_arm1 > 1) | (words_in_arm2 > 1)

    return df[has_multiword_arm]
                

In [47]:
def interventions_not_wordy(df):
    '''Return the trials where both arms are described with exactly one word.

    Words are the whitespace-separated pieces of the 'new_arm1' and 'new_arm2' columns.
    '''

    words_in_arm1 = df['new_arm1'].str.split().map(len)
    words_in_arm2 = df['new_arm2'].str.split().map(len)

    both_single_word = (words_in_arm1 == 1) & (words_in_arm2 == 1)

    return df[both_single_word]
                

In [48]:
# Flag (1/0), per arm, whether the intervention text contains an approved drug
updated_selection2['arm1_has_approved_drugs'] = updated_selection2['new_arm1'].apply(
    flags_intervention_text_with_approved_drugs, args=(approved_drugs_all,))
updated_selection2['arm2_has_approved_drugs'] = updated_selection2['new_arm2'].apply(
    flags_intervention_text_with_approved_drugs, args=(approved_drugs_all,))


In [49]:
# Keep only the trials in which both arms were flagged as containing an approved drug
updated_selection2 = updated_selection2[
    updated_selection2[['arm1_has_approved_drugs', 'arm2_has_approved_drugs']].eq(1).all(axis=1)
]

In [50]:
# Separate trials based on the length of their intervention: one word long, vs multi-word. 
single_word_interventions = interventions_not_wordy(updated_selection2).copy()
wordy_interventions = interventions_wordy(updated_selection2)

In [51]:
summary ="""
Of the {:.0f} trials with at least one approved drug, 
{:.0f} are single word (most likely a single drug) in both arms trials, and the remaining 
{:.0f} have at least one arm with a multi-word intervention description 
that is longer than one word and needs to analyzed manually.
""".format(
    len(updated_selection2), 
    len(single_word_interventions), 
    len(wordy_interventions))

print(summary)


Of the 250 trials with at least one approved drug, 
137 are single word (most likely a single drug) in both arms trials, and the remaining 
113 have at least one arm with a multi-word intervention description 
that is longer than one word and needs to analyzed manually.



In [52]:
# Generate .csv with trials that have interventions that are more than one word long
wordy_interventions.to_csv('wordy_interventions.csv')
single_word_interventions.to_csv('one_word_interventions.csv')

In [53]:
# Read in the manually analyzed interventions
selected_wordy_interventions = pd.read_csv(path+'wordy_interventions_manually_analyzed.csv')
selected_one_word_interventions = pd.read_csv(path+'one_word_interventions_manually_analyzed.csv')

In [54]:
selected_wordy_interventions['Included'].dtype

dtype('O')

In [55]:
selected_one_word_interventions['Included'].dtype

dtype('bool')

In [56]:
# Keep the rows marked as included. The 'Included' column is compared as
# upper-cased, stripped text so the same test works whether pandas read the
# column as booleans or as strings ('TRUE', 'True', ...).
selected_wordy_interventions = selected_wordy_interventions[
    selected_wordy_interventions['Included'].astype(str).str.strip().str.upper() == 'TRUE']
selected_one_word_interventions = selected_one_word_interventions[
    selected_one_word_interventions['Included'].astype(str).str.strip().str.upper() == 'TRUE']

In [57]:
# Export the selected wordy interventions to be manually edited in their arm descriptions
selected_wordy_interventions.to_csv('selected_wordy_interventions.csv')
selected_one_word_interventions.to_csv('selected_one_word_interventions.csv')

In [58]:
# Read in the manually analyzed interventions
wordy_interventions_ready_to_merge = pd.read_csv(path+'selected_wordy_interventions_manually_processed.csv')
single_word_interventions_ready_to_merge = pd.read_csv(path+'selected_one_word_interventions_manually_processed.csv')

In [59]:
wordy_interventions_ready_to_merge['arm1_multidrug'].dtype

dtype('bool')

In [60]:
# Remove columns that are not in common between both dataframes before concatenating them
# NOTE: the next expression only computes the columns present in the wordy dataframe but
# not in the single-word one; its result is neither assigned nor used by the code below.
wordy_interventions_ready_to_merge.columns.difference(single_word_interventions_ready_to_merge.columns)
# Drops the column at position 1 of the wordy dataframe.
# NOTE(review): presumably this is the column reported by the difference above — confirm,
# since dropping by position depends on the column order of the csv file.
wordy_interventions_ready_to_merge = wordy_interventions_ready_to_merge.drop(wordy_interventions_ready_to_merge.columns[1], axis=1)
# Concatenate single word interventions and multidrug interventions dataframes into one master dataframe
# (rows renumbered from 0; columns are not sorted)
updated_selection3 = pd.concat([single_word_interventions_ready_to_merge, wordy_interventions_ready_to_merge], 
                               sort=False, ignore_index =True)


In [61]:
summary = """
After the third filter (having approved drugs in both arms), there are {:.0f} lung trials
""".format(len(updated_selection3))
print(summary)


After the third filter (having approved drugs in both arms), there are 172 lung trials



In [62]:
updated_selection3['updated_new_arm1'] = updated_selection3['updated_new_arm1'].astype(str)
updated_selection3['updated_new_arm2'] = updated_selection3['updated_new_arm2'].astype(str)

In [63]:
# Remove the registered-trademark symbol (®) wherever it occurs in the text,
# not only at the beginning or the end
updated_selection3['updated_new_arm1'] = updated_selection3['updated_new_arm1'].apply(lambda row: row.replace('®', ''))
updated_selection3['updated_new_arm2'] = updated_selection3['updated_new_arm2'].apply(lambda row: row.replace('®', ''))

In [64]:
updated_selection3.columns

Index(['Unnamed: 0', 'index', 'Rank', 'NCT Number', 'Title', 'Acronym',
       'Status', 'Study Results', 'Conditions', 'Interventions',
       'Outcome Measures', 'Sponsor/Collaborators', 'Gender', 'Age', 'Phases',
       'Enrollment', 'Funded Bys', 'Study Type', 'Study Designs', 'Other IDs',
       'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents', 'URL', 'arms', 'arm1', 'arm2',
       'arm1_included', 'arm2_included', 'arm1_to_remove', 'arm2_to_remove',
       'also_arm1_to_remove', 'also_arm2_to_remove', 'new_arm1', 'new_arm2',
       'arm1_has_approved_drugs', 'arm2_has_approved_drugs', 'Included',
       'Reason', 'updated_new_arm1', 'updated_new_arm2', 'arm1_multidrug',
       'arm2_multidrug'],
      dtype='object')

# Export file with updated selection (Three filters) to send to Ruishan. 
Ruishan will return the file with information on the sample size for each intervention arm for that drug in Flatiron

In [65]:
# Simplify file
columns_to_keep = ['NCT Number', 'updated_new_arm1', 'updated_new_arm2', 'arm1_multidrug', 'arm2_multidrug']

In [66]:
basic_updated_selection3 = updated_selection3[columns_to_keep]

In [67]:
basic_updated_selection3 = basic_updated_selection3.rename(columns = {'updated_new_arm1':'drugs_in_arm1', 
                                                                     'updated_new_arm2':'drugs_in_arm2', 
                                                                     'arm1_multidrug': 'arm1_has_drug_combo', 
                                                                     'arm2_multidrug': 'arm2_has_drug_combo'})

In [68]:
basic_updated_selection3.to_csv('Updated_selection_Three_Filters.csv')

In [69]:
len(basic_updated_selection3)

172

# Are our initial 9 Lung Trials in the results of the systematic search?

Trials codes:
* NCT02578680
* NCT02775435
* NCT02008227
* NCT01642004
* NCT01673867
* NCT02613507
* NCT02713867
* NCT02296125
* NCT01364012


In [70]:
initial_selection[initial_selection['NCT Number']=='NCT02008227']

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL,arms
826,827,NCT02008227,A Study of Atezolizumab Compared With Docetaxe...,OAK,Completed,Has Results,Non-Squamous Non-Small Cell Lung Cancer,Drug: Atezolizumab|Drug: Docetaxel,Percentage of Participants Who Died: PP-ITT|Pe...,Hoffmann-La Roche,...,"March 11, 2014","July 7, 2016","January 9, 2019","December 11, 2013","July 2, 2017","February 8, 2019","Comprehensive Blood/Cancer Ctr, Bakersfield, C...",,https://ClinicalTrials.gov/show/NCT02008227,2


In [71]:
updated_selection[updated_selection['NCT Number']=='NCT01642004']

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL,arms,arm1,arm2
825,826,NCT01642004,Study of BMS-936558 (Nivolumab) Compared to Do...,,"Active, not recruiting",Has Results,Squamous Cell Non-small Cell Lung Cancer,Biological: Nivolumab|Drug: Docetaxel,Overall Survival (OS) Time in Months for All R...,Bristol-Myers Squibb,...,"January 23, 2020","July 17, 2012","March 17, 2016","August 9, 2019","Mayo Clinic Arizona, Scottsdale, Arizona, Unit...",,https://ClinicalTrials.gov/show/NCT01642004,2,Biological: Nivolumab,Drug: Docetaxel


In [72]:
updated_selection1[updated_selection1['NCT Number']=='NCT01673867']

Unnamed: 0,index,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,...,arm1,arm2,arm1_included,arm2_included,arm1_to_remove,arm2_to_remove,also_arm1_to_remove,also_arm2_to_remove,new_arm1,new_arm2
460,2015,2016,NCT01673867,Study of BMS-936558 (Nivolumab) Compared to Do...,CheckMate057,"Active, not recruiting",Has Results,Non-Squamous Cell Non-small Cell Lung Cancer,Biological: Nivolumab|Drug: Docetaxel,Overall Survival (OS) Time in Months for All R...,...,nivolumab,docetaxel,1,1,False,False,False,False,nivolumab,docetaxel


In [73]:
updated_selection2[updated_selection2['NCT Number']=='NCT02613507']

Unnamed: 0,index,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,...,arm1_included,arm2_included,arm1_to_remove,arm2_to_remove,also_arm1_to_remove,also_arm2_to_remove,new_arm1,new_arm2,arm1_has_approved_drugs,arm2_has_approved_drugs
121,457,458,NCT02613507,Efficacy Study of Nivolumab Compared to Doceta...,CheckMate 078,"Active, not recruiting",Has Results,Non-Small Cell Lung Cancer,Drug: Nivolumab|Drug: Docetaxel,Median Overall Survival|Overall Survival Rate|...,...,1,1,False,False,False,False,nivolumab,docetaxel,1,1


In [74]:
# Are all our trials in here?
updated_selection3[updated_selection3['NCT Number']=='NCT01364012']

Unnamed: 0.1,Unnamed: 0,index,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,...,new_arm1,new_arm2,arm1_has_approved_drugs,arm2_has_approved_drugs,Included,Reason,updated_new_arm1,updated_new_arm2,arm1_multidrug,arm2_multidrug


# Read in processed file from Ruishan with sample sizes

Sample sizes in Flatiron for the trials we started with are:
* NCT02008227: n1=282, n2=941
* NCT01642004: n1=3759, n2=941
* NCT01673867: n1=3759, n2=941
* NCT02613507: n1=3759, n2=941

Rule of thumb:
Choose sample sizes that are higher than 900 in both arms

In [75]:
processed_selection = pd.read_csv(path+'processed_Updated_selection_Three_Filters.csv')

In [76]:
len(processed_selection)

172

In [77]:
# Keep the trials with at least 250 patients in each arm.
# NOTE(review): the notes preceding this step mention a rule of thumb of more than
# 900 per arm, while the threshold applied here is 250 — confirm which is intended.
updated_selection4 = processed_selection[
    processed_selection['Total drugs_in_arm1'].ge(250)
    & processed_selection['Total drugs_in_arm2'].ge(250)
]

In [78]:
len(updated_selection4)

67

In [198]:
#updated_selection4.to_csv('updated_selection4.csv')

# Filter by phase (Phase 3, Phase 4, or Phase 2|Phase 3)

In [89]:
# Trials whose phase is exactly 'Phase 3', 'Phase 4' or 'Phase 2|Phase 3',
# reduced to the trial identifier and its phase
Phase3or4 = updated_selection3[
    updated_selection3['Phases'].isin(['Phase 3', 'Phase 4', 'Phase 2|Phase 3'])
]
Phase3or4 = Phase3or4.loc[:, ['NCT Number', 'Phases']]

In [90]:
updated_selection3['Phases'].value_counts()

Phase 2            93
Phase 3            37
Phase 1            17
Phase 1|Phase 2    17
Phase 4             5
Not Applicable      2
Phase 2|Phase 3     1
Name: Phases, dtype: int64

In [91]:
final_selection = updated_selection4.merge(Phase3or4, on='NCT Number')

In [92]:
len(final_selection)

20

In [94]:
final_selection.to_csv('final_selection_lung_trials.csv')

In [93]:
final_selection

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,NCT Number,drugs_in_arm1,drugs_in_arm2,arm1_has_drug_combo,arm2_has_drug_combo,Total drugs_in_arm1,1L drugs_in_arm1,2L drugs_in_arm1,Total drugs_in_arm2,1L drugs_in_arm2,2L drugs_in_arm2,Phases
0,10,10,NCT02613507,nivolumab,docetaxel,False,False,7554,1760,3906,2102,330,975,Phase 3
1,12,12,NCT02075840,alectinib,crizotinib,False,False,379,144,117,763,525,153,Phase 3
2,17,17,NCT00391274,pemetrexed,docetaxel,False,False,2670,1037,1126,2102,330,975,Phase 3
3,22,22,NCT00441922,docetaxel,vinorelbine,False,False,2102,330,975,1072,194,235,Phase 3
4,28,28,NCT00874419,erlotinib,"gemcitabine, carboplatin",False,True,4655,2945,1026,1860,1162,429,Phase 3
5,29,29,NCT01905657,pembrolizumab,docetaxel,False,False,3453,2259,836,2102,330,975,Phase 2|Phase 3
6,34,34,NCT01642004,nivolumab,docetaxel,False,False,7554,1760,3906,2102,330,975,Phase 3
7,35,35,NCT02008227,atezolizumab,docetaxel,False,False,623,142,295,2102,330,975,Phase 3
8,38,38,NCT02838420,alectinib,crizotinib,False,False,379,144,117,763,525,153,Phase 3
9,58,58,NCT00442026,docetaxel,gemcitabine,False,False,2102,330,975,2264,407,781,Phase 3


In [99]:
# Misleading trial, has entry "Phase 3", but title says "Phase 2"
initial_selection[initial_selection['NCT Number']=='NCT00441922'].T

Unnamed: 0,640
Rank,641
NCT Number,NCT00441922
Title,Trial of Docetaxel Versus Vinorelbine as 1st L...
Acronym,
Status,Completed
Study Results,No Results Available
Conditions,Non Small Cell Lung Cancer
Interventions,Drug: Docetaxel|Drug: Vinorelbine
Outcome Measures,Overall survival between the two treatment arm...
Sponsor/Collaborators,Hellenic Oncology Research Group|University Ho...
