In [1]:
import pandas as pd

In [2]:
from extract import extracted_data_analyzer as eda, aact_querier as aq


In [3]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
def breakdown(df, colname):
    brk=df[colname].value_counts().reset_index()
    brk.columns=[colname,"study_count"]
    brk['study_percent']=brk.study_count/brk.study_count.sum()
    return brk

In [None]:
## Explore cancer trials

In [5]:
# All cancer trials
onc_trials=pd.read_csv("extract/extracted_data/ct_fb_parser_data.csv", index_col=0)

In [6]:
onc_trials.rename(columns={'#nct_id':'nct_id'}, inplace=True)
onc_trials.head()

Unnamed: 0,nct_id,title,has_us_facility,conditions,eligibility_criteria,start_date,lead_sponsor,summary,overall_status,phase,enrollment,enrollment_type,study_type,number_of_arms,number_of_groups,why_stopped,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export
0,NCT00000105,Vaccination With Tetanus and KLH to Assess Imm...,True,Cancer,Inclusion Criteria:\n\nPatients must have a di...,2002-07-31,"Masonic Cancer Center, University of Minnesota",The purpose of this study is to learn how the ...,Terminated,,112.0,Actual,Observational,,3.0,Replaced by another study.,True,,,,,
1,NCT00000124,Collaborative Ocular Melanoma Study (COMS),False,Choroid Neoplasms|Uveitis,Men and women eligible for the study must be a...,1986-11-30,National Eye Institute (NEI),To evaluate therapeutic interventions for pati...,Unknown status,Phase 3,,,Interventional,,,,,,,,,
2,NCT00000603,Cord Blood Stem Cell Transplantation Study (CO...,False,"Anemia, Aplastic|Fanconi Anemia|Hematologic Di...",Mothers of Infant-donors must complete a medic...,1996-09-30,"National Heart, Lung, and Blood Institute (NHLBI)","To evaluate if HLA-mismatched, unrelated-donor...",Completed,Phase 2,,,Interventional,,,,,,,,,
3,NCT00000611,Women's Health Initiative (WHI),False,Bone Diseases|Breast Neoplasms|Cardiovascular ...,Postmenopausal women ages 50 to 79.,,"National Heart, Lung, and Blood Institute (NHLBI)","To address cardiovascular disease, cancer, and...",Completed,Phase 3,,,Interventional,,,,,,,,,
4,NCT00000764,Chemoprevention of Anal Neoplasia Arising Seco...,True,Anus Neoplasms|HIV Infections,Inclusion Criteria\n\nConcurrent Medication:\n...,,National Institute of Allergy and Infectious D...,"PRIMARY: In Phase I, to define a broadly toler...",Completed,Phase 1,98.0,,Interventional,,,,,,,,,


In [None]:
onc_trials['matches'] = ct['names'].apply(lambda x: process.extractOne(x, group['names'].to_list()))

In [36]:
cancer_trial_count=onc_trials.shape[0]
print(f"{cancer_trial_count} cancer trials.")

54404 cancer trials.


In [37]:
onc_trials.columns

Index(['nct_id', 'title', 'has_us_facility', 'conditions',
       'eligibility_criteria', 'start_date', 'lead_sponsor', 'summary',
       'overall_status', 'phase', 'enrollment', 'enrollment_type',
       'study_type', 'number_of_arms', 'number_of_groups', 'why_stopped',
       'has_dmc', 'is_fda_regulated_drug', 'is_fda_regulated_device',
       'is_unapproved_device', 'is_ppsd', 'is_us_export'],
      dtype='object')

In [93]:
breakdown(onc_trials, "phase")

Unnamed: 0,phase,study_count,study_percent
0,Phase 2,13283,0.304866
1,Not Applicable,11757,0.269842
2,Phase 1,8162,0.187331
3,Phase 3,4547,0.104361
4,Phase 1/Phase 2,3467,0.079573
5,Phase 4,1063,0.024398
6,Early Phase 1,681,0.01563
7,Phase 2/Phase 3,610,0.014


In [97]:
breakdown(onc_trials, "study_type")

Unnamed: 0,study_type,study_count,study_percent
0,Interventional,43571,0.800879
1,Observational,9654,0.17745
2,Observational [Patient Registry],1019,0.01873
3,Expanded Access,160,0.002941


In [98]:
breakdown(onc_trials, "overall_status")

Unnamed: 0,overall_status,study_count,study_percent
0,Completed,23604,0.433865
1,Recruiting,10866,0.199728
2,Unknown status,6471,0.118943
3,Terminated,4555,0.083725
4,"Active, not recruiting",3987,0.073285
5,Not yet recruiting,2502,0.045989
6,Withdrawn,1694,0.031137
7,Enrolling by invitation,322,0.005919
8,Suspended,243,0.004467
9,No longer available,75,0.001379


In [99]:
breakdown(onc_trials, "why_stopped")

Unnamed: 0,why_stopped,study_count,study_percent
0,Slow accrual,127,0.023167
1,low accrual,78,0.014228
2,slow accrual,74,0.013499
3,Low accrual,64,0.011675
4,Poor accrual,44,0.008026
...,...,...,...
4013,The CYPRESS-2 trial was closed early after the...,1,0.000182
4014,Identical study Biomed 777-CLP-029 did not mee...,1,0.000182
4015,Administrative reasons.,1,0.000182
4016,The PI is dealing with recurrent cancer and mu...,1,0.000182


In [100]:
breakdown(onc_trials, "is_fda_regulated_drug")

Unnamed: 0,is_fda_regulated_drug,study_count,study_percent
0,False,17286,0.714357
1,True,6912,0.285643


In [101]:
breakdown(onc_trials, "is_fda_regulated_device")

Unnamed: 0,is_fda_regulated_device,study_count,study_percent
0,False,23136,0.956626
1,True,1049,0.043374


In [127]:
## Sponsor Mapping

In [128]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [222]:
sponsor_match_dict={
    "AbbVie":{"type":"Pharma"},
    "Amgen":{"type":"Pharma"},
    "M.D. Anderson Cancer Center":{"type":"Cancer Center"},
    "Memorial Sloan Kettering Cancer Center":{"type":"Cancer Center"},
    "Roche":{"type":"Pharma"},
    "Novartis":{"type":"Pharma"},
    "Roche":{"type":"Pharma"},
    "Mayo":{"type":"Academic Medical Center","formal_name":"Mayo Clinic"},
    "Eli Lilly":{"type":"Pharma"},
    "Dana-Farber":{"type":"Cancer Center"},
    "Pfizer":{"type":"Pharma"},
    "Mass General":{"type":"Academic Medical Center"},
    "Bristol-Myers Squibb":{"type":"Pharma"},
    "AstraZeneca":{"type":"Pharma"},
    "Bayer":{"type":"Pharma"},
    "Boehringer Ingelheim":{"type":"Biotech"},
    "EQRx":{"type":"Biotech"},
    "Gilead":{"type":"Biotech"},
    "Sanofi":{"type":"Pharma"},
    "GlaxoSmithKline":{"type":"Pharma"},
    "ImmunoGen":{"type":"Biotech"},
    "Janssen":{"type":"Pharma"},
    "Merck":{"type":"Pharma"},
    "OneOncology":{"type":"Health System"},
    "Genentech":{"type":"Biotech"},
    "Takeda":{"type":"Pharma"},
    "University":{"type":"University"},
    
    
    "National Heart, Lung, and Blood Institute":{"type":"Government"},
    "National Cancer Institute":{"type":"Government"},
    "National":{"type":"Government"},
    "National Institute":{"type":"Government"},
    "Cancer Center":{"type":"Cancer Center"},
    "Oncology Group":{"type":"Oncology Group"},
    "St. Vincent Medical Center - Los Angeles":{"type":"Hospital"},
    "Medical Center":{"type":"Hospital"},
                                                
    "Shenzhen SiBiono GeneTech Co.,Ltd":{"type":"Biotech"},
    "European Organisation for Research and Treatment of Cancer - EORTC":{"type":"Government"},
    "Cancer Research UK":{"type":"Research Group"},

    "ECOG-ACRIN Cancer Research Group":{"type":"Research Group"},
    "Swiss Group for Clinical Cancer Research":{"type":"Research Group"},
    "Research Group":{"type":"Research Group"},
    "Cancer Research":{"type":"Research Group"},
    "General Hospital":{"type":"Hospital"},
    
    "University of Rochester":{"type":"University"},
    "Li Liu":{"type":"Individual"},
    "BioLeaders Corporation":{},
    "Regionalt Cancercentrum Väst":{},
    "Centre Hospitalier d'Abbeville":{},
    "Nanjing Leads Biolabs Co.,Ltd":{},
    "NRG Oncology":{},
    "Immunicon":{},
    
}
### LOOK INTO WHETHER CT.GOV HAS SPONSOR TYPE ALREADY!!!

In [7]:
sponsor_df=pd.DataFrame(sponsor_match_dict).transpose()
sponsor_df.head()

NameError: name 'sponsor_match_dict' is not defined

In [202]:
breakdown(onc_trials, "lead_sponsor").head()

Unnamed: 0,lead_sponsor,study_count,study_percent
0,National Cancer Institute (NCI),1959,0.036008
1,M.D. Anderson Cancer Center,1371,0.0252
2,Memorial Sloan Kettering Cancer Center,1052,0.019337
3,Hoffmann-La Roche,553,0.010165
4,AstraZeneca,491,0.009025


In [203]:
onc_trials['matches'] = onc_trials['lead_sponsor'].apply(lambda x: process.extractOne(x, sponsor_df.index.tolist()))

In [204]:
onc_trials['sponsor_name_match']=onc_trials.matches.apply(lambda x: x[0])
onc_trials['sponsor_match_score']=onc_trials.matches.apply(lambda x: x[1])

In [205]:
onc_trials[['lead_sponsor','sponsor_name_match','sponsor_match_score']][onc_trials.sponsor_match_score==86].iloc[0].lead_sponsor

'Mid-Atlantic Oncology Program'

In [215]:
onc_trials[['lead_sponsor','sponsor_name_match','sponsor_match_score']][onc_trials.sponsor_match_score==78].lead_sponsor.value_counts()

NRG Oncology                              27
Immunicon                                  2
Associazione Progetto Oncologia UMAN.A     1
Name: lead_sponsor, dtype: int64

In [207]:
onc_trials[['lead_sponsor','sponsor_name_match','sponsor_match_score']][onc_trials.sponsor_match_score>=75].shape[0]/onc_trials.shape[0]

0.7158480994044556

In [218]:
onc_trials[['lead_sponsor','sponsor_name_match','sponsor_match_score']].sponsor_name_match.value_counts().head(10)

University                                16115
Roche                                      3083
National Cancer Institute                  2749
Cancer Center                              2705
M.D. Anderson Cancer Center                2334
Medical Center                             2279
Oncology Group                             2044
National                                   2021
Memorial Sloan Kettering Cancer Center     1893
National Institute                         1298
Name: sponsor_name_match, dtype: int64

In [220]:
onc_trials[['lead_sponsor','sponsor_name_match','sponsor_match_score']][(onc_trials.sponsor_name_match=='Roche')&(onc_trials.sponsor_match_score>=75)].lead_sponsor.value_counts()

Hoffmann-La Roche          553
University of Rochester     89
Name: lead_sponsor, dtype: int64

In [136]:
onc_trials.head()

Unnamed: 0,nct_id,title,has_us_facility,conditions,eligibility_criteria,start_date,lead_sponsor,summary,overall_status,phase,enrollment,enrollment_type,study_type,number_of_arms,number_of_groups,why_stopped,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,matches,sponsor_name_match,sponsor_match_score
0,NCT00000105,Vaccination With Tetanus and KLH to Assess Imm...,True,Cancer,Inclusion Criteria:\n\nPatients must have a di...,2002-07-31,"Masonic Cancer Center, University of Minnesota",The purpose of this study is to learn how the ...,Terminated,,112.0,Actual,Observational,,3.0,Replaced by another study.,True,,,,,,"(University, 90)",University,90
1,NCT00000124,Collaborative Ocular Melanoma Study (COMS),False,Choroid Neoplasms|Uveitis,Men and women eligible for the study must be a...,1986-11-30,National Eye Institute (NEI),To evaluate therapeutic interventions for pati...,Unknown status,Phase 3,,,Interventional,,,,,,,,,,"(Novartis, 45)",Novartis,45
2,NCT00000603,Cord Blood Stem Cell Transplantation Study (CO...,False,"Anemia, Aplastic|Fanconi Anemia|Hematologic Di...",Mothers of Infant-donors must complete a medic...,1996-09-30,"National Heart, Lung, and Blood Institute (NHLBI)","To evaluate if HLA-mismatched, unrelated-donor...",Completed,Phase 2,,,Interventional,,,,,,,,,,"(Novartis, 53)",Novartis,53
3,NCT00000611,Women's Health Initiative (WHI),False,Bone Diseases|Breast Neoplasms|Cardiovascular ...,Postmenopausal women ages 50 to 79.,,"National Heart, Lung, and Blood Institute (NHLBI)","To address cardiovascular disease, cancer, and...",Completed,Phase 3,,,Interventional,,,,,,,,,,"(Novartis, 53)",Novartis,53
4,NCT00000764,Chemoprevention of Anal Neoplasia Arising Seco...,True,Anus Neoplasms|HIV Infections,Inclusion Criteria\n\nConcurrent Medication:\n...,,National Institute of Allergy and Infectious D...,"PRIMARY: In Phase I, to define a broadly toler...",Completed,Phase 1,98.0,,Interventional,,,,,,,,,,"(AstraZeneca, 47)",AstraZeneca,47


In [113]:
## New function to plot distributions as time-series.  Maybe top 5 only over time.  Total num version and relative percent stacked chared

In [112]:
"2002-07-31".left(4)

AttributeError: 'str' object has no attribute 'left'

In [108]:
onc_trials[['nct_id','start_date']].dropna().head(20)
onc_trials['year']=onc_tri

Unnamed: 0,nct_id,start_date
0,NCT00000105,2002-07-31
1,NCT00000124,1986-11-30
2,NCT00000603,1996-09-30
5,NCT00001150,1976-10-31
6,NCT00001158,1977-04-30
7,NCT00001160,1977-06-01
8,NCT00001163,1985-04-02
9,NCT00001165,1978-09-30
10,NCT00001167,1979-08-31
11,NCT00001171,1979-07-31


In [43]:
## Compare document links with queried trials

In [7]:
doc_json=eda.read_doc_links()

In [None]:
len(doc_json)

In [8]:
len(doc_json)

17547

In [11]:
doc_items = list(doc_json.items())

In [15]:
doc_df = pd.DataFrame(doc_json).transpose()

In [16]:
doc_df.head()

Unnamed: 0,PROT,ICF,SAP
NCT00000105,,,
NCT00000124,,,
NCT00000603,,,
NCT00000611,,,
NCT00000764,,,


In [17]:
doc_df[~doc_df['PROT'].isna()].PROT.shape[0]

16

In [18]:
 doc_df[~doc_df['ICF'].isna()].ICF.shape[0]

3

In [19]:
doc_df[~doc_df['SAP'].isna()].SAP.shape[0]

1