In [11]:
from pathlib import Path

from IPython.display import Markdown as md

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib_venn import venn3
import pandas as pd
import seaborn as sns

import numpy as np

from tracking_grants import references_f, articles_f, wos_f, altmetric_f

In [12]:
from tqdm.auto import tqdm
import requests

In [13]:
# Load references
refs = pd.read_csv(references_f, index_col="reference_id")

In [14]:
# Load matched articles
articles = pd.read_csv(articles_f, index_col="DOI")
articles.index = articles.index.str.lower()

In [104]:
import re

In [97]:
def get_pmid(doi):
    burl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={}"
    return requests.get(burl.format(doi))

In [176]:
results = []

baseurl = f"https://clinicaltrials.gov/api/query/study_fields"
pmid_regex = re.compile(r"<Id>(\d+)<\/Id>")

dois = articles.sort_values("coci_citations", ascending=False).index.tolist()
for doi in tqdm(dois[0:100]):
    r = get_pmid(doi)
    match = pmid_regex.search(r.text)
    if match:
        pmid = match.group(1)
    else:
        continue
        
    params = {
        "expr": "{}[PUBMED-IDS]",
        "fields": "NCTId,BriefTitle,Condition,OverallStatus,Phase",
        "min_rnk": 1,
        "max_rnk": None,
        "fmt": "json"
    }
    params['expr'] = params['expr'].format(pmid)
    
    r = requests.get(baseurl, params=params).json()
    r['doi'] = doi
    results.append(r)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [179]:
results[0]

{'StudyFieldsResponse': {'APIVrs': '1.01.02',
  'DataVrs': '2020:06:18 22:05:13.992',
  'Expression': '18798982[PUBMED-IDS]',
  'NStudiesAvail': 343237,
  'NStudiesFound': 0,
  'MinRank': 1,
  'MaxRank': 20,
  'NStudiesReturned': 0,
  'FieldList': ['NCTId', 'BriefTitle', 'Condition', 'OverallStatus', 'Phase']},
 'doi': '10.1186/gb-2008-9-9-r137'}

In [184]:
columns = ['doi', 'PMID', 'n_trials']
parse_cols = ['NCTId', 'OverallStatus', 'Phase', 'BriefTitle', 'Condition']

df = pd.DataFrame(columns=columns+parse_cols)

for r in results:
    if r['StudyFieldsResponse']['NStudiesFound'] > 0:
        doi = r['doi']
        pmid = r['StudyFieldsResponse']['Expression'].split("[")[0]
        n_trials = r['StudyFieldsResponse']['NStudiesFound']
        for _ in r['StudyFieldsResponse']['StudyFields']:
            row = []
            for pc in parse_cols:
                if pc in _:
                    if len(_[pc]) > 0:
                        x = _[pc][0]
                    else:
                        x = None
                    row.append(x)
            df.loc[len(df)+1] = [doi, pmid, n_trials] + row

In [188]:
articles.coci_citations.count()/len(articles)

0.962002682163612

In [197]:
df.Phase.value_counts()

Phase 2           6
Not Applicable    5
Phase 1           4
Phase 3           4
Early Phase 1     1
Name: Phase, dtype: int64

In [189]:
df.describe()

Unnamed: 0,doi,PMID,n_trials,NCTId,OverallStatus,Phase,BriefTitle,Condition
count,27,27,27,27,27,20,27,27
unique,13,13,3,24,6,5,24,20
top,10.1056/nejmoa1207506,22894553,11,NCT03979339,Completed,Phase 2,Combination of Entinostat and Enzalutamide in ...,Prostate Cancer
freq,11,11,11,2,10,6,2,3


Example article with complete clinical trials:

- DOI: https://doi.org/10.1126/science.1168175
- Landing page: https://science.sciencemag.org/content/324/5928/787
- PMID: https://pubmed.ncbi.nlm.nih.gov/19359544/
- Available clinical trials (https://clinicaltrials.gov/search/term=19359544%20%5BPUBMED-IDS%5D):
    - Phase 1: https://clinicaltrials.gov/ct2/show/record/NCT03829930?term=19359544+%5BPUBMED-IDS%5D&draw=2&rank=2
    - Phase 2: https://clinicaltrials.gov/ct2/show/record/NCT03196388?term=19359544+%5BPUBMED-IDS%5D&draw=2&rank=1

Relevant links to play around with clinical trials and pubmedIds

- https://clinicaltrials.gov/ct2/show/NCT01874691
- https://pubmed.ncbi.nlm.nih.gov/20031882/
- https://clinicaltrials.gov/search/term=20031882%20%5BPUBMED-IDS%5D

In [67]:
baseurl = f"https://clinicaltrials.gov/api/query/full_studies"

pmids = articles.pmid.dropna().astype(int)
for pmid in tqdm(pmids):
    params = {
        "expr": "{}[PUBMED-IDS]",
        "min_rnk": 1,
        "max_rnk": None,
        "fmt": "json"
    }
    pmid = 20031882
    params['expr'] = params['expr'].format(pmid)
    
    r = requests.get(baseurl, params=params)
    print(r.json()['FullStudiesResponse']['NStudiesFound'])
    break

HBox(children=(FloatProgress(value=0.0, max=6279.0), HTML(value='')))

1


In [78]:
r.json()['FullStudiesResponse']['FullStudies'][0]['Study']['ProtocolSection']['ReferencesModule']

{'ReferenceList': {'Reference': [{'ReferencePMID': '20031882',
    'ReferenceType': 'background',
    'ReferenceCitation': 'Peterson ED, Roe MT, Rumsfeld JS, Shaw RE, Brindis RG, Fonarow GC, Cannon CP. A call to ACTION (acute coronary treatment and intervention outcomes network): a national effort to promote timely clinical feedback and support continuous quality improvement for acute myocardial infarction. Circ Cardiovasc Qual Outcomes. 2009 Sep;2(5):491-9. doi: 10.1161/CIRCOUTCOMES.108.847145.'},
   {'ReferencePMID': '31567475',
    'ReferenceType': 'derived',
    'ReferenceCitation': 'Song CX, Fu R, Yang JG, Xu HY, Gao XJ, Wang CY, Zheng Y, Jia SB, Dou KF, Yang YJ; CAMI Registry study group. Angiographic characteristics and in-hospital mortality among patients with ST-segment elevation myocardial infarction presenting without typical chest pain: an analysis of China Acute Myocardial Infarction registry. Chin Med J (Engl). 2019 Oct 5;132(19):2286-2291. doi: 10.1097/CM9.00000000000004