In [22]:
import os
import time
import math
import json
import re
from typing import Any, Dict, List, Optional, Tuple

import requests
import pandas as pd
from tqdm.auto import tqdm

pd.set_option("display.max_colwidth", 200)

SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "biotech-ingestion-exploration/0.1 (contact: your-email@example.com)"})

def _get(url: str, params: Optional[dict] = None, timeout: int = 30) -> dict:
    r = SESSION.get(url, params=params, timeout=timeout)
    r.raise_for_status()
    # Most endpoints here are JSON; if not, we handle elsewhere.
    return r.json()

def pretty(obj: Any, max_len: int | None = None) -> None:
    s = json.dumps(obj, indent=2, ensure_ascii=False) 
    if max_len: 
        s = s[:max_len]
        print(s + ("\n... (truncated)" if len(s) > max_len else ""))   
    else:
        print(s)

def to_df(rows: List[dict], cols: Optional[List[str]] = None) -> pd.DataFrame:
    df = pd.DataFrame(rows)
    if cols:
        for c in cols:
            if c not in df.columns:
                df[c] = None
        df = df[cols]
    return df

In [3]:
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

def pubmed_esearch(query: str, retmax: int = 20, retstart: int = 0, api_key: Optional[str] = None) -> dict:
    """
    Returns PubMed IDs for a query.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": retmax,
        "retstart": retstart,
        "sort": "relevance",
    }
    if api_key:
        params["api_key"] = api_key
    return _get(f"{NCBI_BASE}/esearch.fcgi", params=params)

def pubmed_esummary(pmids: List[str], api_key: Optional[str] = None) -> dict:
    """
    Lightweight metadata for a list of PubMed IDs.
    """
    params = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "json",
    }
    if api_key:
        params["api_key"] = api_key
    return _get(f"{NCBI_BASE}/esummary.fcgi", params=params)

def pubmed_efetch_xml(pmids: List[str], api_key: Optional[str] = None) -> str:
    """
    Full PubMed record XML (useful for deep parsing of abstracts, MeSH, registry IDs, etc.)
    """
    params = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "xml",
    }
    if api_key:
        params["api_key"] = api_key
    r = SESSION.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=30)
    r.raise_for_status()
    return r.text

def pubmed_search(query: str, n: int = 20, api_key: Optional[str] = None) -> pd.DataFrame:
    """
    Convenience: run search + summary, return a tidy DataFrame.
    """
    es = pubmed_esearch(query, retmax=n, api_key=api_key)
    pmids = es.get("esearchresult", {}).get("idlist", [])
    if not pmids:
        return pd.DataFrame()
    summ = pubmed_esummary(pmids, api_key=api_key)
    result = summ.get("result", {})
    uids = result.get("uids", [])
    rows = []
    for uid in uids:
        item = result.get(uid, {})
        rows.append({
            "pmid": uid,
            "title": item.get("title"),
            "pubdate": item.get("pubdate"),
            "source": item.get("source"),  # journal
            "authors": ", ".join([a.get("name","") for a in item.get("authors", []) if a.get("name")]),
            "doi": next((x.get("value") for x in item.get("articleids", []) if x.get("idtype") == "doi"), None),
        })
    return pd.DataFrame(rows)

In [4]:
pubmed_query = '("carbon 60"[TIAB] OR C60[TIAB] OR buckminsterfullerene[TIAB]) AND (human*[TIAB] OR trial[TIAB] OR "case report"[PT])'
df_pubmed = pubmed_search(pubmed_query, n=10)
df_pubmed

Unnamed: 0,pmid,title,pubdate,source,authors,doi
0,28325349,Nanocrystalline cellulose-fullerene: Novel conjugates.,2017 May 15,Carbohydr Polym,"Herreros-López A, Carini M, Da Ros T, Carofiglio T, Marega C, La Parola V, Rapozzi V, Xodo LE, Alshatwi AA, Hadad C, Prato M",10.1016/j.carbpol.2017.01.068
1,17309168,Fullerenes in biomedicine.,2006 Oct-Dec,J BUON,"Djordjević A, Bogdanović G, Dobrić S",
2,21413269,Mycobacteria and Nocardia.,1996,,"Baron S, McMurray DN",
3,35051749,Synthesis and biological application of glyco- and peptide derivatives of fullerene C(60).,2022 Feb 15,Eur J Med Chem,"Tanzi L, Terreni M, Zhang Y",10.1016/j.ejmech.2022.114104
4,18217343,Toxicity studies of fullerenes and derivatives.,2007,Adv Exp Med Biol,"Kolosnjaj J, Szwarc H, Moussa F",10.1007/978-0-387-76713-0_13
5,38744728,"Efficient detection of nitric oxide a biomarker associated with COVID19 via N, P co-doped C(60) fullerene: a computational study.",2024 May 14,J Mol Model,"Khan AA, Ahmad R, Mehmood F, Ahmad I",10.1007/s00894-024-05954-9
6,36643955,"Anti-Inflammatory and Antioxidant Effects of Liposoluble C60 at the Cellular, Molecular, and Whole-Animal Levels.",2023,J Inflamm Res,"Hui M, Jia X, Li X, Lazcano-Silveira R, Shi M",10.2147/JIR.S386381
7,35110453,[Development of Bio-active Fullerene Derivatives Suitable for Drug].,2022,Yakugaku Zasshi,Mashino T,10.1248/yakushi.21-00188
8,31765863,"Advances in the application, toxicity and degradation of carbon nanomaterials in environment: A review.",2020 Jan,Environ Int,"Peng Z, Liu X, Zhang W, Zeng Z, Liu Z, Zhang C, Liu Y, Shao B, Liang Q, Tang W, Yuan X",10.1016/j.envint.2019.105298
9,19886744,Role of oxidative damage in toxicity of particulates.,2010 Jan,Free Radic Res,"Møller P, Jacobsen NR, Folkmann JK, Danielsen PH, Mikkelsen L, Hemmingsen JG, Vesterdal LK, Forchhammer L, Wallin H, Loft S",10.3109/10715760903300691


In [5]:
EPMC_BASE = "https://www.ebi.ac.uk/europepmc/webservices/rest"

def epmc_search(query: str, page_size: int = 20, cursor_mark: str = "*", sort: str = "RELEVANCE") -> dict:
    """
    Europe PMC search with cursor-based pagination.
    sort options include: RELEVANCE, DATE, CITED
    """
    # format: https://.../search?query=...&format=json&pageSize=...&cursorMark=...
    params = {
        "query": query,
        "format": "json",
        "pageSize": page_size,
        "cursorMark": cursor_mark,
        "sort": sort,
    }
    return _get(f"{EPMC_BASE}/search", params=params)

def epmc_search_df(query: str, n: int = 20, sort: str = "RELEVANCE") -> pd.DataFrame:
    """
    Fetch up to n results into a DataFrame.
    """
    rows = []
    cursor = "*"
    while len(rows) < n:
        data = epmc_search(query=query, page_size=min(100, n - len(rows)), cursor_mark=cursor, sort=sort)
        hits = data.get("resultList", {}).get("result", []) or []
        if not hits:
            break
        for h in hits:
            rows.append({
                "source": h.get("source"),
                "id": h.get("id"),
                "pmid": h.get("pmid"),
                "pmcid": h.get("pmcid"),
                "doi": h.get("doi"),
                "title": h.get("title"),
                "pubYear": h.get("pubYear"),
                "journalTitle": h.get("journalTitle"),
                "authorString": h.get("authorString"),
                "isOpenAccess": h.get("isOpenAccess"),
                "pubType": h.get("pubType"),
                "citedByCount": h.get("citedByCount"),
            })
        cursor = data.get("nextCursorMark")
        if not cursor:
            break
    return pd.DataFrame(rows)

In [6]:
# --- Cell 6: Europe PMC test ---
epmc_query = '(C60 OR "carbon 60" OR buckminsterfullerene) AND (longevity OR "oxidative stress" OR mitochondria OR "case report")'
df_epmc = epmc_search_df(epmc_query, n=15, sort="DATE")
df_epmc


In [13]:
# --- Cell 7: ClinicalTrials.gov v2 client ---
CTG_BASE = "https://clinicaltrials.gov/api/v2"

def ctg_get(url: str, params: Optional[dict] = None, timeout: int = 30) -> dict:
    r = SESSION.get(url, params=params, timeout=timeout)  # uses your existing SESSION
    if r.status_code == 400:
        # Make debugging easy in notebooks
        raise requests.HTTPError(f"400 Bad Request\nURL: {r.url}\nBody: {r.text[:1000]}", response=r)
    r.raise_for_status()
    return r.json()

def ctg_field_ok(field_name: str) -> bool:
    """
    Checks if a single field name is valid by calling the v2 stats endpoint.
    If invalid, CTG generally 400s.
    """
    url = f"{CTG_BASE}/stats/field/values"
    try:
        _ = ctg_get(url, params={"fields": field_name})
        return True
    except requests.HTTPError:
        return False

def ctg_validate_fields(fields: List[str]) -> Dict[str, bool]:
    """
    Returns {field: True/False} for validity.
    """
    return {f: ctg_field_ok(f) for f in fields}

# A conservative list of fields that are commonly valid piece names.
# (Start minimal; add more only after validating.)
CTG_MIN_FIELDS = [
    "NCTId",
    "BriefTitle",
    "OfficialTitle",
    "OverallStatus",
    "StudyType",
    "Phase",
    "EnrollmentCount",
    "LeadSponsorName",
    # "Conditions", 
    # Invalid 
   # Interventions 
   # Invalid 
    "StartDate",
    "CompletionDate",
]

def ctg_search(query_term: str, page_size: int = 20, page_token: Optional[str] = None,
               fields: Optional[List[str]] = None) -> dict:
    params = {
        "query.term": query_term,
        "pageSize": page_size,
    }
    if page_token:
        params["pageToken"] = page_token
    if fields:
        params["fields"] = ",".join(fields)
    return ctg_get(f"{CTG_BASE}/studies", params=params)

def ctg_search_df(query_term: str, n: int = 50, fields: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Light surface table from CTG. If fields is None, uses CTG_MIN_FIELDS.
    """
    fields = fields or CTG_MIN_FIELDS

    rows = []
    token = None
    while len(rows) < n:
        data = ctg_search(query_term, page_size=min(100, n - len(rows)), page_token=token, fields=fields)
        studies = data.get("studies") or []
        for s in studies:
            p = s.get("protocolSection", {})
            ident = p.get("identificationModule", {})
            status = p.get("statusModule", {})
            design = p.get("designModule", {})
            sponsor = p.get("sponsorsModule", {})
            cond = p.get("conditionsModule", {})
            arms = p.get("armsInterventionsModule", {})

            interventions = arms.get("interventions") or []
            phases = design.get("phases") or []
            enrollment = (design.get("enrollmentInfo") or {}).get("count")

            rows.append({
                "nctId": ident.get("nctId"),
                "briefTitle": ident.get("briefTitle"),
                "officialTitle": ident.get("officialTitle"),
                "overallStatus": status.get("overallStatus"),
                "studyType": design.get("studyType"),
                "phases": ", ".join(phases) if phases else None,
                "enrollment": enrollment,
                "leadSponsor": (sponsor.get("leadSponsor") or {}).get("name"),
                "conditions": "; ".join(cond.get("conditions") or []),
                "interventions": "; ".join([i.get("name","") for i in interventions if i.get("name")]),
                "startDate": (status.get("startDateStruct") or {}).get("date"),
                "completionDate": (status.get("completionDateStruct") or {}).get("date"),
            })

        token = data.get("nextPageToken")
        if not token or not studies:
            break

    return pd.DataFrame(rows)

def ctg_get_study(nct_id: str) -> dict:
    return ctg_get(f"{CTG_BASE}/studies/{nct_id}")

In [14]:
ctg_query = '("carbon 60" OR C60 OR buckminsterfullerene OR fullerene)'
df_ctg = ctg_search_df(ctg_query, n=25)
df_ctg

Unnamed: 0,nctId,briefTitle,officialTitle,overallStatus,studyType,phases,enrollment,leadSponsor,conditions,interventions,startDate,completionDate
0,NCT01213043,Safety and Pharmacokinetics of Alpha-1 Proteinase Inhibitor in Subjects With Alpha1-Antitrypsin Deficiency,A Randomized Double-blind Crossover Study to Assess the Safety and Pharmacokinetics of Two Different Doses of Weekly Intravenous Administration of Alpha1-Proteinase Inhibitor (Human) Prolastin®-C ...,COMPLETED,INTERVENTIONAL,PHASE2,30,,,,2010-11,2012-01
1,NCT03776175,"A Study To Assess Pharmacodynamics, Safety And Tolerability Of PF-05221304 And PF-06865571 Co-Administered For 6 Weeks In Adults With Non-Alcoholic Fatty Liver Disease.","A PHASE 2A, RANDOMIZED, DOUBLE BLIND (SPONSOR-OPEN), PLACEBO CONTROLLED, PARALLEL GROUP STUDY TO ASSESS THE PHARMACODYNAMICS, SAFETY AND TOLERABILITY OF PF-05221304 AND PF-06865571 CO-ADMINISTERED...",COMPLETED,INTERVENTIONAL,PHASE2,99,,,,2019-01-04,2019-10-11
2,NCT01934192,Nutritional Adequacy Therapeutic Enhancement in the Critically Ill. The NUTRIATE Study,"NUTRItional Adequacy Therapeutic Enhancement in the Critically Ill: A Randomized Double Blind, Placebo-controlled Trial of the Motilin Receptor Agonist GSK962040. The NUTRIATE Study",TERMINATED,INTERVENTIONAL,PHASE2,91,,,,2014-04-04,2016-07-08
3,NCT00584324,Depth of Anesthesia on Implicit Memory,The Effect of Depth of Anesthesia on Implicit Memory,COMPLETED,INTERVENTIONAL,,70,,,,2006-03,2012-01
4,NCT06968208,The Efficacy and Safety of Puerarin in Obesity Treatment,Randomized Controlled Trial of Puerarin for Obesity Treatment,NOT_YET_RECRUITING,INTERVENTIONAL,PHASE2,80,,,,2025-08-01,2029-12-31
5,NCT05298202,The Influence of Capsaicin Gel During Exercise Within the Heat,The Influence of Topical Capsaicin on Thermoregulatory and Perceptual Outcomes During Exercise Within the Heat,COMPLETED,INTERVENTIONAL,PHASE4,12,,,,2022-05-25,2022-10-01
6,NCT05096065,Study to Evaluate the Pharmacodynamics and Efficacy of Leuprolide Tablets (Ovarest®) in Women With Endometriosis,An Open-label Dose-finding Study to Evaluate the Pharmacodynamic (PD) Profiles and Efficacy of Different Dosing Regimens of Leuprolide Oral Tablets (Ovarest®) in Women With Endometriosis,UNKNOWN,INTERVENTIONAL,PHASE2,16,,,,2022-03-18,2022-05
7,NCT00664742,The Effect of Fluvastatin XL® Treatment in Patients With Metabolic Syndrome,The Effect Of Fluvastatin XL® Treatment On The Lipid Profile In Patients With Metabolic Syndrome,COMPLETED,INTERVENTIONAL,PHASE4,614,,,,2006-09,2007-10
8,NCT02083068,Evaluation of the Protective Efficacy of a Vaccine Derived From the Synthetic CS Protein of Plasmodium Vivax,Evaluation of the Protective Efficacy of a Vaccine Derived From the Synthetic CS Protein of Plasmodium Vivax,COMPLETED,INTERVENTIONAL,PHASE2,32,,,,2014-08,2017-07
9,NCT07181733,BNP in Pediatric Metabolic Syndrome,"Exploring B-Type Natriuretic Peptide (BNP) Evolution, From the Neonatal to Adolescent Period, as a Tool to Early Identify the Risk for Metabolic Syndrome (MetS) in a Former Preterm and Full Term A...",RECRUITING,OBSERVATIONAL,,70,,,,2024-05-08,2025-12-31


In [15]:
ctg_query = '(creatine OR "creatine monohydrate")'
df_ctg = ctg_search_df(ctg_query, n=25)
df_ctg.head(10)

Unnamed: 0,nctId,briefTitle,officialTitle,overallStatus,studyType,phases,enrollment,leadSponsor,conditions,interventions,startDate,completionDate
0,NCT00996840,SB-681323 IV for Subjects at Risk of Acute Lung Injury or ARDS,"Assessment of the Anti-Inflammatory Activity, Efficacy and Safety of Intravenous SB-681323 in Subjects at Risk for Development of Acute Lung Injury (ALI) or Acute Respiratory Distress Syndrome (AR...",COMPLETED,INTERVENTIONAL,PHASE2,77.0,,,,2009-10-16,2013-02-09
1,NCT00005674,Clinical Trial of Creatine in Amyotrophic Lateral Sclerosis [ALS],,COMPLETED,INTERVENTIONAL,PHASE2,,,,,,
2,NCT04207359,Effects of Creatine Supplementation in Breast Cancer Survivors,The THRIVE Study: An Open-Label Randomized Trial of Exercise ± Creatine Supplementation to Augment the Adaptations of Exercise Training in Breast Cancer Survivors,COMPLETED,INTERVENTIONAL,,20.0,,,,2020-09-30,2023-03-07
3,NCT02771808,Haptoglobin Polymorphism as a Determinant of Adverse Outcome After Cardiac Surgery in Diabetic Patients,Haptoglobin Polymorphism as a Determinant of Adverse Outcome After Cardiac Surgery in Diabetic Patients,COMPLETED,OBSERVATIONAL,,83.0,,,,2010-09,2015-05
4,NCT06599697,The MIGHT Trial - An Exploratory Clinical Trial of IVIG in Anti-HMGCR Immune Mediated Necrotizing Myopathy,The MIGHT Trial - An Exploratory Clinical Trial of Intravenous Immunoglobulin (IVIG) in Anti-3-Hydroxy-3-Methylglutaryl-CoA Reductase (HMGCR) Immune Mediated Necrotizing Myopathy (IMNM),RECRUITING,INTERVENTIONAL,PHASE2,12.0,,,,2025-10-27,2027-06-30
5,NCT07152197,Effects of Resistance Exercises in Hereditary Sensory-Motor Neuropathy (Charcot-Marie-Tooth Disease),"Effects of a Resistance Exercise Training Program on Skeletal Muscle Quality, Functional Capacity, and Quality of Life in Young Individuals With and Without Hereditary Sensorimotor Polyneuropathy",RECRUITING,INTERVENTIONAL,,22.0,,,,2025-09-20,2026-09-20
6,NCT00758810,Impact at One Year of a Secondary Prevention Educational Program on Cardiovascular Risk Factors,"Impact at One Year of a Secondary Prevention Educational Program on Cardiovascular Risk Factors, Daily Physical Activity, Dietary Habits and Blood Glucose and Fatty Acids in Coronary Syndromes Pat...",COMPLETED,OBSERVATIONAL,,354.0,,,,2006-10,2009-06
7,NCT04764448,A Study of Belcesiran in Patients With AATLD,"A Phase 2, Randomized, Double-blind, Placebo-Controlled Study Investigating Safety, Tolerability, Pharmacokinetics and Pharmacodynamics of Two Dose Levels of Belcesiran in Patients With Alpha-1 An...",TERMINATED,INTERVENTIONAL,PHASE2,16.0,,,,2021-02-12,2024-05-29
8,NCT05537948,Efficacy and Safety of Pitavastatin and PCSK9 Inhibitors in Liver Transplant Patients,Efficacy and Safety of Pitavastatin and PCSK9 Inhibitors in Liver Transplant Patients,ACTIVE_NOT_RECRUITING,INTERVENTIONAL,PHASE4,59.0,,,,2021-10-01,2025-01-31
9,NCT03688958,Iodine Supplementation on Breast Cancer,Effect of Dietary Iodine Supplementation on the Proliferation of Breast Cancer,UNKNOWN,INTERVENTIONAL,PHASE2,120.0,,,,2005-03-15,2020-06


In [24]:
# --- Cell 9: Fetch full record (pick an nctId from df_ctg first) ---
if not df_ctg.empty:
    nct = df_ctg.loc[0, "nctId"]
    study = ctg_get_study(nct)
    pretty(study, max_len=50000)
else:
    print("No studies found for that query (try broadening terms).")

{
  "protocolSection": {
    "identificationModule": {
      "nctId": "NCT00996840",
      "orgStudyIdInfo": {
        "id": "111592"
      },
      "organization": {
        "fullName": "GlaxoSmithKline",
        "class": "INDUSTRY"
      },
      "briefTitle": "SB-681323 IV for Subjects at Risk of Acute Lung Injury or ARDS",
      "officialTitle": "Assessment of the Anti-Inflammatory Activity, Efficacy and Safety of Intravenous SB-681323 in Subjects at Risk for Development of Acute Lung Injury (ALI) or Acute Respiratory Distress Syndrome (ARDS)."
    },
    "statusModule": {
      "statusVerifiedDate": "2017-09",
      "overallStatus": "COMPLETED",
      "expandedAccessInfo": {
        "hasExpandedAccess": false
      },
      "startDateStruct": {
        "date": "2009-10-16",
        "type": "ACTUAL"
      },
      "primaryCompletionDateStruct": {
        "date": "2013-02-09",
        "type": "ACTUAL"
      },
      "completionDateStruct": {
        "date": "2013-02-09",
        "ty