In [1]:
import requests
import json
from pathlib import Path

In [2]:
def fetch_clinical_trials(query="cancer", page_size=100, max_studies=500):
    """
    Fetch studies from ClinicalTrials.gov v2 API with pagination.

    Args:
        query: Search expression sent as the ``query.term`` request parameter.
        page_size: Number of studies requested per page (``pageSize``).
        max_studies: Upper bound on the number of studies returned.

    Returns:
        A list of raw study objects (as decoded from the JSON response),
        containing at most ``max_studies`` items.

    Raises:
        requests.HTTPError: If a page request returns an error status
            (raised by ``resp.raise_for_status()``). Other ``requests``
            exceptions (e.g. timeouts) propagate unchanged.
    """
    base_url = "https://clinicaltrials.gov/api/v2/studies"

    studies = []
    next_page_token = None

    while len(studies) < max_studies:
        params = {
            "query.term": query,
            "pageSize": page_size,
            "format": "json"
        }
        # The token is only sent from the second page onwards.
        if next_page_token:
            params["pageToken"] = next_page_token

        resp = requests.get(base_url, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        # `or []` also covers a response where "studies" is present but null.
        batch = data.get("studies") or []
        studies.extend(batch)

        print(f"Fetched {len(studies)} studies so far...")

        next_page_token = data.get("nextPageToken")
        # Stop when there is no further page, or when a page came back empty:
        # an empty page makes no progress towards max_studies, so continuing
        # could loop indefinitely if the server keeps returning a token.
        if not batch or not next_page_token:
            break

    # The last page may overshoot max_studies; trim to the requested bound.
    return studies[:max_studies]

In [15]:
# Search expression: one cancer-related term AND one treatment-related term.
# NOTE(review): how the API interprets AND/OR inside query.term is not shown
# here — confirm against the API documentation.
trial_query = "".join([
    '(cancer OR oncology OR tumor OR tumour) ',
    'AND (drug OR therapy OR treatment OR resistance OR targeted)',
])

# Pull up to 1500 raw studies, 100 per page.
raw_trials = fetch_clinical_trials(
    query=trial_query,
    page_size=100,
    max_studies=1500,
)

print("Total raw trials fetched:", len(raw_trials))

Fetched 100 studies so far...
Fetched 200 studies so far...
Fetched 300 studies so far...
Fetched 400 studies so far...
Fetched 500 studies so far...
Fetched 600 studies so far...
Fetched 700 studies so far...
Fetched 800 studies so far...
Fetched 900 studies so far...
Fetched 1000 studies so far...
Fetched 1100 studies so far...
Fetched 1200 studies so far...
Fetched 1300 studies so far...
Fetched 1400 studies so far...
Fetched 1500 studies so far...
Total raw trials fetched: 1500


In [9]:
# Utility functions: helpers used when converting raw trial JSON into clinical trial records.

def safe_get(d, *keys, default=None):
    """Walk nested dicts along ``keys`` and return the value found.

    Returns ``default`` as soon as the current node is not a dict, or the
    looked-up value is missing or ``None``. With no keys, ``d`` itself is
    returned unchanged.
    """
    node = d
    for key in keys:
        # A non-dict node cannot be descended into; treat it like a miss.
        node = node.get(key) if isinstance(node, dict) else None
        if node is None:
            return default
    return node

def as_list(x):
    """Coerce ``x`` to a list.

    A list is returned as-is (same object), ``None`` becomes an empty list,
    and any other value is wrapped in a one-element list.
    """
    if isinstance(x, list):
        return x
    return [] if x is None else [x]

In [10]:
def looks_cancer_related(study, cancer_terms=None):
    """Return True if the study's descriptive text mentions a cancer term.

    The text searched is the brief title, official title, listed conditions,
    brief summary and detailed description found under ``protocolSection``.
    Matching is a case-insensitive substring test, so a term also matches
    inside longer words or unrelated phrases (e.g. "tumor" in
    "Tumor Necrosis Factor").

    Args:
        study: Raw study object (a dict). Missing or null modules and fields
            are treated as empty rather than raising.
        cancer_terms: Optional iterable of terms to look for. Defaults to the
            built-in list of cancer-related words.

    Returns:
        bool: True when at least one term occurs in the combined text.
    """
    if cancer_terms is None:
        cancer_terms = (
            "cancer", "oncology", "tumor", "tumour", "carcinoma",
            "leukemia", "lymphoma", "melanoma", "glioma", "sarcoma",
            "neoplasm",
        )

    def _module(parent, key):
        # A key that is absent, null, or not a dict is treated as an empty module.
        value = parent.get(key) if isinstance(parent, dict) else None
        return value if isinstance(value, dict) else {}

    protocol = _module(study, "protocolSection")
    ident = _module(protocol, "identificationModule")
    cond_mod = _module(protocol, "conditionsModule")
    desc_mod = _module(protocol, "descriptionModule")

    conditions = cond_mod.get("conditions") or []
    # A single bare value (e.g. one string) must be wrapped; otherwise a
    # string would be split into individual characters and never match.
    if not isinstance(conditions, (list, tuple)):
        conditions = [conditions]

    text_parts = [
        ident.get("briefTitle"),
        ident.get("officialTitle"),
        *conditions,
        desc_mod.get("briefSummary"),
        desc_mod.get("detailedDescription"),
    ]

    # Skip empty/None parts and stringify the rest so join() cannot fail.
    blob = " ".join(str(part) for part in text_parts if part).lower()

    return any(str(term).lower() in blob for term in cancer_terms)

In [16]:
# Keep only the fetched studies that pass the local keyword check.
raw_trials_filtered = list(filter(looks_cancer_related, raw_trials))

print("Raw fetched:", len(raw_trials))
print("Cancer-related after local filter:", len(raw_trials_filtered))

Raw fetched: 1500
Cancer-related after local filter: 1298


In [18]:
# Building records

# Substrings (matched case-insensitively) that mark a trial as cancer-related.
# Defined once here rather than being rebuilt for every study.
_RECORD_CANCER_TERMS = (
    "cancer", "oncology", "tumor", "tumour", "carcinoma",
    "leukemia", "lymphoma", "melanoma", "glioma", "sarcoma", "neoplasm",
)


def _build_trial_record(study):
    """Convert one raw study into a search record, or return None to skip it.

    A study is skipped when it has no NCT id, or when none of the cancer
    terms occurs in its title, conditions, brief summary, or the first
    2000 characters of its detailed description.

    Args:
        study: Raw study dict containing a ``protocolSection``.

    Returns:
        dict with keys ``id``, ``entity_type``, ``identifier``, ``name``,
        ``search_text`` and ``metadata``; or None if the study is skipped.
    """
    # `or {}` treats a module that is present but null like a missing one.
    protocol = study.get("protocolSection") or {}

    ident = protocol.get("identificationModule") or {}
    status_mod = protocol.get("statusModule") or {}
    design_mod = protocol.get("designModule") or {}
    cond_mod = protocol.get("conditionsModule") or {}
    arms_mod = protocol.get("armsInterventionsModule") or {}
    desc_mod = protocol.get("descriptionModule") or {}
    sponsor_mod = protocol.get("sponsorCollaboratorsModule") or {}

    nct_id = ident.get("nctId")
    if not nct_id:
        return None

    brief_title = ident.get("briefTitle") or ident.get("officialTitle") or "Untitled trial"
    conditions = as_list(cond_mod.get("conditions"))
    brief_summary = desc_mod.get("briefSummary")
    detailed_desc = desc_mod.get("detailedDescription")

    # Run the cancer check before assembling the search text, so no work is
    # spent on studies that are going to be dropped.
    text_for_cancer_check = " ".join([
        brief_title or "",
        " ".join(conditions or []),
        brief_summary or "",
        (detailed_desc or "")[:2000]
    ]).lower()
    if not any(term in text_for_cancer_check for term in _RECORD_CANCER_TERMS):
        return None

    overall_status = status_mod.get("overallStatus")
    phase_list = as_list(design_mod.get("phases"))

    interventions = []
    for iv in as_list(arms_mod.get("interventions")):
        if not isinstance(iv, dict):
            # Malformed entry: nothing to read a name/type from.
            continue
        name = iv.get("name")
        iv_type = iv.get("type")
        if name and iv_type:
            interventions.append(f"{name} ({iv_type})")
        elif name:
            interventions.append(name)

    lead_sponsor = safe_get(sponsor_mod, "leadSponsor", "name")

    # Build strong search text
    parts = [f"Clinical trial: {brief_title}."]
    if overall_status:
        parts.append(f"Status: {overall_status}.")
    if phase_list:
        parts.append("Phase: " + ", ".join(phase_list) + ".")
    if conditions:
        parts.append("Conditions: " + "; ".join(conditions[:10]) + ".")
    if interventions:
        parts.append("Interventions: " + "; ".join(interventions[:15]) + ".")
    if lead_sponsor:
        parts.append(f"Sponsor: {lead_sponsor}.")
    # Prefer the brief summary; fall back to a truncated detailed description.
    if brief_summary:
        parts.append(f"Summary: {brief_summary}")
    elif detailed_desc:
        parts.append(f"Summary: {detailed_desc[:1200]}")

    return {
        "id": f"ClinicalTrial::{nct_id}",
        "entity_type": "ClinicalTrial",
        "identifier": nct_id,
        "name": brief_title,
        "search_text": " ".join(parts),
        "metadata": {
            "source": "clinicaltrials",
            "status": overall_status,
            "phase": phase_list,
            "conditions": conditions,
            "interventions": interventions,
            "sponsor": lead_sponsor,
            "url": f"https://clinicaltrials.gov/study/{nct_id}"
        }
    }


clinical_trials_records = [
    record
    for record in map(_build_trial_record, raw_trials_filtered)
    if record is not None
]

print("Built clinical trial records:", len(clinical_trials_records))
# Guard the preview: indexing [0] would raise IndexError if nothing was built.
if clinical_trials_records:
    print(clinical_trials_records[0]["id"])
    print(clinical_trials_records[0]["search_text"][:500])

Built clinical trial records: 1294
ClinicalTrial::NCT05373134
Clinical trial: Efficacy and Safety of Pentoxifylline in Improving Oxygenation in Hepatopulmonary Syndrome. Status: UNKNOWN. Phase: NA. Conditions: Hepatopulmonary Syndrome. Interventions: Pentoxifylline (DRUG); Placebo (OTHER). Sponsor: Institute of Liver and Biliary Sciences, India. Summary: The triad of liver disease, arterial hypoxia, and extensive pulmonary vascular dilatation is known as the hepatopulmonary syndrome (HPS). The prevalence of this syndrome ranges from 10% to 30% in people wi


In [19]:
# Preview the first 20 records: name, conditions, then a separator line.
for r in clinical_trials_records[:20]:
    print(r["name"], r["metadata"].get("conditions", []), "---", sep="\n")

Efficacy and Safety of Pentoxifylline in Improving Oxygenation in Hepatopulmonary Syndrome
['Hepatopulmonary Syndrome']
---
Paclitaxel-Carboplatin Alone or With M2ES for Non-Small-Cell Lung Cancer
['Non Small Cell Lung Cancer']
---
ATLCAR.CD30.CCR4 for CD30+ HL ATLCAR.CD30.CCR4 Cells
['Hodgkin Lymphoma', 'Relapse', 'Refractory']
---
CD7 CAR-T in Adults With Relapsed or Refractory T-LBL/ALL Clinical Study
['T-lymphoblastic Lymphoma', 'T-ALL']
---
Effect of Dual-Task Training on Pediatric Oncology Patients
['Cancer']
---
Safety and Efficacy of ALD518 for Reducing Oral Mucositis in Head and Neck Cancer Subjects
['Oral Mucositis']
---
CIFeR - A Clinician-led Intervention to Address Fear of Cancer Recurrence
['Fear of Cancer Recurrence']
---
Secukinumab in Tumor Necrosis Factor (TNF) - Inadequate Response (IR) Psoriasis Participants.
['Psoriasis']
---
MPDL3280A-treatment-IST-UMCG
['Locally Advanced or Metastatic Solid Tumors']
---
Biomarker Analysis of Tislelizumab Combined With Chemotherap

In [None]:
out_json = Path("data/records/clinical_trials_records_cancer_mvp.json")

# open(..., "w") does not create missing parent directories, so make sure
# data/records/ exists before writing (no-op if it is already there).
out_json.parent.mkdir(parents=True, exist_ok=True)

# Add source to metadata for every record
for r in clinical_trials_records:
    if "metadata" not in r or r["metadata"] is None:
        r["metadata"] = {}
    r["metadata"]["source"] = "clinicaltrials"

# Explicit encoding so the file does not depend on the platform default.
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(clinical_trials_records, f, indent=2)

print("Saved:", out_json)

Saved: /Users/shaunak/Documents/Hacklytics2026/clinical_trials_records_cancer_mvp.json
