In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data path used below lives under
# /content/drive/MyDrive, so this has to run first on a fresh Colab runtime.
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os
import glob
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

from tqdm import tqdm


In [3]:
# Base folder on the mounted Drive that holds the inputs of this notebook.
DATA_DIR = "/content/drive/MyDrive/permanent_data_folder"

# Root directory where the TCGA data was downloaded, and the location of the
# accompanying files.json metadata file.
TCGA_ROOT = f"{DATA_DIR}/TCGA_BRCA_data"
METADATA_PATH = f"{DATA_DIR}/files.json"



In [4]:
# Fail fast when Drive is not mounted or the download folder is missing.
assert os.path.exists(TCGA_ROOT), f"Path does not exist: {TCGA_ROOT}"

# List the directory a single time and reuse it for both the emptiness check
# and the preview below.
top_level_entries = os.listdir(TCGA_ROOT)
assert len(top_level_entries) > 0, "TCGA_ROOT is empty"

print("TCGA root exists and is non-empty.")
print("Top-level entries:", top_level_entries[:5])


TCGA root exists and is non-empty.
Top-level entries: ['19ebd0ec-84fa-492d-ba37-62170f3cb643', '1aa278c3-2868-4a09-8b8e-98b83d3f6427', '1aa289bb-8ac9-4e9e-be61-3164afbe06a2', '1ab2be1b-4a61-480e-aa1b-eafab958f1f7', '1abc11f2-f463-41b7-8e84-4e2197fe917f']


In [5]:
# "**" with recursive=True matches directories as well as files, so the glob
# result is filtered down to regular files only.
tree_pattern = os.path.join(TCGA_ROOT, "**")
all_files = list(filter(os.path.isfile, glob.glob(tree_pattern, recursive=True)))

print("Total files found:", len(all_files))


Total files found: 4749


In [6]:
# Filename markers that identify each modality.
CLINICAL_MARKER = "nationwidechildrens.org_clinical."
RNASEQ_SUFFIX = ".rna_seq.augmented_star_gene_counts.tsv"
SLIDE_SUFFIX = ".svs"

clinical_files, rnaseq_files, slide_files = [], [], []

# One pass over the file list; each test is independent, and the lists keep
# the order in which paths appear in all_files.
for file_path in all_files:
    if file_path.endswith(".xml") and CLINICAL_MARKER in os.path.basename(file_path):
        clinical_files.append(file_path)
    if file_path.endswith(RNASEQ_SUFFIX):
        rnaseq_files.append(file_path)
    if file_path.endswith(SLIDE_SUFFIX):
        slide_files.append(file_path)

print("Clinical XML files:", len(clinical_files))
print("RNA-seq files      :", len(rnaseq_files))
print("Slide (SVS) files  :", len(slide_files))


Clinical XML files: 1079
RNA-seq files      : 531
Slide (SVS) files  : 446


In [7]:
# Show the basename of the first file of every modality to see how each
# kind of file is named.
for label, modality_files in (
    ("Clinical example:", clinical_files),
    ("RNA-seq example  :", rnaseq_files),
    ("Slide example    :", slide_files),
):
    print(label, os.path.basename(modality_files[0]))


Clinical example: nationwidechildrens.org_clinical.TCGA-A8-A07E.xml
RNA-seq example  : f0bc4292-6662-460a-8bf8-a3c9fa2fd7be.rna_seq.augmented_star_gene_counts.tsv
Slide example    : TCGA-A7-A5ZV-01Z-00-DX1.21F2EA4A-4F31-43D6-A036-E20E326AF37E.svs


In [8]:
import re

def get_patient_id_from_name(name: str):
    """
    Return the first TCGA patient barcode (TCGA-XX-YYYY) contained in `name`.

    The match is case-sensitive: two, then four, upper-case letters or digits
    after the literal "TCGA-" prefix. Returns None when `name` contains no
    such barcode.
    """
    found = re.search(r"TCGA-[A-Z0-9]{2}-[A-Z0-9]{4}", name)
    if found is None:
        return None
    return found.group(0)


In [9]:
import pandas as pd

# Peek at the first RNA-seq file to see its header and column layout.
sample_rna = rnaseq_files[0]

with open(sample_rna, "r") as rna_handle:
    # readline() returns "" past end-of-file, so exactly 20 lines are always
    # produced even when the file is shorter.
    preview_lines = [rna_handle.readline().strip() for _line_no in range(20)]

print("\n".join(preview_lines))


# gene-model: GENCODE v36
gene_id	gene_name	gene_type	unstranded	stranded_first	stranded_second	tpm_unstranded	fpkm_unstranded	fpkm_uq_unstranded
N_unmapped			4631000	4631000	4631000
N_multimapping			8006124	8006124	8006124
N_noFeature			1025062	23882168	24150218
N_ambiguous			5386339	1520027	1500731
ENSG00000000003.15	TSPAN6	protein_coding	623	304	319	6.3198	4.0660	5.1555
ENSG00000000005.6	TNMD	protein_coding	11	8	3	0.3429	0.2206	0.2797
ENSG00000000419.13	DPM1	protein_coding	1589	815	774	60.5763	38.9735	49.4166
ENSG00000000457.14	SCYL3	protein_coding	364	427	416	2.4334	1.5656	1.9851
ENSG00000000460.17	C1orf112	protein_coding	108	305	293	0.8324	0.5356	0.6791
ENSG00000000938.13	FGR	protein_coding	685	324	361	9.3197	5.9961	7.6028
ENSG00000000971.16	CFH	protein_coding	2327	1172	1155	13.4178	8.6327	10.9458
ENSG00000001036.14	FUCA2	protein_coding	2564	1557	1514	41.8069	26.8976	34.1049
ENSG00000001084.13	GCLC	protein_coding	587	320	322	3.1341	2.0164	2.5567
ENSG00000001167.14	NFYA	protein_cod

In [10]:
import os

# The name of the directory that contains each RNA-seq file is used as the
# GDC file UUID to look up.
# Sorted rather than list(set): set iteration order of strings changes between
# kernel restarts, which would change the composition of the request batches
# and the order of the returned hits from run to run.
rna_uuids = sorted({
    os.path.basename(os.path.dirname(f))
    for f in rnaseq_files
})

len(rna_uuids)


531

In [11]:
import requests
import json

def fetch_gdc_metadata(file_ids, timeout=60):
    """
    Look up GDC file records for `file_ids` with a single request.

    Parameters
    ----------
    file_ids : iterable of str
        GDC file UUIDs to query (matched against `files.file_id`).
    timeout : float or tuple, optional
        Passed to `requests.get`; without it a stalled connection would block
        the notebook indefinitely.

    Returns
    -------
    list of dict
        The `data.hits` entries of the response, restricted to the fields
        `file_id` and `cases.submitter_id`. Empty input returns `[]` without
        contacting the API.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # Materialise so sets/generators work with json.dumps and len().
    file_ids = list(file_ids)
    if not file_ids:
        return []

    url = "https://api.gdc.cancer.gov/files"

    filters = {
        "op": "in",
        "content": {
            "field": "files.file_id",
            "value": file_ids
        }
    }

    params = {
        "filters": json.dumps(filters),
        "fields": "file_id,cases.submitter_id",
        "format": "JSON",
        # Ask for as many hits as IDs were sent so nothing is cut off by paging.
        "size": len(file_ids)
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()["data"]["hits"]


In [12]:
def chunk_list(lst, chunk_size):
    """Yield consecutive slices of `lst`, each holding at most `chunk_size` items."""
    chunk_starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in chunk_starts)


In [13]:
import requests
import json

def fetch_gdc_metadata_batched(file_ids, batch_size=50, timeout=60):
    """
    Look up GDC file records for `file_ids`, `batch_size` IDs per request.

    Parameters
    ----------
    file_ids : iterable of str
        GDC file UUIDs to query (matched against `files.file_id`).
    batch_size : int, optional
        Maximum number of IDs sent in one request; must be >= 1.
    timeout : float or tuple, optional
        Passed to every `requests.get` call so a stalled connection cannot
        block the notebook indefinitely.

    Returns
    -------
    list of dict
        Concatenated `data.hits` entries of all responses, restricted to the
        fields `file_id` and `cases.submitter_id`.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1 (a negative value would otherwise
        produce no batches and silently return an empty result).
    requests.HTTPError
        If any request is answered with an error status.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # Materialise so sets/generators can be sliced into batches.
    file_ids = list(file_ids)

    url = "https://api.gdc.cancer.gov/files"
    fields = "file_id,cases.submitter_id"
    all_hits = []

    for batch in chunk_list(file_ids, batch_size):
        filters = {
            "op": "in",
            "content": {
                "field": "files.file_id",
                "value": batch
            }
        }

        params = {
            "filters": json.dumps(filters),
            "fields": fields,
            "format": "JSON",
            # One hit is expected per requested ID, so size the page to the batch.
            "size": len(batch)
        }

        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()

        all_hits.extend(response.json()["data"]["hits"])

    return all_hits


In [14]:
# Number of file UUIDs sent to the GDC API per request.
GDC_BATCH_SIZE = 50

hits = fetch_gdc_metadata_batched(rna_uuids, batch_size=GDC_BATCH_SIZE)
print("Metadata hits:", len(hits))


Metadata hits: 531


In [15]:
# Group the file UUIDs by patient first, so patients that own several RNA-seq
# files are visible instead of being overwritten silently.
file_ids_by_patient = {}

for hit in hits:
    cases = hit.get("cases") or []
    if not cases:
        continue
    # Only the first case of a hit is used to name the patient.
    patient = cases[0].get("submitter_id")
    if patient:
        file_ids_by_patient.setdefault(patient, []).append(hit["file_id"])

# patient submitter_id -> one GDC file UUID.
# With several candidates the lexicographically smallest UUID is kept, so the
# choice no longer depends on the order in which the API returned the hits.
# NOTE(review): this rule is reproducible but arbitrary. Presumably the extra
# files are other samples/aliquots of the same patient (e.g. tumor vs. normal);
# selecting by sample type would need more fields from GDC - TODO confirm.
rnaseq_map = {
    patient: min(file_ids)
    for patient, file_ids in file_ids_by_patient.items()
}

multi_file_patients = sorted(
    patient
    for patient, file_ids in file_ids_by_patient.items()
    if len(file_ids) > 1
)

print("RNA-seq patients mapped:", len(rnaseq_map))
if multi_file_patients:
    print(
        "Patients with more than one RNA-seq file:",
        len(multi_file_patients),
        "(kept the smallest file_id for each)",
    )


RNA-seq patients mapped: 506


In [16]:
# GDC file UUID (name of the containing directory) -> local RNA-seq file path
uuid_to_path = {
    os.path.basename(os.path.dirname(f)): f
    for f in rnaseq_files
}
downloaded_paths = set(uuid_to_path.values())

# Replace each patient's file UUID by the local path of that file and drop
# patients whose file is not on disk.
# rnaseq_map is rewritten in place, so on a second run of this cell its values
# are already paths; those are kept as they are instead of being filtered out
# (which would leave the map empty).
rnaseq_map = {
    patient: uuid_to_path.get(file_ref, file_ref)
    for patient, file_ref in rnaseq_map.items()
    if file_ref in uuid_to_path or file_ref in downloaded_paths
}

print("Final RNA-seq patients:", len(rnaseq_map))


Final RNA-seq patients: 506


In [17]:
# patient ID -> one slide path.
# Slides are visited in basename order and the first slide seen for a patient
# is kept, so the choice does not depend on the order in which the filesystem
# listed the files (for names sharing a prefix, e.g. "...DX1" sorts before
# "...DX2").
# NOTE(review): this rule is reproducible but arbitrary with respect to slide
# type - confirm which slide per patient the downstream analysis should use.
slide_map = {}
extra_slides = 0

for f in sorted(slide_files, key=lambda p: (os.path.basename(p), p)):
    pid = get_patient_id_from_name(os.path.basename(f))
    if not pid:
        continue
    if pid in slide_map:
        extra_slides += 1
    else:
        slide_map[pid] = f

print("Unique slide patients:", len(slide_map))
if extra_slides:
    print("Additional slides ignored (patient already had a slide):", extra_slides)


Unique slide patients: 433


In [18]:
def parse_clinical_xml(xml_path):
    """
    Extract survival time and event indicator from one clinical XML file.

    Every element of the document is visited; namespaces are stripped and tag
    names lower-cased. Only `days_to_death`, `days_to_last_followup` and
    `days_to_last_follow_up` are considered. For each tag the last numeric
    value in document order is used; empty elements and non-numeric text are
    skipped instead of raising, so a single odd file cannot abort a whole
    parsing loop.

    Parameters
    ----------
    xml_path : str
        Path of the clinical XML file. The patient ID is taken from its
        filename via `get_patient_id_from_name`.

    Returns
    -------
    dict or None
        `{"patient_id", "time", "event"}` where `event` is 1 and `time` is
        `days_to_death` when a death time is present, otherwise `event` is 0
        and `time` is the last follow-up time. None when the file contains
        neither value.

    NOTE(review): when a tag occurs several times, the last occurrence wins
    (as before). Whether the maximum across occurrences would be the more
    appropriate survival time should be confirmed against the XML layout.
    """
    death_tag = "days_to_death"
    # Both spellings are accepted; the first one takes precedence.
    followup_tags = ("days_to_last_followup", "days_to_last_follow_up")
    wanted_tags = (death_tag,) + followup_tags

    # tag -> last numeric value seen in document order
    days = {}
    for elem in ET.parse(xml_path).getroot().iter():
        tag = elem.tag.split("}")[-1].lower()
        if tag not in wanted_tags or not elem.text:
            continue
        text = elem.text.strip()
        if not text:
            continue
        try:
            days[tag] = float(text)
        except ValueError:
            # Non-numeric text carries no usable time; ignore it.
            continue

    # survival logic: a death time marks an observed event, otherwise the
    # patient is censored at the last follow-up
    if death_tag in days:
        survival_days = days[death_tag]
        event = 1
    elif followup_tags[0] in days:
        survival_days = days[followup_tags[0]]
        event = 0
    elif followup_tags[1] in days:
        survival_days = days[followup_tags[1]]
        event = 0
    else:
        return None

    patient_id = get_patient_id_from_name(os.path.basename(xml_path))

    return {
        "patient_id": patient_id,
        "time": survival_days,
        "event": event
    }


In [19]:
# Parse every clinical XML lazily (tqdm shows progress) and keep only the
# files for which a survival record could be extracted.
parsed_rows = (parse_clinical_xml(xml_path) for xml_path in tqdm(clinical_files))
clinical_rows = [row for row in parsed_rows if row is not None]

# One row per parsed file.
clinical_df = pd.DataFrame(clinical_rows)

print("Clinical DF shape:", clinical_df.shape)
clinical_df.head()


100%|██████████| 1079/1079 [11:36<00:00,  1.55it/s]

Clinical DF shape: (1079, 3)





Unnamed: 0,patient_id,time,event
0,TCGA-A8-A07E,608.0,0
1,TCGA-BH-A1F5,2712.0,1
2,TCGA-A8-A08S,1004.0,0
3,TCGA-D8-A146,643.0,0
4,TCGA-A8-A09K,912.0,0


In [20]:
# Keep only patients whose time is strictly positive (rows with a time of 0,
# a negative time or NaN are dropped).
clinical_df = clinical_df.loc[clinical_df["time"].gt(0)].copy()


In [21]:
# Only the last expression of a cell is displayed, so the event counts are
# printed explicitly; otherwise their result is computed and thrown away.
print(clinical_df["event"].value_counts())

clinical_df["time"].describe()


Unnamed: 0,time
count,1058.0
mean,1256.68431
std,1194.950123
min,1.0
25%,458.75
50%,854.0
75%,1691.0
max,8605.0


In [22]:
# Patient IDs available in each modality.
clinical_patients = set(clinical_df["patient_id"])
rnaseq_patients = set(rnaseq_map)
slide_patients = set(slide_map)

# The cohort consists of the patients present in all three modalities.
common_patients = clinical_patients.intersection(rnaseq_patients, slide_patients)
print("Final cohort size:", len(common_patients))


Final cohort size: 193


In [23]:
# Event balance
clinical_df[clinical_df["patient_id"].isin(common_patients)]["event"].value_counts()

# Survival time stats
clinical_df[clinical_df["patient_id"].isin(common_patients)]["time"].describe()


Unnamed: 0,time
count,193.0
mean,1108.595855
std,1161.007964
min,5.0
25%,411.0
50%,754.0
75%,1347.0
max,8605.0


In [24]:
# Clinical rows of the cohort patients, extended with the path of each
# patient's RNA-seq file and slide.
in_cohort = clinical_df["patient_id"].isin(common_patients)
cohort_df = clinical_df.loc[in_cohort].assign(
    rnaseq_path=lambda frame: frame["patient_id"].map(rnaseq_map),
    slide_path=lambda frame: frame["patient_id"].map(slide_map),
)

# Written to the current working directory.
cohort_df.to_csv("cohort.csv", index=False)

print("Saved cohort.csv with", len(cohort_df), "patients")


Saved cohort.csv with 193 patients


In [28]:
# Destination of the cohort table on the mounted Drive.
SAVE_PATH = "/content/drive/MyDrive/permanent_data_folder/cohort.csv"

cohort_df.to_csv(SAVE_PATH, index=False)

print(
    "Saved cohort.csv",
    f"Patients: {len(cohort_df)}",
    f"Path: {SAVE_PATH}",
    sep="\n",
)


Saved cohort.csv
Patients: 193
Path: /content/drive/MyDrive/permanent_data_folder/cohort.csv
