In [1]:
# Import required libraries
import pandas as pd
import requests
import os
import subprocess


def ensure_file_exists(url: str, file_path: str):
    """Download ``url`` to ``file_path`` with wget unless the file already exists.

    The download is written to a sibling ``<file_path>.part`` file and only
    moved to ``file_path`` once wget has exited successfully. An interrupted
    or failed download therefore never leaves a truncated file at
    ``file_path`` that a later call would mistake for a complete one, and the
    leftover ``.part`` file lets wget's ``-c`` flag resume on the next call.

    Args:
        url: Remote location of the file.
        file_path: Local destination path. Its parent directory is created
            when missing; a bare file name (no directory part) is allowed.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable is not available.
    """
    if os.path.exists(file_path):
        print(f"File already exists at: {file_path}")
        return

    # os.makedirs("") raises, so only create a directory when the path has one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{file_path}.part"
    cmd = [
        "wget",
        "-c",  # resume if interrupted
        "-O",
        partial_path,  # temporary output file, promoted on success
        url,
    ]

    print("Downloading with wget...")
    subprocess.run(cmd, check=True)
    # Promote the finished download only after wget reported success.
    os.replace(partial_path, file_path)
    print("Download complete.")

In [2]:
# Settings: remote endpoints and local file locations
MAF_FILE_URL = "https://api.gdc.cancer.gov/data/1c8cfe5f-e52d-41ba-94da-f15ea1337efc"
GDC_CASE_NUMBER_URL = "https://api.gdc.cancer.gov/cases"
MAF_FILE_PATH = "./data/mc3.v0.2.8.PUBLIC.maf.gz"
TCGA_BRCA_SAVE_PATH = "./results/TCGA_BRCA.csv"

# Make sure the MAF file is on disk; this function downloads it
# only when no file is found at MAF_FILE_PATH.
ensure_file_exists(MAF_FILE_URL, MAF_FILE_PATH)

# Read a single row so the available column names can be inspected
# before committing to a load of the full dataset.
maf_preview = pd.read_csv(
    MAF_FILE_PATH,
    sep="\t",
    comment="#",
    nrows=1,
)
available_columns = maf_preview.columns.tolist()
print(available_columns)

File already exists at: ./data/mc3.v0.2.8.PUBLIC.maf.gz
['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS', 'dbSNP_Val_Status', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1', 'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Verification_Status', 'Validation_Status', 'Mutation_Status', 'Sequencing_Phase', 'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File', 'Sequencer', 'Tumor_Sample_UUID', 'Matched_Norm_Sample_UUID', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'Exon_Number', 't_depth', 't_ref_count', 't_alt_count', 'n_depth', 'n_ref_count', 'n_alt_count', 'all_effects', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'P

In [2]:
# Columns the analysis needs; restricting the read to these keeps the
# in-memory table much smaller than the full MAF.
REQUIRED_COLUMNS = [
    "Hugo_Symbol",  # gene symbol
    "Chromosome",  # chromosome location
    "Start_Position",  # mutation start position
    "Variant_Classification",  # type of mutation (missense, silent, etc.)
    "Tumor_Sample_Barcode",  # sample identifier
]

# Read the whole MAF file, keeping only REQUIRED_COLUMNS.
maf_read_options = dict(
    sep="\t",
    usecols=REQUIRED_COLUMNS,
    low_memory=False,
)
maf = pd.read_csv(MAF_FILE_PATH, **maf_read_options)

In [3]:
# Configuration for GDC API request
GDC_API_SIZE = "20000"  # Number of cases requested per page
GDC_API_FIELDS = "submitter_id,project.project_id"  # Fields to request
GDC_API_TIMEOUT = 120  # Seconds to wait on each request before giving up

# Fetch case metadata from the GDC (Genomic Data Commons) API
# This provides the mapping between case submitter IDs and their project IDs
params = {
    "size": GDC_API_SIZE,
    "fields": GDC_API_FIELDS,
    "format": "JSON",
}

# One request returns at most `size` records. Keep requesting pages, offset by
# the number of records already collected, until the total reported by the API
# has been retrieved; otherwise cases beyond the first page are silently lost.
hits = []
while True:
    r = requests.get(
        GDC_CASE_NUMBER_URL,
        params={**params, "from": len(hits)},
        timeout=GDC_API_TIMEOUT,
    )
    # Fail loudly on HTTP errors instead of a confusing KeyError/JSON error below.
    r.raise_for_status()
    payload = r.json()["data"]
    page = payload["hits"]
    hits.extend(page)
    total = payload.get("pagination", {}).get("total", len(hits))
    # Stop on an empty page as well, so a misreported total cannot loop forever.
    if not page or len(hits) >= total:
        break

# Convert the API response into a DataFrame
# Each row contains a case submitter ID and its associated project ID (e.g., TCGA-BRCA)
# The submitter ID is stored under "Tumor_Sample_Barcode" for now; explicit
# columns keep the schema stable even when no hits are returned.
meta = pd.DataFrame(
    [
        {
            "Tumor_Sample_Barcode": h["submitter_id"],
            "project_id": h["project"]["project_id"],
        }
        for h in hits
    ],
    columns=["Tumor_Sample_Barcode", "project_id"],
)
meta.head()

Unnamed: 0,Tumor_Sample_Barcode,project_id
0,HCM-WCMC-0950-C67,HCMI-CMDC
1,HCM-BROD-0682-C64,HCMI-CMDC
2,HCM-CSHL-0582-C18,HCMI-CMDC
3,HCM-EXPT-1073-C18,HCMI-CMDC
4,HCM-BROD-0328-C15,HCMI-CMDC


In [4]:
# Configuration: Specify which cancer project to extract
TARGET_PROJECT_ID = "TCGA-BRCA"  # TCGA Breast Cancer project
CASE_ID_PREFIX_LENGTH = (
    3  # Number of barcode segments to use for case ID (e.g., TCGA-3C-AALI)
)

# Extract the case ID from the tumor sample barcode
# Tumor_Sample_Barcode format: TCGA-3C-AALI-01A-11D-A41F-09
# Case ID format: TCGA-3C-AALI (first 3 segments separated by hyphens)
maf["case_id"] = (
    maf["Tumor_Sample_Barcode"].str.split("-").str[:CASE_ID_PREFIX_LENGTH].str.join("-")
)

# Prepare metadata for merging by renaming the column
meta = meta.rename(columns={"Tumor_Sample_Barcode": "case_id"})

# Merge MAF data with project metadata to get project IDs for each sample
maf_merged = maf.merge(meta, on="case_id", how="left")

# Filter to keep only the target cancer project (e.g., TCGA-BRCA)
# .copy() detaches the result from maf_merged so later edits to maf_brca
# do not trigger SettingWithCopyWarning.
maf_brca = maf_merged[maf_merged["project_id"] == TARGET_PROJECT_ID].copy()

# Save the filtered data to CSV
# to_csv does not create missing directories, so make sure the target exists.
save_dir = os.path.dirname(TCGA_BRCA_SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
maf_brca.to_csv(TCGA_BRCA_SAVE_PATH, index=False)

# Display the number of mutations found in the target project
print(f"Total mutations in {TARGET_PROJECT_ID}: {len(maf_brca)}")
len(maf_brca)

Total mutations in TCGA-BRCA: 110492


110492

In [5]:
# Show the filtered mutation table as this cell's output
maf_brca

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,Variant_Classification,Tumor_Sample_Barcode,case_id,project_id
255156,BTRC,10,103292795,Silent,TCGA-3C-AALI-01A-11D-A41F-09,TCGA-3C-AALI,TCGA-BRCA
255157,LDB1,10,103867505,3'UTR,TCGA-3C-AALI-01A-11D-A41F-09,TCGA-3C-AALI,TCGA-BRCA
255158,NFKB2,10,104157126,Nonsense_Mutation,TCGA-3C-AALI-01A-11D-A41F-09,TCGA-3C-AALI,TCGA-BRCA
255159,CELF2,10,11378155,3'UTR,TCGA-3C-AALI-01A-11D-A41F-09,TCGA-3C-AALI,TCGA-BRCA
255160,WDR37,10,1175988,3'UTR,TCGA-3C-AALI-01A-11D-A41F-09,TCGA-3C-AALI,TCGA-BRCA
...,...,...,...,...,...,...,...
3580703,IPPK,9,95396703,Missense_Mutation,TCGA-Z7-A8R6-01A-11D-A41F-09,TCGA-Z7-A8R6,TCGA-BRCA
3580704,STAG2,X,123217344,Missense_Mutation,TCGA-Z7-A8R6-01A-11D-A41F-09,TCGA-Z7-A8R6,TCGA-BRCA
3580705,GK,X,30671631,5'UTR,TCGA-Z7-A8R6-01A-11D-A41F-09,TCGA-Z7-A8R6,TCGA-BRCA
3580706,CXorf67,X,51151398,3'UTR,TCGA-Z7-A8R6-01A-11D-A41F-09,TCGA-Z7-A8R6,TCGA-BRCA
