In [4]:
import requests

def get_case_uuids(project_id, size=10000, timeout=60):
    """Return the case UUIDs that belong to a GDC project.

    Sends one POST request to the GDC ``/cases`` endpoint, filtered on
    ``project.project_id``, and collects the ``case_id`` of every hit.

    Parameters
    ----------
    project_id : str
        GDC project identifier, e.g. ``"TCGA-LAML"``.
    size : int, optional
        Maximum number of cases requested in the single page that is fetched
        (default 10000). Cases beyond this limit are not retrieved.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    list of str
        The ``case_id`` value of each returned hit.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    url = "https://api.gdc.cancer.gov/cases"
    filters = {
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": [project_id]
        }
    }

    params = {
        "filters": filters,
        "format": "JSON",
        "size": str(size),
        "fields": "case_id"
    }

    r = requests.post(url, json=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of hitting a confusing KeyError
    # when the error payload has no "data" key.
    r.raise_for_status()
    hits = r.json()["data"]["hits"]
    return [hit["case_id"] for hit in hits]

# Look up every case UUID in the TCGA-LAML project and preview the first five.
project_id = "TCGA-LAML"
case_ids = get_case_uuids(project_id)
print(case_ids[:5])


['865cda60-ec30-4562-b681-0e90737a97ca', 'fb4c9803-3690-4f6a-9402-72a4f36d64d1', '61db06eb-299a-48b3-8c71-4bea78169b4c', '233400c5-6a47-40e2-bf2a-8b7fc9df462b', '5b846aad-6133-4133-a78b-65be81332cb4']


In [6]:
import os

# Step 1: Your case UUIDs
case_ids = [
    '865cda60-ec30-4562-b681-0e90737a97ca',
    'fb4c9803-3690-4f6a-9402-72a4f36d64d1',
    '61db06eb-299a-48b3-8c71-4bea78169b4c',
    '233400c5-6a47-40e2-bf2a-8b7fc9df462b',
    '5b846aad-6133-4133-a78b-65be81332cb4'
]

# Step 2: Build GDC API filter (open-access MAF files attached to these cases)
query_url = "https://api.gdc.cancer.gov/files"
REQUEST_TIMEOUT = 60                # seconds to wait on the server before giving up
DOWNLOAD_CHUNK_BYTES = 1024 * 1024  # stream downloads to disk in 1 MiB pieces

params = {
    "filters": {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "cases.case_id", "value": case_ids}},
            {"op": "in", "content": {"field": "data_format", "value": ["MAF"]}},
            {"op": "in", "content": {"field": "access", "value": ["open"]}}
        ]
    },
    "fields": "file_id,file_name",
    "format": "JSON",
    "size": "1000"
}

response = requests.post(query_url, json=params, timeout=REQUEST_TIMEOUT)
# Stop here on an HTTP error rather than failing later on a missing "data" key.
response.raise_for_status()
maf_files = response.json()["data"]["hits"]
print(f"🧬 Found {len(maf_files)} MAF files.")

# Step 3: Create folder to save them
os.makedirs("downloaded_mafs", exist_ok=True)

# Step 4: Download each file
for file in maf_files:
    file_id = file["file_id"]
    # Keep only the final path component so a server-supplied name can never
    # write outside the download folder.
    file_name = os.path.basename(file["file_name"])
    url = f"https://api.gdc.cancer.gov/data/{file_id}"

    print(f"⬇️  Downloading {file_name}...")
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
        # Without this check an error body would be saved to disk as if it
        # were a valid .maf.gz file.
        r.raise_for_status()
        with open(os.path.join("downloaded_mafs", file_name), 'wb') as f:
            for chunk in r.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
                f.write(chunk)

# Rough sanity check only: it compares the number of files with the number of cases.
if len(maf_files) < len(case_ids):
    print(f"⚠️  Only {len(maf_files)} MAF files found for {len(case_ids)} cases. Some cases may not have open-access mutation files.")


🧬 Found 4 MAF files.
⬇️  Downloading d7b6f91b-dfee-4299-a4cb-ddc76073e82f.wxs.aliquot_ensemble_masked.maf.gz...
⬇️  Downloading b831b331-7173-43a2-8387-74b41cb37ae8.wxs.aliquot_ensemble_masked.maf.gz...
⬇️  Downloading b2c9852a-4b93-418e-9a30-d44b15782108.wxs.aliquot_ensemble_masked.maf.gz...
⬇️  Downloading 323780cf-73f3-4183-9e29-5fdfd41a8409.wxs.aliquot_ensemble_masked.maf.gz...
⚠️  Only 4 MAF files found for 5 cases. Some cases may not have open-access mutation files.


In [7]:
import pandas as pd
import gzip
import os

# Path to downloaded files
maf_dir = "downloaded_mafs"

# Read every gzipped MAF into its own frame and concatenate once at the end;
# calling pd.concat inside the loop would re-copy all accumulated rows each time.
maf_frames = []
# sorted() makes the row order of the merged table reproducible across runs,
# since os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(maf_dir)):
    if file.endswith(".maf.gz"):
        file_path = os.path.join(maf_dir, file)
        with gzip.open(file_path, 'rt') as f:
            # Lines starting with '#' are skipped via comment='#'.
            df = pd.read_csv(f, sep='\t', comment='#', low_memory=False)
        maf_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
merged_maf = pd.concat(maf_frames, ignore_index=True) if maf_frames else pd.DataFrame()

print(f"✅ Merged MAF shape: {merged_maf.shape}")
if merged_maf.empty:
    # Avoid an opaque KeyError from the column selection below.
    print(f"⚠️  No .maf.gz files with rows were found in {maf_dir!r}.")
else:
    print(merged_maf[['Hugo_Symbol', 'Variant_Classification', 'Tumor_Sample_Barcode']].head())


✅ Merged MAF shape: (30, 140)
  Hugo_Symbol Variant_Classification          Tumor_Sample_Barcode
0     CACNA1S      Missense_Mutation  TCGA-AB-2803-03B-01W-0728-08
1       LMOD1      Missense_Mutation  TCGA-AB-2803-03B-01W-0728-08
2       PEX13        Frame_Shift_Del  TCGA-AB-2803-03B-01W-0728-08
3       RPL32      Missense_Mutation  TCGA-AB-2803-03B-01W-0728-08
4      CC2D2A      Missense_Mutation  TCGA-AB-2803-03B-01W-0728-08


In [8]:
# Keep only missense variants, then preview rows that have a protein-level change.
is_missense = merged_maf['Variant_Classification'].eq('Missense_Mutation')
missense_maf = merged_maf[is_missense]

preview_columns = ['Hugo_Symbol', 'HGVSp_Short', 'Tumor_Sample_Barcode']
print(missense_maf[preview_columns].dropna().head())


  Hugo_Symbol HGVSp_Short          Tumor_Sample_Barcode
0     CACNA1S     p.L363F  TCGA-AB-2803-03B-01W-0728-08
1       LMOD1      p.T55M  TCGA-AB-2803-03B-01W-0728-08
3       RPL32      p.R44S  TCGA-AB-2803-03B-01W-0728-08
4      CC2D2A    p.T1114M  TCGA-AB-2803-03B-01W-0728-08
5        LNX1     p.E176K  TCGA-AB-2803-03B-01W-0728-08


In [10]:
# Sample input: replace this with your actual dataframe.
# Each pair is (gene symbol, protein-level change in short HGVS notation).
sample_mutations = [
    ('CACNA1S', 'p.L363F'),
    ('LMOD1', 'p.T55M'),
    ('RPL32', 'p.R44S'),
    ('CC2D2A', 'p.T1114M'),
    ('LNX1', 'p.E176K'),
]
data = {
    'Hugo_Symbol': [symbol for symbol, _change in sample_mutations],
    'HGVSp_Short': [change for _symbol, change in sample_mutations],
}
df = pd.DataFrame(data)

def get_protein_sequence(ensembl_gene_name):
    """Fetch the canonical protein sequence for a human gene via the Ensembl REST API.

    Two requests are made: a symbol lookup (with transcripts expanded) to find
    the translation ID of the transcript flagged ``is_canonical``, then a
    sequence request for that translation.

    Parameters
    ----------
    ensembl_gene_name : str
        Despite the name, this is a gene *symbol* (e.g. ``"CACNA1S"``); it is
        passed to the ``lookup/symbol/homo_sapiens`` endpoint.

    Returns
    -------
    str or None
        The amino-acid sequence as plain text, or ``None`` when the lookup
        fails, no transcript is flagged canonical, the canonical transcript
        has no translation, or the sequence request fails.
    """
    from urllib.parse import quote

    server = "https://rest.ensembl.org"
    timeout = 30  # seconds; without it a stalled connection would block forever

    # Get Ensembl protein ID from gene name (quoted so unusual characters in a
    # symbol cannot alter the URL path).
    symbol = quote(str(ensembl_gene_name), safe="")
    lookup_url = f"{server}/lookup/symbol/homo_sapiens/{symbol}?expand=1"
    r = requests.get(lookup_url, headers={"Content-Type": "application/json"}, timeout=timeout)
    if not r.ok:
        return None
    decoded = r.json()

    # Find canonical transcript's protein ID
    protein_id = None
    for t in decoded.get("Transcript", []):
        if t.get("is_canonical"):
            protein_id = (t.get("Translation") or {}).get("id")
            break
    if not protein_id:
        # No canonical transcript, or it has no translation: there is nothing
        # to fetch, so skip the second request entirely.
        return None

    # Now fetch protein sequence
    seq_url = f"{server}/sequence/id/{protein_id}?type=protein"
    r = requests.get(seq_url, headers={"Content-Type": "text/plain"}, timeout=timeout)
    return r.text.strip() if r.ok else None

def parse_mutation(hgvsp):
    """Parse a single amino-acid substitution such as ``'p.L363F'``.

    Parameters
    ----------
    hgvsp : str
        Protein change in short HGVS notation.

    Returns
    -------
    tuple
        ``(original_aa, position, mutant_aa)`` with ``position`` as a 1-based
        ``int``, or ``(None, None, None)`` when the input is not a string or
        is not exactly one-letter / digits / one-letter (frameshifts such as
        ``'p.K1326Rfs*6'``, stop gains such as ``'p.R44*'``, etc.).
    """
    import re

    # Missing values read by pandas arrive as float NaN, not str.
    if not isinstance(hgvsp, str):
        return None, None, None

    # fullmatch, not match: a prefix match would accept 'p.K1326Rfs*6' and
    # misreport the frameshift as the substitution K1326R.
    match = re.fullmatch(r"p\.([A-Z])(\d+)([A-Z])", hgvsp.strip())
    if match:
        orig, pos, mut = match.groups()
        return orig, int(pos), mut
    return None, None, None

# Build a mutant peptide window around each substitution listed in `df`.
FLANK = 4            # residues kept on each side of the mutated position (9-mer)
results = []         # one dict per peptide that was successfully built
sequence_cache = {}  # gene symbol -> protein sequence (or None); one lookup per gene

for gene, mut_str in zip(df['Hugo_Symbol'], df['HGVSp_Short']):
    orig, pos, mut = parse_mutation(mut_str)
    if pos is None:
        continue

    if gene not in sequence_cache:
        sequence_cache[gene] = get_protein_sequence(gene)
    seq = sequence_cache[gene]
    # `pos` is 1-based, so it must lie within 1..len(seq).
    if not seq or not 1 <= pos <= len(seq):
        continue

    # The fetched protein may come from a different transcript than the one
    # the mutation was annotated on. If its residue at `pos` is not the
    # reference amino acid, the window would be wrong, so skip the row.
    if seq[pos - 1] != orig:
        print(f"⚠️  Skipping {gene} {mut_str}: fetched sequence has {seq[pos - 1]} at position {pos}, expected {orig}.")
        continue

    # Extract the window; it is shorter than 9 residues when the mutation sits
    # within FLANK residues of either end of the protein.
    start = max(0, pos - 1 - FLANK)
    end = min(len(seq), pos + FLANK)
    peptide = seq[start:pos - 1] + mut + seq[pos:end]

    results.append({
        'Gene': gene,
        'Mutation': mut_str,
        'Neoepitope': peptide
    })

# Final DataFrame (explicit columns so an empty result still has a header)
neo_df = pd.DataFrame(results, columns=['Gene', 'Mutation', 'Neoepitope'])
print(neo_df)

      Gene  Mutation Neoepitope
0  CACNA1S   p.L363F  LDEDFRGYM
1    LMOD1    p.T55M  QRNQMEKQS
2    RPL32    p.R44S  GIDNSVRRR
3   CC2D2A  p.T1114M  TVCHMTTAE
4     LNX1   p.E176K  LMTDKPGLD


In [13]:
# Write the neoepitope table to disk, creating the output folder on first run.
output_csv = "results/neoepitopes.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
neo_df.to_csv(output_csv, index=False)
print(f"✅ Neoepitopes saved to {output_csv}")


✅ Neoepitopes saved to results/neoepitopes.csv
