## 14th July 2025 
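### cBioPortal API Data Exploration
Retrieves the sample list, molecular profiles (log2 copy number, mRNA, mRNA z-scores), clinical data, and clinical-event timeline for study `prad_tcga_pan_can_atlas_2018` from the cBioPortal REST API.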

In [7]:
import pandas as pd #this is to manipulate the data
import requests
import pandas as pd
!pip install biopython
from Bio import Entrez
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns




In [9]:
# --- cBioPortal API configuration ---
BASE_URL = "https://www.cbioportal.org/api"            # base URL every request below is built from
HEADERS = {"accept": "application/json", "Content-Type": "application/json"}   # send and receive JSON

study_id = "prad_tcga_pan_can_atlas_2018"                     # study identifier
sample_list_id = "prad_tcga_pan_can_atlas_2018_all"           # sample-list identifier

# --- Fetch the sample IDs contained in the sample list ---
samples_url = f"{BASE_URL}/sample-lists/{sample_list_id}"
# Without a timeout, requests waits forever on a stalled connection and the
# notebook kernel appears frozen; fail loudly after 60 seconds instead.
samples_resp = requests.get(samples_url, headers=HEADERS, timeout=60)
samples_resp.raise_for_status()                               # raise on any 4xx/5xx response
sample_ids = samples_resp.json().get("sampleIds", [])         # empty list when the key is absent
print(f" Found {len(sample_ids)} samples in {sample_list_id}")

 Found 494 samples in prad_tcga_pan_can_atlas_2018_all


In [17]:
# Converting HUGO → Entrez IDs via Bio-python
def get_entrez_ids_biopython(hugo_list, email=None):
    """Map HUGO gene symbols to Entrez Gene IDs using Bio.Entrez (NCBI E-utilities).

    One ``esearch`` query is sent to the NCBI ``gene`` database per symbol,
    restricted to ``Homo sapiens``. When the search returns several IDs, only
    the first one is kept.

    Parameters
    ----------
    hugo_list : iterable of str
        Gene symbols to look up.
    email : str, optional
        Contact address assigned to ``Entrez.email`` before querying. NCBI
        asks for one with every request; when omitted, whatever value
        ``Entrez.email`` already holds is left untouched.

    Returns
    -------
    dict
        ``{symbol: entrez_id}`` in input order; ``entrez_id`` is an ``int``,
        or ``None`` when the search returned no IDs (a message is printed).
    """
    if email:
        Entrez.email = email

    entrez_ids = {}
    for gene in hugo_list:
        handle = Entrez.esearch(
            db="gene",
            term=f"{gene}[Gene Name] AND Homo sapiens[Organism]",
        )
        try:
            record = Entrez.read(handle)
        finally:
            # Release the NCBI connection even if parsing the reply fails.
            handle.close()

        if record["IdList"]:
            # Convert the first hit to an integer and store it as the Entrez ID.
            entrez_ids[gene] = int(record["IdList"][0])
        else:
            entrez_ids[gene] = None
            print(f"No match found for {gene}")
    return entrez_ids

# MAPK pathway genes (HUGO symbols), in the order they are looked up
hugo_genes = (
    ["EGFR", "KRAS", "HRAS", "NRAS"]
    + ["ARAF", "BRAF", "RAF1"]
    + ["MAP2K1", "MAP2K2"]
    + ["MAPK1", "MAPK3"]
    + ["KSR1", "KSR2"]
)

# symbol -> Entrez ID (None for symbols the search could not resolve)
entrez_map = get_entrez_ids_biopython(hugo_genes)

# Keep only the IDs that were actually resolved
entrez_ids = list(filter(lambda gene_id: gene_id is not None, entrez_map.values()))

print("Converted HUGO → Entrez IDs:", entrez_map)




            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Converted HUGO → Entrez IDs: {'EGFR': 1956, 'KRAS': 3845, 'HRAS': 3265, 'NRAS': 4893, 'ARAF': 369, 'BRAF': 673, 'RAF1': 5894, 'MAP2K1': 5604, 'MAP2K2': 5605, 'MAPK1': 5594, 'MAPK3': 5595, 'KSR1': 8844, 'KSR2': 22866}


In [19]:
# ==== Molecular profiles ====
# Short label -> molecular profile ID. The label is reused later as the
# dictionary key for the downloaded frame and as the CSV file stem.
molecular_profiles = dict(
    cna_log2="prad_tcga_pan_can_atlas_2018_log2CNA",
    mrna="prad_tcga_pan_can_atlas_2018_rna_seq_v2_mrna",
    mrna_zscores="prad_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_Zscores",
)

# Molecular data
def fetch_molecular_data(profile_id, samples, entrez_ids, timeout=60):
    """Download molecular data for a set of genes and samples from one profile.

    Sends a POST request to
    ``{BASE_URL}/molecular-profiles/{profile_id}/molecular-data/fetch`` with
    the gene and sample IDs as a JSON body.

    Parameters
    ----------
    profile_id : str
        Molecular profile identifier, inserted into the request URL.
    samples : list
        Sample IDs, sent as ``sampleIds``.
    entrez_ids : list
        Entrez gene IDs, sent as ``entrezGeneIds``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 60). Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        The JSON response converted to a frame on HTTP 200. For any other
        status code the status and response text are printed and an empty
        frame is returned.
    """
    url = f"{BASE_URL}/molecular-profiles/{profile_id}/molecular-data/fetch"
    payload = {"entrezGeneIds": entrez_ids, "sampleIds": samples}
    resp = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)

    if resp.status_code != 200:
        # Best effort: report the failure and let the caller test `.empty`.
        print(f" Error fetching {profile_id}: {resp.status_code}")
        print(resp.text)
        return pd.DataFrame()

    df = pd.DataFrame(resp.json())
    print(f"Retrieved {df.shape[0]} rows from {profile_id}")
    return df

# Download every configured profile. Each frame is kept in `molecular_dfs`
# under its short label; only non-empty frames are written to "<label>.csv".
molecular_dfs = {}
for name, profile_id in molecular_profiles.items():
    molecular_dfs[name] = df = fetch_molecular_data(profile_id, sample_ids, entrez_ids)
    if df.empty:
        continue  # fetch failed or returned nothing: no file to write
    df.to_csv(f"{name}.csv", index=False)
    print(f"Saved {name}.csv")


# ==== Clinical-event timeline ====
# Fetch the study's clinical events and save them to
# "<study_id>_treatment_timeline.csv".
timeline_url = f"{BASE_URL}/studies/{study_id}/clinical-events"
# The timeout stops a stalled connection from blocking the cell forever.
timeline_resp = requests.get(timeline_url, headers=HEADERS, timeout=60)
if timeline_resp.status_code == 200:
    df_timeline = pd.DataFrame(timeline_resp.json())
    print(f" Retrieved {df_timeline.shape[0]} treatment timeline events")
    print("Timeline columns:", list(df_timeline.columns))
    df_timeline.to_csv(f"{study_id}_treatment_timeline.csv", index=False)
    print(f"Saved {study_id}_treatment_timeline.csv")
else:
    print("Error fetching timeline:", timeline_resp.status_code, timeline_resp.text)
    # Bind an empty frame so a failed fetch never leaves `df_timeline`
    # undefined or holding data from an earlier run of this cell.
    df_timeline = pd.DataFrame()


# ==== Clinical data ====
# Fetch the study's clinical data and save it to "<study_id>_clinical_data.csv".
clinical_url = f"{BASE_URL}/studies/{study_id}/clinical-data"
# The timeout stops a stalled connection from blocking the cell forever.
clinical_resp = requests.get(clinical_url, headers=HEADERS, timeout=60)
if clinical_resp.status_code == 200:
    df_clinical = pd.DataFrame(clinical_resp.json())
    # The response is long-format (one row per clinicalAttributeId/value
    # pair), so the row count is a number of records, not of patients.
    print(f" Retrieved {df_clinical.shape[0]} clinical data rows, {df_clinical.shape[1]} columns")
    print(" Clinical columns:", list(df_clinical.columns))
    df_clinical.to_csv(f"{study_id}_clinical_data.csv", index=False)
    print(f"Saved FULL {study_id}_clinical_data.csv")
else:
    print("Error fetching clinical data:", clinical_resp.status_code, clinical_resp.text)
    # Bind an empty frame so a failed fetch never leaves `df_clinical`
    # undefined or holding data from an earlier run of this cell.
    df_clinical = pd.DataFrame()

# Names of the CSV files this cell writes when every download succeeds.
# The list is printed unconditionally; it does not check that the files exist.
print(
    "- cna_log2.csv",
    "- mrna.csv",
    "- mrna_zscores.csv",
    f"- {study_id}_treatment_timeline.csv",
    f"- {study_id}_clinical_data.csv",
    sep="\n",
)


Retrieved 6357 rows from prad_tcga_pan_can_atlas_2018_log2CNA
Saved cna_log2.csv
Retrieved 6409 rows from prad_tcga_pan_can_atlas_2018_rna_seq_v2_mrna
Saved mrna.csv
Retrieved 6409 rows from prad_tcga_pan_can_atlas_2018_rna_seq_v2_mrna_median_all_sample_Zscores
Saved mrna_zscores.csv
 Retrieved 2262 treatment timeline events
Timeline columns: ['uniquePatientKey', 'studyId', 'patientId', 'eventType', 'attributes', 'startNumberOfDaysSinceDiagnosis', 'endNumberOfDaysSinceDiagnosis']
Saved prad_tcga_pan_can_atlas_2018_treatment_timeline.csv
 Retrieved 8854 patients, 7 columns
 Clinical columns: ['uniqueSampleKey', 'uniquePatientKey', 'sampleId', 'patientId', 'studyId', 'clinicalAttributeId', 'value']
Saved FULL prad_tcga_pan_can_atlas_2018_clinical_data.csv
- cna_log2.csv
- mrna.csv
- mrna_zscores.csv
- prad_tcga_pan_can_atlas_2018_treatment_timeline.csv
- prad_tcga_pan_can_atlas_2018_clinical_data.csv


## Identification of Gene and Treatment Data

In [24]:
# Reload the tables saved earlier. Note that `mrna_df` is read from the
# z-score file (mrna_zscores.csv), not from mrna.csv.
cna_df, mrna_df, clinical_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        "cna_log2.csv",
        "mrna_zscores.csv",
        "prad_tcga_pan_can_atlas_2018_clinical_data.csv",
    )
)

In [26]:
# Read the saved timeline CSV back in, then report its dimensions and columns.
treatment_df = pd.read_csv("prad_tcga_pan_can_atlas_2018_treatment_timeline.csv")

timeline_shape, timeline_columns = treatment_df.shape, treatment_df.columns.tolist()
print("Loaded timeline shape:", timeline_shape)
print("Current columns:", timeline_columns)


Loaded timeline shape: (2262, 7)
Current columns: ['uniquePatientKey', 'studyId', 'patientId', 'eventType', 'attributes', 'startNumberOfDaysSinceDiagnosis', 'endNumberOfDaysSinceDiagnosis']
