In [1]:
import requests
import pandas as pd

# Step 1: Get all human pathways from KEGG
url = "http://rest.kegg.jp/list/pathway/hsa"
# The timeout keeps the cell from hanging indefinitely if KEGG is unreachable.
# raise_for_status() reports an HTTP error here, instead of letting an error
# page fail later as an obscure tuple-unpacking error during parsing.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse response into lines (one pathway per line)
lines = response.text.strip().split("\n")

# Extract KEGG ID and Pathway Name from each tab-separated line
data = []
for line in lines:
    # maxsplit=1: only the first tab separates ID from name, so a tab inside
    # the name cannot break the two-value unpacking.
    kegg_id, name = line.split("\t", 1)
    # Drop the "path:" prefix if present
    kegg_id = kegg_id.replace("path:", "")
    data.append((kegg_id, name))

# Create DataFrame
df_pathways = pd.DataFrame(data, columns=["KEGG_ID", "Pathway_Name"])
df_pathways.head()  # Display first few rows


Unnamed: 0,KEGG_ID,Pathway_Name
0,hsa01100,Metabolic pathways - Homo sapiens (human)
1,hsa01200,Carbon metabolism - Homo sapiens (human)
2,hsa01210,2-Oxocarboxylic acid metabolism - Homo sapiens...
3,hsa01212,Fatty acid metabolism - Homo sapiens (human)
4,hsa01230,Biosynthesis of amino acids - Homo sapiens (hu...


In [2]:
# Persist the pathway table to kegg_pathways.csv; index=False keeps the
# DataFrame's row index out of the file.
df_pathways.to_csv("kegg_pathways.csv", index=False)


In [4]:
import pandas as pd

# Load the uploaded pathway list CSV
file_path = "kegg_pathways.csv"
df_pathways = pd.read_csv(file_path)

# Define keywords for identifying cancer- or disease-specific pathways
keywords = ["cancer", "carcinoma", "disease", "tumor"]

# Filter pathways using the keywords.
# The keywords are joined into a single regex alternation, so any keyword
# added to the list must not contain unescaped regex metacharacters.
# case=False replaces the manual .str.lower() step; na=False makes a missing
# pathway name count as "no match" instead of putting NaN into the mask
# (boolean indexing with a NaN-containing mask raises ValueError).
mask = df_pathways["Pathway_Name"].str.contains("|".join(keywords), case=False, na=False)
# .copy() makes the subset an independent frame so that later in-place edits
# to it do not raise SettingWithCopyWarning.
df_disease_related = df_pathways[mask].copy()



In [5]:
# Preview the first rows of the keyword-filtered pathway table.
df_disease_related.head()

Unnamed: 0,KEGG_ID,Pathway_Name
270,hsa05200,Pathways in cancer - Homo sapiens (human)
271,hsa05202,Transcriptional misregulation in cancer - Homo...
272,hsa05206,MicroRNAs in cancer - Homo sapiens (human)
273,hsa05205,Proteoglycans in cancer - Homo sapiens (human)
278,hsa05230,Central carbon metabolism in cancer - Homo sap...


In [6]:
# Write the keyword-filtered pathways to kegg_pathways_cancer.csv, without
# the DataFrame's row index.
df_disease_related.to_csv("kegg_pathways_cancer.csv", index=False)


In [7]:
pip install tqdm


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import requests
import pandas as pd
from tqdm import tqdm

# Load your pathway list from kegg_pathways.csv; its KEGG_ID column supplies
# the pathway identifiers that are looked up one by one below.
df_pathways = pd.read_csv("kegg_pathways.csv")

# Function to get genes for each pathway
def get_genes_for_pathway(kegg_id, timeout=30):
    """Return the gene IDs that KEGG links to a single pathway.

    Calls the KEGG REST ``link/hsa/<kegg_id>`` endpoint and parses the
    tab-separated response, taking the second field of every line as the
    gene ID.

    Args:
        kegg_id: KEGG pathway identifier inserted into the request URL.
        timeout: Seconds to wait for KEGG before giving up on this pathway.

    Returns:
        A list of gene ID strings with the "hsa:" prefix removed. The list is
        empty (and a warning is printed) when the request fails, the response
        status is not OK, or the response body is blank.
    """
    url = f"http://rest.kegg.jp/link/hsa/{kegg_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single network failure must not abort a long run over every
        # pathway; report it and carry on with an empty result.
        print(f"[WARNING] Request failed for: {kegg_id} ({exc})")
        return []

    body = response.text.strip()
    if not (response.ok and body):
        print(f"[WARNING] No data or error for: {kegg_id}")
        return []

    genes = []
    for line in body.split("\n"):
        fields = line.split("\t")
        if len(fields) < 2:
            # Malformed line without a gene column: skip it rather than
            # raising IndexError and losing the whole pathway.
            continue
        genes.append(fields[1].replace("hsa:", ""))
    return genes

# Collect one (pathway, gene) pair per KEGG link, with a progress bar over
# the pathway IDs
pathway_gene_map = []
for pid in tqdm(df_pathways["KEGG_ID"], desc="Processing pathways"):
    pathway_gene_map.extend((pid, gene) for gene in get_genes_for_pathway(pid))

# Turn the pairs into a two-column table and write it to disk
df_gene_map = pd.DataFrame(data=pathway_gene_map, columns=["Pathway_ID", "Gene_ID"])
df_gene_map.to_csv("kegg_pathway_to_gene.csv", index=False)


Processing pathways: 100%|██████████| 366/366 [06:38<00:00,  1.09s/it]


In [20]:
import pandas as pd

# Read the pathway-to-gene mapping from disk
file_path = "kegg_pathway_to_gene.csv"
df = pd.read_csv(file_path)

# Number of distinct values in 'Gene_ID' (missing values are not counted)
unique_gene_ids = len(df["Gene_ID"].dropna().unique())
unique_gene_ids

8866

In [22]:
pip install openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
     ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
     --------------- ---------------------- 102.4/250.9 kB 3.0 MB/s eta 0:00:01
     -------------------------------------  245.8/250.9 kB 3.0 MB/s eta 0:00:01
     -------------------------------------- 250.9/250.9 kB 2.6 MB/s eta 0:00:00
Collecting et-xmlfile
  Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time

# === Step 1: Load Gene List ===
file_path = "Gene_Symbol.xlsx"
df_genes = pd.read_excel(file_path)
# Gene symbols are taken from the first column of the sheet.
# NOTE(review): read_excel treats the first row as a header by default, so
# this assumes the sheet has a header row; confirm against the file.
# Values are normalised to stripped strings so that stray whitespace from the
# spreadsheet (e.g. "TP53 ") neither corrupts the lookup URL nor makes the
# same symbol appear twice; blank cells are dropped.
symbol_column = df_genes.iloc[:, 0].dropna().astype(str).str.strip()
gene_symbols = symbol_column[symbol_column != ""].unique()

# === Step 2: Map Gene Symbols to Ensembl IDs ===
gene_to_ensembl = {}
unmapped_genes = []  # symbols whose lookup failed or returned no gene entry
headers = {"Content-Type": "application/json"}  # identical for every request
print("🔍 Mapping gene symbols to Ensembl IDs...")
for gene in tqdm(gene_symbols):
    url = f"https://rest.ensembl.org/xrefs/symbol/homo_sapiens/{gene}?"
    try:
        # The timeout stops one stalled connection from freezing the loop.
        response = requests.get(url, headers=headers, timeout=30)
        data = response.json() if response.ok else []
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: skip this symbol as before, but
        # remember it. Catching only these errors (rather than a bare except)
        # lets a kernel interrupt stop the loop and lets real bugs surface.
        unmapped_genes.append(gene)
        continue

    records = data if isinstance(data, list) else []
    # Use the first cross-reference whose type is "gene", if there is one
    gene_id = next(
        (
            item.get("id")
            for item in records
            if isinstance(item, dict) and item.get("type") == "gene"
        ),
        None,
    )
    if gene_id:
        gene_to_ensembl[gene] = gene_id
    else:
        unmapped_genes.append(gene)

if unmapped_genes:
    # Make silent drop-outs visible so missing genes in the output are explainable
    print(f"[WARNING] {len(unmapped_genes)} gene symbol(s) could not be mapped to an Ensembl ID")

# === Step 3: Get Variants for Each Ensembl Gene ID ===
gene_to_variants = {}
headers = {"Content-Type": "application/json"}  # identical for every request
print("🧬 Fetching variants for mapped genes...")
for gene, ensembl_id in tqdm(gene_to_ensembl.items()):
    url = f"https://rest.ensembl.org/overlap/id/{ensembl_id}?feature=variation"
    try:
        # The timeout stops one stalled connection from freezing the loop.
        response = requests.get(url, headers=headers, timeout=60)
        variants = response.json() if response.ok else []
    except (requests.RequestException, ValueError) as exc:
        # Network failure or non-JSON body: keep the gene with an empty
        # variant list, as before, but say so. Catching only these errors
        # (rather than a bare except) lets a kernel interrupt stop the loop.
        print(f"[WARNING] Variant lookup failed for {gene} ({ensembl_id}): {exc}")
        variants = []

    if not isinstance(variants, list):
        variants = []  # unexpected payload shape: treat as "no variants"
    # Keep only the identifiers of entries that actually carry an "id"
    gene_to_variants[gene] = [
        v["id"] for v in variants if isinstance(v, dict) and "id" in v
    ]

# === Step 4: Fetch Detailed Variant Info ===
def get_variant_details(variant_id):
    try:
        url = f"https://rest.ensembl.org/variation/human/{variant_id}?"
        headers = {"Content-Type": "application/json"}
        response = requests.get(url, headers=headers)
        if response.ok:
            data = response.json()
            consequence = data.get("most_severe_consequence", "N/A")
            allele_string = data.get("mappings", [{}])[0].get("allele_string", "N/A")
            if "/" in allele_string:
                ref, alt = allele_string.split("/")
            else:
                ref, alt = "N/A", "N/A"
            substitution = f"{ref}→{alt}"
            return consequence, substitution
        else:
            return "N/A", "N/A"
    except:
        return "N/A", "N/A"

# === Step 5: Build Final Enriched Variant Table ===
# One output row per (gene, variant) pair; genes with no variants add no rows.
detailed_variant_data = []
print("🔎 Fetching variant details for each variant...")
for gene_symbol, variant_ids in tqdm(gene_to_variants.items()):
    # Genes absent from the symbol-to-Ensembl mapping are labelled "N/A"
    ensembl_gene_id = gene_to_ensembl.get(gene_symbol, "N/A")
    for variant_id in variant_ids:
        consequence, substitution = get_variant_details(variant_id)
        row = dict(
            Gene_Symbol=gene_symbol,
            Ensembl_ID=ensembl_gene_id,
            Variant_ID=variant_id,
            Variant_Consequence=consequence,
            Base_Substitution=substitution,
        )
        detailed_variant_data.append(row)
        # Short pause after every lookup to stay polite to the API
        time.sleep(0.1)

# === Step 6: Save to CSV ===
# Naming the columns explicitly guarantees the CSV always gets its header
# row: with an empty result list, pd.DataFrame([]) has no columns at all and
# the written file would carry no header.
output_columns = [
    "Gene_Symbol",
    "Ensembl_ID",
    "Variant_ID",
    "Variant_Consequence",
    "Base_Substitution",
]
df_full = pd.DataFrame(detailed_variant_data, columns=output_columns)
df_full.to_csv("gene_to_variants_latest_file.csv", index=False)
print("✅ Saved enriched variant data to 'gene_to_variants_latest_file.csv'")
