In [1]:
!pip install gdown
import gdown



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [6]:
import os

file_id = "17HnxRmgly3jsduVtEBieBuHaYmA-ARaY"  # Google Drive file ID taken from the share link
output_path = "CTD_genes_diseases.tsv.gz"  # Save location

# Download the file and keep the return value. Per gdown's documentation the
# call returns the output path on success; versions that do not raise on
# failure return None instead.
downloaded_path = gdown.download(
    f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False
)

# Verify using the return value as well as the filesystem: a file left over
# from an earlier run would make a bare os.path.exists() check report success
# even though this particular download failed.
if downloaded_path is not None and os.path.exists(downloaded_path):
    print("✅ File downloaded successfully:", output_path)
else:
    print("❌ Download failed! Check the link.")

Downloading...
From (original): https://drive.google.com/uc?id=17HnxRmgly3jsduVtEBieBuHaYmA-ARaY
From (redirected): https://drive.google.com/uc?id=17HnxRmgly3jsduVtEBieBuHaYmA-ARaY&confirm=t&uuid=70dedeb9-7442-4cd8-9521-58cd83358fb7
To: e:\DRDO KGE Project\Data Preprocessing\CTD_genes_diseases.tsv.gz
  0%|          | 10.5M/2.89G [00:01<08:04, 5.94MB/s]

KeyboardInterrupt: 

  0%|          | 11.5M/2.89G [00:16<08:04, 5.94MB/s]

In [4]:
import gzip

file_path = "CTD_genes_diseases.tsv.gz"
preview_line_count = 48  # number of leading lines to print

# Preview the start of the archive without decompressing all of it.
# zip() against a range stops cleanly when the file has fewer lines than
# requested, where a bare next(f) would raise StopIteration.
with gzip.open(file_path, "rt", encoding="utf-8", errors="replace") as f:
    for _, line in zip(range(preview_line_count), f):
        # Each line still carries its own trailing newline, so suppress the
        # one print() would add (otherwise the output is double-spaced).
        print(line, end="")

# The Comparative Toxicogenomics Database (CTD) - http://ctdbase.org/

#   Copyright 2002-2012 MDI Biological Laboratory. All rights reserved.

#   Copyright 2012-2025 NC State University. All rights reserved.

#  

# 

# Use is subject to the terms set forth at http://ctdbase.org/about/legal.jsp

# These terms include:

# 

#   1. All forms of publication (e.g., web sites, research papers, databases,

#      software applications, etc.) that use or rely on CTD data must cite CTD.

#      Citation guidelines: http://ctdbase.org/about/publications/#citing

# 

#   2. All electronic or online applications must include hyperlinks from 

#      contexts that use CTD data to the applicable CTD data pages.

#      Linking instructions: http://ctdbase.org/help/linking.jsp

# 

#   3. You must notify CTD, and describe your use of our data:

#      http://ctdbase.org/help/contact.go

# 

#   4. For quality control purposes, you must provide CTD with periodic 

#      access to your publication 

Code for Chemical - Disease CSV

In [4]:
import pandas as pd

# Extract, for every disease keyword, the rows of the CTD chemical–disease CSV
# whose DiseaseName contains that keyword, write one CSV per keyword, and write
# a summary CSV with the number of rows found for each keyword.

# File path
file_path = "Original_data/CTD_chemicals_diseases.csv"

# Column headers (supplied explicitly because the leading lines of the file
# are skipped below, so pandas never sees a header row)
columns = [
    "ChemicalName",
    "ChemicalID",
    "CasRN",
    "DiseaseName",
    "DiseaseID",
    "DirectEvidence",
    "InferenceGeneSymbol",
    "InferenceScore",
    "OmimIDs",
    "PubMedIDs",
]

# List of diseases to search.
# NOTE(review): each entry is matched as a case-insensitive *substring* of
# DiseaseName, so short entries such as "ILD" also hit names that merely
# contain those letters (e.g. inside "child" or "mild"). Confirm the matches
# are the intended ones before relying on the counts.
disease_list = [
    "Heart Disease",
    "Stroke",
    "Hypertension",
    "Diabetes Mellitus",
    "COPD",
    "Asthma",
    "ILD",
    "Lung Cancer",
    "Breast Cancer",
    "Cervical Cancer",
    "Oral Cancer",
    "Liver Cancer",
    "Chronic Kidney Disease",
    "Cirrhosis",
    "Hepatitis",
    "Fatty Liver Disease",
    "Epilepsy",
    "Alzheimer",
    "Parkinson",
    "Rheumatoid Arthritis",
    "Lupus",
    "Psoriasis",
    "Osteoarthritis",
    "Tuberculosis",
    "Pneumonia",
    "Obesity",
    "Acute Mountain Sickness",
    "Chronic Mountain Sickness",
    "Pulmonary Edema",
    "Cerebral Edema",
    "Systemic Hypertension",
    "Sleep disorders",
    "retinal hemorrhage",
    "Deep Vein Thrombosis",
    "Hypoxia",
]

# Process in chunks so the whole file never has to fit in memory
chunk_size = 10000  # Read in chunks of 10,000 rows

# disease -> list of per-chunk matches. The pieces are concatenated once after
# the loop; concatenating inside the loop re-copies all rows accumulated so far
# on every chunk, which is quadratic in the number of matching rows.
matched_frames = {}

# Read file in chunks
for chunk in pd.read_csv(
    file_path,
    sep=",",
    # Drops the first 28 lines of the file — presumably CTD's comment preamble
    # and header row; confirm if the export format changes.
    skiprows=28,
    names=columns,
    # compression="gzip",  # for reading a gzipped export instead
    dtype=str,
    quotechar='"',
    on_bad_lines="skip",  # malformed rows are dropped silently
    chunksize=chunk_size,
    # Only the empty string is treated as missing; literal text such as "NA"
    # is kept as data.
    keep_default_na=False,
    na_values=[""],
):
    disease_names = chunk["DiseaseName"]
    for disease in disease_list:
        df_filtered = chunk[
            disease_names.str.contains(disease, case=False, na=False, regex=True)
        ]

        if not df_filtered.empty:
            matched_frames.setdefault(disease, []).append(df_filtered)

# One DataFrame per disease that had at least one match, rows in file order.
disease_counts = {
    disease: pd.concat(frames, ignore_index=True)
    for disease, frames in matched_frames.items()
}

# Save filtered data and count summary.
# Diseases without any matching row have no entry in disease_counts, so they
# get neither a CSV file nor a row in the summary.
summary_data = []
for disease, df in disease_counts.items():
    df.to_csv(f"{disease}_filtered.csv", index=False)
    row_count = len(df)
    summary_data.append([disease, row_count])
    print(f"✅ Extracted data saved as '{disease}_filtered.csv' with {row_count} rows")

# Save disease counts summary
count_df = pd.DataFrame(summary_data, columns=["DiseaseName", "Count"])
count_df.to_csv("Disease_Counts.csv", index=False)
print("✅ Disease counts saved as 'Disease_Counts.csv'")

✅ Extracted data saved as 'Heart Disease_filtered.csv' with 12745 rows
✅ Extracted data saved as 'Stroke_filtered.csv' with 38358 rows
✅ Extracted data saved as 'Hypertension_filtered.csv' with 110600 rows
✅ Extracted data saved as 'Diabetes Mellitus_filtered.csv' with 149254 rows
✅ Extracted data saved as 'Asthma_filtered.csv' with 48176 rows
✅ Extracted data saved as 'ILD_filtered.csv' with 3142 rows
✅ Extracted data saved as 'Cirrhosis_filtered.csv' with 230406 rows
✅ Extracted data saved as 'Hepatitis_filtered.csv' with 50976 rows
✅ Extracted data saved as 'Fatty Liver Disease_filtered.csv' with 37448 rows
✅ Extracted data saved as 'Epilepsy_filtered.csv' with 30950 rows
✅ Extracted data saved as 'Alzheimer_filtered.csv' with 37875 rows
✅ Extracted data saved as 'Parkinson_filtered.csv' with 44444 rows
✅ Extracted data saved as 'Rheumatoid Arthritis_filtered.csv' with 2539 rows
✅ Extracted data saved as 'Lupus_filtered.csv' with 20185 rows
✅ Extracted data saved as 'Psoriasis_filte

In [None]:
import requests

# URL of GO basic JSON file
url = "http://purl.obolibrary.org/obo/go/go-basic.json"
file_path = "go-basic.json"

# Download the file
print("Downloading GO basic JSON file...")

# timeout=(connect, read) in seconds: without it requests waits indefinitely on
# a stalled server. A timeout surfaces as requests.exceptions.Timeout.
# The `with` block releases the streamed connection even if writing fails.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    if response.status_code == 200:
        with open(file_path, "wb") as f:
            # Stream to disk in 64 KiB pieces so the body is never held in
            # memory all at once.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        print("✅ Download complete: go-basic.json")
    else:
        print("❌ Download failed. Check the URL or your internet connection.")

Downloading GO basic JSON file...
✅ Download complete: go-basic.json


In [None]:
import json

# Load the GO JSON file
file_path = "go-basic.json"

# Pass the encoding explicitly: without it open() decodes with the locale's
# default codec (e.g. cp1252 on Windows), which can fail or garble non-ASCII
# characters in a UTF-8 JSON file.
with open(file_path, "r", encoding="utf-8") as f:
    go_data = json.load(f)

# Print top-level structure
print("Top-level keys:", go_data.keys())

Top-level keys: dict_keys(['graphs'])


In [None]:
# Summarise the 'graphs' entry, one value per output line:
#   1. its container type (expected: list)
#   2. how many graphs it holds
#   3. the keys available inside the first graph
print(
    type(go_data["graphs"]),
    len(go_data["graphs"]),
    go_data["graphs"][0].keys(),
    sep="\n",
)

<class 'list'>
1
dict_keys(['id', 'meta', 'nodes', 'edges', 'propertyChainAxioms'])


In [None]:
import json

# Load the GO JSON file
file_path = "go-basic.json"

# Explicit encoding: without it open() decodes with the locale's default codec
# (e.g. cp1252 on Windows) rather than UTF-8.
with open(file_path, "r", encoding="utf-8") as f:
    go_data = json.load(f)

# Extract ontology terms
terms = go_data["graphs"][0]["nodes"]

# Search for broader terms.
# NOTE(review): matching is a case-insensitive *substring* test, so a keyword
# also hits longer words that contain it — "Edema" matches the synonym
# "Clostridium oedematiens ..." of GO_0034480, for example.
search_terms = ["Edema"]
# Lower-case the keywords once instead of once per field of every node.
needles = [search_term.lower() for search_term in search_terms]
matching_terms = []

for term in terms:
    term_id = term.get("id", "N/A")
    label = term.get("lbl", "N/A")
    meta = term.get("meta", {})
    definition = meta.get("definition", {}).get("val", "")

    # Convert synonyms to a list of values
    synonym_list = [syn["val"] for syn in meta.get("synonyms", [])]

    # Every text field of this node that is searched, lower-cased once.
    searchable = [text.lower() for text in (label, definition, *synonym_list)]

    # A node is recorded at most once, no matter how many keywords or fields hit.
    if any(needle in text for needle in needles for text in searchable):
        matching_terms.append(
            {
                "ID": term_id,
                "Label": label,
                "Definition": definition,
                "Synonyms": synonym_list,
            }
        )

# Display results
if matching_terms:
    print(f"✅ Found {len(matching_terms)} GO terms related to {search_terms}:")
    for term in matching_terms:
        print(
            f"ID: {term['ID']}\nLabel: {term['Label']}\nDefinition: {term['Definition']}\nSynonyms: {term['Synonyms']}\n"
        )
else:
    print(
        f"❌ No matching GO terms found for {search_terms}. Try alternative keywords."
    )

✅ Found 2 GO terms related to ['Edema']:
ID: http://purl.obolibrary.org/obo/GO_0034480
Label: phosphatidylcholine phospholipase C activity
Definition: Catalysis of the reaction: a 1,2-diacyl-sn-glycero-3-phosphocholine + H2O = a 1,2-diacyl-sn-glycerol + H+ + phosphocholine.
Synonyms: ['phospholipase C, acting on phosphatidylcholine', 'Clostridium oedematiens beta- and gamma-toxins activity', 'Clostridium welchii alpha-toxin activity', 'heat-labile hemolysin', 'lipophosphodiesterase I activity', 'phosphatidylcholine cholinephosphohydrolase activity']

ID: http://purl.obolibrary.org/obo/GO_0044398
Label: venom-mediated edema in another organism
Definition: A process by which an organism causes swelling of soft tissues in another organism via the action of a venom. Edema is the result of excess water accumulation in tissues.
Synonyms: ['envenomation resulting in induction of edema in another organism', 'envenomation resulting in induction of edema in other organism', 'envenomation resulti