In [16]:
"""
Simple Knowledge Graph Query Tools
Two functions: Get Gene ID + Find Diseases
"""

import json
from typing import Dict, List, Optional

import requests


class KnowledgeGraphTools:
    """Small client for gene -> disease lookups.

    Resolves a gene symbol to an ID via the esearch endpoint stored in
    ``ncbi_url`` and then posts a TRAPI query to ``base_url`` asking for
    disease nodes related to that gene.
    """

    def __init__(self):
        # Root of the TRAPI service; "/query" is appended when posting.
        self.base_url = "https://api.bte.ncats.io/v1"
        # esearch endpoint used for the symbol -> gene ID lookup.
        self.ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    def get_gene_id(self, gene_symbol: str) -> Optional[str]:
        """Look up the Entrez Gene ID for a human gene symbol.

        Args:
            gene_symbol: Gene symbol to search for (e.g. "MUTYH").

        Returns:
            The first ID in the search result's ``idlist``, or ``None`` when
            the list is empty or the request/decoding fails (the failure is
            printed, not raised).
        """
        params = {
            'db': 'gene',
            'term': f"{gene_symbol}[Gene Name] AND human[Organism]",
            'retmode': 'json'
        }

        try:
            response = requests.get(self.ncbi_url, params=params, timeout=10)
            # Report HTTP failures through the except branch instead of
            # parsing an error body as if it were a search result.
            response.raise_for_status()
            data = response.json()
            ids = data.get('esearchresult', {}).get('idlist', [])
            return ids[0] if ids else None

        except Exception as e:
            # Best-effort lookup: callers only distinguish "found" from "not found".
            print(f"Error looking up {gene_symbol}: {e}")
            return None

    def find_diseases(self, gene_symbol: str, gene_id: str,
                      max_results: Optional[int] = 15) -> Dict:
        """Find diseases associated with a gene.

        Args:
            gene_symbol: Symbol echoed back in the returned dict.
            gene_id: Gene ID; sent to the service as ``NCBIGene:<gene_id>``.
            max_results: Maximum number of results to parse (``None`` parses
                all of them). Defaults to 15.

        Returns:
            On success a dict with ``gene_symbol``, ``gene_id``, ``diseases``
            (list of ``{"id", "name"}``), ``count`` and ``status="success"``.
            On a non-200 response or any exception, a dict with
            ``gene_symbol``, ``gene_id``, ``error`` and ``status="error"``.
        """
        query = {
            "message": {
                "query_graph": {
                    "nodes": {
                        "gene": {
                            "ids": [f"NCBIGene:{gene_id}"],
                            "categories": ["biolink:Gene"]
                        },
                        "disease": {
                            "categories": ["biolink:Disease"]
                        }
                    },
                    "edges": {
                        "e": {
                            "subject": "gene",
                            "object": "disease",
                            "predicates": ["biolink:related_to"]
                        }
                    }
                }
            }
        }

        try:
            response = requests.post(f"{self.base_url}/query", json=query, timeout=120)

            if response.status_code != 200:
                return {
                    "gene_symbol": gene_symbol,
                    "gene_id": gene_id,
                    "error": f"API error: {response.status_code}",
                    "status": "error"
                }

            diseases = self._parse_results(response.json(), max_results)
            return {
                "gene_symbol": gene_symbol,
                "gene_id": gene_id,
                "diseases": diseases,
                "count": len(diseases),
                "status": "success"
            }

        except Exception as e:
            # Network, decoding and parsing failures all become an error dict
            # so callers can branch on "status" without a try/except.
            return {
                "gene_symbol": gene_symbol,
                "gene_id": gene_id,
                "error": str(e),
                "status": "error"
            }

    def _parse_results(self, data: Dict,
                       max_results: Optional[int] = 15) -> List[Dict]:
        """Extract the disease ID + name from a TRAPI response.

        Args:
            data: Decoded response body.
            max_results: Number of leading results to inspect; ``None``
                inspects every result.

        Returns:
            One ``{"id", "name"}`` dict per node bound to ``disease`` in the
            inspected results. ``name`` falls back to the node ID when the
            knowledge graph has no usable name for it.
        """
        # "or {}" / "or []" also cover keys that are present but null.
        message = data.get('message') or {}
        results = message.get('results') or []
        # Resolve the node map once instead of once per bound node.
        kg_nodes = (message.get("knowledge_graph") or {}).get("nodes") or {}

        print(f"Total disease associations found: {len(results)}")

        diseases = []
        for result in results[:max_results]:
            bindings = result.get('node_bindings') or {}

            for node in bindings.get('disease') or []:
                curie = node.get("id", "Unknown")
                # Missing node, missing name, or null name -> use the ID itself.
                name = (kg_nodes.get(curie) or {}).get("name") or curie

                diseases.append({
                    "id": curie,
                    "name": name
                })

        return diseases


"""
Test with single gene - Validation below
"""

# Build the client and pick the gene symbol to validate against.
kg = KnowledgeGraphTools()
test_gene = "MUTYH"

print(f"Testing with gene: {test_gene}")
print("=" * 60)

# Step 1: resolve the symbol to a gene ID.
print("\nStep 1: Looking up gene ID...")
gene_id = kg.get_gene_id(test_gene)

if not gene_id:
    # Nothing to query without an ID.
    print(f"✗ Could not find gene ID for {test_gene}")
else:
    print(f"✓ Found ID: {gene_id}")

    # Step 2: ask the knowledge graph for diseases linked to that ID.
    print("\nStep 2: Querying Knowledge Graph...")
    result = kg.find_diseases(test_gene, gene_id)

    # Dump the raw result dict first, then a numbered summary on success.
    print("\nResults:")
    print(json.dumps(result, indent=2))

    if result['status'] == 'success':
        print(f"\n✓ Found {result['count']} disease associations")
        print("\nDiseases:")
        for rank, entry in enumerate(result['diseases'], start=1):
            print(f"  {rank}. {entry['name']} ({entry['id']})")


Testing with gene: MUTYH

Step 1: Looking up gene ID...
✓ Found ID: 4595

Step 2: Querying Knowledge Graph...
Total disease associations found: 70

Results:
{
  "gene_symbol": "MUTYH",
  "gene_id": "4595",
  "diseases": [
    {
      "id": "MONDO:0012041",
      "name": "familial adenomatous polyposis 2"
    },
    {
      "id": "MONDO:0001056",
      "name": "gastric cancer"
    },
    {
      "id": "MONDO:0004950",
      "name": "Stomach cancer"
    },
    {
      "id": "MONDO:0005575",
      "name": "colorectal cancer"
    },
    {
      "id": "MONDO:0004992",
      "name": "cancer"
    },
    {
      "id": "MONDO:0007254",
      "name": "breast cancer"
    },
    {
      "id": "MONDO:0021063",
      "name": "colon cancer"
    },
    {
      "id": "MONDO:0016419",
      "name": "hereditary breast carcinoma"
    },
    {
      "id": "HP:0001442",
      "name": "Typified by somatic mosaicism"
    },
    {
      "id": "MONDO:0018604",
      "name": "familial colorectal cancer type X"
 

In [3]:
import asyncio
import httpx
import json

# --- CONSTANTS ---
TRAPI_URL = "https://api.bte.ncats.io/v1/query" 
NCBI_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# --- HELPER FUNCTIONS (The Kitchen Logic) ---

async def query_translator_kg(entrez_id: str, target_category: str):
    """Post a one-hop TRAPI lookup from a gene to nodes of ``target_category``.

    The gene is addressed as ``NCBIGene:<entrez_id>`` and linked to the target
    node through ``biolink:related_to``. Returns the decoded JSON body of the
    response; if the request, the status check, or the decoding raises, the
    error is printed and an empty dict is returned instead.
    """
    gene_node = {"ids": [f"NCBIGene:{entrez_id}"], "categories": ["biolink:Gene"]}
    target_node = {"categories": [target_category]}
    gene_to_target = {
        "subject": "n0",
        "object": "n1",
        "predicates": ["biolink:related_to"],
    }
    payload = {
        "message": {
            "query_graph": {
                "nodes": {"n0": gene_node, "n1": target_node},
                "edges": {"e01": gene_to_target},
            }
        },
        "workflow": [{"id": "lookup"}],
    }

    # A fresh client per call; the context manager closes it on exit.
    async with httpx.AsyncClient(timeout=60.0) as session:
        try:
            print(f"   -> Sending query for {target_category}...")
            reply = await session.post(TRAPI_URL, json=payload)
            reply.raise_for_status()
            return reply.json()
        except Exception as e:
            print(f"Error querying {target_category}: {e}")
            return {}

def extract_associations(kg_result, type_label):
    """Flatten a TRAPI response into a de-duplicated list of target nodes.

    Args:
        kg_result: Decoded response body (a dict). An empty dict yields ``[]``.
        type_label: Value stored under ``"type"`` in every returned item.

    Returns:
        A list of ``{"name", "type", "id"}`` dicts, one per distinct node
        bound to ``n1`` that also appears in the knowledge graph's node map,
        in first-seen order. ``name`` falls back to the node ID when the
        node has no usable name. If the payload is malformed and parsing
        raises, a single-item list ``[{"error": <message>}]`` is returned.
    """
    items = []
    # IDs already emitted; set membership keeps dedup linear in the number
    # of bindings instead of rescanning ``items`` for every node.
    seen = set()

    try:
        # "or {}" / "or []" also cover keys that are present but null.
        message = kg_result.get("message") or {}
        results = message.get("results") or []
        nodes_map = (message.get("knowledge_graph") or {}).get("nodes") or {}

        for result in results:
            node_bindings = result.get("node_bindings") or {}

            for node in node_bindings.get("n1") or []:
                node_id = node.get("id")
                # Skip nodes without knowledge-graph metadata and repeats.
                if node_id not in nodes_map or node_id in seen:
                    continue

                seen.add(node_id)
                items.append({
                    # A missing or null name falls back to the ID.
                    "name": nodes_map[node_id].get("name") or node_id,
                    "type": type_label,
                    "id": node_id
                })
        return items
    except Exception as e:
        # Parsing boundary for external JSON: report instead of raising.
        return [{"error": str(e)}]

async def get_gene_info(gene_symbol: str):
    """Convert Gene Symbol (e.g., APOE) to ID (e.g., 348).

    Sends an esearch request to ``NCBI_URL`` restricted to the ``gene``
    database and ``Homo sapiens``, asking for at most one hit.

    Args:
        gene_symbol: Gene symbol to search for.

    Returns:
        The first ID in the result's ``idlist`` as returned by the service,
        or ``None`` when the list is empty or the request, status check, or
        decoding fails (the failure is printed, not raised).
    """
    params = {
        "db": "gene",
        "term": f"{gene_symbol}[Gene Name] AND Homo sapiens[Organism]",
        "retmode": "json",
        "retmax": 1
    }

    async with httpx.AsyncClient(timeout=10.0) as client:
        try:
            print(f"1. Looking up ID for {gene_symbol}...")
            response = await client.get(NCBI_URL, params=params)
            # Same status check as query_translator_kg: an HTTP error is
            # reported below rather than parsed as a search result.
            response.raise_for_status()
            data = response.json()
            id_list = data.get("esearchresult", {}).get("idlist", [])
            return id_list[0] if id_list else None
        except Exception as e:
            # Best-effort lookup: the caller only checks for a truthy ID.
            print(f"Error getting gene info: {e}")
            return None

# --- THE TEST RUNNER (Instead of the Server) ---
# Jupyter naturally handles await, so we can write this directly.

# CHANGE THIS to test different genes
TEST_GENE = "ACTG1" 

print(f"--- STARTING TEST FOR {TEST_GENE} ---")

# 1. Get the ID
entrez_id = await get_gene_info(TEST_GENE)

if entrez_id:
    print(f"   -> Found Entrez ID: {entrez_id}")
    
    # 2. Run queries for Diseases and Drugs
    # We use await here to run them one after another for the test (simpler to debug)
    # or use asyncio.gather for speed. Let's use gather to match the server code.
    print("2. Querying Knowledge Graph for Diseases and Drugs...")
    
    task_disease = query_translator_kg(entrez_id, "biolink:Disease")
    task_drug = query_translator_kg(entrez_id, "biolink:ChemicalEntity")
    
    disease_raw, drug_raw = await asyncio.gather(task_disease, task_drug)
    
    # 3. Clean Data
    diseases = extract_associations(disease_raw, "Disease")
    drugs = extract_associations(drug_raw, "Drug")
    
    # 4. Print Results
    print("\n" + "="*40)
    print(f"RESULTS FOR {TEST_GENE}")
    print("="*40)
    
    print(f"\n--- Top 5 Diseases (out of {len(diseases)}) ---")
    for d in diseases[:5]:
        print(f"- {d['name']} ({d['id']})")
        
    print(f"\n--- Top 5 Drugs/Chemicals (out of {len(drugs)}) ---")
    for d in drugs[:5]:
        print(f"- {d['name']} ({d['id']})")
        
else:
    print("Gene not found!")

--- STARTING TEST FOR ACTG1 ---
1. Looking up ID for ACTG1...
   -> Found Entrez ID: 71
2. Querying Knowledge Graph for Diseases and Drugs...
   -> Sending query for biolink:Disease...
   -> Sending query for biolink:ChemicalEntity...

RESULTS FOR ACTG1

--- Top 5 Diseases (out of 346) ---
- Baraitser-Winter cerebrofrontofacial syndrome (MONDO:0017579)
- autosomal dominant nonsyndromic hearing loss 20 (MONDO:0011480)
- lissencephaly (MONDO:0018838)
- nonsyndromic genetic hearing loss (MONDO:0019497)
- Baraitser-winter syndrome 2 (MONDO:0013812)

--- Top 5 Drugs/Chemicals (out of 1672) ---
- trametinib dimethyl sulfoxide (CHEBI:75991)
- Sirolimus (CHEBI:9168)
- (E)-SB-590885 (CHEBI:131881)
- BI 2536 (UNII:4LJG22T9C6)
- JQ1 (CHEBI:137113)
