# Notebook 03: Semantic Subgraph Extraction

## Objective

Extract neighborhoods WITH semantic relations (entity-relation-entity triplets),
not just equivalent_ids like v2.

## Key Difference from v2

```
v2: glucose -> [CHEBI:4167, HMDB:HMDB0000122, KEGG:C00031]
    (Same entity, different IDs = vocabulary transition)

v3: glucose -> participates_in -> Glycolysis
    glucose -> substrate_of -> Hexokinase
    (Different entities, semantic relations = TRUE hop)
```

In [1]:
# Standard imports
import sys
import json
from pathlib import Path
from datetime import datetime
from collections import Counter, defaultdict
from dataclasses import asdict

# Add project root to path
PROJECT_ROOT = Path.cwd().parents[1]
sys.path.insert(0, str(PROJECT_ROOT / 'src'))
sys.path.insert(0, str(Path.cwd()))

# Import utilities
from kg_o1_v3_utils import (
    test_one_hop, get_predicates, parse_one_hop_edges,
    classify_predicate,
    hybrid_search, text_search,
    save_json, load_json,
    SemanticTriple, SemanticSubgraph,
    TEST_ENTITIES, SEMANTIC_PREDICATES,
)

# Output directory
OUTPUT_DIR = Path.cwd() / 'outputs'
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Output directory: {OUTPUT_DIR}")

Project root: /home/trentleslie/Insync/projects/biomapper2
Output directory: /home/trentleslie/Insync/projects/biomapper2/notebooks/kg_o1_v3/outputs


## 1. Define Seed Entities

Use entities that are likely to have semantic relations.

In [2]:
# Extended seed entities for subgraph extraction
SEED_ENTITIES = [
    # Core metabolites
    ("CHEBI:4167", "glucose", "SmallMolecule"),
    ("CHEBI:15846", "NAD+", "SmallMolecule"),
    ("CHEBI:16113", "cholesterol", "SmallMolecule"),
    ("CHEBI:17234", "alanine", "SmallMolecule"),
    ("CHEBI:30616", "ATP", "SmallMolecule"),
    ("CHEBI:15377", "water", "SmallMolecule"),
    
    # Amino acids
    ("CHEBI:16015", "L-leucine", "SmallMolecule"),
    ("CHEBI:17295", "L-tryptophan", "SmallMolecule"),
    ("CHEBI:17823", "L-tyrosine", "SmallMolecule"),
    ("CHEBI:33704", "L-valine", "SmallMolecule"),
    
    # TCA cycle
    ("CHEBI:16947", "citric acid", "SmallMolecule"),
    ("CHEBI:15361", "pyruvic acid", "SmallMolecule"),
    ("CHEBI:15562", "L-lactic acid", "SmallMolecule"),
    
    # Vitamins
    ("CHEBI:17154", "nicotinamide", "SmallMolecule"),
    ("CHEBI:16709", "pyridoxine", "SmallMolecule"),
    
    # Lipids
    ("CHEBI:16526", "carbon dioxide", "SmallMolecule"),
    ("CHEBI:17855", "triglyceride", "SmallMolecule"),
    
    # Nucleotides
    ("CHEBI:15422", "ATP", "SmallMolecule"),
    ("CHEBI:16335", "adenine", "SmallMolecule"),
    ("CHEBI:17258", "UDP-glucose", "SmallMolecule"),
]

print(f"Total seed entities: {len(SEED_ENTITIES)}")

Total seed entities: 20


## 2. Extract Semantic Subgraphs

For each entity, extract all semantic relations (filtering out equivalency).

In [3]:
def extract_semantic_subgraph(
    entity_id: str,
    entity_name: str,
    entity_category: str,
    limit: int = 100,
) -> SemanticSubgraph:
    """
    Extract semantic subgraph for an entity.
    Filters out equivalency predicates to get only true semantic relations.
    """
    subgraph = SemanticSubgraph(
        center_entity_id=entity_id,
        center_entity_name=entity_name,
        center_entity_category=entity_category,
    )
    
    # Get outgoing relations (forward direction)
    forward_result = test_one_hop(entity_id, direction="forward", limit=limit)
    
    # Use parse_one_hop_edges to correctly extract edges with predicates
    forward_edges = parse_one_hop_edges(forward_result)
    
    for edge in forward_edges:
        pred = edge.get('predicate', '')
        classification = classify_predicate(pred)
        
        # Only include semantic relations
        if classification == 'semantic' or classification == 'unknown':
            triple = SemanticTriple(
                subject_id=entity_id,
                subject_name=entity_name,
                predicate=pred,
                object_id=edge.get('object_id', edge.get('end_node_id', '')),
                object_name=edge.get('object_name', edge.get('end_node_name', '')),
                object_category=edge.get('object_category', edge.get('category', '')),
            )
            subgraph.outgoing_relations.append(triple)
    
    # Get incoming relations (reverse direction)
    reverse_result = test_one_hop(entity_id, direction="reverse", limit=limit)
    
    # Use parse_one_hop_edges for incoming edges too
    reverse_edges = parse_one_hop_edges(reverse_result)
    
    for edge in reverse_edges:
        pred = edge.get('predicate', '')
        classification = classify_predicate(pred)
        
        if classification == 'semantic' or classification == 'unknown':
            # For incoming, the subject is the other entity
            triple = SemanticTriple(
                subject_id=edge.get('subject_id', edge.get('subject', '')),
                subject_name=edge.get('subject_name', edge.get('start_node_name', '')),
                predicate=pred,
                object_id=entity_id,
                object_name=entity_name,
                object_category=entity_category,
            )
            subgraph.incoming_relations.append(triple)
    
    return subgraph

print("Extraction function defined.")

Extraction function defined.


In [4]:
# Extract subgraphs for all seed entities
print("="*60)
print("Extracting semantic subgraphs")
print("="*60)

subgraphs = []
stats = {
    'total_entities': len(SEED_ENTITIES),
    'entities_with_relations': 0,
    'total_outgoing': 0,
    'total_incoming': 0,
    'unique_predicates': set(),
    'unique_object_categories': set(),
}

for entity_id, entity_name, entity_category in SEED_ENTITIES:
    subgraph = extract_semantic_subgraph(entity_id, entity_name, entity_category)
    subgraphs.append(subgraph)
    
    n_out = len(subgraph.outgoing_relations)
    n_in = len(subgraph.incoming_relations)
    
    if n_out > 0 or n_in > 0:
        stats['entities_with_relations'] += 1
    
    stats['total_outgoing'] += n_out
    stats['total_incoming'] += n_in
    
    for rel in subgraph.all_relations():
        stats['unique_predicates'].add(rel.predicate)
        stats['unique_object_categories'].add(rel.object_category)
    
    status = "OK" if (n_out + n_in) > 0 else "--"
    print(f"[{status}] {entity_name}: {n_out} outgoing, {n_in} incoming")

stats['unique_predicates'] = list(stats['unique_predicates'])
stats['unique_object_categories'] = list(stats['unique_object_categories'])

Extracting semantic subgraphs
[OK] glucose: 291 outgoing, 256 incoming
[OK] NAD+: 132 outgoing, 206 incoming
[OK] cholesterol: 175 outgoing, 229 incoming
[OK] alanine: 291 outgoing, 256 incoming
[OK] ATP: 16 outgoing, 119 incoming
[OK] water: 191 outgoing, 216 incoming
[OK] L-leucine: 267 outgoing, 232 incoming
[OK] L-tryptophan: 211 outgoing, 178 incoming
[OK] L-tyrosine: 326 outgoing, 116 incoming
[OK] L-valine: 39 outgoing, 291 incoming
[OK] citric acid: 134 outgoing, 106 incoming
[OK] pyruvic acid: 17 outgoing, 112 incoming
[OK] L-lactic acid: 8 outgoing, 16 incoming
[OK] nicotinamide: 270 outgoing, 156 incoming
[OK] pyridoxine: 248 outgoing, 171 incoming
[OK] carbon dioxide: 206 outgoing, 196 incoming
[OK] triglyceride: 26 outgoing, 286 incoming
[OK] ATP: 183 outgoing, 219 incoming
[OK] adenine: 281 outgoing, 248 incoming
[OK] UDP-glucose: 44 outgoing, 33 incoming


In [5]:
# Summary statistics
print("\n" + "="*60)
print("EXTRACTION SUMMARY")
print("="*60)

print(f"\nEntities processed: {stats['total_entities']}")
print(f"Entities with semantic relations: {stats['entities_with_relations']} ({100*stats['entities_with_relations']/stats['total_entities']:.0f}%)")
print(f"\nTotal relations extracted:")
print(f"  - Outgoing: {stats['total_outgoing']}")
print(f"  - Incoming: {stats['total_incoming']}")
print(f"  - Total: {stats['total_outgoing'] + stats['total_incoming']}")
print(f"\nUnique predicates: {len(stats['unique_predicates'])}")
for p in stats['unique_predicates'][:10]:
    print(f"  - {p}")
if len(stats['unique_predicates']) > 10:
    print(f"  ... and {len(stats['unique_predicates']) - 10} more")
print(f"\nUnique object categories: {len(stats['unique_object_categories'])}")
for c in stats['unique_object_categories'][:10]:
    print(f"  - {c}")


EXTRACTION SUMMARY

Entities processed: 20
Entities with semantic relations: 20 (100%)

Total relations extracted:
  - Outgoing: 3356
  - Incoming: 3642
  - Total: 6998

Unique predicates: 28
  - biolink:chemically_similar_to
  - biolink:directly_physically_interacts_with
  - biolink:affects
  - biolink:has_participant
  - biolink:close_match
  - biolink:in_clinical_trials_for
  - biolink:has_output
  - biolink:composed_primarily_of
  - biolink:physically_interacts_with
  - biolink:produces
  ... and 18 more

Unique object categories: 2
  - 
  - SmallMolecule


## 3. Show Example Subgraphs

Display a few complete subgraphs to understand the structure.

In [6]:
# Show example subgraphs
print("="*60)
print("EXAMPLE SUBGRAPHS")
print("="*60)

# Find subgraphs with the most relations
sorted_subgraphs = sorted(
    subgraphs,
    key=lambda sg: len(sg.all_relations()),
    reverse=True
)

for sg in sorted_subgraphs[:3]:
    print(f"\n{'='*40}")
    print(f"Entity: {sg.center_entity_name} ({sg.center_entity_id})")
    print(f"Category: {sg.center_entity_category}")
    print(f"{'='*40}")
    
    if sg.outgoing_relations:
        print(f"\nOutgoing relations ({len(sg.outgoing_relations)}):")
        for rel in sg.outgoing_relations[:5]:
            print(f"  --[{rel.predicate}]--> {rel.object_name} [{rel.object_category}]")
        if len(sg.outgoing_relations) > 5:
            print(f"  ... and {len(sg.outgoing_relations) - 5} more")
    
    if sg.incoming_relations:
        print(f"\nIncoming relations ({len(sg.incoming_relations)}):")
        for rel in sg.incoming_relations[:5]:
            print(f"  {rel.subject_name} --[{rel.predicate}]--> [this entity]")
        if len(sg.incoming_relations) > 5:
            print(f"  ... and {len(sg.incoming_relations) - 5} more")

EXAMPLE SUBGRAPHS

Entity: glucose (CHEBI:4167)
Category: SmallMolecule

Outgoing relations (291):
  --[biolink:mentioned_in_clinical_trials_for]--> hypoglycemia []
  --[biolink:in_clinical_trials_for]--> hypoglycemia []
  --[biolink:treats]--> hypoglycemia []
  --[biolink:related_to]--> hypoglycemia []
  --[biolink:in_clinical_trials_for]--> hypoglycemia []
  ... and 286 more

Incoming relations (256):
   --[biolink:subclass_of]--> [this entity]
   --[biolink:subclass_of]--> [this entity]
   --[biolink:subclass_of]--> [this entity]
   --[biolink:subclass_of]--> [this entity]
   --[biolink:subclass_of]--> [this entity]
  ... and 251 more

Entity: alanine (CHEBI:17234)
Category: SmallMolecule

Outgoing relations (291):
  --[biolink:mentioned_in_clinical_trials_for]--> hypoglycemia []
  --[biolink:in_clinical_trials_for]--> hypoglycemia []
  --[biolink:treats]--> hypoglycemia []
  --[biolink:related_to]--> hypoglycemia []
  --[biolink:in_clinical_trials_for]--> hypoglycemia []
  ... and 

## 4. Analyze Relation Patterns

In [7]:
# Analyze predicate-category patterns
print("="*60)
print("PREDICATE-CATEGORY PATTERNS")
print("="*60)

pred_category_counts = Counter()

for sg in subgraphs:
    for rel in sg.all_relations():
        key = (rel.predicate, rel.object_category)
        pred_category_counts[key] += 1

print(f"\nTop predicate-category combinations:")
for (pred, cat), count in pred_category_counts.most_common(20):
    print(f"  {count:4d}x  {pred} --> {cat}")

PREDICATE-CATEGORY PATTERNS

Top predicate-category combinations:
  1058x  biolink:related_to --> 
   937x  biolink:has_participant --> SmallMolecule
   737x  biolink:subclass_of --> SmallMolecule
   555x  biolink:in_clinical_trials_for --> 
   523x  biolink:subclass_of --> 
   429x  biolink:interacts_with --> SmallMolecule
   374x  biolink:chemically_similar_to --> SmallMolecule
   356x  biolink:related_to --> SmallMolecule
   299x  biolink:physically_interacts_with --> SmallMolecule
   251x  biolink:has_part --> SmallMolecule
   187x  biolink:mentioned_in_clinical_trials_for --> 
   165x  biolink:treats --> 
   116x  biolink:has_input --> SmallMolecule
   109x  biolink:physically_interacts_with --> 
   109x  biolink:contraindicated_in --> 
    95x  biolink:has_chemical_role --> 
    92x  biolink:close_match --> 
    86x  biolink:interacts_with --> 
    85x  biolink:close_match --> SmallMolecule
    78x  biolink:applied_to_treat --> 


## 5. Save Semantic Subgraphs

In [8]:
# Save subgraphs
output_data = {
    'timestamp': datetime.now().isoformat(),
    'stats': stats,
    'subgraphs': [sg.to_dict() for sg in subgraphs],
    'predicate_category_patterns': [
        {'predicate': pred, 'category': cat, 'count': count}
        for (pred, cat), count in pred_category_counts.most_common()
    ],
}

save_json(output_data, OUTPUT_DIR / 'semantic_subgraphs.json')
print(f"\nSemantic subgraphs saved to: {OUTPUT_DIR / 'semantic_subgraphs.json'}")


Semantic subgraphs saved to: /home/trentleslie/Insync/projects/biomapper2/notebooks/kg_o1_v3/outputs/semantic_subgraphs.json


## Summary

In [9]:
# Final summary
print("\n" + "="*60)
print("NOTEBOOK 03 COMPLETE")
print("="*60)

total_relations = stats['total_outgoing'] + stats['total_incoming']

print(f"\nExtraction Results:")
print(f"  - Entities processed: {stats['total_entities']}")
print(f"  - Entities with semantic relations: {stats['entities_with_relations']}")
print(f"  - Total semantic relations: {total_relations}")
print(f"  - Unique predicates: {len(stats['unique_predicates'])}")

# Success criteria: 50+ subgraphs with entity-relation-entity triplets
if stats['entities_with_relations'] >= 10 and total_relations >= 50:
    print(f"\nSuccess criteria met! Proceed to NB04.")
elif total_relations > 0:
    print(f"\nPartial success. Consider expanding seed entities.")
else:
    print(f"\nNo semantic relations found. v3 approach may not be viable.")

print(f"\nNext step: NB04 - Multi-Hop Path Discovery")


NOTEBOOK 03 COMPLETE

Extraction Results:
  - Entities processed: 20
  - Entities with semantic relations: 20
  - Total semantic relations: 6998
  - Unique predicates: 28

Success criteria met! Proceed to NB04.

Next step: NB04 - Multi-Hop Path Discovery
