In [None]:
import json
from collections import defaultdict
from pathlib import Path

# File downloaded from https://github.com/hetio/hetionet/tree/main/hetnet/json
# JSON interchange files are UTF-8 encoded; pin the encoding so the load does
# not depend on the platform's locale default.
with open("hetionet-v1.0.json", encoding="utf-8") as f:
    data = json.load(f)

# Destination of the exported records. The parent folder is created up front
# so the save step at the end of the notebook cannot fail on a missing directory.
out_path = Path("data/records/hetionet_cancer_mvp_records_v3.json")
out_path.parent.mkdir(parents=True, exist_ok=True)


In [2]:
# Top-level layout of the loaded dump, followed by the size of its two
# main collections.
print(data.keys())
for section in ("nodes", "edges"):
    print(f"{section}:", len(data[section]))

# Show a handful of raw entries so their field layout is visible.
print("First 5 nodes:")
for sample_node in data["nodes"][:5]:
    print(sample_node)

print("\nFirst 5 edges:")
for sample_edge in data["edges"][:5]:
    print(sample_edge)

dict_keys(['metanode_kinds', 'metaedge_tuples', 'kind_to_abbrev', 'nodes', 'edges'])
nodes: 47031
edges: 2250197
First 5 nodes:
{'kind': 'Molecular Function', 'identifier': 'GO:0031753', 'name': 'endothelial differentiation G-protein coupled receptor binding', 'data': {'source': 'Gene Ontology', 'license': 'CC BY 4.0', 'url': 'http://purl.obolibrary.org/obo/GO_0031753'}}
{'kind': 'Side Effect', 'identifier': 'C0023448', 'name': 'Lymphocytic leukaemia', 'data': {'source': 'UMLS via SIDER 4.1', 'license': 'CC BY-NC-SA 4.0', 'url': 'http://identifiers.org/umls/C0023448'}}
{'kind': 'Gene', 'identifier': 5345, 'name': 'SERPINF2', 'data': {'description': 'serpin peptidase inhibitor, clade F (alpha-2 antiplasmin, pigment epithelium derived factor), member 2', 'source': 'Entrez Gene', 'license': 'CC0 1.0', 'url': 'http://identifiers.org/ncbigene/5345', 'chromosome': '17'}}
{'kind': 'Gene', 'identifier': 9409, 'name': 'PEX16', 'data': {'description': 'peroxisomal biogenesis factor 16', 'source'

In [3]:
from collections import defaultdict, Counter

# Convenience handles on the two collections of the dump.
nodes, edges = data["nodes"], data["edges"]

# Single pass over the nodes that builds both:
#   * node_lookup      - node dict keyed by its (kind, identifier) pair
#   * node_kind_counts - number of nodes per kind
node_lookup = {}
node_kind_counts = Counter()
for node in nodes:
    node_lookup[(node["kind"], node["identifier"])] = node
    node_kind_counts[node["kind"]] += 1

# Report the kinds from most to least frequent.
print("Node type counts:")
for kind_name, kind_total in node_kind_counts.most_common():
    print(f"{kind_name}: {kind_total}")

Node type counts:
Gene: 20945
Biological Process: 11381
Side Effect: 5734
Molecular Function: 2884
Pathway: 1822
Compound: 1552
Cellular Component: 1391
Symptom: 438
Anatomy: 402
Pharmacologic Class: 345
Disease: 137


In [4]:
# Substrings that mark a Disease node as a cancer. The match below is a
# case-insensitive substring test on the node name, so the broad stems
# (e.g. "carcinoma") already cover the more specific phrases listed after them.
cancer_keywords = [
    "cancer", "carcinoma", "sarcoma", "leukemia", "lymphoma", "melanoma", "glioma", "blastoma", "myeloma", "neoplasm",
    "tumor", "tumour", "adenocarcinoma", "squamous cell carcinoma", "basal cell carcinoma", "small cell carcinoma", "non-small cell carcinoma"
]

# Keep every Disease node whose (lower-cased) name contains any keyword.
# A missing or None name is treated as an empty string and never matches.
cancer_diseases = []
for node in nodes:
    if node["kind"] != "Disease":
        continue
    name = (node.get("name") or "").lower()
    if any(keyword in name for keyword in cancer_keywords):
        cancer_diseases.append(node)

# Summary plus a short preview of the matched disease names.
print("Cancer disease nodes found:", len(cancer_diseases))
for n in cancer_diseases[:20]:
    print("-", n["name"])

Cancer diseases ndoes found are:  54
- spinal cancer
- uterine cancer
- urinary bladder cancer
- pancreatic cancer
- salivary gland cancer
- pharynx cancer
- bile duct cancer
- ureter cancer
- melanoma
- breast cancer
- liver cancer
- vascular cancer
- middle ear cancer
- ovarian cancer
- thyroid cancer
- ocular cancer
- muscle cancer
- peripheral nervous system neoplasm
- prostate cancer
- tracheal cancer


In [5]:
# (kind, identifier) keys of the selected cancer disease nodes.
cancer_disease_ids = set()
for disease in cancer_diseases:
    cancer_disease_ids.add((disease["kind"], disease["identifier"]))

# Keep an edge when at least one of its endpoints is a cancer disease.
# Edge endpoints are converted to tuples so they compare against the keys above.
cancer_edges = [
    edge
    for edge in edges
    if not cancer_disease_ids.isdisjoint(
        (tuple(edge["source_id"]), tuple(edge["target_id"]))
    )
]

print("Edges involving cancer diseases: ", len(cancer_edges))

Edges involving cancer diseases:  12123


In [6]:
# Collect the (kind, identifier) key of every endpoint touched by a cancer edge.
subset_node_ids = set()
for edge in cancer_edges:
    subset_node_ids.update((tuple(edge["source_id"]), tuple(edge["target_id"])))

# Walk the insertion-ordered node_lookup instead of the set: iterating a set of
# string-bearing tuples follows hash order, which changes between kernel
# restarts and would make the order of the exported records non-reproducible.
# Endpoints that are absent from node_lookup are skipped.
subset_nodes = [
    node for node_id, node in node_lookup.items() if node_id in subset_node_ids
]
print("Subset nodes count: ", len(subset_nodes))

# Breakdown of the subset by node kind, most frequent first.
subset_node_kind_counts = Counter(node["kind"] for node in subset_nodes)
print("Subset node type counts:")
for k, v in subset_node_kind_counts.most_common():
    print(f"{k}: {v}")

Subset nodes count:  5973
Subset node type counts:
Gene: 5167
Anatomy: 347
Symptom: 259
Compound: 122
Disease: 78


In [7]:
# Node kinds retained for the MVP export.
keep_kinds = {"Disease", "Gene", "Compound"}

# One pass over the subset: keep the nodes of a wanted kind and record
# their (kind, identifier) keys for the edge filter below.
mvp_nodes = []
mvp_node_ids = set()
for candidate in subset_nodes:
    if candidate["kind"] in keep_kinds:
        mvp_nodes.append(candidate)
        mvp_node_ids.add((candidate["kind"], candidate["identifier"]))

# An edge survives only when both of its endpoints are MVP nodes.
mvp_edges = [
    edge
    for edge in cancer_edges
    if {tuple(edge["source_id"]), tuple(edge["target_id"])} <= mvp_node_ids
]

print("MVP nodes:", len(mvp_nodes))
print("MVP edges:", len(mvp_edges))

print("MVP type counts:", Counter(node["kind"] for node in mvp_nodes))

MVP nodes: 5367
MVP edges: 9221
MVP type counts: Counter({'Gene': 5167, 'Compound': 122, 'Disease': 78})


In [None]:
# Index of the MVP nodes by (kind, identifier).
# NOTE: this rebinds the notebook-level name node_lookup to the MVP-only index.
node_lookup = dict(((n["kind"], n["identifier"]), n) for n in mvp_nodes)

# Adjacency map: node key -> list of (neighbour key, edge kind, direction).
# Every edge is registered on both endpoints: "out" on its source and
# "in" on its target.
adj = defaultdict(list)
for e in mvp_edges:
    src, tgt = tuple(e["source_id"]), tuple(e["target_id"])
    relation = e["kind"]
    adj[src].append((tgt, relation, "out"))
    adj[tgt].append((src, relation, "in"))


# Wording of the relation sentences, per kind of the node being described.
# Each entry is (neighbour kind, sentence prefix, maximum neighbours listed),
# in the order the sentences are emitted.
_SEARCH_TEXT_SECTIONS = {
    "Gene": (
        ("Disease", "Associated diseases: ", 25),
        ("Compound", "Related compounds/drugs: ", 20),
        ("Gene", "Interacting/related genes: ", 20),
    ),
    "Compound": (
        ("Disease", "Treats or relates to diseases: ", 25),
        ("Gene", "Targets or linked genes/proteins: ", 25),
        ("Compound", "Related compounds: ", 10),
    ),
    "Disease": (
        ("Gene", "Associated genes: ", 30),
        ("Compound", "Related compounds/drugs: ", 20),
        ("Disease", "Related diseases: ", 20),
    ),
}

# Fallback wording for a node of any other kind.
_SEARCH_TEXT_DEFAULT_SECTIONS = (
    ("Disease", "Related diseases: ", 15),
    ("Gene", "Related genes: ", 15),
    ("Compound", "Related compounds: ", 15),
)

# Maximum number of neighbours of other kinds listed under "Other links".
_SEARCH_TEXT_OTHER_LIMIT = 10


def build_search_text_v3(node, *, adjacency=None, lookup=None):
    """Render a node and its direct neighbours as one line of searchable text.

    The text starts with "<kind>: <name>.", adds the node's description for
    Gene and Compound nodes that have one, and then lists the neighbours
    grouped by their kind, each written as "<neighbour name> (<edge kind>)".
    Within a group duplicates are dropped, first-seen order is kept, and the
    list is capped at the per-kind limit.

    Args:
        node: Node dict with "kind" and "identifier" keys; "name" and "data"
            are optional.
        adjacency: Mapping from a (kind, identifier) key to a list of
            (neighbour key, edge kind, direction) triples. Defaults to the
            notebook-level ``adj``.
        lookup: Mapping from a (kind, identifier) key to its node dict.
            Defaults to the notebook-level ``node_lookup``.

    Returns:
        The sentences joined by single spaces.

    Raises:
        KeyError: If ``node`` lacks "kind" or "identifier".
    """
    if adjacency is None:
        adjacency = adj
    if lookup is None:
        lookup = node_lookup

    kind = node["kind"]
    # A missing or None name renders as empty text rather than the word "None".
    name = node.get("name") or ""
    metadata = node.get("data") or {}

    parts = [f"{kind}: {name}."]
    if kind in ("Gene", "Compound") and metadata.get("description"):
        parts.append(f"Description: {metadata['description']}.")

    # Dicts double as ordered sets: keys keep first-seen order and repeated
    # phrases collapse into one entry.
    grouped = {"Disease": {}, "Gene": {}, "Compound": {}}
    others = {}

    # The direction stored with each neighbour is currently not reflected in
    # the phrase; only the neighbour name and the edge kind are used.
    for nbr_id, edge_kind, _direction in adjacency.get((kind, node["identifier"]), []):
        nbr = lookup.get(nbr_id)
        if not nbr:
            # Neighbour is not part of the lookup; nothing to describe.
            continue

        nbr_kind = nbr["kind"]
        # Fall back to the identifier when the neighbour has no usable name.
        nbr_name = nbr.get("name") or str(nbr["identifier"])
        phrase = f"{nbr_name} ({edge_kind})"

        if nbr_kind in grouped:
            grouped[nbr_kind][phrase] = None
        else:
            others[f"{nbr_kind}: {phrase}"] = None

    sections = _SEARCH_TEXT_SECTIONS.get(kind, _SEARCH_TEXT_DEFAULT_SECTIONS)
    for nbr_kind, prefix, limit in sections:
        phrases = list(grouped[nbr_kind])[:limit]
        if phrases:
            parts.append(prefix + "; ".join(phrases) + ".")

    if others:
        other_phrases = list(others)[:_SEARCH_TEXT_OTHER_LIMIT]
        parts.append("Other links: " + "; ".join(other_phrases) + ".")

    return " ".join(parts)

In [None]:
# One export record per MVP node: a "<kind>::<identifier>" id, the basic
# fields, the generated search text, and the node's raw metadata (an empty
# dict when the node has none).
records_v3 = [
    {
        "id": f"{n['kind']}::{n['identifier']}",
        "entity_type": n["kind"],
        "identifier": n["identifier"],
        "name": n.get("name", ""),
        "search_text": build_search_text_v3(n),
        "metadata": n.get("data", {}) or {},
    }
    for n in mvp_nodes
]

# Sanity check: record count plus a preview of the first record.
print("records_v3:", len(records_v3))
first_record = records_v3[0]
print(first_record["id"])
print(first_record["search_text"][:600])

records_v3: 5367
Gene::6515
Gene: SLC2A3. Description: solute carrier family 2 (facilitated glucose transporter), member 3. Associated diseases: germ cell cancer (associates); melanoma (upregulates).


In [None]:
# Serialise the records as indented JSON at the path chosen when the notebook started.
with out_path.open(mode="w") as sink:
    json.dump(records_v3, fp=sink, indent=2)

print("Saved hetionet_cancer_mvp_records_v3.json")

Saved hetionet_cancer_mvp_records_v3.json
