In [1]:
import json
import numpy as np
from pathlib import Path
from collections import Counter

In [None]:
# Input/output locations for the merge. All paths are relative, so they resolve
# against the directory the notebook kernel is started in.

# Per-source record files (JSON), in the order Hetionet, PubMed, ClinicalTrials.
hetionet_json_path, pubmed_json_path, clinical_json_path = map(
    Path,
    (
        "data/records/hetionet_cancer_mvp_records_v3.json",
        "data/records/pubmed_records_cancer_mvp.json",
        "data/records/clinical_trials_records_cancer_mvp.json",
    ),
)

# Per-source embedding matrices (.npy), in the same source order as above.
hetionet_npy_path, pubmed_npy_path, clinical_npy_path = map(
    Path,
    (
        "data/embeddings/vector_index/hetionet_mvp_embeddings.npy",
        "data/embeddings/pubmed_embeddings_cancer_mvp.npy",
        "data/embeddings/clinical_trials_embeddings_cancer_mvp.npy",
    ),
)

# Destinations for the merged records and the merged embedding matrix.
out_json_path, out_npy_path = map(
    Path,
    (
        "data/records/merged_cancer_records_all_sources.json",
        "data/embeddings/merged_cancer_embeddings_all_sources.npy",
    ),
)

In [3]:
# Load JSON records
def _read_json(path):
    """Parse and return the JSON document stored at `path`.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default text encoding, so the result does not depend on the machine's
    locale settings.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


hetionet_records = _read_json(hetionet_json_path)
pubmed_records = _read_json(pubmed_json_path)
clinical_records = _read_json(clinical_json_path)

# Load embeddings (one .npy array per source)
hetionet_emb = np.load(hetionet_npy_path)
pubmed_emb = np.load(pubmed_npy_path)
clinical_emb = np.load(clinical_npy_path)

# Report record counts next to array shapes so a mismatch is visible at a glance.
print("Loaded:")
print("  Hetionet records:", len(hetionet_records), "| embeddings:", hetionet_emb.shape)
print("  PubMed records:", len(pubmed_records), "| embeddings:", pubmed_emb.shape)
print("  Clinical records:", len(clinical_records), "| embeddings:", clinical_emb.shape)

Loaded:
  Hetionet records: 5367 | embeddings: (5367, 384)
  PubMed records: 1177 | embeddings: (1177, 384)
  Clinical records: 1294 | embeddings: (1294, 384)


In [4]:
# Count checks: every source needs exactly one embedding row per record.
for source_records, source_emb, mismatch_msg in (
    (hetionet_records, hetionet_emb, "Hetionet count mismatch"),
    (pubmed_records, pubmed_emb, "PubMed count mismatch"),
    (clinical_records, clinical_emb, "Clinical count mismatch"),
):
    assert len(source_records) == source_emb.shape[0], mismatch_msg

# Dimension checks: collect the second-axis size of each matrix; stacking the
# sources later only makes sense if they all share a single value.
dims = {emb.shape[1] for emb in (hetionet_emb, pubmed_emb, clinical_emb)}
assert len(dims) == 1, f"Embedding dimension mismatch: {dims}"

print("(yay) Counts and embedding dimensions look good.")
# The set holds exactly one element here, so taking the first one is unambiguous.
print("Embedding dim:", next(iter(dims)))

(yay) Counts and embedding dimensions look good.
Embedding dim: 384


In [5]:
# Backfilling missing metadata sources

def ensure_source(records, default_source):
    """Make sure every record carries a non-empty ``metadata["source"]``.

    Records are modified in place:
      * a missing or ``None`` ``"metadata"`` entry is replaced by a new dict;
      * a missing or falsy ``"source"`` inside it is set to `default_source`.

    A source value that is already present and truthy is left exactly as it
    is -- values are only filled in, never rewritten to a common spelling.

    Args:
        records: iterable of record dicts.
        default_source: value stored when a record has no usable source.

    Returns:
        The same `records` object that was passed in.
    """
    for record in records:
        if record.get("metadata") is None:
            record["metadata"] = {}
        meta = record["metadata"]
        has_source = "source" in meta and bool(meta["source"])
        if not has_source:
            meta["source"] = default_source
    return records

# Give each source its own fallback label for records that lack one.
hetionet_records = ensure_source(hetionet_records, default_source="hetionet")
pubmed_records = ensure_source(pubmed_records, default_source="pubmed")
clinical_records = ensure_source(clinical_records, default_source="clinicaltrials")

# Quick check: show the source label carried by the first record of each list.
first_records = {
    "hetionet": hetionet_records[0],
    "pubmed": pubmed_records[0],
    "clinical": clinical_records[0],
}
for label, first_record in first_records.items():
    sample_source = first_record["metadata"].get("source")
    print(f"{label} sample source -> {sample_source}")

hetionet sample source -> Entrez Gene
pubmed sample source -> PubMed
clinical sample source -> clinicaltrials


In [6]:
# Concatenate records and embedding rows in the same source order
# (Hetionet, PubMed, ClinicalTrials). Assuming each source's records and rows
# are already aligned with each other, position i in the merged list then
# matches row i in the merged matrix.
merged_records = [*hetionet_records, *pubmed_records, *clinical_records]
merged_embeddings = np.vstack((hetionet_emb, pubmed_emb, clinical_emb))

n_merged_records = len(merged_records)
print("Merged records:", n_merged_records)
print("Merged embeddings:", merged_embeddings.shape)

# One embedding row per merged record, otherwise the merge is unusable.
assert n_merged_records == merged_embeddings.shape[0], "Merged count mismatch"
print("(phew) Merge alignment is correct.")

Merged records: 7838
Merged embeddings: (7838, 384)
(phew) Merge alignment is correct.


In [7]:
# Sanity-check the merged records: report any record that lacks a required
# top-level key, and tally records per metadata source and per entity type.
required_keys = {"id", "entity_type", "identifier", "name", "search_text", "metadata"}

bad = []  # (index, set of missing keys) for every incomplete record
source_counts = Counter()
entity_counts = Counter()

for i, r in enumerate(merged_records):
    missing = required_keys - set(r.keys())
    if missing:
        bad.append((i, missing))

    # A record may carry "metadata": None (or some other non-dict value).
    # Calling .get() on that would raise and abort the whole check, so such
    # records are tallied under "UNKNOWN" instead.
    metadata = r.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}
    source_counts[metadata.get("source", "UNKNOWN")] += 1
    entity_counts[r.get("entity_type", "UNKNOWN")] += 1

print("Bad records:", len(bad))
if bad:
    # Only show a handful; the full list stays available in `bad`.
    print("Sample bad:", bad[:5])

print("\nSource counts:")
print(source_counts)

print("\nEntity type counts:")
print(entity_counts)

Bad records: 0

Source counts:
Counter({'Entrez Gene': 5167, 'clinicaltrials': 1294, 'PubMed': 1177, 'DrugBank': 122, 'Disease Ontology': 78})

Entity type counts:
Counter({'Gene': 5167, 'ClinicalTrial': 1294, 'PubMedArticle': 1177, 'Compound': 122, 'Disease': 78})


In [8]:
# Persist the merged records (JSON) and the merged embedding matrix (.npy).

# Create the destination folders first so that a fresh checkout without
# them does not fail with FileNotFoundError on write.
for out_path in (out_json_path, out_npy_path):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Write with an explicit encoding so the file does not depend on the
# platform's default text encoding.
with open(out_json_path, "w", encoding="utf-8") as f:
    json.dump(merged_records, f, indent=2)

np.save(out_npy_path, merged_embeddings)

print("Saved merged JSON:", out_json_path)
print("Saved merged NPY:", out_npy_path)

Saved merged JSON: /Users/shaunak/Documents/Hacklytics2026/merged_cancer_records_all_sources.json
Saved merged NPY: /Users/shaunak/Documents/Hacklytics2026/merged_cancer_embeddings_all_sources.npy
