In [2]:
import rdflib

# Locations of the three .ttl inputs compared in this cell.
main_fp = "data/healthcare_graph_Main.ttl"
train_fp = "data/healthcare_graph_train.ttl"
test_fp  = "data/healthcare_graph_replaced_high.ttl"

# Create the three empty graphs, then parse each file into its graph
# in the order main, train, test.
main_graph, train_graph, test_graph = rdflib.Graph(), rdflib.Graph(), rdflib.Graph()
for _graph, _path in (
    (main_graph, main_fp),
    (train_graph, train_fp),
    (test_graph, test_fp),
):
    _graph.parse(_path)

# Distinct predicates of each graph, compared by their string form.
main_preds = {str(pred) for _, pred, _ in main_graph}
train_preds = {str(pred) for _, pred, _ in train_graph}
test_preds = {str(pred) for _, pred, _ in test_graph}

# Every predicate seen in at least one graph.
all_preds = main_preds.union(train_preds, test_preds)

# Predicates common to all three graphs and to each pair of graphs.
shared_all = main_preds.intersection(train_preds, test_preds)
shared_main_train = main_preds.intersection(train_preds)
shared_main_test = main_preds.intersection(test_preds)
shared_train_test = train_preds.intersection(test_preds)

# Predicates that occur in exactly one of the graphs.
unique_to_main = main_preds.difference(train_preds, test_preds)
unique_to_train = train_preds.difference(main_preds, test_preds)
unique_to_test = test_preds.difference(main_preds, train_preds)

# Size summary; the trailing empty string emits the blank separator line.
print(
    "TOTAL RELATIONS:",
    f"Main:  {len(main_preds)}",
    f"Train: {len(train_preds)}",
    f"Test:  {len(test_preds)}",
    f"Shared by all: {len(shared_all)}",
    "",
    sep="\n",
)

# Predicates exclusive to a single graph.
print(
    "UNIQUE RELATIONS",
    f"Unique to MAIN:  {unique_to_main}",
    f"Unique to TRAIN: {unique_to_train}",
    f"Unique to TEST:  {unique_to_test}",
    "",
    sep="\n",
)

# Predicates shared by exactly two graphs: the pairwise intersection minus
# whatever all three have in common.
print(
    "RELATIONS SHARED BETWEEN MAIN & TRAIN (not TEST):",
    shared_main_train.difference(shared_all),
    sep="\n",
)
print(
    "RELATIONS SHARED BETWEEN MAIN & TEST (not TRAIN):",
    shared_main_test.difference(shared_all),
    sep="\n",
)
print(
    "RELATIONS SHARED BETWEEN TRAIN & TEST (not MAIN):",
    shared_train_test.difference(shared_all),
    sep="\n",
)


TOTAL RELATIONS:
Main:  21
Train: 21
Test:  21
Shared by all: 21

UNIQUE RELATIONS
Unique to MAIN:  set()
Unique to TRAIN: set()
Unique to TEST:  set()

RELATIONS SHARED BETWEEN MAIN & TRAIN (not TEST):
set()
RELATIONS SHARED BETWEEN MAIN & TEST (not TRAIN):
set()
RELATIONS SHARED BETWEEN TRAIN & TEST (not MAIN):
set()


In [16]:
from rdflib.term import URIRef
from node2vec import Node2Vec
import networkx as nx


def rdf_to_nx(graph):
    """Convert the URI-to-URI triples of an RDF graph into a networkx graph.

    Args:
        graph: An iterable of ``(subject, predicate, object)`` triples,
            e.g. an ``rdflib.Graph``.

    Returns:
        An undirected ``networkx.Graph`` whose nodes are the string forms of
        the subjects and objects, and whose edges carry the string form of
        the predicate in the ``predicate`` attribute.

    Triples whose subject or object is not a ``URIRef`` (for example,
    literal-valued triples) are skipped. Because ``nx.Graph`` keeps a single
    edge per node pair, a later triple between the same two nodes replaces
    the stored ``predicate`` value.
    """
    uri_edges = (
        (str(subj), str(obj), {"predicate": str(pred)})
        for subj, pred, obj in graph
        if isinstance(subj, URIRef) and isinstance(obj, URIRef)
    )
    nx_graph = nx.Graph()
    nx_graph.add_edges_from(uri_edges)
    return nx_graph

# networkx view of the main RDF graph; rdf_to_nx only turns triples whose
# subject and object are both URIRefs into edges.
Clean_graph = rdf_to_nx(main_graph)

def is_uri(node):
    """Return True if the string form of ``node`` looks like an HTTP(S) URI.

    The check is a case-sensitive prefix test against ``http://`` and
    ``https://``; any other value yields False.
    """
    return str(node).startswith(("http://", "https://"))

# Collect every node of the graph that fails the is_uri check.
non_uri_nodes = list(filter(lambda node: not is_uri(node), Clean_graph.nodes))

# Report the result, showing at most five offending nodes.
if not non_uri_nodes:
    print("All nodes appear to be URIs. Literals are excluded.")
else:
    print("Found non-URI nodes! Sample:", non_uri_nodes[:5])

# Save the graph to disk in GML format.
nx.write_gml(Clean_graph, "clean_graph.gml")



All nodes appear to be URIs. Literals are excluded.


In [None]:
def get_graph_embeddings_Node2vec(graph, dimensions=384, walk_length=10,
                                  num_walks=60, workers=1, seed=None):
    """Train node2vec on an RDF graph and return one embedding per node.

    The RDF graph is first converted with ``rdf_to_nx``, so only triples
    whose subject and object are both URIRefs contribute nodes and edges.

    Args:
        graph: RDF graph (iterable of triples) to embed.
        dimensions: Size of each embedding vector.
        walk_length: Number of nodes in each random walk.
        num_walks: Number of random walks started from each node.
        workers: Number of parallel workers used by node2vec.
        seed: Optional integer seed. When given, it is forwarded both to
            ``Node2Vec`` and to ``fit`` (i.e. to the underlying Word2Vec
            model). When ``None`` (the default) no seed is passed, so the
            call behaves as it did before this parameter existed.
            NOTE(review): repeatable results presumably also need
            ``workers=1`` and possibly a fixed ``PYTHONHASHSEED`` - confirm
            against the node2vec / gensim documentation.

    Returns:
        dict mapping each node key in the trained model's vocabulary to its
        embedding vector. An RDF graph that yields no nodes returns ``{}``.
    """
    G_nx = rdf_to_nx(graph)

    # Nothing to walk over: skip training instead of fitting on an empty corpus.
    if G_nx.number_of_nodes() == 0:
        return {}

    # Forward the seed only when requested so the default call stays identical
    # to the unseeded behaviour.
    seed_kwargs = {} if seed is None else {"seed": seed}

    walker = Node2Vec(
        G_nx,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
        **seed_kwargs,
    )
    model = walker.fit(**seed_kwargs)

    return {node: model.wv[node] for node in model.wv.index_to_key}

In [4]:
def _entity_strings(graph):
    """Return the set of string forms of every subject and object in ``graph``."""
    return {str(term) for subj, _, obj in graph for term in (subj, obj)}


# Subjects and objects of each graph, compared by their string form.
main_entities = _entity_strings(main_graph)
train_entities = _entity_strings(train_graph)
test_entities = _entity_strings(test_graph)

# Entities that occur in exactly one of the three graphs.
print("Entities only in test:", test_entities.difference(main_entities, train_entities))
print("Entities only in train:", train_entities.difference(main_entities, test_entities))
print("Entities only in main:", main_entities.difference(train_entities, test_entities))

Entities only in test: {'54bbca1b-59df-4ba0-86f3-19b42cb6b1f0', 'http://example.org/HealthcareOrganization/953e26e3-fc1c-4b72-b4c5-f252799e3f62', 'laurenkeelley@healthcare.org', 'http://example.org/Person/30aede21-d986-495b-8f8d-251a3a1f5b77', 'carrolynsmith@healthcare.org', 'michaelgarrcia@healthcare.org', 'judykiing@healthcare.org', 'Matthews Bethany', 'http://example.org/Person/ae654d59-1f21-4975-9a53-5bf0448da676', 'tylerroddriguez@healthcare.org', 'http://example.org/ContactPoint/94b177ae-8fbd-4b1e-83bc-c6d87b007400', 'c91e409d-9d00-4c08-9ad6-e1253b7b2852', 'c62485c6-a9cd-4fb2-b7fd-6f81e2631110', 'c3b5840f-2d52-4c3a-9092-9fc81d71aef5', 'G. Bentley MD', 'http://example.org/Person/7db40228-f6ad-439d-83a4-d00d20037c13', 'http://example.org/Person/0bde3e8e-5fe6-435f-92eb-a56cf08261ff', 'http://example.org/Person/538cdac0-51e0-45eb-a5ff-5d17ddbf9191', '1976-06-03', 'Chritsine Kim', 'http://example.org/ServiceDepartment/41430ac6-51ce-4492-8bd6-38aff33d9808', '7463d068-b049-4450-bc00-495