In [1]:
import numpy as np
import rdflib
import torch
import torch.nn.functional as F
from pykeen.evaluation import RankBasedEvaluator
from pykeen.losses import NSSALoss
from pykeen.models.inductive import InductiveNodePieceGNN
from pykeen.training import SLCWATrainingLoop
from pykeen.triples import TriplesFactory
from torch.optim import Adam

from modular_methods.dedup_pipeline import deduplicate_graphs, save_matches
from modular_methods.graphToText_utils import get_literals_for_entities
from modular_methods.output_utils import build_final_result
from modular_methods.similarity_utils import (
    Levenshtein_filter,
    compute_cosine_similarity,
    match_entities,
)

### ---- 1. Load RDF graphs ----

# One empty graph per Turtle file.
main_graph, train_graph, test_graph = (rdflib.Graph() for _ in range(3))

# Parse each file into its own graph: main first, then train, then test.
for target_graph, ttl_path in (
    (main_graph, "data/healthcare_graph_Main.ttl"),
    (train_graph, "data/healthcare_graph_train.ttl"),
    (test_graph, "data/healthcare_graph_replaced_high.ttl"),
):
    target_graph.parse(ttl_path)

# Combine for inductive training/testing: the main graph is part of both unions.
train_combined = main_graph + train_graph
test_combined = main_graph + test_graph

### ---- 2. Convert graphs to triples arrays ----

def graph_to_triples(g):
    """Turn an iterable of (subject, predicate, object) terms into string triples.

    Any triple whose subject or object is an ``rdflib.BNode`` is skipped; the
    terms of every remaining triple are converted with ``str``.

    Returns a list of ``(str, str, str)`` tuples in iteration order.
    """
    labeled = []
    for subj, pred, obj in g:
        # Blank nodes are dropped on either end of the triple.
        if isinstance(subj, rdflib.BNode) or isinstance(obj, rdflib.BNode):
            continue
        labeled.append((str(subj), str(pred), str(obj)))
    return labeled

# Materialise each graph as a numpy array of string triples, one row per triple.
train_triples, test_triples, main_triples, test_only_triples = (
    np.array(graph_to_triples(source_graph))
    for source_graph in (train_combined, test_combined, main_graph, test_graph)
)




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preview the training triples: each row is a (subject, predicate, object) triple of strings.
train_triples

array([['http://example.org/Address/a56a9b25-ce34-4d81-916f-d21a06718ae8',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
        'https://schema.org/PostalAddress'],
       ['http://example.org/Person/6840360b-c90e-4fc6-a125-631972492165',
        'https://schema.org/gender', 'Female'],
       ['http://example.org/Person/920c77b9-dee4-4982-afee-6f105951d86b',
        'https://schema.org/identifier',
        '920c77b9-dee4-4982-afee-6f105951d86b'],
       ...,
       ['http://example.org/Person/521d283f-6231-4268-bec9-22c004fcebd3',
        'https://schema.org/worksFor',
        'http://example.org/HealthcareOrganization/06d2ed7c-e6ac-4d8a-8160-ff927c7550f2'],
       ['http://example.org/Person/d9cc401f-0767-4c9c-8bfc-cac98a0b5f8c',
        'https://schema.org/email', 'michelecook@healthcare.org'],
       ['http://example.org/Person/5f6ce363-6b90-4633-88a5-74df67e4598f',
        'https://schema.org/gender', 'Female']], dtype='<U78')

In [3]:
# Preview the test triples: each row is a (subject, predicate, object) triple of strings.
test_triples

array([['http://example.org/Person/7f3f3a7a-d2b0-4249-a813-3fac30edc402',
        'https://schema.org/jobTitle', 'Pulmonary Function Technologist'],
       ['http://example.org/Person/a7a1f3fd-23ed-46c9-b8e9-c63fa15aec4e',
        'https://schema.org/knowsLanguage', 'et'],
       ['http://example.org/Person/cd66e089-73cf-46b8-8eae-cd5ab2ef2b41',
        'https://schema.org/email', 'derickflemingphd@healthcare.org'],
       ...,
       ['http://example.org/ContactPoint/6b5bc344-01bb-47e2-bf69-cd8fa4b4da1c',
        'https://schema.org/availableLanguage', "['et', 'en']"],
       ['http://example.org/Person/1797ce99-3847-4bac-8acd-bea381b8190d',
        'https://schema.org/gender', 'Male'],
       ['http://example.org/Person/5f6ce363-6b90-4633-88a5-74df67e4598f',
        'https://schema.org/gender', 'Female']], dtype='<U78')

In [None]:
# Distinct predicate labels (column 1 of each triples array)
train_relations = {*train_triples[:, 1]}
test_relations = {*test_triples[:, 1]}

# Shared predicates, and the ones seen on only one side
common_relations = train_relations.intersection(test_relations)
train_only_relations = train_relations.difference(test_relations)
test_only_relations = test_relations.difference(train_relations)

print(
    f"Number of relations in train: {len(train_relations)}",
    f"Number of relations in test: {len(test_relations)}",
    f"Number of common relations: {len(common_relations)}",
    f"Relations only in train: {train_only_relations}",
    f"Relations only in test: {test_only_relations}",
    sep="\n",
)

Number of relations in train: 21
Number of relations in test: 21
Number of common relations: 21
Relations only in train: set()
Relations only in test: set()


In [None]:
# Build the training factory (with inverse triples) and display its summary.
# In the recorded output the factory reports 42 relations for the 21 distinct
# predicates counted in the previous cell.
# NOTE(review): the same factory is built again at the start of the next cell;
# one of the two constructions looks redundant.
tf_train = TriplesFactory.from_labeled_triples(train_triples, create_inverse_triples=True)
tf_train


TriplesFactory(num_entities=17517, num_relations=42, create_inverse_triples=True, num_triples=36755)

In [None]:
### ---- 3. Build TriplesFactory objects ----

# The training factory defines the relation-to-id mapping used by all others.
tf_train = TriplesFactory.from_labeled_triples(train_triples, create_inverse_triples=True)

# Remaining factories (test, main, test-only — built in that order) reuse the
# training relation ids and also create inverse triples.
tf_test, tf_main, tf_test_only = (
    TriplesFactory.from_labeled_triples(
        labeled_triples,
        relation_to_id=tf_train.relation_to_id,
        create_inverse_triples=True,
    )
    for labeled_triples in (test_triples, main_triples, test_only_triples)
)

### ---- 4. Train InductiveNodePieceGNN ----

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Trained on tf_train; tf_test is supplied as the inference graph.
model = InductiveNodePieceGNN(
    triples_factory=tf_train,
    inference_factory=tf_test,
    num_tokens=12,
    aggregation="mlp",
    embedding_dim=128,
    interaction="DistMult",
    loss=NSSALoss(margin=15),
    random_seed=42,
)
model = model.to(device)

# The optimizer is created after the move so it tracks the on-device parameters.
optimizer = Adam(model.parameters(), lr=0.0005)

training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=tf_train,
    optimizer=optimizer,
    mode="training",
)

print("Training NodePiece...")
training_loop.train(tf_train, num_epochs=10)

### ---- 5. Extract entity embeddings ----

def extract_embeddings(model, triples_factory, mode="training", id_factory=None):
    """Return ``{entity_label: embedding_vector}`` for the entities of ``triples_factory``.

    Parameters
    ----------
    model
        Trained inductive model whose entity representation is read out.
    triples_factory
        Factory whose entity labels select which embeddings are returned.
    mode
        Inductive mode whose representation is used ("training" or "testing"
        in this notebook).
    id_factory
        Factory whose ``entity_to_id`` indexes the rows of the representation
        for ``mode``, i.e. the factory the model was built with for that mode.
        Defaults to ``triples_factory`` for backward compatibility.

    Returns
    -------
    dict
        Maps each entity label of ``triples_factory`` to a 1-D numpy vector.

    Raises
    ------
    ValueError
        If the representation's row count differs from
        ``id_factory.num_entities``, which means the ids cannot be aligned.
    KeyError
        If an entity of ``triples_factory`` is unknown to ``id_factory``.
    """
    if id_factory is None:
        id_factory = triples_factory

    # Assigning ``model.mode`` does not select a representation; the per-mode
    # representations have to be requested from the model explicitly.
    # NOTE(review): this is a private PyKEEN hook — confirm it exists in the
    # installed version.
    representations = model._get_entity_representations_from_inductive_mode(mode=mode)

    # Read the embeddings out in eval mode without tracking gradients, then
    # restore whatever train/eval state the model had before.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # NOTE(review): this is the raw output of the representation module;
            # presumably it excludes any message passing applied at scoring
            # time — confirm that this is the intended embedding.
            emb_array = representations[0]().detach().cpu().numpy()
    finally:
        model.train(was_training)

    if emb_array.shape[0] != id_factory.num_entities:
        raise ValueError(
            f"Representation for mode={mode!r} has {emb_array.shape[0]} rows but "
            f"id_factory has {id_factory.num_entities} entities; pass the factory "
            "the model was built with for this mode as id_factory."
        )

    # Look rows up by the ids of the factory the representation was built from,
    # not by the enumeration order of an independently built factory.
    row_of = id_factory.entity_to_id
    return {label: emb_array[row_of[label]] for label in triples_factory.entity_to_id}

# tf_main / tf_test_only only choose WHICH entities to keep; the row ids come
# from the factories the model was constructed with (tf_train / tf_test).
main_embeddings = extract_embeddings(model, tf_main, mode="training", id_factory=tf_train)
test_embeddings = extract_embeddings(model, tf_test_only, mode="testing", id_factory=tf_test)

### ---- 6. Match entities by embedding similarity ----

# Fix the label order once so that similarity-matrix rows (main graph) and
# columns (test graph) can be mapped back to entity labels.
# NOTE(review): no restriction to common entities/types happens here; every
# entity of each side is compared.
entity_ids1 = list(main_embeddings)
entity_ids2 = list(test_embeddings)

# Stack into a single ndarray before handing over to torch: building a tensor
# from a Python list of ndarrays is slow and triggers a PyTorch warning.
emb1 = torch.from_numpy(np.stack([main_embeddings[e] for e in entity_ids1]))
emb2 = torch.from_numpy(np.stack([test_embeddings[e] for e in entity_ids2]))

# L2-normalise each row.
emb1 = F.normalize(emb1, p=2, dim=1)
emb2 = F.normalize(emb2, p=2, dim=1)

sim_matrix = compute_cosine_similarity(emb1, emb2)
matches = match_entities(sim_matrix, entity_ids1, entity_ids2, threshold=0.7, top_k=5)

# Literal-based filtering of the candidate matches, using each entity's
# literals from its own graph.
literals1 = get_literals_for_entities(main_graph, entity_ids1)
literals2 = get_literals_for_entities(test_graph, entity_ids2)
filtered_matches = Levenshtein_filter(matches, literals1, literals2, filter=True)

### ---- 7. Format and save results ----

# Single source of truth for the output location, so the file that is written
# and the path that is reported cannot drift apart.
results_path = "NodePiece_dedup_results.json"

final_result = build_final_result(
    filtered_matches,
    main_graph,
    test_graph,
    graph1_name="MainGraph",
    graph2_name="TestGraph"
)
save_matches(final_result, results_path)
print(f"Saved results to {results_path}")