In [1]:
import rdflib

ttl_path = "data/healthcare_graph_original_v2.ttl"

def extract_dedupe_fields_from_ttl(ttl_path):
    g = rdflib.Graph()
    g.parse("data/healthcare_graph_original_v2.ttl")
    
    # Map: {entity_type: set([literal_predicate_names])}
    type_predicate_map = {}
    
    for s in set(g.subjects()):
        # Get type
        types = [str(o) for o in g.objects(s, rdflib.RDF.type)]
        if not types:
            continue
        type_ = types[0].split("/")[-1]  # Or use more sophisticated logic if needed
        
        # Gather literal predicates
        predicates = set()
        for p, o in g.predicate_objects(s):
            if isinstance(o, rdflib.Literal):
                pred_name = p.split("/")[-1] if "/" in str(p) else str(p)
                predicates.add(pred_name)
        if type_ not in type_predicate_map:
            type_predicate_map[type_] = set()
        type_predicate_map[type_].update(predicates)
    
    # Build dedupe.io field definitions per entity type
    dedupe_fields = {}
    for type_, preds in type_predicate_map.items():
        dedupe_fields[type_] = [
            {'field': pred, 'type': 'String'} for pred in preds if pred.lower() != "identifier"
]

    return dedupe_fields

fields_per_type = extract_dedupe_fields_from_ttl("data/healthcare_graph_original_v2.ttl")
for entity_type, fields in fields_per_type.items():
    print(f"Entity type: {entity_type}")
    print(fields)
    print()

Entity type: Person
[{'field': 'jobTitle', 'type': 'String'}, {'field': 'birthDate', 'type': 'String'}, {'field': 'gender', 'type': 'String'}, {'field': 'email', 'type': 'String'}, {'field': 'knowsLanguage', 'type': 'String'}, {'field': 'name', 'type': 'String'}]

Entity type: Department
[{'field': 'name', 'type': 'String'}]

Entity type: ContactPoint
[{'field': 'contactType', 'type': 'String'}, {'field': 'faxNumber', 'type': 'String'}, {'field': 'availableLanguage', 'type': 'String'}, {'field': 'email', 'type': 'String'}, {'field': 'telephone', 'type': 'String'}]

Entity type: PostalAddress
[{'field': 'postalCode', 'type': 'String'}, {'field': 'addressCountry', 'type': 'String'}, {'field': 'addressLocality', 'type': 'String'}, {'field': 'streetAddress', 'type': 'String'}]

Entity type: MedicalOrganization
[{'field': 'name', 'type': 'String'}]



In [3]:
from modular_methods.graphToText_utils import kg_to_dedupe_dict
g = rdflib.Graph()
g1 = rdflib.Graph()
g.parse("data/healthcare_graph_original_v2.ttl")
g1.parse("data/prog_data/healthcare_graph_progdups.ttl")
dict1 = kg_to_dedupe_dict(g)
dict2 = kg_to_dedupe_dict(g1)   

In [None]:
import json
with open("dict1_person.json", "w") as f:
    json.dump(dict1, f, indent=2)
with open("dict2_person.json", "w") as f:
    json.dump(dict2, f, indent=2)