In [1]:
import rdflib

ttl_path = "data/healthcare_graph_original_v2.ttl"

def extract_dedupe_fields_from_ttl(ttl_path):
    g = rdflib.Graph()
    g.parse("data/healthcare_graph_original_v2.ttl")
    
    # Map: {entity_type: set([literal_predicate_names])}
    type_predicate_map = {}
    
    for s in set(g.subjects()):
        # Get type
        types = [str(o) for o in g.objects(s, rdflib.RDF.type)]
        if not types:
            continue
        type_ = types[0].split("/")[-1]  # Or use more sophisticated logic if needed
        
        # Gather literal predicates
        predicates = set()
        for p, o in g.predicate_objects(s):
            if isinstance(o, rdflib.Literal):
                pred_name = p.split("/")[-1] if "/" in str(p) else str(p)
                predicates.add(pred_name)
        if type_ not in type_predicate_map:
            type_predicate_map[type_] = set()
        type_predicate_map[type_].update(predicates)
    
    # Build dedupe.io field definitions per entity type
    dedupe_fields = {}
    for type_, preds in type_predicate_map.items():
        dedupe_fields[type_] = [
            {'field': pred, 'type': 'String'} for pred in preds if pred.lower() != "identifier"
]

    return dedupe_fields

fields_per_type = extract_dedupe_fields_from_ttl("data/healthcare_graph_original_v2.ttl")
for entity_type, fields in fields_per_type.items():
    print(f"Entity type: {entity_type}")
    print(fields)
    print()

Entity type: Person
[{'field': 'jobTitle', 'type': 'String'}, {'field': 'birthDate', 'type': 'String'}, {'field': 'gender', 'type': 'String'}, {'field': 'email', 'type': 'String'}, {'field': 'knowsLanguage', 'type': 'String'}, {'field': 'name', 'type': 'String'}]

Entity type: Department
[{'field': 'name', 'type': 'String'}]

Entity type: ContactPoint
[{'field': 'contactType', 'type': 'String'}, {'field': 'faxNumber', 'type': 'String'}, {'field': 'availableLanguage', 'type': 'String'}, {'field': 'email', 'type': 'String'}, {'field': 'telephone', 'type': 'String'}]

Entity type: PostalAddress
[{'field': 'postalCode', 'type': 'String'}, {'field': 'addressCountry', 'type': 'String'}, {'field': 'addressLocality', 'type': 'String'}, {'field': 'streetAddress', 'type': 'String'}]

Entity type: MedicalOrganization
[{'field': 'name', 'type': 'String'}]



In [3]:
from modular_methods.graphToText_utils import kg_to_dedupe_dict
g = rdflib.Graph()
g1 = rdflib.Graph()
g.parse("data/healthcare_graph_original_v2.ttl")
g1.parse("data/prog_data/healthcare_graph_progdups.ttl")
dict1 = kg_to_dedupe_dict(g)
dict2 = kg_to_dedupe_dict(g1)   

In [6]:
import json
with open("data/dedupe_data/dict1.json", "w") as f:
    json.dump(dict1, f, indent=2)
with open("data/dedupe_data/dict2.json", "w") as f:
    json.dump(dict2, f, indent=2)

In [63]:
import pandas as pd 

golden_standard = pd.read_csv('data/prog_data/updated_golden_standard_duplicates2.csv')

# Replace entity types
golden_standard['entity_type'] = golden_standard['entity_type'].replace({
    'HealthcareOrganization': 'HealthcareOrg',
    'ServiceDepartment': 'ServiceDept'
})

golden_standard_subset = golden_standard[['original_id', 'duplicate_id', 'entity_type']]



ground_truth = {}

for entity in golden_standard_subset['entity_type'].unique():
    df_filtered = golden_standard_subset[golden_standard_subset['entity_type'] == entity]
    ground_truth[entity] = [
        (f"http://example.org/{entity}/{row['original_id']}", f"http://example.org/{entity}/{row['duplicate_id']}")
        for _, row in df_filtered.iterrows()
    ]

# Example: get ground truth pairs for Address
GT_AD =  ground_truth['Address'][:10] # print first 10 pairs
GT_Person = ground_truth['Person'][:10] # print first 10 pairs
GT_HCO = ground_truth['HealthcareOrg'][:10] # print first 10 pairs
GT_SD = ground_truth['ServiceDept'][:10] # print first 10 pairs
GT_CP = ground_truth['ContactPoint'][:10] # print first 10 pairs

import random

def sample_distinct_pairs(golden_standard_subset, ground_truth_pairs, entity_type, n_neg=3):
    """Sample distinct pairs for dedupe.io (negatives), avoiding ground truth."""
    filtered = golden_standard_subset[golden_standard_subset['entity_type'] == entity_type]
    all_ids1 = filtered['original_id'].unique()
    all_ids2 = filtered['duplicate_id'].unique()
    gt_set = set(ground_truth_pairs)
    distinct_pairs = set()
    attempts = 0
    max_attempts = len(gt_set) * n_neg * 10  # Prevent infinite loop
    
    while len(distinct_pairs) < len(gt_set) * n_neg and attempts < max_attempts:
        a, b = random.choice(all_ids1), random.choice(all_ids2)
        if (a, b) not in gt_set and (b, a) not in gt_set and (a, b) not in distinct_pairs:
            distinct_pairs.add((f"http://example.org/{entity_type}/{a}", f"http://example.org/{entity_type}/{b}"))
        attempts += 1
    return list(distinct_pairs)

entity_types = ['Address', 'Person', 'HealthcareOrg', 'ServiceDept', 'ContactPoint']
distinct_pairs = {}
for entity_type in entity_types:
    negatives = sample_distinct_pairs(
        golden_standard_subset, ground_truth[entity_type], entity_type, n_neg=3
    )
    distinct_pairs[entity_type] = negatives

DP_Ad = distinct_pairs['Address'][:10]   
DP_Person = distinct_pairs['Person'][:10]
DP_HCO = distinct_pairs['HealthcareOrg'][:10]
DP_SD = distinct_pairs['ServiceDept'][:10]
DP_CP = distinct_pairs['ContactPoint'][:10]

print("Ground Truth Address Pairs:", GT_AD)
print("Distinct Negative Address Pairs:", DP_Ad)

Ground Truth Address Pairs: [('http://example.org/Address/0e8cff18-9e9d-4b77-ba34-b580eda0a0af', 'http://example.org/Address/b43baeb2-62b1-4c36-99a2-e67258cf1235'), ('http://example.org/Address/1095e497-3d79-4f0b-a6cd-3595d4f0d65d', 'http://example.org/Address/679dbeb2-2136-402b-811b-c2471abca0b2'), ('http://example.org/Address/3899c22a-d4ed-41bd-9692-74a2acbe4ff6', 'http://example.org/Address/32fd16b8-2e2d-4257-9bd2-e411bfc5e5ce'), ('http://example.org/Address/541d1a81-4815-442c-a2bb-62833d83ad7c', 'http://example.org/Address/a26599c2-c1e3-4595-982d-515d344bbc43'), ('http://example.org/Address/603fa8f0-1549-4dee-830b-4d6d9575d095', 'http://example.org/Address/bd4cf3d7-3c27-4645-99bd-a9a62e2d500a'), ('http://example.org/Address/eedbba8b-0f83-41ee-b2ac-89bdb2998fa3', 'http://example.org/Address/e9b93ff7-1d3c-4297-b080-5fb4cb77938d'), ('http://example.org/Address/853a9135-3ee2-451b-be88-c93319db1d7b', 'http://example.org/Address/fdce7129-5e40-43ea-96e2-a428d9964a8c'), ('http://example.or

In [70]:
# Map of entity type to dedupe.io fields
ENTITY_FIELDS = {
    "Person": [
        {'field': 'knowsLanguage', 'type': 'String'},
        {'field': 'jobTitle', 'type': 'String'},
        {'field': 'name', 'type': 'String'},
        {'field': 'birthDate', 'type': 'String'},
        {'field': 'gender', 'type': 'String'},
        {'field': 'email', 'type': 'String'}
    ],
    "ContactPoint": [
        {'field': 'faxNumber', 'type': 'String'},
        {'field': 'availableLanguage', 'type': 'String'},
        {'field': 'telephone', 'type': 'String'},
        {'field': 'email', 'type': 'String'},
        {'field': 'contactType', 'type': 'String'}
    ],
    "Department": [
        {'field': 'name', 'type': 'String'}
    ],
    "PostalAddress": [
        {'field': 'addressLocality', 'type': 'String'},
        {'field': 'streetAddress', 'type': 'String'},
        {'field': 'postalCode', 'type': 'String'},
        {'field': 'addressCountry', 'type': 'String'}
    ],
    "MedicalOrganization": [
        {'field': 'name', 'type': 'String'}
    ]
}

import dedupe
import dedupe.variables

ENTITY_FIELDS_dedup= {"Person": [
        dedupe.variables.String("knowsLanguage"),
        dedupe.variables.String("jobTitle"),
        dedupe.variables.String("name"),
        dedupe.variables.String("birthDate"),
        dedupe.variables.String("gender"),
        dedupe.variables.String("email")
    ],
    "ContactPoint": [
        dedupe.variables.String("faxNumber"),
        dedupe.variables.String("availableLanguage"),
        dedupe.variables.String("telephone"),
        dedupe.variables.String("email"),
        dedupe.variables.String("contactType")
    ],
    "Department": [
        dedupe.variables.String("name")
    ],
    "PostalAddress": [
        dedupe.variables.String("addressLocality"),
        dedupe.variables.String("streetAddress"),
        dedupe.variables.String("postalCode"),
        dedupe.variables.String("addressCountry")
    ],
    "MedicalOrganization": [
        dedupe.variables.String("name")
    ]}



def filter_entity(d, entity_type):
    search_str = f"/{entity_type}/"
    return {k: v for k, v in d.items() if search_str in k}

person_dict1 = filter_entity(dict1, "Person")
person_dict2 = filter_entity(dict2, "Person")
person_dict1



{'http://example.org/Person/45237c7f-ccbb-4ccf-b0f6-927645dd034e': {'birthDate': '1980-08-09',
  'email': 'lindseyarnold@healthcare.org',
  'gender': 'Other',
  'jobTitle': 'Therapist',
  'knowsLanguage': 'et',
  'name': 'Lindsey Arnold'},
 'http://example.org/Person/e65eab5b-cc1f-457e-817d-0fe17e75dbc7': {'birthDate': '2000-03-18',
  'email': 'dawnyoder@healthcare.org',
  'gender': 'Other',
  'jobTitle': 'Infectious Disease Specialist',
  'knowsLanguage': 'de',
  'name': 'Dawn Yoder'},
 'http://example.org/Person/8b50deed-ef03-4dbb-bf9a-429977548eab': {'birthDate': '1968-06-07',
  'email': 'jasonthompson@healthcare.org',
  'gender': 'Male',
  'jobTitle': 'Physical Therapy Assistant',
  'knowsLanguage': 'de',
  'name': 'Jason Thompson'},
 'http://example.org/Person/fd4ff9d0-7b03-4250-8844-da6fe655c3c3': {'birthDate': '1980-06-24',
  'email': 'jennifervelazquez@healthcare.org',
  'gender': 'Female',
  'jobTitle': 'Epidemiologist',
  'knowsLanguage': 'de',
  'name': 'Jennifer Velazquez'}

In [71]:
def pairs_to_records(pair_list, dict1, dict2):
    return [
        (dict1[a], dict2[b])
        for a, b in pair_list
        if a in dict1 and b in dict2
    ]


matches = pairs_to_records(GT_Person, dict1, dict2)
distinct = pairs_to_records(DP_Person, dict1, dict2)

matches

[({'birthDate': '1965-01-31',
   'email': 'melissawest@healthcare.org',
   'gender': 'Female',
   'jobTitle': 'Renal Dietitian',
   'knowsLanguage': 'et',
   'name': 'Melissa West'},
  {'birthDate': '1965-01-31',
   'email': 'melisswest@healthcare.org',
   'gender': 'Female',
   'jobTitle': 'Renal Dietitian',
   'knowsLanguage': 'et',
   'name': 'Melissa Wets'}),
 ({'birthDate': '1961-05-21',
   'email': 'dawnconner@healthcare.org',
   'gender': 'Other',
   'jobTitle': 'Obstetrics Technician',
   'knowsLanguage': 'nl',
   'name': 'Dawn Conner'},
  {'birthDate': '1998-03-25',
   'email': 'christopherteele@healthcare.org',
   'gender': 'Other',
   'jobTitle': 'Neurological Nurse',
   'knowsLanguage': 'Dutch',
   'name': 'Christopher Steele'}),
 ({'birthDate': '1972-09-24',
   'email': 'briannamendez@healthcare.org',
   'gender': 'Female',
   'jobTitle': 'Infectious Disease Specialist',
   'knowsLanguage': 'de',
   'name': 'Brianna Mendez'},
  {'birthDate': '1972-24-09',
   'email': 'bria

In [75]:
training_data = {
    "match": matches,
    "distinct": distinct    
}

linker = dedupe.RecordLink(ENTITY_FIELDS_dedup['Person'])
linker.mark_pairs(training_data)
linker.prepare_training(person_dict1, person_dict2, sample_size=200)
linker.train()


In [77]:
links = linker.join(person_dict1, person_dict2, threshold=0.5)
#list(links)
len(links)

395

In [79]:
list(links)
pairs = [ (str(pair[0][0]), str(pair[0][1])) for pair in links ]

In [None]:
extracted_pairs = [
    (str(pair[0][0]).rsplit('/', 1)[-1], str(pair[0][1]).rsplit('/', 1)[-1])
    for pair in links
]
extracted_pairs



[('2fa0e8b1-1539-4133-99a7-7bc7fa024298',
  '8eab5934-bc99-431b-bc6c-b176c78bfc1b'),
 ('fd1ec008-2575-4290-bfa6-65bfdff5199e',
  'c958f57b-9e3b-4722-b4a7-316743bb73f7'),
 ('bafa50b0-608c-4044-bff4-d9e846e56529',
  '0dc006b3-dbd2-4dd9-a016-4a3e5cf33165'),
 ('ab31993c-1359-40ef-b49a-a8bb1e84405d',
  'ba8e4a06-e3d3-4d0f-946b-836c1bb79afb'),
 ('eaefa8a4-a8d5-4e2b-9c66-c0dc8dc113a5',
  '19527985-8ef4-40f4-bf85-5ccf028a1a84'),
 ('ffdbf61f-bb4b-451f-b3aa-90815cabce60',
  '1afa424d-de78-41d0-8ac8-75c9a658c3ef'),
 ('f8c93b8b-1b24-4427-a624-7c06e63eebbc',
  '9bd6d39b-6ca4-4401-915e-5f0f9c904c4d'),
 ('f7480915-dfe1-4599-917a-9d3adb9d625e',
  '4baed372-9476-49dc-b487-fe3e37e30b4a'),
 ('f6cd31de-2a3a-4bde-ba35-3fb00630a9c5',
  '88d7c653-89fc-4720-aa78-fb076865db05'),
 ('f0df3c2c-4e1a-4d07-97da-4ef05765c8e6',
  '5daf0820-9785-4a83-a7f9-9c9a22b379e8'),
 ('dfd26ce8-5727-40df-a225-5a905d05a6b0',
  '68e2a028-8706-4ee5-b29c-83ded94fceef'),
 ('d5221afa-f98d-434f-9904-6f5305039e3b',
  '6db7e8ea-4a15-4a62-a