Table Address {
  // Postal address record, modelled on https://schema.org/PostalAddress.
  // Each column's note names the schema.org property it corresponds to.
  // Referenced by HealthCareOrganization.address and ServiceDepartment.address.
  identifier string [pk, note: "https://schema.org/identifier"]
  // Street address line (mapped to schema.org streetAddress, despite the generic column name).
  text string [note: "https://schema.org/streetAddress"]
  city string [note: "https://schema.org/addressLocality"]
  postal_code string [note: "https://schema.org/postalCode"]
  country string [note: "https://schema.org/addressCountry"]
}

Table Contact {
  // Contact point record, modelled on https://schema.org/ContactPoint.
  // Referenced by HealthCareOrganization.contact and ServiceDepartment.contact.
  // The primary key carries the same schema.org identifier note as every other table's key.
  identifier string [pk, note: "https://schema.org/identifier"]
  phone string [note: "https://schema.org/telephone"]
  email string [note: "https://schema.org/email"]
  fax string [note: "https://schema.org/faxNumber"]
  contact_type string [note: "https://schema.org/contactType"]
  available_language string [note: "https://schema.org/availableLanguage"]
}

Table HealthCareOrganization {
  // Healthcare organization record, modelled on https://schema.org/MedicalOrganization.
  // Referenced by ServiceDepartment.is_part_of and HealthCarePersonnel.institution.
  identifier string [pk, note: "https://schema.org/identifier"]
  name string [not null, note: "https://schema.org/name"]
  // Many-to-one reference to the organization's Contact row.
  contact string [ref: > Contact.identifier, note: "https://schema.org/contactPoint"]
  // Many-to-one reference to the organization's Address row.
  address string [ref: > Address.identifier, note: "https://schema.org/address"]
}

Table ServiceDepartment {
  // Department record; annotated with the same schema.org type as its parent,
  // https://schema.org/MedicalOrganization.
  // Referenced by HealthCarePersonnel.department.
  identifier string [pk, note: "https://schema.org/identifier"]
  name string [not null, note: "https://schema.org/name"]
  // Many-to-one reference to the HealthCareOrganization this department belongs to.
  is_part_of string [ref: > HealthCareOrganization.identifier, note: "https://schema.org/parentOrganization or https://schema.org/department"]
  // Many-to-one reference to the department's Contact row.
  contact string [ref: > Contact.identifier, note: "https://schema.org/contactPoint"]
  // Many-to-one reference to the department's Address row.
  address string [ref: > Address.identifier, note: "https://schema.org/address"]
}

Table Person {
  // Person record, modelled on https://schema.org/Person.
  // HealthCarePersonnel.identifier references this table's identifier.
  identifier string [pk, note: "https://schema.org/identifier"]
  name string [not null, note: "https://schema.org/name"]
  // The only non-string column in the schema: stored as a date.
  birth_date date [note: "https://schema.org/birthDate"]
  gender string [note: "https://schema.org/gender"]
  knows_language string [note: "https://schema.org/knowsLanguage"]
}

Table HealthCarePersonnel {
  // Inherits Person: each row specializes exactly one Person row and shares its identifier.
  // The shared identifier is therefore this table's primary key, and the reference to
  // Person is one-to-one ("-") rather than many-to-one (">").
  identifier string [pk, ref: - Person.identifier, note: "https://schema.org/identifier"]
  job_title string [note: "https://schema.org/jobTitle"]
  // Many-to-one reference to the employing HealthCareOrganization.
  institution string [ref: > HealthCareOrganization.identifier, note: "https://schema.org/worksFor"]
  // Many-to-one reference to the ServiceDepartment the person is a member of.
  department string [ref: > ServiceDepartment.identifier, note: "https://schema.org/memberOf"]
  email string [note: "https://schema.org/email"]
}

In [17]:
import pandas as pd
import json


# Load the golden-standard CSV and the matcher's JSON output.
golden_standard = pd.read_csv('golden_standard_duplicates.csv')
# State the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, which can mis-decode non-ASCII values in a UTF-8 JSON file.
with open('matches.json', 'r', encoding='utf-8') as file:
    matches = json.load(file)


# Create a mapping between CSV field names and RDF predicate endings
field_to_predicate_map = {
    # Person fields
    "personName": "name",
    "birthDate": "birthDate",
    "knowsLanguage": "knowsLanguage",

    # HealthcarePersonnel fields
    "email": "email",
    "jobTitle": "jobTitle",

    # Address fields
    "city": "addressLocality",
    "postalCode": "postalCode",
    "country": "addressCountry",
    "text": "streetAddress",

    # Organization fields
    "healthcareOrganizationName": "name",
    "serviceDepartmentName": "name"
}


def _local_name(predicate):
    """Return the part of a predicate after its last '/', '#' or ':' separator."""
    for separator in ('/', '#', ':'):
        predicate = predicate.rpartition(separator)[2]
    return predicate


# Function to match CSV field names to RDF predicates
def match_field_to_predicate(field_name, predicates_list):
    """Find the predicate in ``predicates_list`` that corresponds to a CSV field.

    Args:
        field_name: CSV field name, looked up in ``field_to_predicate_map``.
        predicates_list: Iterable of predicate strings to search.

    Returns:
        The first predicate whose local name (the text after the last '/', '#'
        or ':') equals the mapped suffix, or ``None`` when the field is not in
        the map or no predicate matches.
    """
    predicate_suffix = field_to_predicate_map.get(field_name)
    if predicate_suffix is None:
        return None

    for predicate in predicates_list:
        # Compare the whole local name rather than a raw string suffix, so that
        # e.g. ".../surname" is not mistaken for the "name" predicate.
        if _local_name(predicate) == predicate_suffix:
            return predicate
    return None

# Collect one record per (match, predicate) pair that the golden standard confirms.
matched_results = []
# Fixed column order, so the resulting DataFrame has these columns even when
# nothing matched (later cells index it by 'variation_type').
matched_result_columns = [
    'entity1_subject',
    'entity2_subject',
    'predicate',
    'original_value',
    'varied_value',
    'similarity_score',
    'duplication_type',
    'variation_type',
]

# Translate every golden-standard field name to its RDF predicate suffix once;
# this does not depend on the match being inspected.
golden_predicate_suffixes = golden_standard['field_name'].map(
    lambda x: field_to_predicate_map.get(x)
)

# Each match is read as: match['entities'][0]['entity1'] and
# match['entities'][1]['entity2'], each holding a 'subject' and a list of
# 'predicates' entries with 'predicate' and 'object' keys.
for match in matches:
    entity1 = match['entities'][0]['entity1']
    entity2 = match['entities'][1]['entity2']

    for predicate1 in entity1['predicates']:
        predicate_suffix = predicate1['predicate'].split('/')[-1]  # Extract suffix
        object1 = predicate1['object']

        # Golden-standard rows for this field whose original value is object1.
        # The field may be recorded either by its predicate suffix or by its CSV name.
        field_matches = (
            (golden_standard['field_name'] == predicate_suffix)
            | (golden_predicate_suffixes == predicate_suffix)
        )
        candidate_rows = golden_standard[
            field_matches & (golden_standard['original_value'] == object1)
        ]
        if candidate_rows.empty:
            continue

        for predicate2 in entity2['predicates']:
            # Only compare values of the same predicate; comparing object1 with
            # every value of entity2 pairs up unrelated fields.
            if predicate2['predicate'].split('/')[-1] != predicate_suffix:
                continue
            object2 = predicate2['object']

            # Test for matching rows, not for truthy cell values in them.
            confirmed_rows = candidate_rows[candidate_rows['varied_value'] == object2]
            if confirmed_rows.empty:
                continue

            matched_results.append({
                'entity1_subject': entity1['subject'],
                'entity2_subject': entity2['subject'],
                'predicate': predicate_suffix,
                'original_value': object1,
                'varied_value': object2,
                'similarity_score': match['similarity_score'],
                'duplication_type': match['duplication_type'],
                'variation_type': confirmed_rows['variation_type'].iloc[0]
            })

# Convert results to DataFrame for clear presentation
matched_results_df = pd.DataFrame(matched_results, columns=matched_result_columns)



In [18]:
# Display the matched results table built in the previous cell.
matched_results_df

Unnamed: 0,entity1_subject,entity2_subject,predicate,original_value,varied_value,similarity_score,duplication_type,variation_type
0,http://example.org/Person/c6b4195c-aa19-4f26-8...,http://example.org/Person/5e39d4e4-b2cb-4e1c-9...,name,Ronald Terrell,Terrell Ronald,0.9595507383346558,exact,name_swap
1,http://example.org/Person/aab6941b-59e8-484c-a...,http://example.org/Person/e08318f4-5233-44a3-b...,email,sherylroberts@healthcare.org,sherlyroberts@healthcare.org,0.9651064872741699,exact,email_typo
2,http://example.org/Person/aab6941b-59e8-484c-a...,http://example.org/Person/e08318f4-5233-44a3-b...,name,Sheryl Roberts,S. Roberts,0.9651064872741699,exact,abbreviated_first_name
3,http://example.org/Person/beb778d6-2b02-4118-a...,http://example.org/Person/c1738634-4ff4-4ba1-a...,email,justinibarra@healthcare.org,justiinibarra@healthcare.org,0.9770858287811279,exact,email_typo
4,http://example.org/Address/d4713d60-c8a7-4639-...,http://example.org/Address/2e9a8e08-7f8c-430f-...,postalCode,6838,6838,0.962367594242096,exact,postal_format
...,...,...,...,...,...,...,...,...
57,http://example.org/ContactPoint/eb1d1061-9717-...,http://example.org/ContactPoint/5a409591-227d-...,email,EvansRodriguez.Infectious@dept.healthcare.org,EvansRodriguez.Infectious@dept.healthcare.nl,0.9513095021247864,exact,email_domain_change
58,http://example.org/Person/b4ca0142-3b23-4fd8-8...,http://example.org/Person/e1abf02f-b57e-4b65-8...,name,Jerry Martinez MD,Jerty Martinez MD,0.9699649810791016,exact,name_typo
59,http://example.org/Person/0a2cf6a9-5bd3-4392-9...,http://example.org/Person/b5335ed9-4ff7-4785-a...,email,kellyrodriguez@healthcare.org,kellyrodrfiguez@healthcare.org,0.9710720181465149,exact,email_typo
60,http://example.org/Person/23746ddd-f410-4d94-8...,http://example.org/Person/a5d993de-dd46-49bb-8...,email,aimeefranklin@healthcare.org,aimeefanklin@healthcare.org,0.953362226486206,exact,email_typo


In [21]:
# Show the value columns ordered from the lowest to the highest similarity score.
matched_results_df[
    ['original_value', 'varied_value', 'similarity_score', 'variation_type']
].sort_values(by='similarity_score')

Unnamed: 0,original_value,varied_value,similarity_score,variation_type
30,Renal,Kidney,0.8565924167633057,department_abbreviation
15,Radiography,Medical Imaging,0.8583426475524902,alternative_naming
33,Hughes-Perry Tervisekeskus,HughesP-erry Tervisekeskus,0.8842896223068237,name_typo
39,Emergency,ER,0.8973215818405151,department_abbreviation
9,Ott Roads 92,Ott Roads 92B,0.922610342502594,house_number_suffix
...,...,...,...,...
23,mariastark@healthcare.org,mariasitark@healthcare.org,0.9755954742431641,email_typo
44,jamesriley@healthcare.org,jamesriely@healthcare.org,0.9763870239257812,email_typo
3,justinibarra@healthcare.org,justiinibarra@healthcare.org,0.9770858287811279,email_typo
10,williamsmith@healthcare.org,williamsmih@healthcare.org,0.9781700372695923,email_typo


In [None]:
# Keep only the matches whose variation type is 'alternative_naming' and print them.
alternative_naming_df = matched_results_df.loc[
    matched_results_df['variation_type'].eq('alternative_naming')
]
print(alternative_naming_df)

                                      entity1_subject  \
15  http://example.org/ServiceDept/c846bf90-40fe-4...   

                                      entity2_subject predicate  \
15  http://example.org/ServiceDept/6ad92085-8966-4...      name   

   original_value     varied_value    similarity_score duplication_type  \
15    Radiography  Medical Imaging  0.8583426475524902          similar   

        variation_type  
15  alternative_naming  


: 

In [20]:
# Overall coverage: number of matched records relative to the golden-standard rows.
total_golden_duplicates = len(golden_standard)
total_matched_duplicates = len(matched_results_df)
# Guard the ratio: an empty golden standard would raise ZeroDivisionError.
percentage_matched = (
    (total_matched_duplicates / total_golden_duplicates) * 100
    if total_golden_duplicates
    else 0.0
)

# Per-variation-type counts on both sides.
variation_type_counts_golden = golden_standard['variation_type'].value_counts()
# A results frame built from an empty list has no columns at all; treat that
# as "no matches" instead of failing with KeyError.
if 'variation_type' in matched_results_df.columns:
    variation_type_counts_matched = matched_results_df['variation_type'].value_counts()
else:
    variation_type_counts_matched = pd.Series(dtype='int64')

# Align the two counts on variation type; types with no match get a count of 0.
variation_comparison = pd.DataFrame({
    'Golden Standard Count': variation_type_counts_golden,
    'Matched Count': variation_type_counts_matched
}).fillna(0).astype(int)

variation_comparison['Matched (%)'] = (variation_comparison['Matched Count'] / variation_comparison['Golden Standard Count']) * 100

stats_summary = {
    'Total Duplicates in Golden Standard': total_golden_duplicates,
    'Total Matched Duplicates': total_matched_duplicates,
    'Percentage Matched (%)': percentage_matched
}

print("\nMatching Statistics:")
for key, value in stats_summary.items():
    # Floats are shown with two decimals; integer counts are printed as-is.
    print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")

print("\nVariation Type Analysis:")
print(variation_comparison)


Matching Statistics:
Total Duplicates in Golden Standard: 83
Total Matched Duplicates: 62
Percentage Matched (%): 74.70

Variation Type Analysis:
                         Golden Standard Count  Matched Count  Matched (%)
variation_type                                                            
abbreviated_first_name                       8              6    75.000000
alternative_naming                           4              1    25.000000
city_typo                                    4              3    75.000000
country_expansion                            1              0     0.000000
date_format_variation                        1              1   100.000000
department_abbreviation                      3              2    66.666667
email_domain_change                          4              3    75.000000
email_typo                                  33             28    84.848485
house_number_suffix                          2              2   100.000000
language_expansion          