Table Address {
  // https://schema.org/PostalAddress
  // Postal address record. HealthCareOrganization.address and
  // ServiceDepartment.address both reference Address.identifier.
  identifier string [pk, note: "https://schema.org/identifier"]
  // Street address line: the column is named `text` but maps to schema.org streetAddress.
  text string [note: "https://schema.org/streetAddress"]
  city string [note: "https://schema.org/addressLocality"]
  postal_code string [note: "https://schema.org/postalCode"]
  country string [note: "https://schema.org/addressCountry"]
}

Table Contact {
  // https://schema.org/ContactPoint
  // Contact details record. HealthCareOrganization.contact and
  // ServiceDepartment.contact both reference Contact.identifier.
  identifier string [pk, note: "https://schema.org/identifier"]
  phone string [note: "https://schema.org/telephone"]
  email string [note: "https://schema.org/email"]
  fax string [note: "https://schema.org/faxNumber"]
  contact_type string [note: "https://schema.org/contactType"]
  available_language string [note: "https://schema.org/availableLanguage"]
}

Table HealthCareOrganization {
  // https://schema.org/MedicalOrganization
  // Top-level organisation. ServiceDepartment.is_part_of and
  // HealthCarePersonnel.institution reference HealthCareOrganization.identifier.
  identifier string [pk, note: "https://schema.org/identifier"]
  name string [not null, note: "https://schema.org/name"]
  // Many-to-one links (`>`): several organisations may point at the same
  // Contact or Address row. Neither column is marked `not null`.
  contact string [ref: > Contact.identifier, note: "https://schema.org/contactPoint"]
  address string [ref: > Address.identifier, note: "https://schema.org/address"]
}

Table ServiceDepartment {
  // https://schema.org/MedicalOrganization
  // Department record. HealthCarePersonnel.department references
  // ServiceDepartment.identifier.
  identifier string [pk, note: "https://schema.org/identifier"]
  name string [not null, note: "https://schema.org/name"]
  // Parent organisation (many departments to one organisation); not marked `not null`.
  is_part_of string [ref: > HealthCareOrganization.identifier, note: "https://schema.org/parentOrganization or https://schema.org/department"]
  // Many-to-one links (`>`) to the shared Contact and Address tables.
  contact string [ref: > Contact.identifier, note: "https://schema.org/contactPoint"]
  address string [ref: > Address.identifier, note: "https://schema.org/address"]
}

Table Person {
  // https://schema.org/Person
  // Base person record. HealthCarePersonnel.identifier references
  // Person.identifier.
  identifier string [pk, note: "https://schema.org/identifier"]
  name string [not null, note: "https://schema.org/name"]
  // The only non-string column in this table.
  birth_date date [note: "https://schema.org/birthDate"]
  gender string [note: "https://schema.org/gender"]
  // Stored as a single string column.
  knows_language string [note: "https://schema.org/knowsLanguage"]
}

Table HealthCarePersonnel {
  // Inherits Person: each row extends the Person row that has the same identifier.
  // The shared identifier is this table's primary key (it was only `unique`
  // before, which left the table without a primary key).
  identifier string [pk, ref: > Person.identifier, note: "https://schema.org/identifier"]
  job_title string [note: "https://schema.org/jobTitle"]
  // Many-to-one links (`>`) to the employing organisation and department;
  // neither column is marked `not null`.
  institution string [ref: > HealthCareOrganization.identifier, note: "https://schema.org/worksFor"]
  department string [ref: > ServiceDepartment.identifier, note: "https://schema.org/memberOf"]
  email string [note: "https://schema.org/email"]
}

In [2]:
import pandas as pd
import json

# Load the CSV and JSON files
golden_standard = pd.read_csv('example_data/updated_golden_standard_duplicates.csv')


def _load_json(path):
    """Parse one JSON file and close its handle as soon as parsing is done.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    # The context manager closes the handle even if parsing fails; the
    # previous `json.load(open(...))` form left it open.
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Match files to evaluate, keyed by a label that is reused as the
# 'match_source' value of every result row.
match_files = {
    'matches': _load_json('matches/example_matches.json')
    #'matches1': _load_json('matches/matches1.json')
}



# Create mapping between CSV field names and RDF predicate suffixes
field_to_predicate_map = {
    # Person fields
    "personName": "name",
    "birthDate": "birthDate",
    "knowsLanguage": "knowsLanguage",
    "gender": "gender",

    # HealthcarePersonnel fields
    "email": "email",
    "jobTitle": "jobTitle",

    # Address fields
    "city": "addressLocality",
    "postalCode": "postalCode",
    "country": "addressCountry",
    "text": "streetAddress",

    # Organization fields
    "healthcareOrganizationName": "name",
    "serviceDepartmentName": "name"
}


def _predicate_local_name(predicate):
    """Return the part of a predicate after its last '/', '#' or ':'.

    Args:
        predicate: Predicate string, e.g. "https://schema.org/name".

    Returns:
        The trailing name segment ("name" for the example above). A string
        without any of the separators is returned unchanged.
    """
    local_name = predicate
    for separator in ('/', '#', ':'):
        local_name = local_name.rsplit(separator, 1)[-1]
    return local_name


# Function to match CSV field names to RDF predicates
def match_field_to_predicate(field_name, predicates_list):
    """Find the predicate that corresponds to a golden-standard CSV field.

    Args:
        field_name: CSV field name; must be a key of field_to_predicate_map
            to produce a match.
        predicates_list: Iterable of predicate strings to search.

    Returns:
        The first predicate whose local name equals the mapped suffix, or
        None when the field is unmapped or no predicate matches.
    """
    predicate_suffix = field_to_predicate_map.get(field_name)
    if predicate_suffix is None:
        return None

    for predicate in predicates_list:
        # Compare whole name segments: a plain endswith() test would also
        # accept ".../surname" for the suffix "name".
        if _predicate_local_name(predicate) == predicate_suffix:
            return predicate
    return None
def _find_golden_matches(matches, golden, field_map, match_source):
    """Collect the detected matches that reproduce a golden-standard variation.

    For every predicate of entity1, its object is paired with the object of
    every predicate of entity2. A pair is kept when the golden standard has a
    row whose field corresponds to the entity1 predicate suffix, whose
    'original_value' equals the entity1 object and whose 'varied_value'
    equals the entity2 object.

    Args:
        matches: Iterable of match records, each with 'entities',
            'similarity_score' and 'duplication_type' keys.
        golden: Golden-standard DataFrame with 'field_name',
            'original_value', 'varied_value' and 'variation_type' columns.
        field_map: Mapping from golden-standard field names to predicate
            suffixes.
        match_source: Label stored in the 'match_source' key of each row.

    Returns:
        A list of result dicts, one per confirmed predicate pair.
    """
    # Loop-invariant: the predicate suffix each golden-standard row maps to.
    mapped_field = golden['field_name'].map(lambda name: field_map.get(name))
    # Field masks only depend on the suffix, so compute each one once.
    field_masks = {}
    rows = []

    for match in matches:
        entity1 = match['entities'][0]['entity1']
        entity2 = match['entities'][1]['entity2']

        for predicate1 in entity1['predicates']:
            predicate_suffix = predicate1['predicate'].split('/')[-1]  # Extract suffix
            object1 = predicate1['object']

            if predicate_suffix not in field_masks:
                field_masks[predicate_suffix] = (
                    (golden['field_name'] == predicate_suffix)
                    | (mapped_field == predicate_suffix)
                )

            # Golden rows for this field and original value; nothing in
            # entity2 can match when there are none.
            candidates = golden[
                field_masks[predicate_suffix] & (golden['original_value'] == object1)
            ]
            if candidates.empty:
                continue

            # NOTE(review): every entity2 predicate is tried, regardless of
            # its predicate name -- confirm whether only the same predicate
            # should be compared.
            for predicate2 in entity2['predicates']:
                object2 = predicate2['object']
                hits = candidates[candidates['varied_value'] == object2]

                # Row existence is the criterion (the previous
                # `.any().any()` tested cell truthiness instead).
                if hits.empty:
                    continue

                rows.append({
                    'entity1_subject': entity1['subject'],
                    'entity2_subject': entity2['subject'],
                    'predicate': predicate_suffix,
                    'original_value': object1,
                    'varied_value': object2,
                    'similarity_score': match['similarity_score'],
                    'duplication_type': match['duplication_type'],
                    'variation_type': hits['variation_type'].iloc[0],
                    'match_source': match_source  # Add source information
                })

    return rows


# One result list per loaded match file, keyed like match_files so that
# enabling another file needs no second edit here.
matched_results = {
    match_type: _find_golden_matches(
        matches, golden_standard, field_to_predicate_map, match_type
    )
    for match_type, matches in match_files.items()
}

# Convert results to DataFrames for each match file
matched_results_df = pd.DataFrame(matched_results['matches'])
# Enable together with the 'matches1' entry of match_files:
#matched_results1_df = pd.DataFrame(matched_results['matches1'])



In [3]:
matched_results_df

Unnamed: 0,entity1_subject,entity2_subject,predicate,original_value,varied_value,similarity_score,duplication_type,variation_type,match_source
0,http://example.org/Person/b25c3303-b8c7-4897-9...,http://example.org/Person/91d6c9fe-f1d5-4b37-8...,birthDate,1975-10-15,1975-15-10,0.6376292705535889,conflict,date_format_variation,matches
1,http://example.org/Person/d7982e0b-6903-4c3f-9...,http://example.org/Person/058f3e15-43a2-45cb-8...,name,Cynthia Mathis,Cynthia Mithis,0.6504331827163696,conflict,name_typo,matches
2,http://example.org/Person/960ad3a1-e1bd-44e6-8...,http://example.org/Person/8f92cc97-defb-40ff-8...,name,Michelle Garcia,Michelle Garuia,0.6365549564361572,conflict,name_typo,matches
3,http://example.org/Person/e7ccccb8-2469-416d-8...,http://example.org/Person/9644c90a-61db-4d23-a...,name,Jeffrey Green,J. Green,0.6533868312835693,conflict,abbreviated_first_name,matches
4,http://example.org/Person/c597f961-ba75-408f-9...,http://example.org/Person/8fab2688-035e-46e0-8...,email,johnanderson@healthcare.org,johnanerson@healthcare.org,0.6189196109771729,conflict,email_typo,matches
...,...,...,...,...,...,...,...,...,...
640,http://example.org/Person/5d6e3c4e-2ae1-4c81-b...,http://example.org/Person/433ad684-4e43-48a5-9...,email,gregorycopeland@healthcare.org,gregoyrcopeland@healthcare.org,0.6697738170623779,conflict,email_typo,matches
641,http://example.org/Person/fe8295f3-94b5-41e0-a...,http://example.org/Person/2f4cd901-c561-4fed-b...,name,Deborah Blake,Deborcah Blake,0.639448881149292,conflict,name_typo,matches
642,http://example.org/Person/437a83a1-662f-4cb0-8...,http://example.org/Person/3c8dd370-46a1-4620-b...,name,Adam Yu,Adm Yu,0.6396890878677368,conflict,name_typo,matches
643,http://example.org/ContactPoint/4b833e20-73bb-...,http://example.org/ContactPoint/b26632c3-15da-...,email,WilliamsWilliams.Gastroenterologic@dept.health...,WilliamsWilliams.Gastroenterologic@dept.health...,0.6610933542251587,conflict,email_domain_change,matches


In [4]:
matched_results_df.sort_values(by='similarity_score', ascending=True)[['original_value', 'varied_value', 'similarity_score', 'variation_type']]

Unnamed: 0,original_value,varied_value,similarity_score,variation_type
591,Allen Ltd Zorg,AL Zorg,0.5186635255813599,name_abbreviation
82,Obstetric,Maternity Care,0.5238410234451294,alternative_naming
332,Pathology,Patoloogia,0.5271050333976746,translation
458,Toxicologic,Poison Control Center,0.5286514759063721,alternative_naming
415,Wayne Williams,Wabyne Williams,0.5286908745765686,name_typo
...,...,...,...,...
129,BuckBurke.Pathology@dept.healthcare.org,BuckBurke.Pathology@dept.healthcare.nl,0.7286059856414795,email_domain_change
326,Obrien.Nursing@dept.healthcare.org,Obrien.Nuersing@dept.healthcare.org,0.7287150621414185,email_typo
358,Arnold.Dermatology@dept.healthcare.org,Arnold.Dermattology@dept.healthcare.org,0.7309417724609375,email_typo
249,Bishop.Physiotherapy@dept.healthcare.org,Bishop.Physiotherapy@dept.healthcare.et,0.7381105422973633,email_domain_change


In [5]:
#matched_results1_df.sort_values(by='similarity_score', ascending=True)[['original_value', 'varied_value', 'similarity_score', 'variation_type']]

In [6]:
#alternative_naming_df = matched_results_df[matched_results_df['variation_type'] == 'alternative_naming']
#print(alternative_naming_df)

In [8]:
total_golden_duplicates = len(golden_standard)
total_matched_duplicates = len(matched_results_df)


variation_type_counts_golden = golden_standard['variation_type'].value_counts()
variation_type_counts_matched = matched_results_df['variation_type'].value_counts()

total_matched_variations = variation_type_counts_matched.sum()
percentage_matched = (total_matched_variations / total_golden_duplicates) * 100
variation_comparison = pd.DataFrame({
    'Golden Standard Count': variation_type_counts_golden,
    'Matched Count': variation_type_counts_matched
}).fillna(0).astype(int)

variation_comparison['Matched (%)'] = (variation_comparison['Matched Count'] / variation_comparison['Golden Standard Count']) * 100

stats_summary = {
    'Total Duplicates in Golden Standard': total_golden_duplicates,
    'Total Matched Duplicates': total_matched_variations,
    'Percentage Matched (%)': percentage_matched
}

print("\nMatching Statistics:")
for key, value in stats_summary.items():
    print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")

print("\nVariation Type Analysis:")
print(variation_comparison)


Matching Statistics:
Total Duplicates in Golden Standard: 789
Total Matched Duplicates: 645
Percentage Matched (%): 81.75

Variation Type Analysis:
                         Golden Standard Count  Matched Count  Matched (%)
variation_type                                                            
abbreviated_first_name                      53             53   100.000000
alternative_naming                          28              4    14.285714
city_typo                                   20             16    80.000000
country_expansion                           16             16   100.000000
date_format_variation                       53             53   100.000000
department_abbreviation                     23              1     4.347826
email_domain_change                         47             47   100.000000
email_typo                                 303            303   100.000000
house_number_suffix                         27             21    77.777778
language_expansion        

In [17]:
total_golden_duplicates = len(golden_standard)
total_matched_duplicates = len(matched_results1_df)



variation_type_counts_golden = golden_standard['variation_type'].value_counts()
variation_type_counts_matched = matched_results1_df['variation_type'].value_counts()
total_matched_variations = variation_type_counts_matched.sum()

percentage_matched = (total_matched_variations / total_golden_duplicates) * 100

print(len(variation_type_counts_matched))
variation_comparison = pd.DataFrame({
    'Golden Standard Count': variation_type_counts_golden,
    'Matched Count': variation_type_counts_matched
}).fillna(0).astype(int)

variation_comparison['Matched (%)'] = (variation_comparison['Matched Count'] / variation_comparison['Golden Standard Count']) * 100

stats_summary = {
    'Total Duplicates in Golden Standard': total_golden_duplicates,
    'Total Matched Duplicates': total_matched_variations,
    'Percentage Matched (%)': percentage_matched
}

print("\nMatching Statistics:")
for key, value in stats_summary.items():
    print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")

print("\nVariation Type Analysis:")
print(variation_comparison)

9

Matching Statistics:
Total Duplicates in Golden Standard: 83
Total Matched Duplicates: 65
Percentage Matched (%): 78.31

Variation Type Analysis:
                         Golden Standard Count  Matched Count  Matched (%)
variation_type                                                            
abbreviated_first_name                       8              7    87.500000
alternative_naming                           4              0     0.000000
city_typo                                    4              4   100.000000
country_expansion                            1              1   100.000000
date_format_variation                        1              0     0.000000
department_abbreviation                      3              0     0.000000
email_domain_change                          4              4   100.000000
email_typo                                  33             31    93.939394
house_number_suffix                          2              2   100.000000
language_expansion        