Table Address {
  // Postal address record. Each column's note holds the schema.org property
  // that the column corresponds to.
  // https://schema.org/PostalAddress
  // Primary key; HealthCareOrganization.address and ServiceDepartment.address reference it.
  identifier string [pk, note: "https://schema.org/identifier"]
  // Street address line (the note maps this generically named column to streetAddress).
  text string [note: "https://schema.org/streetAddress"]
  // Locality / city name.
  city string [note: "https://schema.org/addressLocality"]
  // Postal code, kept as a string.
  postal_code string [note: "https://schema.org/postalCode"]
  // Country of the address.
  country string [note: "https://schema.org/addressCountry"]
}

Table Contact {
  // Contact point (phone, email, fax) of an organization or department.
  // Each column's note holds the schema.org property the column corresponds to.
  // https://schema.org/ContactPoint
  // Primary key; HealthCareOrganization.contact and ServiceDepartment.contact reference it.
  // Carries the same schema.org/identifier note as every other table's key column.
  identifier string [pk, note: "https://schema.org/identifier"]
  phone string [note: "https://schema.org/telephone"]
  email string [note: "https://schema.org/email"]
  fax string [note: "https://schema.org/faxNumber"]
  // Kind of contact point (per the contactType note).
  contact_type string [note: "https://schema.org/contactType"]
  // Language available at this contact point; a single string value per row.
  available_language string [note: "https://schema.org/availableLanguage"]
}

Table HealthCareOrganization {
  // Top-level healthcare organization. Each column's note holds the schema.org
  // property the column corresponds to.
  // https://schema.org/MedicalOrganization
  // Primary key; referenced by ServiceDepartment.is_part_of and HealthCarePersonnel.institution.
  identifier string [pk, note: "https://schema.org/identifier"]
  // Required organization name.
  name string [not null, note: "https://schema.org/name"]
  // Nullable many-to-one reference to the organization's Contact row.
  contact string [ref: > Contact.identifier, note: "https://schema.org/contactPoint"]
  // Nullable many-to-one reference to the organization's Address row.
  address string [ref: > Address.identifier, note: "https://schema.org/address"]
}

Table ServiceDepartment {
  // Department belonging to a HealthCareOrganization. Each column's note holds the
  // schema.org property the column corresponds to.
  // https://schema.org/MedicalOrganization
  // Primary key; referenced by HealthCarePersonnel.department.
  identifier string [pk, note: "https://schema.org/identifier"]
  // Required department name.
  name string [not null, note: "https://schema.org/name"]
  // Nullable many-to-one reference to the owning organization.
  // NOTE(review): the note lists two properties; schema.org/department appears to point
  // from the parent to the department (the inverse of this column) - confirm which is intended.
  is_part_of string [ref: > HealthCareOrganization.identifier, note: "https://schema.org/parentOrganization or https://schema.org/department"]
  // Nullable many-to-one reference to the department's Contact row.
  contact string [ref: > Contact.identifier, note: "https://schema.org/contactPoint"]
  // Nullable many-to-one reference to the department's Address row.
  address string [ref: > Address.identifier, note: "https://schema.org/address"]
}

Table Person {
  // Base record for a natural person. Each column's note holds the schema.org
  // property the column corresponds to.
  // https://schema.org/Person
  // Primary key; HealthCarePersonnel.identifier references it.
  identifier string [pk, note: "https://schema.org/identifier"]
  // Required full name, stored as a single string.
  name string [not null, note: "https://schema.org/name"]
  // Date of birth; the only non-string column in this table.
  birth_date date [note: "https://schema.org/birthDate"]
  gender string [note: "https://schema.org/gender"]
  // Language the person knows; a single string value per row.
  knows_language string [note: "https://schema.org/knowsLanguage"]
}

Table HealthCarePersonnel {
  // Healthcare-specific extension of Person: each row adds employment details
  // to one Person row.
  // Inherits Person
  // Shared key with Person. Declared as the primary key (the table previously had
  // none) and as a one-to-one reference, since each Person has at most one row here.
  identifier string [pk, ref: - Person.identifier, note: "https://schema.org/identifier"]
  job_title string [note: "https://schema.org/jobTitle"]
  // Nullable many-to-one reference to the employing organization.
  institution string [ref: > HealthCareOrganization.identifier, note: "https://schema.org/worksFor"]
  // Nullable many-to-one reference to the department the person belongs to.
  department string [ref: > ServiceDepartment.identifier, note: "https://schema.org/memberOf"]
  // Work email of the person, stored on this table rather than via Contact.
  email string [note: "https://schema.org/email"]
}

In [None]:
import pandas as pd
import json

# Load the golden standard: one row per known duplicate variation, with the
# columns field_name, original_value, varied_value and variation_type used below.
golden_standard = pd.read_csv('updated_golden_standard_duplicates.csv')


def _load_json(path):
    """Parse the JSON file at `path`, closing the file handle when done."""
    # JSON interchange files are UTF-8; stating it avoids platform-default decoding.
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load both match files, keyed by the label used for their results.
match_files = {
    'matches': _load_json('matches.json'),
    'matches1': _load_json('matches1.json')
}

# Create a mapping between CSV field names and RDF predicate endings
field_to_predicate_map = {
    # Person fields
    "personName": "name",
    "birthDate": "birthDate",
    "knowsLanguage": "knowsLanguage",

    # HealthcarePersonnel fields
    "email": "email",
    "jobTitle": "jobTitle",

    # Address fields
    "city": "addressLocality",
    "postalCode": "postalCode",
    "country": "addressCountry",
    "text": "streetAddress",

    # Organization fields
    "healthcareOrganizationName": "name",
    "serviceDepartmentName": "name"
}

# Golden-standard field names translated to predicate suffixes (None when a
# field has no mapping). Computed once: it does not depend on the match examined.
golden_predicate_suffix = golden_standard['field_name'].map(field_to_predicate_map.get)

# One result list per match file, derived from match_files so the keys cannot drift.
matched_results = {match_type: [] for match_type in match_files}

# Process both match files in a single loop
for match_type, matches in match_files.items():
    for match in matches:
        entity1 = match['entities'][0]['entity1']
        entity2 = match['entities'][1]['entity2']

        for predicate1 in entity1['predicates']:
            predicate_suffix = predicate1['predicate'].split('/')[-1]  # Extract suffix
            object1 = predicate1['object']

            # Golden rows for this predicate (by raw field name or mapped suffix)
            # whose original value is entity1's object. This does not depend on
            # entity2, so it is filtered once per predicate1.
            same_field = (
                (golden_standard['field_name'] == predicate_suffix) |
                (golden_predicate_suffix == predicate_suffix)
            )
            candidates = golden_standard[
                same_field & (golden_standard['original_value'] == object1)
            ]
            if candidates.empty:
                continue

            # Search for the corresponding predicate in entity2
            for predicate2 in entity2['predicates']:
                # Only compare values of the same predicate; otherwise an object of
                # an unrelated predicate could pair up with object1.
                if predicate2['predicate'].split('/')[-1] != predicate_suffix:
                    continue
                object2 = predicate2['object']

                # Check against golden standard
                golden_hits = candidates[candidates['varied_value'] == object2]
                if golden_hits.empty:
                    continue

                matched_results[match_type].append({
                    'entity1_subject': entity1['subject'],
                    'entity2_subject': entity2['subject'],
                    'predicate': predicate_suffix,
                    'original_value': object1,
                    'varied_value': object2,
                    'similarity_score': match['similarity_score'],
                    'duplication_type': match['duplication_type'],
                    # If several golden rows fit, the first one supplies the type.
                    'variation_type': golden_hits['variation_type'].iloc[0],
                    'match_source': match_type  # Add source information
                })

# Convert results to DataFrames for each match file
matched_results_df = pd.DataFrame(matched_results['matches'])
matched_results1_df = pd.DataFrame(matched_results['matches1'])



In [8]:
# Display the golden-standard hits collected for the 'matches' match file.
matched_results_df

Unnamed: 0,entity1_subject,entity2_subject,predicate,original_value,varied_value,similarity_score,duplication_type,variation_type,match_source
0,http://example.org/Person/2db6de21-53a9-4c5d-b...,http://example.org/Person/f0cbc363-9686-406f-8...,name,Ms. Amanda Davis,Davis Amanda Ms.,0.9625009298324584,exact,name_swap,matches
1,http://example.org/Person/bae8d99f-eee0-4fb7-9...,http://example.org/Person/71b77917-4ebd-4d94-8...,name,Stephanie Young MD,MD Young Stephanie,0.9636178016662598,exact,name_swap,matches
2,http://example.org/Person/8ce67bdc-41b7-4448-8...,http://example.org/Person/d4c0bd76-7d38-42af-8...,email,briannasanchez@healthcare.org,briannasanchhez@healthcare.org,0.9843826293945312,exact,email_typo,matches
3,http://example.org/Person/2adfd824-edae-4c39-a...,http://example.org/Person/2b189dd9-f3dd-48a7-9...,email,williamsmith@healthcare.org,williamsmih@healthcare.org,0.9785258173942566,exact,email_typo,matches
4,http://example.org/Person/f48ef9a8-6aeb-4a32-8...,http://example.org/Person/a20ea96a-f78e-4b69-9...,name,Gregory Carney,Gregory Carntey,0.9737997055053712,exact,name_typo,matches
5,http://example.org/HealthcareOrg/d675ebf7-4fe3...,http://example.org/HealthcareOrg/9a396f78-73c3...,name,Hughes-Perry Tervisekeskus,HughesP-erry Tervisekeskus,0.8842896223068237,similar,name_typo,matches
6,http://example.org/ContactPoint/a747795e-b0c9-...,http://example.org/ContactPoint/1d10aa71-c25e-...,email,Ali.Hematologic@dept.healthcare.org,Ali.Hematologic@dept.healthcare.de,0.9584041833877563,exact,email_domain_change,matches
7,http://example.org/Person/49954cf1-faaf-400c-a...,http://example.org/Person/a284d0d2-d2cd-47ff-b...,name,Mary Stewart,Stewart Mary,0.9557233452796936,exact,name_swap,matches
8,http://example.org/Person/e5a15243-c728-4f15-b...,http://example.org/Person/3d5d80e6-e0d1-4580-8...,name,Andrea Griffin,Griffin Andrea,0.9615130424499512,exact,name_swap,matches
9,http://example.org/ContactPoint/d2928f4b-e151-...,http://example.org/ContactPoint/3435b401-5a4d-...,email,Monroe.Physiotherapy@dept.healthcare.org,Monroe.Physsiotherapy@dept.healthcare.org,0.9260987043380736,exact,email_typo,matches


In [9]:
# Value pairs of the 'matches' hits with their score and variation type,
# ordered from the lowest similarity score to the highest.
(
    matched_results_df[['original_value', 'varied_value', 'similarity_score', 'variation_type']]
    .sort_values('similarity_score', ascending=True)
)

Unnamed: 0,original_value,varied_value,similarity_score,variation_type
40,Renal,Kidney,0.8450957536697388,department_abbreviation
5,Hughes-Perry Tervisekeskus,HughesP-erry Tervisekeskus,0.8842896223068237,name_typo
29,Emergency,ER,0.8973215818405151,department_abbreviation
41,EvansRodriguez.Infectious@dept.healthcare.org,EvansRodriguez.Infectious@dept.healthcare.nl,0.9252129197120668,email_domain_change
9,Monroe.Physiotherapy@dept.healthcare.org,Monroe.Physsiotherapy@dept.healthcare.org,0.9260987043380736,email_typo
26,8857,8857,0.9327225685119628,postal_format
39,Fischamend,Fiischamend,0.9344302415847778,city_typo
34,nathankline@healthcare.org,natthankline@healthcare.org,0.9352651238441468,email_typo
55,FaulknerHoward.Nursing@dept.healthcare.org,FaulknerHoward.Nursing@dept.healthcare.de,0.9424973726272584,email_domain_change
54,michellecampbell@healthcare.org,michellecmpbell@healthcare.org,0.9461591243743896,email_typo


In [10]:
# Value pairs of the 'matches1' hits with their score and variation type,
# ordered from the lowest similarity score to the highest.
(
    matched_results1_df[['original_value', 'varied_value', 'similarity_score', 'variation_type']]
    .sort_values('similarity_score', ascending=True)
)

Unnamed: 0,original_value,varied_value,similarity_score,variation_type
44,Lisa Mckinney,Lisa Mcinney,0.7089900970458984,name_typo
43,lisamckinney@healthcare.org,lisampckinney@healthcare.org,0.7089900970458984,email_typo
55,Denise Schwartz,D. Schwartz,0.7090005278587341,abbreviated_first_name
18,Sheryl Roberts,S. Roberts,0.7097387313842773,abbreviated_first_name
17,sherylroberts@healthcare.org,sherlyroberts@healthcare.org,0.7097387313842773,email_typo
...,...,...,...,...
38,FaulknerHoward.Nursing@dept.healthcare.org,FaulknerHoward.Nursing@dept.healthcare.de,0.8170784115791321,email_domain_change
26,Richardson@healthcare.org,Richardsoon@healthcare.org,0.8181456923484802,email_typo
12,Trieben,Triebben,0.8226755857467651,city_typo
29,Peterson.Pulmonary@dept.healthcare.org,Peterson.Plumonary@dept.healthcare.org,0.8234403133392334,email_typo


In [11]:
# Keep only the 'matches' hits whose golden-standard variation type is
# 'alternative_naming', then print them.
alternative_naming_df = matched_results_df.loc[
    matched_results_df['variation_type'].eq('alternative_naming')
]
print(alternative_naming_df)

Empty DataFrame
Columns: [entity1_subject, entity2_subject, predicate, original_value, varied_value, similarity_score, duplication_type, variation_type, match_source]
Index: []


In [24]:
# Compare the 'matches' hits against the golden standard, overall and per variation type.

# Occurrences of each variation type on both sides.
variation_type_counts_golden = golden_standard['variation_type'].value_counts()
variation_type_counts_matched = matched_results_df['variation_type'].value_counts()

# Row totals and the overall share of golden-standard rows that were matched.
total_golden_duplicates = len(golden_standard)
total_matched_duplicates = len(matched_results_df)
total_matched_variations = variation_type_counts_matched.sum()
percentage_matched = (total_matched_variations / total_golden_duplicates) * 100

# Side-by-side counts; a variation type missing on one side aligns to NaN and becomes 0.
variation_comparison = pd.DataFrame({
    'Golden Standard Count': variation_type_counts_golden,
    'Matched Count': variation_type_counts_matched
})
variation_comparison = variation_comparison.fillna(0).astype(int)
variation_comparison['Matched (%)'] = (
    variation_comparison['Matched Count']
    .div(variation_comparison['Golden Standard Count'])
    .mul(100)
)

stats_summary = {
    'Total Duplicates in Golden Standard': total_golden_duplicates,
    'Total Matched Duplicates': total_matched_variations,
    'Percentage Matched (%)': percentage_matched
}

print("\nMatching Statistics:")
for label, stat in stats_summary.items():
    # Floats are shown with two decimals; counts are printed as they are.
    if isinstance(stat, float):
        print(f"{label}: {stat:.2f}")
    else:
        print(f"{label}: {stat}")

print("\nVariation Type Analysis:")
print(variation_comparison)


Matching Statistics:
Total Duplicates in Golden Standard: 83
Total Matched Duplicates: 56
Percentage Matched (%): 67.47

Variation Type Analysis:
                         Golden Standard Count  Matched Count  Matched (%)
variation_type                                                            
abbreviated_first_name                       8              5    62.500000
alternative_naming                           4              0     0.000000
city_typo                                    4              3    75.000000
country_expansion                            1              0     0.000000
date_format_variation                        1              1   100.000000
department_abbreviation                      3              2    66.666667
email_domain_change                          4              3    75.000000
email_typo                                  33             24    72.727273
house_number_suffix                          2              2   100.000000
language_expansion          

In [25]:
# Compare the 'matches1' hits against the golden standard, overall and per variation type.

# Occurrences of each variation type on both sides.
variation_type_counts_golden = golden_standard['variation_type'].value_counts()
variation_type_counts_matched = matched_results1_df['variation_type'].value_counts()

# Row totals and the overall share of golden-standard rows that were matched.
total_golden_duplicates = len(golden_standard)
total_matched_duplicates = len(matched_results1_df)
total_matched_variations = variation_type_counts_matched.sum()
percentage_matched = (total_matched_variations / total_golden_duplicates) * 100

# Number of distinct variation types present among the matched rows.
print(len(variation_type_counts_matched))

# Side-by-side counts; a variation type missing on one side aligns to NaN and becomes 0.
variation_comparison = pd.DataFrame({
    'Golden Standard Count': variation_type_counts_golden,
    'Matched Count': variation_type_counts_matched
})
variation_comparison = variation_comparison.fillna(0).astype(int)
variation_comparison['Matched (%)'] = (
    variation_comparison['Matched Count']
    .div(variation_comparison['Golden Standard Count'])
    .mul(100)
)

stats_summary = {
    'Total Duplicates in Golden Standard': total_golden_duplicates,
    'Total Matched Duplicates': total_matched_variations,
    'Percentage Matched (%)': percentage_matched
}

print("\nMatching Statistics:")
for label, stat in stats_summary.items():
    # Floats are shown with two decimals; counts are printed as they are.
    if isinstance(stat, float):
        print(f"{label}: {stat:.2f}")
    else:
        print(f"{label}: {stat}")

print("\nVariation Type Analysis:")
print(variation_comparison)

9

Matching Statistics:
Total Duplicates in Golden Standard: 83
Total Matched Duplicates: 65
Percentage Matched (%): 78.31

Variation Type Analysis:
                         Golden Standard Count  Matched Count  Matched (%)
variation_type                                                            
abbreviated_first_name                       8              7    87.500000
alternative_naming                           4              0     0.000000
city_typo                                    4              4   100.000000
country_expansion                            1              1   100.000000
date_format_variation                        1              0     0.000000
department_abbreviation                      3              0     0.000000
email_domain_change                          4              4   100.000000
email_typo                                  33             31    93.939394
house_number_suffix                          2              2   100.000000
language_expansion        