In [None]:
import pandas as pd
import json

# Load the golden-standard duplicate pairs. The evaluation below reads its
# `original_id` and `duplicate_id` columns.
golden_standard = pd.read_csv('data/prog_data/updated_golden_standard_duplicates.csv')

# Match files to evaluate, keyed by the label used in the printed report.
match_paths = {
    'Hacky': 'matchesHacky.json',
    'Node2Vec': 'matches/example_matches.json',
    # 'Dist': 'Distmatches.json',
    # 'DistLit': 'DistLitmatches.json',
}

# Parse each file inside a context manager so no file handle is left open.
match_files = {}
for match_type, match_path in match_paths.items():
    with open(match_path, 'r') as match_file:
        match_files[match_type] = json.load(match_file)

print("Number of matches in Hacky:", len(match_files['Hacky']))
print("Number of matches in Node2vec:", len(match_files['Node2Vec']))

# Matched golden-standard rows per match file. Created once, outside the loop,
# so that it keeps an entry for every file rather than only the last one.
matched_rows_df = {}

# Process each match file
for match_type, data in match_files.items():
    identifiers = []

    # Extract the schema.org identifier of entity1 and entity2 for every match.
    # An entity without such a predicate contributes None.
    for match in data:
        pair = {}
        for entity_label in ['entity1', 'entity2']:
            entity = next(e[entity_label] for e in match['entities'] if entity_label in e)
            identifier = next(
                (pred['object'] for pred in entity['predicates'] if pred['predicate'] == "https://schema.org/identifier"),
                None
            )
            pair[entity_label] = identifier
        identifiers.append(pair)

    # Check matches against the golden standard. The comparison is directional:
    # entity1 must be the original and entity2 the duplicate. A set lookup
    # replaces scanning every predicted pair for every golden row.
    # NOTE(review): assumes identifier values are hashable scalars (strings or
    # numbers) - confirm against the match file schema.
    predicted_pairs = {(pair['entity1'], pair['entity2']) for pair in identifiers}
    is_matched = pd.Series(
        [
            (original_id, duplicate_id) in predicted_pairs
            for original_id, duplicate_id in zip(golden_standard['original_id'], golden_standard['duplicate_id'])
        ],
        index=golden_standard.index,
        dtype=bool,
    )
    matched_rows = golden_standard[is_matched]
    matched_rows_df[match_type] = matched_rows

    pct_matched = len(matched_rows) / len(golden_standard) * 100 if len(golden_standard) > 0 else 0
    # Print the number of matched pairs for the current file
    print(f"Number of matched pairs in {match_type}: {len(matched_rows)} / {len(golden_standard)} Percentage Matched (%): {pct_matched:.2f}")

# Nothing is written to disk here; the matched rows stay available in
# `matched_rows_df`, and `matched_rows` holds the rows of the last file processed.

    



Number of matches in Hacky: 2658
Number of matches in Node2vec: 1452
Number of matched pairs in Hacky: 425 / 789 Percentage Matched (%): 53.87
Number of matched pairs in Node2Vec: 632 / 789 Percentage Matched (%): 80.10


In [30]:
import pandas as pd

# Relies on `golden_standard` and `matched_rows` defined by the previous cell.
# `matched_rows` is whatever that cell's loop assigned last, so the figures
# below describe only the final match file it processed.

# 1) Overall matching statistics
total = len(golden_standard)
matched = len(matched_rows)
# Guard against an empty golden standard, mirroring the check in the loading cell.
pct_matched = matched / total * 100 if total > 0 else 0

print("Matching Statistics:")
print(f"Total Duplicates in Golden Standard: {total}")
print(f"Total Matched Duplicates: {matched}")
print(f"Percentage Matched (%): {pct_matched:.2f}\n")

# 2) Variation-type analysis
#   count in golden, count in matched, then % matched
gold_var = golden_standard['variation_type'].value_counts().rename('Golden Standard Count')
match_var = matched_rows['variation_type'].value_counts().rename('Matched Count')

# Variation types with no matched row come out of the concat as NaN; count them as 0.
variation_df = pd.concat([gold_var, match_var], axis=1).fillna(0).astype(int)
variation_df['Matched (%)'] = variation_df['Matched Count'] / variation_df['Golden Standard Count'] * 100
variation_df = variation_df.sort_index()

# 3) Entity-type analysis (same layout, keyed by entity_type)
gold_ent = golden_standard['entity_type'].value_counts().rename('Golden Standard Count')
match_ent = matched_rows['entity_type'].value_counts().rename('Matched Count')

entity_df = pd.concat([gold_ent, match_ent], axis=1).fillna(0).astype(int)
entity_df['Matched (%)'] = entity_df['Matched Count'] / entity_df['Golden Standard Count'] * 100
entity_df = entity_df.sort_index()

# 4) Combined analysis: one row per (variation_type, entity_type) combination.
# Both tables have variation types as rows and entity types as columns,
# with 0 for combinations that never occur.
variation_entity_df = golden_standard.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)
matched_variation_entity_df = matched_rows.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)

frames = []
for vtype in variation_entity_df.index:
    vtype_has_matches = vtype in matched_variation_entity_df.index
    for etype in variation_entity_df.columns:
        golden_count = variation_entity_df.at[vtype, etype]
        # The matched table may lack a row or column when nothing of that
        # variation or entity type was matched; treat that as zero matches.
        if vtype_has_matches and etype in matched_variation_entity_df.columns:
            matched_count = matched_variation_entity_df.at[vtype, etype]
        else:
            matched_count = 0
        matched_pct = (matched_count / golden_count * 100) if golden_count > 0 else 0
        frames.append({
            'variation_type': vtype,
            'entity_type': etype,
            'Golden Standard Count': golden_count,
            'Matched Count': matched_count,
            'Matched (%)': matched_pct
        })

variation_entity_frame = pd.DataFrame(frames)
variation_entity_frame = variation_entity_frame.sort_values(['variation_type', 'entity_type']).reset_index(drop=True)
# Keep only combinations with more than one golden-standard pair. This drops
# the combinations that never occur and also those with a single pair.
variation_frame = variation_entity_frame[variation_entity_frame['Golden Standard Count'] > 1]
variation_frame = variation_frame.sort_values('entity_type').reset_index(drop=True)
variation_frame



Matching Statistics:
Total Duplicates in Golden Standard: 789
Total Matched Duplicates: 632
Percentage Matched (%): 80.10



Unnamed: 0,variation_type,entity_type,Golden Standard Count,Matched Count,Matched (%)
0,city_typo,Address,20,20,100.0
1,country_expansion,Address,16,3,18.75
2,postal_format,Address,26,26,100.0
3,house_number_suffix,Address,27,27,100.0
4,email_domain_change,ContactPoint,47,46,97.87234
5,email_typo,ContactPoint,42,42,100.0
6,name_typo,HealthcareOrganization,7,0,0.0
7,name_abbreviation,HealthcareOrganization,3,1,33.333333
8,email_typo,HealthcarePersonnel,261,261,100.0
9,name_typo,Person,51,51,100.0


In [None]:
import pandas as pd
import json

# Load the golden-standard duplicate pairs. The evaluation below reads its
# `original_id`, `duplicate_id`, `variation_type` and `entity_type` columns.
golden_standard = pd.read_csv('data/prog_data/updated_golden_standard_duplicates.csv')

# Load both match files, closing each file handle as soon as it has been parsed.
match_files = {}
for match_type, match_path in [
    ('Hacky', 'matchesHacky.json'),
    ('Node2Vec', 'matches/example_matches.json'),
]:
    with open(match_path, 'r') as match_file:
        match_files[match_type] = json.load(match_file)

# Set pandas display options to use the full width of the notebook.
# They are global and do not depend on the match file, so set them once.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Process each match file
for match_type, data in match_files.items():
    identifiers = []

    # Extract the schema.org identifier of entity1 and entity2 for every match.
    # An entity without such a predicate contributes None.
    for match in data:
        pair = {}
        for entity_label in ['entity1', 'entity2']:
            entity = next(e[entity_label] for e in match['entities'] if entity_label in e)
            identifier = next(
                (pred['object'] for pred in entity['predicates'] if pred['predicate'] == "https://schema.org/identifier"),
                None
            )
            pair[entity_label] = identifier
        identifiers.append(pair)

    # Check matches against the golden standard. The comparison is directional:
    # entity1 must be the original and entity2 the duplicate. A set lookup
    # replaces scanning every predicted pair for every golden row.
    # NOTE(review): assumes identifier values are hashable scalars (strings or
    # numbers) - confirm against the match file schema.
    predicted_pairs = {(pair['entity1'], pair['entity2']) for pair in identifiers}
    is_matched = pd.Series(
        [
            (original_id, duplicate_id) in predicted_pairs
            for original_id, duplicate_id in zip(golden_standard['original_id'], golden_standard['duplicate_id'])
        ],
        index=golden_standard.index,
        dtype=bool,
    )
    matched_rows = golden_standard[is_matched]

    # Overall matching statistics (guarded so an empty golden standard reports 0).
    total = len(golden_standard)
    matched = len(matched_rows)
    pct_matched = matched / total * 100 if total > 0 else 0

    print(f"Matching Statistics for {match_type}:")
    print(f"Total Duplicates in Golden Standard: {total}")
    print(f"Total Matched Duplicates: {matched}")
    print(f"Percentage Matched (%): {pct_matched:.2f}\n")

    # Variation-type analysis: golden count, matched count, then % matched.
    gold_var = golden_standard['variation_type'].value_counts().rename('Golden Standard Count')
    match_var = matched_rows['variation_type'].value_counts().rename('Matched Count')

    # Types with no matched row come out of the concat as NaN; count them as 0.
    variation_df = pd.concat([gold_var, match_var], axis=1).fillna(0).astype(int)
    variation_df['Matched (%)'] = variation_df['Matched Count'] / variation_df['Golden Standard Count'] * 100
    variation_df = variation_df.sort_index()

    # Entity-type analysis (same layout, keyed by entity_type).
    gold_ent = golden_standard['entity_type'].value_counts().rename('Golden Standard Count')
    match_ent = matched_rows['entity_type'].value_counts().rename('Matched Count')

    entity_df = pd.concat([gold_ent, match_ent], axis=1).fillna(0).astype(int)
    entity_df['Matched (%)'] = entity_df['Matched Count'] / entity_df['Golden Standard Count'] * 100
    entity_df = entity_df.sort_index()

    # Combined variation and entity-type analysis. Both tables have variation
    # types as rows and entity types as columns, with 0 for absent combinations.
    variation_entity_df = golden_standard.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)
    matched_variation_entity_df = matched_rows.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)

    frames = []
    for vtype in variation_entity_df.index:
        vtype_has_matches = vtype in matched_variation_entity_df.index
        for etype in variation_entity_df.columns:
            golden_count = variation_entity_df.at[vtype, etype]
            # The matched table may lack a row or column when nothing of that
            # variation or entity type was matched; treat that as zero matches.
            if vtype_has_matches and etype in matched_variation_entity_df.columns:
                matched_count = matched_variation_entity_df.at[vtype, etype]
            else:
                matched_count = 0
            matched_pct = (matched_count / golden_count * 100) if golden_count > 0 else 0
            frames.append({
                'variation_type': vtype,
                'entity_type': etype,
                'Golden Standard Count': golden_count,
                'Matched Count': matched_count,
                'Matched (%)': matched_pct
            })

    variation_entity_frame = pd.DataFrame(frames)
    variation_entity_frame = variation_entity_frame.sort_values(['variation_type', 'entity_type']).reset_index(drop=True)
    # Keep only combinations with more than one golden-standard pair. This drops
    # the combinations that never occur and also those with a single pair.
    variation_frame = variation_entity_frame[variation_entity_frame['Golden Standard Count'] > 1]
    variation_frame = variation_frame.sort_values('entity_type').reset_index(drop=True)

    # A bare expression shows nothing inside a loop, so print the table explicitly.
    print(f"Combined Variation and Entity-Type Analysis for {match_type}:")
    print(variation_frame)
   

Matching Statistics for Hacky:
Total Duplicates in Golden Standard: 789
Total Matched Duplicates: 425
Percentage Matched (%): 53.87

Combined Variation and Entity-Type Analysis for Hacky:
             variation_type             entity_type  Golden Standard Count  Matched Count  Matched (%)
0                 city_typo                 Address                     20             18    90.000000
1         country_expansion                 Address                     16              0     0.000000
2             postal_format                 Address                     26             22    84.615385
3       house_number_suffix                 Address                     27             19    70.370370
4       email_domain_change            ContactPoint                     47             33    70.212766
5                email_typo            ContactPoint                     42             25    59.523810
6                 name_typo  HealthcareOrganization                      7              5  

In [1]:
import pandas as pd
import json

# Load the golden-standard duplicate pairs.
golden_standard = pd.read_csv('data/prog_data/updated_golden_standard_duplicates.csv')

# Match files to compare, keyed by the label used in the printed report.
match_paths = {
    # 'Distmult': 'math.json',
    # 'DistmultLit': 'matches/example_matches.json',
    'Hacky_new': 'revisedHacky.json',
    'Hacky': 'matchesHacky.json',
}

# Parse each file inside a context manager so no file handle is left open.
match_files = {}
for match_type, match_path in match_paths.items():
    with open(match_path, 'r') as match_file:
        match_files[match_type] = json.load(match_file)

print("Number of matches in Hacky_new:", len(match_files['Hacky_new']))
print("Number of matches in Hacky:", len(match_files['Hacky']))

def extract_uuid(uri: str) -> str:
    """Return the part of ``uri`` that follows its last ``/``.

    A string without any ``/`` is returned unchanged, and a string ending
    in ``/`` yields an empty string.
    """
    _head, _sep, last_segment = uri.rpartition("/")
    return last_segment

# Set pandas display options to use the full width of the notebook.
# They are global and do not depend on the match file, so set them once.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Process each match file
for match_type, data in match_files.items():
    identifiers = []

    # Extract UUIDs directly from the subject URIs; an entity without a
    # "subject" value contributes None.
    for match in data:
        pair = {}
        for entity_label in ['entity1', 'entity2']:
            entity = next(e[entity_label] for e in match['entities'] if entity_label in e)
            uri = entity.get("subject")
            pair[entity_label] = extract_uuid(uri) if uri else None
        identifiers.append(pair)

    # Check matches against the golden standard in both directions: a predicted
    # pair counts whether entity1 is the original or the duplicate. Storing both
    # orientations in a set replaces scanning every pair for every golden row.
    predicted_pairs = set()
    for pair in identifiers:
        predicted_pairs.add((pair['entity1'], pair['entity2']))
        predicted_pairs.add((pair['entity2'], pair['entity1']))

    is_matched = pd.Series(
        [
            (original_id, duplicate_id) in predicted_pairs
            for original_id, duplicate_id in zip(golden_standard['original_id'], golden_standard['duplicate_id'])
        ],
        index=golden_standard.index,
        dtype=bool,
    )
    matched_rows = golden_standard[is_matched]

    # Overall matching statistics (guarded so an empty golden standard reports 0).
    total = len(golden_standard)
    matched = len(matched_rows)
    pct_matched = matched / total * 100 if total > 0 else 0

    print(f"Matching Statistics for {match_type}:")
    print(f"Total Duplicates in Golden Standard: {total}")
    print(f"Total Matched Duplicates: {matched}")
    print(f"Percentage Matched (%): {pct_matched:.2f}\n")

    # Variation-type analysis: golden count, matched count, then % matched.
    gold_var = golden_standard['variation_type'].value_counts().rename('Golden Standard Count')
    match_var = matched_rows['variation_type'].value_counts().rename('Matched Count')

    # Types with no matched row come out of the concat as NaN; count them as 0.
    variation_df = pd.concat([gold_var, match_var], axis=1).fillna(0).astype(int)
    variation_df['Matched (%)'] = variation_df['Matched Count'] / variation_df['Golden Standard Count'] * 100
    variation_df = variation_df.sort_index()

    # Entity-type analysis (same layout, keyed by entity_type).
    gold_ent = golden_standard['entity_type'].value_counts().rename('Golden Standard Count')
    match_ent = matched_rows['entity_type'].value_counts().rename('Matched Count')

    entity_df = pd.concat([gold_ent, match_ent], axis=1).fillna(0).astype(int)
    entity_df['Matched (%)'] = entity_df['Matched Count'] / entity_df['Golden Standard Count'] * 100
    entity_df = entity_df.sort_index()

    # Combined variation and entity-type analysis. Both tables have variation
    # types as rows and entity types as columns, with 0 for absent combinations.
    variation_entity_df = golden_standard.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)
    matched_variation_entity_df = matched_rows.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)

    frames = []
    for vtype in variation_entity_df.index:
        vtype_has_matches = vtype in matched_variation_entity_df.index
        for etype in variation_entity_df.columns:
            golden_count = variation_entity_df.at[vtype, etype]
            # The matched table may lack a row or column when nothing of that
            # variation or entity type was matched; treat that as zero matches.
            if vtype_has_matches and etype in matched_variation_entity_df.columns:
                matched_count = matched_variation_entity_df.at[vtype, etype]
            else:
                matched_count = 0
            matched_pct = (matched_count / golden_count * 100) if golden_count > 0 else 0
            frames.append({
                'variation_type': vtype,
                'entity_type': etype,
                'Golden Standard Count': golden_count,
                'Matched Count': matched_count,
                'Matched (%)': matched_pct
            })

    variation_entity_frame = pd.DataFrame(frames)
    variation_entity_frame = variation_entity_frame.sort_values(['variation_type', 'entity_type']).reset_index(drop=True)
    # Keep only combinations with more than one golden-standard pair. This drops
    # the combinations that never occur and also those with a single pair.
    variation_frame = variation_entity_frame[variation_entity_frame['Golden Standard Count'] > 1]
    variation_frame = variation_frame.sort_values('entity_type').reset_index(drop=True)

    # A bare expression shows nothing inside a loop, so print the table explicitly.
    print(f"Combined Variation and Entity-Type Analysis for {match_type}:")
    print(variation_frame)
   

Number of matches in Hacky_new: 2655
Number of matches in Hacky: 2658
Matching Statistics for Hacky_new:
Total Duplicates in Golden Standard: 789
Total Matched Duplicates: 640
Percentage Matched (%): 81.12

Combined Variation and Entity-Type Analysis for Hacky_new:
             variation_type             entity_type  Golden Standard Count  Matched Count  Matched (%)
0                 city_typo                 Address                     20             20   100.000000
1         country_expansion                 Address                     16              0     0.000000
2             postal_format                 Address                     26             26   100.000000
3       house_number_suffix                 Address                     27             27   100.000000
4       email_domain_change            ContactPoint                     47             47   100.000000
5                email_typo            ContactPoint                     42             37    88.095238
6            