## Check for Programmatic variations


In [2]:
import pandas as pd
import json

# Load the CSV and JSON files
golden_standard = pd.read_csv('data/prog_data/updated_golden_standard_duplicates.csv')

# Load both match files
match_files = {
    #'Hacky': json.load(open('matches/matchesHacky.json', 'r')),
    #'Sent_revised': json.load(open('matches/Sent_revised.json', 'r')),
    'Distmult': json.load(open('matches/Distmult_filtered.json', 'r')),
    'NOde2vec': json.load(open('matches/Node2vec_filtered.json', 'r')),
    #'Sent_revised_g2': json.load(open('matches/SentRevisedg2.json', 'r')),
    #'Sent_revised_g3': json.load(open('matches/SentRevisedv2.json', 'r')),
    'Sent_filtered': json.load(open('matches/SentRevised_filtered.json', 'r')),
    #'Sent_revised_g4': json.load(open('matches/SentRevisedv3.json', 'r')),  
}

def extract_uuid(uri):
    return uri.split("/")[-1]

# Process each match file
for match_type, data in match_files.items():
    identifiers = []

    # Extract UUIDs directly from the subject URIs
    for match in data:
        pair = {}
        for entity_label in ['entity1', 'entity2']:
            entity = next(e[entity_label] for e in match['entities'] if entity_label in e)
            uri = entity.get("subject")
            pair[entity_label] = extract_uuid(uri) if uri else None
        identifiers.append(pair)

    # Check matches against the golden standard
    matched_rows = golden_standard[
        golden_standard.apply(
            lambda row: any(
                (pair['entity1'] == row['original_id'] and pair['entity2'] == row['duplicate_id']) 
                or (pair['entity1'] == row['duplicate_id'] and pair['entity2'] == row['original_id'])  # bi-directional match
                for pair in identifiers
            ),
            axis=1
        )
    ]

    # Overall matching statistics
    total = len(golden_standard)
    matched = len(matched_rows)
    pct_matched = matched / total * 100

    print(f"Matching Statistics for {match_type}:")
    print(f"Number of matches in {match_type}:", len(match_files[match_type]))
    print(f"Total Duplicates in Golden Standard: {total}")
    print(f"Total Matched Duplicates: {matched}")
    print(f"Percentage Matched (%): {pct_matched:.2f}\n")

    # Variation-type analysis
    gold_var = golden_standard['variation_type'].value_counts().rename('Golden Standard Count')
    match_var = matched_rows['variation_type'].value_counts().rename('Matched Count')

    variation_df = pd.concat([gold_var, match_var], axis=1).fillna(0).astype(int)
    variation_df['Matched (%)'] = variation_df['Matched Count'] / variation_df['Golden Standard Count'] * 100
    variation_df = variation_df.sort_index()

    

    # Entity-type analysis
    gold_ent = golden_standard['entity_type'].value_counts().rename('Golden Standard Count')
    match_ent = matched_rows['entity_type'].value_counts().rename('Matched Count')

    entity_df = pd.concat([gold_ent, match_ent], axis=1).fillna(0).astype(int)
    entity_df['Matched (%)'] = entity_df['Matched Count'] / entity_df['Golden Standard Count'] * 100
    entity_df = entity_df.sort_index()

    

    # Combined variation and entity-type analysis
    variation_entity_df = golden_standard.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)
    matched_variation_entity_df = matched_rows.groupby(['variation_type', 'entity_type']).size().unstack(fill_value=0)

    frames = []
    for vtype in variation_entity_df.index:
        for etype in variation_entity_df.columns:
            golden_count = variation_entity_df.at[vtype, etype] if etype in variation_entity_df.columns else 0
            matched_count = matched_variation_entity_df.at[vtype, etype] if (vtype in matched_variation_entity_df.index and etype in matched_variation_entity_df.columns) else 0
            matched_pct = (matched_count / golden_count * 100) if golden_count > 0 else 0
            frames.append({
                'variation_type': vtype,
                'entity_type': etype,
                'Golden Standard Count': golden_count,
                'Matched Count': matched_count,
                'Matched (%)': matched_pct
            })

    variation_entity_frame = pd.DataFrame(frames)
    variation_entity_frame = variation_entity_frame.sort_values(['variation_type', 'entity_type']).reset_index(drop=True)
    variation_frame = variation_entity_frame[variation_entity_frame['Golden Standard Count'] > 1]
    variation_frame = variation_frame.sort_values('entity_type').reset_index(drop=True)
    variation_frame

 # Set pandas display options to use the full width of the notebook
    pd.set_option('display.width', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.expand_frame_repr', False)
    print(f"Combined Variation and Entity-Type Analysis for {match_type}:")
    print(variation_frame)
   

Matching Statistics for Distmult:
Number of matches in Distmult: 800
Total Duplicates in Golden Standard: 789
Total Matched Duplicates: 637
Percentage Matched (%): 80.74

Combined Variation and Entity-Type Analysis for Distmult:
             variation_type             entity_type  Golden Standard Count  Matched Count  Matched (%)
0                 city_typo                 Address                     20             20   100.000000
1         country_expansion                 Address                     16              3    18.750000
2             postal_format                 Address                     26             26   100.000000
3       house_number_suffix                 Address                     27             27   100.000000
4       email_domain_change            ContactPoint                     47             46    97.872340
5                email_typo            ContactPoint                     42             40    95.238095
6                 name_typo  HealthcareOrganizatio

In [None]:
import json

# Load your match files
match_files_oldform = {
    'Dist': json.load(open('Distmatches.json', 'r')),
    'DistLit': json.load(open('DistLitmatches.json', 'r'))
}

print("Number of matches in Dist:", len(match_files_oldform['Dist']))
print("Number of matches in DistLit:", len(match_files_oldform['DistLit']))

# Loop through the correct dictionary
for match_type, data in match_files_oldform.items():
    extracted_pairs = []

    for item in data:
        # Defensive check to ensure keys exist
        if all(k in item for k in ("entity1", "entity2", "score")):
            extracted_pairs.append({
                "entity1_id": item["entity1"].rsplit("/", 1)[-1],
                "entity2_id": item["entity2"].rsplit("/", 1)[-1],
                "score": item["score"]
            })
        
            
    print(extracted_pairs[:5])  # Print first 5 pairs for verification
    # Check matches against the golden standard
    matched_rows = golden_standard[
        golden_standard.apply(
            lambda row: any(
                (pair['entity1_id'] == row['original_id'] and pair['entity2_id'] == row['duplicate_id']) 
                for pair in identifiers
            ),
            axis=1
        )
    ]
    pct_matched = len(matched_rows) / len(golden_standard) * 100 if len(golden_standard) > 0 else 0
    # Print the number of matched pairs for the current file
    print(f"Number of matched pairs in {match_type}: {len(matched_rows)} / {len(golden_standard)} Percentage Matched (%): {pct_matched:.2f}")


Number of matches in Dist: 1506
Number of matches in DistLit: 12
[{'entity1_id': 'e775f27e-2ec0-42f0-9310-0ac51932263c', 'entity2_id': 'ccc9ccfc-8111-449f-97ea-568c40b0aa75', 'score': 0.5635482668876648}, {'entity1_id': 'a88a96ca-4a8c-4f0a-a9e5-7340fdc60cf4', 'entity2_id': 'd3d90822-0729-4e06-a591-03107ef3782b', 'score': 0.5461329221725464}, {'entity1_id': 'beb2d716-50a2-4f8e-b7ed-8fd685a46efd', 'entity2_id': '3543d36c-da74-4ac4-a443-530c30e5d112', 'score': 0.5020403861999512}, {'entity1_id': '0bd293e3-ee6e-4267-a278-aafc9c62e74b', 'entity2_id': 'cff8cf24-ddc0-4140-b4e9-bfc61b416d8f', 'score': 0.9183053970336914}, {'entity1_id': '8f76781e-2215-44aa-9e5c-fe61f1e5023e', 'entity2_id': '6573d6cc-8f64-4754-bf48-7d8dca63cccc', 'score': 0.7842326164245605}]
Number of matched pairs in Dist: 0 / 789 Percentage Matched (%): 0.00
[{'entity1_id': '86e280c0-6036-4f9c-a229-7cb2cf00f0a8', 'entity2_id': 'f0af5825-66e5-4e50-8c24-8b75c635a410', 'score': 0.5034458637237549}, {'entity1_id': '10d44b3d-8aca