## Statistics Calculation

In [16]:
import pandas as pd
import os


ground_truth_path = os.path.join("files", "labeled_data.csv")

# Load ground-truth matches 
ground_truth_df = pd.read_csv(ground_truth_path, skiprows=5)  

# Only keep positive matches (gold == 1)
ground_truth_matches = set(zip(
    ground_truth_df.loc[ground_truth_df['gold'] == 1, 'ltable.ID'],
    ground_truth_df.loc[ground_truth_df['gold'] == 1, 'rtable.ID']
))

print(f"Total ground-truth matches: {len(ground_truth_matches)}")

# === Load my candidate set
candidate_df = pd.read_csv('entity_matches.csv')  

# Build candidate set
candidate_set = set(zip(candidate_df['left_id'], candidate_df['right_id']))

print(f"Total candidate pairs generated: {len(candidate_set)}")

# === Compute metrics ===

# True positives = intersection between candidate set and ground truth matches
true_positives = ground_truth_matches.intersection(candidate_set)

# Recall = (# of true matches found) / (total true matches)
recall = len(true_positives) / len(ground_truth_matches) if len(ground_truth_matches) > 0 else 0.0

# Precision = (# of true matches found) / (# of candidate pairs)
precision = len(true_positives) / len(candidate_set) if len(candidate_set) > 0 else 0.0

# Reduction Ratio (RR) = 1 - (# candidate pairs / total possible pairs)
# Total possible pairs = len(ltable) * len(rtable)
# You need to know how many total items were in original tables
n_ltable = ground_truth_df['ltable.ID'].nunique()
n_rtable = ground_truth_df['rtable.ID'].nunique()
total_possible_pairs = n_ltable * n_rtable

reduction_ratio = 1 - (len(candidate_set) / total_possible_pairs)

# === Print results ===
print(f"\n🔎 Evaluation Metrics:")
print(f"Blocking Recall: {recall:.4f}")
print(f"Blocking Precision: {precision:.4f}")
print(f"Reduction Ratio: {reduction_ratio:.4f}")

Total ground-truth matches: 58
Total candidate pairs generated: 127770

🔎 Evaluation Metrics:
Blocking Recall: 0.7414
Blocking Precision: 0.0003
Reduction Ratio: -0.0315
