In [1]:
import pandas as pd

In [2]:
# Gold citation records table (parquet); the last expression shows which columns it carries.
gold_rev_parquet = "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output/gold/dataset_citation_records_Table.parquet"
ground_truth_REV = pd.read_parquet(gold_rev_parquet)
ground_truth_REV.columns

Index(['identifier', 'repository', 'citing_publication_link',
       'citation_record_source', 'citation_record_from_doi'],
      dtype='object')

In [3]:
# Gold data links (Excel workbook), with the spreadsheet headers mapped onto the
# column names used by the scoring code further down.
gold_exp_column_names = {
    'Source Page': 'citing_publication_link',
    'UID': 'identifier',
    'Dataset Webpage': 'dataset_webpage',
    'Repo Name': 'repository',
}
ground_truth_EXP = pd.read_excel(
    "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output/gold/GT_DataLinks_PMC_and_Fenyo.xlsx"
).rename(columns=gold_exp_column_names)

In [4]:
# Load the four prediction tables (CSV exports): FDR and RTR runs for
# gpt-4o-mini and gemini-2.0-flash, as named in the file paths.
DataRef_EXP_FDR_gpt = pd.read_csv(
    "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output/DataRef-EXP_FDR_gpt-4o-mini.csv"
)
DataRef_EXP_RTR_gpt = pd.read_csv(
    "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output/DataRef-EXP_RTR_gpt-4o-mini.csv"
)
DataRef_EXP_FDR_gemini = pd.read_csv(
    "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output/DataRef-EXP_FDR_gemini-2.0-flash.csv"
)

# Only this table is renamed: 'dataset_id' and 'repository_reference' are mapped onto
# the column names selected from all four tables later (labels that are absent are ignored).
DataRef_EXP_RTR_gemini = pd.read_csv(
    "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output/DataRef-EXP_RTR_gemini-2.0-flash.csv"
).rename(columns={'dataset_id': 'dataset_identifier', 'repository_reference': 'data_repository'})

In [5]:
# A prediction row without a dataset identifier cannot be scored, so discard it
# from every prediction table.
DataRef_EXP_FDR_gpt = DataRef_EXP_FDR_gpt.dropna(subset=["dataset_identifier"])
DataRef_EXP_RTR_gpt = DataRef_EXP_RTR_gpt.dropna(subset=["dataset_identifier"])
DataRef_EXP_FDR_gemini = DataRef_EXP_FDR_gemini.dropna(subset=["dataset_identifier"])
DataRef_EXP_RTR_gemini = DataRef_EXP_RTR_gemini.dropna(subset=["dataset_identifier"])

In [6]:
# Keep only the columns of interest: the three fields read by the scoring function.
scoring_columns = ["source_url", "dataset_identifier", "data_repository"]
DataRef_EXP_FDR_gpt = DataRef_EXP_FDR_gpt[scoring_columns]
DataRef_EXP_RTR_gpt = DataRef_EXP_RTR_gpt[scoring_columns]
DataRef_EXP_FDR_gemini = DataRef_EXP_FDR_gemini[scoring_columns]
DataRef_EXP_RTR_gemini = DataRef_EXP_RTR_gemini[scoring_columns]

In [7]:
def _normalize_identifiers(values: pd.Series) -> set:
    """Return the distinct non-empty entries of ``values`` as stripped, lower-cased strings."""
    # Cast to str before touching ``.str``: the accessor rejects an all-numeric
    # selection and turns numeric entries of a mixed column into NaN.
    cleaned = values.dropna().astype(str).str.strip().str.lower()
    # An empty string is a substring of every identifier, so it would "match" anything.
    return set(cleaned[cleaned != ""])


def calculate_performance_metrics(ground_truth_df: pd.DataFrame, eval_df: pd.DataFrame, verbose: bool = True) -> dict:
    """
    Calculate performance metrics (precision, recall, F1-score) for dataset extraction,
    considering exact, partial, and alias matches.

    Precision and recall are computed separately for every source page that appears in
    the ground truth and then averaged over those pages. The reported F1-score is the
    harmonic mean of the two averages (not an average of per-page F1 values).

    Matching is case-insensitive and ignores surrounding whitespace. An evaluation
    identifier counts as a true positive when it equals a ground-truth identifier, or
    when one of the two is a substring of the other (partial/alias match). Each
    ground-truth identifier can be consumed by at most one evaluation identifier.
    Unmatched evaluation identifiers that are substrings/superstrings of an already
    matched ground-truth identifier are treated as aliases and not counted as false
    positives.

    Args:
        ground_truth_df: Gold annotations. Must provide the columns
            'citing_publication_link', 'identifier' and 'repository'.
        eval_df: Predictions. Must provide the columns 'source_url',
            'dataset_identifier' and 'data_repository'.
        verbose: When True (the default) print a per-page debugging trace.

    Returns:
        dict with the keys 'average_precision', 'average_recall' and 'f1_score'.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    # Drop missing source pages before iterating so that the pages we loop over are
    # exactly the pages we divide by; otherwise a NaN page adds to the totals without
    # being counted in the denominator.
    source_pages = ground_truth_df['citing_publication_link'].dropna().unique()
    num_sources = len(source_pages)

    total_precision = 0
    total_recall = 0

    log(f"Number of unique source pages: {num_sources, len(ground_truth_df)}")

    # Score each source page independently
    for source_page in source_pages:
        log(f"\nProcessing source page: {source_page}")

        # Ground truth datasets for this source page
        gt_data = ground_truth_df[ground_truth_df['citing_publication_link'] == source_page]
        gt_datasets = _normalize_identifiers(gt_data['identifier'])
        gt_repositories = _normalize_identifiers(gt_data['repository'])

        log(f"Ground truth datasets: {gt_datasets}")
        log(f"Ground truth repositories: {gt_repositories}")

        # Evaluation datasets for this source page
        eval_data = eval_df[eval_df['source_url'] == source_page]
        eval_datasets = _normalize_identifiers(eval_data['dataset_identifier'])
        eval_repositories = _normalize_identifiers(eval_data['data_repository'])

        log(f"Evaluation datasets: {eval_datasets}")
        log(f"Evaluation repositories: {eval_repositories}")

        # Nothing to find and nothing reported: count the page as a perfect result
        if not gt_datasets and not eval_datasets:
            log("No datasets in both ground truth and evaluation. Perfect precision and recall.")
            total_precision += 1
            total_recall += 1
            continue

        # Exact matches first; they also seed the "already matched" bookkeeping
        exact_ids = eval_datasets & gt_datasets
        for eval_id in sorted(exact_ids):
            log(f"Exact match found: {eval_id}")
        exact_matches = len(exact_ids)
        matched_gt = set(exact_ids)
        matched_eval = set(exact_ids)

        # Partial / alias matches on what is left. The pairing is greedy, so iterate in
        # sorted order: plain set iteration would make the outcome depend on hash order.
        partial_matches = 0
        for eval_id in sorted(eval_datasets - matched_eval):
            for gt_id in sorted(gt_datasets - matched_gt):
                if eval_id in gt_id or gt_id in eval_id:  # Partial match or alias
                    log(f"Partial or alias match found: eval_id={eval_id}, gt_id={gt_id}")
                    partial_matches += 1
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break

        # False positives: unmatched evaluation ids, minus aliases of matched ground truth
        FP = eval_datasets - matched_eval
        for eval_id in sorted(FP):
            for matched_id in sorted(matched_gt):
                if eval_id in matched_id or matched_id in eval_id:  # Alias detected
                    log(f"Removing alias from false positives: eval_id={eval_id}, matched_id={matched_id}")
                    FP.discard(eval_id)
                    break

        false_positives = len(FP)
        log(f"False positives: {FP}")

        # False negatives: ground-truth ids that nothing matched
        FN = gt_datasets - matched_gt
        false_negatives = len(FN)
        log(f"False negatives: {FN}")

        # Per-page precision and recall; an empty denominator scores 0
        true_positives = exact_matches + partial_matches
        predicted_total = true_positives + false_positives
        expected_total = true_positives + false_negatives
        precision = true_positives / predicted_total if predicted_total > 0 else 0
        recall = true_positives / expected_total if expected_total > 0 else 0

        log(f"Precision for {source_page}: {precision}")
        log(f"Recall for {source_page}: {recall}")

        total_precision += precision
        total_recall += recall

    # Average over the source pages, then combine the two averages into F1
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    f1_score = 2 * (average_precision * average_recall) / (average_precision + average_recall) if (average_precision + average_recall) > 0 else 0

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [8]:
# Keep only the gold rows whose source page also occurs in the gpt-4o-mini FDR
# predictions, then show how many rows remain.
page_has_predictions = ground_truth_EXP['citing_publication_link'].isin(DataRef_EXP_FDR_gpt['source_url'])
ground_truth = ground_truth_EXP[page_has_predictions]
len(ground_truth)

47

In [9]:
# Score the gpt-4o-mini FDR predictions against the filtered ground truth.
DataRef_EXP_FDR_gpt_metrics = calculate_performance_metrics(ground_truth, DataRef_EXP_FDR_gpt)

# Report the three summary scores, rounded to four decimals.
print("Performance Metrics:")
for metric_label, metric_key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{metric_label}: {DataRef_EXP_FDR_gpt_metrics[metric_key]:.4f}")

Number of unique source pages: (20, 47)

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth repositories: {'pride'}
Evaluation datasets: {'pxd053567'}
Evaluation repositories: {'pride'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth repositories: {'massive'}
Evaluation datasets: {'msv000092944'}
Evaluation repositories: {'massive'}
Exact match found: msv000092944
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/a

In [10]:
# Score the gpt-4o-mini RTR predictions against the filtered ground truth.
DataRef_EXP_RTR_gpt_metrics = calculate_performance_metrics(ground_truth, DataRef_EXP_RTR_gpt)

# Report the three summary scores, rounded to four decimals.
print("Performance Metrics:")
for metric_label, metric_key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{metric_label}: {DataRef_EXP_RTR_gpt_metrics[metric_key]:.4f}")

Number of unique source pages: (20, 47)

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth repositories: {'pride'}
Evaluation datasets: {'pxd053567'}
Evaluation repositories: {'pride'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth repositories: {'massive'}
Evaluation datasets: {'msv000092944'}
Evaluation repositories: {'massive'}
Exact match found: msv000092944
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/a

In [11]:
# Score the gemini-2.0-flash FDR predictions against the filtered ground truth.
DataRef_EXP_FDR_gemini_metrics = calculate_performance_metrics(ground_truth, DataRef_EXP_FDR_gemini)

# Report the three summary scores, rounded to four decimals.
print("Performance Metrics:")
for metric_label, metric_key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{metric_label}: {DataRef_EXP_FDR_gemini_metrics[metric_key]:.4f}")

Number of unique source pages: (20, 47)

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth repositories: {'pride'}
Evaluation datasets: {'pxd053567'}
Evaluation repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth repositories: {'massive'}
Evaluation datasets: {'msv000092944'}
Evaluation repositories: {'massive.ucsd.edu'}
Exact match found: msv000092944
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0

Processing source p

In [12]:
# Score the gemini-2.0-flash RTR predictions against the filtered ground truth.
DataRef_EXP_RTR_gemini_metrics = calculate_performance_metrics(ground_truth, DataRef_EXP_RTR_gemini)

# Report the three summary scores, rounded to four decimals.
print("Performance Metrics:")
for metric_label, metric_key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{metric_label}: {DataRef_EXP_RTR_gemini_metrics[metric_key]:.4f}")

Number of unique source pages: (20, 47)

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth repositories: {'pride'}
Evaluation datasets: {'pxd053567'}
Evaluation repositories: {'pride'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth repositories: {'massive'}
Evaluation datasets: {'msv000092944'}
Evaluation repositories: {'massive'}
Exact match found: msv000092944
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/a