In [1]:
import pandas as pd

In [2]:
# Load the gold-standard annotations and every model's extraction output.
# NOTE(review): these are absolute, machine-specific locations kept unchanged so
# existing runs still work; consider deriving OUTPUT_DIR from a project-relative
# path or an environment variable before sharing the notebook.
OUTPUT_DIR = "/Users/pietro/Desktop/VIDA-NYU/data-gatherer/output"
EVAL_COLUMNS = ["source_url", "dataset_identifier", "dataset_webpage", "data_repository"]


def load_eval_output(csv_path: str) -> pd.DataFrame:
    """Read one extraction-output CSV and prepare it for evaluation.

    Rows with an empty ``dataset_identifier`` are dropped and only the columns
    listed in ``EVAL_COLUMNS`` are kept. A new frame is returned instead of
    mutating one in place, so re-running the cell always yields the same result.
    """
    return pd.read_csv(csv_path).dropna(subset=["dataset_identifier"])[EVAL_COLUMNS]


# read_excel loads the first sheet by default — presumably the Articles tab;
# pass sheet_name explicitly if that is not the case.
ground_truth = pd.read_excel(f"{OUTPUT_DIR}/gold/GT_DataLinks_PMC_and_Fenyo.xlsx")

eval_output_gemma_DAS = load_eval_output(f"{OUTPUT_DIR}/output_full_Gemma_DAS.csv")
eval_output_gemma = load_eval_output(f"{OUTPUT_DIR}/output_full_Gemma.csv")
eval_output_GPT_DAS = load_eval_output(f"{OUTPUT_DIR}/output_full-GPT-DAS.csv")
eval_output_GPT = load_eval_output(f"{OUTPUT_DIR}/output_full-GPT.csv")
eval_output_Gemini_15 = load_eval_output(f"{OUTPUT_DIR}/output_full-GEMINI-1.5.csv")
# NOTE(review): unlike the files above, this path is rooted at "/output" rather
# than OUTPUT_DIR; left unchanged — confirm it is intended.
eval_output_Gemini_20 = load_eval_output("/output/output_full-GEMINI-2.0-exp.csv")

In [3]:
def _normalized_id_set(values: pd.Series) -> set:
    """Return the distinct, non-empty entries of ``values`` as stripped, lower-cased strings."""
    # astype(str) keeps the .str accessor usable when the column holds no strings
    # at all (e.g. an all-NaN column is read as float64 and would otherwise raise).
    cleaned = values.dropna().astype(str).str.strip().str.lower()
    # An empty string is a substring of every identifier and would match anything.
    return {value for value in cleaned if value}


def calculate_performance_metrics(ground_truth_df: pd.DataFrame, eval_df: pd.DataFrame, verbose: bool = True) -> dict:
    """
    Calculate performance metrics (precision, recall, F1-score) for dataset extraction,
    considering exact, partial, and alias matches.

    For every non-null ``Source Page`` in ``ground_truth_df`` the ground-truth ``UID``
    values are compared (case-insensitively, ignoring surrounding whitespace) with the
    ``dataset_identifier`` values of the ``eval_df`` rows whose ``source_url`` equals
    that page:

    * an identifier present on both sides is an exact match;
    * a remaining identifier that contains, or is contained in, a remaining
      ground-truth identifier is a partial/alias match (each identifier is used once);
    * a leftover evaluation identifier that is a substring/superstring of an already
      matched ground-truth identifier is treated as an alias and is not counted as a
      false positive;
    * a page with no identifiers on either side scores precision = recall = 1.

    Per-page precision and recall are averaged over the source pages, and the F1-score
    is computed from those two averages.

    Parameters
    ----------
    ground_truth_df : pd.DataFrame
        Must contain the columns 'Source Page', 'UID', 'Dataset Webpage' and
        'Repository Link'.
    eval_df : pd.DataFrame
        Must contain the columns 'source_url', 'dataset_identifier',
        'dataset_webpage' and 'data_repository'.
    verbose : bool, default True
        When True, print the per-page debugging trace; when False, stay silent.

    Returns
    -------
    dict
        Keys 'average_precision', 'average_recall' and 'f1_score'.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    # Initialize counters for total scores
    total_precision = 0
    total_recall = 0

    # Use one NaN-free list for both the loop and the denominator; counting with
    # nunique() while looping over unique() would score a NaN page without counting it.
    source_pages = ground_truth_df['Source Page'].dropna().unique()
    num_sources = len(source_pages)

    log(f"Number of unique source pages: {num_sources}")

    # Iterate through each unique source page
    for source_page in source_pages:
        log(f"\nProcessing source page: {source_page}")

        # Extract ground truth datasets for this source page
        gt_data = ground_truth_df[ground_truth_df['Source Page'] == source_page]
        gt_datasets = _normalized_id_set(gt_data['UID'])
        gt_webpages = _normalized_id_set(gt_data['Dataset Webpage'])
        gt_repositories = _normalized_id_set(gt_data['Repository Link'])

        log(f"Ground truth datasets: {gt_datasets}")
        log(f"Ground truth webpages: {gt_webpages}")
        log(f"Ground truth repositories: {gt_repositories}")

        # Extract evaluation datasets for this source page
        eval_data = eval_df[eval_df['source_url'] == source_page]
        eval_datasets = _normalized_id_set(eval_data['dataset_identifier'])
        eval_webpages = _normalized_id_set(eval_data['dataset_webpage'])
        eval_repositories = _normalized_id_set(eval_data['data_repository'])

        log(f"Evaluation datasets: {eval_datasets}")
        log(f"Evaluation webpages: {eval_webpages}")
        log(f"Evaluation repositories: {eval_repositories}")

        # Handle cases where both ground truth and evaluation are empty
        if not gt_datasets and not eval_datasets:
            log("No datasets in both ground truth and evaluation. Perfect precision and recall.")
            total_precision += 1
            total_recall += 1
            continue

        # Track matches to avoid double counting
        matched_gt = set()
        matched_eval = set()

        # Check for exact matches first
        exact_matches = 0
        for eval_id in sorted(eval_datasets & gt_datasets):
            log(f"Exact match found: {eval_id}")
            exact_matches += 1
            matched_gt.add(eval_id)
            matched_eval.add(eval_id)

        # Check for partial matches and aliases. Candidates are visited in sorted
        # order so the greedy pairing does not depend on set iteration order.
        partial_matches = 0
        for eval_id in sorted(eval_datasets - matched_eval):
            for gt_id in sorted(gt_datasets - matched_gt):
                if eval_id in gt_id or gt_id in eval_id:  # Partial match or alias
                    log(f"Partial or alias match found: eval_id={eval_id}, gt_id={gt_id}")
                    partial_matches += 1
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break

        # Calculate False Positives (remaining unmatched eval datasets)
        FP = eval_datasets - matched_eval

        # Remove aliases of already matched ground-truth ids from False Positives
        for eval_id in sorted(FP):
            for matched_id in sorted(matched_gt):
                if eval_id in matched_id or matched_id in eval_id:  # Alias detected
                    log(f"Removing alias from false positives: eval_id={eval_id}, matched_id={matched_id}")
                    FP.discard(eval_id)
                    break

        false_positives = len(FP)
        log(f"False positives: {FP}")

        # Calculate False Negatives (remaining unmatched ground truth datasets)
        FN = gt_datasets - matched_gt
        false_negatives = len(FN)
        log(f"False negatives: {FN}")

        # Calculate precision and recall for this source page
        true_positives = exact_matches + partial_matches
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        log(f"Precision for {source_page}: {precision}")
        log(f"Recall for {source_page}: {recall}")

        # Accumulate totals
        total_precision += precision
        total_recall += recall

    # Calculate overall metrics (averages over source pages; F1 from the averages)
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    if (average_precision + average_recall) > 0:
        f1_score = 2 * (average_precision * average_recall) / (average_precision + average_recall)
    else:
        f1_score = 0

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [4]:
# Score the eval_output_gemma_DAS extraction results against the ground truth.
metrics_gemma_DAS = calculate_performance_metrics(
    ground_truth_df=ground_truth,
    eval_df=eval_output_gemma_DAS,
)

Number of unique source pages: 21

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Ground truth repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Evaluation datasets: {'pxd053567'}
Evaluation webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Evaluation repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}
Ground truth repositories: {'https://massive.ucsd.edu'}
Evaluation 

In [5]:
# Report the aggregate scores of this run, formatted to four decimals.
print("Performance Metrics:")
for label, key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{label}: {metrics_gemma_DAS[key]:.4f}")

Performance Metrics:
Average Precision: 0.7857
Average Recall: 0.7857
F1-Score: 0.7857


In [6]:
# Score the eval_output_gemma extraction results against the ground truth.
metrics_gemma = calculate_performance_metrics(
    ground_truth_df=ground_truth,
    eval_df=eval_output_gemma,
)

Number of unique source pages: 21

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Ground truth repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Evaluation datasets: {'pxd053567'}
Evaluation webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Evaluation repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}
Ground truth repositories: {'https://massive.ucsd.edu'}
Evaluation 

In [7]:
# Report the aggregate scores of this run, formatted to four decimals.
print("Performance Metrics:")
for label, key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{label}: {metrics_gemma[key]:.4f}")

Performance Metrics:
Average Precision: 0.8905
Average Recall: 0.8889
F1-Score: 0.8897


In [8]:
# Score the eval_output_GPT_DAS extraction results against the ground truth.
metrics_GPT_DAS = calculate_performance_metrics(
    ground_truth_df=ground_truth,
    eval_df=eval_output_GPT_DAS,
)

Number of unique source pages: 21

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Ground truth repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Evaluation datasets: {'pxd053567'}
Evaluation webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Evaluation repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}
Ground truth repositories: {'https://massive.ucsd.edu'}
Evaluation 

In [9]:
# Report the aggregate scores of this run, formatted to four decimals.
print("Performance Metrics:")
for label, key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{label}: {metrics_GPT_DAS[key]:.4f}")

Performance Metrics:
Average Precision: 0.7778
Average Recall: 0.7571
F1-Score: 0.7673


In [10]:
# Score the eval_output_GPT extraction results against the ground truth.
metrics_GPT = calculate_performance_metrics(
    ground_truth_df=ground_truth,
    eval_df=eval_output_GPT,
)

Number of unique source pages: 21

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Ground truth repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Evaluation datasets: {'pxd053567'}
Evaluation webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Evaluation repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}
Ground truth repositories: {'https://massive.ucsd.edu'}
Evaluation 

In [11]:
# Report the aggregate scores of this run, formatted to four decimals.
print("Performance Metrics:")
for label, key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{label}: {metrics_GPT[key]:.4f}")

Performance Metrics:
Average Precision: 0.9286
Average Recall: 0.9048
F1-Score: 0.9165


In [12]:
# Score the eval_output_Gemini_15 extraction results against the ground truth.
metrics_Gemini_15 = calculate_performance_metrics(
    ground_truth_df=ground_truth,
    eval_df=eval_output_Gemini_15,
)

Number of unique source pages: 21

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Ground truth repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Evaluation datasets: {'pxd053567'}
Evaluation webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Evaluation repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}
Ground truth repositories: {'https://massive.ucsd.edu'}
Evaluation 

In [13]:
# Report the aggregate scores of this run, formatted to four decimals.
print("Performance Metrics:")
for label, key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{label}: {metrics_Gemini_15[key]:.4f}")

Performance Metrics:
Average Precision: 0.8413
Average Recall: 0.8206
F1-Score: 0.8308


In [14]:
# Score the eval_output_Gemini_20 extraction results against the ground truth.
metrics_Gemini_20 = calculate_performance_metrics(
    ground_truth_df=ground_truth,
    eval_df=eval_output_Gemini_20,
)

Number of unique source pages: 21

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/
Ground truth datasets: {'pxd053567'}
Ground truth webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Ground truth repositories: {'http://www.ebi.ac.uk/pride/archive/'}
Evaluation datasets: {'pxd053567'}
Evaluation webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
Evaluation repositories: {'www.ebi.ac.uk'}
Exact match found: pxd053567
False positives: set()
False negatives: set()
Precision for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0
Recall for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/: 1.0

Processing source page: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/
Ground truth datasets: {'msv000092944'}
Ground truth webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}
Ground truth repositories: {'https://massive.ucsd.edu'}
Evaluation datasets: {'msv0000929

In [15]:
# Report the aggregate scores of this run, formatted to four decimals.
print("Performance Metrics:")
for label, key in (
    ("Average Precision", "average_precision"),
    ("Average Recall", "average_recall"),
    ("F1-Score", "f1_score"),
):
    print(f"{label}: {metrics_Gemini_20[key]:.4f}")

Performance Metrics:
Average Precision: 0.7302
Average Recall: 0.7587
F1-Score: 0.7442
