In [2]:
import pandas as pd

In [3]:
# get excel table from the Articles tab
ground_truth = pd.read_excel("/Users/pietro/Desktop/VIDA Lab/DataGatherer/output/gold/GT_DataLinks_PMC_and_Fenyo.xlsx")
eval_output_gemma_DAS = pd.read_csv("/Users/pietro/Desktop/VIDA Lab/DataGatherer/output/output_full_Gemma_DAS.csv")
eval_output_gemma = pd.read_csv("/Users/pietro/Desktop/VIDA Lab/DataGatherer/output/output_full_Gemma.csv")
eval_output_GPT_DAS = pd.read_csv("/Users/pietro/Desktop/VIDA Lab/DataGatherer/output/output_full-GPT-DAS.csv")
eval_output_GPT = pd.read_csv("/Users/pietro/Desktop/VIDA Lab/DataGatherer/output/output_full-GPT.csv")
# from eval drop rows with empty dataset identifier
eval_output_gemma_DAS.dropna(subset=["dataset_identifier"], inplace=True)
eval_output_gemma.dropna(subset=["dataset_identifier"], inplace=True)
eval_output_GPT_DAS.dropna(subset=["dataset_identifier"], inplace=True)
eval_output_GPT.dropna(subset=["dataset_identifier"], inplace=True)
# keep only columns of interest
eval_output_gemma_DAS = eval_output_gemma_DAS[["source_url", "dataset_identifier", "dataset_webpage", "data_repository"]]
eval_output_gemma = eval_output_gemma[["source_url", "dataset_identifier", "dataset_webpage", "data_repository"]]
eval_output_GPT_DAS = eval_output_GPT_DAS[["source_url", "dataset_identifier", "dataset_webpage", "data_repository"]]
eval_output_GPT = eval_output_GPT[["source_url", "dataset_identifier", "dataset_webpage", "data_repository"]]

In [4]:
def calculate_performance_metrics(ground_truth_df: pd.DataFrame, eval_df: pd.DataFrame) -> dict:
    """
    Calculate performance metrics (precision, recall, F1-score) for dataset extraction,
    considering exact, partial, and alternative matches.
    """
    # Initialize counters for total scores
    total_precision = 0
    total_recall = 0
    num_sources = ground_truth_df['Source Page'].nunique()

    # Iterate through each unique source page
    for source_page in ground_truth_df['Source Page'].unique():
        # Extract ground truth datasets for this source page
        gt_data = ground_truth_df[ground_truth_df['Source Page'] == source_page]
        gt_datasets = set(gt_data['UID'].dropna().str.lower())
        gt_webpages = set(gt_data['Dataset Webpage'].dropna().str.lower())
        gt_repositories = set(gt_data['Repository Link'].dropna().str.lower())

        # Extract evaluation datasets for this source page
        eval_data = eval_df[eval_df['source_url'] == source_page]
        eval_datasets = set(eval_data['dataset_identifier'].dropna().str.lower())
        eval_webpages = set(eval_data['dataset_webpage'].dropna().str.lower())
        eval_repositories = set(eval_data['data_repository'].dropna().str.lower())
        
        # Handle cases where both ground truth and evaluation are empty
        if not gt_datasets and not eval_datasets:
            #print(f"Paper {source_page}: No datasets in ground truth or evaluation. Skipping.")
            total_precision += 1  # Perfect precision
            total_recall += 1  # Perfect recall
            continue
        # Initialize match counters
        exact_matches = 0
        partial_matches = 0
        false_positives = 0
        false_negatives = 0

        # Track matches to avoid double counting
        matched_gt = set()
        matched_eval = set()
        
        print(f"Paper {source_page}. \ngt_datasets: {gt_datasets}. \ngt_webpages: {gt_webpages}."
              f"\neval_datasets: {eval_datasets}. \neval_webpages: {eval_webpages}")

        # Check for exact matches first
        for eval_id in eval_datasets:
            if eval_id in gt_datasets:
                #print(f"Exact match: {eval_id}")
                exact_matches += 1
                matched_gt.add(eval_id)
                matched_eval.add(eval_id)

        # Check for partial matches (e.g., substrings) and alternative matches
        for eval_id in eval_datasets - matched_eval:
            for gt_id in gt_datasets - matched_gt:
                if eval_id in gt_id or gt_id in eval_id:  # Partial match on dataset IDs
                    print(f"Partial match id-to-id: {eval_id}, {gt_id}")
                    partial_matches += 1
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break
            for gt_webpage in gt_webpages - matched_gt:
                if eval_id in gt_webpage or gt_webpage in eval_id:  # Partial match on webpages
                    print(f"Partial match id-to-webpage: {eval_id}, {gt_webpage}")
                    partial_matches += 1
                    matched_gt.add(gt_webpage)
                    matched_eval.add(eval_id)
                    break

        # Calculate False Positives (remaining unmatched eval datasets)
        FP = eval_datasets - matched_eval
        false_positives = len(FP)
        print(f"False positives: {FP}")

        # Calculate False Negatives (remaining unmatched ground truth datasets)
        FN = gt_datasets - matched_gt
        false_negatives = len(FN)
        print(f"False negatives: {FN}")

        # Calculate precision and recall for this source page
        true_positives = exact_matches + partial_matches
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        
        print(f"Precision: {precision},\nRecall: {recall}")
        
        # Accumulate totals
        total_precision += precision
        total_recall += recall

    # Calculate overall metrics
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    f1_score = 2 * (average_precision * average_recall) / (average_precision + average_recall) if (average_precision + average_recall) > 0 else 0

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [5]:
metrics_gemma_DAS = calculate_performance_metrics(ground_truth, eval_output_gemma_DAS)

Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/. 
gt_datasets: {'pxd053567'}. 
gt_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}.
eval_datasets: {'pxd053567'}. 
eval_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/. 
gt_datasets: {'msv000092944'}. 
gt_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}.
eval_datasets: {'msv000092944'}. 
eval_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?accession=msv000092944'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11320025/. 
gt_datasets: {'6077-6129', 'pxd052949'}. 
gt_webpages: {'https://proteomecentral.proteomexchange.org/cgi/getdataset?id=pxd052949', 'https://assays.cancer.gov/non-cptac-6077'}.
eval_datasets: {'pxd

In [6]:
# Output the results
print("Performance Metrics:")
print(f"Average Precision: {metrics_gemma_DAS['average_precision']:.4f}")
print(f"Average Recall: {metrics_gemma_DAS['average_recall']:.4f}")
print(f"F1-Score: {metrics_gemma_DAS['f1_score']:.4f}")

Performance Metrics:
Average Precision: 0.7630
Average Recall: 0.7857
F1-Score: 0.7742


In [7]:
metrics_gemma = calculate_performance_metrics(ground_truth, eval_output_gemma)

Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/. 
gt_datasets: {'pxd053567'}. 
gt_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}.
eval_datasets: {'pxd053567'}. 
eval_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/. 
gt_datasets: {'msv000092944'}. 
gt_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}.
eval_datasets: {'msv000092944'}. 
eval_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?accession=msv000092944'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11320025/. 
gt_datasets: {'6077-6129', 'pxd052949'}. 
gt_webpages: {'https://proteomecentral.proteomexchange.org/cgi/getdataset?id=pxd052949', 'https://assays.cancer.gov/non-cptac-6077'}.
eval_datasets: {'607

In [9]:
# Output the results
print("Performance Metrics:")
print(f"Average Precision: {metrics_gemma['average_precision']:.4f}")
print(f"Average Recall: {metrics_gemma['average_recall']:.4f}")
print(f"F1-Score: {metrics_gemma['f1_score']:.4f}")

Performance Metrics:
Average Precision: 0.8429
Average Recall: 0.8889
F1-Score: 0.8653


In [10]:
metrics_GPT_DAS = calculate_performance_metrics(ground_truth, eval_output_GPT_DAS)

Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/. 
gt_datasets: {'pxd053567'}. 
gt_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}.
eval_datasets: {'pxd053567'}. 
eval_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/. 
gt_datasets: {'msv000092944'}. 
gt_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}.
eval_datasets: {'msv000092944'}. 
eval_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?accession=msv000092944'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11320025/. 
gt_datasets: {'6077-6129', 'pxd052949'}. 
gt_webpages: {'https://proteomecentral.proteomexchange.org/cgi/getdataset?id=pxd052949', 'https://assays.cancer.gov/non-cptac-6077'}.
eval_datasets: {'pxd

In [11]:
# Output the results
print("Performance Metrics:")
print(f"Average Precision: {metrics_GPT_DAS['average_precision']:.4f}")
print(f"Average Recall: {metrics_GPT_DAS['average_recall']:.4f}")
print(f"F1-Score: {metrics_GPT_DAS['f1_score']:.4f}")

Performance Metrics:
Average Precision: 0.7778
Average Recall: 0.7571
F1-Score: 0.7673


In [12]:
metrics_GPT = calculate_performance_metrics(ground_truth, eval_output_GPT)#

Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11252349/. 
gt_datasets: {'pxd053567'}. 
gt_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}.
eval_datasets: {'pxd053567'}. 
eval_webpages: {'https://www.ebi.ac.uk/pride/archive/projects/pxd053567'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11425778/. 
gt_datasets: {'msv000092944'}. 
gt_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?task=665eb32b027341559785174213664a61'}.
eval_datasets: {'msv000092944'}. 
eval_webpages: {'https://massive.ucsd.edu/proteosafe/dataset.jsp?accession=msv000092944'}
False positives: set()
False negatives: set()
Precision: 1.0,
Recall: 1.0
Paper https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11320025/. 
gt_datasets: {'6077-6129', 'pxd052949'}. 
gt_webpages: {'https://proteomecentral.proteomexchange.org/cgi/getdataset?id=pxd052949', 'https://assays.cancer.gov/non-cptac-6077'}.
eval_datasets: {'pxd

In [13]:
# Output the results
print("Performance Metrics:")
print(f"Average Precision: {metrics_GPT['average_precision']:.4f}")
print(f"Average Recall: {metrics_GPT['average_recall']:.4f}")
print(f"F1-Score: {metrics_GPT['f1_score']:.4f}")

Performance Metrics:
Average Precision: 0.9048
Average Recall: 0.9048
F1-Score: 0.9048
