In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from classifier import *
from data_fetcher import *
from parser import *
from orchestrator import *
from logger_setup import *
import os
import json
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Path of the JSON configuration handed to the Orchestrator below.
config_path = 'config_experiment.json'  # Config with input file details
# Load variables from a local .env file into the process environment before the
# Orchestrator is constructed (presumably API credentials for the extraction
# model -- TODO confirm which variables are required).
load_dotenv()
orchestrator = Orchestrator(config_path)
# Last expression of the cell: whatever setup_data_fetcher() returns is rendered
# as the cell output.
orchestrator.setup_data_fetcher()

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp
orchestrator.py - line 45 - INFO - Data fetcher setup completed.


<selenium.webdriver.firefox.webdriver.WebDriver (session="61961f74-e63c-4d24-85e1-9c17d3e2e009")>

In [3]:
# Ground-truth table for the PRIDE experiment. Later cells read its 'publication',
# 'dataset_uid', 'repo_name' and 'raw_html' columns.
ground_truth_PRIDE = pd.read_csv('exp_input/PRIDEid_HTML_data.csv')

In [4]:
# One entry per ground-truth row (duplicates, if any, are kept); this list drives
# the prediction loop in the next cell.
urls = ground_truth_PRIDE['publication'].tolist()
print(f"Number of URLs: {len(urls)}")

Number of URLs: 239


In [5]:
# Set to True to re-run the LLM extraction over every URL and overwrite the cached
# CSV; leave False to reuse the predictions stored at config['full_output_file'].
predict = False

# Pause applied before each page is parsed -- presumably to stay under the LLM
# API rate limit (TODO confirm the required spacing).
SECONDS_BETWEEN_REQUESTS = 15

if predict:
    # url -> parsed extraction result; a URL appearing twice in `urls` keeps only
    # its most recent result.
    results = {}
    for i, url in enumerate(urls):
        time.sleep(SECONDS_BETWEEN_REQUESTS)
        orchestrator.current_url = url
        orchestrator.logger.info(f"Processing URL {i+1}: {orchestrator.current_url}")
        orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(url)
        orchestrator.parser = LLMParser(orchestrator.XML_config, orchestrator.logger)
        # The HTML is taken from the ground-truth table (first row for this
        # publication), not fetched again.
        raw_data = ground_truth_PRIDE.loc[ground_truth_PRIDE['publication'] == url, 'raw_html'].iloc[0]
        parsed_data = orchestrator.parser.parse_data(raw_data, orchestrator.publisher, orchestrator.current_url, raw_data_format="full_HTML")
        # Tag every extracted row with its page so the evaluation can group by source.
        parsed_data['source_url'] = url
        orchestrator.logger.info(f"Parsed data extraction completed. Elements collected: {len(parsed_data)}")
        results[url] = parsed_data

    # Concatenate once instead of growing a frame inside a loop (which copies all
    # accumulated rows on every iteration). pd.concat raises on an empty list, so
    # fall back to an empty frame when nothing was processed.
    if results:
        predict_df = pd.concat(list(results.values()), ignore_index=True)
    else:
        predict_df = pd.DataFrame()

    predict_df.to_csv(orchestrator.config['full_output_file'], index=False)

else:
    predict_df = pd.read_csv(orchestrator.config['full_output_file'])

In [6]:
# Source pages whose recall was 0. NOTE: module-level state -- every call to
# calculate_performance_metrics appends to this list; it is never cleared here.
recall_list = []


def _ground_truth_ids_in_html(orchestrator, gt_data: pd.DataFrame) -> bool:
    """Return True if any ground-truth dataset id occurs in its row's raw_html.

    Each row's 'dataset_uid' may hold several comma-separated ids. Ids are
    stripped of surrounding whitespace and matched literally (case-insensitive).
    Rows whose 'dataset_uid' or 'raw_html' is missing are ignored.
    """
    present = False
    for _, row in gt_data.iterrows():
        uid_field = row['dataset_uid']
        html = row['raw_html']
        if not isinstance(uid_field, str) or not isinstance(html, str):
            # NaN/None id or HTML: nothing to search for in this row.
            continue
        for match_id in (part.strip() for part in uid_field.split(',')):
            if not match_id:
                # An empty fragment (e.g. trailing comma) would match any text.
                continue
            # re.escape: ids are literal text, not patterns ('.', '(', '+' ...).
            if re.search(re.escape(match_id), html, re.IGNORECASE):
                orchestrator.logger.info(f"Dataset id {match_id} reference matched in the raw_html")
                present = True
                break
            orchestrator.logger.info(f"Dataset id {row['dataset_uid']} not matched in the raw_html")
    return present


def calculate_performance_metrics(orchestrator, ground_truth_df: pd.DataFrame, eval_df: pd.DataFrame) -> dict:
    """
    Calculate performance metrics (precision, recall, F1-score) for dataset extraction,
    considering exact, partial, and alias matches. Includes verbose debugging output.

    Args:
        orchestrator: object providing ``logger.info(...)`` and a ``config`` mapping
            with the key ``'false_positives_file'``.
        ground_truth_df: frame with columns 'publication', 'dataset_uid',
            'repo_name' (unused here) and 'raw_html'.
        eval_df: predictions with columns 'source_url' and 'dataset_identifier'.

    Returns:
        dict with 'average_precision', 'average_recall' (per-page values averaged
        over the evaluated pages) and 'f1_score' (harmonic mean of those two
        averages). All are 0 when no page is evaluated.

    Notes:
        * Only pages present in ``eval_df['source_url']`` are visited, and a page is
          evaluated only if at least one of its ground-truth ids literally appears
          in the stored raw_html.
        * A comma-separated 'dataset_uid' cell is treated as ONE ground-truth
          entity during matching; a prediction that is a substring of it counts as
          a partial match and sibling predictions are dropped as aliases.
          NOTE(review): confirm this is the intended recall definition.
        * Side effects: appends zero-recall pages to the module-level
          ``recall_list`` and overwrites ``config['false_positives_file']``.
    """
    all_false_positives = []

    total_precision = 0
    total_recall = 0
    num_sources = 0

    # Lower-case once for the whole frame; .str.lower() keeps NaN as NaN instead
    # of raising like apply(str.lower) does.
    gt_publications = ground_truth_df['publication'].str.lower()

    # dropna(): a missing source_url cannot be matched to any publication.
    for source_page in eval_df['source_url'].dropna().unique():
        orchestrator.logger.info(f"\nStarting performance evaluation for source page: {source_page}")

        # Ground-truth rows for this page (URL comparison is case-insensitive).
        gt_data = ground_truth_df[gt_publications == source_page.lower()]
        gt_datasets = set(gt_data['dataset_uid'].dropna().str.lower())
        orchestrator.logger.info(f"Ground truth datasets: {gt_datasets}")

        # Skip pages whose HTML never mentions a ground-truth id: the extractor
        # had no chance of finding it there.
        orchestrator.logger.info(f"# of elements in gt_data: {len(gt_data)}")
        if not _ground_truth_ids_in_html(orchestrator, gt_data):
            orchestrator.logger.info(f"No datasets references in the raw_html for {source_page}")
            continue

        num_sources += 1

        # Predicted identifiers for this page.
        eval_data = eval_df[eval_df['source_url'] == source_page]
        eval_datasets = set(eval_data['dataset_identifier'].dropna().str.lower())
        orchestrator.logger.info(f"Evaluation datasets: {eval_datasets}")

        # gt_datasets cannot be empty here: passing the presence check requires a
        # non-null ground-truth id for this page.

        # Track matches to avoid double counting.
        matched_gt = set()
        matched_eval = set()

        # Exact matches first.
        exact_matches = 0
        for eval_id in eval_datasets:
            if eval_id in gt_datasets:
                orchestrator.logger.info(f"Exact match found: {eval_id}")
                exact_matches += 1
                matched_gt.add(eval_id)
                matched_eval.add(eval_id)

        # Partial matches / aliases: one id is a substring of the other.
        partial_matches = 0
        for eval_id in eval_datasets - matched_eval:
            for gt_id in gt_datasets - matched_gt:
                if eval_id in gt_id or gt_id in eval_id:
                    orchestrator.logger.info(f"Partial or alias match found: eval_id={eval_id}, gt_id={gt_id}")
                    partial_matches += 1
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break

        # False positives: predictions that matched nothing ...
        FP = eval_datasets - matched_eval
        # ... minus aliases of an already-matched ground-truth id.
        for eval_id in list(FP):
            for matched_id in matched_gt:
                if eval_id in matched_id or matched_id in eval_id:
                    orchestrator.logger.info(f"Removing alias from false positives: eval_id={eval_id}, matched_id={matched_id}")
                    FP.discard(eval_id)
                    break

        false_positives = len(FP)
        orchestrator.logger.info(f"False positives: {FP}")
        all_false_positives.extend(FP)

        # False negatives: ground-truth ids nothing was matched to.
        FN = gt_datasets - matched_gt
        false_negatives = len(FN)
        orchestrator.logger.info(f"False negatives: {FN}")

        # Per-page precision and recall (0 when the denominator is 0).
        true_positives = exact_matches + partial_matches
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        orchestrator.logger.info(f"Precision for {source_page}: {precision}")
        orchestrator.logger.info(f"Recall for {source_page}: {recall}")

        if recall == 0:
            recall_list.append(source_page)

        total_precision += precision
        total_recall += recall

    # Macro-average over the evaluated pages.
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    f1_score = 2 * (average_precision * average_recall) / (average_precision + average_recall) if (average_precision + average_recall) > 0 else 0

    orchestrator.logger.info(f"\nPerformance evaluation completed for {num_sources} source pages.")

    # Write false positives to a file, erasing all past data.
    with open(orchestrator.config['false_positives_file'], 'w') as f:
        for item in all_false_positives:
            f.write("%s\n" % item)

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [7]:
# Score the predictions against the PRIDE ground truth; the returned metrics dict
# is the cell output. Side effects: overwrites config['false_positives_file'] and
# appends zero-recall pages to the module-level recall_list.
calculate_performance_metrics(orchestrator, ground_truth_PRIDE, predict_df)

864648685.py - line 20 - INFO - 
Starting performance evaluation for source page: https://dx.doi.org/10.1001/JAMANEUROL.2024.4763
864648685.py - line 27 - INFO - Ground truth datasets: {'pxd056570'}
864648685.py - line 32 - INFO - # of elements in gt_data: 1
864648685.py - line 41 - INFO - Dataset id PXD056570 not matched in the raw_html
864648685.py - line 44 - INFO - No datasets references in the raw_html for https://dx.doi.org/10.1001/JAMANEUROL.2024.4763
864648685.py - line 20 - INFO - 
Starting performance evaluation for source page: https://dx.doi.org/10.1002/CBIC.202400831
864648685.py - line 27 - INFO - Ground truth datasets: {'pxd055649'}
864648685.py - line 32 - INFO - # of elements in gt_data: 1
864648685.py - line 37 - INFO - Dataset id PXD055649 reference matched in the raw_html
864648685.py - line 55 - INFO - Evaluation datasets: {'pxd055649'}
864648685.py - line 79 - INFO - Exact match found: pxd055649
864648685.py - line 106 - INFO - False positives: set()
864648685.py 

{'average_precision': 0.6771259615684079,
 'average_recall': 0.9928057553956835,
 'f1_score': 0.8051281917024722}

In [8]:
# Second ground-truth table (file name suggests PRIDE plus GSE ids -- TODO confirm).
# 'Unnamed: 0' is dropped; presumably it is an index column written by an earlier
# to_csv call (verify against the script that produced the file).
ground_truth_PRIDE_GSE = pd.read_csv('exp_input/PRIDE_GSE_id_HTML_data.csv').drop(columns=['Unnamed: 0'])
ground_truth_PRIDE_GSE.head()

Unnamed: 0,publication,dataset_uid,repo_name,doi,raw_html,publisher,smallest_elements
0,https://dx.doi.org/10.1001/JAMANEUROL.2024.4763,PXD056570,PRIDE,10.1001/jamaneurol.2024.4763,"<html id=""doc"" lang=""en"" class=""page-article j...",jamanetwork,na
1,https://dx.doi.org/10.1002/CBIC.202400831,PXD055649,PRIDE,10.1002/cbic.202400831,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The mass spectrometry proteomics data ha...
2,https://dx.doi.org/10.1002/CBIC.202400882,PXD060372,PRIDE,10.1002/cbic.202400882,"<html lang=""en"" class=""pb-page"" data-request-i...",Unknown Publisher,[('<p>The data that support the findings of th...
3,https://dx.doi.org/10.1002/PRCA.202300107,PXD028078,PRIDE,10.1002/prca.202300107,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>Generation of the protein library and SW...
4,https://dx.doi.org/10.1002/anie.202420149,"PXD056865,PXD057925,PXD058045",PRIDE,10.1002/anie.202420149,"<html lang=""en"" class=""pb-page"" data-request-i...",wiley,[('<p>The mass spectrometry proteomics data ha...


In [9]:
# Same predictions scored against the second ground-truth table. This call
# overwrites config['false_positives_file'] again and appends to recall_list a
# second time, so recall_list may now contain pages from both runs.
calculate_performance_metrics(orchestrator, ground_truth_PRIDE_GSE, predict_df)

864648685.py - line 20 - INFO - 
Starting performance evaluation for source page: https://dx.doi.org/10.1001/JAMANEUROL.2024.4763
864648685.py - line 27 - INFO - Ground truth datasets: {'pxd056570'}
864648685.py - line 32 - INFO - # of elements in gt_data: 1
864648685.py - line 41 - INFO - Dataset id PXD056570 not matched in the raw_html
864648685.py - line 44 - INFO - No datasets references in the raw_html for https://dx.doi.org/10.1001/JAMANEUROL.2024.4763
864648685.py - line 20 - INFO - 
Starting performance evaluation for source page: https://dx.doi.org/10.1002/CBIC.202400831
864648685.py - line 27 - INFO - Ground truth datasets: {'pxd055649'}
864648685.py - line 32 - INFO - # of elements in gt_data: 1
864648685.py - line 37 - INFO - Dataset id PXD055649 reference matched in the raw_html
864648685.py - line 55 - INFO - Evaluation datasets: {'pxd055649'}
864648685.py - line 79 - INFO - Exact match found: pxd055649
864648685.py - line 106 - INFO - False positives: set()
864648685.py 

{'average_precision': 0.7005384004484725,
 'average_recall': 0.9928057553956835,
 'f1_score': 0.8214497371258989}