In [1]:
# import from the files in this directory
from dotenv import load_dotenv
from classifier import *
from data_fetcher import *
from parser import *
from orchestrator import *
from logger_setup import *
import os
import json
import pandas as pd
import numpy as np
import re
import time

In [2]:
# Load variables from a .env file into the environment before the orchestrator is constructed.
load_dotenv()

config_path = 'config_experiment.json'  # Config with input file details
orchestrator = Orchestrator(config_path)

# Left as the final expression on purpose: its return value is what the cell displays.
orchestrator.setup_data_fetcher()

orchestrator.py - line 20 - INFO - Data_Gatherer Orchestrator initialized. Extraction step Model: gemini-2.0-flash-exp
orchestrator.py - line 45 - INFO - Data fetcher setup completed.


<selenium.webdriver.firefox.webdriver.WebDriver (session="1d1b7a4b-8c75-42f6-83bc-0a7d9b851ac9")>

In [3]:
# Experiment input table. Later cells read its 'publication' and 'raw_html' columns
# for parsing, and pass the same frame to the metrics function as ground truth.
df = pd.read_csv('exp_input/raw_data.csv')

In [4]:
# One entry per row of the 'publication' column, in row order (duplicates are kept).
publication_column = df['publication']
urls = publication_column.tolist()
print(f"Number of URLs: {len(urls)}")

Number of URLs: 221


In [5]:
# Parse the stored HTML of every publication URL and keep one result frame per URL.
# Because `results` is keyed by URL, a URL that occurs more than once keeps only its last result.
results = {}
for i, url in enumerate(urls):
    # Fixed 15-second pause at the start of every iteration
    # (presumably to throttle model/API calls -- TODO confirm).
    time.sleep(15)

    # Point the orchestrator at the current publication.
    orchestrator.current_url = url
    orchestrator.logger.info(f"Processing URL {i+1}: {orchestrator.current_url}")
    orchestrator.publisher = orchestrator.data_fetcher.url_to_publisher_domain(url)

    # A new parser instance is created for each URL.
    orchestrator.parser = XMLParser(orchestrator.XML_config, orchestrator.logger)

    # Take the first stored HTML document whose 'publication' equals this URL.
    rows_for_url = df['publication'] == url
    raw_data = df.loc[rows_for_url, 'raw_html'].iloc[0]

    parsed_data = orchestrator.parser.parse_data(
        raw_data,
        orchestrator.publisher,
        orchestrator.current_url,
        raw_data_format="full_HTML",
    )

    # Tag the parsed result with the page it came from so results can be grouped later.
    parsed_data['source_url'] = url
    orchestrator.logger.info(f"Parsed data extraction completed. Elements collected: {len(parsed_data)}")
    results[url] = parsed_data

2172060690.py - line 5 - INFO - Processing URL 1: https://dx.doi.org/10.1002/1878-0261.13654
data_fetcher.py - line 30 - INFO - Publisher: doi
parser.py - line 25 - INFO - Parser initialized.
parser.py - line 153 - INFO - Function call: parse_data(api_data(<class 'str'>), publisher, current_url_address, additional_data, full_HTML)
parser.py - line 246 - INFO - Extracting links from full HTML content.
parser.py - line 292 - INFO - Function_call: normalize_full_DOM(api_data). Length of raw api data: 192964 tokens
parser.py - line 690 - INFO - Loading prompt: GEMINI_from_full_input_Examples_4
parser.py - line 705 - INFO - Prompt messages total length: 124070 tokens
parser.py - line 711 - INFO - Prompt ID: gemini-2.0-flash-exp-0-13838f88fd4e37bbb5447e3d4d65fad9944e1ca4e414c8f9e2b8775098bfd967
parser.py - line 730 - INFO - Requesting datasets from content using model: gemini-2.0-flash-exp, temperature: 0, messages: messages
parser.py - line 829 - INFO - Found 1 candidates in the response.
p

In [6]:
# Stack every per-URL result frame in a single concat call. Growing a DataFrame with
# pd.concat inside a loop re-copies all accumulated rows on every iteration (quadratic).
result_frames = list(results.values())

# pd.concat raises ValueError on an empty list, so fall back to an empty frame,
# which is what the incremental version produced when `results` was empty.
combined_df = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Persist the combined table to the path configured under 'full_output_file'.
combined_df.to_csv(orchestrator.config['full_output_file'], index=False)

In [7]:
# Display the combined extraction results (one row per extracted dataset, tagged with source_url).
combined_df

Unnamed: 0,dataset_identifier,data_repository,dataset_webpage,source_url
0,PXD048538,proteomecentral.proteomexchange.org,https://proteomecentral.proteomexchange.org/cg...,https://dx.doi.org/10.1002/1878-0261.13654
1,GSE15460,www.ncbi.nlm.nih.gov,na,https://dx.doi.org/10.1002/1878-0261.13654
2,GSE66229,www.ncbi.nlm.nih.gov,na,https://dx.doi.org/10.1002/1878-0261.13654
3,GSE26899,www.ncbi.nlm.nih.gov,na,https://dx.doi.org/10.1002/1878-0261.13654
4,GSE15459,www.ncbi.nlm.nih.gov,na,https://dx.doi.org/10.1002/1878-0261.13654
...,...,...,...,...
723,EMD-14506,,na,https://dx.doi.org/10.7554/elife.78385
724,PXD032016,www.ebi.ac.uk,https://www.ebi.ac.uk/pride/archive/projects/P...,https://dx.doi.org/10.7554/elife.78385
725,PXD032024,www.ebi.ac.uk,https://www.ebi.ac.uk/pride/archive/projects/P...,https://dx.doi.org/10.7554/elife.78385
726,PXD032034,www.ebi.ac.uk,https://www.ebi.ac.uk/pride/archive/projects/P...,https://dx.doi.org/10.7554/elife.78385


In [8]:
# Source pages whose recall evaluated to 0. calculate_performance_metrics appends to this
# module-level list on every call, so re-running the evaluation without re-running this
# cell accumulates duplicate entries.
recall_list = []


def _normalized_id_set(column: pd.Series) -> set:
    """Return the non-null values of `column` as a set of lower-cased strings.

    Values are cast to str first so numeric identifiers and all-NaN (float) columns
    do not make the `.str` accessor raise.
    """
    return set(column.dropna().astype(str).str.lower())


def _identifiers_overlap(first: str, second: str) -> bool:
    """Return True when either identifier is a substring of the other."""
    return first in second or second in first


def calculate_performance_metrics(orchestrator, ground_truth_df: pd.DataFrame, eval_df: pd.DataFrame) -> dict:
    """
    Calculate performance metrics (precision, recall, F1-score) for dataset extraction,
    considering exact, partial, and alias matches. Includes verbose debugging output.

    Matching is case-insensitive and is done independently for each unique value of
    ``eval_df['source_url']``:

    1. identifiers present in both sets are exact matches;
    2. each remaining extracted identifier is greedily paired with the first remaining
       ground-truth identifier it overlaps with (one is a substring of the other);
    3. extracted identifiers that still have no partner but overlap an already matched
       ground-truth identifier are treated as aliases: they are dropped from the false
       positives without being counted as true positives.

    All iteration is done in sorted order. Iterating the raw sets made the greedy pairing
    in step 2 depend on set iteration order, which for strings can change between
    interpreter runs, so the reported recall was not reproducible.

    Args:
        orchestrator: Object exposing ``logger.info``; only used for logging.
        ground_truth_df: Frame with columns ``publication`` (source page) and
            ``dataset_uid`` (expected dataset identifier).
        eval_df: Frame with columns ``source_url`` (source page) and
            ``dataset_identifier`` (extracted dataset identifier).

    Returns:
        dict with keys ``average_precision`` and ``average_recall`` (unweighted means
        over source pages) and ``f1_score`` (harmonic mean of those two averages).
        All three are 0 when ``eval_df`` contains no source pages.

    Side effects:
        Appends every source page whose recall is 0 to the module-level ``recall_list``.

    Notes:
        * Only source pages that occur in ``eval_df`` are scored.
          NOTE(review): a publication with no rows in ``eval_df`` therefore does not
          lower the averages -- confirm this is intended.
        * When a page has no datasets on either side it scores precision = recall = 1.
        * When a ratio's denominator is 0 (e.g. nothing expected but something
          extracted) that ratio is reported as 0.
        * Every non-null extracted value takes part in matching as an ordinary identifier.
    """
    source_pages = eval_df['source_url'].unique()
    num_sources = len(source_pages)

    # Running sums of the per-page scores; divided by num_sources at the end.
    total_precision = 0
    total_recall = 0

    orchestrator.logger.info(f"Number of unique source pages: {num_sources}")

    for source_page in source_pages:
        orchestrator.logger.info(f"\nStarting performance evaluation for source page: {source_page}")

        # Expected identifiers for this page.
        gt_data = ground_truth_df[ground_truth_df['publication'] == source_page]
        gt_datasets = _normalized_id_set(gt_data['dataset_uid'])
        orchestrator.logger.info(f"Ground truth datasets: {gt_datasets}")

        # Extracted identifiers for this page.
        eval_data = eval_df[eval_df['source_url'] == source_page]
        eval_datasets = _normalized_id_set(eval_data['dataset_identifier'])
        orchestrator.logger.info(f"Evaluation datasets: {eval_datasets}")

        # Nothing expected and nothing extracted: count the page as a perfect result.
        if not gt_datasets and not eval_datasets:
            orchestrator.logger.info("No datasets in both ground truth and evaluation. Perfect precision and recall.")
            total_precision += 1
            total_recall += 1
            continue

        # Identifiers already paired up, tracked on both sides to avoid double counting.
        matched_gt = set()
        matched_eval = set()

        # Step 1: exact matches.
        exact_matches = 0
        for eval_id in sorted(eval_datasets & gt_datasets):
            orchestrator.logger.info(f"Exact match found: {eval_id}")
            exact_matches += 1
            matched_gt.add(eval_id)
            matched_eval.add(eval_id)

        # Step 2: greedy partial/alias matches among what is left. The candidate list is
        # rebuilt for every extracted identifier so a ground-truth id is used at most once.
        partial_matches = 0
        for eval_id in sorted(eval_datasets - matched_eval):
            for gt_id in sorted(gt_datasets - matched_gt):
                if _identifiers_overlap(eval_id, gt_id):
                    orchestrator.logger.info(f"Partial or alias match found: eval_id={eval_id}, gt_id={gt_id}")
                    partial_matches += 1
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break

        # Step 3: unmatched extracted identifiers are false positives, unless they overlap
        # a ground-truth identifier that has already been matched (an alias of a hit).
        FP = eval_datasets - matched_eval
        for eval_id in sorted(FP):
            for matched_id in sorted(matched_gt):
                if _identifiers_overlap(eval_id, matched_id):
                    orchestrator.logger.info(f"Removing alias from false positives: eval_id={eval_id}, matched_id={matched_id}")
                    FP.discard(eval_id)
                    break

        false_positives = len(FP)
        orchestrator.logger.info(f"False positives: {FP}")

        # Expected identifiers that nothing matched.
        FN = gt_datasets - matched_gt
        false_negatives = len(FN)
        orchestrator.logger.info(f"False negatives: {FN}")

        # Per-page precision and recall; a zero denominator yields 0.
        true_positives = exact_matches + partial_matches
        predicted_total = true_positives + false_positives
        expected_total = true_positives + false_negatives
        precision = true_positives / predicted_total if predicted_total > 0 else 0
        recall = true_positives / expected_total if expected_total > 0 else 0

        orchestrator.logger.info(f"Precision for {source_page}: {precision}")
        orchestrator.logger.info(f"Recall for {source_page}: {recall}")

        # Remember pages with zero recall for later inspection.
        if recall == 0:
            recall_list.append(source_page)

        total_precision += precision
        total_recall += recall

    # Unweighted averages over pages, then F1 from the two averages.
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    precision_plus_recall = average_precision + average_recall
    f1_score = 2 * (average_precision * average_recall) / precision_plus_recall if precision_plus_recall > 0 else 0

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [9]:
# Score the extracted datasets (combined_df) against the input table (df), which is used
# as ground truth; the returned dict of averaged metrics is the cell's displayed output.
calculate_performance_metrics(orchestrator, df, combined_df)

4253583190.py - line 13 - INFO - Number of unique source pages: 219
4253583190.py - line 17 - INFO - 
Starting performance evaluation for source page: https://dx.doi.org/10.1002/1878-0261.13654
4253583190.py - line 24 - INFO - Ground truth datasets: {'pxd048538'}
4253583190.py - line 33 - INFO - Evaluation datasets: {'gse15459', 'n/a', 'pxd048538', 'gse26899', 'gse66229', 'pxd008840', 'gse15460'}
4253583190.py - line 57 - INFO - Exact match found: pxd048538
4253583190.py - line 84 - INFO - False positives: {'gse15459', 'n/a', 'gse26899', 'gse66229', 'pxd008840', 'gse15460'}
4253583190.py - line 89 - INFO - False negatives: set()
4253583190.py - line 96 - INFO - Precision for https://dx.doi.org/10.1002/1878-0261.13654: 0.14285714285714285
4253583190.py - line 97 - INFO - Recall for https://dx.doi.org/10.1002/1878-0261.13654: 1.0
4253583190.py - line 17 - INFO - 
Starting performance evaluation for source page: https://dx.doi.org/10.1002/ADHM.202404465
4253583190.py - line 24 - INFO - Gr

{'average_precision': 0.6402616346109496,
 'average_recall': 0.9908675799086758,
 'f1_score': 0.7778838006799631}