In [1]:
import os

import pandas as pd

from data_gatherer.data_gatherer import DataGatherer

In [2]:
# NOTE(review): `df` is not referenced by any later cell visible in this notebook;
# the evaluation below uses the parquet table loaded into `ground_truth` instead.
df = pd.read_csv("exp_input/SAGE_groundtruth.csv")
# Text file listing the publications to process; passed to dg.run() further down.
input_file = "exp_input/SAGE_input.txt"

In [3]:
# Extraction model handed to DataGatherer as `llm_name`.
# Options: "gemini-2.0-flash" or "gpt-4o-mini"
model_name = "gemini-2.0-flash"

# Prompt template handed to dg.run() as `prompt_name`.
# Options: "GPT_from_full_input_Examples" or "retrieve_datasets_simple_JSON"
prompt = "GPT_from_full_input_Examples"

# Forwarded to DataGatherer as `process_entire_document`.
FDR = True

# Both forwarded unchanged to dg.run().
semantic_retrieval = False
section_filter = None

In [4]:
# Read the input file into a list, one entry per line, to report how many
# publications will be processed.
with open(input_file, 'r') as f:
    pmcids = f.read().splitlines()

print("Number of PMCIDs:", len(pmcids))

Number of PMCIDs: 1


In [5]:
# Build the orchestrator from the configuration cell: `model_name` selects the
# extraction model and `FDR` toggles processing of the entire document.
# NOTE(review): `driver_path` is a relative path, so it presumably depends on the
# directory the notebook is launched from -- confirm before running elsewhere.
dg = DataGatherer(
    llm_name=model_name,
    log_level='INFO',
    process_entire_document=FDR,
    driver_path='../../Firefox/geckodriver',
)

[97mdata_gatherer.py - line 255 - INFO - Setting up data fetcher...[0m
[97mdata_gatherer.py - line 285 - INFO - Data fetcher setup completed.[0m
[97mdata_gatherer.py - line 103 - INFO - DataGatherer orchestrator initialized. Extraction Model: gemini-2.0-flash[0m


In [6]:
# Run the extraction over every entry in `input_file` (defined once near the top of
# the notebook, next to the other inputs) using the options from the configuration
# cell. The identical re-assignment of `input_file` that used to live here was
# dropped so the path has a single source of truth.
combined_df = dg.run(
    input_file=input_file,
    semantic_retrieval=semantic_retrieval,
    section_filter=section_filter,
    prompt_name=prompt,
)

[97mdata_gatherer.py - line 624 - INFO - Loaded 1 URLs from file.[0m
[97mdata_gatherer.py - line 539 - INFO - 0th function call: self.process_url(https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866)[0m
[97mdata_gatherer.py - line 347 - INFO - Processing URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866[0m
[97mdata_fetcher.py - line 166 - INFO - raw_HTML_data_filepath: None[0m
[97mdata_fetcher.py - line 173 - INFO - Initializing EntrezFetcher(('requests', 'self.config'))[0m
[97mdata_gatherer.py - line 354 - INFO - Type of data_fetcher EntrezFetcher[0m
[97mdata_fetcher.py - line 533 - INFO - Fetching data from request: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC7560866&retmode=xml[0m
[97mxml_retriever.py - line 446 - INFO - ----Checking for data_availability_sections section in raw data.[0m
[97mxml_retriever.py - line 457 - INFO - ----Found section: <notes xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:ali="http://www.niso.org/schemas/a

In [7]:
# Inspect the table returned by dg.run().
combined_df

Unnamed: 0,dataset_identifier,data_repository,dataset_webpage,link,source_url,download_link,title,content_type,id,caption,description,source_section,retrieval_pattern,context_description,file_extension,pub_title
0,10.6084/m9.figshare.5335861,doi.org,https://doi.org/10.6084/m9.figshare.5335861,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866,,,,,,,,,,,A strategy to incorporate prior knowledge into...
1,TCGA.PANCAN12.sampleMap%2FPanCan12.3602-correc...,xenabrowser.net,,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866,,,,,,,,,,,A strategy to incorporate prior knowledge into...
2,10.6084/m9.figshare.12646748,doi.org,https://doi.org/10.6084/m9.figshare.12646748,,https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866,,,,,,,,,,,A strategy to incorporate prior knowledge into...
3,,,,41467_2020_18675_MOESM1_ESM.pdf,https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866,https://pmc.ncbi.nlm.nih.gov/articles/instance...,No Title,local-data,MOESM1,Peer Review File,Peer Review File,supplementary material,.//supplementary-material,,pdf,A strategy to incorporate prior knowledge into...
4,,,,41467_2020_18675_MOESM2_ESM.pdf,https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866,https://pmc.ncbi.nlm.nih.gov/articles/instance...,No Title,local-data,MOESM2,Supplementary Information,Supplementary Information,supplementary material,.//supplementary-material,Analogous results were obtained in the three r...,pdf,A strategy to incorporate prior knowledge into...
5,,,,41467_2020_18675_MOESM3_ESM.pdf,https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866,https://pmc.ncbi.nlm.nih.gov/articles/instance...,No Title,local-data,MOESM3,Reporting Summary,Reporting Summary,supplementary material,.//supplementary-material,Further information on research design is avai...,pdf,A strategy to incorporate prior knowledge into...


In [8]:
# Load the dataset-citation records used as ground truth by evaluate_performance(),
# which reads the `pmcid` and `identifier` columns of this table.
ground_truth = pd.read_parquet("exp_input/dataset_citation_records_Table.parquet")

In [9]:
# Preview the first rows of the ground-truth table to check its columns.
ground_truth.head()

Unnamed: 0,identifier,repository,citing_publication_link,citation_record_source,citation_record_from_doi,doi,pmcid
0,PXD059466,PRIDE,https://dx.doi.org/10.1038/S41467-025-56720-1,proteomexchange_search.tsv,1,10.1038/S41467-025-56720-1,
1,PXD051312,PRIDE,https://dx.doi.org/10.6019/PXD051312,proteomexchange_search.tsv,1,10.6019/PXD051312,
2,PXD051312,PRIDE,https://dx.doi.org/10.1002/prca.202400095,proteomexchange_search.tsv,1,10.1002/prca.202400095,
3,PXD051312,PRIDE,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,proteomexchange_search.tsv,0,,PMC11895760
4,PXD054431,PRIDE,https://dx.doi.org/10.17159/SAJS.2025/18571,proteomexchange_search.tsv,1,10.17159/SAJS.2025/18571,


In [10]:
def evaluate_performance(predict_df, ground_truth, orchestrator, false_positives_file):
    """Evaluate dataset extraction performance using precision, recall, and F1-score.

    For every distinct, non-null ``source_url`` in ``predict_df`` the extracted
    ``dataset_identifier`` values are compared case-insensitively with the
    ground-truth ``identifier`` values whose ``pmcid`` equals the last path segment
    of the URL. Two identifiers match when they are equal or when one is a substring
    of the other; each ground-truth ID and each extracted ID is matched at most once.

    A source with no ground-truth IDs and no extracted IDs scores precision = recall
    = 1. A source with extracted IDs but no ground-truth IDs scores 0 for both.

    Args:
        predict_df: DataFrame with ``source_url`` and ``dataset_identifier`` columns.
        ground_truth: DataFrame with ``pmcid`` and ``identifier`` columns. An
            ``identifier`` cell may hold several comma-separated IDs.
        orchestrator: Object exposing ``logger.info`` for progress messages.
        false_positives_file: Path of the text file that receives one unmatched
            extracted identifier per line. Missing parent directories are created.

    Returns:
        dict with ``average_precision`` and ``average_recall`` (means of the
        per-source values) and ``f1_score`` (harmonic mean of those two averages).
    """
    false_positives_output, zero_recall_sources = [], []
    total_precision, total_recall, num_sources = 0, 0, 0

    # Loop-invariant: lowercase the ground-truth PMCIDs once, not once per source.
    gt_pmcids = ground_truth['pmcid'].str.lower()

    # dropna(): a missing source_url cannot be attributed to any publication and
    # would otherwise crash on .split().
    for source_page in predict_df['source_url'].dropna().unique():
        # rstrip('/') so a trailing slash does not produce an empty publication id.
        pub_id = source_page.rstrip('/').split('/')[-1].lower()

        orchestrator.logger.info(f"Evaluating pub_id: {pub_id}")
        gt_data = ground_truth[gt_pmcids == pub_id]  # extract ground truth

        # Split comma-separated cells into individual IDs. Tokens are stripped and
        # empty ones dropped: '' is a substring of every string, so it would
        # otherwise "partially match" an arbitrary extracted identifier.
        gt_datasets = {
            token.strip()
            for dataset_string in gt_data['identifier'].dropna().astype(str).str.lower()
            for token in dataset_string.split(',')
            if token.strip()
        }

        orchestrator.logger.info(f"# of elements in gt_data: {len(gt_data)}. Element IDs: {gt_datasets}")

        num_sources += 1

        # Extract evaluation datasets for this source page
        eval_data = predict_df[predict_df['source_url'] == source_page]
        eval_datasets = set(
            eval_data['dataset_identifier'].dropna().astype(str).str.strip().str.lower()
        )
        # Remove invalid entries
        eval_datasets.discard('n/a')
        eval_datasets.discard('')

        orchestrator.logger.info(f"Evaluation datasets: {eval_datasets}")

        # Handle cases where both ground truth and evaluation are empty
        if not gt_datasets and not eval_datasets:
            orchestrator.logger.info("No datasets in both ground truth and evaluation. Perfect precision and recall.")
            total_precision += 1
            total_recall += 1
            continue

        # Exact matches: intersection of ground truth and extracted datasets
        exact_matches = gt_datasets & eval_datasets
        matched_gt, matched_eval = set(exact_matches), set(exact_matches)

        # Partial matches (aliased identifiers). The matching is greedy, so iterate
        # in sorted order to make the outcome independent of set/hash ordering.
        for eval_id in sorted(eval_datasets - matched_eval):
            for gt_id in sorted(gt_datasets - matched_gt):
                if eval_id in gt_id or gt_id in eval_id:  # Partial match or alias
                    orchestrator.logger.info(f"Partial or alias match found: eval_id={eval_id}, gt_id={gt_id}")
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break  # Stop once matched

        # False positives: extracted datasets that matched nothing
        FP = eval_datasets - matched_eval
        false_positives_output.extend(sorted(FP))

        # False negatives: ground-truth datasets that were never matched
        FN = gt_datasets - matched_gt

        # Precision and recall for this source
        true_positives = len(matched_gt)
        false_positives = len(FP)
        false_negatives = len(FN)

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        orchestrator.logger.info(f"Precision for {source_page}: {precision}")
        orchestrator.logger.info(f"Recall for {source_page}: {recall}")

        if recall == 0:
            zero_recall_sources.append(source_page)

        # Accumulate totals
        total_precision += precision
        total_recall += recall

    # Compute overall metrics
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    f1_score = (
        2 * (average_precision * average_recall) / (average_precision + average_recall)
        if (average_precision + average_recall) > 0
        else 0
    )

    orchestrator.logger.info(f"\nPerformance evaluation completed for {num_sources} source pages.")
    if zero_recall_sources:
        # Previously collected but never surfaced; report them for follow-up.
        orchestrator.logger.info(f"Source pages with zero recall: {zero_recall_sources}")

    # Save false positives, creating the output directory when it does not exist.
    output_dir = os.path.dirname(false_positives_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(false_positives_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{item}\n" for item in false_positives_output)

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [11]:
# Score the extracted datasets against the ground truth; extracted identifiers that
# match nothing are written to output/false_positives.txt.
results = evaluate_performance(combined_df, ground_truth, dg, "output/false_positives.txt")

[97m1292017431.py - line 10 - INFO - Evaluating pub_id: pmc7560866[0m
[97m1292017431.py - line 17 - INFO - # of elements in gt_data: 1. Element IDs: {'syn1715755'}[0m
[97m1292017431.py - line 28 - INFO - Evaluation datasets: {'10.6084/m9.figshare.5335861', 'tcga.pancan12.samplemap%2fpancan12.3602-corrected-v3_syn1715755', '10.6084/m9.figshare.12646748'}[0m
[97m1292017431.py - line 49 - INFO - Partial or alias match found: eval_id=tcga.pancan12.samplemap%2fpancan12.3602-corrected-v3_syn1715755, gt_id=syn1715755[0m
[97m1292017431.py - line 69 - INFO - Precision for https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866: 0.3333333333333333[0m
[97m1292017431.py - line 70 - INFO - Recall for https://pmc.ncbi.nlm.nih.gov/articles/PMC7560866: 1.0[0m
[97m1292017431.py - line 88 - INFO - 
Performance evaluation completed for 1 source pages.[0m


In [12]:
# Show the per-source averaged precision and recall and the F1 computed from them.
results

{'average_precision': 0.3333333333333333,
 'average_recall': 1.0,
 'f1_score': 0.5}