In [None]:
import os
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd

In [None]:
# Text file that a later cell reads line by line to build the `pmcids` list.
# The path is relative, so it resolves against the notebook's working directory.
input_file = "scripts/exp_input/REV.txt"

In [None]:
# --- Experiment settings (edit here, then re-run the cells below) ---

# LLM and prompt template handed to DataGatherer / dg.run.
model_name = "gemini-2.0-flash"  # "gemini-2.0-flash" or "gpt-4o-mini"
prompt = "GPT_FewShot"  # "GPT_from_full_input_Examples" or "GPT_FewShot"

# Retrieval options forwarded to dg.run.
semantic_retrieval = True
section_filter = None

# Passed to DataGatherer as `process_entire_document`.
FDR = False

In [None]:
# Read the list of PMCIDs (one per line) from the input text file.
# Blank lines and surrounding whitespace are dropped so they never reach the
# extraction run as empty/invalid identifiers.
MAX_ARTICLES = 20  # cap on how many PMCIDs are processed in this run
with open(input_file, 'r') as f:
    pmcids = [line.strip() for line in f.read().splitlines() if line.strip()][:MAX_ARTICLES]

print("Number of PMCIDs:", len(pmcids))

In [None]:
# Create the DataGatherer instance that the extraction cell (dg.run) and the
# evaluation cell (which logs through dg.logger) both rely on.
# NOTE(review): driver_path is relative to the working directory, so this cell
# depends on where the notebook is launched from — confirm the expected cwd.
dg = DataGatherer(
    llm_name=model_name, 
    log_level='INFO', 
    process_entire_document=FDR, 
    driver_path='../Firefox/geckodriver', 
    save_to_cache=False,  # caching is switched off for writing ...
    load_from_cache=False,  # ... and for reading
    full_output_file="scripts/output/result.csv"  # presumably where the run's results are written — TODO confirm
) #, save_dynamic_prompts=True

In [None]:
# These are the problematic urls for information extraction for GEMINI
#urls = ["https://pmc.ncbi.nlm.nih.gov/articles/PMC9314356", "https://pmc.ncbi.nlm.nih.gov/articles/PMC4318527/", "https://pmc.ncbi.nlm.nih.gov/articles/PMC11929800/", "https://pmc.ncbi.nlm.nih.gov/articles/PMC11032436/", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10802452/", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8565335/","https://pmc.ncbi.nlm.nih.gov/articles/PMC7778917/"]
# v1 = "https://pmc.ncbi.nlm.nih.gov/articles/PMC7029360/"
# v2 = "https://pmc.ncbi.nlm.nih.gov/articles/PMC8006362"
#urls = ["https://pmc.ncbi.nlm.nih.gov/articles/PMC9710693/"]

# These are the problematic urls for information extraction for GPT - (2025 Aug 29) GPT gets stuck for some reason after 50 calls or so...
#urls = ["https://pmc.ncbi.nlm.nih.gov/articles/PMC11240079", "https://pmc.ncbi.nlm.nih.gov/articles/PMC5752539","https://pmc.ncbi.nlm.nih.gov/articles/PMC7032692", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10547713", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8102856", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8055881", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8226229", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10769298", "https://pmc.ncbi.nlm.nih.gov/articles/PMC11659981", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8082263", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8565335", "https://pmc.ncbi.nlm.nih.gov/articles/PMC9094742", "https://pmc.ncbi.nlm.nih.gov/articles/PMC7029360", "https://pmc.ncbi.nlm.nih.gov/articles/PMC6289083", "https://pmc.ncbi.nlm.nih.gov/articles/PMC7658217", "https://pmc.ncbi.nlm.nih.gov/articles/PMC9780309","https://pmc.ncbi.nlm.nih.gov/articles/PMC3788619", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8859891", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10680627", "https://pmc.ncbi.nlm.nih.gov/articles/PMC6323985", "https://pmc.ncbi.nlm.nih.gov/articles/PMC8131595", "https://pmc.ncbi.nlm.nih.gov/articles/PMC11420198", "https://pmc.ncbi.nlm.nih.gov/articles/PMC9915613", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10238095", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10802452" , "https://pmc.ncbi.nlm.nih.gov/articles/PMC11032436", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10329279", "https://pmc.ncbi.nlm.nih.gov/articles/PMC10836119",  "https://pmc.ncbi.nlm.nih.gov/articles/PMC9280291", "https://pmc.ncbi.nlm.nih.gov/articles/PMC11661334", "https://pmc.ncbi.nlm.nih.gov/articles/PMC5161470", "https://pmc.ncbi.nlm.nih.gov/articles/PMC4339277"]

#out = dg.process_articles(urls, semantic_retrieval=semantic_retrieval, section_filter=section_filter, prompt_name=prompt)

#out
#out['https://pmc.ncbi.nlm.nih.gov/articles/PMC9710693']

In [None]:
# Run the extraction for the PMCIDs loaded above, using the configured prompt and retrieval options.
# NOTE(review): the keyword is named `input_file` but receives the in-memory list `pmcids`
# — confirm that DataGatherer.run accepts a list for this argument.
combined_df = dg.run(input_file=pmcids, semantic_retrieval=semantic_retrieval, section_filter=section_filter, prompt_name=prompt)

In [None]:
# Preview the first rows of the extraction output.
# Uncomment the next line to load results from the CSV on disk instead of the in-memory result of dg.run.
#combined_df = pd.read_csv("scripts/output/result.csv")
combined_df.head()

In [None]:
# Load the gold citation-records table used as ground truth by evaluate_performance.
# The commented lines are alternative sources for the same table.
#ground_truth = pd.read_parquet("scripts/exp_input/dataset_citation_records_Table.parquet")
ground_truth = pd.read_parquet("scripts/output/gold/dataset_citation_records_Table.parquet")
#ground_truth = df

In [None]:
# Preview the first rows of the ground-truth table.
ground_truth.head()
# Rename column 'synid' to 'identifier' (evaluate_performance reads the 'identifier' column)
#ground_truth = ground_truth.rename(columns={'synid': 'identifier'})

In [None]:
def evaluate_performance(predict_df, ground_truth, orchestrator, false_positives_file, false_negatives_file=None):
    """Evaluate dataset extraction using per-page precision/recall averaged over source pages.

    For every distinct ``source_url`` in ``predict_df`` the extracted identifiers are
    compared (case-insensitively) with the ground-truth identifiers of the same page.
    An extracted identifier counts as matched when it equals a ground-truth identifier
    or when one is a substring of the other (alias / partial match); each identifier
    is matched at most once.

    Args:
        predict_df: DataFrame with the columns ``source_url`` and ``dataset_identifier``.
        ground_truth: DataFrame with the columns ``citing_publication_link`` and
            ``identifier``; an ``identifier`` cell may hold several comma-separated IDs.
        orchestrator: Object exposing a ``logger`` used for progress messages.
        false_positives_file: Path of the text file that receives one
            ``[identifier, source_url]`` line per unmatched extracted identifier.
        false_negatives_file: Optional path of the text file that receives one
            ``[identifier, source_url]`` line per unmatched ground-truth identifier.

    Returns:
        dict with ``average_precision`` and ``average_recall`` (means over source pages)
        and ``f1_score`` (harmonic mean of those two averages).
    """
    logger = orchestrator.logger

    false_positives_output, false_negatives_output = [], []
    total_precision, total_recall, num_sources = 0, 0, 0

    # Loop-invariant: normalise the ground-truth links once. Trailing slashes are
    # dropped so ".../PMC123" and ".../PMC123/" refer to the same page.
    gt_links = ground_truth['citing_publication_link'].str.strip().str.rstrip('/').str.lower()

    # dropna(): a missing source_url cannot be matched to any publication.
    for source_page in predict_df['source_url'].dropna().unique():
        pub_id = source_page

        logger.info(f"Evaluating pub_id: {pub_id}")
        gt_data = ground_truth[gt_links == pub_id.strip().rstrip('/').lower()]  # extract ground truth

        gt_datasets = set()
        for dataset_string in gt_data['identifier'].dropna().astype(str):
            # Convert the comma-separated string into individual, whitespace-free IDs
            gt_datasets.update(part.strip().lower() for part in dataset_string.split(','))
        # An empty fragment would be a substring of every extracted ID below
        gt_datasets.discard('')

        logger.info(f"# of elements in gt_data: {len(gt_data)}. Element IDs: {gt_datasets}")

        num_sources += 1

        # Extract evaluation datasets for this source page
        eval_data = predict_df[predict_df['source_url'] == source_page]
        # astype(str) keeps the .str accessor usable even when the column is not string-typed
        eval_datasets = set(eval_data['dataset_identifier'].dropna().astype(str).str.strip().str.lower())
        # Remove invalid entries
        eval_datasets.discard('n/a')
        eval_datasets.discard('')

        logger.info(f"Evaluation datasets: {eval_datasets}")

        # Handle cases where both ground truth and evaluation are empty
        if not gt_datasets and not eval_datasets:
            logger.info("No datasets in both ground truth and evaluation. Perfect precision and recall.")
            total_precision += 1
            total_recall += 1
            continue

        # Exact matches: identifiers present on both sides
        exact_matches = gt_datasets & eval_datasets
        matched_gt, matched_eval = set(exact_matches), set(exact_matches)

        # Partial matches (aliased identifiers). Iterating in sorted order keeps the
        # pairing, and therefore the metrics and output files, reproducible.
        for eval_id in sorted(eval_datasets - matched_eval):
            for gt_id in sorted(gt_datasets - matched_gt):
                if eval_id in gt_id or gt_id in eval_id:  # Partial match or alias
                    logger.info(f"Partial or alias match found: eval_id={eval_id}, gt_id={gt_id}")
                    matched_gt.add(gt_id)
                    matched_eval.add(eval_id)
                    break  # Stop once matched

        # False positives: extracted identifiers with no ground-truth counterpart
        FP = eval_datasets - matched_eval
        false_positives_output.extend([false_p, pub_id] for false_p in sorted(FP))

        # False negatives: ground-truth identifiers that were not extracted.
        # One [identifier, pub_id] record each, same layout as the false positives.
        FN = gt_datasets - matched_gt
        false_negatives_output.extend([false_n, pub_id] for false_n in sorted(FN))

        # Precision and recall for this page
        true_positives = len(matched_gt)
        false_positives = len(FP)
        false_negatives = len(FN)

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

        logger.info(f"Precision for {source_page}: {precision}")
        logger.info(f"Recall for {source_page}: {recall}")

        # Accumulate totals
        total_precision += precision
        total_recall += recall

    # Overall metrics: mean of the per-page values, then F1 of the two means
    average_precision = total_precision / num_sources if num_sources > 0 else 0
    average_recall = total_recall / num_sources if num_sources > 0 else 0
    f1_score = (
        2 * (average_precision * average_recall) / (average_precision + average_recall)
        if (average_precision + average_recall) > 0
        else 0
    )

    logger.info(f"\nPerformance evaluation completed for {num_sources} source pages.")

    # Save false positives
    with open(false_positives_file, 'w', encoding='utf-8') as f:
        for item in false_positives_output:
            f.write(f"{item}\n")

    # Save false negatives (only when a path was given)
    if false_negatives_file:
        with open(false_negatives_file, 'w', encoding='utf-8') as f:
            for item in false_negatives_output:
                f.write(f"{item}\n")

    return {
        "average_precision": average_precision,
        "average_recall": average_recall,
        "f1_score": f1_score
    }

In [None]:
#results = evaluate_performance(out, ground_truth, dg, "output/false_positives.txt")
# PMC4318527, 

In [None]:
# Score the extraction output against the ground truth, logging through dg.logger.
# Unmatched extracted / ground-truth identifiers are written to the two text files given here.
results = evaluate_performance(combined_df, ground_truth, dg, "scripts/output/false_positives.txt", "scripts/output/false_negatives.txt")

In [None]:
# Display the averaged precision, averaged recall and F1 score returned by evaluate_performance.
results