In [None]:
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd
import json
import os, requests
import numpy as np
import re
from data_gatherer.llm.response_schema import *
from scripts.experiment_utils import *

In [None]:
# Constructor options for the shared DataGatherer instance used by every cell.
gatherer_options = {
    'log_level': 'INFO',
    'process_entire_document': True,
    'driver_path': None,
    'llm_name': 'gpt-5-mini',
}
dg = DataGatherer(**gatherer_options)

# Record the effective configuration at the start of the run.
dg.logger.info("Data Gatherer initialized with params: %s", dg.get_params())

In [None]:
# Load the data-catalog export (a JSON document) and wrap it in a DataFrame;
# the source publications are read from its 'publications' field later on.
catalog_export_path = 'scripts/NYU_data_catalog/datacatalog_export-2025-10-02.json'

with open(catalog_export_path, 'r') as export_file:
    data = json.load(export_file)

datasets_df = pd.DataFrame(data)

In [None]:
# Explode the catalog into one ground-truth row per (dataset, publication) pair.
# Rows are collected in a list and turned into a DataFrame once: concatenating
# inside the loop copies the whole frame on every iteration (quadratic cost).
gt_records = []

for idx, row in datasets_df.iterrows():
    pubs = row['publications']
    # A missing 'publications' field shows up as None/NaN rather than a list.
    if not isinstance(pubs, (list, tuple)) or not pubs:
        dg.logger.info(f"No publications found for paper {idx + 1}, dataset title: {row['title']}")
        continue

    dataset_fields = row.to_dict()  # computed once per dataset, copied per publication
    dg.logger.info(f"Found {len(pubs)} publication(s) for dataset {idx + 1}, dataset title: {row['title']}")
    dg.logger.debug(f"Dataset title: {row['title']}")
    dg.logger.debug(f"Row: {dataset_fields}")

    for pub in pubs:
        dg.logger.debug(f"Processing publication: {pub}")
        gt_records.append({
            **dataset_fields,
            'publication': pub,
            # A publication entry without a 'url' key yields None instead of a KeyError.
            'publication_url': pub.get('url') if isinstance(pub, dict) else None,
        })

gt_df = pd.DataFrame(gt_records)

In [None]:
# Report how many rows the ground-truth frame gt_df holds.
dg.logger.info(f"Total ground truth entries: {len(gt_df)}")

In [None]:
# Unique publication URLs in first-seen order.
# dict.fromkeys de-duplicates while preserving order, so the list is identical
# on every run (a set of strings has no stable order across interpreter runs).
# Only non-blank strings are kept: None, NaN and '' are all dropped.
urls = [
    url
    for url in dict.fromkeys(gt_df['publication_url'])
    if isinstance(url, str) and url.strip()
]
len(urls)

In [None]:
# batch_file_path: JSONL batch-request file; ret_file: CSV that holds the parsed responses.
batch_file_path, ret_file = (
    'scripts/NYU_data_catalog/batch_requests_openai_FDR_final_3.jsonl',
    'scripts/NYU_data_catalog/resp_FDR_89.csv',
)

In [None]:
# Run the integrated batch pipeline over every publication URL.
# submit_immediately=False — presumably only prepares the request file at
# batch_file_path without submitting it; TODO confirm against DataGatherer.
batch_options = {
    'api_provider': 'openai',
    'prompt_name': 'GPT_FDR_FewShot_shortDescr',
    'response_format': dataset_response_schema_with_use_description_and_short,
    'submit_immediately': False,
    'batch_description': 'Prompting the Market? Batch Test 4',
    'grobid_for_pdf': True,
    # Optional, currently disabled:
    # 'local_fetch_file': 'scripts/exp_input/nyu_data_catalog_publications.parquet',
}
dg.run_integrated_batch_processing(urls, batch_file_path, **batch_options)

In [None]:
# Restore the custom-id -> source-URL mapping from disk.
# NOTE(review): this replaces whatever mapping dg currently holds in memory.
custom_id_mapping_path = "scripts/NYU_data_catalog/custom_id_src_mapping.json"
with open(custom_id_mapping_path) as mapping_file:
    dg.custom_id_to_source_url = json.load(mapping_file)

In [None]:
# Persist dg's custom-id -> source-URL mapping as pretty-printed JSON.
custom_id_mapping_path = "scripts/NYU_data_catalog/custom_id_src_mapping.json"
with open(custom_id_mapping_path, "w") as mapping_file:
    json.dump(dg.custom_id_to_source_url, mapping_file, indent=4)

In [None]:
# Chunk the batch request file and submit the chunks.
# This step does NOT monitor the batches or combine their results.
submission_options = {
    'batch_file_path': batch_file_path,
    'max_file_size_mb': 200.0,
    'api_provider': 'openai',
    'wait_between_submissions': 30,
    'batch_description': "NYU Data Catalog GROBID",
}
result = dg.split_jsonl_and_submit(**submission_options)

In [None]:
# Identifiers of two submitted batches.
# NOTE(review): the 89/95 suffixes presumably match the resp_FDR_89 / resp_FDR_95
# file names used elsewhere in this notebook — confirm.
batch_id_89, batch_id_95 = (
    'batch_69530b9874ac819099b68c7179a3dd20',
    'batch_69530ce762088190ab7217863e2d2ac2',
)

In [None]:
# Initialise the parser for XML input, then restore the custom-id -> source-URL
# mapping from disk.
dg.init_parser_by_input_type('XML')

# Use a context manager so the file handle is closed as soon as the JSON is read
# (a bare json.load(open(...)) leaves the handle open until garbage collection).
with open("scripts/NYU_data_catalog/custom_id_src_mapping.json") as mapping_file:
    dg.custom_id_to_source_url = json.load(mapping_file)

In [None]:
# Download the results of batch_id_95 into a local JSONL file.
# NOTE(review): this writes resp_FDR_95.jsonl, while the parsing step in this
# notebook reads resp_FDR_89.jsonl — confirm which batch is meant to be evaluated.
download_request = {
    'batch_id': batch_id_95,
    'output_file_path': 'scripts/NYU_data_catalog/resp_FDR_95.jsonl',
    'api_provider': 'openai',
}
res = dg.parser.llm_client.download_batch_results(**download_request)

In [None]:
# Convert the downloaded batch-response file into a DataFrame; the call is also
# given ret_file as its output path.
batch_response_path = 'scripts/NYU_data_catalog/resp_FDR_89.jsonl'
res_df = dg.from_batch_resp_file_to_df(
    batch_response_path,
    output_file_path=ret_file,
    skip_validation=True,
)

In [None]:
# Prepend the supplementary-materials metadata rows to the predictions and
# overwrite ret_file with the combined table.
# NOTE(review): res_df is rebound to the combined frame, so running this cell a
# second time prepends the supplementary rows again.
supplementary_df = pd.read_csv('scripts/NYU_data_catalog/supplementary_materials_metadata.csv')
res_df = pd.concat([supplementary_df, res_df], axis=0)
res_df.to_csv(ret_file, index=False)

In [None]:
# Reload the predictions from ret_file and log the path together with the
# number of rows read.
res_df = pd.read_csv(ret_file)
dg.logger.info(f"ret_file: {ret_file}, len(res_df): {len(res_df)}")

In [None]:
# Preview the first rows of the prediction frame.
res_df.head()

In [None]:
# Explode every ground-truth row into one row per data location, lifting the
# location's accession number / access URL into flat columns:
#   accession_number -> identifier
#   data_access_url  -> dataset_webpage
# Records are collected in a list and converted once; concatenating inside the
# loop would copy the growing frame on every iteration.
final_records = []
for _, gt_row in gt_df.iterrows():
    gt_fields = gt_row.to_dict()
    data_locations = gt_row['data_locations']
    if not isinstance(data_locations, (list, tuple)):
        # Missing field (None/NaN): this publication contributes no rows.
        continue
    for access_loc in data_locations:
        location = access_loc if isinstance(access_loc, dict) else {}
        record = dict(gt_fields)
        if 'accession_number' in location:
            record['identifier'] = location['accession_number']
        if 'data_access_url' in location:
            record['dataset_webpage'] = location['data_access_url']
        final_records.append(record)

final_df = pd.DataFrame(final_records)

# Drop rows without a publication URL. A new frame is bound instead of mutating
# in place; the column check keeps the cell from failing when no rows were built.
if 'publication_url' in final_df.columns:
    final_df = final_df.dropna(subset=['publication_url'])
final_df.head()

In [None]:
# Redirect mapping stored on disk. The commented-out lines show how the mapping
# can be saved from dg.data_fetcher.redirect_mapping.
#with open('scripts/NYU_data_catalog/redirect_mapping.json', 'w') as f:
#    json.dump(dg.data_fetcher.redirect_mapping, f)

# Read it back inside a context manager so the file handle is closed promptly.
with open('scripts/NYU_data_catalog/redirect_mapping.json', 'r') as redirect_file:
    red_map = json.load(redirect_file)

In [None]:
# Recall of the predictions in res_df against the ground truth in gt_df.
#
# For every ground-truth row (one dataset cited by one publication) the rows of
# res_df whose 'source_url' or 'url' equals the publication link (or its
# redirect target from red_map) are collected. The row counts as recalled when
# any of them matches the dataset by title, alternate title, accession number
# or data-access URL. Publications whose link contains NO_ACCESS_DOI_MARKER are
# counted in 'no_access_cnt' and left out of the recall denominator.

NO_ACCESS_DOI_MARKER = 'doi.org/10.1016/'


def _is_text(value):
    """Return True for a non-blank string; None, NaN and '' are rejected."""
    return isinstance(value, str) and bool(value.strip())


def _as_list(value):
    """Return a list for a list-like field; a missing field (None/NaN) gives []."""
    if value is None or isinstance(value, float):
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _to_https(value):
    """Rewrite a link starting with http:// to https://; other values pass through."""
    if isinstance(value, str) and value.startswith('http://'):
        return value.replace('http://', 'https://')
    return value


def _has_pmc_id(link):
    """Return True when link is a string containing 'PMC' followed by digits."""
    return isinstance(link, str) and re.search(r'PMC\d+', link, re.IGNORECASE) is not None


def _match_reason(pred_title, pred_repo, gt_title, alt_titles, accession_ids, accession_urls):
    """Return the log message describing why a prediction matches, or None.

    Title comparisons are case-sensitive; accession numbers and URLs are
    compared case-insensitively. Blank predicted strings never match, because
    an empty string is a substring of every candidate.
    """
    has_title = _is_text(pred_title)

    if has_title:
        # Exact title, listed alternate title, or ground-truth title embedded in the prediction.
        if gt_title == pred_title or pred_title in alt_titles or (_is_text(gt_title) and gt_title in pred_title):
            return f"Title match found: {gt_title}"
        if pred_title.lower() in {acc.lower() for acc in accession_ids if _is_text(acc)}:
            return f"Title match found via accession number: {pred_title}"

    if _is_text(pred_repo):
        repo = pred_repo.lower()
        known_urls = [acc_url.lower() for acc_url in accession_urls if _is_text(acc_url)]
        if repo in known_urls:
            return f"Dataset match found via accession URL: {pred_repo}"
        if any(repo in known or known in repo for known in known_urls):
            return f"Dataset partial match found via accession number in data repository: {pred_repo}"

    if has_title:
        # Substring match in either direction against the title and its alternates.
        for candidate_id in dict.fromkeys([gt_title, *alt_titles]):
            if _is_text(candidate_id) and (candidate_id in pred_title or pred_title in candidate_id):
                return f"Title match found via alternate titles: {candidate_id}"

    return None


tot_recall = 0

ret_reportage = {
    'accession_ids': [], 'dataset_pages': [], 'missed_datasets': [],
    'missed_accession_numbers': [], 'missed_dataset_pages': [], 'missed_dataset_names': [],
    'no_access_cnt': 0, 'data_request_form': 0,
}

# in res_df for rows without url, copy source_url to url
url_missing = res_df['url'].isna() | (res_df['url'] == '')
res_df['url'] = res_df['url'].where(~url_missing, res_df['source_url'])
# also change http prefix to https
res_df['url'] = res_df['url'].map(_to_https)

for row_no, (_, gt_row) in enumerate(gt_df.iterrows(), start=1):
    recall = 0
    # Bound here, not inside the prediction loop, so the "missed" log further
    # down is correct even when no prediction rows exist for the publication.
    gt_title = gt_row['title']
    raw_pub_url = gt_row['publication_url']
    alt_titles = _as_list(gt_row['dataset_alternate_titles'])
    alt_ids = [gt_title, *alt_titles]
    locations = [loc for loc in _as_list(gt_row['data_locations']) if isinstance(loc, dict)]

    dg.logger.info(f"Processing ground truth row {row_no},\npublication URL: {raw_pub_url},\nciting dataset: {gt_title}")
    dg.logger.info(f"Alternate titles: {alt_titles}")
    dg.logger.info(f"Data locations: {locations}")
    dg.logger.debug(f"Related Datasets: {gt_row.get('related_datasets')}")
    dg.logger.debug(f"Other resources: {gt_row.get('other_resources')}")
    dg.logger.debug(f"Publishers: {gt_row.get('publishers')}")

    accession_ids = [loc['accession_number'] for loc in locations if 'accession_number' in loc]
    ret_reportage['accession_ids'].extend(accession_ids)

    accession_urls = [loc['data_access_url'] for loc in locations if 'data_access_url' in loc]
    ret_reportage['dataset_pages'].extend(accession_urls)

    # Count locations whose 'data_location' text mentions a data request form.
    # (Subscripting the comprehension result with 'data_location' raises a
    # TypeError; the key has to be read from each location entry.)
    ret_reportage['data_request_form'] += sum(
        1 for loc in locations if 'Data Request Form' in str(loc.get('data_location') or '')
    )

    # Normalise the publication link the same way res_df['url'] was normalised.
    pub_link = _to_https(raw_pub_url)
    if isinstance(pub_link, str):
        pub_link = pub_link.replace('labs/pmc/articles/', 'pmc/articles/')

    # Look the redirect up under both spellings so the lookup and the later use
    # of mapped_src cannot disagree about which key is present in red_map.
    mapped_src = red_map.get(raw_pub_url) or red_map.get(pub_link) or ''
    dg.logger.info(f"Redirect mapping: {mapped_src}")

    if any(isinstance(link, str) and NO_ACCESS_DOI_MARKER in link for link in (pub_link, mapped_src)):
        ret_reportage['no_access_cnt'] += 1
        continue

    # match articles from res_df that have either source_url or url column matching publication_url
    link_hits = (res_df['source_url'] == pub_link) | (res_df['url'] == pub_link)
    if mapped_src:
        mapped_hits = res_df['source_url'] == mapped_src
    else:
        mapped_hits = pd.Series(False, index=res_df.index)
    dg.logger.info(f"Mapped source preds count: {int(mapped_hits.sum())}")

    if (pub_link and not _has_pmc_id(pub_link)) or not _has_pmc_id(mapped_src):
        dg.logger.info(f"Non-PMC publication URL: {pub_link} or mapped URL: {mapped_src}")

    pred_row = res_df[link_hits | mapped_hits]
    dg.logger.info(f"Predicted row count: {len(pred_row)}")
    dg.logger.info(f"Predicted row content: {pred_row.to_dict(orient='records')}")

    for j, row in pred_row.iterrows():
        dg.logger.debug(f"Evaluating predicted row {j} for ground truth row {row_no}")
        pred_title = row.get('dataset_identifier', row.get('title', ''))
        pred_repo = row.get('data_repository', '')

        dg.logger.info(f"GT Title: {gt_title}")
        dg.logger.info(f"Pred Title: {pred_title}")

        reason = _match_reason(pred_title, pred_repo, gt_title, alt_titles, accession_ids, accession_urls)
        if reason is not None:
            dg.logger.info(reason)
            recall = 1
            break

    if recall == 0:
        dg.logger.info(f"No title match found for ground truth title: {gt_title}")
        ret_reportage['missed_datasets'].append(gt_row.to_dict())
        ret_reportage['missed_accession_numbers'].extend([acc_id for acc_id in accession_ids if acc_id])
        ret_reportage['missed_dataset_pages'].extend([url for url in accession_urls if url])
        ret_reportage['missed_dataset_names'].extend([name for name in alt_ids if name])

    tot_recall += recall

# Denominator: ground-truth rows that were actually evaluated. Using len(gt_df)
# instead of the last loop index keeps this correct for an empty frame.
evaluated_cnt = len(gt_df) - ret_reportage['no_access_cnt']
if evaluated_cnt > 0:
    dg.logger.info(f"Total recall: {tot_recall/evaluated_cnt}")
else:
    dg.logger.info("Total recall: undefined (no accessible ground truth rows were evaluated)")

In [None]:
# Log the full ret_reportage dictionary.
dg.logger.info(f"ret_reportage: {ret_reportage}")

In [None]:
# Ground-truth publication URLs that already point at pmc.ncbi.nlm.nih.gov.
# The isinstance check skips missing values: None is falsy, but NaN is truthy
# and has no .startswith, so a bare truthiness test would raise on it.
redirect_append = [
    url
    for url in gt_df['publication_url']
    if isinstance(url, str) and url.startswith('https://pmc.ncbi.nlm.nih.gov/')
]
len(redirect_append)

In [None]:
# Display the collected pmc.ncbi.nlm.nih.gov publication URLs.
redirect_append

In [None]:
# Evaluate the predictions (res_df) against the per-location ground truth
# (final_df). gt_base is the set of distinct publication URLs in final_df.
evaluated_publications = final_df['publication_url'].unique()

ret = evaluate_performance_dev(
    res_df,
    final_df,
    dg,
    'scripts/output/false_positives.txt',
    false_negatives_file='scripts/output/false_negatives.txt',
    repo_return=True,
    gt_base=evaluated_publications,
)

In [None]:
# Display the value returned by evaluate_performance_dev.
ret

In [None]:
# Spot-check: show every prediction row recorded for one specific source URL.
inspected_source_url = 'https://pmc.ncbi.nlm.nih.gov/articles/PMC11460830/'
res_df.loc[res_df['source_url'].eq(inspected_source_url)]