In [None]:
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd
import json
import os, requests
import numpy as np
import re
from data_gatherer.llm.response_schema import *

In [None]:
# Settings for the shared DataGatherer instance that the rest of the notebook uses.
gatherer_settings = dict(
    log_level='INFO',
    process_entire_document=True,
    driver_path=None,
)
dg = DataGatherer(**gatherer_settings)

# Record the effective configuration at the top of the run log.
dg.logger.info("Data Gatherer initialized with params: %s", dg.get_params())

In [None]:
# Load the data catalog export (presumably a list of dataset records — confirm)
# and turn it into a DataFrame.
# The encoding is pinned explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the export.
with open('scripts/NYU_data_catalog/datacatalog_export-2025-10-02.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
datasets_df = pd.DataFrame(data)

In [None]:
# Build the ground-truth table: one row per (dataset, publication) pair.
# Rows are collected in a plain list and converted once at the end; growing a
# DataFrame with pd.concat inside the loop copies the whole frame on every
# iteration (quadratic cost).
gt_records = []

for idx, row in datasets_df.iterrows():
    pubs = row['publications']
    # Skip datasets without publications. A missing value may surface as NaN
    # (a truthy float), so require an actual non-empty list/tuple rather than
    # relying on truthiness alone.
    if not isinstance(pubs, (list, tuple)) or not pubs:
        continue
    dg.logger.info(f"Found publication for dataset {idx + 1}/{len(datasets_df)}")
    dg.logger.info(f"Dataset title: {row['title']}")
    for pub in pubs:
        dg.logger.info(f"Processing publication: {pub}")
        new_row = row.to_dict()
        new_row['publication'] = pub
        # .get() so a publication without a 'url' key yields None instead of
        # raising KeyError; later cells already filter out empty URLs.
        new_row['publication_url'] = pub.get('url')
        gt_records.append(new_row)

gt_df = pd.DataFrame(gt_records)



In [None]:
# Sanity check: log how many (dataset, publication) rows the ground-truth table holds.
dg.logger.info(f"Total ground truth entries: {len(gt_df)}")

In [None]:
# Unique, non-empty publication URLs in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order; the iteration
# order of a set of strings changes between interpreter runs (hash
# randomization), so `urls` and anything built from it was not reproducible
# across kernel restarts.
urls = [
    url
    for url in dict.fromkeys(gt_df['publication_url'].tolist())
    # Keep real strings only: drops None, '' and NaN (NaN is truthy, so a plain
    # truthiness check would let it through).
    if isinstance(url, str) and url
]

In [None]:
# Output locations: the JSONL batch-request file and the CSV for gathered responses.
batch_file_path = 'scripts/NYU_data_catalog/batch_requests_openai_FDR.jsonl'
ret_file = 'scripts/NYU_data_catalog/dg_resp.csv'

In [None]:
# Keyword options for the integrated batch run, grouped so they can be
# inspected or tweaked in one place.
# NOTE(review): the batch description text looks unrelated to this catalog
# run — confirm it is intentional.
batch_options = dict(
    api_provider='openai',
    prompt_name='GPT_FDR_FewShot_shortDescr',
    response_format=dataset_response_schema_with_use_description_and_short,
    submit_immediately=False,
    batch_description='Prompting the Market? Batch Test 2',
    grobid_for_pdf=True,
)

# Kept as the last expression so the cell still displays the call's return value.
dg.run_integrated_batch_processing(urls, batch_file_path, **batch_options)

In [None]:
# Deliberate hard stop so "Run All" does not continue into the cells below.
# Replaces a bare `1/0`, whose ZeroDivisionError gave no hint that the failure
# was intentional.
# NOTE(review): presumably the batch prepared above (submit_immediately=False)
# has to be dealt with before continuing — confirm and adjust the message.
raise RuntimeError(
    "Intentional stop: run the remaining cells manually once the batch step above is done."
)

In [None]:
# Refresh the data fetcher's settings before the identifier lookups that follow.
# NOTE(review): called with no arguments; what exactly gets updated is defined
# inside data_gatherer — confirm.
dg.data_fetcher.update_DataFetcher_settings()

In [None]:
# Resolve every ground-truth publication URL to the most specific identifier
# available: a PMC ID if one is found, otherwise a DOI, otherwise the URL itself.
# Note that both lookup dicts are keyed by the publication URL (not the bare PMID).
pmid2pmcid = {}  # publication URL -> PMC ID
pmid2doi = {}  # publication URL -> DOI
new_urls = []  # resolved identifier (or original URL) for each row that has a URL

# NOTE(review): this walks gt_df row by row, so a URL shared by several rows
# triggers the same lookups again and is appended to new_urls once per row —
# confirm the duplicates are wanted, or iterate over unique URLs instead.
for idx, row in gt_df.iterrows():
    url = row['publication_url']
    # Rows without a URL contribute nothing to new_urls.
    if not url:
        continue
    # NOTE(review): despite the name, the result is treated below as possibly
    # being a bare PMID — confirm what url_to_pmcid returns for PubMed URLs.
    paper_id = dg.data_fetcher.url_to_pmcid(url)
    pmc_id, doi = None, None
    # Only all-digit IDs are looked up; anything else keeps pmc_id/doi as None.
    if paper_id and re.search(r'^\d+$', paper_id):  # PMIDs are integers
        pmc_id, doi = dg.data_fetcher.get_opendata_from_pubmed_id(paper_id)
    if pmc_id:
        dg.logger.info(f"Mapping PMID URL to PMC ID: {url} -> {pmc_id}")
        pmid2pmcid[url] = pmc_id
        new_urls.append(pmc_id)
    elif doi:
        dg.logger.info(f"Mapping PMID URL to DOI: {url} -> {doi}")
        pmid2doi[url] = doi
        new_urls.append(doi)
    else:
        # No better identifier found: fall back to the original URL.
        new_urls.append(url)