In [None]:
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd
import json
import os, requests
import numpy as np
import re
from data_gatherer.llm.response_schema import *
from scripts.experiment_utils import *

In [None]:
# Create the DataGatherer instance that the later cells use for logging,
# batch preparation/submission and result parsing.
dg = DataGatherer(
    log_level='INFO', 
    process_entire_document=True, 
    driver_path=None
)

# Log the configuration reported by get_params() so the run can be traced later.
dg.logger.info("Data Gatherer initialized with params: %s", dg.get_params())

In [None]:
# Get all the source publications: load the NYU Data Catalog export into a DataFrame.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open('scripts/NYU_data_catalog/datacatalog_export-2025-10-02.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
datasets_df = pd.DataFrame(data)

In [None]:
# Expand the catalog into one ground-truth row per (dataset, publication) pair.
MAX_DATASETS = 20  # testing limit on the number of datasets; set to None to process all

# Slice up front so the limit applies to every dataset, including those that are
# skipped below because they have no publications.
datasets_subset = datasets_df if MAX_DATASETS is None else datasets_df.head(MAX_DATASETS)

# Collect plain dict records and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows every time.
gt_records = []
for idx, row in datasets_subset.iterrows():
    pubs = row['publications']
    if not pubs:
        dg.logger.info(f"No publications found for paper {idx + 1}, dataset title: {row['title']}")
        continue
    dg.logger.info(f"Found {len(pubs)} publication(s) for dataset {idx + 1}, dataset title: {row['title']}")
    dg.logger.debug(f"Dataset title: {row['title']}")
    dg.logger.debug(f"Row: {row.to_dict()}")
    for pub in pubs:
        dg.logger.debug(f"Processing publication: {pub}")
        new_row = row.to_dict()
        new_row['publication'] = pub
        # A publication without a 'url' key is recorded as None instead of aborting
        # the whole cell; None/empty URLs are filtered out when `urls` is built.
        new_row['publication_url'] = pub.get('url')
        gt_records.append(new_row)

gt_df = pd.DataFrame(gt_records)



In [None]:
# Log the number of rows in gt_df (one row per dataset/publication pair).
dg.logger.info(f"Total ground truth entries: {len(gt_df)}")

In [None]:
# Unique, non-empty publication URLs in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order, so the request order
# is reproducible across kernel restarts (the iteration order of a set of strings is not).
# The isinstance check drops None and also NaN, which is truthy and would pass `if url`.
urls = list(dict.fromkeys(
    url for url in gt_df['publication_url'].tolist()
    if isinstance(url, str) and url
))

In [None]:
# Local artefacts of the batch run:
#   batch_file_path - JSONL request file passed to the batch preparation/submission calls
#   ret_file        - CSV that receives the parsed responses and is read back later
batch_file_path = 'scripts/NYU_data_catalog/batch_requests_openai_FDR_final.jsonl'
ret_file = 'scripts/NYU_data_catalog/dg_resp.csv'

In [None]:
# Run the integrated batch step for `urls` against the OpenAI provider, using
# batch_file_path as the batch request file.
# submit_immediately=False: presumably the requests are only prepared here and the
# actual submission happens in a separate step — confirm against DataGatherer.
# NOTE(review): batch_description looks unrelated to the NYU Data Catalog run —
# confirm it is intended and not left over from another experiment.
dg.run_integrated_batch_processing(
    urls,
    batch_file_path,
    api_provider='openai',
    prompt_name='GPT_FDR_FewShot_shortDescr',
    response_format=dataset_response_schema_with_use_description_and_short,
    submit_immediately=False,
    batch_description='Prompting the Market? Batch Test 2',
    grobid_for_pdf=True,
)

In [None]:
# Simple chunking and submission - NO monitoring or result combination.
# NOTE(review): "n=358" is hard-coded in the description; confirm it matches the
# number of requests actually present in batch_file_path.
result = dg.split_jsonl_and_submit(
    batch_file_path=batch_file_path,
    max_file_size_mb=200.0,
    api_provider='openai',
    wait_between_submissions=30,
    batch_description="NYU Data Catalog n=358 run",
)

In [None]:
# Identifier of the batch whose results are downloaded in the next step.
# NOTE(review): hard-coded, presumably copied from an earlier submission — update it
# after submitting a new batch.
batch_id = 'batch_6941ff70d2f08190987446b43da81fe2'

In [None]:
# Download the results of batch `batch_id` from the OpenAI provider into a local
# JSONL file. The return value `res` is not used by any later visible cell.
res = dg.parser.llm_client.download_batch_results(
    batch_id=batch_id,
    output_file_path='scripts/NYU_data_catalog/resp_FDR.jsonl',
    api_provider='openai'
)

In [None]:
# Convert the downloaded batch response file into a DataFrame.
# output_file_path=ret_file presumably also persists the frame as CSV (ret_file is
# read back with pd.read_csv afterwards) — confirm against DataGatherer.
res_df = dg.from_batch_resp_file_to_df(
    'scripts/NYU_data_catalog/resp_FDR.jsonl', 
    output_file_path=ret_file)

In [None]:
# Reload the responses from the CSV at ret_file, replacing the in-memory res_df.
res_df = pd.read_csv(ret_file)

In [None]:
# Compare the requested URLs with the source URLs present in the responses.
urls_ret = set(res_df['source_url'].to_list())
urls = set(urls)  # note: rebinds `urls` as a set
# Requested URLs that have no row in the responses.
missing_urls = list(urls - urls_ret)
len(missing_urls)

In [None]:
# Preview the first rows of the responses DataFrame.
res_df.head()

In [None]:
# Expand the ground truth into one row per (publication, data location) pair.
# Records are collected in a list and the frame is built once; concatenating inside
# the loop would re-copy every previously added row on each iteration.
final_records = []
for i, gt_row in gt_df.iterrows():
    data_locations = gt_row['data_locations']
    if not isinstance(data_locations, (list, tuple)):
        # Missing value (None/NaN): nothing to expand, same outcome as an empty list.
        continue
    for access_loc in data_locations:
        new_row = gt_row.to_dict()
        # Only copy the fields that this location actually provides; absent ones
        # end up as missing values in the resulting frame.
        if 'accession_number' in access_loc:
            new_row['identifier'] = access_loc['accession_number']
        if 'data_access_url' in access_loc:
            new_row['dataset_webpage'] = access_loc['data_access_url']
        final_records.append(new_row)

# Drop rows without a publication URL; a new frame is returned (no in-place mutation),
# so re-running the cell always starts from the same records.
final_df = pd.DataFrame(final_records).dropna(subset=['publication_url'], how='all')
final_df.head()

In [None]:
# Evaluate the responses (res_df) against the ground truth (final_df).
# The two text-file paths are presumably where false positives / false negatives are
# written — confirm in evaluate_performance_dev.
# gt_base is the array of distinct publication URLs found in the ground truth.
ret = evaluate_performance_dev(
    res_df,
    final_df,
    dg,
    'scripts/output/false_positives.txt', 
    false_negatives_file='scripts/output/false_negatives.txt',
    repo_return=True,
    gt_base = final_df['publication_url'].unique()
    )

In [None]:
# Display the value returned by evaluate_performance_dev.
ret

In [None]:
# Spot-check: show the response rows whose source_url is this specific PMC article.
res_df[res_df['source_url'] == 'https://pmc.ncbi.nlm.nih.gov/articles/PMC11460830/']