In [1]:
import os
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd
from data_gatherer.llm.response_schema import *

In [2]:
# Text file listing the PMCIDs to process, one per line (read in a later cell).
input_file = "scripts/exp_input/sage_input.txt"
# NOTE(review): `fname` is not referenced by any other visible cell — confirm it is still needed.
fname = "prompts/prompts_1.jsonl"

In [3]:
# Extraction settings for this run.
# Alternative model names: "gemini-2.0-flash", "gpt-4o-mini".
model_name = "gpt-5-mini"
# Alternative prompt names: "GPT_from_full_input_Examples", "GPT_FewShot".
prompt = "GPT_FDR_FewShot_Syn"
# Passed to DataGatherer as `process_entire_document`.
FDR = True
# NOTE(review): the two settings below are not referenced by any other visible cell.
semantic_retrieval = False
section_filter = None

In [4]:
# Load the PMCIDs to process: one identifier per line of `input_file`.
# Surrounding whitespace is stripped and blank lines are skipped, so a stray
# empty line never ends up in the list as an empty ID.
with open(input_file, 'r') as f:
    pmcids = [line.strip() for line in f.read().splitlines() if line.strip()]

print("Number of PMCIDs:", len(pmcids))

Number of PMCIDs: 100


In [5]:
# Collect the orchestrator options in one place, then build the DataGatherer.
gatherer_options = dict(
    llm_name=model_name,
    log_level='INFO',
    process_entire_document=FDR,
    driver_path=None,
    save_to_cache=True,
    load_from_cache=True,
    full_output_file="scripts/output/result.csv",
)
# Optional extra kept from the original cell (disabled): save_dynamic_prompts=True
dg = DataGatherer(**gatherer_options)

[97mdata_gatherer.py - line 329 - INFO - Setting up data fetcher...[0m
[97mdata_fetcher.py - line 40 - INFO - BackupDataStore loaded from scripts/exp_input/Local_fetched_data_SAGE.parquet, entries: 2190[0m
[97mdata_fetcher.py - line 104 - INFO - Backup data store initialized: 2190 publications, valid: True[0m
[97mdata_gatherer.py - line 359 - INFO - Data fetcher setup completed.[0m
[97mdata_gatherer.py - line 108 - INFO - DataGatherer orchestrator initialized. Extraction Model: gpt-5-mini[0m


Reference — OpenAI Batch API guide (preparing the batch file): https://platform.openai.com/docs/guides/batch#1-preparing-your-batch-file

Reference — Portkey documentation on Bedrock batches: https://portkey.ai/docs/integrations/llms/bedrock/batches

In [6]:
# Build the batch request file for the first 50 PMCIDs and submit it through the
# OpenAI provider without waiting for the batch to finish.
batch_input_openai = dg.run_integrated_batch_processing(
    url_list=pmcids[:50],  # Test with first 50 articles
    batch_file_path='scripts/tmp/batch_requests_openai_v1.jsonl',
    #output_file_path='scripts/tmp/batch_results_openai.jsonl',
    api_provider='openai',  # Uses OpenAI batch API for 50% cost discount
    prompt_name=prompt,  # reuse the prompt chosen in the config cell instead of repeating the literal
    response_format=dataset_response_schema_gpt,
    submit_immediately=True,
    wait_for_completion=False,  # Set to True if you want to wait for results
    batch_description="Testing OpenAI Batch API with 50 PMC articles for cost savings"
)

[97mdata_gatherer.py - line 1281 - INFO - Starting integrated batch processing for 50 URLs[0m
[97mdata_gatherer.py - line 1285 - INFO - Step 1: Fetching data...[0m
[97mdata_gatherer.py - line 156 - INFO - Fetch attempt with HTML_fallback=False...[0m
[97mdata_gatherer.py - line 159 - INFO - length of complete fetches < urls: 0 < 50[0m
[97mdata_fetcher.py - line 285 - INFO - API detected: PMC[0m
[97mdata_fetcher.py - line 293 - INFO - Creating EntrezFetcher with backup support[0m
[97mdata_fetcher.py - line 104 - INFO - Backup data store initialized: 2190 publications, valid: True[0m
[97mdata_fetcher.py - line 839 - INFO - Raw_data_format: XML[0m
[97mdata_fetcher.py - line 120 - INFO - Found PMC11792374 in backup data store (format: XML)[0m
  if backup_data:
[97mdata_fetcher.py - line 865 - INFO - Found PMC11792374 in local backup data (fast path, format: XML)[0m
[97mdata_gatherer.py - line 177 - INFO - Raw_data_format: XML, Type of fetched data: <class 'lxml.etree._E

In [8]:
# Take the batch ID from the submission result so this cell always refers to the
# batch that was just created rather than a stale, hand-copied one.
# To resume an earlier submission in a fresh kernel, assign its ID directly, e.g.
#   batch_id = 'batch_68ed8a97bf98819090f2cb62841f0219'
batch_id = batch_input_openai['batch_submission']['batch_id']

In [9]:
# Display the summary returned by the batch call (request-file stats and submission details).
batch_input_openai

{'batch_file_created': {'batch_file_path': 'scripts/tmp/batch_requests_openai_v1.jsonl',
  'total_requests': 50,
  'skipped_requests': 0,
  'api_provider': 'openai',
  'model': 'gpt-5-mini',
  'file_stats': {'file_path': 'scripts/tmp/batch_requests_openai_v1.jsonl',
   'total_requests': 50,
   'file_size_bytes': 9245991,
   'created_at': '2025-10-13 19:26:12'},
  'validation': {'is_valid': True,
   'total_lines': 50,
   'valid_lines': 50,
   'invalid_lines': 0,
   'errors': [],
   'file_size_bytes': 9245991},
  'created_at': '2025-10-13 19:26:12'},
 'fetched_data_count': 50,
 'processed_requests': 50,
 'api_provider': 'openai',
 'model': 'gpt-5-mini',
 'batch_submission': {'batch_id': 'batch_68ed8a97bf98819090f2cb62841f0219',
  'status': 'validating',
  'input_file_id': 'file-SrQ5dnQsKdJENe7wgwhDYx',
  'created_at': 1760397975,
  'api_provider': 'openai',
  'endpoint': '/v1/responses',
  'completion_window': '24h'}}

In [None]:
# Make sure the gatherer has a parser before reaching for its LLM client.
# NOTE(review): XMLParser is not imported by name in the visible import cell —
# confirm it is in scope (e.g. via the wildcard import) or add an explicit import.
if not dg.parser:
    dg.parser = XMLParser(
        open_data_repos_ontology="open_bio_data_repos.json",
        logger=dg.logger,
        llm_name=dg.llm,
    )

# Download the results of the submitted batch into a local JSONL file.
batch_client = dg.parser.llm_client
batch_client.download_batch_results(
    batch_id=batch_id,
    output_file_path='scripts/tmp/res.jsonl',
    api_provider='openai',
)

In [None]:
# Parse every entry of ret['processed_results'] and tag each parsed dataset with
# the entry's custom_id ('N/A' when the entry has none). The result is a list
# with one list of parsed datasets per entry.
# NOTE(review): `ret` is not defined in any visible cell — confirm where it is
# assigned before running this cell on a fresh kernel.
ret_2 = []
for dataset in ret['processed_results']:
    dg.logger.info(f"Processing dataset of type: {type(dataset)}")
    # This message previously repeated "of type" while logging the keys; label it correctly.
    dg.logger.info(f"Processing dataset with keys: {dataset.keys()}")
    custom_id = dataset.get('custom_id', 'N/A')

    dg.logger.info(f"Processing dataset: {dataset}")
    parsed_datasets = dg.parser.process_datasets_response(dataset['processed_response'])

    for parsed_dataset in parsed_datasets:
        parsed_dataset['custom_id'] = custom_id

    ret_2.append(parsed_datasets)

ret_2