In [None]:
import os
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd

In [None]:
# Paths used throughout the notebook.
# fname: JSONL file that receives the generated prompts.
fname = "prompts/prompts_1.jsonl"
# input_file: plain-text list of article identifiers, one per line.
input_file = "scripts/exp_input/sage_input.txt"

In [None]:
# Run configuration.
# NOTE(review): the saved log output of the DataGatherer cell reports
# "gemini-2.5-flash" as the extraction model, which differs from the value
# below -- re-run the notebook to confirm which model is actually in effect.

# LLM used for extraction; alternatives tried: "gemini-2.0-flash", "gpt-4o-mini".
model_name = "gemini-2.0-flash"

# Prompt template name; alternatives: "GPT_from_full_input_Examples", "GPT_FewShot".
prompt = "GPT_FDR_FewShot_Syn"

# FDR is also handed to DataGatherer as `process_entire_document`.
FDR, semantic_retrieval = True, False

# No section filtering is requested.
section_filter = None

In [None]:
# Read the list of PMCIDs from the input text file (one identifier per line).
# Surrounding whitespace is stripped and blank lines are skipped so that empty
# entries (e.g. a trailing blank line) are never passed on as identifiers.
with open(input_file, 'r') as f:
    pmcids = [line.strip() for line in f.read().splitlines() if line.strip()]

print("Number of PMCIDs:", len(pmcids))

Number of PMCIDs: 1316


In [5]:
# Build the DataGatherer orchestrator from the run configuration above.
dg = DataGatherer(
    llm_name=model_name,
    process_entire_document=FDR,  # driven by the FDR flag
    full_output_file="scripts/output/result.csv",
    load_from_cache=True,
    save_to_cache=True,
    driver_path=None,
    log_level='INFO',
)
# Optional keyword left disabled by the author: save_dynamic_prompts=True

[97mdata_gatherer.py - line 301 - INFO - Setting up data fetcher...[0m
[97mdata_fetcher.py - line 33 - INFO - Loaded 2190 publications from local DataFrame.[0m
[97mdata_gatherer.py - line 331 - INFO - Data fetcher setup completed.[0m
[97mdata_gatherer.py - line 103 - INFO - DataGatherer orchestrator initialized. Extraction Model: gemini-2.5-flash[0m


In [None]:
# Fetch the raw content for every PMCID in the list.
# The result is iterated with .items() below, so it is used as a mapping
# keyed by URL/identifier.
raw_data = dg.fetch_data(urls=pmcids)

Reference — OpenAI Batch API guide (preparing the batch file): https://platform.openai.com/docs/guides/batch#1-preparing-your-batch-file

Reference — Portkey documentation (Bedrock batches integration): https://portkey.ai/docs/integrations/llms/bedrock/batches

In [None]:
# Build the batch prompts for every fetched document.
# NOTE(review): `jsonl_cont` is reassigned on each iteration, so only the value
# returned for the last document remains afterwards. The same `fname` is passed
# for every document -- presumably the method appends to that file; confirm.
for url, item in raw_data.items():
    raw_data_format, fetched_data = item['raw_data_format'], item['fetched_data']
    print(f"Processing {url} with format {raw_data_format}...")
    jsonl_cont = dg.DRAFT_prepare_prompts_batch(
        fname=fname,
        prompt=prompt,
        FDR=FDR,
        semantic_retrieval=semantic_retrieval,
        section_filter=section_filter,
        raw_data_format=raw_data_format,
        fetched_data=fetched_data,
    )

In [None]:
# Report the size on disk and the number of lines of the generated prompts file.
file_size = os.path.getsize(fname)
# Count lines inside a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open(fname) as f:
    num_lines = sum(1 for _ in f)
print(f"File size: {file_size} bytes")
print(f"Number of lines: {num_lines}")

In [None]:
# Upload the generated prompts file to an AWS S3 bucket.
import boto3

# Source file and destination: the object is stored under the file's base name.
local_file_path = fname
object_key = os.path.basename(fname)
bucket_name = 'vida-llm'

# S3 client created without any explicit configuration arguments.
s3 = boto3.client('s3')

# Positional arguments are, in order: Filename, Bucket, Key.
s3.upload_file(
    local_file_path,
    bucket_name,
    object_key,
    ExtraArgs={'ContentType': 'application/json'},
)