In [None]:
import os
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd
from data_gatherer.llm.response_schema import *
from data_gatherer.parser.xml_parser import XMLParser
from scripts.experiment_utils import evaluate_performance

In [None]:
# Text file with one article entry per line; it is loaded into `pmcids` further down.
input_file = "scripts/exp_input/sage_input.txt"
# NOTE(review): `fname` is not used by any cell visible in this notebook — confirm it is still needed.
fname = "prompts/prompts_1.jsonl"

In [None]:
# --- Experiment configuration ---

# LLM handed to DataGatherer. Alternatives noted by the author:
# "gemini-2.0-flash" or "gpt-4o-mini".
model_name = "gpt-5-mini"

# Prompt template name. Alternatives noted by the author:
# "GPT_from_full_input_Examples" or "GPT_FewShot".
# NOTE(review): the visible cells hard-code this same prompt name instead of
# reading `prompt` — confirm which one is meant to be authoritative.
prompt = "GPT_FDR_FewShot_Syn"

# Forwarded to DataGatherer as `process_entire_document`.
FDR = True

# Neither of the two settings below is read by any cell visible in this notebook.
semantic_retrieval = False
section_filter = None

In [None]:
# Read the article entries (one per line) from the input file.
# Surrounding whitespace is stripped and blank lines are skipped so that empty
# entries are never passed on to the gatherer. The encoding is pinned so the
# result does not depend on the platform's default locale.
with open(input_file, 'r', encoding='utf-8') as f:
    pmcids = [entry.strip() for entry in f if entry.strip()]

print("Number of PMCIDs:", len(pmcids))

In [None]:
# Build the gatherer shared by every later cell. Caching is switched off in
# both directions, and `process_entire_document` follows the FDR flag from the
# configuration cell.
# Optional extra left disabled by the author: save_dynamic_prompts=True
dg = DataGatherer(
    llm_name=model_name,
    process_entire_document=FDR,
    log_level='WARNING',
    driver_path=None,
    load_from_cache=False,
    save_to_cache=False,
    full_output_file="scripts/output/result.csv",
)

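References for the batch workflow used below:

- OpenAI Batch API guide, "Preparing your batch file": https://platform.openai.com/docs/guides/batch#1-preparing-your-batch-file
- Portkey documentation on Bedrock batches: https://portkey.ai/docs/integrations/llms/bedrock/batches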

In [None]:
# Disabled cell: the call is wrapped in a triple-quoted string, so the cell only
# evaluates a string expression and nothing is submitted.
# If re-enabled it would call dg.run_integrated_batch_processing on `pmcids`
# with api_provider='openai' and submit_immediately=True, without waiting for
# completion.
# NOTE(review): batch_description mentions "50 PMC articles" while url_list is
# the whole `pmcids` list — confirm the description is still accurate.
'''batch_input_openai = dg.run_integrated_batch_processing(
    url_list=pmcids,
    batch_file_path='scripts/tmp/batch_requests_openai_all_SAGE.jsonl',
    #output_file_path='scripts/tmp/batch_results_openai.jsonl',
    api_provider='openai',  # Uses OpenAI batch API for 50% cost discount
    prompt_name="GPT_FDR_FewShot_Syn",
    response_format=dataset_response_schema_gpt,
    submit_immediately=True,
    wait_for_completion=False,  # Set to True if you want to wait for results
    batch_description="Testing OpenAI Batch API with 50 PMC articles for cost savings"
)'''

In [None]:
# Disabled cell (triple-quoted string; evaluated only as a string expression).
# If re-enabled it would call dg.split_jsonl_and_submit on the SAGE request
# file with max_file_size_mb=200.0 and wait_between_submissions=30.
# Presumably this splits the JSONL into chunks and submits each one — confirm
# against the DataGatherer implementation.
'''# Simple chunking and submission - NO monitoring or result combination
result = dg.split_jsonl_and_submit(
    batch_file_path='scripts/tmp/batch_requests_openai_all_SAGE.jsonl',
    max_file_size_mb=200.0,
    api_provider='openai',
    wait_between_submissions=30,
    batch_description="Chunked SAGE batch processing"
)
'''

In [None]:
# Disabled cell (triple-quoted string; evaluated only as a string expression).
# If re-enabled it would loop over chunk files 002 and 003 and call
# dg.parser.llm_client.submit_batch_job for each, using the file path as the
# batch description.
'''for file_path in ['scripts/tmp/batch_requests_openai_all_SAGE_chunk_002.jsonl','scripts/tmp/batch_requests_openai_all_SAGE_chunk_003.jsonl']:
    dg.parser.llm_client.submit_batch_job(
                        file_path, 
                        api_provider='openai',
                        batch_description= f'file_path: {file_path}'
    )'''

In [None]:
# Disabled cell (triple-quoted string) holding three batch identifiers.
# NOTE(review): the closing quote of the last identifier is fused with the
# terminating triple quote, so re-enabling this cell requires restoring that
# quote before the assignments are valid Python.
'''batch_id = 'batch_68efcceeb7448190ae19cc890a87aef8'
batch_id_1 = 'batch_68efcf6f0c0081909cce95c57d4aa55d'
batch_id_2 = 'batch_68efcf7e0dd881909f000e1634efc368'''

In [None]:
# Disabled cell (triple-quoted string; evaluated only as a string expression).
# If re-enabled it would create dg.parser when it is missing and then call
# download_batch_results for `batch_id_2`, targeting scripts/tmp/res_2.jsonl.
# NOTE(review): XMLParser is built here with a literal ontology file name,
# whereas the active cell further down passes dg.open_data_repos_ontology —
# confirm which form is intended.
'''if not dg.parser:
    dg.parser = XMLParser(open_data_repos_ontology="open_bio_data_repos.json", logger=dg.logger,
    llm_name=dg.llm)

res = dg.parser.llm_client.download_batch_results(
    batch_id=batch_id_2,
    output_file_path='scripts/tmp/res_2.jsonl',
    api_provider='openai'
)'''

In [None]:
# Disabled cell (triple-quoted string; evaluated only as a string expression).
# If re-enabled it would append every non-empty line of the listed result
# files to one list and write them to scripts/tmp/combined_batch_results.jsonl
# as UTF-8, then print the line count and file size.
# NOTE(review): 'scripts/tmp/res.jsonl' is listed twice, so its lines would be
# written twice — the third entry was probably meant to be a different file.
# NOTE(review): several print messages contain mis-encoded characters
# (mojibake), apparently damaged arrow/emoji glyphs.
'''# Combine multiple batch result files into one
import json

# List of result files to combine
result_files = [
    'scripts/tmp/res_2.jsonl',
    'scripts/tmp/res.jsonl',
    'scripts/tmp/res.jsonl'  # Note: this appears twice in your list
]

# Output combined file
combined_output = 'scripts/tmp/combined_batch_results.jsonl'

# Combine all JSONL files
all_results = []
total_lines = 0

for file_path in result_files:
    if os.path.exists(file_path):
        print(f"Reading {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            file_lines = 0
            for line in f:
                if line.strip():  # Skip empty lines
                    all_results.append(line.strip())
                    file_lines += 1
                    total_lines += 1
        print(f"  ‚Üí Added {file_lines} lines from {file_path}")
    else:
        print(f"Warning: File not found - {file_path}")

# Write combined results to new file
print(f"\nWriting {total_lines} total lines to {combined_output}...")
with open(combined_output, 'w', encoding='utf-8') as f:
    for line in all_results:
        f.write(line + '\n')

print(f"‚úÖ Combined batch results saved to: {combined_output}")
print(f"üìä Total lines combined: {total_lines}")

# Verify the combined file
file_size_mb = os.path.getsize(combined_output) / 1024 / 1024
print(f"üìÅ Combined file size: {file_size_mb:.2f} MB")'''

In [None]:
# Make sure the gatherer has a parser before batch responses are converted.
if dg.parser is None:
    dg.parser = XMLParser(dg.open_data_repos_ontology, dg.logger, llm_name=dg.llm)

# Sanity check: count the lines of the combined batch-response file.
# The encoding is pinned to UTF-8 (the encoding the combining step writes with)
# so reading does not depend on the platform's default locale.
with open('scripts/tmp/combined_batch_results.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()
print(f"Number of lines in combined file: {len(lines)}")

In [None]:
# Convert the combined batch-response JSONL through the gatherer.
# Presumably returns a pandas DataFrame (it is saved with .to_csv in the next cell).
res_df = dg.from_batch_resp_file_to_df('scripts/tmp/combined_batch_results.jsonl')

In [None]:
# Persist the batch results without the index column; the trailing expression
# displays the row count as the cell output.
# Note: a later cell writes `final_df` to this same path, replacing this file.
res_df.to_csv('scripts/output/combined_results.csv', index=False)
len(res_df)

In [None]:
# Process two additional articles directly (outside the batch run). The result
# is indexed by article URL in the next cell.
new_datasets_append = dg.process_articles(
    [
        'https://pmc.ncbi.nlm.nih.gov/articles/PMC8637040',
        'https://pmc.ncbi.nlm.nih.gov/articles/PMC10092953',
    ],
    response_format=dataset_response_schema_gpt,
    prompt_name="GPT_FDR_FewShot_Syn",
)

In [None]:
# union dataframes
for pmc_link in ['https://pmc.ncbi.nlm.nih.gov/articles/PMC8637040','https://pmc.ncbi.nlm.nih.gov/articles/PMC10092953']:
    final_df = pd.concat([res_df, new_datasets_append[pmc_link]], ignore_index=True)

In [None]:
# Save the merged results without the index column. This replaces the CSV
# written earlier from `res_df` alone, since both use the same path.
final_df.to_csv('scripts/output/combined_results.csv', index=False)

In [None]:
# Disabled cell (triple-quoted string; evaluated only as a string expression).
# If re-enabled it would read a batch error file into `error_lines`.
# NOTE(review): the path is absolute and specific to one user's machine.
'''with open('/Users/pietro/Downloads/batch_68ed8a97bf98819090f2cb62841f0219_error.jsonl') as f:
    error_lines = f.readlines()'''

In [None]:
# Load the SAGE ground-truth table that predictions are evaluated against below.
gt = pd.read_csv('scripts/output/gold/SAGE_groundtruth.csv')

In [None]:
# Reload the predictions from the saved CSV, rebinding `res_df`, so evaluation
# runs on what is on disk.
# The commented-out line is an alternative input file kept by the author.
#res_df = pd.read_csv('scripts/output/syn_data_results/result_gemini-2.5-flash_full-input.csv')
res_df = pd.read_csv('scripts/output/combined_results.csv')

In [None]:
# Compare the predictions in `res_df` with the ground truth in `gt`.
# Presumably the two text files receive the false positives and false
# negatives respectively — confirm against scripts.experiment_utils.
evaluate_performance(
    res_df,
    gt,
    dg,
    'scripts/output/false_positives.txt',
    repo_return=True,
    false_negatives_file='scripts/output/false_negatives.txt',
)

In [None]:
# Count the predicted rows whose repository is synapse.org.
synapse_mask = res_df['data_repository'].eq('synapse.org')
len(res_df.loc[synapse_mask])