In [None]:
import os, re
from data_gatherer.data_gatherer import DataGatherer
import pandas as pd
from data_gatherer.llm.response_schema import *
from data_gatherer.parser.xml_parser import XMLParser
from scripts.experiment_utils import evaluate_performance
import json

In [None]:
# Text file listing the articles to process; it is read line by line in the next cell.
input_file = "scripts/exp_input/REV.txt"

In [None]:
# Experiment settings for this run.
model_name = "gpt-5-mini"  # "gemini-2.0-flash" or "gpt-4o-mini"
prompt = "GPT_FewShot"  # "GPT_from_full_input_Examples" or "GPT_FewShot"
FDR = False  # forwarded below as process_entire_document / full_document_read
semantic_retrieval = False  # forwarded to the batch and fallback calls
brute_force_RegEx_ID_ptrs = False  # NOTE(review): not referenced by any other visible cell - confirm it is still needed
section_filter= "supplementary_material"  # forwarded to the batch and fallback calls
top_k = 0  # forwarded to the batch and fallback calls; also embedded in the two paths below

# File locations derived from top_k.
batch_file_path=f'scripts/tmp/batch_requests_openai_RTR-{top_k}_DataRef-REV.jsonl'
ret_file=f'scripts/output/semantic_search/resp_RTR_{top_k}.csv'  # NOTE: reassigned to a fixed path in a later cell


In [None]:
# Read the article list from `input_file`, one entry per line.
# Surrounding whitespace is stripped and blank lines are dropped so that an
# empty string is never passed on as an article URL; the file is decoded as
# UTF-8 explicitly instead of relying on the platform default.
with open(input_file, 'r', encoding='utf-8') as f:
    pmcids = [line.strip() for line in f if line.strip()]

print("Number of PMCIDs:", len(pmcids))

In [None]:
# Start with every PMCID counted as missing. No subsetting is applied here;
# slice `pmcids` (e.g. pmcids[:10]) for a quick test run.
# NOTE: this is the same list object as `pmcids`, not a copy.
missing_urls = pmcids

In [None]:
# Gatherer instance shared by all later cells.
dg = DataGatherer(
    llm_name=model_name,
    process_entire_document=FDR,
    log_level='INFO',
    driver_path=None,
    # general cache flags: both off
    load_from_cache=False,
    save_to_cache=False,
    # embeddings cache flags: both on
    embeds_cache_write=True,
    embeds_cache_read=True,
)  # optional extra argument: save_dynamic_prompts=True

OpenAI Batch API guide, section "Preparing your batch file": https://platform.openai.com/docs/guides/batch#1-preparing-your-batch-file

Portkey documentation on Bedrock batches: https://portkey.ai/docs/integrations/llms/bedrock/batches

In [None]:
# Display the batch request file path set in the configuration cell.
batch_file_path

In [None]:
# Prepare the OpenAI batch requests for every entry in `pmcids`.
# The prompt template is taken from the `prompt` setting of the configuration
# cell rather than from a repeated string literal, so editing that setting takes
# effect here. `prompt` must still hold the configured template name when this
# cell runs (true for a top-to-bottom run).
# Nothing is submitted or awaited at this point: both flags below are False.
batch_input_openai = dg.run_integrated_batch_processing(
    url_list=pmcids,
    batch_file_path=batch_file_path,
    api_provider='openai',  # Uses OpenAI batch API for 50% cost discount
    prompt_name=prompt,
    response_format=dataset_response_schema_gpt,
    semantic_retrieval=semantic_retrieval,
    section_filter=section_filter,
    top_k=top_k,
    submit_immediately=False,
    wait_for_completion=False,  # Set to True if you want to wait for results
    batch_description="DATA-REF REV retrieval with OpenAI API",
)

In [None]:
# Parse a batch request file (JSONL) into a list of request dicts, ignoring blank lines.
# NOTE(review): this path is fixed to the RTR-1 file and does not follow
# `batch_file_path` / `top_k` - confirm that is intended.
prompts_filepath = 'scripts/tmp/batch_requests_openai_RTR-1_DataRef-REV.jsonl'
prompts_load = []
with open(prompts_filepath, 'r', encoding='utf-8') as jsonl_file:
    prompts_load.extend(
        json.loads(raw_line) for raw_line in jsonl_file if raw_line.strip()
    )

In [None]:
# Ground-truth table read from the gold output folder; the coverage check below
# uses its 'pmcid' and 'identifier' columns.
df_gt = pd.read_parquet("scripts/output/gold/dataset_citation_records_Table.parquet")

In [None]:
# Prompt-coverage check: for each batch request, which fraction of the article's
# ground-truth identifiers appears (case-insensitively) in the prompt text?
# `found_avg` ends up as the mean of those per-article fractions.
n_dfs = len(prompts_load)
coverage_per_article = []

# The loop variable is deliberately not called `prompt`, and the filtered rows are
# not stored in `gt`: both names are used for other things elsewhere in the notebook
# (the prompt-template setting and the ground-truth table) and must not be clobbered.
for request in prompts_load:
    pmc_id = dg.data_fetcher.url_to_article_id(request['custom_id'])
    datasets_gt = df_gt.loc[df_gt['pmcid'] == pmc_id, 'identifier'].tolist()

    # Join all message contents and lower-case once per article,
    # instead of once per identifier.
    prompt_text = "\n".join(item['content'] for item in request['body']['input']).lower()

    missing_datasets = [ds for ds in datasets_gt if ds.lower() not in prompt_text]
    for ds in missing_datasets:
        print(f"Missing dataset {ds} in prompt for pmcid {pmc_id}")

    datasets_tot = len(datasets_gt)
    datasets_found = datasets_tot - len(missing_datasets)
    # An article without ground-truth identifiers counts as fully covered.
    coverage_per_article.append(datasets_found / datasets_tot if datasets_tot > 0 else 1.0)

# With no requests at all the average stays at zero.
found_avg = sum(coverage_per_article) / n_dfs if n_dfs else 0.0

In [None]:
# Average, over all loaded requests, of the fraction of ground-truth identifiers found in the prompt.
found_avg

In [None]:
# Simple chunking and submission - NO monitoring or result combination
result = dg.split_jsonl_and_submit(
    batch_file_path=batch_file_path,
    max_file_size_mb=200.0,
    api_provider='openai',
    wait_between_submissions=30,
    batch_description=f"Chunked RTR DataRef-REV batch processing"
)

In [None]:
# Disabled snippet: the code below sits inside a string literal, so it is not executed.
# It shows how individual chunk files were submitted one by one through
# dg.parser.llm_client.submit_batch_job.
'''for file_path in ['scripts/tmp/batch_requests_openai_all_SAGE_chunk_002.jsonl','scripts/tmp/batch_requests_openai_all_SAGE_chunk_003.jsonl']:
    dg.parser.llm_client.submit_batch_job(
                        file_path, 
                        api_provider='openai',
                        batch_description= f'file_path: {file_path}'
    )'''

In [None]:
# Hard-coded batch job identifiers.
# NOTE(review): presumably returned by earlier submissions - confirm before reuse.
batch_id = 'batch_68efcceeb7448190ae19cc890a87aef8'
batch_id_1 = 'batch_68efcf6f0c0081909cce95c57d4aa55d'
batch_id_2 = 'batch_68efcf7e0dd881909f000e1634efc368'

# REV runs; the suffix presumably names the retrieval setting (9 / 5 / 1 / ptr) - TODO confirm.
batch_id_REV_RTR_9 = 'batch_6914c7be180481909752dd682ab730fb'
batch_id_REV_RTR_5 = 'batch_6914a7de95248190b2d5d444d827373f'
batch_id_REV_RTR_1 = 'batch_6914d280310c819082bed1fa3d661f22'
batch_id_REV_RTR_ptr = 'batch_6915049af5008190a695c4284413d692'

# This is the batch whose results are downloaded in the next cell.
batch_id_REV_RTR_base = 'batch_6916b7d814cc8190a4ca76844ad4f58d'

In [None]:
# Create a parser on demand; its llm_client is what talks to the batch API below.
if not dg.parser:
    dg.parser = XMLParser(
        open_data_repos_ontology="open_bio_data_repos.json",
        logger=dg.logger,
        llm_name=dg.llm,
    )

# Download the results of the "base" batch into a local JSONL file.
res = dg.parser.llm_client.download_batch_results(
    api_provider='openai',
    batch_id=batch_id_REV_RTR_base,
    output_file_path='scripts/tmp/resp_RTR_base.jsonl',
)

In [None]:
def combine_jsonl_files(result_files, combined_output):
    """Combine multiple batch result files (JSONL) into one JSONL file.

    Each input file is read line by line; surrounding whitespace is stripped and
    blank lines are skipped. Input paths that do not exist are reported with a
    warning and skipped. Progress messages are printed in plain ASCII.

    Args:
        result_files: iterable of paths to JSONL files, combined in the given order.
        combined_output: path of the combined JSONL file; it is overwritten.

    Returns:
        int: total number of lines written to ``combined_output``.
    """
    all_results = []
    for file_path in result_files:
        if not os.path.exists(file_path):
            print(f"Warning: File not found - {file_path}")
            continue
        print(f"Reading {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            file_results = [line.strip() for line in f if line.strip()]
        all_results.extend(file_results)
        print(f"  -> Added {len(file_results)} lines from {file_path}")

    # Every input is fully read before the output is opened, so the output path
    # may also appear among the inputs without being truncated first.
    total_lines = len(all_results)
    print(f"\nWriting {total_lines} total lines to {combined_output}...")
    with open(combined_output, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in all_results)

    print(f"Combined batch results saved to: {combined_output}")
    print(f"Total lines combined: {total_lines}")

    # Verify the combined file
    file_size_mb = os.path.getsize(combined_output) / 1024 / 1024
    print(f"Combined file size: {file_size_mb:.2f} MB")
    return total_lines


# Not called by default. Example:
# combine_jsonl_files(
#     ['scripts/tmp/res_1.jsonl', 'scripts/tmp/res_2.jsonl', 'scripts/tmp/res_3.jsonl'],
#     'scripts/tmp/combined_batch_results.jsonl',
# )

In [None]:
# Create the parser on demand, using the gatherer's own ontology and logger.
if dg.parser is None:
    dg.parser = XMLParser(
        dg.open_data_repos_ontology,
        dg.logger,
        llm_name=dg.llm,
    )

# Count the lines of the downloaded response file.
with open('scripts/tmp/resp_RTR_base.jsonl', 'r') as f:
    lines = list(f)
print(f"Number of lines in combined file: {len(lines)}")

In [None]:
# Replace the results CSV path from the configuration cell with the fixed "base" file, then display it.
ret_file = 'scripts/output/semantic_search/resp_RTR_base.csv'
ret_file

In [None]:
# Turn the downloaded batch response file into a DataFrame; `ret_file` is passed as the output path.
res_df = dg.from_batch_resp_file_to_df('scripts/tmp/resp_RTR_base.jsonl', output_file_path=ret_file)

In [None]:
# Read the results table from the CSV at `ret_file`.
res_df = pd.read_csv(ret_file)

In [None]:
# Find the input articles that have no row in the batch results.
# Result URLs lose the "/" matched at the end of the PMC article pattern; both sides are lower-cased.
pmcids_ret = {
    re.sub('(https://www.ncbi.nlm.nih.gov/pmc/articles/.*)/', '\\1', source_url).lower()
    for source_url in res_df['source_url'].to_list()
}
# `pmcids` is rebound here: from now on it is a set of lower-cased entries.
pmcids = {entry.lower() for entry in pmcids}
missing_urls = list(pmcids - pmcids_ret)
len(missing_urls)

In [None]:
# Process the articles missing from the batch results with the same retrieval settings.
# The prompt name is given literally, so this cell does not depend on the current
# value of the `prompt` variable.
new_datasets_append = dg.process_articles(
    missing_urls,
    prompt_name="GPT_FewShot",
    response_format=dataset_response_schema_gpt,
    full_document_read=FDR,
    semantic_retrieval=semantic_retrieval,
    section_filter=section_filter,
    top_k=top_k,
)

In [None]:
# Quick look at the fallback output: number of entries and container type.
len(new_datasets_append), type(new_datasets_append)

In [None]:
# union dataframes
for pmc_link in new_datasets_append.keys():
    final_df = pd.concat([res_df, new_datasets_append[pmc_link]], ignore_index=True)

In [None]:
# Write the combined table to `ret_file`, without the index column.
final_df.to_csv(ret_file, index=False)

In [None]:
# Disabled snippet: the code below sits inside a string literal, so it is not executed.
# It reads the lines of a batch error file.
# NOTE(review): the path is an absolute, machine-specific location.
'''with open('/Users/pietro/Downloads/batch_68ed8a97bf98819090f2cb62841f0219_error.jsonl') as f:
    error_lines = f.readlines()'''

In [None]:
# Ground-truth table used for the evaluation (same parquet file that is loaded into `df_gt`); preview 5 rows.
gt = pd.read_parquet('scripts/output/gold/dataset_citation_records_Table.parquet')
gt.head(5)

In [None]:
# Read the results CSV at `ret_file` again and preview 2 rows.
res_df = pd.read_csv(ret_file)
res_df.head(2)

In [None]:
def map_url_to_article_id(row):
    """Return the row's 'source_url' with one trailing '/' removed, if present.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a string under 'source_url'.

    Returns:
        str: the URL without its final slash; unchanged when it does not end in '/'.
        Despite the function name, no article id is extracted.
    """
    url = row['source_url']
    return url[:-1] if url.endswith('/') else url

# Apply map_url_to_article_id to every row and store the result back in 'source_url'.
res_df['source_url'] = res_df.apply(map_url_to_article_id, axis=1)

In [None]:
# Evaluate the results table against the ground truth.
# The two text-file paths are passed for the false positives / false negatives.
# `gt_base` receives the current value of `pmcids`.
evaluate_performance(
    res_df, gt, dg,
    'scripts/output/false_positives.txt',
    gt_base=pmcids,
    repo_return=True,
    false_negatives_file='scripts/output/false_negatives.txt',
)