# Changeable settings

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is found, so that
# settings such as CACHE_DIRECTORY can be supplied without editing this notebook.
load_dotenv()

# Search expression handed to the download step (commented out by default in the S1 cell).
user_query = '"Artificial Intelligence"[Mesh]'

# Date window for the search, written as YYYY/MM/DD strings.
user_start_date = "2000/01/01"
user_end_date = "2025/03/01"

# Label for this experiment's files.
# NOTE(review): not referenced by any cell visible in this notebook — confirm where it is consumed.
user_unique_experiment_name_for_files = "AI-in-Med-2025"


# To store the data somewhere other than the default location, set
# CACHE_DIRECTORY before running the next cell, for example by uncommenting:
# import os
# os.environ["CACHE_DIRECTORY"] = "some/folder/you/like"


In [None]:

# Resolve the cache root once: fall back to ./pubmed_data when the variable is
# unset, and anchor a relative value to the current working directory.
_configured_cache = os.environ.get("CACHE_DIRECTORY")
if _configured_cache is None:
    os.environ["CACHE_DIRECTORY"] = os.path.join(os.getcwd(), "pubmed_data")
    print(f"CACHE_DIRECTORY was not set, using default path {os.environ['CACHE_DIRECTORY']}")
elif not os.path.isabs(_configured_cache):
    os.environ["CACHE_DIRECTORY"] = os.path.join(os.getcwd(), _configured_cache)

# Make sure the resolved directory is present on disk before later cells use it.
_cache_root = os.environ["CACHE_DIRECTORY"]
os.makedirs(_cache_root, exist_ok=True)

# Per-stage output folders live directly under the cache root; os.path.join
# keeps the separators correct on every operating system.
S2_folder_path, S3_folder_path = (
    os.path.join(_cache_root, stage_dir) for stage_dir in ("S2_output", "S3_output")
)

# Code

## S1: Retrieving articles

In [None]:
from Code import S1_DownloadPubmed_main
import os

# The download step is disabled by default; uncomment the next line to run it:
# S1_DownloadPubmed_main(query=user_query, start_date=user_start_date, end_date=user_end_date)

## S2: Cleaning XML

In [None]:
from Code import create_and_copy_folder
import os

# Populate the S2 working folder with a copy of the cache directory.
# NOTE(review): S2_folder_path is a sub-folder of CACHE_DIRECTORY, so the
# destination sits inside the source — confirm create_and_copy_folder copes
# with that (and with repeated runs) without copying the folder into itself.
create_and_copy_folder(
    source_name=os.environ["CACHE_DIRECTORY"],
    destination_folder=S2_folder_path,
)


In [None]:
from Code import S2_Cleaner_processor_main

# Run the S2 cleaning step over the S2 working folder.
# NOTE(review): presumably this produces the "cleaned_pubmed*" files that the
# next cell filters on — confirm against Code.S2_Cleaner_processor_main.
S2_Cleaner_processor_main(data_dir=S2_folder_path)

In [None]:
from Code import S2_prepare_and_label_main
import os

# Prepare and label the files in the S2 working folder, restricted by the
# "cleaned_pubmed" name filter (presumably a file-name prefix — confirm in Code).
S2_prepare_and_label_main(
    folder_path=S2_folder_path,
    filter_startstring="cleaned_pubmed",
    add_string_at_beginning="",  # empty prefix => overwrite, per the original author's note
)

## S3: LLM-based labeling

In [None]:
from Code import S3_EXCT_processor_main
import os

# Keyword arguments forwarded to the extraction routine through
# EXCT_main_kwargs_dictionary. Credentials are read from the environment
# (populated from .env by load_dotenv() in the first cell) instead of being
# typed into the notebook, so real keys never end up in version control or in
# a shared copy. The original placeholder strings remain as fallbacks, so the
# cell behaves exactly as before when the variables are not set.
exct_params = {
    "text_key": "abstract",
    "Pydantic_Objects_List": [],  # your pydantic models — fill in before running
    "path_to_list": None,
    "model_engine": "OpenAI_Async",
    "parser_error_handling": "llm_to_correct",
    "model": "gpt-3.5-turbo",
    "pre_prompt": "",
    "temperature": 0,
    "max_tokens": 2048,
    "logprobs": False,
    "seed": None,
    "timeout": 60,
    "max_retries": 2,
    "openai_api_key": os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY"),
    "runpod_base_url": "",
    "runpod_api": "",
    "azure_api_key": os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_AZURE_API_KEY"),
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT", "YOUR_AZURE_ENDPOINT"),
    "azure_api_version": os.environ.get("OPENAI_API_VERSION", "YOUR_AZURE_API_VERSION"),
    "total_async_n": 5,
    # Note that we don't pass json_file_path or output_file_path here
}

# NOTE(review): placeholder path — point this at the folder holding the JSON
# files to label. S3_folder_path is defined in the setup cell but is not used
# here; confirm whether it was meant to be the value.
folder_to_process = r"C:\path\to\folder"
filter_str = "processed_"  # e.g., only process JSON files that start with "processed_"
prefix_str = "extracted_"  # passed as add_string_at_beginning

S3_EXCT_processor_main(
    folder_path=folder_to_process,
    filter_startstring=filter_str,
    add_string_at_beginning=prefix_str,
    EXCT_main_kwargs_dictionary=exct_params,
)