# Changeable settings

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file (when python-dotenv finds one)
# into os.environ. Later cells read their settings through os.getenv(...),
# e.g. Save_FAISS_Embedding_Path, the UMAP_* values and the API keys, so this
# cell has to run before them.
load_dotenv()

# --- Values a user is expected to edit ---------------------------------------
# Search expression for the S1 download step and the date window (YYYY/MM/DD).
user_query = '"Artificial Intelligence"[Mesh]'
user_start_date, user_end_date = "2000/01/01", "2025/03/01"

# Experiment name; presumably used to name this run's files (it is not
# referenced by the cells shown here).
user_unique_experiment_name_for_files = "AI-in-Med-2025"


# To keep the data somewhere other than the default "pubmed_data" folder, set
# CACHE_DIRECTORY here before running the next cell:
# import os
# os.environ["CACHE_DIRECTORY"] = "some/folder/you/like"


In [None]:
import os
# Resolve CACHE_DIRECTORY to an absolute path:
#   * unset          -> "<cwd>/pubmed_data" (and tell the user which path was picked)
#   * relative value -> anchored at the current working directory
#   * absolute value -> left untouched
if "CACHE_DIRECTORY" in os.environ:
    if not os.path.isabs(os.environ["CACHE_DIRECTORY"]):
        os.environ["CACHE_DIRECTORY"] = os.path.join(os.getcwd(), os.environ["CACHE_DIRECTORY"])
else:
    os.environ["CACHE_DIRECTORY"] = os.path.join(os.getcwd(), "pubmed_data")
    print(f"CACHE_DIRECTORY was not set, using default path {os.environ['CACHE_DIRECTORY']}")

# The directory must exist before any stage writes into it.
os.makedirs(os.environ["CACHE_DIRECTORY"], exist_ok=True)

# Per-stage working folders, built with os.path.join so they are portable
# across operating systems.
S2_folder_path, S3_folder_path = (
    os.path.join(os.environ["CACHE_DIRECTORY"], stage_dir)
    for stage_dir in ("S2_output", "S3_output")
)

# Code

## S1: Retrieving articles

In [None]:
from Code import S1_DownloadPubmed_main
import os

# The download call is left disabled; uncomment it to retrieve the articles for
# user_query between user_start_date and user_end_date.
# NOTE(review): presumably disabled because the data is already in the cache
# directory — confirm before relying on a fresh "Run All".
# S1_DownloadPubmed_main(query=user_query, start_date=user_start_date, end_date=user_end_date)

## S2: Cleaning XML

In [None]:
from Code import create_and_copy_folder
import os

# Build the S2 working folder as a copy of the original experiment folder
# (CACHE_DIRECTORY).
# NOTE(review): S2_folder_path is located inside CACHE_DIRECTORY — confirm that
# create_and_copy_folder does not copy the destination into itself on re-runs.
create_and_copy_folder(
    source_name=os.environ["CACHE_DIRECTORY"],
    destination_folder=S2_folder_path,
)


In [None]:
from Code import S2_Cleaner_processor_main


# Run the S2 cleaner over the copied data in S2_folder_path.
# NOTE(review): combine_all=True presumably merges the cleaned output into a
# single file — confirm against S2_Cleaner_processor_main.
S2_Cleaner_processor_main(
    data_dir=S2_folder_path,
    combine_all=True,
)

In [None]:
from Code import S2_prepare_and_label_main
import os

# Prepare and label the S2 files selected by the "cleaned_pubmed" start-string
# filter. The prefix is left empty, which means the output overwrites the input.
S2_prepare_and_label_main(
    folder_path=S2_folder_path,
    filter_startstring="cleaned_pubmed",
    add_string_at_beginning="",
)

## extraS2: Embedding

In [None]:
# Test run: embed a small sample folder instead of the full S2 output.
import os

# The sample folder is resolved at run time instead of being pinned to one
# developer's absolute Windows path: set TEST_S2_FOLDER_PATH to point at it, or
# leave it unset to use "<cwd>/pubmed_data_test" (the same working-directory
# convention used for the CACHE_DIRECTORY default).
test_S2_folder_path = os.getenv("TEST_S2_FOLDER_PATH") or os.path.join(os.getcwd(), "pubmed_data_test")

######################
####   CPU or MAC ####
######################
# from Code import extraS2_Embedding_processor_main

# extraS2_Embedding_processor_main(folder_path = test_S2_folder_path,
#                                  filter_startstring="cleaned_pubmed",save_embedding_path=os.getenv("Save_FAISS_Embedding_Path"), batch_size=50, save_format="faiss",
#                                  model_name=os.getenv("HF_ST_model_for_clustering")
#                                  )

#########################
#####   GPU & cuda ######
#########################
# If you have a GPU and want to run with CUDA, follow the instructions in
# Code/requirements_venv_torch.txt to create venv_torch and install torch with GPU support.

from Code import extraS2_Embedding_processor_subprocess_main

# NOTE(review): this writes to the same Save_FAISS_Embedding_Path as the main
# embedding cell below, so whichever cell runs last determines the saved index.
extraS2_Embedding_processor_subprocess_main(
    folder_path=test_S2_folder_path,
    filter_startstring="cleaned_pubmed",
    save_embedding_path=os.getenv("Save_FAISS_Embedding_Path"),
    batch_size=50,
    save_format="faiss",
    model_name=os.getenv("HF_ST_model_for_clustering"),
)

In [None]:
######################
####   CPU or MAC ####
######################
import os

from Code import extraS2_Embedding_processor_main

# Honour the embedding model configured in HF_ST_model_for_clustering, as the
# test embedding cell already does. model_name is only passed when the variable
# is set, so an unconfigured environment keeps using the processor's own default.
embedding_model_kwargs = {}
if os.getenv("HF_ST_model_for_clustering"):
    embedding_model_kwargs["model_name"] = os.getenv("HF_ST_model_for_clustering")

extraS2_Embedding_processor_main(
    folder_path=S2_folder_path,
    filter_startstring="cleaned_pubmed",
    save_embedding_path=os.getenv("Save_FAISS_Embedding_Path"),
    batch_size=50,
    save_format="faiss",
    **embedding_model_kwargs,
)

#########################
#####   GPU & cuda ######
#########################
# If you have a GPU and want to run with CUDA, follow the instructions in
# Code/requirements_venv_torch.txt to create venv_torch and install torch with GPU support.

# from Code import extraS2_Embedding_processor_subprocess_main

# extraS2_Embedding_processor_subprocess_main(
#     folder_path = S2_folder_path,
#     filter_startstring="cleaned_pubmed",
#     save_embedding_path=os.getenv("Save_FAISS_Embedding_Path"),
#     batch_size=50,
#     save_format="faiss",
#     **embedding_model_kwargs)

###### Visualization

In [None]:
# The following code is for visualization of the raw embedding (before DimenReduction using UMAP).

from Code import extraS2_DimenReduction_viz_main
import os


# Plot the embeddings saved at Save_FAISS_Embedding_Path; both return values
# are kept so they can be inspected interactively afterwards.
viz_mapper, viz_embeddings = extraS2_DimenReduction_viz_main(
    os.getenv("Save_FAISS_Embedding_Path"),
    label_for_figure="beforeUMAP",
)

## extraS2: DimenReduction

In [None]:
from Code import extraS2_DimenReduction_main
import numpy as np

# Reduce the saved embeddings and write the result to
# Save_FAISS_DimenReduction_Path; every UMAP setting comes from the environment.
# NOTE(review): os.getenv returns str (or None when unset), so the UMAP_* values
# reach the callee as strings — presumably extraS2_DimenReduction_main casts
# them to int/float; confirm.
reduced_embeddings = extraS2_DimenReduction_main(
    faiss_input_path=os.getenv("Save_FAISS_Embedding_Path"),
    faiss_output_path=os.getenv("Save_FAISS_DimenReduction_Path"),
    umap_metric=os.getenv("UMAP_METRIC"),
    umap_min_dist=os.getenv("UMAP_MIN_DIST"),
    umap_n_components=os.getenv("UMAP_N_COMPONENTS"),
    umap_n_neighbors=os.getenv("UMAP_N_NEIGHBORS"),
)

###### Visualization

In [None]:
# The following code is for visualization of the reduced embeddings using UMAP.
from Code import extraS2_DimenReduction_viz_main
import os


# Same plot as before the reduction, now fed from Save_FAISS_DimenReduction_Path.
# This rebinds viz_mapper / viz_embeddings from the earlier visualization cell.
viz_mapper, viz_embeddings = extraS2_DimenReduction_viz_main(
    os.getenv("Save_FAISS_DimenReduction_Path"),
    label_for_figure="UMAP",
)


## extraS2: Clustering & Cluster Sampling

In [None]:
import os
from Code import extraS2_Clustering_main

# Cluster the reduced embeddings and store the result at Save_Clustering_Path.
# NOTE(review): the tuning arguments use HDBSCAN's parameter names and are
# presumably forwarded to it unchanged — confirm in extraS2_Clustering_main.
cluster_labels = extraS2_Clustering_main(
    # input / output locations
    faiss_input_path=os.getenv("Save_FAISS_DimenReduction_Path"),
    cluster_save_path=os.getenv("Save_Clustering_Path"),
    # clustering hyper-parameters
    min_cluster_size=50,
    min_samples=25,
    cluster_selection_epsilon=0.4,
    cluster_selection_method="leaf",
    metric="euclidean",
)


###### Visualization

In [None]:
from Code import extraS2_Clustering_report

# Report on the saved clustering, using the reduced embeddings as input and
# Save_Cluster_Centers_Path as the location for the cluster centres.
extraS2_Clustering_report(
    cluster_save_path=os.getenv("Save_Clustering_Path"),
    embeddings_faiss_path=os.getenv("Save_FAISS_DimenReduction_Path"),
    cluster_centers_save_path=os.getenv("Save_Cluster_Centers_Path"),
)

## S3: LLM-based labeling

In [None]:
from Code import S3_EXCT_processor_main
# All EXCT_main parameters other than the file paths, gathered in one mapping.
# json_file_path and output_file_path are intentionally not part of it.
exct_params = dict(
    # what to extract, and from which field
    text_key="abstract",
    Pydantic_Objects_List=[],  # your pydantic models
    path_to_list=None,
    # model and generation settings
    model_engine="OpenAI_Async",
    parser_error_handling="llm_to_correct",
    model="gpt-3.5-turbo",
    pre_prompt="",
    temperature=0,
    max_tokens=2048,
    logprobs=False,
    seed=None,
    timeout=60,
    max_retries=2,
    # credentials and endpoints, read from the environment (None when unset)
    openai_api_key=os.getenv("OPENAI_COMPATIBLE_API_KEY"),
    runpod_base_url=os.getenv("OPENAI_COMPATIBLE_BASE_URL"),
    runpod_api=os.getenv("RUNPOD_API"),
    azure_api_key=os.getenv("AZURE_API_KEY"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    azure_api_version=os.getenv("AZURE_API_VERSION"),
    # async request setting
    total_async_n=5,
)

# Input selection for the S3 extraction run.
# NOTE(review): folder_to_process is still a placeholder path, and the
# S3_folder_path defined alongside the cache paths is never used — point this at
# the real folder before running.
folder_to_process = r"C:\path\to\folder"
# filter_str: e.g. only process JSON files that start with "processed_"
filter_str, prefix_str = "processed_", "extracted_"

S3_EXCT_processor_main(
    folder_path=folder_to_process,
    filter_startstring=filter_str,
    add_string_at_beginning=prefix_str,
    EXCT_main_kwargs_dictionary=exct_params,
)