In [None]:
import pat2vec

In [None]:
import numpy as np
import random
import os
import sys
import shutil
import logging
from elasticsearch.exceptions import AuthenticationException

# Send INFO-and-above records from the root logger to stdout so that they
# show up in the notebook output next to the cell results.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout,
    level=logging.INFO,
)
logger = logging.getLogger()

# One seed drives both random number generators (stdlib and NumPy) so that
# unit-test runs of this notebook are repeatable.
random_seed_value = 42
random.seed(random_seed_value)
np.random.seed(random_seed_value)

In [None]:
# Record the directory the kernel runs from; the relative paths built later
# in this notebook are resolved against it.
logger.info(f"Current Working Directory: {os.getcwd()}")

# Record the module search path to help diagnose import problems.
logger.info(f"Python Path: {sys.path}")

In [None]:
# Start from a clean slate: delete the project folders left by earlier runs.
clear_previous_outputs = True

if clear_previous_outputs:
    # ignore_errors=True makes a missing folder a no-op, so a first run is fine.
    for _stale_dir in ("new_project", "new_project_ipw", "treatment_doc_extract"):
        shutil.rmtree(_stale_dir, ignore_errors=True)

In [None]:
# Resolve the external resources (MedCAT model pack, SNOMED CT release file
# and the pat2vec folder) relative to the directory the notebook runs from.
current_dir = os.getcwd()

# Both data resources sit two levels above the working directory.
_two_levels_up = os.path.join(current_dir, "..", "..")

path_to_medcat_model_pack = os.path.abspath(
    os.path.join(
        _two_levels_up, "medcat_models", "medcat_model_pack_422d1d38fc58f158.zip"
    )
)

# The release folder name appears twice in the extracted archive layout.
_snomed_release = "SnomedCT_InternationalRF2_PRODUCTION_20231101T120000Z"
path_to_snomed_ct_file = os.path.abspath(
    os.path.join(
        _two_levels_up,
        "snomed",
        _snomed_release,
        _snomed_release,
        "Full",
        "Terminology",
        "sct2_StatedRelationship_Full_INT_20231101.txt",
    )
)

# Relative location of the shared "global files" folder.
path_to_gloabl_files = "../../"

# Absolute location of the pat2vec folder inside the global files folder.
additional_path_to_pat2vec = os.path.abspath(
    os.path.join(path_to_gloabl_files, "pat2vec")
)

# Absolute form of the global files folder.
absolute_path = os.path.abspath(os.path.join(current_dir, path_to_gloabl_files))

# Log the resolved locations so that a wrong path is visible before anything
# tries to load from it.
logger.info(f"Path to medcat model pack: {path_to_medcat_model_pack}")
logger.info(f"Path to SNOMED CT file: {path_to_snomed_ct_file}")
logger.info(f"Path to global files: {path_to_gloabl_files}")
logger.info(f"Additional path to pat2vec: {additional_path_to_pat2vec}")

In [None]:
# Make the shared "global files" folder and the pat2vec folder importable.
# Entries are added as absolute paths (a relative entry such as "../../" stops
# resolving as soon as the working directory changes) and only when missing,
# so re-running this cell does not pile up duplicate sys.path entries.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)

# Inserted in this order so that the pat2vec folder ends up ahead of the
# global files folder, i.e. it is searched first.
for _priority_path in (
    os.path.abspath(path_to_gloabl_files),
    additional_path_to_pat2vec,
):
    if _priority_path not in sys.path:
        sys.path.insert(0, _priority_path)

# Fallback locations: the parent and grandparent of the working directory.
for _fallback_path in (parent_dir, grandparent_dir):
    if _fallback_path not in sys.path:
        sys.path.append(_fallback_path)

### Set up logger

In [None]:
from pat2vec.util.logger_setup import setup_logger

# Rebind `logger` to the logger returned by pat2vec's own setup helper; from
# here on the notebook logs through it instead of the basic logger created in
# the import cell.
logger = setup_logger()

# Get treatment_docs 

In [None]:
from pat2vec.util.config_pat2vec import config_class

# Configuration used for the treatment-document extraction step.
config_obj = config_class(
    proj_name="treatment_doc_extract",
    testing=True,  # Testing mode: dummy data is used instead of a live source.
    verbosity=0,
    # MedCAT is switched off for this step; the model pack path is still passed
    # so the config knows where the pack lives (global_files/medcat_models/).
    medcat=False,
    override_medcat_model_path=path_to_medcat_model_pack,
    # Data is extracted between the global start and end dates below.
    global_start_year=1995,
    global_start_month=1,
    global_start_day=1,
    global_end_year=2024,
    global_end_month=12,
    global_end_day=31,
    lookback=False,  # True would look back in time from the start date instead.
)

In [None]:
from pat2vec.main_pat2vec import main

# Build the pat2vec object for the extraction project configured in the
# previous cell, reusing the notebook-wide random seed.
pat2vec_obj = main(
    cogstack=True,
    use_filter=False,
    json_filter_path=None,
    random_seed_val=random_seed_value,
    hostname=None,
    config_obj=config_obj,
)  # initialize the pat2vec object

In [None]:
# The credential check is skipped in testing mode. Otherwise the cluster is
# queried once and any problem is logged rather than raised, so the notebook
# keeps running and the user is pointed at their credentials.
if pat2vec_obj.config_obj.testing:
    logger.info("Testing mode is enabled, skipping authentication check.")
else:
    try:
        pat2vec_obj.cs.elastic.info()
        logger.info("Elasticsearch authentication successful.")
    except AuthenticationException as e:
        # The exception's `info` payload is not guaranteed to be the nested
        # {"error": {"reason": ...}} mapping (it can be a plain string or lack
        # those keys). Fall back to the exception text so that this handler
        # cannot itself raise while reporting the failure.
        try:
            reason = e.info["error"]["reason"]
        except (AttributeError, KeyError, IndexError, TypeError):
            reason = str(e)
        logger.error(f"Authentication failed: {reason}")
        logger.warning(
            "Please check your Elasticsearch credentials in the configuration file."
        )
    except Exception as e:
        logger.error(
            f"An error occurred while checking Elasticsearch authentication: {e}"
        )

In [None]:
# Optional example (disabled by default): expand a SNOMED CT root concept into
# related codes, once via the SNOMED relationship file and once via the MedCAT
# concept database. Needs the snomed_methods package, the SNOMED RF2 file and
# the MedCAT model pack resolved earlier in the notebook.
snomed_example = False

if snomed_example:

    from snomed_methods import snomed_methods_v1

    path_to_sct2 = path_to_snomed_ct_file

    medcat_path = path_to_medcat_model_pack

    snomed_relations_obj = snomed_methods_v1.snomed_relations(
        medcat=True, snomed_rf2_full_path=path_to_sct2, medcat_path=medcat_path
    )

    outcome_variable_cui_for_filter = "109989006"  # myeloma

    logger.info(f"Outcome variable CUI for filter: {outcome_variable_cui_for_filter}")

    filter_root_cui = outcome_variable_cui_for_filter
    logger.info(f"Filter root CUI: {filter_root_cui}")

    # Expand the root concept through the SNOMED relationships (n_recursion=3).
    retrieved_codes_snomed_tree, retrieved_names_snomed_tree = (
        snomed_relations_obj.recursive_code_expansion(
            filter_root_cui, n_recursion=3, debug=False
        )
    )

    logger.info(
        f"Retrieved codes (first 5): {retrieved_codes_snomed_tree[0:5]}, Total codes: {len(retrieved_codes_snomed_tree)}, Total names: {len(retrieved_names_snomed_tree)}"
    )

    # Ask the MedCAT concept database for the 50 most similar concepts.
    retrieved_codes_medcat_cdb, retrieved_names_medcat_cdb = (
        snomed_relations_obj.get_medcat_cdb_most_similar(
            filter_root_cui, context_type="xxxlong", type_id_filter=[], topn=50
        )
    )

In [None]:
# Terms to search the document indices for; append further terms as needed.
term_list = ["myeloma"]

In [None]:
from pat2vec.util.pre_processing import (
    get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy,
)

# Example: build a patient cohort from the presence of terms in their clinical documents.

# The documents are extracted across the textual document sources with fuzzy string matching.

treatment_docs = get_treatment_docs_by_iterative_multi_term_cohort_searcher_no_terms_fuzzy(
    pat2vec_obj=pat2vec_obj,
    term_list=term_list,  # List of terms to search for
    overwrite=True,  # Overwrite an existing treatment_docs.csv (otherwise results are appended)
    append=False,  # Append results to an existing treatment_docs.csv
    verbose=9,  # Adjust verbosity for logging
    mct=True,  # Include clinical notes text sources; this searches an additional document index
    textual_obs=True,  # Include observations index text sources; this searches an additional document index
    additional_filters=None,  # Additional search filters, such as document type
    all_fields=False,  # Return all fields from the indices instead of just a typical subset
)

# Display the retrieved documents.
treatment_docs

In [None]:
# Example get cohort by drug treatment for cross reference etc

# Example, I want to get a cohort of patients who have drug orders to check against their diagnosis status from the previous step.

from pat2vec.util.pre_get_drug_treatment_docs import iterative_drug_treatment_search
import pandas as pd

# Optional example (disabled by default): retrieve drug order records for a
# list of drug names and load the resulting CSV.
retrieve_cohort_by_drug_treatment = False

if retrieve_cohort_by_drug_treatment:

    # Drug names to search for ("aspirin" was previously misspelled "asprin").
    search_terms_list = ["aspirin", "ibuprofen", "Emtricitabine", "Mepacrine"]
    output_file_path = "drug_treatment_records.csv"

    iterative_drug_treatment_search(
        pat2vec_obj=pat2vec_obj,
        search_terms=search_terms_list,
        output_file_path=output_file_path,
        verbose=5,  # Adjust verbosity for logging
        drop_duplicates=True,  # Search terms can produce duplicates, remove by order guid.
        overwrite=True,  # Overwrite initial output file
    )

    # Load the csv file
    df_drug_treatment_cohort = pd.read_csv(output_file_path)
    # NOTE(review): a bare expression nested inside `if` is not rendered by the
    # notebook; inspect df_drug_treatment_cohort in its own cell to view it.
    df_drug_treatment_cohort

In [None]:
from pat2vec.util.config_pat2vec import config_class
from datetime import datetime
from tqdm import tqdm
from pat2vec.util.post_processing_process_csv_files import process_csv_files
from pat2vec.util.post_processing import extract_datetime_to_column
from dateutil.relativedelta import relativedelta
import pandas as pd
from typing import Dict, List, Optional, Union

# Feature switches for pat2vec: each key turns one data source on or off.
main_options_dict = dict(
    demo=True,  # Demographics (ethnicity mapped to UK census categories, age, death)
    bmi=True,  # BMI (Body Mass Index)
    bloods=True,  # Blood-related information
    drugs=True,  # Drug-related information
    diagnostics=True,  # Diagnostic information
    core_02=True,  # core_02 information
    bed=True,  # Bed information
    vte_status=True,  # VTE status information
    hosp_site=True,  # Hospital site information
    core_resus=True,  # Core resuscitation information
    news=True,  # NEWS (National Early Warning Score)
    smoking=True,  # Smoking-related information
    annotations=True,  # EPR document annotations via MedCAT
    annotations_mrc=True,  # MRC (additional clinical note observations index) annotations via MedCAT
    negated_presence_annotations=False,  # Negated presence annotations
    appointments=False,  # Appointments information
    annotations_reports=False,  # Reports information
    textual_obs=False,  # Textual observations (basic_observations index) annotations via MedCAT
)

# Annotation filter: only base annotations that meet these thresholds are kept.
annot_filter_arguments = dict(
    acc=0.8,  # Minimum base concept accuracy
    # UMLS semantic types accepted by the MedCAT filter. Full list for reference:
    # 'types': ['qualifier value', 'procedure', 'substance', 'finding', 'environment', 'disorder', 'observable entity', 'organism', 'phenomenon', 'anatomy', 'conceptual entity', 'physical object', 'intellectual product', 'occupation or discipline', 'mental or behavioral dysfunction', 'geographic area', 'population group', 'biomedical or dental material', 'medical device', 'classification', 'regulation or law', 'health care activity', 'health care related organization', 'professional or occupational group', 'group', 'attribute', 'individual behavior']
    types=[
        "qualifier value",
        "procedure",
        "substance",
        "finding",
        "environment",
        "disorder",
        "observable entity",
    ],
    # Accepted values must be defined in the MedCAT model,
    # e.g. ['Recent', 'Past', 'Subject/Experiencer'].
    Time_Value=["Recent", "Past"],
    Time_Confidence=0.8,  # Confidence threshold (float)
    Presence_Value=["True"],  # Accepted presence values
    Presence_Confidence=0.8,  # Confidence threshold (float)
    Subject_Value=["Patient"],  # Accepted subject values
    Subject_Confidence=0.8,  # Confidence threshold (float)
)

# Optional filters applied to data batches before processing.
# `Any` is imported here because this cell's typing import does not include it;
# the original annotation used the builtin function `any` by mistake.
from typing import Any

# Regular expressions used to filter EPR / MCT documents by term (None = no filter).
epr_docs_term_regex: Optional[str] = None
mct_docs_term_regex: Optional[str] = None

# Example: bloods_filter_term_list = ['wbc']
# This will only include basic observations with this item name analysed.
bloods_filter_term_list: Optional[List[str]] = None

# Example: mct_docs_document_type_filter_list = ['KHMDC Integrated report']
# This will only include documents with this document type field value.
mct_docs_document_type_filter_list: Optional[List[str]] = None
epr_docs_document_type_filter_list: Optional[List[str]] = None

data_type_filter_dict: Dict[str, Any] = {
    "filter_term_lists": {
        "epr_docs": epr_docs_document_type_filter_list,
        "mct_docs": mct_docs_document_type_filter_list,
        "bloods": bloods_filter_term_list,
    },
    "epr_docs_term_regex": epr_docs_term_regex,
    "mct_docs_term_regex": mct_docs_term_regex,
}

# Example date settings:
# start_date=(datetime(2020, 1, 1)) Start date for processing

# Define the length of the time window, example 1 year and 15 days, only data within this window will be processed.
# years=1,      # Number of years to add to the start date
# months=0,  # Number of months to add to the start date
# days=15,  # Number of days to add to the start date

# Define the interval between time windows. Example 1 year. Each vector/row output will be based on this interval.
# time_window_interval_delta = relativedelta(years=1)

# lookback = True #This determines the direction of the time length window. True = backward, False = forward. Our time window (+1 years, 15 days) is therefore 2020, 1, 1 - 2021, 1, 15.

# IPW settings:

# Init config obj

# Hypothetical date config_obj configuration:
# I want all patients data between Feb 2015 and Jul 2020. This date window will extract and create the batched patient data for this time window.

# global_start_year=2015,
# global_start_month=2,
# global_end_year=2020,
# global_end_month=6,
# global_start_day = 1,
# global_end_day = 1,

# I want patient vectors starting from Feb 2019 to Feb 2020 as I would like to see if X medical event is recorded on those taking medication Y
# start_date=(datetime(2019, 2, 1)),
# years=1,
# months=0,
# days=0,
# lookback = False # 2019 to 2020 is forward in time.
# I would like a single vector for each patient
# time_window_interval_delta = relativedelta(years=1)
# I would like 1 vector per month per patient for the 1 year time window
# time_window_interval_delta = relativedelta(months=1)

# Configuration for the main vector-building run ("new_project").
# Keyword arguments are grouped by purpose; their order has no effect.
config_obj = config_class(
    # --- Project and input files ---
    proj_name="new_project",  # Patient data batches and vectors are stored under this name.
    current_path_dir="",  # Current path directory
    suffix="",  # Suffix for file names
    treatment_doc_filename="test_files/treatment_docs.csv",  # Treatment documents file
    test_data_path=None,  # None with testing=True uses 'test_files/treatment_docs.csv'
    sample_treatment_docs=5,  # If int > 0, sample this many treatment documents (pilot/debug runs)
    patient_id_column_name="auto",  # "auto" tries to detect the column, e.g. "client_idcode"
    # --- Run mode ---
    testing=True,  # Testing mode; dummy data is used.
    dummy_medcat_model=True,  # Simulate a MedCAT model when testing is True.
    medcat=False,  # True loads MedCAT into memory and uses it for annotating.
    override_medcat_model_path=path_to_medcat_model_pack,  # Forced model pack path; None uses the environment defaults (paths.py, medcat_path).
    batch_mode=True,  # Batch processing mode; the only functioning mode.
    multi_process=False,  # Deprecated.
    strip_list=True,  # Skip patients that are already completed.
    prefetch_pat_batches=False,  # Pre-populate batch folders for the whole patient list; can run out of memory.
    shuffle_pat_list=False,  # Shuffle the patient list.
    verbosity=0,  # Debug message level, 0-9.
    random_seed_val=random_seed_value,  # Seed for reproducible control selection.
    start_time=datetime.now(),  # Start timestamp for logging and the progress bar.
    # --- Partially deprecated flags ---
    remote_dump=False,  # Remote data dumping.
    store_annot=True,  # Store annotations.
    share_sftp=True,  # Share via SFTP.
    # --- Controls ---
    use_controls=False,  # True adds controls at random from a global pool; needs a master patient list.
    treatment_control_ratio_n=1,  # Treatment-to-control ratio.
    # --- Features and filtering ---
    main_options=main_options_dict,  # Feature switches defined above.
    annot_filter_options=annot_filter_arguments,  # Annotation filter thresholds defined above.
    data_type_filter_dict=None,  # Optional data type filter; see data_type_filter_dict above for the shape.
    add_icd10=False,  # Append ICD-10 codes to annotation batches (current_pat_documents_annotations/%client_idcode%.csv).
    add_opc4s=False,  # Needs add_icd10=True as well; written to the same file.
    split_clinical_notes=True,  # Split clinical notes by date into individual documents; needs the note splitter module.
    filter_split_notes=True,  # Reapply the global time window after splitting; recommended with split notes.
    # --- Patient time window: start_date plus years/months/days ---
    start_date=datetime(1995, 1, 1),
    years=30,
    months=0,
    days=0,
    lookback=False,  # True computes the window backwards in time, False forwards.
    time_window_interval_delta=relativedelta(years=31),  # Span collapsed into each feature vector; years=1 gives one vector per year.
    # --- Global limits that data can be drawn from ---
    # The start should not precede start_date, and must lie before the end.
    # Individual patient windows, when used, overwrite these per patient.
    global_start_year=1995,
    global_start_month=1,
    global_start_day=1,
    global_end_year=2025,
    global_end_month=1,
    global_end_day=1,
    ## Use these if each patient has their own individual time window. Requires preparing a table of start dates.
    # individual_patient_window = True,
    # individual_patient_window_df = pd.read_csv('ipw_overlap.csv'),
    # individual_patient_window_start_column_name = 'updatetime_manual_offset',
    # individual_patient_id_column_name = 'client_idcode',
    # individual_patient_window_controls_method = 'full',
)

In [None]:
from pat2vec.main_pat2vec import main

In [None]:
# Rebuild the pat2vec object for the "new_project" configuration. The seed is
# taken from random_seed_value (rather than a literal 42) so this call stays in
# step with the seed used everywhere else in the notebook.
pat2vec_obj = main(
    cogstack=True,
    use_filter=False,
    json_filter_path=None,
    random_seed_val=random_seed_value,
    hostname=None,
    config_obj=config_obj,
)

View patient list

In [None]:
# Show the first eight patient identifiers only, to keep the output short.
pat2vec_obj.all_patient_list[0:8]

In [None]:
# Show the date list held on the configuration object.
pat2vec_obj.config_obj.date_list

Make pat vectors for pat 0

In [None]:
# Process a single patient (index 0 of all_patient_list) as a smoke test
# before looping over the whole list.
pat2vec_obj.pat_maker(0)

In [None]:
# Helper that removes a specific patient's raw documents and annotations.
# It is imported here because the retry loop below calls it.
from pat2vec.util.post_processing import remove_file_from_paths

# Example call for patient index i:
# remove_file_from_paths(pat2vec_obj.all_patient_list[i])

In [None]:
# Maximum number of attempts per patient when pat_maker raises a KeyError.
MAX_RETRIES = 3

# Process every patient in the list, starting from index 0.
for i in tqdm(range(0, len(pat2vec_obj.all_patient_list))):
    retries = 0
    success = False

    while retries < MAX_RETRIES and not success:
        try:
            pat2vec_obj.pat_maker(i)
            success = True  # No exception raised: this patient is done.

        except KeyError as e:
            # Treated as retryable: remove this patient's stored files and
            # try again.
            logger.warning(
                f"KeyError at index {i} for patient {pat2vec_obj.all_patient_list[i]}: {e}. Retrying after removal..."
            )
            remove_file_from_paths(pat2vec_obj.all_patient_list[i])
            retries += 1

        except Exception as e:
            # Anything else is treated as non-retryable: log and move on.
            logger.error(
                f"Exception at index {i} for patient {pat2vec_obj.all_patient_list[i]}: {e}. Skipping this patient..."
            )
            break

    # Advance the pat2vec progress bar once per patient. Updating it on every
    # attempt (as a `finally` inside the retry loop did) over-counts whenever a
    # patient is retried.
    pat2vec_obj.t.update(1)

    if not success:
        # Report the number of retries actually made; a non-retryable error
        # leaves the loop before MAX_RETRIES is reached.
        logger.error(
            f"Failed to process index {i} for patient {pat2vec_obj.all_patient_list[i]} after {retries} retries."
        )

pat2vec_obj.t.close()

In [None]:
# Patient vectors are stored individually under current_pat_lines_parts; they
# are joined into a single output file, which is easier to filter.
input_directory = pat2vec_obj.proj_name + "/current_pat_lines_parts"
directory = f"{pat2vec_obj.proj_name}/output_directory"
output_csv_file = "output_file"

# Create the output folder on first use.
if not os.path.exists(directory):
    os.makedirs(directory)

output_csv_file_filename = process_csv_files(
    input_directory,
    part_size=336,
    out_folder=directory,
    output_filename_suffix=output_csv_file,
)

In [None]:
# Load the merged patient-vector file written by process_csv_files.
df = pd.read_csv(output_csv_file_filename)

In [None]:
# NOTE(review): this rebinds `df` to its own transform, so re-running the cell
# applies the helper twice -- confirm extract_datetime_to_column tolerates that.
df = extract_datetime_to_column(df)

In [None]:
# Display the merged patient-vector frame.
df

#### Build all document batches dataframe:

In [None]:
# This will merge all document source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import build_merged_epr_mct_doc_df

all_pat_list = pat2vec_obj.all_patient_list

# NOTE(review): `dfd` presumably holds the path of the merged CSV (the
# commented-out read_csv after this call suggests so) -- confirm.
dfd = build_merged_epr_mct_doc_df(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# dfd = pd.read_csv(dfd)

### Build all annotation batches dataframe:

In [None]:
# Merge every patient's annotation batches into one CSV and load it. Handy for
# filtering, but the merged file can be large.

from pat2vec.util.post_processing_build_methods import build_merged_epr_mct_annot_df

all_pat_list = pat2vec_obj.all_patient_list

# The builder's return value is passed straight to read_csv, so `dfa` only
# ever names the loaded DataFrame.
dfa = pd.read_csv(
    build_merged_epr_mct_annot_df(all_pat_list, pat2vec_obj.config_obj, overwrite=True)
)

dfa

### Build additional batches from individual patient data batches

In [None]:
# This will merge all drug source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import merge_drugs_csv

all_pat_list = pat2vec_obj.all_patient_list

# overwrite=True is passed on every run.
merged_drugs_path = merge_drugs_csv(
    all_pat_list, pat2vec_obj.config_obj, overwrite=True
)

# Load and display the merged drug batches.
merged_drugs = pd.read_csv(merged_drugs_path)
merged_drugs

In [None]:
# dfmdi = pd.read_csv('new_project/merged_input_pat_batches/merged_drugs_batches.csv')

In [None]:
# for col in dfmdi.select_dtypes(exclude=[np.number]).columns:
#     assert dfmdi[col].astype(str).equals(merged_drugs[col].astype(str)), f"Mismatch in column: {col}"

In [None]:
# This will merge all diagnostics source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import merge_diagnostics_csv

all_pat_list = pat2vec_obj.all_patient_list

merged_diagnostics_path = merge_diagnostics_csv(
    all_pat_list, pat2vec_obj.config_obj, overwrite=True
)

# Loaded for later use; nothing is displayed by this cell.
merged_diagnostics = pd.read_csv(merged_diagnostics_path)

In [None]:
from pat2vec.util.post_processing_build_methods import merge_news_csv

all_pat_list = pat2vec_obj.all_patient_list

# Merge the NEWS batches for all patients; only the returned path is kept.
merged_news_path = merge_news_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# merged_news = pd.read_csv(merged_news_path)

In [None]:
from pat2vec.util.post_processing_build_methods import merge_bmi_csv

all_pat_list = pat2vec_obj.all_patient_list

# Merge the BMI batches for all patients; only the returned path is kept.
merged_bmi_path = merge_bmi_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# merged_bmi = pd.read_csv(merged_bmi_path)

In [None]:
from pat2vec.util.post_processing_build_methods import build_merged_bloods

all_pat_list = pat2vec_obj.all_patient_list

# Merge the bloods batches for all patients into a single file.
merged_bloods_path = build_merged_bloods(
    all_pat_list, pat2vec_obj.config_obj, overwrite=True
)

# Load and display the merged bloods batches.
merged_bloods = pd.read_csv(merged_bloods_path)
merged_bloods

In [None]:
# pd.read_csv('new_project/merged_input_pat_batches/merged_bloods_batches.csv')

In [None]:
from pat2vec.util.post_processing_build_methods import merge_demographics_csv

all_pat_list = pat2vec_obj.all_patient_list

# Merge the demographics batches for all patients into a single file.
merged_demographics_path = merge_demographics_csv(
    all_pat_list, pat2vec_obj.config_obj, overwrite=True
)

# Load and display the merged demographics batches.
merged_demographics = pd.read_csv(merged_demographics_path)

merged_demographics

### Filter the annotation batches by a snomed cui and its related codes. 

In [None]:
# Optional example (disabled by default): collect the codes and names related
# to a SNOMED CT root concept from two sources (the SNOMED relationship file
# and the MedCAT concept database) and combine them into de-duplicated lists.
snomed_methods_example = False

if snomed_methods_example:

    from snomed_methods import snomed_methods_v1

    path_to_sct2 = path_to_snomed_ct_file

    medcat_path = path_to_medcat_model_pack

    snomed_relations_obj = snomed_methods_v1.snomed_relations(
        medcat=True, snomed_rf2_full_path=path_to_sct2, medcat_path=medcat_path
    )
    outcome_variable_cui_for_filter = "40733004"  # infection

    logger.info(f"Outcome variable CUI for filter: {outcome_variable_cui_for_filter}")

    filter_root_cui = outcome_variable_cui_for_filter
    logger.info(f"Filter root CUI: {filter_root_cui}")

    # Source 1: expand the root concept through the SNOMED relationships.
    retrieved_codes_snomed_tree, retrieved_names_snomed_tree = (
        snomed_relations_obj.recursive_code_expansion(
            filter_root_cui, n_recursion=3, debug=False
        )
    )

    logger.info(
        f"Retrieved codes (Snomed tree, first 5): {retrieved_codes_snomed_tree[0:5]}, Total codes: {len(retrieved_codes_snomed_tree)}, Total names: {len(retrieved_names_snomed_tree)}"
    )

    logger.info(
        f"Retrieved names (Snomed tree, first 10): {retrieved_names_snomed_tree[0:10]}"
    )

    # Source 2: the 25 most similar concepts from the MedCAT concept database.
    retrieved_codes_medcat_cdb, retrieved_names_medcat_cdb = (
        snomed_relations_obj.get_medcat_cdb_most_similar(
            filter_root_cui, context_type="xxxlong", type_id_filter=[], topn=25
        )
    )

    logger.info(
        f"Retrieved names (MedCAT CDB, first 10): {retrieved_names_medcat_cdb[0:10]}"
    )

    # Union of both sources; going through set() removes duplicates and does
    # not preserve order.
    all_names_list = list(set(retrieved_names_medcat_cdb + retrieved_names_snomed_tree))

    all_codes_list = list(set(retrieved_codes_medcat_cdb + retrieved_codes_snomed_tree))

    logger.info(f"Total unique names combined: {len(all_names_list)}")

# Apply misc methods 

In [None]:
# from pat2vec.all_methods import pat2vec_methods

# p2v = pat2vec_methods()

# p2v.produce_filtered_annotation_dataframe()

# Build IPW dataframe


Find the latest/earliest record for one of [268910001, 62315008, 55822004, 49727002]

We can use this in another main block with:

individual_patient_window = True,

individual_patient_window_df = pd.read_csv('ipw_overlap.csv'),

individual_patient_window_start_column_name = 'updatetime_manual_offset',

individual_patient_id_column_name = 'client_idcode',

individual_patient_window_controls_method = 'full', 

This limits each patient's data to a specific individual time window. For controls, we can either match the time window per control or pull their 'full' data for the global time window.

In [None]:
# Looser annotation filter used for the IPW dataframe: thresholds of 0.6 and
# the extended list of UMLS semantic types.
# Narrow alternative for 'types':
# ['qualifier value', 'procedure', 'substance', 'finding', 'environment', 'disorder', 'observable entity']
annot_filter_arguments = dict(
    acc=0.6,  # Minimum base concept accuracy
    types=[
        "qualifier value",
        "procedure",
        "substance",
        "finding",
        "environment",
        "disorder",
        "observable entity",
        "organism",
        "phenomenon",
        "anatomy",
        "conceptual entity",
        "physical object",
        "intellectual product",
        "occupation or discipline",
        "mental or behavioral dysfunction",
        "geographic area",
        "population group",
        "biomedical or dental material",
        "medical device",
        "classification",
        "regulation or law",
        "health care activity",
        "health care related organization",
        "professional or occupational group",
        "group",
        "attribute",
        "individual behavior",
    ],
    Time_Value=["Recent", "Past"],  # Accepted values; must be defined in the MedCAT model
    Time_Confidence=0.6,  # Confidence threshold (float)
    Presence_Value=["True"],  # Accepted presence values
    Presence_Confidence=0.6,  # Confidence threshold (float)
    Subject_Value=["Patient"],  # Accepted subject values
    Subject_Confidence=0.6,  # Confidence threshold (float)
)

In [None]:
# Peek at the document batch of the second patient in the list.
# The "new_project" folder name is hard-coded and matches proj_name above.
pd.read_csv(
    f"new_project/current_pat_document_batches/{pat2vec_obj.all_patient_list[1]}.csv"
).head()

In [None]:
from pat2vec.util.post_processing_build_ipw_dataframe import build_ipw_dataframe

# Build the IPW dataframe in "latest" mode for the listed filter codes.
# MCT documents are included; textual observations are not.
build_ipw_dataframe(
    annot_filter_arguments=annot_filter_arguments,
    config_obj=pat2vec_obj.config_obj,
    filter_codes=[
        38341003,
        274640006,
        886731000000109,
        268910001,
        62315008,
        55822004,
        49727002,
        22232009,
    ],
    mode="latest",
    include_mct=True,
    include_textual_obs=False,
)  # '62315008', '55822004', '268910001',

In [None]:
from pat2vec.util.post_processing_build_ipw_dataframe import build_ipw_dataframe

# Same call in "earliest" mode. Note that the code list differs slightly from
# the "latest" call in the previous cell.
build_ipw_dataframe(
    annot_filter_arguments=annot_filter_arguments,
    config_obj=pat2vec_obj.config_obj,
    filter_codes=[
        38341003,
        274640006,
        268910001,
        62315008,
        55822004,
        49727002,
        248153007,
    ],
    mode="earliest",
    include_mct=True,
    include_textual_obs=False,
)  # '62315008', '55822004', '268910001',

In [None]:
# Helper for examining and screening the patient client_idcode list for
# malformed entries. Only the import runs; the example call is left disabled.

from pat2vec.pat2vec_pat_list.get_patient_treatment_list import analyze_client_codes

# valid_codes, invalid_codes, clusters = analyze_client_codes(pat2vec_obj.all_patient_list)

In [None]:
# Number of treatment documents returned by the term search earlier on.
len(treatment_docs)

In [None]:
# Replace the in-memory search result with the test fixture file and log
# (rather than assert) whether it holds the expected 23 rows.
treatment_docs = pd.read_csv("test_files/treatment_docs.csv")
# assert len(treatment_docs) == 23
logger.info(f"Length of treatment_docs is 23: {len(treatment_docs)==23}")

In [None]:
# assert treatment_docs['basicobs_itemname_analysed'].iloc[21] == 'Parathyroid Hormone (PTH)'

In [None]:
# Log the analysed body text of the first treatment document.
logger.info(f"Body analysed (row 0): {treatment_docs['body_analysed'].iloc[0]}")

In [None]:
# Sanity check on the fixture: the first document must mention "acrylic head".
assert "acrylic head" in str(treatment_docs["body_analysed"].iloc[0])

In [None]:
# Load the annotation batch of one specific patient (hard-coded identifier)
# from the "new_project" output folder and display it.
pat_example_annot = pd.read_csv(
    "new_project/current_pat_documents_annotations_batches/P0IFD0TV.csv"
)

pat_example_annot

In [None]:
# assert pat_example_annot['cui'].iloc[0] == 38341003

In [None]:
# Let pandas show up to 50 columns so that wide frames are not truncated.
pd.set_option("display.max_columns", 50)

# IPW demonstration

### Build IPW 

In [None]:
# Very permissive annotation filter for the IPW demonstration: every threshold
# is lowered to 0.1 so that almost all annotations pass.
annot_filter_arguments = dict(
    acc=0.1,  # Minimum base concept accuracy
    # UMLS semantic types accepted by the MedCAT filter. Full list for reference:
    # 'types': ['qualifier value', 'procedure', 'substance', 'finding', 'environment', 'disorder', 'observable entity', 'organism', 'phenomenon', 'anatomy', 'conceptual entity', 'physical object', 'intellectual product', 'occupation or discipline', 'mental or behavioral dysfunction', 'geographic area', 'population group', 'biomedical or dental material', 'medical device', 'classification', 'regulation or law', 'health care activity', 'health care related organization', 'professional or occupational group', 'group', 'attribute', 'individual behavior']
    types=[
        "qualifier value",
        "procedure",
        "substance",
        "finding",
        "environment",
        "disorder",
        "observable entity",
    ],
    Time_Value=["Recent", "Past"],  # Accepted values; must be defined in the MedCAT model
    Time_Confidence=0.1,  # Confidence threshold (float)
    Presence_Value=["True"],  # Accepted presence values
    Presence_Confidence=0.1,  # Confidence threshold (float)
    Subject_Value=["Patient"],  # Accepted subject values
    Subject_Confidence=0.1,  # Confidence threshold (float)
)

# Peek at the second patient's document batch, the patient count and the
# second patient's annotation batch.
# NOTE(review): a notebook renders only the last expression of a cell, so the
# first two results below are computed and discarded.
pd.read_csv(
    f"new_project/current_pat_document_batches/{pat2vec_obj.all_patient_list[1]}.csv"
).head()
len(pat2vec_obj.all_patient_list)
pd.read_csv(
    f"new_project/current_pat_documents_annotations_batches/{pat2vec_obj.all_patient_list[1]}.csv"
).head(2)

In [None]:
# Load the merged annotations file; two CUIs are picked from it below to
# simulate a condition.

dfa_s = pd.read_csv("new_project/merged_batches/annots_mct_epr.csv")

dfa_s.head(2)

In [None]:
# First two annotation rows of one specific client (hard-coded identifier).
dfa_s[dfa_s["client_idcode"] == "V5LXO6QJ"].head(2)

#### Using these two cui codes as an example

In [None]:
import itertools
import pandas as pd

# Group the data so we have a set of CUIs for each client
client_cui_map = dfa_s.groupby("client_idcode")["cui"].apply(set)

# Every distinct CUI seen for any client, then all unordered pairs of them.
all_cuis = pd.Series(list(itertools.chain.from_iterable(client_cui_map))).unique()
pairs = itertools.combinations(all_cuis, 2)

# Map each CUI pair to the list of clients whose annotations contain both codes.
# Pairs that no client carries are not stored.
pair_to_clients = {}

for cui1, cui2 in pairs:
    current_pair = {cui1, cui2}
    # Clients whose CUI set contains both members of the pair
    clients_with_pair = [
        client_id
        for client_id, cui_set in client_cui_map.items()
        if current_pair.issubset(cui_set)
    ]

    if clients_with_pair:
        pair_to_clients[(cui1, cui2)] = clients_with_pair

# Without at least one co-occurring pair the rest of the IPW demonstration has
# nothing to work with; fail here with a clear message instead of letting
# max() raise "max() arg is an empty sequence".
if not pair_to_clients:
    raise ValueError(
        "No pair of CUIs co-occurs for any client in dfa_s; "
        "cannot select a concept pair for the IPW demonstration."
    )

# The pair shared by the most clients (first such pair wins on ties).
most_common_pair = max(pair_to_clients, key=lambda pair: len(pair_to_clients[pair]))

# Get the list of clients and the count for that most common pair
clients_list = pair_to_clients[most_common_pair]
max_count = len(clients_list)

logger.info(
    f"Most common co-occurring pair: {most_common_pair} with {max_count} clients having both."
)
logger.info("Clients with this pair:")
# Log the full client list in one record
logger.info(clients_list)

In [None]:
# NOTE(review): the lookup reads `dfa`, while the pair itself was derived from
# `dfa_s` — confirm `dfa` is loaded earlier and contains both CUIs, otherwise
# `.iloc[0]` raises IndexError.
def _first_pretty_name(cui_code):
    """Return the first ``pretty_name`` found in ``dfa`` for the given CUI."""
    matches_cui = dfa["cui"] == int(cui_code)
    return dfa.loc[matches_cui, "pretty_name"].iloc[0]


concept_A_pretty_name = _first_pretty_name(most_common_pair[0])
concept_B_pretty_name = _first_pretty_name(most_common_pair[1])

# Display both labels as the cell output.
concept_A_pretty_name, concept_B_pretty_name

In [None]:
# Wrap each member of the selected pair in a single-element list of built-in
# ints; later cells concatenate the two lists and pass them to `isin` and to
# `build_ipw_dataframe(filter_codes=...)`.
concept_A_filter_codes = [int(most_common_pair[0])]
concept_B_filter_codes = [int(most_common_pair[1])]

In [None]:
config_obj.verbosity = 20

### Find the earliest occurrence of any CUI


In [None]:
# Show up to ten annotations of the hard-coded example patient that carry
# either of the two selected CUIs.
# NOTE(review): "V5LXO6QJ" is a literal ID — confirm it exists in the test data.
dfa_s[
    (dfa_s["client_idcode"] == "V5LXO6QJ")
    & (dfa_s["cui"].isin(concept_A_filter_codes + concept_B_filter_codes))
].head(10)

In [None]:
from pat2vec.util.post_processing_build_ipw_dataframe import build_ipw_dataframe

file_path = "ipw_dataframe.csv"
overwrite = True  # rebuild even when a cached CSV already exists
skip_ipw_build = False  # read by the following cells to skip IPW post-processing

file_exists = os.path.exists(file_path)

if file_exists and not overwrite:
    # Reuse the cached frame. The CSV is written with its index (default
    # `to_csv`), so read the first column back as the index instead of letting
    # it surface as a spurious "Unnamed: 0" column.
    ipw_dataframe = pd.read_csv(file_path, index_col=0)
    logger.info("File exists and will NOT be overwritten.")
else:
    if file_exists:
        logger.info("File exists and will be overwritten.")
    else:
        logger.info("File does not exist, safe to proceed.")

    # Show every column when the IPW frame is displayed.
    pd.set_option("display.max_columns", None)

    # n.b this needs filter annot arguments...
    # Build one row per patient for the selected CUIs using mode="earliest".
    # Example codes: '62315008', '55822004', '268910001'
    ipw_dataframe = build_ipw_dataframe(
        annot_filter_arguments=annot_filter_arguments,
        config_obj=pat2vec_obj.config_obj,
        filter_codes=concept_A_filter_codes + concept_B_filter_codes,
        mode="earliest",
        include_mct=False,
        include_textual_obs=False,
    )
    ipw_dataframe.to_csv(file_path)

ipw_dataframe.head()

### Additionally, keep only the patients in whom both CUIs co-occur

In [None]:
import pandas as pd
from pat2vec.util.post_processing import filter_annot_dataframe2

annot_batch_file_path = "new_project/merged_batches/annots_mct_epr.csv"

# Set versions of the two code lists (the `isin` filters below use the lists).
concept_A_filter_codes_set = set(concept_A_filter_codes)
concept_B_filter_codes_set = set(concept_B_filter_codes)

if not skip_ipw_build:
    # Patients seen with a concept A code / a concept B code, accumulated over chunks.
    clients_with_concept_A = set()
    clients_with_concept_B = set()

    # Stream the merged annotation file so it never has to fit in memory at once.
    for chunk in pd.read_csv(annot_batch_file_path, chunksize=100000):
        # Apply the annotation filter first, then look for the concept codes.
        chunk = filter_annot_dataframe2(chunk, annot_filter_arguments)

        A_in_chunk = chunk.loc[
            chunk["cui"].isin(concept_A_filter_codes), "client_idcode"
        ].unique()
        B_in_chunk = chunk.loc[
            chunk["cui"].isin(concept_B_filter_codes), "client_idcode"
        ].unique()

        clients_with_concept_A.update(A_in_chunk)
        clients_with_concept_B.update(B_in_chunk)

    # A patient qualifies only when found in both sets.
    true_clients = list(clients_with_concept_A & clients_with_concept_B)

    logger.info(f"Found {len(true_clients)} patients with both concept A and concept B")
    logger.info(f"Clients with both concepts: {true_clients}")

In [None]:
# finally filter the IPW by the true clients with concept_A_filter_codes and concept_B_filter_codes
import pandas as pd
from dateutil.relativedelta import relativedelta

if not skip_ipw_build:
    # Keep only patients that have both concepts. Take an explicit copy so the
    # column assignments below write to an independent frame rather than to a
    # slice of the unfiltered one (avoids SettingWithCopyWarning).
    ipw_dataframe = (
        ipw_dataframe.loc[ipw_dataframe["client_idcode"].isin(true_clients)]
        .copy()
        .reset_index(drop=True)
    )

    # Convert to datetime and ensure all values are timezone-aware in UTC
    ipw_dataframe["updatetime"] = pd.to_datetime(ipw_dataframe["updatetime"], utc=True)

    # We need to compute individual start and end dates for each patient in the IPW dataframe.
    # 'updatetime' is the basis; two columns are derived from it:
    #   'updatetime_offset'   = 'updatetime' + 3 months. This buffer between the
    #                           first mention of the concept and the start of the
    #                           patient's window avoids information leakage.
    #   'updatetime_end_date' = 'updatetime_offset' + the configured
    #                           time_window_interval_delta (end of the window).
    # These will be used to create the patient_dict for pat2vec processing.
    ipw_dataframe["updatetime_offset"] = ipw_dataframe["updatetime"] + pd.DateOffset(
        months=3
    )

    # The delta is added element-wise via apply, one timestamp at a time.
    ipw_dataframe["updatetime_end_date"] = ipw_dataframe["updatetime_offset"].apply(
        lambda dt: dt + pat2vec_obj.config_obj.time_window_interval_delta
    )

    # Persist the filtered frame with its window columns; the IPW config cell
    # reads this file back.
    ipw_dataframe.to_csv("ipw_dataframe.csv")

In [None]:
ipw_dataframe.head(10)

In [None]:
from pat2vec.util.config_pat2vec import config_class
from datetime import datetime
from tqdm import tqdm
from pat2vec.util.post_processing_process_csv_files import process_csv_files
from pat2vec.util.post_processing import extract_datetime_to_column
from dateutil.relativedelta import relativedelta
import pandas as pd
from typing import Dict, List, Optional, Union

# Feature-group switches for the IPW run; passed to `config_class` as
# `main_options`. True enables a group, False disables it.
main_options_dict = {
    # Enable demographic information (Ethnicity mapped to UK census, age, death)
    "demo": True,
    "bmi": True,  # Enable BMI (Body Mass Index) tracking
    "bloods": True,  # Enable blood-related information
    "drugs": True,  # Enable drug-related information
    "diagnostics": True,  # Enable diagnostic information
    "core_02": True,  # Enable core_02 information
    "bed": True,  # Enable bed information
    "vte_status": True,  # Enable VTE status tracking
    "hosp_site": True,  # Enable hospital site information
    "core_resus": True,  # Enable core resuscitation information
    "news": True,  # Enable NEWS (National Early Warning Score) tracking
    "smoking": True,  # Enable smoking-related information
    "annotations": True,  # Enable EPR annotations
    # Enable annotations from the additional clinical note observations index
    # (the key is spelled `annotations_mrc`)
    "annotations_mrc": True,
    # Enable or disable negated presence annotations
    "negated_presence_annotations": False,
    "appointments": True,  # Enable appointments
    "annotations_reports": False,  # Reports annotations (disabled here)
    "textual_obs": True,  # Enable textual observations (basic_observations index)
}


# Annotation filter for the IPW run; passed to `config_class` as
# `annot_filter_options`. NOTE: this re-binds `annot_filter_arguments`, which the
# IPW-build cells above defined with 0.1 thresholds; here every threshold is 0.8.
annot_filter_arguments = {
    "acc": 0.8,  # base concept accuracy
    # umls list of types for medcat filter
    "types": [
        "qualifier value",
        "procedure",
        "substance",
        "finding",
        "environment",
        "disorder",
        "observable entity",
    ],
    # Wider alternative type list, kept for reference:
    # 'types': ['qualifier value', 'procedure', 'substance', 'finding', 'environment', 'disorder', 'observable entity', 'organism', 'phenomenon', 'anatomy', 'conceptual entity', 'physical object', 'intellectual product', 'occupation or discipline', 'mental or behavioral dysfunction', 'geographic area', 'population group', 'biomedical or dental material', 'medical device', 'classification', 'regulation or law', 'health care activity', 'health care related organization', 'professional or occupational group', 'group', 'attribute', 'individual behavior']
    # Specify the values you want to include in a list. Must be defined in medcat model.
    "Time_Value": ["Recent", "Past"],
    "Time_Confidence": 0.8,  # Specify the confidence threshold as a float
    # Specify the values you want to include in a list
    "Presence_Value": ["True"],
    "Presence_Confidence": 0.8,  # Specify the confidence threshold as a float
    # Specify the values you want to include in a list
    "Subject_Value": ["Patient"],
    "Subject_Confidence": 0.8,  # Specify the confidence threshold as a float
}

# Filter data batches by terms before processing.
# Every filter below is None, i.e. no term/document-type filtering is configured.
from typing import Any  # `any` (the builtin function) is not a valid type annotation

# Optional regular expressions applied to EPR / MCT document text.
epr_docs_term_regex: Optional[str] = None
mct_docs_term_regex: Optional[str] = None

# Example bloods_filter_term_list: Optional[List[str]] = ['wbc']
bloods_filter_term_list: Optional[List[str]] = None

# Example mct_docs_document_type_filter_list: Optional[List[str]] = ['KHMDC Integrated report']

mct_docs_document_type_filter_list: Optional[List[str]] = None
epr_docs_document_type_filter_list: Optional[List[str]] = None

# Bundle the individual filters into the dictionary layout shown in the examples.
data_type_filter_dict: Dict[str, Any] = {
    "filter_term_lists": {
        "epr_docs": epr_docs_document_type_filter_list,
        "mct_docs": mct_docs_document_type_filter_list,
        "bloods": bloods_filter_term_list,
    },
    "epr_docs_term_regex": epr_docs_term_regex,
    "mct_docs_term_regex": mct_docs_term_regex,
}

# Example date settings:
# start_date=(datetime(2020, 1, 1)) Start date for processing

# Define the length of the time window, example 1 year and 15 days, only data within this window will be processed.
# years=1,      # Number of years to add to the start date
# months=0,  # Number of months to add to the start date
# days=15,  # Number of days to add to the start date

# Define the interval between time windows. Example 1 year. Each vector/row output will be based on this interval.
# time_window_interval_delta = relativedelta(years=1)

# lookback = True #This determines the direction of the time length window. True = backward, False = forward. Our time window (+1 years, 15 days) is therefore 2020, 1, 1 - 2021, 1, 15.

# IPW settings:

# Init config obj
# Build the pat2vec configuration for the individual-patient-window (IPW) run.
# NOTE(review): the `data_type_filter_dict` assembled above is not passed here
# (`data_type_filter_dict=None`) — confirm that is intentional.
config_obj = config_class(
    remote_dump=False,  # Flag for remote data dumping. partially deprecated.
    suffix="",  # Suffix for file names
    # Filename for treatment documentation
    treatment_doc_filename="treatment_docs.csv",
    treatment_control_ratio_n=1,  # Ratio for treatment to control
    # Project name. patient data batches and vectors stored here.
    proj_name="new_project_ipw",
    current_path_dir="",  # Current path directory
    main_options=main_options_dict,  # Dictionary for main options
    start_date=(datetime(1995, 1, 1)),  # Starting date for processing
    # Number of years to add to the start date. Set the duration of the time window. Window is defined as the start date + years/months/days set here.
    years=30,
    months=0,  # Number of months to add to the start date
    days=0,  # Number of days to add to the start date
    batch_mode=True,  # Flag for batch processing mode. only functioning mode.
    store_annot=True,  # Flag to store annotations. partially deprecated.
    share_sftp=True,  # Flag for sharing via SFTP. partially deprecated
    multi_process=False,  # Flag for multi-process execution. deprecated.
    # annot_first=False,  # Flag for annotation priority. deprecated.
    # Flag for stripping lists, will check for completed patients before starting to avoid redundancy.
    strip_list=True,
    # Verbosity level for printing debug messages.
    # NOTE(review): documented as 0-9, but a later cell sets verbosity to 20 — confirm the valid range.
    verbosity=0,
    random_seed_val=random_seed_value,  # Random seed value for reproducibility of controls.
    testing=True,  # Flag for testing mode
    dummy_medcat_model=True,  # Flag for dummy MedCAT model, used if testing == True
    # Flag for using controls. #will add desired ratio of controls at random from global pool.
    use_controls=False,
    # Flag for MedCAT processing. #will load medcat into memory and use for annotating.
    medcat=False,
    # Current timestamp as the start time for logging and progress bar
    start_time=datetime.now(),
    # Column name for patient ID, auto will try to find it. Example "client_idcode"
    patient_id_column_name="client_idcode",
    annot_filter_options=annot_filter_arguments,  # Annotation filtering options
    # Global start year. #set the limits of the time window data can be drawn from. Start should not precede start date set above.
    # Global dates are overwritten by individual patient windows to match patient window.
    # Ensure that global start year/month/day is before end year/month/day
    global_start_year=1995,
    global_start_month=1,  # Global start month
    global_end_year=2023,  # Global end year
    global_end_month=1,  # Global end month
    global_start_day=1,  # Global start day
    global_end_day=1,  # Global end day
    # Use per-patient windows read from the CSV written by the IPW-build cells.
    individual_patient_window=True,
    individual_patient_window_df=pd.read_csv("ipw_dataframe.csv"),
    # This will look for your start column name + '_offset' (i.e. "updatetime_offset").
    individual_patient_window_start_column_name="updatetime",
    individual_patient_id_column_name="client_idcode",
    individual_patient_window_controls_method="full",
    shuffle_pat_list=False,  # Flag for shuffling patient list
    # Specify the time window to collapse each feature vector into; years=1 is
    # one vector per year within the global time window.
    # NOTE(review): 31 years exceeds the 30-year window set above — presumably so
    # each patient collapses into a single vector; confirm.
    time_window_interval_delta=relativedelta(
        years=31
    ),
    split_clinical_notes=True,  # will split clinical notes by date and treat as individual documents with extracted dates. Requires note splitter module.
    lookback=False,  # when calculating individual patient window from table of start dates, will calculate backwards in time if true. Else Forwards. When calculating from global start date, will calculate backwards or forwards respectively.
    add_icd10=False,  # append icd 10 codes to annot batches. Can be found under current_pat_documents_annotations/%client_idcode%.csv.
    add_opc4s=False,  # needs icd10 true also. Can be found under current_pat_documents_annotations/%client_idcode%.csv
    override_medcat_model_path=path_to_medcat_model_pack,  # Force medcat model path, if None uses defaults for env. #Can be set in paths.py with medcat_path = %path to medcat model pack.zip"
    data_type_filter_dict=None,  # Dictionary for data type filter, see examples above.
    filter_split_notes=True,  # If enabled, will reapply global time window filter post clinical note splitting. Recommended to enable if split notes enabled.
    calculate_vectors=True,
)

In [None]:
config_obj.individual_patient_window_df.head(5)

In [None]:
# Create the pat2vec object for the IPW run from the configuration built above.
# The seed comes from the notebook-wide `random_seed_value` (also passed to
# `config_class`) rather than a second hard-coded literal, so the two cannot drift.
pat2vec_obj = main(
    cogstack=True,
    use_filter=False,
    json_filter_path=None,
    random_seed_val=random_seed_value,
    hostname=None,
    config_obj=config_obj,
)

In [None]:
config_obj.patient_dict  # These are the individual patient time windows for processing.

In [None]:
pat2vec_obj.config_obj.date_list

In [None]:
config_obj.verbosity = 20

In [None]:
pat2vec_obj.config_obj.date_list

In [None]:
# Process the first patient (index 0) on its own before the full loop below,
# which starts again from index 0.
# NOTE(review): presumably `strip_list=True` prevents the loop from redoing this patient — confirm.
pat2vec_obj.pat_maker(0)

In [None]:
# Maximum number of attempts per patient when a KeyError is raised
MAX_RETRIES = 3

# Process every patient in the list, retrying on KeyError after removing the
# patient's files; any other exception aborts the run.
for i in tqdm(range(0, len(pat2vec_obj.all_patient_list))):
    retries = 0
    success = False

    while retries < MAX_RETRIES and not success:
        try:
            # Try to process the patient
            pat2vec_obj.pat_maker(i)
            success = True  # Mark as successful if no exception is raised

        except KeyError as e:
            # Retryable: clear the patient's files and try again
            logger.warning(
                f"KeyError at index {i} for patient {pat2vec_obj.all_patient_list[i]}: {e}. Retrying after removal..."
            )
            remove_file_from_paths(pat2vec_obj.all_patient_list[i])
            retries += 1

        except Exception as e:
            # Non-retryable: log and propagate. The message says what actually
            # happens (the run stops; the patient is not skipped), and a bare
            # `raise` keeps the original traceback.
            logger.error(
                f"Exception at index {i} for patient {pat2vec_obj.all_patient_list[i]}: {e}. Aborting the run..."
            )
            raise

        finally:
            # NOTE(review): this runs on every attempt, so a retried patient
            # advances `pat2vec_obj.t` more than once — confirm this is intended.
            pat2vec_obj.t.update(1)  # Update progress

    if not success:
        # Only reached when every retry ended in a KeyError
        logger.error(
            f"Failed to process index {i} for patient {pat2vec_obj.all_patient_list[i]} after {MAX_RETRIES} retries."
        )

pat2vec_obj.t.close()

In [None]:
pat2vec_obj.config_obj.date_list

In [None]:
# NOTE(review): within this section `df` is only assigned in a later cell (from
# the joined vector CSV); here it refers to whatever an earlier cell left in the
# kernel — confirm this preview is intended and survives Restart & Run All.
df.head()

In [None]:
# Patient vectors are stored individually in this directory.
input_directory = f"{pat2vec_obj.proj_name}/current_pat_lines_parts"
output_csv_file = "output_file"

# Directory that receives the joined output file
directory = pat2vec_obj.proj_name + "/output_directory"

# Create the directory if needed; `exist_ok=True` replaces the separate
# existence check and is safe if the directory appears in between.
os.makedirs(directory, exist_ok=True)

# We will join the individual patient vectors into a single output file. This is useful for filtering.
output_csv_file_filename = process_csv_files(
    input_directory,
    out_folder=directory,
    output_filename_suffix=output_csv_file,
    part_size=336,
)
# Load the joined file; `process_csv_files` returns the path that is read here.
df = pd.read_csv(output_csv_file_filename)

In [None]:
df

In [None]:
from pat2vec.util.post_processing import save_missing_values_pickle

# Save with the same name as the final imputed dataset
# NOTE(review): presumably this records the missing-value pattern of `df` before
# imputation, under the given base name — confirm against the helper's docs.
save_missing_values_pickle(df, "my_project_outputfile_name")

In [None]:
df["client_idcode"].value_counts()

In [None]:
# Show the output rows of one hard-coded example patient.
# NOTE(review): "V1IBLJH7" is a literal ID — confirm it is among the processed
# patients, otherwise the result is empty.
df[df["client_idcode"] == "V1IBLJH7"]

In [None]:
df[df["client_idcode"] == list(pat2vec_obj.config_obj.patient_dict.keys())[1]]

In [None]:
# Add an "extracted_datetime_stamp" column to `df`, taken from the frame that
# `extract_datetime_to_column` returns. The filtered view and the merge below
# rely on this column.
df["extracted_datetime_stamp"] = extract_datetime_to_column(df)[
    "extracted_datetime_stamp"
]

In [None]:
# IDs of the patients that have an individual time window (keys of patient_dict).
patients_processed = [*pat2vec_obj.config_obj.patient_dict.keys()]
logger.info(f"Number of patients processed: {len(patients_processed)}")

In [None]:
# Keep only the output rows that belong to a processed patient.
is_processed_patient = df["client_idcode"].isin(patients_processed)
df_filtered = df.loc[is_processed_patient]

# Show the patient ID next to the extracted timestamp of each remaining row.
df_filtered[["client_idcode", "extracted_datetime_stamp"]]

In [None]:
pat2vec_obj.config_obj.patient_dict

In [None]:
import pandas as pd
import numpy as np

# 1. Turn patient_dict into a frame with one row per patient and find the
#    earlier of the two dates stored for that patient.
df_dict = (
    pd.DataFrame.from_dict(
        pat2vec_obj.config_obj.patient_dict, orient="index", columns=["date1", "date2"]
    )
    .rename_axis("patient_id")
    .reset_index()
)
date1_ts = pd.to_datetime(df_dict["date1"])
date2_ts = pd.to_datetime(df_dict["date2"])
df_dict["earliest_date_to_check"] = np.minimum(date1_ts, date2_ts)

# 2. Keep a single row per patient in the vector frame (first occurrence wins).
duplicate_id_mask = df_filtered.duplicated(subset=["client_idcode"])
if duplicate_id_mask.any():
    logger.warning(
        f"Found and dropped {duplicate_id_mask.sum()} duplicate patient IDs."
    )
    df_filtered_unique = df_filtered.drop_duplicates(
        subset=["client_idcode"], keep="first"
    )
else:
    df_filtered_unique = df_filtered

# 3. Diagnostics: sizes, key dtypes, sample keys and key overlap of both sources.
logger.info("\n## Diagnostic Info")
logger.info("--------------------------------------------------")

logger.info(f"1. Size of data from dictionary: {len(df_dict)} rows")
logger.info(f"2. Size of data from df_filtered: {len(df_filtered_unique)} rows")

if len(df_dict) == 0 or len(df_filtered_unique) == 0:
    logger.warning("One or both data sources are empty. Cannot perform merge.")
else:
    # Key dtypes on both sides
    logger.info(
        f"3. Data type of 'patient_id' (from dict): {df_dict['patient_id'].dtype}"
    )
    logger.info(
        f"4. Data type of 'client_idcode' (from df): {df_filtered_unique['client_idcode'].dtype}"
    )

    # Sample keys, to spot whitespace or casing differences by eye
    logger.info(
        f"5. Sample keys from dictionary: {df_dict['patient_id'].head(3).to_list()}"
    )
    logger.info(
        f"6. Sample keys from df_filtered: {df_filtered_unique['client_idcode'].head(3).to_list()}"
    )

    # Exact overlap of the IDs once both sides are cast to str and stripped
    set_dict = set(df_dict["patient_id"].astype(str).str.strip())
    set_df = set(df_filtered_unique["client_idcode"].astype(str).str.strip())
    overlap = set_dict & set_df
    logger.info(
        f"7. Found {len(overlap)} common IDs between the two sources after cleaning whitespace."
    )
    if 0 < len(overlap) < 5:
        logger.info(f"   -> Common IDs are: {list(overlap)}")

logger.info("--------------------------------------------------")


# 4. Inner-join the window dates onto the de-duplicated vectors (raw, unstripped keys).
merged_df = pd.merge(
    left=df_dict,
    right=df_filtered_unique,
    left_on="patient_id",
    right_on="client_idcode",
    how="inner",
)

In [None]:
pat2vec_obj.all_patient_list[0]

In [None]:
config_obj.individual_patient_window_df[
    config_obj.individual_patient_window_df["client_idcode"]
    == pat2vec_obj.all_patient_list[0]
]

In [None]:
len(pat2vec_obj.all_patient_list), len(pat2vec_obj.config_obj.patient_dict.keys())

In [None]:
df[df["client_idcode"] == pat2vec_obj.all_patient_list[0]]

In [None]:
df

In [None]:
import random

# Build a synthetic binary outcome: pick one CUI at random and flag the patients
# in `df` whose annotations contain it.

# Restrict to only clients in df
dfa_filtered = dfa[dfa["client_idcode"].isin(df["client_idcode"])]

# cui → set of client_idcodes that have it
cui_to_clients = dfa_filtered.groupby("cui")["client_idcode"].apply(set).to_dict()

# Number of distinct patients in df, computed once instead of once per CUI
n_df_clients = df["client_idcode"].nunique()

# cuis present for some but not all df clients (so the outcome is not constant)
eligible_cuis = [
    cui for cui, clients in cui_to_clients.items() if 0 < len(clients) < n_df_clients
]

# Randomly pick one (uses the global `random` state)
if eligible_cuis:
    random_cui = random.choice(eligible_cuis)
else:
    random_cui = None
    # Make the degenerate case visible instead of silently producing a constant outcome
    logger.warning(
        "No CUI is present for some but not all patients; outcome_var_1 will be 0 for every row."
    )

# Which client_idcodes have that random_cui
clients_with_random_cui = set(dfa.loc[dfa["cui"] == random_cui, "client_idcode"])

# Flag in df: 1 when the patient has the chosen CUI, else 0
df["outcome_var_1"] = df["client_idcode"].isin(clients_with_random_cui).astype(int)

In [None]:
df["outcome_var_1"].value_counts()

In [None]:
from pat2vec.util.impute_data_for_pipe import mean_impute_dataframe


# Produce the imputed dataset from `df`, passing the synthetic outcome column name.
# NOTE(review): presumably missing feature values are mean-imputed and the
# outcome column is excluded or used for grouping — confirm against the helper.
df_imputed = mean_impute_dataframe(df, "outcome_var_1")

In [None]:
df_imputed