In [None]:
import pandas as pd
import random

def generate_client_ids(n=10, length=10):
    """Generate ``n`` distinct random client ID strings made only of digits.

    Each ID is assembled one digit at a time so that leading zeros survive
    (drawing a single integer and converting it to text would drop them).

    Args:
        n: Number of IDs to return. Values <= 0 yield an empty list.
        length: Number of digits in each ID. Defaults to 10, the width that
            was previously hard-coded.

    Returns:
        list[str]: ``n`` unique digit strings, in the order they were drawn.

    Raises:
        ValueError: If ``length`` is negative, or if more IDs are requested
            than there are distinct digit strings of that length.
    """
    if length < 0:
        raise ValueError("length must be non-negative")
    if n > 10 ** length:
        # Without this guard the loop below could never terminate.
        raise ValueError(
            f"cannot generate {n} unique IDs of length {length}"
        )

    ids = []
    seen = set()
    while len(ids) < n:
        id_code = ''.join(str(random.randint(0, 9)) for _ in range(length))
        # Client IDs act as keys, so a repeated draw is discarded and redrawn.
        if id_code not in seen:
            seen.add(id_code)
            ids.append(id_code)
    return ids

# Size of the demo cohort to generate.
n = 10

# Demo cohort: one column of random client IDs. Replace this with your own
# cohort (hospital numbers) when running on real data.
# NOTE(review): `random` is only seeded in a later cell, so on a fresh kernel
# these IDs differ from run to run.
df = pd.DataFrame(data={"client_idcode": generate_client_ids(n)})

# Write the cohort to the working directory, leaving out the index column.
# NOTE(review): the config cell passes
# treatment_doc_filename='test_files/treatment_docs.csv' - confirm that the
# file written here is the one that is meant to be read.
df.to_csv(path_or_buf="treatment_docs.csv", index=False)

print("CSV file 'treatment_docs.csv' created!")
print(df.head())

In [None]:
import numpy as np 
import os
import sys
import shutil

# One seed shared by NumPy's global RNG and the stdlib `random` module, so that
# unit-test runs are repeatable.
random_seed_value = 42

for _seed_rng in (np.random.seed, random.seed):
    _seed_rng(random_seed_value)


In [None]:
# Show where the notebook runs from; the relative paths used later resolve
# against this directory.
print(f"Current Working Directory: {os.getcwd()}")

# Show the module search path as it stands before later cells extend it.
print(f"Python Path: {sys.path}")

In [None]:
# Start from a clean slate: when the flag is on, delete the output folders left
# behind by earlier runs. Errors (for example a folder that does not exist) are
# ignored.
clear_previous_outputs = True

if clear_previous_outputs:
    for _stale_dir in ('new_project', 'new_project_ipw', 'treatment_doc_extract'):
        shutil.rmtree(_stale_dir, ignore_errors=True)

In [None]:
# Locate the external resources this notebook depends on. Everything is found
# relative to the directory the notebook runs from.
current_dir = os.getcwd()

# Shared root two levels above the working directory, in relative form.
# (The misspelt name is kept because later cells refer to it.)
path_to_gloabl_files = '../../'

# The same shared root as an absolute path.
absolute_path = os.path.abspath(os.path.join(current_dir, path_to_gloabl_files))

# MedCAT model pack, expected under <root>/medcat_models.
path_to_medcat_model_pack = os.path.abspath(
    os.path.join(
        current_dir, '..', '..',
        'medcat_models',
        'medcat_model_pack_422d1d38fc58f158.zip',
    )
)

# SNOMED CT stated-relationship file inside the RF2 release tree under <root>/snomed.
path_to_snomed_ct_file = os.path.abspath(
    os.path.join(
        current_dir, '..', '..',
        'snomed',
        'SnomedCT_InternationalRF2_PRODUCTION_20231101T120000Z',
        'SnomedCT_InternationalRF2_PRODUCTION_20231101T120000Z',
        'Full',
        'Terminology',
        'sct2_StatedRelationship_Full_INT_20231101.txt',
    )
)

# pat2vec folder under the shared root, as an absolute path.
additional_path_to_pat2vec = os.path.abspath(
    os.path.join(path_to_gloabl_files, 'pat2vec')
)

# Echo the resolved locations so a wrong layout is visible immediately.
print(
    path_to_medcat_model_pack,
    path_to_snomed_ct_file,
    path_to_gloabl_files,
    additional_path_to_pat2vec,
    sep='\n',
)


In [None]:
# Highest priority on the import path: the pat2vec folder first, then the
# shared root (the latter as a relative entry).
sys.path[:0] = [additional_path_to_pat2vec, path_to_gloabl_files]

# Lowest priority: the parent and the grandparent of the working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)
sys.path.extend([parent_dir, grandparent_dir])


### Set up logger

In [None]:
from pat2vec.util.logger_setup import setup_logger

# Create the logger through pat2vec's setup_logger helper and bind it to `logger`.
# NOTE(review): handlers and log level are decided inside setup_logger and are not visible here.
logger = setup_logger()

In [None]:
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

import pandas as pd
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

from pat2vec.util.config_pat2vec import config_class
from pat2vec.util.post_processing import extract_datetime_to_column
from pat2vec.util.post_processing_process_csv_files import process_csv_files

# Feature switches for pat2vec: each key turns one data source on (True) or off (False).
main_options_dict = dict(
    # --- Core patient data ---
    demo=True,  # Demographics (ethnicity mapped to UK census categories, age, death)
    bmi=True,  # BMI (Body Mass Index)
    bloods=True,  # Blood-related information
    drugs=True,  # Drug-related information
    diagnostics=True,  # Diagnostic information

    # --- Observations ---
    core_02=True,  # core_02 information
    bed=True,  # Bed information
    vte_status=True,  # VTE status information
    hosp_site=True,  # Hospital site information
    core_resus=True,  # Core resuscitation information
    news=True,  # NEWS (National Early Warning Score) information

    # --- Smoking and MedCAT annotation sources ---
    smoking=True,  # Smoking-related information
    annotations=True,  # EPR document annotations via MedCAT
    annotations_mrc=True,  # MRC (additional clinical note observations index) annotations via MedCAT
    negated_presence_annotations=False,  # Negated presence annotations
    appointments=False,  # Appointments information
    annotations_reports=False,  # Reports information
    textual_obs=False,  # Textual observations (basic_observations index) annotations via MedCAT
)

# Annotation filter: only base annotations meeting these thresholds are included.
annot_filter_arguments = dict(
    # Minimum base concept accuracy.
    acc=0.8,
    # UMLS types passed to the MedCAT filter.
    types=[
        'qualifier value',
        'procedure',
        'substance',
        'finding',
        'environment',
        'disorder',
        'observable entity',
    ],
    # Wider alternative for `types`:
    # ['qualifier value', 'procedure', 'substance', 'finding', 'environment', 'disorder', 'observable entity', 'organism', 'phenomenon', 'anatomy', 'conceptual entity', 'physical object', 'intellectual product', 'occupation or discipline', 'mental or behavioral dysfunction', 'geographic area', 'population group', 'biomedical or dental material', 'medical device', 'classification', 'regulation or law', 'health care activity', 'health care related organization', 'professional or occupational group', 'group', 'attribute', 'individual behavior']

    # Values to include; they must be defined in the MedCAT model.
    # Example: ['Recent', 'Past', 'Subject/Experiencer']
    Time_Value=['Recent', 'Past'],
    # Confidence threshold, as a float.
    Time_Confidence=0.8,
    # Values to include.
    Presence_Value=['True'],
    # Confidence threshold, as a float.
    Presence_Confidence=0.8,
    # Values to include.
    Subject_Value=['Patient'],
    # Confidence threshold, as a float.
    Subject_Confidence=0.8,
)

# Optional filters applied to data batches before processing. Every setting is
# None here.
# NOTE(review): the config_class call in this cell passes
# data_type_filter_dict=None, so the dictionary built here serves as an example
# only unless that argument is changed.

# Term regexes for the EPR and MCT document batches.
epr_docs_term_regex: Optional[str] = None
mct_docs_term_regex: Optional[str] = None

# Example: bloods_filter_term_list = ['wbc']
# This will only include basic observations with this item name analysed.
bloods_filter_term_list: Optional[List[str]] = None

# Example: mct_docs_document_type_filter_list = ['KHMDC Integrated report']
# This will only include documents with this document type field value.
mct_docs_document_type_filter_list: Optional[List[str]] = None
epr_docs_document_type_filter_list: Optional[List[str]] = None

# Annotated with typing.Any: the builtin function `any` is not a type.
data_type_filter_dict: Dict[str, Any] = {
    'filter_term_lists': {
        'epr_docs': epr_docs_document_type_filter_list,
        'mct_docs': mct_docs_document_type_filter_list,
        'bloods': bloods_filter_term_list
    },
    'epr_docs_term_regex': epr_docs_term_regex,
    'mct_docs_term_regex': mct_docs_term_regex,
}

#Example date settings:
#start_date=(datetime(2020, 1, 1)) Start date for processing

# Define the length of the time window, example 1 year and 15 days, only data within this window will be processed.
# years=1,      # Number of years to add to the start date 
# months=0,  # Number of months to add to the start date
# days=15,  # Number of days to add to the start date

# Define the interval between time windows. Example 1 year. Each vector/row output will be based on this interval.
# time_window_interval_delta = relativedelta(years=1)

# lookback = False  # Direction of the time window relative to the start date: True = backward, False = forward. With lookback = False, our time window (+1 year, 15 days) therefore runs forward: 2020, 1, 1 - 2021, 1, 15.

# IPW settings:

# Init config obj

# Hypothetical date config_obj configuration:
# I want all patient data between 1 Feb 2015 and 1 Jun 2020 (the global_* values below). The batched patient data will be extracted and created for this date window.

# global_start_year=2015, 
# global_start_month=2,  
# global_end_year=2020,  
# global_end_month=6, 
# global_start_day = 1, 
# global_end_day = 1, 

# I want patient vectors starting from Feb 2019 to Feb 2020 as I would like to see if X medical event is recorded on those taking medication Y
# start_date=(datetime(2019, 2, 1)),  
# years=1, 
# months=0,  
# days=0, 
# lookback = False # 2019 to 2020 is forward in time.
# I would like a single vector for each patient
# time_window_interval_delta = relativedelta(years=1) 
# I would like 1 vector per month per patient for the 1 year time window
# time_window_interval_delta = relativedelta(months=1)

# Build the configuration object that drives this pat2vec run.
config_obj = config_class(
    # ---- Project layout and cohort input ----
    # Remote data dumping flag (partially deprecated).
    remote_dump=False,
    # Suffix for file names.
    suffix='',
    # Filename of the treatment documentation (the cohort list).
    treatment_doc_filename='test_files/treatment_docs.csv',
    # Ratio of treatment to control.
    treatment_control_ratio_n=1,
    # Project name; patient data batches and vectors are stored here.
    proj_name='new_project',
    # Current path directory.
    current_path_dir="",
    # Feature switches defined in main_options_dict.
    main_options=main_options_dict,

    # ---- Time window: start date plus years/months/days ----
    # Starting date for processing.
    start_date=datetime(1995, 1, 1),
    # Duration of the window, added to the start date.
    years=30,
    months=0,
    days=0,

    # ---- Execution mode ----
    # Batch processing mode; the only functioning mode.
    batch_mode=True,
    # Store annotations (partially deprecated).
    store_annot=True,
    # Share via SFTP (partially deprecated).
    share_sftp=True,
    # Multi-process execution (deprecated).
    multi_process=False,
    # Check for completed patients before starting, to avoid redundancy.
    strip_list=True,
    # Verbosity level 0-9 for debug messages.
    verbosity=0,
    # Seed for reproducibility of controls.
    random_seed_val=random_seed_value,
    # Testing mode; uses dummy data.
    testing=True,
    # Simulate a MedCAT model; used if testing is True.
    dummy_medcat_model=True,
    # If True, adds the desired ratio of controls at random from the global
    # pool; requires configuring with a master list of patients.
    use_controls=False,
    # Load MedCAT into memory and use it for annotating.
    medcat=False,
    # Current timestamp, used as the start time for logging and the progress bar.
    start_time=datetime.now(),
    # Patient ID column name; 'auto' tries to find it. Example: "client_idcode".
    patient_id_column_name='auto',
    # Annotation filtering options defined in annot_filter_arguments.
    annot_filter_options=annot_filter_arguments,

    # ---- Global limits of the time window data can be drawn from ----
    # The start should not precede the start date set above, and the global
    # start year/month/day must come before the end year/month/day. Global
    # dates are overwritten by individual patient windows when those are used.
    global_start_year=1995,
    global_start_month=1,
    global_end_year=2025,
    global_end_month=1,
    global_start_day=1,
    global_end_day=1,

    # ---- Individual patient windows (disabled) ----
    # Use these if each patient has their own time window. Requires preparing
    # a table of start dates.
    # individual_patient_window = True,
    # individual_patient_window_df = pd.read_csv('ipw_overlap.csv'),
    # individual_patient_window_start_column_name = 'updatetime_manual_offset',
    # individual_patient_id_column_name = 'client_idcode',
    # individual_patient_window_controls_method = 'full',

    # ---- Vector construction ----
    # Shuffle the patient list.
    shuffle_pat_list=False,
    # Time window each feature vector is collapsed into; years=1 is one vector
    # per year within the global time window.
    time_window_interval_delta=relativedelta(years=31),
    # Split clinical notes by date and treat them as individual documents with
    # extracted dates. Requires the note splitter module.
    split_clinical_notes=True,
    # Direction of the window: backwards in time if True, else forwards. Applies
    # both to individual patient windows and to the global start date.
    lookback=False,
    # Append ICD-10 codes to annotation batches; found under
    # current_pat_documents_annotations/%client_idcode%.csv.
    add_icd10=False,
    # Needs add_icd10 enabled as well; found under
    # current_pat_documents_annotations/%client_idcode%.csv.
    add_opc4s=False,
    # Force the MedCAT model path; if None the environment defaults are used.
    # Can also be set in paths.py with medcat_path.
    override_medcat_model_path=path_to_medcat_model_pack,
    # Data type filter dictionary; see the examples earlier in this cell.
    data_type_filter_dict=None,
    # Reapply the global time window filter after clinical note splitting.
    # Recommended when split_clinical_notes is enabled.
    filter_split_notes=True,
    # Fetch batches for the entire patient list and pre-populate the batch
    # folders with individual patient batches. Can cause out-of-memory issues.
    prefetch_pat_batches=False,
    # If an int > 0, sample that many treatment documents from
    # treatment_docs.csv. Useful for testing, debugging and pilot runs.
    sample_treatment_docs=5,
)

In [None]:
from pat2vec.main_pat2vec import main

In [None]:
# Instantiate the pat2vec driver with the configuration object.
# The seed comes from random_seed_value - the same variable config_obj is
# given - instead of a second hard-coded 42, so the two cannot drift apart.
pat2vec_obj = main(
    cogstack=True,
    use_filter=False,
    json_filter_path=None,
    random_seed_val=random_seed_value,
    hostname=None,
    config_obj=config_obj,
)


View patient list

In [None]:
# Peek at the first eight entries of the patient list held by pat2vec_obj.
pat2vec_obj.all_patient_list[0:8]

In [None]:
# Show the date_list attribute of the config object attached to pat2vec_obj
# (presumably the time-window dates derived from the configuration - TODO confirm).
pat2vec_obj.config_obj.date_list

Build the patient vectors for the first patient (index 0)

In [None]:
# Run pat_maker for index 0 only, as a single-patient check before the full loop
# over all patients further down (the index presumably refers to
# all_patient_list - TODO confirm).
pat2vec_obj.pat_maker(0)

In [None]:
# Remove specific patient raw documents and annotations:
from pat2vec.util.post_processing import remove_file_from_paths

# remove_file_from_paths(pat2vec_obj.all_patient_list[i])

In [None]:
# Maximum number of attempts per patient when pat_maker raises a KeyError.
MAX_RETRIES = 3

# Process every patient by index. A KeyError triggers removal of that patient's
# stored files followed by another attempt; any other exception skips the
# patient so that one failure does not stop the whole run.
for i in tqdm(range(len(pat2vec_obj.all_patient_list))):
    for _attempt in range(MAX_RETRIES):
        try:
            pat2vec_obj.pat_maker(i)
            break  # Success: no further attempts needed.

        except KeyError as e:
            print(f"KeyError at index {i}: {e}. Retrying after removal...")
            remove_file_from_paths(pat2vec_obj.all_patient_list[i])

        except Exception as e:
            # Deliberate best-effort: report and move on to the next patient.
            print(f"Exception at index {i}: {e}. Skipping this patient...")
            break
    else:
        # Reached only when every attempt ended in a KeyError, so this message
        # is no longer printed for patients skipped on other exceptions.
        print(f"Failed to process index {i} after {MAX_RETRIES} retries.")

    # Advance the progress bar once per patient rather than once per attempt,
    # so retries do not overcount.
    # NOTE(review): assumes pat2vec_obj.t counts patients - confirm.
    pat2vec_obj.t.update(1)

pat2vec_obj.t.close()

In [None]:
# Patient vectors are stored individually in this directory.
input_directory = f'{pat2vec_obj.proj_name}/current_pat_lines_parts'
output_csv_file = 'output_file'

# Folder that receives the joined output file.
directory = pat2vec_obj.proj_name + '/output_directory'

# exist_ok=True creates the folder only when it is missing, replacing the
# separate exists-then-create check.
os.makedirs(directory, exist_ok=True)

# Join the individual patient vectors into a single output file. This is useful
# for filtering.
# NOTE(review): part_size=336 is passed through unchanged; its meaning is
# defined by process_csv_files - confirm before tuning.
output_csv_file_filename = process_csv_files(
    input_directory,
    out_folder=directory,
    output_filename_suffix=output_csv_file,
    part_size=336,
)

In [None]:
# Load the joined patient-vector file returned by process_csv_files.
# NOTE: this rebinds `df`, which the first cell used for the generated client-ID table.
df = pd.read_csv(output_csv_file_filename)

In [None]:
# Pass the frame through pat2vec's extract_datetime_to_column and keep the result
# (presumably adds a datetime column derived from existing data - TODO confirm).
# NOTE(review): `df` is replaced by its own transform, so re-running this cell
# on its own applies the helper a second time.
df = extract_datetime_to_column(df)

In [None]:
# Display the patient-vector frame as the cell output.
df

#### Build all document batches dataframe:

In [None]:
# This will merge all document source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import build_merged_epr_mct_doc_df

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# NOTE(review): despite the df-style name, dfd presumably holds the path of the
# merged file rather than a DataFrame (the read below is commented out) - confirm.
dfd = build_merged_epr_mct_doc_df(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Uncomment to load the merged documents into memory:
#dfd = pd.read_csv(dfd)

### Build all annotation batches dataframe:

In [None]:
# This will merge all annotation source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import build_merged_epr_mct_annot_df

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# Keep the helper's return value under its own name so that `dfa` only ever
# refers to the loaded DataFrame, matching the *_path naming of the other merge cells.
dfa_path = build_merged_epr_mct_annot_df(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Load the merged annotations and display them as the cell output.
dfa = pd.read_csv(dfa_path)

dfa

### Build additional batches from individual patient data batches

In [None]:
# This will merge all drug source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import merge_drugs_csv

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# The helper's return value is used below as the path of the merged CSV.
merged_drugs_path = merge_drugs_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Load the merged file and display it as the cell output.
merged_drugs = pd.read_csv(merged_drugs_path)
merged_drugs

In [None]:
#dfmdi = pd.read_csv('new_project/merged_input_pat_batches/merged_drugs_batches.csv')

In [None]:
# for col in dfmdi.select_dtypes(exclude=[np.number]).columns:
#     assert dfmdi[col].astype(str).equals(merged_drugs[col].astype(str)), f"Mismatch in column: {col}"

In [None]:
# This will merge all diagnostics source batches into a single file. This is useful for filtering. May produce a large file.

from pat2vec.util.post_processing_build_methods import merge_diagnostics_csv

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# The helper's return value is used below as the path of the merged CSV.
merged_diagnostics_path = merge_diagnostics_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Load the merged file; it is kept in memory but not displayed in this cell.
merged_diagnostics = pd.read_csv(merged_diagnostics_path)

In [None]:
# Merge the NEWS batches with pat2vec's merge_news_csv helper
# (presumably one merged file, as in the drugs and diagnostics cells - TODO confirm).
from pat2vec.util.post_processing_build_methods import merge_news_csv

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# Only the helper's return value is kept; the merged data is not loaded here.
merged_news_path = merge_news_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Uncomment to load the merged NEWS data into memory:
#merged_news = pd.read_csv(merged_news_path)

In [None]:
# Merge the BMI batches with pat2vec's merge_bmi_csv helper
# (presumably one merged file, as in the drugs and diagnostics cells - TODO confirm).
from pat2vec.util.post_processing_build_methods import merge_bmi_csv

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# Only the helper's return value is kept; the merged data is not loaded here.
merged_bmi_path = merge_bmi_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Uncomment to load the merged BMI data into memory:
#merged_bmi = pd.read_csv(merged_bmi_path)

In [None]:
# Merge the bloods batches with pat2vec's build_merged_bloods helper
# (presumably one merged file, as in the drugs and diagnostics cells - TODO confirm).
from pat2vec.util.post_processing_build_methods import build_merged_bloods

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# The helper's return value is used below as the path of the merged CSV.
merged_bloods_path = build_merged_bloods(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Load the merged file and display it as the cell output.
merged_bloods = pd.read_csv(merged_bloods_path)
merged_bloods

In [None]:
#pd.read_csv('new_project/merged_input_pat_batches/merged_bloods_batches.csv')

In [None]:
# Merge the demographics batches with pat2vec's merge_demographics_csv helper
# (presumably one merged file, as in the drugs and diagnostics cells - TODO confirm).
from pat2vec.util.post_processing_build_methods import merge_demographics_csv

# Patient list taken from the pat2vec object.
all_pat_list = pat2vec_obj.all_patient_list

# The helper's return value is used below as the path of the merged CSV.
merged_demographics_path = merge_demographics_csv(all_pat_list, pat2vec_obj.config_obj, overwrite=True)

# Load the merged file and display it as the cell output.
merged_demographics = pd.read_csv(merged_demographics_path)

merged_demographics