Ensure pat2vec is on path

In [None]:
# Show the current working directory. Uses the standard library instead of the
# IPython `pwd` automagic, which is a NameError outside IPython or when
# automagic is switched off.
import os

os.getcwd()

In [None]:
import sys

# Candidate locations of the shared code folder on the different hosts.
# Each entry is pushed to the front of sys.path in turn, so the LAST entry
# listed here ends up with the highest import priority.
# NOTE(review): the 'gloabl_files' spelling is kept verbatim; presumably it
# matches the directory name on disk — verify before "correcting" it.
for candidate_root in (
    '/home/aliencat/samora/gloabl_files',
    '/data/AS/Samora/gloabl_files',
    '/home/jovyan/work/gloabl_files',
    '/home/cogstack/samora/_data/gloabl_files',
):
    sys.path.insert(0, candidate_root)

In [None]:
from pat2vec.util.config_pat2vec import config_class
from datetime import datetime

from post_processing import process_csv_files
from post_processing import extract_datetime_to_column
from pat2vec.pat2vec_pat_list.get_patient_treatment_list import get_all_patients_list
from pat2vec.util.post_processing import produce_filtered_annotation_dataframe

import pandas as pd

# Feature switches passed to the config object as `main_options`.
# Each key enables (True) or disables (False) one kind of information.
main_options_dict = dict(
    demo=True,  # demographic information (ethnicity mapped to UK census, age, death)
    bmi=True,  # BMI (Body Mass Index) tracking
    bloods=True,  # blood-related information
    drugs=True,  # drug-related information
    diagnostics=True,  # diagnostic information

    core_02=True,  # core_02 information
    bed=True,  # bed information
    vte_status=True,  # VTE status tracking
    hosp_site=True,  # hospital site information
    core_resus=True,  # core resuscitation information
    news=True,  # NEWS (National Early Warning Score) tracking

    smoking=True,  # smoking-related information
    annotations=True,  # EPR annotations
    annotations_mrc=True,  # MRC (additional clinical note observations index) annotations
    negated_presence_annotations=False,  # negated presence annotations (off here)
)


# Thresholds and allowed values used to filter annotations; handed to the
# config object as `annot_filter_options`.
annot_filter_arguments = {
    # Base concept accuracy.
    'acc': 0.8,
    # UMLS list of types for the MedCAT filter.
    'types': [
        'qualifier value',
        'procedure',
        'substance',
        'finding',
        'environment',
        'disorder',
        'observable entity',
    ],
    # Wider alternative type list, kept for reference (not active):
    # 'types': ['qualifier value', 'procedure', 'substance', 'finding',
    #     'environment', 'disorder', 'observable entity', 'organism',
    #     'phenomenon', 'anatomy', 'conceptual entity', 'physical object',
    #     'intellectual product', 'occupation or discipline',
    #     'mental or behavioral dysfunction', 'geographic area',
    #     'population group', 'biomedical or dental material',
    #     'medical device', 'classification', 'regulation or law',
    #     'health care activity', 'health care related organization',
    #     'professional or occupational group', 'group', 'attribute',
    #     'individual behavior']
    # Time values to include; they must be defined in the MedCAT model.
    'Time_Value': ['Recent', 'Past'],
    # Confidence threshold (float) for the time value.
    'Time_Confidence': 0.8,
    # Presence values to include.
    'Presence_Value': ['True'],
    # Confidence threshold (float) for the presence value.
    'Presence_Confidence': 0.8,
    # Subject values to include.
    'Subject_Value': ['Patient'],
    # Confidence threshold (float) for the subject value.
    'Subject_Confidence': 0.8,
}

## Init config obj
from datetime import datetime

# Every keyword argument for config_class is collected in one mapping so the
# whole run configuration can be inspected before the object is built.
# Keys are the exact keyword names; order and values are unchanged.
config_kwargs = {
    'remote_dump': False,  # remote data dumping (partially deprecated)
    'suffix': '',  # suffix for file names
    'treatment_doc_filename': 'treatment_docs.csv',  # treatment documentation file
    'treatment_control_ratio_n': 1,  # treatment-to-control ratio
    'proj_name': 'new_project',  # project name; patient data batches and vectors are stored here
    'current_path_dir': "",  # current path directory
    'main_options': main_options_dict,  # feature switches
    # Time window = start_date + years/months/days.
    'start_date': datetime(2020, 1, 1),
    'years': 0,
    'months': 0,
    'days': 2,
    # Environment flags; each environment needs its specific paths configured.
    'dgx': False,
    'dhcap': False,
    'dhcap02': True,
    'batch_mode': True,  # batch processing mode (the only functioning mode)
    'store_annot': True,  # store annotations (partially deprecated)
    'share_sftp': True,  # share via SFTP (partially deprecated)
    'multi_process': False,  # multi-process execution (deprecated)
    'annot_first': False,  # annotation priority (deprecated)
    'strip_list': True,  # check for completed patients before starting, to avoid redundancy
    'verbosity': 0,  # 0-9, amount of debug printing
    'random_seed_val': 42,  # seed for reproducibility of controls
    'testing': False,  # testing mode
    'use_controls': False,  # add the desired ratio of controls at random from the global pool
    'medcat': True,  # load MedCAT into memory and use it for annotating
    'start_time': datetime.now(),  # timestamp used for logging and the progress bar
    'patient_id_column_name': 'auto',  # 'auto' tries to find it; example "client_idcode"
    'annot_filter_options': annot_filter_arguments,  # annotation filtering options
    # Limits of the overall period data can be drawn from.
    # NOTE(review): presumably start_date above must fall inside these
    # limits — confirm against config_class.
    'global_start_year': 1995,
    'global_start_month': 1,
    'global_end_year': 2023,
    'global_end_month': 1,
    'shuffle_pat_list': False,  # shuffle the patient list
}

# Build the configuration object for this project.
config_obj = config_class(**config_kwargs)





In [None]:
from pat2vec.main_pat2vec import main

In [None]:
# Build the pat2vec object from the configuration created earlier; the cells
# below read its patient list and call its pat_maker method.
pat2vec_obj = main(
    cogstack=True,
    use_filter=False,
    json_filter_path=None,
    random_seed_val=42,
    hostname=None,
    config_obj=config_obj,
)


View patient list

In [None]:
# Last expression of the cell, so the notebook displays the patient list.
pat2vec_obj.all_patient_list

Make the patient vectors for patient 0

In [None]:
# Run pat_maker for 0.
# NOTE(review): presumably 0 is an index into all_patient_list — confirm.
pat2vec_obj.pat_maker(0)

In [None]:

# Name handed to process_csv_files as the output filename suffix.
output_csv_file = 'output_file.csv'
# Folder holding the per-patient CSV parts.
# NOTE(review): presumably this must match proj_name in the config — confirm.
input_directory = 'new_project/current_pat_lines_parts'

process_csv_files(
    input_directory,
    out_folder='outputs',
    output_filename_suffix=output_csv_file,
    part_size=336,
)



In [None]:
# Load the CSV into a DataFrame.
# NOTE(review): process_csv_files is called with out_folder='outputs' and this
# name passed as a filename *suffix*, so the file may not exist at this exact
# relative path — confirm where process_csv_files writes its output.
df = pd.read_csv(output_csv_file)

In [None]:
# Display the loaded DataFrame.
df

In [None]:



# Replace df with the result of extract_datetime_to_column.
# NOTE(review): presumably this adds a datetime column — confirm in post_processing.
df = extract_datetime_to_column(df)

### Filter the annotation batches by a SNOMED CUI and its related codes.

In [None]:
from snomed_methods import snomed_methods_v1

snomed_relations_obj = snomed_methods_v1.snomed_relations(medcat=True)

# Root concept whose related codes drive the annotation filter.
outcome_variable_cui_for_filter = '40733004'  # infection
print(outcome_variable_cui_for_filter)

filter_root_cui = outcome_variable_cui_for_filter
print(filter_root_cui)

# Expand the root code recursively (3 levels) into related codes and names.
(
    retrieved_codes_snomed_tree,
    retrieved_names_snomed_tree,
) = snomed_relations_obj.recursive_code_expansion(
    filter_root_cui,
    n_recursion=3,
    debug=False,
)

# Displayed by the notebook: first five codes, then the size of each list.
(
    retrieved_codes_snomed_tree[:5],
    len(retrieved_codes_snomed_tree),
    len(retrieved_names_snomed_tree),
)





In [None]:
# Preview the first ten names from the SNOMED tree expansion.
retrieved_names_snomed_tree[:10]

In [None]:
# Ask for the 25 entries most similar to the root CUI, with no type-id filter.
(
    retrieved_codes_medcat_cdb,
    retrieved_names_medcat_cdb,
) = snomed_relations_obj.get_medcat_cdb_most_similar(
    filter_root_cui,
    context_type='xxxlong',
    type_id_filter=[],
    topn=25,
)

In [None]:
# Preview the first ten names returned by the MedCAT CDB lookup.
retrieved_names_medcat_cdb[:10]

In [None]:
# Pool the MedCAT CDB results with the SNOMED tree results; going through a
# set removes duplicates (the order of the resulting lists is arbitrary).
pooled_names = retrieved_names_medcat_cdb + retrieved_names_snomed_tree
pooled_codes = retrieved_codes_medcat_cdb + retrieved_codes_snomed_tree

all_names_list = list(set(pooled_names))
all_codes_list = list(set(pooled_codes))

print(len(all_names_list))

In [None]:


# Patient list for the configuration held by the pat2vec object.
all_pat_list_ = get_all_patients_list(config_obj=pat2vec_obj.config_obj)

# Build the annotation frame with both the CUI filter (using the pooled code
# list) and the meta-annotation filter (using the configured filter options).
all_annot_filtered_df = produce_filtered_annotation_dataframe(
    cui_filter=True,
    meta_annot_filter=True,
    pat_list=all_pat_list_,
    config_obj=pat2vec_obj.config_obj,
    filter_custom_args=pat2vec_obj.config_obj.annot_filter_options,
    cui_code_list=all_codes_list,
)

In [None]:
# Display the filtered annotation DataFrame.
all_annot_filtered_df