In [None]:
# Import libraries

import pandas as pd
import numpy as np
from datetime import datetime as dt
import pickle

# Preparing patient label data

In [None]:
# Paths
# ot_path: folder the notebook writes its output file to (relative path)
ot_path = '../../Data/'

# in_path: folder the input CSV files are read from
in_path = '/ctao_shared_data/Liver_optum/outputs/'

In [None]:
# Read the label file holding the patient information (one row per patient)
df_original = pd.read_csv(in_path + 'label.csv')

# Report how many patients (rows) were loaded
print("Total number of patients ", df_original.shape[0])

In [None]:
# Keep only patients aged 18 or older at the index date (LT procedure);
# anyone younger is dropped from the cohort.
is_adult = df_original['age_at_index'] >= 18
age_filtered = df_original[is_adult]

In [None]:
# Include only patients whose gender code is 'M' or 'F'; every other code is dropped
gender_code = age_filtered['gdr_cd']
is_male = gender_code == 'M'
is_female = gender_code == 'F'
age_filtered = age_filtered[is_male | is_female]

In [None]:
# Add day gap between Index date and MACE date.
# Work on an independent copy: age_filtered comes from boolean filtering, and adding
# columns to such a slice triggers pandas' SettingWithCopyWarning.
age_filtered = age_filtered.copy()

# first_lowest holds either a date string or the placeholder '0'
# (presumably "no MACE recorded" -- confirm against how label.csv is built).
# The placeholder is turned into NaT instead of a made-up 2000-01-01 date, so those
# rows get a NaN gap rather than a gap that becomes non-negative (and would be
# labeled as an event) for any index date on or before 2000-01-01.
# Parsing explicitly also guarantees a datetime64 column: a list mixing Timestamps and
# raw strings is kept as object dtype by current pandas and cannot be subtracted from
# a datetime column.
has_no_event = age_filtered['first_lowest'] == '0'
age_filtered['event_date'] = pd.to_datetime(age_filtered['first_lowest'].mask(has_no_event))

age_filtered['index_day'] = pd.to_datetime(age_filtered['index_day'])
# Signed number of days from the index date to the event date; NaN when there is no event date.
age_filtered['index_MACE_gap'] = (age_filtered['event_date'] - age_filtered['index_day']).dt.days

In [None]:
# Add interval labeling: a label is 1 when the index-to-MACE gap lies within
# [0, horizon] days (both ends included) and 0 otherwise.
age_filtered = age_filtered.copy()
gap = age_filtered['index_MACE_gap']
for label_col, horizon in (('0-30-label', 30), ('0-365-label', 365),
                           ('0-1095-label', 1095), ('0-1825-label', 1825)):
    age_filtered[label_col] = np.where((gap >= 0) & (gap <= horizon), 1, 0)

# Grouping claims into encounters

In [None]:
# Only the patient id and the date column are read from each claims file.
cl = ['patid','fst_dt']

# The diagnosis and procedure files are tab-separated; the drug file is comma-separated.
df1 = pd.read_csv(in_path + 'final_diag_all.csv', sep='\t', usecols=cl, low_memory=False)
df2 = pd.read_csv(in_path + 'final_drug_all.csv', sep=',', usecols=cl, low_memory=False)
df3 = pd.read_csv(in_path + 'final_proc_all.csv', sep='\t', usecols=cl, low_memory=False)

# Stack the three sources, then keep one row per distinct (patid, fst_dt) pair.
df_concat = pd.concat([df1, df2, df3])
clm_df = df_concat.drop_duplicates()
# encounter_id is "<patid>_<fst_dt>"; it is placed first in the resulting frame.
clm_df = clm_df.assign(
    encounter_id=clm_df['patid'].astype(str) + '_' + clm_df['fst_dt'].astype(str)
)[['encounter_id', 'patid', 'fst_dt']]

# Merging labeled patients with Encounters

In [None]:
# Join encounters to the labeled patients. No key is given, so pandas joins on every
# column name the two frames share, using its default inner join.
patients_encounters = pd.merge(clm_df, age_filtered)
patients_encounters


# For each event type, flag the encounters whose date is on or after the event's recorded date.
df = patients_encounters.copy()
# NOTE(review): the "<=" below compares the columns as they are stored; if both are still
# the raw CSV strings the comparison is lexicographic, which presumably relies on a shared
# zero-padded, year-first date format -- confirm against the input files.
for event in ('stroke', 'cardiac_arrest', 'heart_failure',
              'pulmonary_embolism', 'atrial_fibrillation', 'Myocardial_Infarction'):
    event_day = df[event + '_index_day']
    # A '0' entry is excluded explicitly, so it can never produce a positive label.
    has_happened = (event_day != '0') & (event_day <= df['fst_dt'])
    df[event + '_label'] = np.where(has_happened, 1, 0)

# Columns carried into the per-encounter table.
cl = ['encounter_id', 'patid', 'fst_dt', 'index_day', 'stroke_label', 'cardiac_arrest_label', 'heart_failure_label', 
      'pulmonary_embolism_label', 'atrial_fibrillation_label',
       'Myocardial_Infarction_label', 'mixed','first_lowest','yrdob', 'gdr_cd' , 'age_at_index',
       '0-30-label', '0-365-label','0-1095-label',  '0-1825-label']

# Parse the encounter date once; both derived columns below need it.
encounter_day = pd.to_datetime(df['fst_dt'])

# Build the table with .assign() instead of adding columns to df[cl] afterwards:
# df[cl] is a selection of df, and writing new columns into it is the chained-assignment
# pattern that raises SettingWithCopyWarning. The result has the same columns, in the
# same order, as selecting first and assigning afterwards.
patients_encounters = df[cl].assign(
    # Adding day gap for each encounter: signed days from the index date to the encounter date
    day_gap=(encounter_day - pd.to_datetime(df['index_day'])).dt.days,
    # Adding age at each encounter: encounter calendar year minus yrdob
    age_at_encounter=encounter_day.dt.year - df['yrdob'],
)

# Aggregating encounter data into 15-day bins

In [None]:
# Bin edges every 15 days. pd.cut uses right-closed intervals by default and each
# interval is labeled with its right edge, so a day_gap is rounded up to a multiple of 15.
bins = list(range(-30015, 30000, 15))
labels = bins[1:]
patients_encounters['binned'] = pd.cut(patients_encounters['day_gap'], bins=bins, labels=labels)

# agg_date is the index date shifted by the binned offset, in days.
bin_offset = pd.to_timedelta(patients_encounters['binned'].astype('int'), unit='d')
patients_encounters['agg_date'] = pd.to_datetime(patients_encounters['index_day']) + bin_offset

# Save final encounters file
patients_encounters.to_csv(ot_path + "patients_encounters_15_New.csv", index=False)