In [None]:
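# This notebook:

# 1) Prepares the numeric time series (LabValues, VitalSigns, Procedures, extra BP readings) and bins every value into per-concept quartile buckets
# 2) Keeps only events recorded before hypertension (HT) onset (time_delta_in_days < 0), plus the extra BP readings
# 3) Assigns an integer id to every unique quartile-bucket concept
# 4) Builds one concept sequence per MRN and saves sequences and labels for the selected cohort
# 5) Extracts the drug & diagnosis columns as static features for the LSTM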

In [None]:
import os
import pickle
import sys
import time
from functools import reduce

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf

In [None]:
# Load the fetched time-series extracts for the full ~61k cohort (HT onset before 2013).
# Diagnosis and Drugs are loaded here but not referenced again in the cells shown below.
Diagnosis = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Fetched_Timeseries/Diagnosis/Diagnosis_time_series_all_new.pkl')
Drugs = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Fetched_Timeseries/Drugs/Drug_time_series_all_new.pkl')
Procedure = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Fetched_Timeseries/Procedure/procedure_time_series_all.pkl')
LabValues = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Fetched_Timeseries/LabValues/labvalue_time_series_all.pkl')
VitalSigns = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Fetched_Timeseries/VitalSigns/vitalsign_time_series_all.pkl')

# Extra blood-pressure readings: 'medical_record_number' arrives as an index level,
# so it is moved back into a regular column right after loading.
Extra_BP_values = pd.read_pickle(
    '/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Fetched_Timeseries/VitalSigns/vitalsign_time_series_extra_BP_30_days_before_event.pkl'
).reset_index(level='medical_record_number')

In [None]:
# 1) Bring LabValues, VitalSigns, Procedure and the extra BP readings into one common
#    layout (a single 'description' column per event) ahead of the quantile bucketing.

def _as_described_events(frame, columns, description_col, prefix):
    """Return `frame[columns]` with `description_col` renamed to 'description'.

    Rows with a missing description are dropped; every remaining description is cast
    to str and prefixed with `prefix` so the source table stays recognisable.
    """
    renamed = frame[columns].rename(columns={description_col: 'description'})
    described = renamed.dropna(subset=['description'], axis=0)
    return described.assign(description=prefix + described['description'].astype(str))


LabValues = _as_described_events(
    LabValues,
    ['medical_record_number','age_in_days','numeric_value', 'time_delta_in_days', 'test_name'],
    'test_name',
    'LabValue_',
)

VitalSigns = _as_described_events(
    VitalSigns,
    ['medical_record_number','age_in_days', 'context_procedure_code', 'numeric_value', 'time_delta_in_days'],
    'context_procedure_code',
    'VitalSigns_',
)

Procedure = _as_described_events(
    Procedure,
    ['medical_record_number', 'age_in_days', 'procedure_description', 'numeric_value', 'time_delta_in_days'],
    'procedure_description',
    'Procedure_',
)

# The extra BP extract names its age column 'age_in_days_HT_onset'; align it first.
# NOTE(review): unlike the other prefixes, 'Extra_BP_values' has no trailing underscore;
# left untouched because later cells select these rows by the 'Extra_BP' prefix.
Extra_BP_values = _as_described_events(
    Extra_BP_values.rename(columns={'age_in_days_HT_onset': 'age_in_days'}),
    ['medical_record_number','age_in_days', 'context_procedure_code', 'numeric_value', 'time_delta_in_days'],
    'context_procedure_code',
    'Extra_BP_values',
)

# Stack all sources, then keep only rows that have a time delta and a non-negative numeric value.
df = pd.concat([LabValues, VitalSigns, Procedure, Extra_BP_values])
df = df.dropna(subset=['time_delta_in_days', 'numeric_value'], axis=0)
df = df.loc[df['numeric_value'].ge(0)]

df.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Lab_Vital_Proc_noNaN_prepared_extra_BP_values_30_days.pkl')



In [None]:
# Number of distinct MRNs left after removing rows with missing or negative numeric values
df['medical_record_number'].nunique(dropna=False)


In [None]:
### Quantile bucketing of the numeric values
# Restart point: reload the cleaned frame written at the end of the preparation cell,
# so the bucketing below can be re-run without repeating the preparation.
df = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Lab_Vital_Proc_noNaN_prepared_extra_BP_values_30_days.pkl')


In [None]:
# Distinct MRNs in the reloaded frame (sanity check against the count before saving)
df['medical_record_number'].nunique(dropna=False)

In [None]:
# Bucket every numeric value into one of four quartile bands, computed separately for
# each description (lab test / vital sign / procedure / extra BP code):
#   value <  Q1         -> <description>_very_low
#   Q1 <= value <= Q2   -> <description>_medium_low
#   Q2 <  value <= Q3   -> <description>_medium_high
#   value >  Q3         -> <description>_very_high
df["high_low"] = ""

# Long-format quartile table: one row per (description, quantile).
df1 = pd.DataFrame(df.groupby(by='description')['numeric_value'].quantile([0.25,0.5,0.75]))
df1.reset_index(inplace=True)

df1.rename(columns={'level_1':'quantile'}, inplace=True)
df1['quantile'] = df1['quantile'].replace(0.25,'Q1').replace(0.50,'Q2').replace(0.75,'Q3')

print('Quantile df:',df1.head())

unique_descriptions =list(df1.description.unique())
print('Length of unique_descriptions:',len(unique_descriptions))

# Wide lookup (one row per description, columns Q1/Q2/Q3) broadcast onto the event rows.
# This replaces the former per-description loop, which wrote through chained indexing
# (df['high_low'][mask] = ...) and rescanned the whole frame four times per description.
quartiles = (
    df1.pivot(index='description', columns='quantile', values='numeric_value')
       .reindex(columns=['Q1', 'Q2', 'Q3'])
)
descriptions = df['description'].to_numpy(dtype=object)
values = df['numeric_value'].to_numpy(dtype=float)
q1 = df['description'].map(quartiles['Q1']).to_numpy(dtype=float)
q2 = df['description'].map(quartiles['Q2']).to_numpy(dtype=float)
q3 = df['description'].map(quartiles['Q3']).to_numpy(dtype=float)

# np.select takes the first matching condition, so the four tests below are evaluated as
# the mutually exclusive bands listed at the top of the cell. Rows without quartiles
# (NaN comparisons are all False) fall through to the default and keep ''.
suffix = np.select(
    [values < q1, values <= q2, values <= q3, values > q3],
    ['_very_low', '_medium_low', '_medium_high', '_very_high'],
    default='',
)
has_bucket = suffix != ''

high_low = np.full(len(df), '', dtype=object)
high_low[has_bucket] = descriptions[has_bucket] + suffix[has_bucket].astype(object)
df['high_low'] = high_low


print('Head of dataframe with unique_vals:',df.head())


In [None]:
# Checkpoint: persist the frame including the 'high_low' quartile-bucket column.
df.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/TimeSeries_Lab_Vital_Proc_ExtraBP_30_days_with_quantiles.pkl')


In [None]:
## 2) prepare dataset in terms of timing 

In [None]:
# Keep (a) every event recorded before HT onset (time_delta_in_days < 0) and
# (b) all extra BP readings regardless of their timing.

#df = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/TimeSeries_Lab_Vital_Proc_ExtraBP_30_days_with_quantiles.pkl')

timeseries_data = df

is_before_onset = timeseries_data['time_delta_in_days'] < 0
is_extra_bp = timeseries_data['description'].str.startswith('Extra_BP', na=False)

timeseries_data_before_HT_onset = timeseries_data.loc[is_before_onset]
extra_BP = timeseries_data.loc[is_extra_bp]

# One OR-ed mask instead of concatenating the two subsets: an extra BP reading with a
# negative time delta satisfies both conditions and would otherwise be written twice.
time_series = timeseries_data.loc[is_before_onset | is_extra_bp]

#save file
time_series.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Lab_VS_Proc_features_time_series_clean_and_with_quantiles_with_extra_BP_30_days.pkl')


In [None]:
# Reload the filtered time series (pre-onset events plus extra BP) written by the previous cell.
time_series_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Lab_VS_Proc_features_time_series_clean_and_with_quantiles_with_extra_BP_30_days.pkl')



In [None]:
# 3) prepare unique concepts: give every distinct 'high_low' bucket an integer id (starting at 1)

df = time_series_data

save_path =('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Dynamic_features/with_extra_BP_30_days_without_nan_numeric/')

# Ids follow the order in which the buckets first appear in the frame.
unique_values = df.high_low.unique()
unique_concepts_dictionary = {val: i for i, val in enumerate(unique_values, start=1)}

# Saving names of unique concepts with their numeric value in a csv file.
# NOTE(review): the file goes to the current working directory, not to save_path.
pd.DataFrame.from_dict(unique_concepts_dictionary, orient='index').to_csv('unique_concepts_30_days_no_numeric.csv')

# Translate every bucket name to its id in one pass. This replaces a loop that ran a
# full-frame np.where (and printed a frame head) once per concept; the resulting column
# is now a plain integer column instead of an object column seeded with ''.
df["unique_concept"] = df["high_low"].map(unique_concepts_dictionary)

print("Number of unique concepts:", len(unique_concepts_dictionary))
print(df[["high_low", "unique_concept"]].head())


In [None]:
# Checkpoint: persist the frame including the 'unique_concept' id column.
df.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Lab_VS_Pro_with_unique_concepts_with_extra_BP_30_days.pkl')


In [None]:
# Distinct MRNs that remain after the timing filter
df['medical_record_number'].nunique(dropna=False)

In [None]:
#start here again

In [None]:
## 4) Create individual records for MRNs & Split in Cohorts


In [None]:
# Restart point for section 4: reload the frame that carries the 'unique_concept' ids.
df= pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Lab_VS_Pro_with_unique_concepts_with_extra_BP_30_days.pkl')
# Alternative input kept for reference (an "All_features" file, per its name):
#df= pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/All_features_with_unique_concepts_with_extra_BP.pkl')


In [None]:
# Total number of event rows
df.shape[0]

In [None]:
# Number of distinct MRNs among those rows
df['medical_record_number'].nunique(dropna=False)

In [None]:
# Subset holding only the extra BP rows (description starts with 'Extra_BP')
extra_bp_rows = df['description'].astype(str).str.startswith('Extra_BP')
df_ExtraBP = df.loc[extra_bp_rows]
df_ExtraBP

In [None]:
# Persist the extra-BP-only subset.
df_ExtraBP.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Dynamic_features/Lab_VS_Pro_with_unique_concepts_ONLY_extra_BP.pkl')


In [None]:
# Complement of the extra BP subset: every row whose description does not start with 'Extra_BP'
not_extra_bp = ~df['description'].astype(str).str.startswith('Extra_BP')
df_without_ExtraBP = df.loc[not_extra_bp]
df_without_ExtraBP.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Dynamic_features/Lab_VS_Pro_with_unique_concepts_without_extra_BP.pkl')

#df = df_without_ExtraBP

In [None]:
# Display the subset without extra BP rows for a visual check.
df_without_ExtraBP

In [None]:
# Distinct MRNs that have at least one non-extra-BP event
df_without_ExtraBP['medical_record_number'].nunique(dropna=False)

In [None]:
# Input for the per-MRN sequences built below: here only the extra BP rows.
timeseries_data = df_ExtraBP

# Ground truth (labels) of the cohort to process. Exactly one read_pickle line is active;
# the commented lines are the alternative cohorts / splits.
#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2011/All3_ML_pipeline_final.pkl')

#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/All3_ML_pipeline_final.pkl')
#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/Cerebro_ML_pipeline_final.pkl')
#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/Heart_ML_pipeline_final.pkl')
ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/Renal_ML_pipeline_final.pkl')

#ground_truth_data = ground_truth_data.reset_index('medical_record_number')
# Directory that receives the sequence / label pickles written further down.
save_path =('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Dynamic_features/only_extra_BP/')




In [None]:
# Display the loaded ground-truth frame for a visual check.
ground_truth_data

In [None]:
# Build one concept sequence per MRN (X1) and the matching complication labels (y1).

# Restrict the time series to patients of the selected cohort first. Without this, X holds
# every MRN of the time series while y only holds cohort members, so the saved sequences
# and labels can differ in length and no longer line up row by row.
in_cohort = timeseries_data['medical_record_number'].isin(ground_truth_data['medical_record_number'])
cohort_timeseries = timeseries_data.loc[in_cohort]

# NOTE(review): the concepts of one MRN are collected in row order; no cell shown here sorts
# by age_in_days / time_delta_in_days - confirm the rows are chronological if order matters.
X = cohort_timeseries[['medical_record_number','unique_concept']].groupby(['medical_record_number']).aggregate(lambda x: list(x)).reset_index()

X1 = X.rename(columns={'unique_concept':'sequence'})
print('The head of main dataframe to be saved:',X1.head())


# Labels for exactly the MRNs that have a sequence, sorted by MRN like the groupby output.
y = (
    ground_truth_data[ground_truth_data.medical_record_number.isin(X.medical_record_number.unique())]
    .sort_values(by='medical_record_number')[['medical_record_number','Complication']]
    .astype('int')
)

# Fail loudly rather than save misaligned data (e.g. duplicated MRNs in the ground truth).
assert len(X1) == len(y) and (
    X1['medical_record_number'].astype('int').to_numpy() == y['medical_record_number'].to_numpy()
).all(), 'Sequences (X1) and labels (y) do not cover the same MRNs in the same order'

y1=y['Complication']

print('The head of y dataframe to be saved:',y1.head())


In [None]:
# Persist the sequences (X1) and labels (y1) with pickle, then read the sequences back once.
# Important: the file names carry the cohort - change them whenever another cohort is loaded!
data_file = save_path+"data_renal_2012.txt"
label_file = save_path+"label_renal_2012.txt"

for payload, target in ((X1, data_file), (y1, label_file)):
    with open(target, "wb") as fp:
        pickle.dump(payload, fp)

print('File saved succesfully')

# Round-trip check on the file that was just written.
with open(data_file, "rb") as fp:
    XX = pickle.load(fp)

print('Saved X dataframe:',XX.head())


In [None]:
# Number of labelled MRNs
y.shape[0]

In [None]:
################################

In [None]:
#5) Static features: outcome, train/test flag and demographics plus all diagnosis and drug columns

# Use the ground truth of the same cohort that was used for the time-series data above.

#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/All3_ML_pipeline_final.pkl')
#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/Cerebro_ML_pipeline_final.pkl')
#ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/Heart_ML_pipeline_final.pkl')
ground_truth_data = pd.read_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/Renal_ML_pipeline_final.pkl')


ground_truth_data = ground_truth_data.set_index('medical_record_number')

main_columns = ['Complication', 'train_test', 'age_in_days', 'gender', 'race', 'marital_status_code']
ground_truth_data_main = ground_truth_data[main_columns]
# Columns whose name starts with 'Diagnosis' / 'Drug'.
ground_truth_data_diag = ground_truth_data.filter(regex='^Diagnosis')
ground_truth_data_drug = ground_truth_data.filter(regex='^Drug')

# All three groups come from the same frame, so they are selected directly instead of being
# merged back together on the MRN. The merge multiplied rows whenever an MRN occurred more
# than once, and its '_y$' clean-up dropped any genuine feature column ending in '_y'.
static_columns = main_columns + list(ground_truth_data_diag.columns) + list(ground_truth_data_drug.columns)
merged_df = ground_truth_data[static_columns].reset_index('medical_record_number')

# Restore 'medical_record_number' as a regular column on the ground-truth frame.
ground_truth_data = ground_truth_data.reset_index('medical_record_number')

In [None]:
# Display the static-feature frame for a visual check.
merged_df

In [None]:
# Save the static features. The file name encodes cohort and split (here: Renal, 2012),
# so change it whenever a different ground-truth file is loaded above.
merged_df.to_pickle('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Static_features/Renal_2012_Drug_Diag_Static.pkl')


In [None]:
################################