# Creating a table of sub-groups consisting of demographics and CUSUM-GFR data*

### * This code belongs to the paper "Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate"
##### To cite: Zafarnejad, R., Dumbauld, S., Dumbauld, D. et al. Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate. BMC Nephrol 23, 287 (2022). https://doi.org/10.1186/s12882-022-02910-8

In [2]:
import time
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import datetime

In [3]:
# Join the dates table with the diagnosis table on patient_sk; the eGFR column
# of the joined table is renamed to Trigger_eGFR.
diagnosis_data = pd.read_csv("DIAGNOSIS_2.csv")
full_date_dataset = pd.read_csv('merged_dataset_dates_timedeltas_full.csv')
merged_dataset = (
    full_date_dataset
    .merge(diagnosis_data, on='patient_sk', how='inner')
    .rename(columns={'eGFR_EPI': 'Trigger_eGFR'})
)

# ESRD measurements: drop the first CSV column, then exact duplicate rows.
datapool_ESRD = pd.read_csv('Final_ESRD_group_done_pandas.csv')
datapool_ESRD = datapool_ESRD.drop(columns=datapool_ESRD.columns[0]).drop_duplicates()

# Keep only ESRD patients that still have at least 9 eGFR values after the
# duplicate rows were removed.
_esrd_counts = datapool_ESRD.groupby('patient_sk').agg({'eGFR_EPI': 'count'}).reset_index()
datapool_ESRD_dropped = _esrd_counts[_esrd_counts.eGFR_EPI >= 9].drop(columns='eGFR_EPI')
datapool_ESRD = datapool_ESRD.merge(datapool_ESRD_dropped, on='patient_sk', how='inner')

# Control measurements: drop the first CSV column, rows with an infinite eGFR,
# and exact duplicate rows.
datapool_control = pd.read_csv("Final_Normal_group_done_pandas.csv")
datapool_control = datapool_control.drop(columns=datapool_control.columns[0])
datapool_control = datapool_control[~np.isinf(datapool_control.eGFR_EPI)].drop_duplicates()

# Same ">= 9 eGFR values" requirement for the control patients.
_control_counts = datapool_control.groupby('patient_sk').agg({'eGFR_EPI': 'count'}).reset_index()
datapool_control_dropped = _control_counts[_control_counts.eGFR_EPI >= 9].drop(columns='eGFR_EPI')
datapool_control = datapool_control.merge(datapool_control_dropped, on='patient_sk', how='inner')

# Unique patient identifiers remaining in each group.
# NOTE: nothing in this cell sorts the measurements; the row order is whatever
# the CSV files and the merges above produce.
patients_list_Normal = list(set(np.unique(datapool_control['patient_sk'].values)))
patients_list_ESRD = list(set(np.unique(datapool_ESRD['patient_sk'].values)))

In [4]:
# Date_seconds = seconds elapsed since the patient's earliest Date.

# --- ESRD group ---
datapool_ESRD['Date'] = pd.to_datetime(datapool_ESRD['Date'])
datapool_ESRD_dates = datapool_ESRD.groupby('patient_sk').agg({'Date': 'min'}).reset_index()
# After the merge, Date_x is the measurement date and Date_y the patient's earliest date.
datapool_ESRD = datapool_ESRD.merge(datapool_ESRD_dates, on='patient_sk', how='left')
datapool_ESRD['Date_seconds'] = datapool_ESRD['Date_x'] - datapool_ESRD['Date_y']
datapool_ESRD = datapool_ESRD.rename(columns={'Date_x': 'Date'}).drop(columns='Date_y')
datapool_ESRD['Date_seconds'] = datapool_ESRD['Date_seconds'].dt.total_seconds()

# --- Control group ---
datapool_control['Date'] = pd.to_datetime(datapool_control['Date'])
datapool_control_dates = datapool_control.groupby('patient_sk').agg({'Date': 'min'}).reset_index()
datapool_control = datapool_control.merge(datapool_control_dates, on='patient_sk', how='left')
datapool_control['Date_seconds'] = datapool_control['Date_x'] - datapool_control['Date_y']
datapool_control = datapool_control.rename(columns={'Date_x': 'Date'}).drop(columns='Date_y')
datapool_control['Date_seconds'] = datapool_control['Date_seconds'].dt.total_seconds()

# ESRD: keep only patients whose eGFR on their earliest Date is >= 60.
# (If a patient has several rows on that date, the first one is used.)
datapool_ESRD_patients = datapool_ESRD.groupby('patient_sk').agg({'Date': 'min'}).reset_index()

datapool_ESRD_patients_eGFR = (
    datapool_ESRD
    .merge(datapool_ESRD_patients, on=['patient_sk', 'Date'], how='inner')
    .drop_duplicates('patient_sk')
)
datapool_ESRD_patients_eGFR = datapool_ESRD_patients_eGFR[datapool_ESRD_patients_eGFR['eGFR_EPI'] >= 60]

datapool_ESRD_new = datapool_ESRD.merge(datapool_ESRD_patients_eGFR['patient_sk'], on='patient_sk', how='inner')
datapool_ESRD = datapool_ESRD_new.drop_duplicates()
patients_list_ESRD = list(set(np.unique(datapool_ESRD['patient_sk'].values)))

print(datapool_ESRD.patient_sk.unique().shape[0])

# Control: keep only patients whose minimum eGFR over all rows is >= 60.
datapool_control_patients = datapool_control.groupby('patient_sk').agg({'eGFR_EPI': 'min'})
datapool_control_patients = datapool_control_patients[datapool_control_patients['eGFR_EPI'] >= 60].reset_index()

# eGFR_EPI_x is the per-patient minimum, eGFR_EPI_y the actual measurement.
datapool_control = datapool_control_patients.merge(datapool_control, on='patient_sk', how='inner')
datapool_control = datapool_control.rename(columns={'eGFR_EPI_y': 'eGFR_EPI'}).drop(columns='eGFR_EPI_x')

# NOTE: the name says "above_50" but the threshold applied above is 60.
patients_list_control_above_50 = list(set(np.unique(datapool_control['patient_sk'].values)))
patients_list_Normal = patients_list_control_above_50

print(datapool_control.patient_sk.unique().shape[0])

5410
85699


### Now, taking care of the sub-groups

In [5]:
#Age ESRD

# One age per patient: the midpoint of the first and the last Age value in the
# patient's rows (not a median of all reported ages).
datapool_ESRD_age = (
    datapool_ESRD
    .groupby('patient_sk')
    .agg({'Age': lambda ages: (ages.iloc[0] + ages.iloc[-1]) / 2})
    .reset_index()
)

# Patients below 65 and their measurements.
patient_ESRD_working_age = datapool_ESRD_age.loc[datapool_ESRD_age.Age < 65].drop(columns='Age')
datapool_ESRD_working_age = datapool_ESRD.merge(patient_ESRD_working_age, on='patient_sk', how='inner')

# Patients aged 65 or more and their measurements.
patient_ESRD_none_working_age = datapool_ESRD_age.loc[datapool_ESRD_age.Age >= 65].drop(columns='Age')
datapool_ESRD_none_working_age = datapool_ESRD.merge(patient_ESRD_none_working_age, on='patient_sk', how='inner')

In [6]:
#Age Normal

# One age per patient: the midpoint of the first and the last Age value in the
# patient's rows (not a median of all reported ages).
datapool_control_age = (
    datapool_control
    .groupby('patient_sk')
    .agg({'Age': lambda ages: (ages.iloc[0] + ages.iloc[-1]) / 2})
    .reset_index()
)

# Patients below 65 and their measurements.
patient_control_working_age = datapool_control_age.loc[datapool_control_age.Age < 65].drop(columns='Age')
datapool_control_working_age = datapool_control.merge(patient_control_working_age, on='patient_sk', how='inner')

# Patients aged 65 or more and their measurements.
patient_control_none_working_age = datapool_control_age.loc[datapool_control_age.Age >= 65].drop(columns='Age')
datapool_control_none_working_age = datapool_control.merge(patient_control_none_working_age, on='patient_sk', how='inner')

In [7]:
#Gender ESRD

# Only rows whose Gender is exactly 'Female' or 'Male' are considered; each
# patient is assigned the first such value found in their rows.
_esrd_gender_known = datapool_ESRD.Gender.isin(['Female', 'Male'])
datapool_ESRD_gender = (
    datapool_ESRD[_esrd_gender_known]
    .groupby('patient_sk')
    .agg({'Gender': lambda values: values.iloc[0]})
    .reset_index()
)

# One-column (patient_sk) tables of the female and the male patients.
patients_ESRD_gender_female = datapool_ESRD_gender.loc[datapool_ESRD_gender.Gender == 'Female', ['patient_sk']]
patients_ESRD_gender_male = datapool_ESRD_gender.loc[datapool_ESRD_gender.Gender == 'Male', ['patient_sk']]

datapool_ESRD_Female = datapool_ESRD.merge(patients_ESRD_gender_female, on='patient_sk', how='inner')
datapool_ESRD_Male = datapool_ESRD.merge(patients_ESRD_gender_male, on='patient_sk', how='inner')

In [8]:
#Gender Normal

# Only rows whose Gender is exactly 'Female' or 'Male' are considered; each
# patient is assigned the first such value found in their rows.
_control_gender_known = datapool_control.Gender.isin(['Female', 'Male'])
datapool_control_gender = (
    datapool_control[_control_gender_known]
    .groupby('patient_sk')
    .agg({'Gender': lambda values: values.iloc[0]})
    .reset_index()
)

# One-column (patient_sk) tables of the female and the male patients.
patients_control_gender_female = datapool_control_gender.loc[datapool_control_gender.Gender == 'Female', ['patient_sk']]
patients_control_gender_male = datapool_control_gender.loc[datapool_control_gender.Gender == 'Male', ['patient_sk']]

datapool_control_Female = datapool_control.merge(patients_control_gender_female, on='patient_sk', how='inner')
datapool_control_Male = datapool_control.merge(patients_control_gender_male, on='patient_sk', how='inner')

In [9]:
#Race ESRD

# Each patient is assigned the Race value of their last row. No filtering of
# missing/unknown values happens here, so that last value may itself be missing.
datapool_ESRD_race = (
    datapool_ESRD
    .groupby('patient_sk')
    .agg({'Race': lambda values: values.iloc[-1]})
    .reset_index()
)

_esrd_is_african_american = datapool_ESRD_race.Race == 'African American'

patients_ESRD_African = datapool_ESRD_race[_esrd_is_african_american].drop(columns='Race')
datapool_ESRD_African = datapool_ESRD.merge(patients_ESRD_African, on='patient_sk', how='inner')

# Everyone else, including patients whose Race is missing.
patients_ESRD_None_African = datapool_ESRD_race[~_esrd_is_african_american].drop(columns='Race')
datapool_ESRD_None_African = datapool_ESRD.merge(patients_ESRD_None_African, on='patient_sk', how='inner')

In [10]:
#Race Normal

# Each patient is assigned the Race value of their last row. No filtering of
# missing/unknown values happens here, so that last value may itself be missing.
datapool_control_race = (
    datapool_control
    .groupby('patient_sk')
    .agg({'Race': lambda values: values.iloc[-1]})
    .reset_index()
)

_control_is_african_american = datapool_control_race.Race == 'African American'

patients_control_African = datapool_control_race[_control_is_african_american].drop(columns='Race')
datapool_control_African = datapool_control.merge(patients_control_African, on='patient_sk', how='inner')

# Everyone else, including patients whose Race is missing.
patients_control_None_African = datapool_control_race[~_control_is_african_american].drop(columns='Race')
datapool_control_None_African = datapool_control.merge(patients_control_None_African, on='patient_sk', how='inner')

In [11]:
#Hypertension (ESRD group)

# Parse both diagnosis-date columns; unparseable or missing values become NaT.
merged_dataset['Diagnosis_admission_date_Hypertension'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_Hypertension'], errors='coerce')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce')

# Placeholder date that stands for "no diagnosis" in the list below.
_missing_date = pd.Timestamp('1800-01-01')
_hypertension_dates = merged_dataset['Diagnosis_admission_date_Hypertension']
_esrd_dates = merged_dataset['Diagnosis_admission_date_ESRD']

Diagnosis_admission_date_Hypertension = list(_hypertension_dates.fillna(_missing_date))
Diagnosis_admission_date_ESRD = list(_esrd_dates)

# A row qualifies when hypertension was diagnosed on or before the ESRD
# diagnosis date. Comparisons involving NaT are False, so rows with either
# date missing never qualify; the placeholder date itself is excluded as well.
_diagnosed_before_esrd = (_hypertension_dates <= _esrd_dates) & (_hypertension_dates != _missing_date)
count_list = list(merged_dataset.loc[_diagnosed_before_esrd, 'patient_sk'])
count = len(count_list)

# De-duplicate the ids before merging: a patient_sk that occurs more than once
# would otherwise repeat every measurement row of that patient in the result.
patients_ESRD_Hypertension = pd.DataFrame({'patient_sk': count_list}).drop_duplicates()
datapool_ESRD_Hypertension = datapool_ESRD.merge(patients_ESRD_Hypertension, on=['patient_sk'], how='inner')

#Non-Hypertension: every remaining ESRD patient that did not qualify above.
patients_ESRD_Non_Hypertension = pd.DataFrame({'patient_sk': list(set(patients_list_ESRD).difference(set(count_list)))})
datapool_ESRD_Non_Hypertension = datapool_ESRD.merge(patients_ESRD_Non_Hypertension, on=['patient_sk'], how='inner')

In [12]:
#Diabetes (ESRD group)

# Parse both diagnosis-date columns; unparseable or missing values become NaT.
merged_dataset['Diagnosis_admission_date_Diabetes'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_Diabetes'], errors='coerce')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce')

# Placeholder date that stands for "no diagnosis" in the list below.
_missing_date = pd.Timestamp('1800-01-01')
_diabetes_dates = merged_dataset['Diagnosis_admission_date_Diabetes']
_esrd_dates = merged_dataset['Diagnosis_admission_date_ESRD']

Diagnosis_admission_date_Diabetes = list(_diabetes_dates.fillna(_missing_date))
Diagnosis_admission_date_ESRD = list(_esrd_dates)

# A row qualifies when diabetes was diagnosed on or before the ESRD diagnosis
# date. Comparisons involving NaT are False, so rows with either date missing
# never qualify; the placeholder date itself is excluded as well.
_diagnosed_before_esrd = (_diabetes_dates <= _esrd_dates) & (_diabetes_dates != _missing_date)
count_list = list(merged_dataset.loc[_diagnosed_before_esrd, 'patient_sk'])
count = len(count_list)

# De-duplicate the ids before merging: a patient_sk that occurs more than once
# would otherwise repeat every measurement row of that patient in the result.
patients_ESRD_Diabetes = pd.DataFrame({'patient_sk': count_list}).drop_duplicates()
datapool_ESRD_Diabetes = datapool_ESRD.merge(patients_ESRD_Diabetes, on=['patient_sk'], how='inner')

#Non-Diabetes: every remaining ESRD patient that did not qualify above.
patients_ESRD_Non_Diabetes = pd.DataFrame({'patient_sk': list(set(patients_list_ESRD).difference(set(count_list)))})
datapool_ESRD_Non_Diabetes = datapool_ESRD.merge(patients_ESRD_Non_Diabetes, on=['patient_sk'], how='inner')

In [13]:
#Cardiovascular_Disease (ESRD group)
# A patient counts as having cardiovascular disease when any one of the three
# diagnoses below is dated on or before the ESRD diagnosis date.

# Placeholder date that stands for "no diagnosis" in the lists below.
_missing_date = pd.Timestamp('1800-01-01')
_cardiovascular_columns = (
    'Diagnosis_admission_date_Coronary_Artery_Disease',
    'Diagnosis_admission_date_Cerebrovascular_Disease',
    'Diagnosis_admission_date_Peripheral_Vascular_Disease',
)

# Parse the date columns; unparseable or missing values become NaT.
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce')
_esrd_dates = merged_dataset['Diagnosis_admission_date_ESRD']
Diagnosis_admission_date_ESRD = list(_esrd_dates)

# Collect the qualifying patient ids diagnosis by diagnosis. Comparisons
# involving NaT are False, so rows with a missing date never qualify.
count_list = []
for _column in _cardiovascular_columns:
    merged_dataset[_column] = pd.to_datetime(merged_dataset[_column], errors='coerce')
    _dates = merged_dataset[_column]
    _diagnosed_before_esrd = (_dates <= _esrd_dates) & (_dates != _missing_date)
    count_list.extend(merged_dataset.loc[_diagnosed_before_esrd, 'patient_sk'])
count = len(count_list)

# Per-diagnosis date lists with missing dates replaced by the placeholder.
Diagnosis_admission_date_Coronary_Artery_Disease = list(merged_dataset['Diagnosis_admission_date_Coronary_Artery_Disease'].fillna(_missing_date))
Diagnosis_admission_date_Cerebrovascular_Disease = list(merged_dataset['Diagnosis_admission_date_Cerebrovascular_Disease'].fillna(_missing_date))
Diagnosis_admission_date_Peripheral_Vascular_Disease = list(merged_dataset['Diagnosis_admission_date_Peripheral_Vascular_Disease'].fillna(_missing_date))

# A patient with two or three of the diagnoses appears that many times in
# count_list. The ids must be de-duplicated before the merge, otherwise each
# measurement row of such a patient is repeated in the merged table.
patients_ESRD_Cardiovascular_Disease = pd.DataFrame({'patient_sk': count_list}).drop_duplicates()
datapool_ESRD_Cardiovascular_Disease = datapool_ESRD.merge(patients_ESRD_Cardiovascular_Disease, on=['patient_sk'], how='inner')


#Non-Cardiovascular_Disease: every remaining ESRD patient that did not qualify above.
patients_ESRD_Non_Cardiovascular_Disease = pd.DataFrame({'patient_sk': list(set(patients_list_ESRD).difference(set(count_list)))})
datapool_ESRD_Non_Cardiovascular_Disease = datapool_ESRD.merge(patients_ESRD_Non_Cardiovascular_Disease, on=['patient_sk'], how='inner')

In [14]:
#Hypercholesterolemia (ESRD group)

# Parse both diagnosis-date columns; unparseable or missing values become NaT.
merged_dataset['Diagnosis_admission_date_Hypercholesterolemia'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_Hypercholesterolemia'], errors='coerce')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce')

# Placeholder date that stands for "no diagnosis" in the list below.
_missing_date = pd.Timestamp('1800-01-01')
_hypercholesterolemia_dates = merged_dataset['Diagnosis_admission_date_Hypercholesterolemia']
_esrd_dates = merged_dataset['Diagnosis_admission_date_ESRD']

Diagnosis_admission_date_Hypercholesterolemia = list(_hypercholesterolemia_dates.fillna(_missing_date))
Diagnosis_admission_date_ESRD = list(_esrd_dates)

# A row qualifies when hypercholesterolemia was diagnosed on or before the
# ESRD diagnosis date. Comparisons involving NaT are False, so rows with
# either date missing never qualify; the placeholder date is excluded as well.
_diagnosed_before_esrd = (_hypercholesterolemia_dates <= _esrd_dates) & (_hypercholesterolemia_dates != _missing_date)
count_list = list(merged_dataset.loc[_diagnosed_before_esrd, 'patient_sk'])
count = len(count_list)

# De-duplicate the ids before merging: a patient_sk that occurs more than once
# would otherwise repeat every measurement row of that patient in the result.
patients_ESRD_Hypercholesterolemia = pd.DataFrame({'patient_sk': count_list}).drop_duplicates()
datapool_ESRD_Hypercholesterolemia = datapool_ESRD.merge(patients_ESRD_Hypercholesterolemia, on=['patient_sk'], how='inner')

#Non-Hypercholesterolemia: every remaining ESRD patient that did not qualify above.
patients_ESRD_Non_Hypercholesterolemia = pd.DataFrame({'patient_sk': list(set(patients_list_ESRD).difference(set(count_list)))})
datapool_ESRD_Non_Hypercholesterolemia = datapool_ESRD.merge(patients_ESRD_Non_Hypercholesterolemia, on=['patient_sk'], how='inner')

In [None]:
# Control patients listed in the hypertension CSV (first CSV column dropped).
patients_control_Hypertension = pd.read_csv('DIAGNOSIS_NORMAL_hypertension.csv')
patients_control_Hypertension = patients_control_Hypertension.drop(columns=patients_control_Hypertension.columns[0])

# Measurements of the control patients that appear in that list ...
datapool_control_Hypertension = datapool_control.merge(patients_control_Hypertension, on='patient_sk', how='inner')

# ... and of the control patients that do not.
_hypertension_ids = set(patients_control_Hypertension.patient_sk.unique())
patient_lislis = pd.DataFrame({'patient_sk': list(set(datapool_control.patient_sk) - _hypertension_ids)})
datapool_control_Non_Hypertension = datapool_control.merge(patient_lislis, on='patient_sk', how='inner')

# Notebook display: number of distinct non-hypertension control patients.
datapool_control_Non_Hypertension.patient_sk.unique().shape

## Now, putting the algorithm together!

In [16]:
#Normal subgroup for the disease
def _split_control_by_diagnosis(csv_path):
    """Split the control measurements by membership in a diagnosis CSV.

    Reads ``csv_path``, drops its first column, and returns a 4-tuple:
    the diagnosis table, the control measurements of patients listed in it,
    a one-column table of the control patients NOT listed in it, and the
    control measurements of those unlisted patients.
    """
    diagnosed = pd.read_csv(csv_path)
    diagnosed = diagnosed.drop(columns=diagnosed.columns[0])
    with_diagnosis = datapool_control.merge(diagnosed, on='patient_sk', how='inner')
    undiagnosed_ids = set(datapool_control.patient_sk) - set(diagnosed.patient_sk.unique())
    undiagnosed = pd.DataFrame({'patient_sk': list(undiagnosed_ids)})
    without_diagnosis = datapool_control.merge(undiagnosed, on='patient_sk', how='inner')
    return diagnosed, with_diagnosis, undiagnosed, without_diagnosis


(patients_control_Hypertension,
 datapool_control_Hypertension,
 patient_lislis,
 datapool_control_Non_Hypertension) = _split_control_by_diagnosis('DIAGNOSIS_NORMAL_hypertension.csv')

(patients_control_Diabetes,
 datapool_control_Diabetes,
 patient_lislis,
 datapool_control_Non_Diabetes) = _split_control_by_diagnosis('DIAGNOSIS_NORMAL_Diabetes.csv')

(patients_control_Cardiovascular_Disease,
 datapool_control_Cardiovascular_Disease,
 patient_lislis,
 datapool_control_Non_Cardiovascular_Disease) = _split_control_by_diagnosis('DIAGNOSIS_NORMAL_Cardivascular_Disease.csv')

(patients_control_Hypercholesterolemia,
 datapool_control_Hypercholesterolemia,
 patient_lislis,
 datapool_control_Non_Hypercholesterolemia) = _split_control_by_diagnosis('DIAGNOSIS_NORMAL_Hypercholesterolemia.csv')

In [17]:
# Each entry pairs an ESRD sub-group with the matching control sub-group;
# the last entry is the full, unsplit population.
Sub_Groups = [
    [datapool_ESRD_working_age, datapool_control_working_age],
    [datapool_ESRD_none_working_age, datapool_control_none_working_age],
    [datapool_ESRD_Female, datapool_control_Female],
    [datapool_ESRD_Male, datapool_control_Male],
    [datapool_ESRD_African, datapool_control_African],
    [datapool_ESRD_None_African, datapool_control_None_African],
    [datapool_ESRD_Hypertension, datapool_control_Hypertension],
    [datapool_ESRD_Non_Hypertension, datapool_control_Non_Hypertension],
    [datapool_ESRD_Diabetes, datapool_control_Diabetes],
    [datapool_ESRD_Non_Diabetes, datapool_control_Non_Diabetes],
    [datapool_ESRD_Cardiovascular_Disease, datapool_control_Cardiovascular_Disease],
    [datapool_ESRD_Non_Cardiovascular_Disease, datapool_control_Non_Cardiovascular_Disease],
    [datapool_ESRD_Hypercholesterolemia, datapool_control_Hypercholesterolemia],
    [datapool_ESRD_Non_Hypercholesterolemia, datapool_control_Non_Hypercholesterolemia],
    [datapool_ESRD, datapool_control],
]

# Empty results table.
Sub_grouo_table = pd.DataFrame()

In [19]:
# Each entry pairs an ESRD sub-group with the matching control sub-group;
# the last entry is the full, unsplit population.
Sub_Groups = [
    [datapool_ESRD_working_age, datapool_control_working_age],
    [datapool_ESRD_none_working_age, datapool_control_none_working_age],
    [datapool_ESRD_Female, datapool_control_Female],
    [datapool_ESRD_Male, datapool_control_Male],
    [datapool_ESRD_African, datapool_control_African],
    [datapool_ESRD_None_African, datapool_control_None_African],
    [datapool_ESRD_Hypertension, datapool_control_Hypertension],
    [datapool_ESRD_Non_Hypertension, datapool_control_Non_Hypertension],
    [datapool_ESRD_Diabetes, datapool_control_Diabetes],
    [datapool_ESRD_Non_Diabetes, datapool_control_Non_Diabetes],
    [datapool_ESRD_Cardiovascular_Disease, datapool_control_Cardiovascular_Disease],
    [datapool_ESRD_Non_Cardiovascular_Disease, datapool_control_Non_Cardiovascular_Disease],
    [datapool_ESRD_Hypercholesterolemia, datapool_control_Hypercholesterolemia],
    [datapool_ESRD_Non_Hypercholesterolemia, datapool_control_Non_Hypercholesterolemia],
    [datapool_ESRD, datapool_control],
]

# Number of distinct patients in the ESRD and the control half of every pair.
number_in_groups_ESRD = []
number_in_groups_Normal = []
for item in Sub_Groups:
    esrd_part, control_part = item
    number_in_groups_ESRD.append(len(esrd_part.patient_sk.unique()))
    number_in_groups_Normal.append(len(control_part.patient_sk.unique()))

# Results table: one row per sub-group, in the same order as Sub_Groups.
Sub_grouo_table = pd.DataFrame({
    'Sub groups': [
        'Adults under 65',
        'Adults above 65',
        'Female',
        'Male',
        'African American',
        'Other (Non-African American)',
        'Hypertension',
        'Non Hypertension',
        'Diabetes',
        'Non Diabetes',
        'Cardiovascular Disease',
        'Non Cardiovascular Disease',
        'Hypercholesterolemia',
        'Non Hypercholesterolemia',
        'ALL',
    ],
    '# of ESRD subgroup': number_in_groups_ESRD,
    '# of Normal subgroup': number_in_groups_Normal,
})

In [None]:
# Per-sub-group results, filled in the same order as Sub_Groups.
Accuracy_list = []
Sensetivity_list = []
Specificity_list = []
time_to_event_ESRD_mean_list = []
time_to_event_ESRD_median_list = []
time_to_event_ESRD_serror_list = []
mu_list = []
sigma_list = []
time_to_event_dataset = []
n_list_normal = []
proportion_list = []

#Hyperparametrs (identical for every sub-group, so they are set once):
V0 = 0     # value of Vi at the first row and at the first row of each new patient
w = 0.75   # constant added to Zi + Vi[i-1] at every step
T = -4     # a patient triggers at the first date where Vi <= T
a = 0.2    # not used in this cell

# The compiled function and the ESRD diagnosis dates do not depend on the
# sub-group, so they are created once instead of on every loop iteration.
from numba import jit


@jit(nopython=True)
def Vi_creator(Zi, patient_sk):
    """Return the one-sided CUSUM statistic Vi for consecutive rows.

    Zi         : 1-D array of standardized eGFR values.
    patient_sk : 1-D array of patient ids, aligned with Zi. Rows of the same
                 patient are expected to be adjacent; the statistic restarts
                 at V0 whenever the id differs from the previous row.

    Within a patient, Vi[i] = min(0, Zi[i] + w + Vi[i-1]).
    An empty input returns an empty array.
    """
    Vi = np.zeros(Zi.shape)
    if Vi.shape[0] == 0:
        # Nothing to accumulate; avoids writing Vi[0] on an empty array.
        return Vi
    Vi[0] = V0

    for i in range(1, Vi.shape[0]):
        if patient_sk[i] == patient_sk[i-1]:
            Vi[i] = (min(0.0, Zi[i] + w + Vi[i-1]))
        else:
            Vi[i] = V0

    return Vi


patients_ESRD_full_dates_pandas = pd.read_csv('Final_patients_ESRD_full_dates_pandas.csv')
new_table_dates = pd.DataFrame({'patient_sk' : patients_ESRD_full_dates_pandas['patient_sk'], 'Diagnosis_admission_date_ESRD' : patients_ESRD_full_dates_pandas['Diagnosis_admission_date_ESRD']})

for datapool in Sub_Groups:

    datapool_ESRD = datapool[0]
    datapool_control = datapool[1]

    # Mu and sigma are estimated from the control half of the pair only.
    # mu: mean of all control eGFR values.
    mu = np.mean(datapool_control['eGFR_EPI'])

    # Per-patient standard deviation and number of rows (same patient order).
    var_list = list(datapool_control.groupby('patient_sk').agg({'eGFR_EPI':'std'}).eGFR_EPI)
    n_list = list(datapool_control.groupby('patient_sk').size())

    # sigma: pooled within-patient standard deviation,
    # sqrt( sum_i (n_i - 1) * s_i^2 / (sum_i n_i - number_of_patients) ).
    n_1 = list((n_list - np.ones(len(n_list))).astype('int'))
    numerator = np.multiply(n_1, np.power(var_list, 2))
    denominator = sum(n_list) - len(n_list)
    sigma = np.power(sum(numerator)/denominator,0.5)

    mu_list.append(mu)
    sigma_list.append(sigma)

    ## Zi: eGFR standardized with the control mu and sigma (both halves).
    datapool_control['Zi'] = (datapool_control.eGFR_EPI - mu)/sigma
    datapool_ESRD['Zi'] = (datapool_ESRD.eGFR_EPI - mu)/sigma

    ## Vi: CUSUM statistic, restarted for every patient.
    datapool_control['Vi'] = Vi_creator(datapool_control['Zi'].values, datapool_control['patient_sk'].values)
    datapool_ESRD['Vi'] = Vi_creator(datapool_ESRD['Zi'].values, datapool_ESRD['patient_sk'].values)

    # Trigger tables: for every patient with at least one Vi <= T, the earliest
    # such date (Trigger_date) and the eGFR row(s) measured on that date.
    patients_control_trigger = datapool_control[datapool_control['Vi'] <= T].groupby('patient_sk').agg({'Date': 'min'})
    patients_control_trigger = patients_control_trigger.reset_index()
    patients_control_trigger = patients_control_trigger.merge(datapool_control[['patient_sk', 'eGFR_EPI', 'Date']], on=['patient_sk'], how='inner')
    patients_control_trigger = patients_control_trigger.rename({'Date_x':'Trigger_date', 'Date_y':'Date'}, axis = 1)
    patients_control_trigger = patients_control_trigger[patients_control_trigger.Trigger_date == patients_control_trigger.Date].copy()
    patients_control_trigger['New_label'] = np.ones(patients_control_trigger.shape[0])

    patients_ESRD_trigger = datapool_ESRD[datapool_ESRD['Vi'] <= T].groupby('patient_sk').agg({'Date': 'min'})
    patients_ESRD_trigger = patients_ESRD_trigger.reset_index()
    patients_ESRD_trigger = patients_ESRD_trigger.merge(datapool_ESRD[['patient_sk', 'eGFR_EPI', 'Date']], on=['patient_sk'], how='inner')
    patients_ESRD_trigger = patients_ESRD_trigger.rename({'Date_x':'Trigger_date', 'Date_y':'Date'}, axis = 1)
    patients_ESRD_trigger = patients_ESRD_trigger[patients_ESRD_trigger.Trigger_date == patients_ESRD_trigger.Date].copy()
    patients_ESRD_trigger['New_label'] = np.ones(patients_ESRD_trigger.shape[0])

    #Labeling: one row per patient; New_label is 1 for patients that triggered
    # and missing (NaN) for patients that never did.
    control_ids = list(datapool_control.patient_sk.unique())
    patients_Normal_labeled = pd.DataFrame({'patient_sk' : control_ids, 'Label' : list(np.ones(len(control_ids)))})
    patients_Normal_labeled = patients_Normal_labeled.merge(patients_control_trigger, on='patient_sk', how='left')
    patients_Normal_labeled = patients_Normal_labeled.drop_duplicates('patient_sk')
    patients_Normal_labeled = patients_Normal_labeled.drop('Date', axis = 1)

    esrd_ids = list(datapool_ESRD.patient_sk.unique())
    patients_ESRD_labeled = pd.DataFrame({'patient_sk' : esrd_ids, 'Label' : list(np.ones(len(esrd_ids)))})
    patients_ESRD_labeled = patients_ESRD_labeled.merge(patients_ESRD_trigger, on='patient_sk', how='left')
    patients_ESRD_labeled = patients_ESRD_labeled.drop_duplicates('patient_sk')
    patients_ESRD_labeled = patients_ESRD_labeled.drop('Date', axis = 1)

    #RIGHT detection in ESRD (true positives):
    numbet_of_ones_ESRD = patients_ESRD_labeled[patients_ESRD_labeled['New_label'] == 1].shape[0]

    #WRONG detection in Normal (false positives):
    numbet_of_ones_Normal = patients_Normal_labeled[patients_Normal_labeled['New_label'] == 1].shape[0]

    total_ESRD = patients_ESRD_labeled.shape[0]
    total_Normal = patients_Normal_labeled.shape[0]

    # Accuracy = true(positive and negative)/total population
    Accuracy = (numbet_of_ones_ESRD + (total_Normal - numbet_of_ones_Normal))/(total_ESRD + total_Normal)

    #Sensetivity
    tp = numbet_of_ones_ESRD
    fn = total_ESRD - numbet_of_ones_ESRD
    Sensetivity = tp/(tp+fn)

    #Specificity
    tn = total_Normal - numbet_of_ones_Normal
    fp = numbet_of_ones_Normal
    Specificity = tn/(tn+fp)

    Accuracy_list.append(Accuracy)
    Sensetivity_list.append(Sensetivity)
    Specificity_list.append(Specificity)

    # Time to event = ESRD diagnosis date minus trigger date; it is NaT for
    # patients that never triggered or that have no parseable diagnosis date.
    patients_ESRD_labeled = patients_ESRD_labeled.merge(new_table_dates, on = ['patient_sk'], how = 'inner')
    patients_ESRD_labeled['Diagnosis_admission_date_ESRD'] = pd.to_datetime(patients_ESRD_labeled['Diagnosis_admission_date_ESRD'], errors='coerce')

    lislis_ESRD = (patients_ESRD_labeled['Diagnosis_admission_date_ESRD'] - patients_ESRD_labeled['Trigger_date'])

    # Share of the sub-group's ESRD patients that triggered on or before the
    # diagnosis date (NaT compares False and is therefore not counted).
    zero_delta = pd.Timedelta(0)
    count_lislis = int((lislis_ESRD >= zero_delta).sum())
    proportion_list.append(count_lislis/len(list(datapool_ESRD.patient_sk.unique())))

    # Triggers that came after the diagnosis are set to a zero lead time.
    # Series.mask replaces the former element-wise chained assignment, which
    # raises SettingWithCopyWarning and may not write to the frame at all.
    lislis_ESRD = lislis_ESRD.mask(lislis_ESRD <= zero_delta, zero_delta)
    patients_ESRD_labeled['time_to_event_ESRD'] = lislis_ESRD

    # Series methods skip NaT for all three statistics. np.median would have
    # converted the Series to a plain array, where NaT is not skipped.
    # NOTE: the "serror" value is the sample standard deviation (ddof=1); it
    # is not divided by sqrt(n).
    time_to_event_ESRD_mean = lislis_ESRD.mean()
    time_to_event_ESRD_median = lislis_ESRD.median()
    time_to_event_ESRD_serror = lislis_ESRD.std(ddof=1)

    time_to_event_ESRD_mean_list.append(time_to_event_ESRD_mean)
    time_to_event_ESRD_median_list.append(time_to_event_ESRD_median)
    time_to_event_ESRD_serror_list.append(time_to_event_ESRD_serror)

    time_to_event_dataset.append(patients_ESRD_labeled)


# One value per sub-group, in the row order of Sub_grouo_table.
Sub_grouo_table['# of detected / sub-total'] = proportion_list
Sub_grouo_table['Mu'] = mu_list
Sub_grouo_table['Sigma'] = sigma_list
Sub_grouo_table['Accuracy'] = Accuracy_list
Sub_grouo_table['Sensitivity'] = Sensetivity_list
Sub_grouo_table['Specificity'] = Specificity_list
Sub_grouo_table['Mean time to event (ESRD diagnosis)'] = time_to_event_ESRD_mean_list
Sub_grouo_table['Median time to event (ESRD diagnosis)'] = time_to_event_ESRD_median_list
Sub_grouo_table['Standard error of time to event (ESRD diagnosis)'] = time_to_event_ESRD_serror_list



In [22]:
# Add an overall "Total" row to the sub-group table.
# The time-to-event column is coerced to timedelta first; unparseable values become NaT.
merged_dataset['time_to_event_ESRD'] = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Mu / Sigma / Accuracy / Sensitivity / Specificity are hard-coded literals here
# (presumably the whole-cohort results computed elsewhere -- TODO confirm);
# only the mean and median time to event are recomputed from merged_dataset.
new_row = {
    'Sub groups': 'Total',
    'Mu': 80.65333360443059,
    'Sigma': 7.7797820957459045,
    'Accuracy': 0.8344949456145935,
    'Sensitivity': 0.9096118299445471,
    'Specificity': 0.8297529726134494,
    'Mean time to event (ESRD diagnosis)': np.mean(merged_dataset.time_to_event_ESRD),
    'Median time to event (ESRD diagnosis)': np.median(merged_dataset.time_to_event_ESRD),
}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Concatenating a one-row frame is the supported equivalent; columns missing
# from new_row are filled with NaN/NaT exactly as append did.
Sub_grouo_table = pd.concat([Sub_grouo_table, pd.DataFrame([new_row])], ignore_index=True)

### Next: time-to-event analysis

In [23]:
# Make sure every per-sub-group frame stores its time to event as a timedelta
# (values that cannot be parsed become NaT).
for subgroup_frame in time_to_event_dataset:
    subgroup_frame.time_to_event_ESRD = pd.to_timedelta(subgroup_frame.time_to_event_ESRD, errors='coerce')

# Columns stripped from each sub-group frame below.
_COLUMNS_TO_DROP = ['Label', 'Trigger_date', 'eGFR_EPI', 'New_label', 'Diagnosis_admission_date_ESRD']


def _without_working_columns(frame):
    """Return a new frame equal to ``frame`` minus the columns in ``_COLUMNS_TO_DROP``."""
    return frame.drop(columns=_COLUMNS_TO_DROP)


# Positions in time_to_event_dataset map to sub-groups as follows.
time_to_event_dataset_Adults_under_65 = _without_working_columns(time_to_event_dataset[0])
time_to_event_dataset_Adults_above_65 = _without_working_columns(time_to_event_dataset[1])

time_to_event_dataset_Female = _without_working_columns(time_to_event_dataset[2])
time_to_event_dataset_Male = _without_working_columns(time_to_event_dataset[3])

time_to_event_dataset_African_American = _without_working_columns(time_to_event_dataset[4])
time_to_event_dataset_Non_African_American = _without_working_columns(time_to_event_dataset[5])

time_to_event_dataset_Hypertension = _without_working_columns(time_to_event_dataset[6])
time_to_event_dataset_Non_Hypertension = _without_working_columns(time_to_event_dataset[7])

time_to_event_dataset_Diabetes = _without_working_columns(time_to_event_dataset[8])
time_to_event_dataset_Non_Diabetes = _without_working_columns(time_to_event_dataset[9])

time_to_event_dataset_Cardiovascular_Disease = _without_working_columns(time_to_event_dataset[10])
time_to_event_dataset_Non_Cardiovascular_Disease = _without_working_columns(time_to_event_dataset[11])

time_to_event_dataset_Hypercholesterolemia = _without_working_columns(time_to_event_dataset[12])
time_to_event_dataset_Non_Hypercholesterolemia = _without_working_columns(time_to_event_dataset[13])

In [None]:
# Two-column projection of merged_dataset: patient id and time to ESRD diagnosis.
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of merged_dataset (which raises SettingWithCopyWarning).
time_to_event_Total = merged_dataset[['patient_sk', 'time_to_event_ESRD']].copy()
# Coerce to timedelta; values that cannot be parsed become NaT.
time_to_event_Total['time_to_event_ESRD'] = pd.to_timedelta(time_to_event_Total['time_to_event_ESRD'], errors='coerce')

In [25]:
# Sub-group frames paired with their complementary sub-group; each pair is
# compared by the statistical tests below.
LISLIS = [
    [time_to_event_dataset_Adults_under_65, time_to_event_dataset_Adults_above_65],
    [time_to_event_dataset_Female, time_to_event_dataset_Male],
    [time_to_event_dataset_African_American, time_to_event_dataset_Non_African_American],
    [time_to_event_dataset_Hypertension, time_to_event_dataset_Non_Hypertension],
    [time_to_event_dataset_Diabetes, time_to_event_dataset_Non_Diabetes],
    [time_to_event_dataset_Cardiovascular_Disease, time_to_event_dataset_Non_Cardiovascular_Disease],
    [time_to_event_dataset_Hypercholesterolemia, time_to_event_dataset_Non_Hypercholesterolemia],
]

#### Test #1: Chi-square contingency test

In [None]:
from scipy import stats

# For each pair of complementary sub-groups, run a chi-square contingency test
# on whether the time to event is strictly positive or exactly zero.
chi2_list = []
p_value_list = []
zero_delta = datetime.timedelta(0)

for group, complement in LISLIS:
    group_times = group.time_to_event_ESRD
    complement_times = complement.time_to_event_ESRD

    # 2x2 table: rows = (positive, zero) time to event,
    # columns = (sub-group, complementary sub-group).
    # NaT compares False on both tests, so missing values are counted in neither row.
    observed = np.array([
        [(group_times > zero_delta).sum(), (complement_times > zero_delta).sum()],
        [(group_times == zero_delta).sum(), (complement_times == zero_delta).sum()],
    ])
    chi2_stat, p_val, _dof, _expected = stats.chi2_contingency(observed, correction=False)

    # Each result is stored twice: once for each member of the pair.
    chi2_list.extend([chi2_stat, chi2_stat])
    p_value_list.extend([p_val, p_val])

# Placeholder for the trailing "total" entry, which has no comparison partner.
chi2_list.append('-')
p_value_list.append('-')

Sub_grouo_table['P Value - Chi2 (0.05)'] = p_value_list

#### Test #2: Student's t-test with unequal variances (Welch's t-test)

In [None]:
from scipy import stats

# For each pair of complementary sub-groups, compare the time to event
# (in seconds) with a two-sample t-test that does not assume equal variances.
ttest_list = []
p_value_list = []

for group, complement in LISLIS:
    # Missing (NaT) times are excluded before converting to seconds.
    group_seconds = group.time_to_event_ESRD.dropna().dt.total_seconds()
    complement_seconds = complement.time_to_event_ESRD.dropna().dt.total_seconds()

    t_stat, p_val = stats.ttest_ind(group_seconds, complement_seconds, axis=0, equal_var=False)

    # Each result is stored twice: once for each member of the pair.
    ttest_list.extend([t_stat, t_stat])
    p_value_list.extend([p_val, p_val])

# Placeholder for the trailing "total" entry, which has no comparison partner.
ttest_list.append('-')
p_value_list.append('-')

Sub_grouo_table['P Value - t-test (0.05)'] = p_value_list

In [None]:
# Display the sub-group summary table as this cell's output.
Sub_grouo_table

### For further information please contact rzz5164@psu.edu