# Performing the CUSUM algorithm to obtain qGFRv*

### * This code belongs to the paper "Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate"
##### To cite: Zafarnejad, R., Dumbauld, S., Dumbauld, D. et al. Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate. BMC Nephrol 23, 287 (2022). https://doi.org/10.1186/s12882-022-02910-8

In [3]:
import time
import pandas as pd
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.window import Window
import socket    
from datetime import datetime, timedelta

In [4]:
def _patients_with_enough_readings(frame, minimum=9):
    """Return a one-column frame (patient_sk) of patients with at least `minimum` non-null eGFR_EPI rows."""
    readings = frame.groupby('patient_sk').agg({'eGFR_EPI': 'count'}).reset_index()
    return readings[readings.eGFR_EPI >= minimum].drop('eGFR_EPI', axis=1)


# ESRD cohort: the first CSV column is discarded (presumably a saved index -- confirm),
# then exact duplicate rows are removed.
datapool_ESRD = pd.read_csv('Final_ESRD_group_done_pandas.csv')
datapool_ESRD = datapool_ESRD.drop(columns=datapool_ESRD.columns[0]).drop_duplicates()

# Control cohort: same first-column drop; rows with an infinite eGFR_EPI are removed
# before de-duplication.
datapool_control = pd.read_csv("Final_Normal_group_done_pandas.csv")
datapool_control = datapool_control.drop(columns=datapool_control.columns[0])
datapool_control = datapool_control[~np.isinf(datapool_control.eGFR_EPI)].drop_duplicates()

# Some patients have fewer than 9 data points AFTER dropping duplicates; keep only
# patients with at least 9.
datapool_ESRD_dropped = _patients_with_enough_readings(datapool_ESRD)
datapool_ESRD = datapool_ESRD.merge(datapool_ESRD_dropped, on='patient_sk', how='inner')

datapool_control_dropped = _patients_with_enough_readings(datapool_control)
datapool_control = datapool_control.merge(datapool_control_dropped, on='patient_sk', how='inner')

# Distinct patient identifiers per cohort.
# NOTE(review): nothing here sorts the rows by patient or measurement date; the later
# row-by-row CUSUM pass appears to rely on the CSV order -- confirm.
control_ids = np.unique(list(datapool_control['patient_sk']))
patients_list_Normal = list(set(control_ids))
esrd_ids = np.unique(list(datapool_ESRD['patient_sk']))
patients_list_ESRD = list(set(esrd_ids))

In [5]:
# Date_seconds = time elapsed since each patient's earliest Date, stored as
# total seconds (the timedelta is converted with .dt.total_seconds()).

def _with_seconds_since_first_date(frame):
    """Parse `Date`, then add `Date_seconds` relative to the patient's earliest Date.

    Returns the augmented frame and the per-patient table of earliest dates.
    """
    frame['Date'] = pd.to_datetime(frame['Date'])
    earliest = frame.groupby('patient_sk').agg({'Date': 'min'}).reset_index()
    # The left merge yields Date_x (row date) and Date_y (patient's earliest date).
    joined = frame.merge(earliest, on='patient_sk', how='left')
    joined['Date_seconds'] = (joined['Date_x'] - joined['Date_y']).dt.total_seconds()
    joined = joined.rename({'Date_x': 'Date'}, axis=1).drop('Date_y', axis=1)
    return joined, earliest


datapool_control, datapool_control_dates = _with_seconds_since_first_date(datapool_control)
datapool_ESRD, datapool_ESRD_dates = _with_seconds_since_first_date(datapool_ESRD)

In [6]:
# Control cohort: keep only patients whose lowest eGFR_EPI is >= 60.

datapool_control_patients = (
    datapool_control.groupby('patient_sk')
    .agg({'eGFR_EPI': 'min'})
    .loc[lambda per_patient: per_patient['eGFR_EPI'] >= 60]
    .reset_index()
)

# Merging on the key column alone keeps the cohort's own eGFR_EPI column un-suffixed;
# rows come out in the order of the retained patient keys.
datapool_control = datapool_control_patients[['patient_sk']].merge(
    datapool_control, on='patient_sk', how='inner'
)

patients_list_control_above_60 = list(set(np.unique(list(datapool_control['patient_sk']))))
patients_list_Normal = patients_list_control_above_60


# ESRD cohort: keep only patients whose eGFR_EPI on their earliest Date is >= 60
# (the first matching row is used when several rows share that date).

datapool_ESRD_patients = datapool_ESRD.groupby('patient_sk').agg({'Date': 'min'}).reset_index()

first_visit_rows = datapool_ESRD.merge(
    datapool_ESRD_patients, on=['patient_sk', 'Date'], how='inner'
).drop_duplicates('patient_sk')
datapool_ESRD_patients_eGFR = first_visit_rows[first_visit_rows['eGFR_EPI'] >= 60]

datapool_ESRD_new = datapool_ESRD.merge(
    datapool_ESRD_patients_eGFR['patient_sk'], on='patient_sk', how='inner'
)
datapool_ESRD = datapool_ESRD_new.drop_duplicates()
patients_list_ESRD = list(set(np.unique(list(datapool_ESRD['patient_sk']))))

In [8]:
# OLD formula: creatinine-based eGFR with sex-specific kappa/alpha, a race multiplier
# and a female multiplier (coefficients appear to be CKD-EPI 2009 -- confirm).

def _egfr_old_formula(frame):
    """Return the old-formula eGFR for every row of `frame`.

    Reads the columns `Gender`, `Race`, `Age` and `new_sCr`. Rows whose Gender is
    neither 'Female' nor 'Male' use kappa = 1 and alpha = 1, as in the original
    placeholder initialisation.
    """
    is_female = frame['Gender'] == 'Female'
    is_male = frame['Gender'] == 'Male'
    # Float arrays from the start: avoids writing floats into int columns through .loc.
    kappa = np.where(is_female, 0.7, np.where(is_male, 0.9, 1.0))
    alpha = np.where(is_female, -0.329, np.where(is_male, -0.411, 1.0))

    scr_over_kappa = frame['new_sCr'] / kappa
    # np.fmin / np.fmax ignore NaN, matching the previous DataFrame.min/max(axis=1)
    # against a column of ones.
    # NOTE(review): a missing new_sCr is therefore treated as a ratio of 1 -- confirm intended.
    egfr = (
        141
        * np.fmin(scr_over_kappa, 1.0) ** alpha
        * np.fmax(scr_over_kappa, 1.0) ** (-1.209)
        * 0.993 ** frame['Age']
    )
    egfr = egfr * np.where(frame['Race'] == 'African American', 1.159, 1.0)
    egfr = egfr * np.where(is_female, 1.018, 1.0)
    return egfr


# Snapshots of both cohorts taken before the new column is added.
datapool_ESRD_intact = datapool_ESRD.copy()
datapool_ESRD['newly_calculated_eGFR_old'] = _egfr_old_formula(datapool_ESRD)

datapool_control_intact = datapool_control.copy()
datapool_control['newly_calculated_eGFR_old'] = _egfr_old_formula(datapool_control)


In [9]:
# New 2022 formula: creatinine-based eGFR with sex-specific kappa/alpha and a female
# multiplier, no race term (coefficients appear to be the CKD-EPI 2021 equation -- confirm).

def _egfr_new_formula(frame):
    """Return the new-formula eGFR for every row of `frame`.

    Reads the columns `Gender`, `Age` and `new_sCr`. Rows whose Gender is neither
    'Female' nor 'Male' use kappa = 1 and alpha = 1, as in the original placeholder
    initialisation.
    """
    is_female = frame['Gender'] == 'Female'
    is_male = frame['Gender'] == 'Male'
    # Float arrays from the start: avoids writing floats into int columns through .loc.
    kappa = np.where(is_female, 0.7, np.where(is_male, 0.9, 1.0))
    alpha = np.where(is_female, -0.241, np.where(is_male, -0.302, 1.0))

    scr_over_kappa = frame['new_sCr'] / kappa
    # np.fmin / np.fmax ignore NaN, matching the previous DataFrame.min/max(axis=1)
    # against a column of ones.
    # NOTE(review): a missing new_sCr is therefore treated as a ratio of 1 -- confirm intended.
    egfr = (
        142
        * np.fmin(scr_over_kappa, 1.0) ** alpha
        * np.fmax(scr_over_kappa, 1.0) ** (-1.2)
        * 0.9938 ** frame['Age']
    )
    return egfr * np.where(is_female, 1.012, 1.0)


# Snapshots of both cohorts taken before the new column is added. The earlier version
# wrote a placeholder column of 1s to datapool_control while processing the ESRD cohort,
# so the control snapshot used to contain that placeholder; it no longer does.
datapool_ESRD_intact = datapool_ESRD.copy()
datapool_ESRD['newly_calculated_eGFR_new'] = _egfr_new_formula(datapool_ESRD)

datapool_control_intact = datapool_control.copy()
datapool_control['newly_calculated_eGFR_new'] = _egfr_new_formula(datapool_control)


In [10]:
# Only newly_calculated_eGFR_new is kept from here on; the source eGFR_EPI column and
# the old-formula column are removed from both cohorts.
superseded_egfr_columns = ['eGFR_EPI', 'newly_calculated_eGFR_old']
datapool_control = datapool_control.drop(columns=superseded_egfr_columns)
datapool_ESRD = datapool_ESRD.drop(columns=superseded_egfr_columns)

In [None]:
# For each age bin (<30, 30-39, 40-49, 50-59, 60-69, >=70): average the new eGFR per
# patient over the control rows falling in the bin, then print the mean of those
# per-patient averages.
control_ages = datapool_control['Age']
age_bin_masks = [
    control_ages < 30,
    (control_ages >= 30) & (control_ages < 40),
    (control_ages >= 40) & (control_ages < 50),
    (control_ages >= 50) & (control_ages < 60),
    (control_ages >= 60) & (control_ages < 70),
    control_ages >= 70,
]
for in_bin in age_bin_masks:
    per_patient_mean = (
        datapool_control[in_bin]
        .groupby('patient_sk')
        .agg({'newly_calculated_eGFR_new': 'mean'})
        .reset_index()
    )
    print(np.mean(per_patient_mean['newly_calculated_eGFR_new']))

In [None]:
# Mu and sigma

# Age bins as [lower, upper). The last bin starts AT 70: the previous `> 70` test left
# patients whose initial age is exactly 70 with starting_mu = 0 and excluded age-70 rows
# from the 70+ reference mean.
_AGE_BINS = [(-np.inf, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, np.inf)]


def _control_mean_egfr_per_age_bin(control):
    """Return, for each age bin, the mean over patients of their mean new eGFR in that bin."""
    bin_means = []
    for lower, upper in _AGE_BINS:
        in_bin = (control['Age'] >= lower) & (control['Age'] < upper)
        per_patient = control[in_bin].groupby('patient_sk').agg({'newly_calculated_eGFR_new': 'mean'}).reset_index()
        bin_means.append(np.mean(per_patient['newly_calculated_eGFR_new']))
    return bin_means


def _assign_starting_mu(frame, bin_means):
    """Set `starting_mu` on `frame` (in place) from the bin containing each row's `initial_age`.

    Rows whose initial_age falls in no bin (e.g. missing age) keep 0.0, as before.
    """
    frame['starting_mu'] = 0.0
    for (lower, upper), bin_mean in zip(_AGE_BINS, bin_means):
        in_bin = (frame['initial_age'] >= lower) & (frame['initial_age'] < upper)
        frame.loc[in_bin, 'starting_mu'] = bin_mean


# Initial age = the smallest Age recorded for the patient.
control_age_initial_df = datapool_control.groupby('patient_sk').agg({'Age' : 'min'}).rename(columns = {'Age' : 'initial_age'})
datapool_control = datapool_control.join(control_age_initial_df, on = 'patient_sk', how = 'inner')

ESRD_age_initial_df = datapool_ESRD.groupby('patient_sk').agg({'Age' : 'min'}).rename(columns = {'Age' : 'initial_age'})
datapool_ESRD = datapool_ESRD.join(ESRD_age_initial_df, on = 'patient_sk', how = 'inner')

# Starting point: both cohorts take their starting_mu from the CONTROL cohort's
# age-bin means, computed once.
_control_bin_means = _control_mean_egfr_per_age_bin(datapool_control)
_assign_starting_mu(datapool_control, _control_bin_means)
_assign_starting_mu(datapool_ESRD, _control_bin_means)


# Pooled standard deviation over control patients:
#   sigma = sqrt( sum_i (n_i - 1) * s_i^2 / (sum_i n_i - number_of_patients) )
# var_list holds the per-patient standard deviations s_i (the name is historical).
var_list = datapool_control.groupby('patient_sk').agg({'newly_calculated_eGFR_new':'std'})
var_list = list(var_list.newly_calculated_eGFR_new)

n_list =  datapool_control.groupby('patient_sk').agg({'patient_sk':'count'})
n_list = list(n_list.patient_sk)

n_1 = list((n_list - np.ones(len(n_list))).astype('int'))
numerator = np.multiply(n_1, np.power(var_list, 2))
denominator = sum(n_list) - len(n_list)
sigma = np.power(sum(numerator)/denominator,0.5)

print('sigma: ', sigma)


# Hyperparameters:

V0 = 0      # value Vi is (re)set to on the first row of every patient
w = 0.75    # constant added to the standardised deviation at every CUSUM step
T = -4      # a row signals when Vi <= T


## Standardised deviations (Zi) and the lower CUSUM statistic (Vi)
from numba import jit


@jit(nopython=True)
def Vi_creator(newly_calculated_eGFR_new, starting_mu, patient_sk, Age):
    """Return the CUSUM statistic Vi for every row.

    The arrays are walked in order. A row whose patient_sk differs from the previous
    row's (and row 0) gets V0. Otherwise the row's standardised deviation is
        Zi = (eGFR - (starting_mu - 0.81 * delta_age)) / sigma
    and Vi = min(0, Zi + w + previous Vi).

    Reads the module-level names sigma, w and V0.
    """
    Vi = np.zeros(newly_calculated_eGFR_new.shape)
    Vi[0] = V0

    for row in range(1, Vi.shape[0]):
        if patient_sk[row] != patient_sk[row - 1]:
            # First row of a new patient: restart the statistic.
            Vi[row] = V0
            continue

        # NOTE(review): delta_age is the gap to the PREVIOUS row, not to the patient's
        # first row -- confirm this is the intended age adjustment.
        delta_age = Age[row] - Age[row - 1]
        expected_eGFR = starting_mu[row] - 0.81 * (delta_age)
        z_score = (newly_calculated_eGFR_new[row] - expected_eGFR) / sigma
        Vi[row] = (min(0.0, z_score + w + Vi[row - 1]))

    return Vi

def _cusum_column(frame):
    """Run Vi_creator over the frame's rows in their current order."""
    return Vi_creator(
        frame['newly_calculated_eGFR_new'].values,
        frame['starting_mu'].values,
        frame['patient_sk'].values,
        frame['Age'].values,
    )


def _first_trigger_rows(frame):
    """Return the rows recorded on each patient's earliest Date with Vi <= T.

    Output columns: patient_sk, Trigger_date, newly_calculated_eGFR_new, Date and
    New_label (1.0 on every row). Patients that never reach the threshold are absent.
    """
    first_hit = frame[frame['Vi'] <= T].groupby('patient_sk').agg({'Date': 'min'}).reset_index()
    candidates = first_hit.merge(
        frame[['patient_sk', 'newly_calculated_eGFR_new', 'Date']], on=['patient_sk'], how='inner'
    )
    candidates = candidates.rename({'Date_x': 'Trigger_date', 'Date_y': 'Date'}, axis = 1)
    # Keep only the measurement(s) taken on the trigger date itself.
    on_trigger_date = candidates[candidates.Trigger_date == candidates.Date]
    on_trigger_date['New_label'] = list(np.ones(on_trigger_date.patient_sk.shape[0]))
    return on_trigger_date


datapool_control['Vi'] = _cusum_column(datapool_control)
datapool_ESRD['Vi'] = _cusum_column(datapool_ESRD)

# Trigger date and eGFR tables
patients_control_trigger = _first_trigger_rows(datapool_control)
patients_ESRD_trigger = _first_trigger_rows(datapool_ESRD)



# Labeling: one row per patient, with the trigger columns attached where a trigger exists.

def _one_row_per_patient(frame, trigger_rows):
    """Return one row per patient in `frame`, left-joined with that patient's trigger row.

    Every patient gets Label = 1.0; patients absent from `trigger_rows` have NaN in the
    trigger columns. When a patient has several trigger rows the first one is kept.
    """
    patient_ids = list(frame.patient_sk.unique())
    labeled = pd.DataFrame({'patient_sk' : patient_ids, 'Label' : list(np.ones(len(patient_ids)))})
    labeled = labeled.merge(trigger_rows, on='patient_sk', how='left')
    return labeled.drop_duplicates('patient_sk').drop('Date', axis = 1)


patients_Normal_labeled = _one_row_per_patient(datapool_control, patients_control_trigger)
patients_ESRD_labeled = _one_row_per_patient(datapool_ESRD, patients_ESRD_trigger)



# Accuracy = true (positive and negative) / total population.
# Patients without a trigger have New_label = NaN and are counted as not triggered.
# (This section used to appear twice verbatim; it is now computed once.)

# RIGHT detection in ESRD: ESRD patients that triggered
numbet_of_ones_ESRD = patients_ESRD_labeled[patients_ESRD_labeled['New_label'] == 1].shape[0]

# WRONG detection in Normal: control patients that triggered
numbet_of_ones_Normal = patients_Normal_labeled[patients_Normal_labeled['New_label'] == 1].shape[0]

total_ESRD = patients_ESRD_labeled.shape[0]
total_Normal = patients_Normal_labeled.shape[0]

# Confusion-matrix cells
tp = numbet_of_ones_ESRD
fn = total_ESRD - numbet_of_ones_ESRD
tn = total_Normal - numbet_of_ones_Normal
fp = numbet_of_ones_Normal

Accuracy = (numbet_of_ones_ESRD + (total_Normal - numbet_of_ones_Normal))/(total_ESRD + total_Normal)
Sensetivity = tp/(tp+fn)
Specificity = tn/(tn+fp)


print('Sensetivity: ', Sensetivity)
print('Specificity: ', Specificity)
print('Accuracy: ', Accuracy)
# Share of each cohort that triggered while the eGFR on the trigger row was still above 60.
print(patients_Normal_labeled[(patients_Normal_labeled['New_label'] == 1) & (patients_Normal_labeled['newly_calculated_eGFR_new'] > 60)].shape[0]/patients_Normal_labeled.shape[0], 'triggered > 60 - Normal')
print(patients_ESRD_labeled[(patients_ESRD_labeled['New_label'] == 1) & (patients_ESRD_labeled['newly_calculated_eGFR_new'] > 60)].shape[0]/patients_ESRD_labeled.shape[0], 'triggered > 60 - ESKD')
# Last expression of the notebook cell: displayed as the cell output.
numbet_of_ones_ESRD

In [None]:
diagnosis_data = pd.read_csv("DIAGNOSIS_2.csv")
patients_ESRD_full_dates_pandas = pd.read_csv('Final_patients_ESRD_full_dates_pandas.csv')

# ESRD diagnosis date per patient, attached to the labeled ESRD table.
new_table_dates = patients_ESRD_full_dates_pandas[['patient_sk', 'Diagnosis_admission_date_ESRD']].copy()
merged_dataset = patients_ESRD_labeled.merge(new_table_dates, on = ['patient_sk'], how = 'inner')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(
    merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce'
)

# Remaining diagnosis columns come from DIAGNOSIS_2.csv.
# NOTE(review): the eGFR_EPI column was dropped from the data pools earlier in this file,
# so this rename only takes effect if DIAGNOSIS_2.csv supplies an eGFR_EPI column -- confirm.
merged_dataset = merged_dataset.merge(diagnosis_data, on = 'patient_sk' , how = 'inner').rename(
    columns={'eGFR_EPI': 'Trigger_eGFR'}
)

# Time from the CUSUM trigger to the ESRD diagnosis (NaT when either date is missing).
lislis_ESRD = (merged_dataset['Diagnosis_admission_date_ESRD'] - merged_dataset['Trigger_date'])
merged_dataset['time_to_event_ESRD'] = lislis_ESRD

# Age subgroups: patients are split at 65 on the midpoint of their first and last
# listed Age (rows are taken in their current order).

def _split_at_age_65(frame):
    """Split `frame` into patients whose first/last-age midpoint is < 65 and >= 65.

    Returns (midpoint table, under-65 ids, under-65 rows, 65-plus ids, 65-plus rows).
    """
    midpoint_age = frame.groupby('patient_sk').agg(
        {'Age': lambda ages: (ages.iloc[-1] + ages.iloc[0])/2}
    ).reset_index()

    under_65_ids = midpoint_age[midpoint_age.Age < 65].drop('Age', axis =1)
    from_65_ids = midpoint_age[midpoint_age.Age >= 65].drop('Age', axis =1)

    under_65_rows = frame.merge(under_65_ids, on = ['patient_sk'], how = 'inner')
    from_65_rows = frame.merge(from_65_ids, on = ['patient_sk'], how = 'inner')
    return midpoint_age, under_65_ids, under_65_rows, from_65_ids, from_65_rows


# ESRD
(datapool_ESRD_age,
 patient_ESRD_working_age,
 datapool_ESRD_working_age,
 patient_ESRD_none_working_age,
 datapool_ESRD_none_working_age) = _split_at_age_65(datapool_ESRD)

# Normal
(datapool_control_age,
 patient_control_working_age,
 datapool_control_working_age,
 patient_control_none_working_age,
 datapool_control_none_working_age) = _split_at_age_65(datapool_control)

# Gender subgroups: each patient is assigned the first 'Female'/'Male' value found in
# their rows; rows with any other Gender value are ignored when choosing.

def _split_by_gender(frame):
    """Split `frame` into female and male patients.

    Returns (per-patient gender table, female ids, male ids, female rows, male rows).
    Patients with no 'Female'/'Male' row appear in neither subgroup.
    """
    has_binary_gender = (frame.Gender == 'Female') | (frame.Gender == 'Male')
    first_gender = (
        frame[has_binary_gender]
        .groupby('patient_sk')
        .agg({'Gender': lambda values: values.iloc[0]})
        .reset_index()
    )

    female_ids = first_gender[first_gender.Gender == 'Female'].drop_duplicates('patient_sk').drop('Gender', axis = 1)
    male_ids = first_gender[first_gender.Gender == 'Male'].drop_duplicates('patient_sk').drop('Gender', axis = 1)

    female_rows = frame.merge(female_ids, on = ['patient_sk'], how = 'inner')
    male_rows = frame.merge(male_ids, on = ['patient_sk'], how = 'inner')
    return first_gender, female_ids, male_ids, female_rows, male_rows


# ESRD
(datapool_ESRD_gender,
 patients_ESRD_gender_female,
 patients_ESRD_gender_male,
 datapool_ESRD_Female,
 datapool_ESRD_Male) = _split_by_gender(datapool_ESRD)

# Normal
(datapool_control_gender,
 patients_control_gender_female,
 patients_control_gender_male,
 datapool_control_Female,
 datapool_control_Male) = _split_by_gender(datapool_control)

# Race subgroups: each patient is assigned the Race value of their last row
# (no filtering of missing values is applied before taking it).

def _split_by_race(frame):
    """Split `frame` into 'African American' patients and everyone else.

    Returns (per-patient race table, AA ids, AA rows, non-AA ids, non-AA rows).
    """
    last_race = frame.groupby('patient_sk').agg({'Race': lambda values: values.iloc[-1]}).reset_index()
    is_african_american = last_race.Race == 'African American'

    african_ids = last_race[is_african_american].drop('Race', axis = 1)
    african_rows = frame.merge(african_ids, on = ['patient_sk'], how = 'inner')

    other_ids = last_race[(last_race.Race != 'African American')].drop('Race', axis = 1)
    other_rows = frame.merge(other_ids, on = ['patient_sk'], how = 'inner')
    return last_race, african_ids, african_rows, other_ids, other_rows


# ESRD
(datapool_ESRD_race,
 patients_ESRD_African,
 datapool_ESRD_African,
 patients_ESRD_None_African,
 datapool_ESRD_None_African) = _split_by_race(datapool_ESRD)

# Normal
(datapool_control_race,
 patients_control_African,
 datapool_control_African,
 patients_control_None_African,
 datapool_control_None_African) = _split_by_race(datapool_control)

# Hypertension: ESRD patients with a hypertension diagnosis dated on or before their
# ESRD diagnosis.

merged_dataset['Diagnosis_admission_date_Hypertension'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_Hypertension'], errors='coerce')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce')

# Missing hypertension dates are replaced by the sentinel 1800-01-01, which is then
# excluded explicitly in the comparison below.
Diagnosis_admission_date_Hypertension = [
    pd.Timestamp('1800-01-01') if pd.isnull(onset) else onset
    for onset in merged_dataset['Diagnosis_admission_date_Hypertension']
]
Diagnosis_admission_date_ESRD = list(merged_dataset['Diagnosis_admission_date_ESRD'])

# One entry per qualifying row of merged_dataset (looked up by row label).
count_list = [
    merged_dataset['patient_sk'][row]
    for row, (onset, esrd_date) in enumerate(zip(Diagnosis_admission_date_Hypertension, Diagnosis_admission_date_ESRD))
    if onset <= esrd_date and onset != pd.Timestamp('1800-01-01')
]
count = len(count_list)

patients_ESRD_Hypertension = pd.DataFrame({'patient_sk' : count_list})
datapool_ESRD_Hypertension = datapool_ESRD.merge(patients_ESRD_Hypertension, on = ['patient_sk'], how = 'inner')

# Non-hypertension: every other patient in patients_list_ESRD.
non_hypertension_ids = set(patients_list_ESRD) - set(count_list)
patients_ESRD_Non_Hypertension = pd.DataFrame({'patient_sk' : list(non_hypertension_ids)})
datapool_ESRD_Non_Hypertension = datapool_ESRD.merge(patients_ESRD_Non_Hypertension, on = ['patient_sk'], how = 'inner')

# Diabetes: ESRD patients with a diabetes diagnosis dated on or before their ESRD diagnosis.

merged_dataset['Diagnosis_admission_date_Diabetes'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_Diabetes'], errors='coerce')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce')

# Missing diabetes dates are replaced by the sentinel 1800-01-01, which is then
# excluded explicitly in the comparison below.
Diagnosis_admission_date_Diabetes = [
    pd.Timestamp('1800-01-01') if pd.isnull(onset) else onset
    for onset in merged_dataset['Diagnosis_admission_date_Diabetes']
]
Diagnosis_admission_date_ESRD = list(merged_dataset['Diagnosis_admission_date_ESRD'])

# One entry per qualifying row of merged_dataset (looked up by row label).
count_list = [
    merged_dataset['patient_sk'][row]
    for row, (onset, esrd_date) in enumerate(zip(Diagnosis_admission_date_Diabetes, Diagnosis_admission_date_ESRD))
    if onset <= esrd_date and onset != pd.Timestamp('1800-01-01')
]
count = len(count_list)

patients_ESRD_Diabetes = pd.DataFrame({'patient_sk' : count_list})
datapool_ESRD_Diabetes = datapool_ESRD.merge(patients_ESRD_Diabetes, on = ['patient_sk'], how = 'inner')

# Non-diabetes: every other patient in patients_list_ESRD.
non_diabetes_ids = set(patients_list_ESRD) - set(count_list)
patients_ESRD_Non_Diabetes = pd.DataFrame({'patient_sk' : list(non_diabetes_ids)})
datapool_ESRD_Non_Diabetes = datapool_ESRD.merge(patients_ESRD_Non_Diabetes, on = ['patient_sk'], how = 'inner')


#Cardiovascular_Disease
#
# A patient belongs to this subgroup when coronary artery disease,
# cerebrovascular disease or peripheral vascular disease was diagnosed on or
# before the ESRD diagnosis date.

# Placeholder used for "no diagnosis date recorded"; rows carrying it never
# qualify for the subgroup.
_NO_DIAGNOSIS_DATE = pd.Timestamp('1800-01-01')


def _diagnosed_on_or_before_ESRD(date_column):
    """Find patients whose *date_column* diagnosis is not later than ESRD.

    *date_column* and ``Diagnosis_admission_date_ESRD`` are parsed to
    datetimes in place in ``merged_dataset`` (unparseable values become NaT).

    Returns a tuple ``(dates, esrd_dates, patient_ids)`` where ``dates`` is the
    list of *date_column* values with missing entries replaced by the
    placeholder date, ``esrd_dates`` is the list of ESRD diagnosis dates, and
    ``patient_ids`` lists the ``patient_sk`` of every qualifying row.
    """
    merged_dataset[date_column] = pd.to_datetime(merged_dataset[date_column], errors='coerce')
    merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(
        merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce'
    )

    dates = [
        _NO_DIAGNOSIS_DATE if pd.isnull(diagnosis_date) else diagnosis_date
        for diagnosis_date in merged_dataset[date_column]
    ]
    esrd_dates = list(merged_dataset['Diagnosis_admission_date_ESRD'])

    patient_ids = [
        merged_dataset['patient_sk'][row]
        for row, (diagnosis_date, esrd_date) in enumerate(zip(dates, esrd_dates))
        if diagnosis_date <= esrd_date and diagnosis_date != _NO_DIAGNOSIS_DATE
    ]
    return dates, esrd_dates, patient_ids


(Diagnosis_admission_date_Coronary_Artery_Disease,
 Diagnosis_admission_date_ESRD,
 _coronary_ids) = _diagnosed_on_or_before_ESRD('Diagnosis_admission_date_Coronary_Artery_Disease')

(Diagnosis_admission_date_Cerebrovascular_Disease,
 Diagnosis_admission_date_ESRD,
 _cerebrovascular_ids) = _diagnosed_on_or_before_ESRD('Diagnosis_admission_date_Cerebrovascular_Disease')

(Diagnosis_admission_date_Peripheral_Vascular_Disease,
 Diagnosis_admission_date_ESRD,
 _peripheral_ids) = _diagnosed_on_or_before_ESRD('Diagnosis_admission_date_Peripheral_Vascular_Disease')

# A patient with more than one of the three diagnoses would otherwise be
# listed several times, and the inner merge below would then repeat each of
# that patient's eGFR rows once per listing. Keep every patient once, in
# first-seen order.
count_list = list(dict.fromkeys(_coronary_ids + _cerebrovascular_ids + _peripheral_ids))
# Number of distinct patients in the subgroup.
count = len(count_list)

patients_ESRD_Cardiovascular_Disease = pd.DataFrame({'patient_sk' : count_list})
datapool_ESRD_Cardiovascular_Disease = datapool_ESRD.merge(patients_ESRD_Cardiovascular_Disease, on = ['patient_sk'], how = 'inner')


#Non-Cardiovascular_Disease
# Every ESRD patient that is not in the cardiovascular subgroup.
patients_ESRD_Non_Cardiovascular_Disease = pd.DataFrame({'patient_sk' : list(set(patients_list_ESRD).difference(set(count_list)))})
datapool_ESRD_Non_Cardiovascular_Disease = datapool_ESRD.merge(patients_ESRD_Non_Cardiovascular_Disease, on = ['patient_sk'], how = 'inner')


# Hypercholesterolemia subgroup
#
# Parse both diagnosis-date columns; values that cannot be parsed become NaT.
merged_dataset['Diagnosis_admission_date_Hypercholesterolemia'] = pd.to_datetime(
    merged_dataset['Diagnosis_admission_date_Hypercholesterolemia'], errors='coerce'
)
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(
    merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce'
)

# Missing dates are replaced by this placeholder, and rows carrying the
# placeholder are excluded again when patients are selected below.
_cholesterol_placeholder = pd.Timestamp('1800-01-01')

Diagnosis_admission_date_Hypercholesterolemia = [
    _cholesterol_placeholder if pd.isnull(cholesterol_date) else cholesterol_date
    for cholesterol_date in merged_dataset['Diagnosis_admission_date_Hypercholesterolemia']
]
Diagnosis_admission_date_ESRD = list(merged_dataset['Diagnosis_admission_date_ESRD'])

# Keep patients whose diagnosis is dated on or before the ESRD diagnosis.
count_list = []
for i, (cholesterol_date, esrd_date) in enumerate(
        zip(Diagnosis_admission_date_Hypercholesterolemia, Diagnosis_admission_date_ESRD)):
    if cholesterol_date <= esrd_date and cholesterol_date != _cholesterol_placeholder:
        count_list.append(merged_dataset['patient_sk'][i])
count = len(count_list)

patients_ESRD_Hypercholesterolemia = pd.DataFrame({'patient_sk': count_list})
datapool_ESRD_Hypercholesterolemia = pd.merge(
    datapool_ESRD, patients_ESRD_Hypercholesterolemia, on=['patient_sk'], how='inner'
)

# Non-hypercholesterolemia subgroup: every ESRD patient not in count_list.
_non_cholesterol_ids = set(patients_list_ESRD) - set(count_list)
patients_ESRD_Non_Hypercholesterolemia = pd.DataFrame({'patient_sk': list(_non_cholesterol_ids)})
datapool_ESRD_Non_Hypercholesterolemia = pd.merge(
    datapool_ESRD, patients_ESRD_Non_Hypercholesterolemia, on=['patient_sk'], how='inner'
)


# Control ("Normal") subgroups per comorbidity.
#
# For each comorbidity: load the list of control patients from its CSV, drop
# the file's first column, inner-join it with the control pool to get the
# "with" subgroup, and join the remaining control patients to get the
# "without" subgroup. patient_lislis is a scratch table that is overwritten
# for every comorbidity.

# --- Hypertension ---
_raw = pd.read_csv('DIAGNOSIS_NORMAL_hypertension.csv')
patients_control_Hypertension = _raw.drop(_raw.columns[0], axis=1)
datapool_control_Hypertension = pd.merge(
    datapool_control, patients_control_Hypertension, on=['patient_sk'], how='inner'
)
_remaining = set(datapool_control.patient_sk) - set(patients_control_Hypertension.patient_sk.unique())
patient_lislis = pd.DataFrame({'patient_sk': list(_remaining)})
datapool_control_Non_Hypertension = pd.merge(
    datapool_control, patient_lislis, on=['patient_sk'], how='inner'
)

# --- Diabetes ---
_raw = pd.read_csv('DIAGNOSIS_NORMAL_Diabetes.csv')
patients_control_Diabetes = _raw.drop(_raw.columns[0], axis=1)
datapool_control_Diabetes = pd.merge(
    datapool_control, patients_control_Diabetes, on=['patient_sk'], how='inner'
)
_remaining = set(datapool_control.patient_sk) - set(patients_control_Diabetes.patient_sk.unique())
patient_lislis = pd.DataFrame({'patient_sk': list(_remaining)})
datapool_control_Non_Diabetes = pd.merge(
    datapool_control, patient_lislis, on=['patient_sk'], how='inner'
)

# --- Cardiovascular disease ---
_raw = pd.read_csv('DIAGNOSIS_NORMAL_Cardivascular_Disease.csv')
patients_control_Cardiovascular_Disease = _raw.drop(_raw.columns[0], axis=1)
datapool_control_Cardiovascular_Disease = pd.merge(
    datapool_control, patients_control_Cardiovascular_Disease, on=['patient_sk'], how='inner'
)
_remaining = set(datapool_control.patient_sk) - set(patients_control_Cardiovascular_Disease.patient_sk.unique())
patient_lislis = pd.DataFrame({'patient_sk': list(_remaining)})
datapool_control_Non_Cardiovascular_Disease = pd.merge(
    datapool_control, patient_lislis, on=['patient_sk'], how='inner'
)

# --- Hypercholesterolemia ---
_raw = pd.read_csv('DIAGNOSIS_NORMAL_Hypercholesterolemia.csv')
patients_control_Hypercholesterolemia = _raw.drop(_raw.columns[0], axis=1)
datapool_control_Hypercholesterolemia = pd.merge(
    datapool_control, patients_control_Hypercholesterolemia, on=['patient_sk'], how='inner'
)
_remaining = set(datapool_control.patient_sk) - set(patients_control_Hypercholesterolemia.patient_sk.unique())
patient_lislis = pd.DataFrame({'patient_sk': list(_remaining)})
datapool_control_Non_Hypercholesterolemia = pd.merge(
    datapool_control, patient_lislis, on=['patient_sk'], how='inner'
)


# Each entry pairs the ESRD pool of a subgroup with its control pool; the
# order matches the labels in Sub_grouo_table below.
Sub_Groups = [
    [datapool_ESRD_working_age, datapool_control_working_age],
    [datapool_ESRD_none_working_age, datapool_control_none_working_age],
    [datapool_ESRD_Female, datapool_control_Female],
    [datapool_ESRD_Male, datapool_control_Male],
    [datapool_ESRD_African, datapool_control_African],
    [datapool_ESRD_None_African, datapool_control_None_African],
    [datapool_ESRD_Hypertension, datapool_control_Hypertension],
    [datapool_ESRD_Non_Hypertension, datapool_control_Non_Hypertension],
    [datapool_ESRD_Diabetes, datapool_control_Diabetes],
    [datapool_ESRD_Non_Diabetes, datapool_control_Non_Diabetes],
    [datapool_ESRD_Cardiovascular_Disease, datapool_control_Cardiovascular_Disease],
    [datapool_ESRD_Non_Cardiovascular_Disease, datapool_control_Non_Cardiovascular_Disease],
    [datapool_ESRD_Hypercholesterolemia, datapool_control_Hypercholesterolemia],
    [datapool_ESRD_Non_Hypercholesterolemia, datapool_control_Non_Hypercholesterolemia],
]

# Number of distinct patients in each ESRD / control subgroup.
number_in_groups_ESRD = []
number_in_groups_Normal = []
for esrd_pool, control_pool in Sub_Groups:
    number_in_groups_ESRD.append(len(esrd_pool.patient_sk.unique()))
    number_in_groups_Normal.append(len(control_pool.patient_sk.unique()))

# Summary table: one row per subgroup with its two patient counts.
Sub_grouo_table = pd.DataFrame({
    'Sub groups': [
        'Adults under 65',
        'Adults above 65',
        'Female',
        'Male',
        'African American',
        'Other (Non-African American)',
        'Hypertension',
        'Non Hypertension',
        'Diabetes',
        'Non Diabetes',
        'Cardiovascular Disease',
        'Non Cardiovascular Disease',
        'Hypercholesterolemia',
        'Non Hypercholesterolemia',
    ],
    '# of ESRD subgroup': number_in_groups_ESRD,
    '# of Normal subgroup': number_in_groups_Normal,
})



# Run the CUSUM detection on every (ESRD, control) pair in Sub_Groups and
# collect, per subgroup, accuracy / sensitivity / specificity and statistics
# of the time between the CUSUM trigger and the ESRD diagnosis.
# Results are appended to the lists below in Sub_Groups order.
Accuracy_list = []
Sensetivity_list = []
Specificity_list = []
time_to_event_ESRD_mean_list = []
time_to_event_ESRD_median_list = []
time_to_event_ESRD_serror_list = []
mu_list = []
sigma_list = []
time_to_event_dataset = []
n_list_normal = []
proportion_list = []

for datapool in Sub_Groups:
    
    # NOTE(review): these assignments rebind the names datapool_ESRD and
    # datapool_control; after the loop they refer to the LAST subgroup pair,
    # not to any pool that carried these names before the loop.
    datapool_ESRD = datapool[0]
    datapool_control = datapool[1]
    
    # Mu and sigma
    

    var_list = []
    n_list = []

    # ____________________________________ Starting point : from CERNER  ____________________________________

    #everything is still there from previous analysis
    # _________________________________________________________________________________________________________________________________________________

    # Per-patient standard deviation of eGFR in the control subgroup
    # (despite its name, var_list holds standard deviations).
    var_list = datapool_control.groupby('patient_sk').agg({'newly_calculated_eGFR_new':'std'})
    var_list = list(var_list.newly_calculated_eGFR_new)

    # Per-patient number of rows in the control subgroup.
    n_list =  datapool_control.groupby('patient_sk').agg({'patient_sk':'count'})
    n_list = list(n_list.patient_sk)
    #calculating the mean and variance of the Normal sample

    # Pooled standard deviation:
    #   sqrt( sum((n_i - 1) * s_i**2) / (sum(n_i) - number_of_patients) )
    n_1 = list((n_list - np.ones(len(n_list))).astype('int'))
    numerator = np.multiply(n_1, np.power(var_list, 2))
    denominator = sum(n_list) - len(n_list)
    sigma = np.power(sum(numerator)/denominator,0.5)
    # NOTE(review): the pooled value computed above is discarded here and a
    # fixed constant is used for every subgroup instead; presumably this is
    # the sigma of the whole cohort -- confirm.
    sigma= 7.92457416532869
    #Hyperparameters:

    V0 = 0      # CUSUM value at the first reading of every patient
    w = 0.75    # constant added to every CUSUM increment
    T = -4      # a reading with Vi <= T counts as a trigger


    ## Compute Zi and Vi
    # NOTE(review): the import and the jitted function are re-created on every
    # pass. Vi_creator reads sigma, w and V0 from the surrounding scope rather
    # than taking them as arguments; check how numba treats such names before
    # hoisting the definition out of the loop.
    from numba import jit
    @jit(nopython=True)

    def Vi_creator(newly_calculated_eGFR_new, starting_mu, patient_sk, Age):
        # All four arguments are arrays indexed by row; consecutive rows with
        # the same patient_sk are treated as one patient's series.
        # Returns an array Vi with one CUSUM value per row.

        # Zi: eGFR minus the expected value (starting_mu reduced by 0.81 per
        # unit of age difference to the previous row), divided by sigma.
        # The first row of every patient gets Zi = 0.
        Zi = np.zeros(newly_calculated_eGFR_new.shape)
        Zi[0] = 0

        for i in range(1, Zi.shape[0]):
            if patient_sk[i] == patient_sk[i-1]:
                delta_age = Age[i] - Age[i-1]
                Zi[i] = (newly_calculated_eGFR_new[i] - (starting_mu[i] - 0.81 * (delta_age)))/sigma
            else:
                Zi[i] = 0


        # Vi: running sum of (Zi + w), capped at 0 from above and restarted
        # at V0 on the first row of every patient.
        Vi = np.zeros(Zi.shape)
        Vi[0] = V0

        for i in range(1, Vi.shape[0]):
            if patient_sk[i] == patient_sk[i-1]:
                Vi[i] = (min(0.0, Zi[i] + w + Vi[i-1]))
            else:
                Vi[i] = V0

        return Vi

    # Store the CUSUM value of every row as a new 'Vi' column (this modifies
    # the subgroup tables held in Sub_Groups).
    datapool_control['Vi'] = Vi_creator(datapool_control['newly_calculated_eGFR_new'].values, datapool_control['starting_mu'].values, datapool_control['patient_sk'].values, datapool_control['Age'].values)
    datapool_ESRD['Vi'] = Vi_creator(datapool_ESRD['newly_calculated_eGFR_new'].values, datapool_ESRD['starting_mu'].values, datapool_ESRD['patient_sk'].values, datapool_ESRD['Age'].values)


    # Build the trigger tables: for every patient with at least one row where
    # Vi <= T, take the earliest such Date, join back to the patient's rows and
    # keep only the row(s) on that date, which yields the eGFR at the trigger.

    patients_control_trigger = datapool_control[datapool_control['Vi'] <= T].groupby('patient_sk').agg({'Date': 'min'})
    patients_control_trigger = patients_control_trigger.reset_index()
    patients_control_trigger = patients_control_trigger.merge(datapool_control[['patient_sk', 'newly_calculated_eGFR_new', 'Date']], on=['patient_sk'], how='inner')
    # After the merge Date_x is the trigger date and Date_y the row's own date.
    patients_control_trigger = patients_control_trigger.rename({'Date_x':'Trigger_date'}, axis = 1)
    patients_control_trigger = patients_control_trigger.rename({'Date_y':'Date'}, axis = 1)
    patients_control_trigger = patients_control_trigger[patients_control_trigger.Trigger_date == patients_control_trigger.Date]
    # New_label = 1 marks "CUSUM triggered".
    patients_control_trigger['New_label'] = list(np.ones(patients_control_trigger.patient_sk.shape[0]))

    patients_ESRD_trigger = datapool_ESRD[datapool_ESRD['Vi'] <= T].groupby('patient_sk').agg({'Date': 'min'})
    patients_ESRD_trigger = patients_ESRD_trigger.reset_index()
    patients_ESRD_trigger = patients_ESRD_trigger.merge(datapool_ESRD[['patient_sk', 'newly_calculated_eGFR_new', 'Date']], on=['patient_sk'], how='inner')
    patients_ESRD_trigger = patients_ESRD_trigger.rename({'Date_x':'Trigger_date'}, axis = 1)
    patients_ESRD_trigger = patients_ESRD_trigger.rename({'Date_y':'Date'}, axis = 1)
    patients_ESRD_trigger = patients_ESRD_trigger[patients_ESRD_trigger.Trigger_date == patients_ESRD_trigger.Date]
    patients_ESRD_trigger['New_label'] = list(np.ones(patients_ESRD_trigger.patient_sk.shape[0]))

    # Labeling: one row per patient. The left merge leaves New_label as NaN for
    # patients that never triggered; drop_duplicates keeps a single row when
    # several readings share the trigger date.

    patients_Normal_labeled = pd.DataFrame({'patient_sk' : list(datapool_control.patient_sk.unique()) , 'Label' : list(np.ones(len(list(datapool_control.patient_sk.unique()))))}) 

    patients_Normal_labeled =  patients_Normal_labeled.merge(patients_control_trigger, on='patient_sk', how='left')
    patients_Normal_labeled = patients_Normal_labeled.drop_duplicates('patient_sk')
    patients_Normal_labeled = patients_Normal_labeled.drop('Date', axis = 1)



    patients_ESRD_labeled = pd.DataFrame({'patient_sk' : list(datapool_ESRD.patient_sk.unique()) , 'Label' : list(np.ones(len(list(datapool_ESRD.patient_sk.unique()))))}) 

    patients_ESRD_labeled =  patients_ESRD_labeled.merge(patients_ESRD_trigger, on='patient_sk', how='left')
    patients_ESRD_labeled = patients_ESRD_labeled.drop_duplicates('patient_sk')
    patients_ESRD_labeled = patients_ESRD_labeled.drop('Date', axis = 1)

    #Accuracy = true(positive and negative)/total population
    # ESRD NaN = 0.0
    # Normal NaN = 0.0

    # Triggered ESRD patients (true positives):
    numbet_of_ones_ESRD = patients_ESRD_labeled[patients_ESRD_labeled['New_label'] == 1].shape[0]

    # Triggered control patients (false positives):
    numbet_of_ones_Normal = patients_Normal_labeled[patients_Normal_labeled['New_label'] == 1].shape[0]

    total_ESRD = patients_ESRD_labeled.shape[0]
    total_Normal = patients_Normal_labeled.shape[0]

    # Accuracy = (true positives + true negatives) / all patients
    Accuracy = (numbet_of_ones_ESRD + (total_Normal - numbet_of_ones_Normal))/(total_ESRD + total_Normal)

    # Sensitivity = tp / (tp + fn); divides by zero if the ESRD subgroup is empty
    tp = numbet_of_ones_ESRD
    fn = total_ESRD - numbet_of_ones_ESRD
    Sensetivity = tp/(tp+fn)

    # Specificity = tn / (tn + fp); divides by zero if the control subgroup is empty
    tn = total_Normal - numbet_of_ones_Normal
    fp = numbet_of_ones_Normal
    Specificity = tn/(tn+fp)

    Accuracy_list.append(Accuracy)
    Sensetivity_list.append(Sensetivity)
    Specificity_list.append(Specificity)
    
    # NOTE(review): the same CSV is re-read on every pass of the loop.
    patients_ESRD_full_dates_pandas = pd.read_csv('Final_patients_ESRD_full_dates_pandas.csv')


    # Attach each patient's ESRD diagnosis date to the labeled table.
    new_table_dates = pd.DataFrame({'patient_sk' : patients_ESRD_full_dates_pandas['patient_sk'], 'Diagnosis_admission_date_ESRD' : patients_ESRD_full_dates_pandas['Diagnosis_admission_date_ESRD']})
    patients_ESRD_labeled = patients_ESRD_labeled.merge(new_table_dates, on = ['patient_sk'], how = 'inner')

    patients_ESRD_labeled['Diagnosis_admission_date_ESRD'] = pd.to_datetime(patients_ESRD_labeled['Diagnosis_admission_date_ESRD'], errors='coerce')

    # Time from trigger to ESRD diagnosis; positive when the trigger came first.
    # Assumes Trigger_date (taken from the Date column) is already a datetime
    # -- TODO confirm.
    lislis_ESRD = (patients_ESRD_labeled['Diagnosis_admission_date_ESRD'] - patients_ESRD_labeled['Trigger_date'])

    patients_ESRD_labeled['time_to_event_ESRD'] = lislis_ESRD
    
    # count_lislis: patients triggered on or before the diagnosis date.
    # count_lislis_NaT: patients with no computable time to event.
    count_lislis = 0
    count_lislis_NaT = 0
    for i in range(len(lislis_ESRD)):
        if lislis_ESRD[i] >= timedelta(0):
            count_lislis = count_lislis + 1
        if pd.isnull(lislis_ESRD[i]):
            count_lislis_NaT = count_lislis_NaT + 1
            
            
    # Share of the subgroup's ESRD patients triggered on or before diagnosis.
    proportion_list.append(count_lislis/len(list(datapool_ESRD.patient_sk.unique())))  
    
    # Triggers that came after the diagnosis are counted as zero lead time.
    patients_ESRD_labeled.loc[patients_ESRD_labeled['time_to_event_ESRD'] <= timedelta(0),'time_to_event_ESRD'] = timedelta(0)
    
    time_to_event_ESRD_mean = np.mean(patients_ESRD_labeled['time_to_event_ESRD'])
    time_to_event_ESRD_median = np.median(patients_ESRD_labeled['time_to_event_ESRD'])
    # Standard error: std divided by sqrt(number of non-missing values).
    time_to_event_ESRD_serror = np.std(patients_ESRD_labeled['time_to_event_ESRD'])/np.sqrt(len(patients_ESRD_labeled['time_to_event_ESRD']) - count_lislis_NaT)
    
        
    
    time_to_event_ESRD_mean_list.append(time_to_event_ESRD_mean)
    time_to_event_ESRD_median_list.append(time_to_event_ESRD_median)
    time_to_event_ESRD_serror_list.append(time_to_event_ESRD_serror)
    
    
    # Keep the full per-patient table of this subgroup for later use.
    time_to_event_dataset.append(patients_ESRD_labeled) 
    
    

# Fill the subgroup summary table with the metrics collected by the loop
# above, then add a final 'Total' row for the whole cohort.
#Sub_grouo_table['# of detected / sub-total'] = proportion_list
Sub_grouo_table['Mu'] = 'functin of age'
Sub_grouo_table['Sigma'] = sigma
Sub_grouo_table['Accuracy'] = Accuracy_list
Sub_grouo_table['Sensitivity'] = Sensetivity_list
Sub_grouo_table['Specificity'] = Specificity_list
Sub_grouo_table['Mean time to event (ESRD diagnosis)'] = time_to_event_ESRD_mean_list
Sub_grouo_table['Median time to event (ESRD diagnosis)'] = time_to_event_ESRD_median_list
Sub_grouo_table['Standard error of time to event (ESRD diagnosis)'] = time_to_event_ESRD_serror_list



# Whole-cohort time to event, with negative values counted as zero.
merged_dataset['time_to_event_ESRD'] = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')
merged_dataset_all_positive = merged_dataset.copy()
merged_dataset_all_positive.loc[merged_dataset_all_positive['time_to_event_ESRD'] <= timedelta(0), 'time_to_event_ESRD'] = timedelta(0)

# NOTE(review): the patient counts, accuracy/sensitivity/specificity and the
# 4941 used in the standard error are hard-coded; presumably they were copied
# from an earlier whole-cohort run -- confirm they still match the data.
new_row = {
    'Sub groups': 'Total',
    '# of ESRD subgroup': 5410,
    '# of Normal subgroup': 85699,
    'Mu': 'functin of age',
    'Sigma': sigma,
    'Accuracy': 0.8782227880889923,
    'Sensitivity': 0.8972273567467652,
    'Specificity': 0.877023069113992,
    'Mean time to event (ESRD diagnosis)': np.mean(merged_dataset_all_positive.time_to_event_ESRD),
    'Median time to event (ESRD diagnosis)': np.median(merged_dataset_all_positive.time_to_event_ESRD),
    'Standard error of time to event (ESRD diagnosis)': np.std(merged_dataset_all_positive['time_to_event_ESRD'])/np.sqrt((4941)),
}
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row on every
# pandas version.
Sub_grouo_table = pd.concat([Sub_grouo_table, pd.DataFrame([new_row])], ignore_index=True)


# Make sure every per-subgroup table stores time-to-event as a timedelta.
for item in time_to_event_dataset:
    item['time_to_event_ESRD'] = pd.to_timedelta(item['time_to_event_ESRD'], errors='coerce')

# Columns removed from the per-subgroup tables extracted below.
_dropped_columns = [
    'Label',
    'Trigger_date',
    'newly_calculated_eGFR_new',
    'New_label',
    'Diagnosis_admission_date_ESRD',
]

# The first six entries follow the Sub_Groups order: age, sex, race.
time_to_event_dataset_Adults_under_65 = time_to_event_dataset[0].drop(columns=_dropped_columns)
time_to_event_dataset_Adults_above_65 = time_to_event_dataset[1].drop(columns=_dropped_columns)

time_to_event_dataset_Female = time_to_event_dataset[2].drop(columns=_dropped_columns)
time_to_event_dataset_Male = time_to_event_dataset[3].drop(columns=_dropped_columns)

time_to_event_dataset_African_American = time_to_event_dataset[4].drop(columns=_dropped_columns)
time_to_event_dataset_Non_African_American = time_to_event_dataset[5].drop(columns=_dropped_columns)

# Show the summary table as the cell output.
Sub_grouo_table
#Sub_grouo_table.to_csv('Gender_Race_impact_corrected_old_formula.csv')

In [120]:
# Write the subgroup summary table to a CSV file in the working directory.
Sub_grouo_table.to_csv('Final_Sub_Groups_NEW_CUSUM.csv')

In [None]:

# Age bands, ESRD patients
#
# NOTE(review): datapool_ESRD is whatever this name is bound to when the cell
# runs; the subgroup loop earlier in the file rebinds it to a subgroup table,
# so confirm the intended ESRD pool is in place before running this.


def _ESRD_age_midpoint(ages):
    """Return the midpoint between a patient's first and last listed age."""
    return (ages.iloc[-1] + ages.iloc[0])/2


# One row per patient: patient_sk and the representative age.
datapool_ESRD_age = datapool_ESRD.groupby('patient_sk').agg({'Age': _ESRD_age_midpoint})
datapool_ESRD_age = datapool_ESRD_age.reset_index()


def _ESRD_patients_aged(lower, upper=None):
    """Return the patient_sk table of patients aged in [lower, upper).

    With ``upper`` left out the band has no upper limit.
    """
    in_band = datapool_ESRD_age.Age >= lower
    if upper is not None:
        in_band = in_band & (datapool_ESRD_age.Age < upper)
    return datapool_ESRD_age[in_band].drop('Age', axis=1)


# For every band: pick the patients, then pull their rows from the pool.
patient_ESRD_18_25 = _ESRD_patients_aged(18, 25)
datapool_ESRD_18_25 = pd.merge(datapool_ESRD, patient_ESRD_18_25, on=['patient_sk'], how='inner')

patient_ESRD_25_30 = _ESRD_patients_aged(25, 30)
datapool_ESRD_25_30 = pd.merge(datapool_ESRD, patient_ESRD_25_30, on=['patient_sk'], how='inner')

patient_ESRD_30_35 = _ESRD_patients_aged(30, 35)
datapool_ESRD_30_35 = pd.merge(datapool_ESRD, patient_ESRD_30_35, on=['patient_sk'], how='inner')

patient_ESRD_35_40 = _ESRD_patients_aged(35, 40)
datapool_ESRD_35_40 = pd.merge(datapool_ESRD, patient_ESRD_35_40, on=['patient_sk'], how='inner')

patient_ESRD_40_45 = _ESRD_patients_aged(40, 45)
datapool_ESRD_40_45 = pd.merge(datapool_ESRD, patient_ESRD_40_45, on=['patient_sk'], how='inner')

patient_ESRD_45_50 = _ESRD_patients_aged(45, 50)
datapool_ESRD_45_50 = pd.merge(datapool_ESRD, patient_ESRD_45_50, on=['patient_sk'], how='inner')

patient_ESRD_50_55 = _ESRD_patients_aged(50, 55)
datapool_ESRD_50_55 = pd.merge(datapool_ESRD, patient_ESRD_50_55, on=['patient_sk'], how='inner')

patient_ESRD_55_60 = _ESRD_patients_aged(55, 60)
datapool_ESRD_55_60 = pd.merge(datapool_ESRD, patient_ESRD_55_60, on=['patient_sk'], how='inner')

patient_ESRD_60_65 = _ESRD_patients_aged(60, 65)
datapool_ESRD_60_65 = pd.merge(datapool_ESRD, patient_ESRD_60_65, on=['patient_sk'], how='inner')

patient_ESRD_65_70 = _ESRD_patients_aged(65, 70)
datapool_ESRD_65_70 = pd.merge(datapool_ESRD, patient_ESRD_65_70, on=['patient_sk'], how='inner')

patient_ESRD_70_75 = _ESRD_patients_aged(70, 75)
datapool_ESRD_70_75 = pd.merge(datapool_ESRD, patient_ESRD_70_75, on=['patient_sk'], how='inner')

patient_ESRD_75_80 = _ESRD_patients_aged(75, 80)
datapool_ESRD_75_80 = pd.merge(datapool_ESRD, patient_ESRD_75_80, on=['patient_sk'], how='inner')

patient_ESRD_80_85 = _ESRD_patients_aged(80, 85)
datapool_ESRD_80_85 = pd.merge(datapool_ESRD, patient_ESRD_80_85, on=['patient_sk'], how='inner')

patient_ESRD_85_on = _ESRD_patients_aged(85)
datapool_ESRD_85_on = pd.merge(datapool_ESRD, patient_ESRD_85_on, on=['patient_sk'], how='inner')


# Age bands, control ("Normal") patients
#
# NOTE(review): datapool_control is whatever this name is bound to when the
# cell runs; the subgroup loop earlier in the file rebinds it to a subgroup
# table, so confirm the intended control pool is in place before running this.


def _control_age_midpoint(ages):
    """Return the midpoint between a patient's first and last listed age."""
    return (ages.iloc[-1] + ages.iloc[0])/2


# One row per patient: patient_sk and the representative age.
datapool_control_age = datapool_control.groupby('patient_sk').agg({'Age': _control_age_midpoint})
datapool_control_age = datapool_control_age.reset_index()


def _control_patients_aged(lower, upper=None):
    """Return the patient_sk table of patients aged in [lower, upper).

    With ``upper`` left out the band has no upper limit.
    """
    in_band = datapool_control_age.Age >= lower
    if upper is not None:
        in_band = in_band & (datapool_control_age.Age < upper)
    return datapool_control_age[in_band].drop('Age', axis=1)


# For every band: pick the patients, then pull their rows from the pool.
patient_control_18_25 = _control_patients_aged(18, 25)
datapool_control_18_25 = pd.merge(datapool_control, patient_control_18_25, on=['patient_sk'], how='inner')

patient_control_25_30 = _control_patients_aged(25, 30)
datapool_control_25_30 = pd.merge(datapool_control, patient_control_25_30, on=['patient_sk'], how='inner')

patient_control_30_35 = _control_patients_aged(30, 35)
datapool_control_30_35 = pd.merge(datapool_control, patient_control_30_35, on=['patient_sk'], how='inner')

patient_control_35_40 = _control_patients_aged(35, 40)
datapool_control_35_40 = pd.merge(datapool_control, patient_control_35_40, on=['patient_sk'], how='inner')

patient_control_40_45 = _control_patients_aged(40, 45)
datapool_control_40_45 = pd.merge(datapool_control, patient_control_40_45, on=['patient_sk'], how='inner')

patient_control_45_50 = _control_patients_aged(45, 50)
datapool_control_45_50 = pd.merge(datapool_control, patient_control_45_50, on=['patient_sk'], how='inner')

patient_control_50_55 = _control_patients_aged(50, 55)
datapool_control_50_55 = pd.merge(datapool_control, patient_control_50_55, on=['patient_sk'], how='inner')

patient_control_55_60 = _control_patients_aged(55, 60)
datapool_control_55_60 = pd.merge(datapool_control, patient_control_55_60, on=['patient_sk'], how='inner')

patient_control_60_65 = _control_patients_aged(60, 65)
datapool_control_60_65 = pd.merge(datapool_control, patient_control_60_65, on=['patient_sk'], how='inner')

patient_control_65_70 = _control_patients_aged(65, 70)
datapool_control_65_70 = pd.merge(datapool_control, patient_control_65_70, on=['patient_sk'], how='inner')

patient_control_70_75 = _control_patients_aged(70, 75)
datapool_control_70_75 = pd.merge(datapool_control, patient_control_70_75, on=['patient_sk'], how='inner')

patient_control_75_80 = _control_patients_aged(75, 80)
datapool_control_75_80 = pd.merge(datapool_control, patient_control_75_80, on=['patient_sk'], how='inner')

patient_control_80_85 = _control_patients_aged(80, 85)
datapool_control_80_85 = pd.merge(datapool_control, patient_control_80_85, on=['patient_sk'], how='inner')

patient_control_85_on = _control_patients_aged(85)
datapool_control_85_on = pd.merge(datapool_control, patient_control_85_on, on=['patient_sk'], how='inner')


# Each entry pairs the ESRD pool of an age band with its control pool; the
# order matches the labels in Sub_grouo_table below.
Sub_Groups = [
    [datapool_ESRD_18_25, datapool_control_18_25],
    [datapool_ESRD_25_30, datapool_control_25_30],
    [datapool_ESRD_30_35, datapool_control_30_35],
    [datapool_ESRD_35_40, datapool_control_35_40],
    [datapool_ESRD_40_45, datapool_control_40_45],
    [datapool_ESRD_45_50, datapool_control_45_50],
    [datapool_ESRD_50_55, datapool_control_50_55],
    [datapool_ESRD_55_60, datapool_control_55_60],
    [datapool_ESRD_60_65, datapool_control_60_65],
    [datapool_ESRD_65_70, datapool_control_65_70],
    [datapool_ESRD_70_75, datapool_control_70_75],
    [datapool_ESRD_75_80, datapool_control_75_80],
    [datapool_ESRD_80_85, datapool_control_80_85],
    [datapool_ESRD_85_on, datapool_control_85_on],
]

# Number of distinct patients in each ESRD / control age band.
number_in_groups_ESRD = []
number_in_groups_Normal = []
for esrd_pool, control_pool in Sub_Groups:
    number_in_groups_ESRD.append(len(esrd_pool.patient_sk.unique()))
    number_in_groups_Normal.append(len(control_pool.patient_sk.unique()))

# Summary table: one row per age band with its two patient counts.
Sub_grouo_table = pd.DataFrame({
    'Sub groups': [
        '18_25', '25_30', '30_35', '35_40', '40_45', '45_50', '50_55',
        '55_60', '60_65', '65_70', '70_75', '75_80', '80_85', '> 85',
    ],
    '# of ESRD subgroup': number_in_groups_ESRD,
    '# of Normal subgroup': number_in_groups_Normal,
})


# Fresh, empty accumulators for the per-age-band loop that follows.
Accuracy_list, Sensetivity_list, Specificity_list = [], [], []
time_to_event_ESRD_mean_list = []
time_to_event_ESRD_median_list = []
time_to_event_ESRD_serror_list = []
mu_list, sigma_list = [], []
time_to_event_dataset = []
n_list_normal = []
proportion_list = []

for datapool in Sub_Groups:
    
    datapool_ESRD = datapool[0]
    datapool_control = datapool[1]
    
    # Mu and sigma
    
    
    var_list = []
    n_list = []

    mu = np.mean(datapool_control['newly_calculated_eGFR_new'])

    var_list = datapool_control.groupby('patient_sk').agg({'newly_calculated_eGFR_new':'std'})
    var_list = list(var_list.newly_calculated_eGFR_new)

    n_list =  datapool_control.groupby('patient_sk').agg({'patient_sk':'count'})
    n_list = list(n_list.patient_sk)
    #calculating the mean and variance of the Normal sample

    n_1 = list((n_list - np.ones(len(n_list))).astype('int'))
    numerator = np.multiply(n_1, np.power(var_list, 2))
    denominator = sum(n_list) - len(n_list)
    sigma = np.power(sum(numerator)/denominator,0.5)
    
    mu_list.append(mu)
    sigma_list.append(sigma)
    
    #Hyperparametrs:

    V0 = 0
    w = 0.75
    T = -4
    a = 0.2

    ## Zi:

    datapool_control['Zi'] = (datapool_control.newly_calculated_eGFR_new - mu)/sigma
    datapool_ESRD['Zi'] = (datapool_ESRD.newly_calculated_eGFR_new - mu)/sigma

    ## AAANNNDDD let us start palying with Zi and Vi :) AND THE SLOPES AS WELL :)

    from numba import jit
    @jit(nopython=True)

    def Vi_creator(Zi, patient_sk):
        Vi = np.zeros(Zi.shape)
        Vi[0] = V0

        for i in range(1, Vi.shape[0]):
            if patient_sk[i] == patient_sk[i-1]:
                Vi[i] = (min(0.0, Zi[i] + w + Vi[i-1]))
            else:
                Vi[i] = V0

        return Vi

    # Run the CUSUM recursion over each cohort (control first, then ESRD) and
    # store the statistic next to the rows it was computed from.
    for cohort_frame in (datapool_control, datapool_ESRD):
        cohort_frame['Vi'] = Vi_creator(cohort_frame['Zi'].values, cohort_frame['patient_sk'].values)

    def _first_trigger_rows(cohort_frame):
        """Return the rows recorded on each patient's earliest date with Vi <= T.

        The result has columns patient_sk, Trigger_date,
        newly_calculated_eGFR_new, Date and New_label (always 1.0).
        Patients whose Vi never reaches T are absent.
        """
        earliest_hit = (
            cohort_frame[cohort_frame['Vi'] <= T]
            .groupby('patient_sk')
            .agg({'Date': 'min'})
            .reset_index()
            .rename(columns={'Date': 'Trigger_date'})
        )
        # Join back to the patient's measurements to pick up the eGFR value
        # observed on the trigger date itself.
        candidates = earliest_hit.merge(
            cohort_frame[['patient_sk', 'newly_calculated_eGFR_new', 'Date']],
            on=['patient_sk'],
            how='inner',
        )
        on_trigger_day = candidates[candidates.Trigger_date == candidates.Date]
        on_trigger_day['New_label'] = [1.0] * on_trigger_day.patient_sk.shape[0]
        return on_trigger_day

    # Trigger date and eGFR tables, one per cohort.
    patients_control_trigger = _first_trigger_rows(datapool_control)
    patients_ESRD_trigger = _first_trigger_rows(datapool_ESRD)

    # Labeling: start from one row per distinct patient (Label = 1 for every
    # patient in both cohorts), then left-join the trigger table so patients
    # that never triggered keep NaN in the trigger columns. Only one row per
    # patient is kept and the per-measurement Date column is dropped.
    control_ids = list(datapool_control.patient_sk.unique())
    patients_Normal_labeled = (
        pd.DataFrame({'patient_sk': control_ids, 'Label': [1.0] * len(control_ids)})
        .merge(patients_control_trigger, on='patient_sk', how='left')
        .drop_duplicates('patient_sk')
        .drop(columns='Date')
    )

    esrd_ids = list(datapool_ESRD.patient_sk.unique())
    patients_ESRD_labeled = (
        pd.DataFrame({'patient_sk': esrd_ids, 'Label': [1.0] * len(esrd_ids)})
        .merge(patients_ESRD_trigger, on='patient_sk', how='left')
        .drop_duplicates('patient_sk')
        .drop(columns='Date')
    )

    # Classification metrics for this subgroup.
    # A patient counts as "flagged" when New_label == 1; patients that never
    # triggered have NaN there and are counted as not flagged.

    # Flagged ESRD patients are correct detections ...
    numbet_of_ones_ESRD = int((patients_ESRD_labeled['New_label'] == 1).sum())
    # ... while flagged control patients are false alarms.
    numbet_of_ones_Normal = int((patients_Normal_labeled['New_label'] == 1).sum())

    total_ESRD = len(patients_ESRD_labeled)
    total_Normal = len(patients_Normal_labeled)

    # Confusion-matrix cells.
    tp = numbet_of_ones_ESRD
    fn = total_ESRD - tp
    fp = numbet_of_ones_Normal
    tn = total_Normal - fp

    # Accuracy = (true positives + true negatives) / total population
    Accuracy = (tp + tn)/(total_ESRD + total_Normal)
    # Sensitivity = share of ESRD patients that were flagged
    Sensetivity = tp/(tp+fn)
    # Specificity = share of control patients that were not flagged
    Specificity = tn/(tn+fp)

    for collected, value in (
        (Accuracy_list, Accuracy),
        (Sensetivity_list, Sensetivity),
        (Specificity_list, Specificity),
    ):
        collected.append(value)
    
    # Time from the CUSUM trigger to the recorded ESRD diagnosis, per patient.
    patients_ESRD_full_dates_pandas = pd.read_csv('Final_patients_ESRD_full_dates_pandas.csv')

    # The inner merge keeps only labeled patients that have a row in the
    # diagnosis-date file.
    new_table_dates = pd.DataFrame({'patient_sk' : patients_ESRD_full_dates_pandas['patient_sk'], 'Diagnosis_admission_date_ESRD' : patients_ESRD_full_dates_pandas['Diagnosis_admission_date_ESRD']})
    patients_ESRD_labeled = patients_ESRD_labeled.merge(new_table_dates, on = ['patient_sk'], how = 'inner')

    patients_ESRD_labeled['Diagnosis_admission_date_ESRD'] = pd.to_datetime(patients_ESRD_labeled['Diagnosis_admission_date_ESRD'], errors='coerce')

    # Positive values mean the trigger came before the diagnosis; patients that
    # never triggered have NaT here.
    lislis_ESRD = (patients_ESRD_labeled['Diagnosis_admission_date_ESRD'] - patients_ESRD_labeled['Trigger_date'])

    no_lead = pd.Timedelta(0)

    # Patients flagged on or before their diagnosis date (NaT compares False),
    # as a share of all distinct ESRD patients in this subgroup.
    count_lislis = int((lislis_ESRD >= no_lead).sum())
    proportion_list.append(count_lislis/len(datapool_ESRD.patient_sk.unique()))

    # Triggers that came after the diagnosis count as zero lead time. This is
    # done with a vectorized mask instead of the former per-row chained
    # assignment (`df[col][i] = ...`), which is not guaranteed to write through
    # to the frame and does nothing under pandas Copy-on-Write.
    lislis_ESRD = lislis_ESRD.mask(lislis_ESRD <= no_lead, no_lead)
    patients_ESRD_labeled['time_to_event_ESRD'] = lislis_ESRD

    # pandas reducers skip NaT consistently. np.median on the Series did not,
    # unlike the mean and std computed alongside it.
    time_to_event_ESRD_mean = lislis_ESRD.mean()
    time_to_event_ESRD_median = lislis_ESRD.median()
    # Sample standard deviation; it is turned into a standard error later,
    # when the summary table is assembled.
    time_to_event_ESRD_serror = lislis_ESRD.std(ddof=1)

    time_to_event_ESRD_mean_list.append(time_to_event_ESRD_mean)
    time_to_event_ESRD_median_list.append(time_to_event_ESRD_median)
    time_to_event_ESRD_serror_list.append(time_to_event_ESRD_serror)

    time_to_event_dataset.append(patients_ESRD_labeled)
    

# Fill the per-subgroup summary table from the lists collected in the loop.
Sub_grouo_table['# of detected / sub-total'] = proportion_list
Sub_grouo_table['Mu'] = mu_list
Sub_grouo_table['Sigma'] = sigma_list
Sub_grouo_table['Accuracy'] = Accuracy_list
Sub_grouo_table['Sensitivity'] = Sensetivity_list
Sub_grouo_table['Specificity'] = Specificity_list
Sub_grouo_table['Mean time to event (ESRD diagnosis)'] = time_to_event_ESRD_mean_list
Sub_grouo_table['Median time to event (ESRD diagnosis)'] = time_to_event_ESRD_median_list

# Standard error = sample std / sqrt(n), with n taken per subgroup as the
# number of non-missing time-to-event values in that subgroup's frame.
# Previously every subgroup was divided by sqrt(count_lislis), i.e. by the
# count left over from the final loop iteration only.
subgroup_sizes = [
    int(subgroup_frame['time_to_event_ESRD'].notna().sum())
    for subgroup_frame in time_to_event_dataset
]
Sub_grouo_table['Standard error of time to event (ESRD diagnosis)'] = [
    spread / np.sqrt(size) if size > 0 else pd.NaT
    for spread, size in zip(time_to_event_ESRD_serror_list, subgroup_sizes)
]




# Add a 'Total' row built from the whole-population frame `merged_dataset`.
# NOTE(review): mu, sigma, Accuracy, Sensetivity and Specificity carry whatever
# was assigned to them most recently; if this runs right after the subgroup
# loop, that is the last subgroup rather than the total -- confirm.
merged_dataset['time_to_event_ESRD'] = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')
new_row = {
    'Sub groups': 'Total',
    'Mu': mu,
    'Sigma': sigma,
    'Accuracy': Accuracy,
    'Sensitivity': Sensetivity,
    'Specificity': Specificity,
    'Mean time to event (ESRD diagnosis)': merged_dataset.time_to_event_ESRD.mean(),
    # Series.median skips NaT (patients without a trigger), matching the mean;
    # np.median on the Series does not.
    'Median time to event (ESRD diagnosis)': merged_dataset.time_to_event_ESRD.median(),
}
# DataFrame.append was removed in pandas 2.0; concat with a one-row frame is
# the equivalent (columns missing from new_row are filled with NaN).
Sub_grouo_table = pd.concat([Sub_grouo_table, pd.DataFrame([new_row])], ignore_index=True)


# Make sure every stored subgroup frame has a proper timedelta column.
for subgroup_frame in time_to_event_dataset:
    subgroup_frame['time_to_event_ESRD'] = pd.to_timedelta(subgroup_frame['time_to_event_ESRD'], errors='coerce')

# Strip the bookkeeping columns from each subgroup frame. The position of a
# frame in time_to_event_dataset is the order the subgroups were processed in.
_bookkeeping_columns = ['Label', 'Trigger_date', 'newly_calculated_eGFR_new', 'New_label', 'Diagnosis_admission_date_ESRD']
(
    time_to_event_dataset_Adults_under_65,
    time_to_event_dataset_Adults_above_65,
    time_to_event_dataset_Female,
    time_to_event_dataset_Male,
    time_to_event_dataset_African_American,
    time_to_event_dataset_Non_African_American,
) = [
    time_to_event_dataset[position].drop(columns=_bookkeeping_columns)
    for position in range(6)
]

# Display the summary table as the cell output.
Sub_grouo_table
#Sub_grouo_table.to_csv('Age_impact_new_formula.csv')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import datetime

# Histogram of how early (in months) the trigger precedes the ESRD diagnosis.
sns.set_style("whitegrid")
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})

yyy = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Keep only non-missing, non-negative lead times, converted from seconds to
# months (2.628e+6 s = 365/12 days).
# The former test `x >= 0 | pd.isnull(i) == False` was parsed as the chained
# comparison `x >= (0 | isnull) == False` because `|` binds tighter than
# comparisons; the filter is now written out explicitly.
y = [
    lead.total_seconds()/(2.628e+6)
    for lead in yyy
    if pd.notnull(lead) and lead.total_seconds() >= 0
]

#Plot Data
fig, ax = plt.subplots(figsize = (8,8))

sns.distplot(y, bins=100, color="darkslategray", ax=ax, kde=False)
ax.set(xlabel="Earliness, in months", ylabel = "Frequency")
# The limits are given reversed and the axis is then inverted once more.
ax.set(xlim=(250, 0))
ax.invert_xaxis()
plt.savefig('plot3_2022_new_cusum_anew.jpg', orientation="landscape",
           dpi=300)
plt.show()


In [None]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
# 2x2 grid of eGFR-over-time scatter plots for four hand-picked ESRD patients.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize= (15,10))

#time_origin = datetime.strptime('2020-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
# NOTE(review): time_origin is used below but its only assignment in this cell
# is commented out -- confirm it is defined earlier in the notebook.

patient_sks = [134769713, 194590220, 125108635, 121768243]

# axes.flat walks the grid row by row, pairing each panel with one patient.
for hi, sample_sk in zip(axes.flat, patient_sks):
    patient_rows = datapool_ESRD[datapool_ESRD['patient_sk'] == sample_sk]

    hi.scatter(
        patient_rows['Date'].values,
        patient_rows['newly_calculated_eGFR_new'].values,
        marker='o',
        color="steelblue",
    )
    hi.axvline(x=time_origin, color="white", lw=2)
    hi.set_ylabel('eGFR')
    hi.set_xlabel('Date')
    hi.set_ylim([0,120])
    # Title from the patient's first row: gender, race, age.
    hi.set_title('{}, {}, {},y.o'.format(
        patient_rows['Gender'].values[0],
        patient_rows['Race'].values[0],
        int(patient_rows['Age'].values[0]),
    ))
    hi.grid()

plt.savefig('sample_ESKD_patients.jpg', orientation="landscape",
           dpi=300)

In [None]:
# Build merged_dataset: labeled ESRD patients joined with their diagnosis date,
# plus the time between the CUSUM trigger and that diagnosis.
# NOTE(review): if patients_ESRD_labeled already carries a
# 'Diagnosis_admission_date_ESRD' column (the subgroup loop adds one), the merge
# below will suffix the duplicates and the column lookup will fail -- confirm
# which version of patients_ESRD_labeled this cell is meant to run against.
patients_ESRD_full_dates_pandas = pd.read_csv('Final_patients_ESRD_full_dates_pandas.csv')

new_table_dates = patients_ESRD_full_dates_pandas[['patient_sk', 'Diagnosis_admission_date_ESRD']].copy()

# Inner join: patients without a row in the dates file are dropped.
merged_dataset = patients_ESRD_labeled.merge(new_table_dates, on=['patient_sk'], how='inner')
merged_dataset['Diagnosis_admission_date_ESRD'] = pd.to_datetime(
    merged_dataset['Diagnosis_admission_date_ESRD'], errors='coerce'
)

# Positive when the trigger precedes the diagnosis.
lislis_ESRD = merged_dataset['Diagnosis_admission_date_ESRD'] - merged_dataset['Trigger_date']
merged_dataset['time_to_event_ESRD'] = lislis_ESRD

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import datetime
# Histogram of trigger-to-diagnosis lead time in months, all patients included
# (no filtering of negative or missing values).
sns.set_style("white")
yyy = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Seconds -> months (2.628e+6 s = 365/12 days).
y = [lead.total_seconds()/2.628e+6 for lead in yyy]

#Plot Data
fig, ax = plt.subplots(figsize = (15,8))
sns.distplot(y, bins=100, color="darkslategray", ax=ax, kde=False)
ax.set(
    xlabel="How early (in months) the risk trigger occurs, for 5410 patients",
    ylabel = "Population",
)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import datetime
# Lead-time distribution in months with a cumulative KDE overlay, all patients
# included (no filtering of negative or missing values).
sns.set_style("whitegrid")
yyy = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Seconds -> months (2.628e+6 s = 365/12 days).
y = [lead.total_seconds()/2.628e+6 for lead in yyy]

#Plot Data
fig, ax = plt.subplots(figsize = (15,8))
sns.distplot(
    y,
    bins=100,
    color="darkslategray",
    ax=ax,
    kde_kws = {'cumulative': True},
)
ax.set(
    xlabel="How early (in months) the risk trigger occurs, for 5410 patients - Cumulative plot ",
    ylabel = "",
)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import datetime

# Histogram of how early (in months) the trigger precedes the ESRD diagnosis.
sns.set_style("whitegrid")
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})

yyy = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Keep only non-missing, non-negative lead times, converted from seconds to
# months (2.628e+6 s = 365/12 days).
# The former test `x >= 0 | pd.isnull(i) == False` was parsed as the chained
# comparison `x >= (0 | isnull) == False` because `|` binds tighter than
# comparisons; the filter is now written out explicitly.
y = [
    lead.total_seconds()/(2.628e+6)
    for lead in yyy
    if pd.notnull(lead) and lead.total_seconds() >= 0
]

#Plot Data
fig, ax = plt.subplots(figsize = (8,8))

sns.distplot(y, bins=100, color="darkslategray", ax=ax, kde=False)
ax.set(xlabel="Earliness, in months", ylabel = "Frequency")
# The limits are given reversed and the axis is then inverted once more.
ax.set(xlim=(250, 0))
ax.invert_xaxis()
plt.savefig('plot3.jpg', orientation="landscape",
           dpi=300)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import datetime

# Histogram with KDE of how early (in months) the trigger precedes the ESRD
# diagnosis. Writes to the same 'plot3.jpg' file name as the frequency plot.
sns.set_style("whitegrid")
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})

yyy = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Keep only non-missing, non-negative lead times, converted from seconds to
# months (2.628e+6 s = 365/12 days).
# The former test `x >= 0 | pd.isnull(i) == False` was parsed as the chained
# comparison `x >= (0 | isnull) == False` because `|` binds tighter than
# comparisons; the filter is now written out explicitly.
y = [
    lead.total_seconds()/(2.628e+6)
    for lead in yyy
    if pd.notnull(lead) and lead.total_seconds() >= 0
]

#Plot Data
fig, ax = plt.subplots(figsize = (8,8))

sns.distplot(y, bins=100, color="darkslategray", ax=ax)
ax.set(xlabel="How early (in months) the risk trigger occurs", ylabel = "Probability")
# The limits are given reversed and the axis is then inverted once more.
ax.set(xlim=(250, 0))
ax.invert_xaxis()
plt.savefig('plot3.jpg', orientation="landscape",
           dpi=150)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import datetime
# Lead-time distribution in months with a cumulative KDE overlay, restricted to
# the 0-250 month window on x and 0.23-1 on y.
sns.set_style("whitegrid")
yyy = pd.to_timedelta(merged_dataset['time_to_event_ESRD'], errors='coerce')

# Seconds -> months (2.628e+6 s = 365/12 days); no values are filtered out.
y = [lead.total_seconds()/2.628e+6 for lead in yyy]

#Plot Data
fig, ax = plt.subplots(figsize = (8,8))
sns.distplot(
    y,
    bins=100,
    color="darkslategray",
    ax=ax,
    kde_kws = {'cumulative': True},
)
ax.set(
    xlabel="How early (in months) the risk trigger occurs, for 5410 patients - Cumulative plot ",
    ylabel = "",
)

ax.set(ylim=(0.23, 1))
ax.set(xlim=(0, 250))

plt.show()

### For further information please contact rzz5164@psu.edu