# Creating the "merged dataset" which contains patient IDs as well as qGFRv and trigger points data*

### * This code belongs to the paper "Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate"
##### To cite: Zafarnejad, R., Dumbauld, S., Dumbauld, D. et al. Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate. BMC Nephrol 23, 287 (2022). https://doi.org/10.1186/s12882-022-02910-8

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import time
import pandas as pd
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.window import Window
import socket    
hostname = socket.gethostname()    
IPAddr = socket.gethostbyname(hostname)  
from datetime import datetime, timedelta

# Spark-on-YARN configuration for this notebook session.
# (A bare `SparkConf()` with no overrides was the earlier alternative.)
spark_settings = [
    ("spark.executor.instances", '5'),
    ('spark.executor.memory', '8g'),
    ('spark.executor.cores', '5'),
    ('spark.driver.memory', '3g'),
    ('spark.sql.broadcastTimeout', '3000'),
]
conf = SparkConf()
conf.setAll(spark_settings)
conf.setMaster('yarn')
conf.setAppName('spark-yarn-2')
# The driver host is the address resolved from this machine's hostname (IPAddr).
# To hard-code it instead (e.g. '10.42.7.162'), open a terminal and run
# "cat /etc/hosts": the last line holds the host ip and the host name.
conf.set("spark.driver.host", IPAddr)

In [None]:
# Load the ESRD and control ("Normal") cohorts.
# The first CSV column is dropped in both files (presumably the index column
# written by an earlier `to_csv` -- verify against the code that produced them).
datapool_ESRD = pd.read_csv('Final_ESRD_group_done_pandas.csv')
datapool_ESRD = datapool_ESRD.drop(columns=datapool_ESRD.columns[0])
datapool_ESRD = datapool_ESRD.drop_duplicates()
datapool_control = pd.read_csv("Final_Normal_group_done_pandas.csv")
datapool_control = datapool_control.drop(columns=datapool_control.columns[0])

# Some patients have fewer than 9 data points AFTER dropping duplicates:
# keep only patients with at least 9 non-null eGFR_EPI values.
# The per-patient counts are computed once and reused for the filter.
esrd_counts = datapool_ESRD.groupby('patient_sk').agg({'eGFR_EPI': 'count'}).reset_index()
datapool_ESRD_dropped = esrd_counts.loc[esrd_counts['eGFR_EPI'] >= 9, ['patient_sk']]
datapool_ESRD = datapool_ESRD.merge(datapool_ESRD_dropped, on='patient_sk', how='inner')

# Control cohort only: discard rows whose eGFR is infinite, then exact duplicates.
datapool_control = datapool_control[~np.isinf(datapool_control['eGFR_EPI'])]
datapool_control = datapool_control.drop_duplicates()

# Same "at least 9 data points" rule for the control cohort.
control_counts = datapool_control.groupby('patient_sk').agg({'eGFR_EPI': 'count'}).reset_index()
datapool_control_dropped = control_counts.loc[control_counts['eGFR_EPI'] >= 9, ['patient_sk']]
datapool_control = datapool_control.merge(datapool_control_dropped, on='patient_sk', how='inner')

# Distinct patient ids per cohort.
# NOTE(review): an earlier comment here announced sorting the data by
# measurement date and re-indexing, but no sort is performed in this cell.
# The CUSUM code later in the file walks rows in their stored order, so the
# input CSVs presumably arrive already ordered per patient -- TODO confirm.
patients_list_Normal = list(np.unique(datapool_control['patient_sk']))
patients_list_ESRD = list(np.unique(datapool_ESRD['patient_sk']))

In [None]:
# Date_seconds = seconds elapsed since each patient's earliest 'Date'.
# The difference is a timedelta first and is converted with .dt.total_seconds()
# so that the numeric slope code can consume it.

# --- control cohort ---
datapool_control['Date'] = pd.to_datetime(datapool_control['Date'])
datapool_control_dates = datapool_control.groupby('patient_sk').agg({'Date': 'min'})
datapool_control_dates = datapool_control_dates.reset_index()
# Bring the per-patient first date in under its own name, so no
# Date_x / Date_y suffix columns have to be renamed and dropped afterwards.
datapool_control = datapool_control.merge(
    datapool_control_dates.rename(columns={'Date': 'First_date'}),
    on='patient_sk', how='left')
datapool_control['Date_seconds'] = (
    datapool_control['Date'] - datapool_control['First_date']).dt.total_seconds()
datapool_control = datapool_control.drop(columns='First_date')

# --- ESRD cohort (same steps) ---
datapool_ESRD['Date'] = pd.to_datetime(datapool_ESRD['Date'])
datapool_ESRD_dates = datapool_ESRD.groupby('patient_sk').agg({'Date': 'min'})
datapool_ESRD_dates = datapool_ESRD_dates.reset_index()
datapool_ESRD = datapool_ESRD.merge(
    datapool_ESRD_dates.rename(columns={'Date': 'First_date'}),
    on='patient_sk', how='left')
datapool_ESRD['Date_seconds'] = (
    datapool_ESRD['Date'] - datapool_ESRD['First_date']).dt.total_seconds()
datapool_ESRD = datapool_ESRD.drop(columns='First_date')

In [None]:
# Control cohort: drop every patient whose MINIMUM eGFR is below 60.

datapool_control_patients = datapool_control.groupby('patient_sk').agg({'eGFR_EPI': 'min'})
datapool_control_patients = datapool_control_patients.loc[
    datapool_control_patients['eGFR_EPI'] >= 60
].reset_index()

# Joining on the id column alone keeps the measurement table's own eGFR_EPI
# column, so there is no eGFR_EPI_x / eGFR_EPI_y pair to clean up.
datapool_control = datapool_control_patients[['patient_sk']].merge(
    datapool_control, on='patient_sk', how='inner')

# (The variable name says 50; the threshold applied above is 60.)
patients_list_control_above_50 = list(set(np.unique(datapool_control['patient_sk'].tolist())))
patients_list_Normal = patients_list_control_above_50


# ESRD cohort: drop every patient whose eGFR at the EARLIEST measurement date
# is below 60 (first-date value, not the minimum).

datapool_ESRD_patients = datapool_ESRD.groupby('patient_sk').agg({'Date': 'min'}).reset_index()

# Rows measured on each patient's first date; one row per patient is kept.
first_date_rows = datapool_ESRD.merge(
    datapool_ESRD_patients, on=['patient_sk', 'Date'], how='inner')
first_date_rows = first_date_rows.drop_duplicates('patient_sk')
datapool_ESRD_patients_eGFR = first_date_rows[first_date_rows['eGFR_EPI'] >= 60]

datapool_ESRD_new = datapool_ESRD.merge(
    datapool_ESRD_patients_eGFR['patient_sk'], on='patient_sk', how='inner')

datapool_ESRD = datapool_ESRD_new.drop_duplicates()
patients_list_ESRD = list(set(np.unique(datapool_ESRD['patient_sk'].tolist())))

In [None]:
# Number of distinct patients left in each cohort: control first, then ESRD.
for cohort in (datapool_control, datapool_ESRD):
    print(len(cohort['patient_sk'].unique()))

In [None]:
# Mu and sigma of the control ("Normal") sample.
#   mu    : mean of all control eGFR_EPI values.
#   sigma : pooled within-patient standard deviation,
#           sqrt( sum_i (n_i - 1) * s_i**2 / (sum_i n_i - k) )
#           where s_i is patient i's sample standard deviation, n_i the number
#           of eGFR values it was computed from and k the number of patients.

mu = np.mean(datapool_control['eGFR_EPI'])

# One pass per patient. 'count' is taken on eGFR_EPI (non-null values) so that
# n_i matches the observations that 'std' actually used.
per_patient = datapool_control.groupby('patient_sk')['eGFR_EPI'].agg(['std', 'count'])

# NOTE: despite its name, var_list holds standard deviations; they are squared below.
var_list = list(per_patient['std'])
n_list = list(per_patient['count'])

# Degrees of freedom per patient (n_i - 1).
n_1 = [n - 1 for n in n_list]
numerator = np.multiply(n_1, np.power(var_list, 2))
denominator = sum(n_list) - len(n_list)
sigma = np.power(sum(numerator)/denominator, 0.5)

print(mu, sigma)

In [None]:
# Hyperparameters (module-level; the numba functions in this file read them as globals):

V0 = 0     # value Vi takes on the first row of every patient
w = 0.75   # constant added to Zi in the Vi update
T = -4     # a row is a trigger when Vi <= T
a = 0.2    # weight of the newest slope in the smoothed slope


## Zi: eGFR standardised with the control-sample mu and sigma.

for pool in (datapool_control, datapool_ESRD):
    pool['Zi'] = ((pool['eGFR_EPI'] - mu) / sigma).tolist()

## Now compute Vi from Zi (the slopes follow further down).
from numba import jit


@jit(nopython=True)
def Vi_creator(Zi, patient_sk):
    """Compute the lower CUSUM statistic Vi over stacked per-patient rows.

    Rows are walked in the order given. A row whose ``patient_sk`` differs
    from the previous row's starts a new patient and resets the statistic,
    so each patient's rows must be contiguous (and presumably in
    chronological order -- TODO confirm the upstream ordering).

    Args:
        Zi: 1-D array of standardised eGFR values.
        patient_sk: 1-D array of patient ids, aligned with ``Zi``.

    Returns:
        Array shaped like ``Zi``. The first row of every patient is ``V0``;
        every other row is ``min(0, Zi[i] + w + Vi[i-1])``.

    Note:
        ``V0`` and ``w`` are module-level globals. Numba treats globals as
        compile-time constants, so changing them after the first call has no
        effect until the function is recompiled.
    """
    Vi = np.zeros(Zi.shape)
    # Nothing to fill for an empty cohort. Without this guard the write to
    # Vi[0] below would be out of bounds (numba does not bounds-check by default).
    if Vi.shape[0] == 0:
        return Vi

    Vi[0] = V0

    for i in range(1, Vi.shape[0]):
        if patient_sk[i] == patient_sk[i - 1]:
            # Same patient: accumulate, but never let the statistic go above 0.
            Vi[i] = min(0.0, Zi[i] + w + Vi[i - 1])
        else:
            # New patient: restart from the initial value.
            Vi[i] = V0

    return Vi

# Attach the Vi statistic to both cohorts (control first, then ESRD).
for pool in (datapool_control, datapool_ESRD):
    pool['Vi'] = Vi_creator(pool['Zi'].values, pool['patient_sk'].values)

### the algorithm

# Values both slope series take on the first row of every patient.
Smooth_slope_initial = 0.0
Inst_slope_initial = 0.0

@jit(nopython=True)
def Slope_creator(Vi, patient_sk, Date_seconds, eGFR_EPI):
    """Compute instantaneous and smoothed eGFR slopes (per day) row by row.

    Rows are walked in the order given; a change of ``patient_sk`` between
    consecutive rows starts a new patient and resets both slopes to their
    initial values. All slopes are capped at 0, so only declines are kept.

    Args:
        Vi: 1-D array of CUSUM values, aligned with the other arrays.
        patient_sk: 1-D array of patient ids.
        Date_seconds: 1-D array of measurement times in seconds; consecutive
            differences are divided by 86400 to express slopes per day.
        eGFR_EPI: 1-D array of eGFR values.

    Returns:
        ``[Inst_slope, Smooth_slope]``, two arrays shaped like ``Vi``.

        * ``Inst_slope[i]``: when the previous ``Vi`` is exactly 0 the drop is
          measured from the global ``mu``; otherwise from the previous eGFR.
        * ``Smooth_slope[i]``: ``(1 - a) * Smooth_slope[i-1] + a * slope``,
          where ``slope`` is the capped change from the previous eGFR.
        * When two consecutive rows share the same time, ``Inst_slope`` is 0
          and ``Smooth_slope`` carries its previous value forward.

    Note:
        ``mu``, ``a``, ``Inst_slope_initial`` and ``Smooth_slope_initial`` are
        module-level globals; Numba freezes them at compile time.
    """
    Inst_slope = np.zeros(Vi.shape)
    Smooth_slope = np.zeros(Vi.shape)

    # Empty cohort: return the empty arrays instead of writing index 0
    # out of bounds (numba does not bounds-check by default).
    if Vi.shape[0] == 0:
        return [Inst_slope, Smooth_slope]

    Inst_slope[0] = Inst_slope_initial
    Smooth_slope[0] = Smooth_slope_initial

    for i in range(1, Vi.shape[0]):
        if patient_sk[i] != patient_sk[i - 1]:
            # First row of a new patient.
            Inst_slope[i] = Inst_slope_initial
            Smooth_slope[i] = Smooth_slope_initial
            continue

        elapsed_seconds = Date_seconds[i] - Date_seconds[i - 1]
        if elapsed_seconds == 0:
            # Two measurements at the same time: no slope can be formed.
            Inst_slope[i] = 0.0
            Smooth_slope[i] = Smooth_slope[i - 1]
            continue

        elapsed_days = elapsed_seconds / 86400
        # Capped change versus the previous measurement, per day.
        step_slope = min(0.0, (eGFR_EPI[i] - eGFR_EPI[i - 1]) / elapsed_days)

        if Vi[i - 1] == 0.0:
            # CUSUM was at zero on the previous row: measure the drop from mu.
            Inst_slope[i] = min(0.0, (eGFR_EPI[i] - mu) / elapsed_days)
        else:
            Inst_slope[i] = step_slope

        Smooth_slope[i] = (1 - a) * Smooth_slope[i - 1] + a * step_slope

    return [Inst_slope, Smooth_slope]

# Attach both slope series to each cohort (control first, then ESRD).
for pool in (datapool_control, datapool_ESRD):
    pool['Inst_slope'], pool['Smooth_slope'] = Slope_creator(
        pool['Vi'].values,
        pool['patient_sk'].values,
        pool['Date_seconds'].values,
        pool['eGFR_EPI'].values,
    )

# Build the trigger tables: for every patient with at least one row where
# Vi <= T, take the earliest such Date (Trigger_date) and keep the eGFR and
# slope values of the rows measured on that date. New_label is 1 for all of them.

trigger_source_cols = ['patient_sk', 'eGFR_EPI', 'Date', 'Inst_slope', 'Smooth_slope']

# --- control cohort ---
control_first_hit = (
    datapool_control.loc[datapool_control['Vi'] <= T]
    .groupby('patient_sk')
    .agg({'Date': 'min'})
    .reset_index()
)
control_candidates = control_first_hit.merge(
    datapool_control[trigger_source_cols], on=['patient_sk'], how='inner')
control_candidates = control_candidates.rename(
    columns={'Date_x': 'Trigger_date', 'Date_y': 'Date'})
patients_control_trigger = control_candidates[
    control_candidates['Trigger_date'] == control_candidates['Date']]
patients_control_trigger = patients_control_trigger.assign(
    New_label=np.ones(patients_control_trigger.shape[0]).tolist())

# --- ESRD cohort (same steps) ---
esrd_first_hit = (
    datapool_ESRD.loc[datapool_ESRD['Vi'] <= T]
    .groupby('patient_sk')
    .agg({'Date': 'min'})
    .reset_index()
)
esrd_candidates = esrd_first_hit.merge(
    datapool_ESRD[trigger_source_cols], on=['patient_sk'], how='inner')
esrd_candidates = esrd_candidates.rename(
    columns={'Date_x': 'Trigger_date', 'Date_y': 'Date'})
patients_ESRD_trigger = esrd_candidates[
    esrd_candidates['Trigger_date'] == esrd_candidates['Date']]
patients_ESRD_trigger = patients_ESRD_trigger.assign(
    New_label=np.ones(patients_ESRD_trigger.shape[0]).tolist())


# Labeling and finishing :)
# One row per patient: id, Label, and (when the patient triggered) the trigger
# columns; patients without a trigger get NaN in those columns.
# NOTE(review): 'Label' is set to 1 for BOTH cohorts here -- confirm this is
# intended; the metrics further down only read 'New_label'.

control_ids = list(datapool_control['patient_sk'].unique())
patients_Normal_labeled = pd.DataFrame({
    'patient_sk': control_ids,
    'Label': list(np.ones(len(control_ids))),
})
patients_Normal_labeled = (
    patients_Normal_labeled
    .merge(patients_control_trigger, on='patient_sk', how='left')
    .drop_duplicates('patient_sk')
    .drop(columns='Date')
)

esrd_ids = list(datapool_ESRD['patient_sk'].unique())
patients_ESRD_labeled = pd.DataFrame({
    'patient_sk': esrd_ids,
    'Label': list(np.ones(len(esrd_ids))),
})
patients_ESRD_labeled = (
    patients_ESRD_labeled
    .merge(patients_ESRD_trigger, on='patient_sk', how='left')
    .drop_duplicates('patient_sk')
    .drop(columns='Date')
)

# Accuracy = true (positive and negative) / total population.
# A patient counts as detected when New_label == 1; a NaN New_label (no
# trigger) counts as not detected in both cohorts.

# Correct detections: ESRD patients with a trigger.
numbet_of_ones_ESRD = int((patients_ESRD_labeled['New_label'] == 1).sum())

# Wrong detections: Normal patients with a trigger.
numbet_of_ones_Normal = int((patients_Normal_labeled['New_label'] == 1).sum())

total_ESRD = len(patients_ESRD_labeled)
total_Normal = len(patients_Normal_labeled)

# Confusion-matrix cells.
tp = numbet_of_ones_ESRD
fn = total_ESRD - tp
fp = numbet_of_ones_Normal
tn = total_Normal - fp

# Accuracy
Accuracy = (tp + tn)/(total_ESRD + total_Normal)

# Sensitivity
Sensetivity = tp/(tp + fn)

# Specificity
Specificity = tn/(tn + fp)


In [None]:
# Show the accuracy as the cell output.
Accuracy

In [None]:
# Show the sensitivity (tp / (tp + fn)) as the cell output.
Sensetivity

In [None]:
# Show the specificity (tn / (tn + fp)) as the cell output.
Specificity

## Now, creating the MERGED DATASET

In [None]:
# Per-patient ESRD date table. The first CSV column is discarded (presumably a
# saved index -- verify). The Diagnosis_admission_date_* columns used in the
# following cells are expected to come from this file.
esrd_dates_raw = pd.read_csv('Final_patients_ESRD_full_dates_pandas.csv')
patients_ESRD_full_dates_pandas = esrd_dates_raw.drop(columns=esrd_dates_raw.columns[0])

In [None]:
# Join the labelled ESRD patients with their diagnosis dates, then parse every
# date column; values that cannot be parsed become NaT (errors='coerce').
merged_dataset = patients_ESRD_labeled.merge(
    patients_ESRD_full_dates_pandas, on='patient_sk', how='inner')

for date_col in (
    'Trigger_date',
    'Diagnosis_admission_date_ESRD',
    'Diagnosis_admission_date_dialysis',
    'Diagnosis_admission_date_transplant',
):
    merged_dataset[date_col] = pd.to_datetime(merged_dataset[date_col], errors='coerce')

In [None]:
# Time from the CUSUM trigger to each clinical event (event date - trigger date).
# Positive means the trigger came before the event.
trigger_dates = merged_dataset['Trigger_date']
lislis_ESRD = merged_dataset['Diagnosis_admission_date_ESRD'].sub(trigger_dates)
lislis_dialysis = merged_dataset['Diagnosis_admission_date_dialysis'].sub(trigger_dates)
lislis_transplant = merged_dataset['Diagnosis_admission_date_transplant'].sub(trigger_dates)

In [None]:
# Store the trigger-to-event intervals on the merged table.
# For ESRD only, intervals that are zero or negative (trigger on or after the
# diagnosis date) are clamped to a zero timedelta; missing intervals (NaT)
# stay missing because `NaT <= 0` is False.
#
# This is done with one vectorised `mask` instead of a row loop because:
#   * `datetime` in this file is the class imported via
#     `from datetime import datetime, timedelta`, so `datetime.timedelta`
#     raises AttributeError;
#   * `merged_dataset[col][i] = value` is chained assignment, which pandas
#     does not guarantee to write back into the DataFrame.
zero_interval = pd.Timedelta(0)
merged_dataset['time_to_event_ESRD'] = lislis_ESRD.mask(lislis_ESRD <= zero_interval, zero_interval)
merged_dataset['time_to_event_dialysis'] = lislis_dialysis
merged_dataset['time_to_event_transplant'] = lislis_transplant

In [None]:
# Persist the merged table (labels, trigger data, diagnosis dates and the
# time_to_event_* columns) to CSV in the working directory.
merged_dataset.to_csv('merged_dataset_dates_timedeltas_full.csv')

In [None]:
# Show the merged table as the cell output.
merged_dataset

### For further information please contact rzz5164@psu.edu