# Extracting diagnosis data, lab findings and medication history for the ESKD group*

### * This code belongs to the paper "Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate"
##### To cite: Zafarnejad, R., Dumbauld, S., Dumbauld, D. et al. Using CUSUM in real time to signal clinically relevant decreases in estimated glomerular filtration rate. BMC Nephrol 23, 287 (2022). https://doi.org/10.1186/s12882-022-02910-8

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import time
import pandas as pd
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.window import Window
import socket    
# Look up this machine's IP address; it is passed to Spark as the driver host.
hostname = socket.gethostname()
IPAddr = socket.gethostbyname(hostname)

# Spark-on-YARN configuration: executor/driver sizing, broadcast timeout,
# master, application name and driver host.
# If the automatic lookup yields the wrong address, replace IPAddr with the
# host IP by hand ("cat /etc/hosts" in a terminal; the last line holds the
# host IP and host name).
conf = (
    SparkConf()
    .setAll([
        ("spark.executor.instances", '5'),
        ('spark.executor.memory', '8g'),
        ('spark.executor.cores', '5'),
        ('spark.driver.memory', '3g'),
        ('spark.sql.broadcastTimeout', '3000'),
    ])
    .setMaster('yarn')
    .setAppName('spark-yarn-2')
    .set("spark.driver.host", IPAddr)
)

In [None]:
# Create (or reuse) a Hive-enabled Spark session configured from `conf`.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Pull the study population from the Cerner database.
#
# One row per lab result: patient key, draw date, race, gender, age at the
# encounter and the numeric result (aliased sCr_level). Rows are restricted to
# lab procedure id '13.0', results in [0, 1000], encounters with age >= 18 and
# a non-null draw date.
# NOTE(review): procedure id '13.0' is presumably the serum creatinine test in
# this database -- confirm against cerner.orc_hf_d_lab_procedure.
data_pool = spark.sql("select P.patient_sk, L.lab_drawn_dt_tm as Date, P.race as Race, P.gender as Gender, E.age_in_years as Age, L.numeric_result as sCr_level\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure L on L.encounter_id = E.encounter_id\
                                      where L.detail_lab_procedure_id ='13.0' and L.numeric_result >= 0 and L.numeric_result <= 1000\
                                      and L.numeric_result is not null\
                                      and E.age_in_years >= '18'\
                                      and L.lab_drawn_dt_tm is not null")

# cache() is just persist() with the default storage level, so a single call is
# enough; the original called both back to back.
data_pool.persist()

#---------------------------------------------------------------------------------------------------------------------

# eGFR from serum creatinine, computed on the `data_pool` Spark DataFrame as
#
#   eGFR = 141 x min(SCr/k, 1)^alpha x max(SCr/k, 1)^-1.209 x 0.993^Age
#          x 1.018 [Gender == 'Female'] x 1.159 [Race == 'Black']
#
# with k / alpha chosen by gender. Any Gender value other than 'Female'
# (including null) takes the male constants.

k_female = 0.7
k_male = 0.9
alpha_male = -0.411
alpha_female = -0.329
alpha_fixed = -1.209
age_factor = 0.993

is_female = f.col('Gender') == 'Female'
is_black = f.col('Race') == 'Black'

# Readings of 60 or more are rescaled by 0.01132; smaller ones are kept as is.
# NOTE(review): presumably a umol/L -> mg/dL conversion (1/88.4) -- confirm.
data_pool = data_pool.withColumn(
    "new_sCr",
    f.when(f.col('sCr_level') >= f.lit(60), f.col('sCr_level') * f.lit(0.01132))
    .otherwise(f.col('sCr_level')),
)

scr = f.col('new_sCr')

# min(SCr/k, 1) and min(SCr/k, 1)^alpha
min_ratio = (
    f.when(is_female, f.least(scr / k_female, f.lit(1)))
    .otherwise(f.least(scr / k_male, f.lit(1)))
)
min_ratio_pow = (
    f.when(is_female, f.pow(min_ratio, f.lit(alpha_female)))
    .otherwise(f.pow(min_ratio, f.lit(alpha_male)))
)

# max(SCr/k, 1) and max(SCr/k, 1)^-1.209
max_ratio = (
    f.when(is_female, f.greatest(scr / k_female, f.lit(1)))
    .otherwise(f.greatest(scr / k_male, f.lit(1)))
)
max_ratio_pow = f.pow(max_ratio, f.lit(alpha_fixed))

# 0.993^Age
age_pow = f.pow(f.lit(age_factor), f.col('Age'))

# The factors are multiplied in the same left-to-right order as before so the
# floating-point result is unchanged.
egfr_unadjusted = ((141 * min_ratio_pow) * max_ratio_pow) * age_pow
egfr_sex_adjusted = f.when(is_female, egfr_unadjusted * 1.018).otherwise(egfr_unadjusted)
egfr_final = f.when(is_black, egfr_sex_adjusted * 1.159).otherwise(egfr_sex_adjusted)

# Only new_sCr and eGFR_EPI are kept; the raw sCr_level column is removed.
data_pool = data_pool.withColumn("eGFR_EPI", egfr_final).drop("sCr_level")

#---------------------------------------------------------------------------------------------------------------------

# Label each reading with its eGFR bin. Values of 500 or more (and null eGFR)
# get a null label.
egfr_value = f.col('eGFR_EPI')

# (lower bound inclusive, upper bound exclusive, label) for every bin above
# the first one.
upper_bins = [
    (15, 30, "[15, 30)-Stage4"),
    (30, 45, "[30,45)-Stage3B"),
    (45, 60, "[45, 60)-Stage3A"),
    (60, 90, "[60, 90)-Stage2"),
    (90, 500, "[90, ..)-Normal"),
]

bin_label = f.when(egfr_value < 15, "[0, 15)-Stage5")
for lower, upper, label in upper_bins:
    bin_label = bin_label.when((egfr_value >= lower) & (egfr_value < upper), label)

data_pool = data_pool.withColumn("eGFR_bin", bin_label.otherwise(None))

#---------------------------------------------------------------------------------------------------------------------

# --- Criterion 1: nine or more readings with eGFR in [15, 90) -----------------
#
# The labels below must match the ones written to `eGFR_bin` exactly. The
# previous filters tested for "[45,60)-Stage3B", a label that is never assigned
# (the 45-60 bin is "[45, 60)-Stage3A"), so stage 3A readings were silently
# left out of both the count and the duration check. They also listed Stage4
# twice.
ckd_range_bins = [
    "[15, 30)-Stage4",
    "[30,45)-Stage3B",
    "[45, 60)-Stage3A",
    "[60, 90)-Stage2",
]
in_ckd_range = f.col('eGFR_bin').isin(ckd_range_bins)

# The count gets an explicit alias instead of relying on Spark's
# auto-generated "count(CASE WHEN ...)" column name.
patients_9more = (
    data_pool.groupBy(f.col('patient_sk'))
    .agg(f.count(f.when(in_ckd_range, True)).alias('n_ckd_range'))
    .filter(f.col('n_ckd_range') >= 9)
    .drop('n_ckd_range')
)

#---------------------------------------------------------------------------------------------------------------------

data_pool_9more = data_pool.join(patients_9more, on = ['patient_sk'] , how = 'inner')
data_pool_9more.cache()

#---------------------------------------------------------------------------------------------------------------------

# --- Criterion 2: readings span more than 90 days (not acute kidney disease) --

data_test = data_pool_9more.filter(in_ckd_range)

patients_max_date = data_test.groupBy(f.col('patient_sk')).agg(f.max(f.col('Date')))
patients_min_date = data_test.groupBy(f.col('patient_sk')).agg(f.min(f.col('Date')))

# Date pattern: four-digit year, month, day, 24-hour hour, minute, second.
# The previous value "YY-mm-dd HH:MM:SS" used week-year / minute / month /
# fraction letters in the wrong places. The pattern only matters when `Date`
# is stored as a string; for timestamp columns it is ignored.
timeFmt = "yyyy-MM-dd HH:mm:ss"

# Seconds between each patient's first and last in-range reading.
patients_not_acute = patients_max_date.join(patients_min_date, on = ['patient_sk']).withColumn(
    "Duration",
    f.unix_timestamp('max(Date)', format=timeFmt) - f.unix_timestamp('min(Date)', format=timeFmt),
)
# 7776000 s = 90 days.
patients_not_acute = patients_not_acute.filter(f.col('Duration') > 7776000)

#---------------------------------------------------------------------------------------------------------------------

# Only patient_sk and Duration are kept.
patients_not_acute = patients_not_acute.drop(f.col('max(Date)'))
patients_not_acute = patients_not_acute.drop(f.col('min(Date)'))

# All readings of patients whose in-range readings span more than 90 days.
data_pool_9more_chronic = data_pool_9more.join(patients_not_acute.select(patients_not_acute['patient_sk']), on = ['patient_sk'] , how = 'inner')
data_pool_9more_chronic.cache()

data_pool_9more_chronic_sorted = data_pool_9more_chronic.orderBy('patient_sk', 'Date')

#---------------------------------------------------------------------------------------------------------------------

# Criterion 3: at least three readings in the "[60, 90)-Stage2" bin.
patients_60to90 = data_pool_9more_chronic_sorted.groupBy(f.col('patient_sk')).agg(
    f.count(f.when(f.col('eGFR_bin') == "[60, 90)-Stage2", True)).alias('count')
)
patients_60to90 = patients_60to90.filter(patients_60to90['count'] >= 3)
# DataFrames are immutable: drop() returns a new frame, so the result has to be
# re-bound (the original discarded it, which left the column in place).
patients_60to90 = patients_60to90.drop('count')

# Keep every reading of the qualifying patients.
# NOTE(review): a join does not guarantee that the earlier orderBy is
# preserved; re-sort downstream if row order matters.
data_pool_9more_chronic_sorted_60to90 = data_pool_9more_chronic_sorted.join(patients_60to90.select('patient_sk'), on=['patient_sk'], how='inner')


#---------------------------------------------------------------------------------------------------------------------

# ESRD patients (ICD-9 + ICD-10).
# Distinct patient keys with a diagnosis code of '585.6' or 'N18.6' on an
# encounter where the patient was at least 18. The pre-dedup frame is cached;
# the name is bound to the de-duplicated result.
Patients_diagnosed_ESRD = (
    spark.sql("select distinct P.patient_sk\
                          from cerner.orc_hf_d_diagnosis Dd \
                          join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                          join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                          join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                          where Dd.diagnosis_code in ('585.6', 'N18.6')\
                          and E.age_in_years >= '18'")
    .cache()
    .dropDuplicates()
)

#---------------------------------------------------------------------------------------------------------------------

# Patients with ANY kidney-related diagnosis (ICD-9 + ICD-10): CKD at any
# stage, transplant, or kidney-related death (nephritis, nephrotic syndrome,
# nephrosis). Used later to exclude such patients from the Normal group.
# The codes are matched exactly (SQL `in`), not by prefix.
# NOTE(review): three-character entries such as 'N18' will not match stored
# sub-codes like 'N18.3x' unless listed separately -- confirm this is intended.
Patients_diagnosed_any = (
    spark.sql("select distinct P.patient_sk\
                          from cerner.orc_hf_d_diagnosis Dd \
                          join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                          join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                          join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                          where Dd.diagnosis_code in ('585.6', '585.5', 'V42.0', 'N18.6', 'Z94.0', 'N18.1', 'N18.2', 'N18.3', 'N18.4', 'N18.5', 'N00', 'N01', 'N02', 'N03', 'N04', 'N05','N06','N07', 'N17', 'N18', 'N19', 'N25', 'N26', 'N27')\
                          and E.age_in_years >= '18'")
    .cache()
    .dropDuplicates()
)

#---------------------------------------------------------------------------------------------------------------------

# Split the filtered cohort into the two study groups.

# Normal group: patients with NO kidney-related diagnosis (anti join).
Normal_group_done = data_pool_9more_chronic_sorted_60to90.join(
    Patients_diagnosed_any.select('patient_sk'),
    on = ['patient_sk'],
    how="leftanti",
).cache()

# ESRD group: patients WITH an ESRD diagnosis (inner join).
ESRD_group_done = data_pool_9more_chronic_sorted_60to90.join(
    Patients_diagnosed_ESRD.select('patient_sk'),
    on = ['patient_sk'],
    how="inner",
).cache()

#---------------------------------------------------------------------------------------------------------------------

# ESRD group: keep only patients whose FIRST reading (earliest Date) has an
# eGFR of at least 60. (The earlier comment said "below 20"; the code has
# always tested >= 60.)

patients_ESRD_eGFR_min_date = ESRD_group_done.groupBy('patient_sk').agg(f.min(f.column('Date')).alias('min_eGFR_date'))
ESRD_group_done = ESRD_group_done.join(patients_ESRD_eGFR_min_date, on=['patient_sk'], how='left')

# first_eGFR holds the eGFR on the patient's earliest date and the sentinel
# 10000 on every other row; the "< 1000" filter then discards the sentinel
# rows (and any first reading of 1000 or more).
ESRD_group_done_first_eGFR = ESRD_group_done.withColumn('first_eGFR', f.when((f.col('min_eGFR_date') == f.col('Date')) , f.col('eGFR_EPI')).otherwise(f.lit(10000)))
patients_ESRD_group_done_above_60 =  ESRD_group_done_first_eGFR.filter(f.col('first_eGFR')<1000)
patients_ESRD_group_done_above_60 = patients_ESRD_group_done_above_60.filter(f.col('first_eGFR')>=60)
# distinct(): a patient can have several rows on the earliest date; joining on
# a repeated key would duplicate every one of that patient's readings.
ESRD_group_done_above_60 = ESRD_group_done.join(patients_ESRD_group_done_above_60.select('patient_sk').distinct(), on=['patient_sk'], how="inner")

# Normal group: keep only patients whose LOWEST eGFR over all readings is at
# least 60. (The earlier comment said "above 50"; the threshold is 60.)

patients_Normal_group_done_above_60 =  Normal_group_done.groupBy('patient_sk').agg(f.min(f.column('eGFR_EPI')).alias('min_eGFR'))
patients_Normal_group_done_above_60 = patients_Normal_group_done_above_60.filter(f.col('min_eGFR')>=60)
Normal_group_done_above_60 = Normal_group_done.join(patients_Normal_group_done_above_60.select('patient_sk'), on=['patient_sk'], how="inner")

#---------------------------------------------------------------------------------------------------------------------

# Remove Normal-group patients with densely packed readings: a patient is kept
# only if the smallest gap between consecutive readings exceeds 86400 seconds
# (one day).

by_patient_in_time = Window.partitionBy('patient_sk').orderBy('Date')
patients_min_date = Normal_group_done_above_60.groupBy('patient_sk').agg(f.min('Date').alias('mindate'))

# Attach to each reading the previous reading's date and the patient's first date.
readings_with_gap = (
    Normal_group_done_above_60
    .withColumn('lag', f.lag('Date').over(by_patient_in_time))
    .join(patients_min_date, on = ['patient_sk'], how='left')
)

# Readings taken on the patient's first date have no usable predecessor, so
# their gap is left null and ignored by the min() below.
readings_with_gap = readings_with_gap.withColumn(
    'lag_set',
    f.when(f.col('Date') > f.col('mindate'), f.col('lag')).otherwise(None),
)
readings_with_gap = readings_with_gap.withColumn(
    "Duration",
    f.abs(f.unix_timestamp('Date', format=timeFmt) - f.unix_timestamp('lag_set',format=timeFmt)),
)

patient_Normal_group_done_sparse = (
    readings_with_gap.groupBy('patient_sk')
    .agg(f.min(f.col('Duration')).alias('minn'))
    .filter(f.col('minn')>86400)
    .drop('minn')
)

# All readings of the patients that passed the gap check.
Normal_group_done_sparse_above_60 = Normal_group_done_above_60.join(
    patient_Normal_group_done_sparse.select('patient_sk'),
    on = ['patient_sk'],
    how = 'inner',
)

#---------------------------------------------------------------------------------------------------------------------
## Reconciliation against reference patient lists. To rerun the extraction
## from scratch you may want to delete this part: it only aligns the two
## groups with the lists in 'dropped_normal_group.csv' and 'kept_ESRD_group.csv'.

# Normal group: remove every patient listed in dropped_normal_group.csv.
dropped_normal_patients = [str(i) for i in pd.read_csv('dropped_normal_group.csv').patient_sk]
patients_Normal_group_done = Normal_group_done_sparse_above_60.where(f.col("patient_sk").isin(dropped_normal_patients))
Normal_group_done_sparse_above_60_droped = Normal_group_done_sparse_above_60.join(patients_Normal_group_done.select('patient_sk'), on = ['patient_sk'] , how = 'leftanti')


# ESRD group: keep only the patients listed in kept_ESRD_group.csv.
kept_ESRD_group = [str(i) for i in pd.read_csv('kept_ESRD_group.csv').patient_sk]
patients_ESRD_group_done = ESRD_group_done_above_60.where(f.col("patient_sk").isin(kept_ESRD_group))
# A semi join keeps the left-hand rows whose patient appears in the kept list,
# without duplicating them. The previous how='left' kept every patient (so the
# list had no effect) and multiplied each kept patient's rows.
ESRD_group_done_above_60_droped = ESRD_group_done_above_60.join(patients_ESRD_group_done.select('patient_sk'), on = ['patient_sk'] , how = 'leftsemi')

#---------------------------------------------------------------------------------------------------------------------

# Final groups, with exact duplicate rows removed.
ESRD_group_done = ESRD_group_done_above_60_droped.dropDuplicates()
Normal_group_done = Normal_group_done_sparse_above_60_droped.dropDuplicates()

## Disorders and diseases

In [None]:
# Nicotine / tobacco dependency.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis codes (exact values and prefix patterns) in the query.
all_patients_Nicotine_Tobacco_dependency = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Nicotine_Tobacco_dependency\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code in ('V15.82', 'Z87.891', '305.1', 'Z72.0') or Dd.diagnosis_code like 'O99.33%' or Dd.diagnosis_code like 'F17%' or Dd.diagnosis_code like '649.0%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Hypertension.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis codes (exact values and prefix patterns) in the query.
all_patients_hypertension = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Hypertension\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code in ('401.0','401.1','401.9', '642.11', '642.14') or Dd.diagnosis_code like '402%' or Dd.diagnosis_code like '403%' or Dd.diagnosis_code like '404.0%'\
                                         or Dd.diagnosis_code like '404.1%' or Dd.diagnosis_code like '404.9%' or Dd.diagnosis_code like '405%' or Dd.diagnosis_code like '642.7%'\
                                         or Dd.diagnosis_code like 'I10%' or Dd.diagnosis_code like 'I11%' or Dd.diagnosis_code like 'I12%' or Dd.diagnosis_code like 'I13%' or Dd.diagnosis_code like 'I15%' or Dd.diagnosis_code like 'I16%'\
                                         or Dd.diagnosis_code like 'O11%' or Dd.diagnosis_code like 'O10%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Diabetes mellitus.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis-code prefixes in the query.
all_patients_Diabetes = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Diabetes\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code like '250%%' or Dd.diagnosis_code like '249%%'\
                                         or Dd.diagnosis_code like 'E08%' or Dd.diagnosis_code like 'E09%' or Dd.diagnosis_code like 'E10%' or Dd.diagnosis_code like 'E11%' or Dd.diagnosis_code like 'E12%' or Dd.diagnosis_code like 'E13%'\
                                         or Dd.diagnosis_code like 'O24.0%' or Dd.diagnosis_code like 'O24.1%' or Dd.diagnosis_code like 'O24.3%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Coronary artery disease.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis codes (exact values and prefix patterns) in the query.
#
# Two fixes to the code list:
#   * '125.9' (digit one) is now 'I25.9' (letter I), in line with the other
#     I25.x entries.
#   * "'414.06,' '414.05'" had the comma inside the first literal, so the two
#     adjacent literals were read as the single value '414.06,414.05' and
#     neither 414.06 nor 414.05 was ever matched.
all_patients_Coronary_Artery_Disease = spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Coronary_Artery_Disease\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code in ('I25.2', 'I25.5',  'I25.6', 'I25.9', '414.01', '414.03', '414.8', '414.9', '414.3', '414.4', '412', '414.02', '414.06', '414.05', '414.04', '414.00')\
                                         or Dd.diagnosis_code like 'I25.1%' or Dd.diagnosis_code like 'I25.7%' or Dd.diagnosis_code like 'I25.8%'\
                                         group by P.patient_sk")

all_patients_Coronary_Artery_Disease.persist()
all_patients_Coronary_Artery_Disease = all_patients_Coronary_Artery_Disease.dropDuplicates()

In [None]:
# Cerebrovascular disease.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis codes (exact values and prefix patterns) in the query.
all_patients_Cerebrovascular_Disease = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Cerebrovascular_Disease\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code in ('I63.9', 'I67.2', 'I67.4', 'I67.5', 'I67.9', '437.2', '437.5', '436', '437.9', '437.0', '437.1', '437.8')\
                                         or Dd.diagnosis_code like 'I63.0%%' or Dd.diagnosis_code like 'I63.2%%' or Dd.diagnosis_code like 'I63.3%%' or Dd.diagnosis_code like 'I63.5%%' or Dd.diagnosis_code like 'I63.8%%' or Dd.diagnosis_code like 'I65%%' or Dd.diagnosis_code like 'I66%%' or Dd.diagnosis_code like 'I67.8%%' or Dd.diagnosis_code like '433%%' or Dd.diagnosis_code like '434.0%%' or Dd.diagnosis_code like '434.9%%' \
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Peripheral vascular disease.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis-code prefixes in the query.
all_patients_Peripheral_Vascular_Disease = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Peripheral_Vascular_Disease\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code like 'I70%%' or Dd.diagnosis_code like '440%' or Dd.diagnosis_code like '443.8%'  or Dd.diagnosis_code like '443.9%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Sickle cell trait.
# One row per patient: the earliest admission date among encounters whose
# diagnosis code starts with '282.6' or 'D57'.
# NOTE(review): these prefixes presumably cover sickle-cell disease as well as
# trait -- confirm the intended scope.
all_patients_Sickle_Cell_Trait  = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Sickle_Cell_Trait \
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code like '282.6%%' or Dd.diagnosis_code like 'D57%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# History of cancer.
# One row per patient: the earliest admission date among encounters whose
# diagnosis code starts with 'V10' or 'Z85'.
all_patients_Hx_of_Cancer  = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Hx_of_Cancer \
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code like 'V10%%' or Dd.diagnosis_code like 'Z85%%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Hypercholesterolemia.
# One row per patient: the earliest admission date among encounters whose
# diagnosis code starts with '272' or 'E78'.
all_patients_Hypercholesterolemia = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_Hypercholesterolemia\
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code like '272%' or Dd.diagnosis_code like 'E78%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# History of urinary tract abnormalities.
# One row per patient: the earliest admission date among encounters carrying
# any of the diagnosis codes (exact values and prefix patterns) in the query.
all_patients_History_of_Urinary_Tract_Abnormalities  = (
    spark.sql("select P.patient_sk, min(E.admitted_dt_tm) as Diagnosis_admission_date_History_of_Urinary_Tract_Abnormalities \
                                         from cerner.orc_hf_d_diagnosis Dd \
                                         join cerner.orc_hf_f_diagnosis Df on Dd.diagnosis_id = Df.diagnosis_id\
                                         join cerner.orc_hf_f_encounter E on Df.encounter_id = E.encounter_id\
                                         join cerner.orc_hf_d_patient P on E.patient_id = P.patient_id\
                                         where Dd.diagnosis_code in ('591', '593.3', '593.89', '593.9', '599.6', '599.8', '599.9', '589.9', '596.0', 'N32.0')\
                                         or Dd.diagnosis_code like 'N13%%' or Dd.diagnosis_code like 'N26%%' or Dd.diagnosis_code like 'N27%%' or Dd.diagnosis_code like 'N28%%' or Dd.diagnosis_code like 'N31%%' or Dd.diagnosis_code like '593.7%%' or Dd.diagnosis_code like '589%%' or Dd.diagnosis_code like '596.5%%'\
                                         group by P.patient_sk")
    .persist()
    .dropDuplicates()
)

In [None]:
# Unique patient keys of the ESRD group (one row per patient).
ESRD_group_patients = (
    ESRD_group_done
    .select('patient_sk')
    .dropDuplicates()
)

In [None]:
# Build the comorbidity table for the ESRD group: one row per patient with the
# first-diagnosis admission date of each condition (null when the patient has
# no matching diagnosis).
comorbidity_tables = [
    all_patients_Nicotine_Tobacco_dependency,
    all_patients_hypertension,
    all_patients_Diabetes,
    all_patients_Coronary_Artery_Disease,
    all_patients_Cerebrovascular_Disease,
    all_patients_Peripheral_Vascular_Disease,
    all_patients_Sickle_Cell_Trait,
    all_patients_Hx_of_Cancer,
    all_patients_Hypercholesterolemia,
    all_patients_History_of_Urinary_Tract_Abnormalities,
]

DIAGNOSIS = ESRD_group_patients
for comorbidity in comorbidity_tables:
    DIAGNOSIS = DIAGNOSIS.join(comorbidity, on = ['patient_sk'], how = "left")

# A single de-duplication at the end yields the same rows as de-duplicating
# after every join, without ten extra shuffles. Only the final table is cached
# (the original also cached the partial result after the eighth join).
DIAGNOSIS = DIAGNOSIS.dropDuplicates()
DIAGNOSIS.cache()

In [None]:
# Collect the comorbidity table to the driver as a pandas DataFrame.
DIAGNOSIS_pandas = DIAGNOSIS.toPandas()

In [None]:
# Save the comorbidity table as CSV in the working directory. With pandas'
# defaults the row index is written as an unnamed first column.
DIAGNOSIS_pandas.to_csv('DIAGNOSIS_2.csv')

## Lab findings

In [None]:
# Unique patient keys of the ESRD group, used as the left side of the
# lab-result joins.
ESRD_group_patients = (
    ESRD_group_done
    .select('patient_sk')
    .dropDuplicates()
)

In [None]:
# Urine protein/creatinine ratio.
# Every lab result with LOINC code '2890-2' that has a numeric value and a
# draw date: patient key, draw date, value, unit of measure and LOINC code,
# with exact duplicate rows removed. Not restricted to the study cohort.
all_patientsdata_Urine_Protein_Creatinine = (
    spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Urine_Protein_Creatinine, Lf.numeric_result as Urine_Protein_Creatinine, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code ='2890-2'\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null")
    .dropDuplicates()
)

In [None]:
# Urine protein/creatinine results of the ESRD group. The left join keeps
# patients without any such result (their lab columns are null).
Urine_Protein_Creatinine = (
    ESRD_group_patients
    .join(all_patientsdata_Urine_Protein_Creatinine, on = ['patient_sk'], how = "left")
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the ESRD group's urine protein/creatinine results to the driver and
# save them as CSV in the working directory. With pandas' defaults the row
# index is written as an unnamed first column.
Urine_Protein_Creatinine_pandas = Urine_Protein_Creatinine.toPandas()
Urine_Protein_Creatinine_pandas.to_csv('Urine_Protein_Creatinine_pandas.csv')

In [None]:
# Urine Microalbumin/Creatinine Ratio
#
# Lab rows whose LOINC code is one of 14959-1, 30000-4, 58447-4 for all
# patients: patient key, draw date/time, numeric result, unit display text and
# LOINC code. Rows without a numeric result or a draw date/time are excluded;
# exact duplicates are dropped.
all_patientsdata_Urine_Microalbumin_Creatinine = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Urine_Microalbumin_Creatinine, Lf.numeric_result as Urine_Microalbumin_Creatinine, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('14959-1', '30000-4', '58447-4')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Urine_Microalbumin_Creatinine = (
    ESRD_group_patients
    .join(all_patientsdata_Urine_Microalbumin_Creatinine, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Urine_Microalbumin_Creatinine_pandas = Urine_Microalbumin_Creatinine.toPandas()
Urine_Microalbumin_Creatinine_pandas.to_csv('Urine_Microalbumin_Creatinine_pandas.csv')

In [None]:
# 24_hr_Urine_Protein
#
# Lab rows whose LOINC code is 21482-5 or 3167-4 for all patients: patient
# key, draw date/time, numeric result, unit display text and LOINC code. Rows
# without a numeric result or a draw date/time are excluded; exact duplicates
# are dropped.
# NOTE(review): the result alias 24_hr_Urine_Protein starts with a digit;
# presumably this Spark SQL parser accepts it -- back-quote it if parsing fails.
all_patientsdata_24_hr_Urine_Protein = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_24_hr_Urine_Protein, Lf.numeric_result as 24_hr_Urine_Protein, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('21482-5', '3167-4')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
twentyfour_hr_Urine_Protein = (
    ESRD_group_patients
    .join(all_patientsdata_24_hr_Urine_Protein, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
twentyfour_hr_Urine_Protein_pandas = twentyfour_hr_Urine_Protein.toPandas()
twentyfour_hr_Urine_Protein_pandas.to_csv('twentyfour_hr_Urine_Protein_pandas.csv')

In [None]:
# Urinalysis_Dipstick_Protein
#
# Lab rows with LOINC code 20454-5 and a non-negative numeric result for all
# patients: patient key, draw date/time, numeric result, unit display text
# (aliased UOM_Dipstick here, unlike the other lab queries) and LOINC code.
# Rows without a draw date/time are excluded; exact duplicates are dropped.
# The "is not null" test on numeric_result is redundant next to ">= 0" but is
# kept as written.
all_patientsdata_Urinalysis_Dipstick_Protein = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Urinalysis_Dipstick_Protein, Lf.numeric_result as Urinalysis_Dipstick_Protein, U.unit_display as UOM_Dipstick, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code ='20454-5' and Lf.numeric_result >= 0\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Urinalysis_Dipstick_Protein = (
    ESRD_group_patients
    .join(all_patientsdata_Urinalysis_Dipstick_Protein, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Urinalysis_Dipstick_Protein_pandas = Urinalysis_Dipstick_Protein.toPandas()
Urinalysis_Dipstick_Protein_pandas.to_csv('Urinalysis_Dipstick_Protein_pandas.csv')

In [None]:
# Hemoglobin A1c
#
# Lab rows whose LOINC code is one of 59261-8, 41995-2, 17856-6, 4548-4 for
# all patients: patient key, draw date/time, numeric result, unit display text
# and LOINC code. Rows without a numeric result or a draw date/time are
# excluded; exact duplicates are dropped.
# NOTE(review): the result column is aliased SBP_Hemoglobin_A1c; the "SBP_"
# prefix looks like a leftover from another query -- confirm before renaming,
# because the CSV header written later depends on it.
all_patientsdata_Hemoglobin_A1c = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Hemoglobin_A1c, Lf.numeric_result as SBP_Hemoglobin_A1c, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('59261-8', '41995-2', '17856-6', '4548-4')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Hemoglobin_A1c = (
    ESRD_group_patients
    .join(all_patientsdata_Hemoglobin_A1c, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Hemoglobin_A1c_pandas = Hemoglobin_A1c.toPandas()
Hemoglobin_A1c_pandas.to_csv('Hemoglobin_A1c_pandas.csv')

In [None]:
# Hemoglobin
#
# Lab rows with LOINC code 718-7 for all patients: patient key, draw
# date/time, numeric result, unit display text and LOINC code. Rows without a
# numeric result or a draw date/time are excluded; exact duplicates are dropped.
#
# The raw query result is intentionally NOT persisted: it is consumed exactly
# once by dropDuplicates(), and because the name is rebound right away there
# would be no handle left to unpersist the cached (un-deduplicated) data.
all_patientsdata_Hemoglobin = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Hemoglobin, Lf.numeric_result as Hemoglobin, U.unit_display as UOM , Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code ='718-7'\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null")

all_patientsdata_Hemoglobin = all_patientsdata_Hemoglobin.dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Hemoglobin = (
    ESRD_group_patients
    .join(all_patientsdata_Hemoglobin, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Hemoglobin_pandas = Hemoglobin.toPandas()
Hemoglobin_pandas.to_csv('Hemoglobin_pandas.csv')

In [None]:
# Serum_Calcium
#
# Lab rows whose LOINC code is 17861-6 or 2000-8 for all patients: patient
# key, draw date/time, numeric result, unit display text and LOINC code. Rows
# without a numeric result or a draw date/time are excluded; exact duplicates
# are dropped.
all_patientsdata_Serum_Calcium = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Serum_Calcium, Lf.numeric_result as Serum_Calcium, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('17861-6', '2000-8')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Serum_Calcium = (
    ESRD_group_patients
    .join(all_patientsdata_Serum_Calcium, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Serum_Calcium_pandas = Serum_Calcium.toPandas()
Serum_Calcium_pandas.to_csv('Serum_Calcium_pandas.csv')

In [None]:
# Serum_Bicarbonate
#
# Lab rows with LOINC code 1963-8 for all patients: patient key, draw
# date/time, numeric result, unit display text and LOINC code. Rows without a
# numeric result or a draw date/time are excluded; exact duplicates are dropped.
all_patientsdata_Serum_Bicarbonate = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Serum_Bicarbonate, Lf.numeric_result as Serum_Bicarbonate, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code ='1963-8'\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Serum_Bicarbonate = (
    ESRD_group_patients
    .join(all_patientsdata_Serum_Bicarbonate, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Serum_Bicarbonate_pandas = Serum_Bicarbonate.toPandas()
Serum_Bicarbonate_pandas.to_csv('Serum_Bicarbonate_pandas.csv')

In [None]:
# HIV
#
# Lab rows whose LOINC code is 68961-2 or 7917-8 for all patients: patient
# key, draw date/time, numeric result, unit display text and LOINC code. Rows
# without a numeric result or a draw date/time are excluded; exact duplicates
# are dropped.
all_patientsdata_HIV = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_HIV, Lf.numeric_result as HIV, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('68961-2', '7917-8')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
HIV = (
    ESRD_group_patients
    .join(all_patientsdata_HIV, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
HIV_pandas = HIV.toPandas()
HIV_pandas.to_csv('HIV_pandas.csv')

In [None]:
# Hepatitis_C
#
# Lab rows whose LOINC code is one of 72376-7, 22327-1, 5198-7, 16128-1,
# 13955-0 for all patients: patient key, draw date/time, numeric result, unit
# display text and LOINC code. Rows without a numeric result or a draw
# date/time are excluded; exact duplicates are dropped.
#
# NOTE(review): the original IN list contained '22327-1' twice. The duplicate
# has been removed (the selected rows are unchanged); if the second entry was a
# typo for a different LOINC code, add that code here.
all_patientsdata_Hepatitis_C = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Hepatitis_C, Lf.numeric_result as Hepatitis_C, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('72376-7', '22327-1', '5198-7', '16128-1', '13955-0')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null")

all_patientsdata_Hepatitis_C = all_patientsdata_Hepatitis_C.dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Hepatitis_C = (
    ESRD_group_patients
    .join(all_patientsdata_Hepatitis_C, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Hepatitis_C_pandas = Hepatitis_C.toPandas()
Hepatitis_C_pandas.to_csv('Hepatitis_C_pandas.csv')

In [None]:
# Serum_Cholesterol
#
# Lab rows with LOINC code 2093-3 for all patients: patient key, draw
# date/time, numeric result, unit display text and LOINC code. Rows without a
# numeric result or a draw date/time are excluded; exact duplicates are dropped.
#
# The raw query result is intentionally NOT persisted: it is consumed exactly
# once by dropDuplicates(), and because the name is rebound right away there
# would be no handle left to unpersist the cached (un-deduplicated) data.
all_patientsdata_Serum_Cholesterol = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Serum_Cholesterol, Lf.numeric_result as Serum_Cholesterol, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code ='2093-3'\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null")

all_patientsdata_Serum_Cholesterol = all_patientsdata_Serum_Cholesterol.dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Serum_Cholesterol = (
    ESRD_group_patients
    .join(all_patientsdata_Serum_Cholesterol, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Serum_Cholesterol_pandas = Serum_Cholesterol.toPandas()
Serum_Cholesterol_pandas.to_csv('Serum_Cholesterol_pandas.csv')

In [None]:
# serum_albumin
#
# Lab rows whose LOINC code is one of 61152-5, 62234-0, 61151-7, 62235-7,
# 1751-7, 54347-0 for all patients: patient key, draw date/time, numeric
# result, unit display text and LOINC code. Rows without a numeric result or a
# draw date/time are excluded; exact duplicates are dropped.
all_patientsdata_Serum_albumin = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Serum_albumin, Lf.numeric_result as Serum_albumin, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code in ('61152-5', '62234-0', '61151-7', '62235-7', '1751-7', '54347-0')\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Serum_albumin = (
    ESRD_group_patients
    .join(all_patientsdata_Serum_albumin, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Serum_albumin_pandas = Serum_albumin.toPandas()
Serum_albumin_pandas.to_csv('Serum_albumin_pandas.csv')

In [None]:
# serum_Phosphorus
#
# Lab rows with LOINC code 2777-1 for all patients: patient key, draw
# date/time, numeric result, unit display text and LOINC code. Rows without a
# numeric result or a draw date/time are excluded; exact duplicates are dropped.
all_patientsdata_Serum_Phosphorus = spark.sql("select P.patient_sk, Lf.lab_drawn_dt_tm as Date_Serum_Phosphorus, Lf.numeric_result as Serum_Phosphorus, U.unit_display as UOM, Ld.loinc_code as LOINC_code\
                                      from cerner.orc_hf_d_patient P \
                                      join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id\
                                      join cerner.orc_hf_f_lab_procedure Lf on Lf.encounter_id = E.encounter_id\
                                      join cerner.orc_hf_d_lab_procedure Ld on Lf.detail_lab_procedure_id = Ld.lab_procedure_id\
                                      join cerner.orc_hf_d_unit U on Lf.result_units_id = U.unit_id\
                                      where Ld.loinc_code = '2777-1'\
                                      and Lf.numeric_result is not null\
                                      and Lf.lab_drawn_dt_tm is not null").dropDuplicates()

In [None]:
# Left join from ESRD_group_patients: patients without a matching lab row are
# kept with null lab columns. The de-duplicated result is marked for caching.
Serum_Phosphorus = (
    ESRD_group_patients
    .join(all_patientsdata_Serum_Phosphorus, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [None]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Serum_Phosphorus_pandas = Serum_Phosphorus.toPandas()
Serum_Phosphorus_pandas.to_csv('Serum_Phosphorus_pandas.csv')

## Medication data

In [4]:
# One row per patient_sk found in ESRD_group_done; every medication table
# below is left-joined onto this list.
ESRD_group_patients = ESRD_group_done.select('patient_sk').distinct()

In [5]:
## NSAID

# Lower-case drug-name fragments used to recognise NSAID medication records.
# A '%' inside a fragment is a SQL LIKE wildcard (e.g. 'mefenamic%acid').
# Spelling fixes relative to the earlier hand-written list: 'flurbiprofen'
# (was 'flubiprofen') and 'nabumetone' (was 'nabumethone'); the misspelled
# patterns could never match the correctly spelled drug names.
# NOTE(review): 'naproxen%sodium' only matches names that also contain
# "sodium", so plain "naproxen" is not captured -- kept as written; confirm
# whether that is intended.
_nsaid_name_patterns = [
    'celecoxib', 'diclofenac', 'tolmetin',
    'etodolac', 'fenoprofen', 'flurbiprofen',
    'ibuprofen', 'indomethacin', 'ketoprofen',
    'ketorolac', 'meclofenamate', 'mefenamic%acid',
    'meloxicam', 'nabumetone', 'naproxen%sodium',
    'oxaprozin', 'piroxicam', 'sulindac',
]

# Compare against lower(<column>) so that the match is case-insensitive
# (lower-case, Capitalised, ALL-CAPS and mixed-case names are all found), for
# both the generic and the brand name of the medication.
_nsaid_filter = " or ".join(
    "lower(Md.{0}) like '%{1}%'".format(name_column, pattern)
    for name_column in ('generic_name', 'brand_name')
    for pattern in _nsaid_name_patterns
)

# Patient key and medication start date/time for every medication record whose
# generic or brand name matches one of the NSAID patterns; exact duplicate
# rows are dropped.
all_patients_NSAID = spark.sql(
    "select P.patient_sk, Mf.med_started_dt_tm as Date_NSAID "
    "from cerner.orc_hf_d_patient P "
    "join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id "
    "join cerner.orc_hf_f_medication Mf on Mf.encounter_id = E.encounter_id "
    "join cerner.orc_hf_d_medication Md on Md.medication_id = Mf.medication_id "
    "where " + _nsaid_filter
).dropDuplicates()

In [6]:
# Left join from ESRD_group_patients: patients without a matching medication
# row are kept with a null date. The de-duplicated result is marked for caching.
NSAID = (
    ESRD_group_patients
    .join(all_patients_NSAID, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [7]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
NSAID_pandas = NSAID.toPandas()
NSAID_pandas.to_csv('NSAID_pandas.csv')

In [8]:
## Proton_Pump_Inhibitors

# Lower-case drug-name fragments used to recognise proton pump inhibitor
# records. Some fragments are substrings of others ('lansoprazole' also covers
# 'dexlansoprazole'); the full list is kept for readability.
_ppi_name_patterns = [
    'dexlansoprazole', 'esomeprazole', 'lansoprazole',
    'omeprazole', 'pantoprazole', 'rabeprazole',
]

# Compare against lower(<column>) so that the match is case-insensitive
# (lower-case, Capitalised, ALL-CAPS and mixed-case names are all found), for
# both the generic and the brand name of the medication.
_ppi_filter = " or ".join(
    "lower(Md.{0}) like '%{1}%'".format(name_column, pattern)
    for name_column in ('generic_name', 'brand_name')
    for pattern in _ppi_name_patterns
)

# Patient key and medication start date/time for every medication record whose
# generic or brand name matches one of the patterns; exact duplicate rows are
# dropped.
all_patients_Proton_Pump_Inhibitors = spark.sql(
    "select P.patient_sk, Mf.med_started_dt_tm as Date_Proton_Pump_Inhibitors "
    "from cerner.orc_hf_d_patient P "
    "join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id "
    "join cerner.orc_hf_f_medication Mf on Mf.encounter_id = E.encounter_id "
    "join cerner.orc_hf_d_medication Md on Md.medication_id = Mf.medication_id "
    "where " + _ppi_filter
).dropDuplicates()

In [9]:
# Left join from ESRD_group_patients: patients without a matching medication
# row are kept with a null date. The de-duplicated result is marked for caching.
Proton_Pump_Inhibitors = (
    ESRD_group_patients
    .join(all_patients_Proton_Pump_Inhibitors, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [10]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Proton_Pump_Inhibitors_pandas = Proton_Pump_Inhibitors.toPandas()
Proton_Pump_Inhibitors_pandas.to_csv('Proton_Pump_Inhibitors_pandas.csv')

In [11]:
## Bipolar_Drugs

# Patient key and medication start date/time for every medication record whose
# generic or brand name contains "lithium". The comparison is done on
# lower(<column>) so it is case-insensitive (e.g. "LITHIUM" is matched as
# well as "Lithium" and "lithium"); exact duplicate rows are dropped.
all_patients_Bipolar_Drugs = spark.sql(
    "select P.patient_sk, Mf.med_started_dt_tm as Date_Bipolar_Drugs "
    "from cerner.orc_hf_d_patient P "
    "join cerner.orc_hf_f_encounter E on E.patient_id = P.patient_id "
    "join cerner.orc_hf_f_medication Mf on Mf.encounter_id = E.encounter_id "
    "join cerner.orc_hf_d_medication Md on Md.medication_id = Mf.medication_id "
    "where lower(Md.generic_name) like '%lithium%' "
    "or lower(Md.brand_name) like '%lithium%'"
).dropDuplicates()

In [12]:
# Left join from ESRD_group_patients: patients without a matching medication
# row are kept with a null date. The de-duplicated result is marked for caching.
Bipolar_Drugs = (
    ESRD_group_patients
    .join(all_patients_Bipolar_Drugs, on=['patient_sk'], how='left')
    .dropDuplicates()
    .cache()
)

In [13]:
# Collect the joined table to the driver as a pandas DataFrame and write it to
# a CSV file (relative path; the pandas index is written as the first column).
Bipolar_Drugs_pandas = Bipolar_Drugs.toPandas()
Bipolar_Drugs_pandas.to_csv('Bipolar_Drugs_pandas.csv')

### For further information please contact rzz5164@psu.edu