_Author_ = "Sevda Molani"

_copyright_ = "2022 Sevda Molani"

_License_ = "Institute for Systems Biology"

_Version_ = "1.0"


In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as F
import numpy as np
from pyspark.sql.window import Window

import pandas as pd
# pandas display options for frames shown in this notebook:
# no cap on the number of rows, and max_colwidth set to 0.
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 0)

In [0]:
# Source table for every lab extraction below. It is produced by the notebook
# /Users/sevda.molani@providence.org/Sevda_Codes (1)/Covid/PATT/Initial_table_for_labs
initial = spark.sql(
    """SELECT * FROM rdp_phi_sandbox.sm_initial_delta_table_feb"""
)

In [0]:
# D-dimer: pick candidate rows by result name, keep the accepted units,
# rescale ng/ml values, then summarise each
# (pat_id, instance, pat_enc_csn_id) group as mean and last value.
dataframe_ddimer = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("d-dimer")
        | F.lower(F.col("resultname")).contains('ddimer')
        | F.lower(F.col("resultname")).contains('dimer')
        | F.lower(F.col("resultname")).contains('disseminated intravascular coagulation')
        | F.lower(F.col("resultname")).contains('intravascular coagulation')
    )
    # Units containing ug/ml (this already covers ug/mlfeu) or ng/ml are
    # tagged "standard"; any other unit is carried through unchanged.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.lower(F.col("unit")).contains("ug/ml")
            | F.lower(F.col("unit")).contains("ug/mlfeu")
            | F.lower(F.col("unit")).contains("ng/ml"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "ddimer_cleaned")
        .otherwise(F.col("resultname")),
    )
)

ddimer_df = (
    dataframe_ddimer
    .where(F.col("resultname_cleaned") == "ddimer_cleaned")
    # ng/ml results are multiplied by 0.001; all other values are kept as reported.
    .withColumn(
        'resultvalue',
        F.when(F.lower(F.col("unit")).contains("ng/ml"), F.col("resultvalue") * 0.001)
        .otherwise(F.col("resultvalue")),
    )
)

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_ddimer_1 = ddimer_df.withColumn('resultvalue_ddimer', F.mean('resultvalue').over(w))
final_ddimer_2 = final_ddimer_1.withColumn('Last_ddimer', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_ddimer', 'Last_ddimer']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_ddimer = (
    final_ddimer_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous d-dimer output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_ddimer_feb;

In [0]:
# Persist the d-dimer summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_ddimer_feb'
saved_ddimer.write.saveAsTable(name=table_name)

In [0]:
# C-reactive protein: pick rows by result name, drop percentage results and
# lymphocyte rows, then summarise each (pat_id, instance, pat_enc_csn_id)
# group as mean and last value.
dataframe_crp = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("reactive")
        | F.lower(F.col("resultname")).contains('crp')
        | F.lower(F.col("resultname")).contains('c reactive')
        | F.lower(F.col("resultname")).contains('creactive')
        | F.lower(F.col("resultname")).contains('c-reactive')
        | F.lower(F.col("resultname")).contains('c_reactive')
    )
    # NOTE(review): a NULL unit makes this comparison NULL, so rows without a
    # unit are dropped here as well -- confirm that is intended.
    .filter(F.col("unit") != "%")
    # Case-sensitive exclusion of names containing "LYMPHOCYTES"
    # (the "reactive" match above would otherwise keep them).
    .filter(~F.col("resultname").contains("LYMPHOCYTES"))
    # Every remaining row is accepted regardless of its unit.
    .withColumn('unit_cleaned', F.lit("standard"))
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "crp_cleaned")
        .otherwise(F.col("resultname")),
    )
)

crp_df = dataframe_crp.where(F.col("resultname_cleaned") == "crp_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_crp_1 = crp_df.withColumn('resultvalue_crp', F.mean('resultvalue').over(w))
final_crp_2 = final_crp_1.withColumn('Last_crp', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_crp', 'Last_crp']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_crp = (
    final_crp_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous CRP output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_crp_feb;

In [0]:
# Persist the CRP summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_crp_feb'
saved_crp.write.saveAsTable(name=table_name)

In [0]:
# eGFR: pick rows by result name (no unit filtering) and summarise each
# (pat_id, instance, pat_enc_csn_id) group as mean and last value.
dataframe_egfr = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("egfr")
        | F.lower(F.col("resultname")).contains('estimated glomerular filtration')
        | F.lower(F.col("resultname")).contains('estimated glomerular')
    )
    # Every matched row is accepted regardless of its unit.
    .withColumn('unit_cleaned', F.lit("standard"))
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "egfr_cleaned")
        .otherwise(F.col("resultname")),
    )
)

egfr_df = dataframe_egfr.where(F.col("resultname_cleaned") == "egfr_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_egfr_1 = egfr_df.withColumn('resultvalue_egfr', F.mean('resultvalue').over(w))
final_egfr_2 = final_egfr_1.withColumn('Last_egfr', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_egfr', 'Last_egfr']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_egfr = (
    final_egfr_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous eGFR output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_egfr_feb;

In [0]:
# Persist the eGFR summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_egfr_feb'
saved_egfr.write.saveAsTable(name=table_name)

In [0]:
# Prothrombin: pick rows whose result name mentions "prothrombin" (no unit
# filtering) and summarise each (pat_id, instance, pat_enc_csn_id) group as
# mean and last value.
dataframe_Prothrombin = (
    initial
    .filter(F.lower(F.col("resultname")).contains("prothrombin"))
    # Every matched row is accepted regardless of its unit.
    .withColumn('unit_cleaned', F.lit("standard"))
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "Proth_cleaned")
        .otherwise(F.col("resultname")),
    )
)

Proth_df = dataframe_Prothrombin.where(F.col("resultname_cleaned") == "Proth_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_Proth_1 = Proth_df.withColumn('resultvalue_Proth', F.mean('resultvalue').over(w))
final_Proth_2 = final_Proth_1.withColumn('Last_Proth', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_Proth', 'Last_Proth']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_Proth = (
    final_Proth_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous prothrombin output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Proth_feb;

In [0]:
# Persist the prothrombin summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_Proth_feb'
saved_Proth.write.saveAsTable(name=table_name)

In [0]:
# BUN/creatinine ratio: pick rows by result name, accept a missing unit or a
# unit containing "ratio", then summarise each
# (pat_id, instance, pat_enc_csn_id) group as mean and last value.
dataframe_buncreat = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("bun/creatinine")
        | F.lower(F.col("resultname")).contains('bun/creat')
    )
    # Accepted units are tagged "standard"; any other unit is carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").isNull() | F.lower(F.col("unit")).contains("ratio"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "buncreat_cleaned")
        .otherwise(F.col("resultname")),
    )
)

buncreat_df = dataframe_buncreat.where(F.col("resultname_cleaned") == "buncreat_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_buncreat_1 = buncreat_df.withColumn('resultvalue_buncreat', F.mean('resultvalue').over(w))
final_buncreat_2 = final_buncreat_1.withColumn('Last_buncreat', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_buncreat', 'Last_buncreat']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_buncreat = (
    final_buncreat_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous BUN/creatinine output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_buncreat_feb;

In [0]:
# Persist the BUN/creatinine summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_buncreat_feb'
saved_buncreat.write.saveAsTable(name=table_name)

In [0]:
# Ferritin: pick rows by result name, accept a missing unit or a unit
# containing ng/ml, then summarise each (pat_id, instance, pat_enc_csn_id)
# group as mean and last value.
dataframe_Ferritin = (
    initial
    .filter(F.lower(F.col("resultname")).contains("ferritin"))
    # Accepted units are tagged "standard"; any other unit is carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").isNull() | F.lower(F.col("unit")).contains("ng/ml"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "fr_cleaned")
        .otherwise(F.col("resultname")),
    )
)

fr_df = dataframe_Ferritin.where(F.col("resultname_cleaned") == "fr_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_fr_1 = fr_df.withColumn('resultvalue_fr', F.mean('resultvalue').over(w))
final_fr_2 = final_fr_1.withColumn('Last_fr', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_fr', 'Last_fr']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_fr = (
    final_fr_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous ferritin output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_fr_feb;

In [0]:
# Persist the ferritin summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_fr_feb'
saved_fr.write.saveAsTable(name=table_name)

In [0]:
# INR: pick rows by result name, accept a missing unit or a unit containing
# "ratio", then summarise each (pat_id, instance, pat_enc_csn_id) group as
# mean and last value.
#
# Fix: the long-form name used to be matched as " international normalized
# ratio" (with a leading space), which missed result names that start with
# the phrase. The stray space is removed so those names are matched too.
dataframe_inr = initial.where(
    lower(col("resultname")).contains("inr")
    | lower(col("resultname")).contains("international normalized ratio")
)

# Accepted units are tagged "standard"; any other unit is carried through.
dataframe_inr = dataframe_inr.withColumn(
    'unit_cleaned',
    when(dataframe_inr.unit.isNull(), "standard")
    .when(lower(dataframe_inr.unit).contains("ratio"), "standard")
    .otherwise(col("unit")),
)

dataframe_inr = dataframe_inr.withColumn(
    'resultname_cleaned',
    when(dataframe_inr.unit_cleaned.contains("standard"), "inr_cleaned")
    .otherwise(col("resultname")),
)

inr_df = dataframe_inr.filter(dataframe_inr.resultname_cleaned == "inr_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_inr_1 = inr_df.withColumn('resultvalue_inr', F.mean('resultvalue').over(w))
final_inr_2 = final_inr_1.withColumn('Last_inr', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_inr', 'Last_inr']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_inr = final_inr_2[columns]
saved_inr = saved_inr.dropDuplicates(['pat_id', 'instance', 'pat_enc_csn_id'])

In [0]:
%sql
-- Remove the previous INR output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_inr_feb;

In [0]:
# Persist the INR summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_inr_feb'
saved_inr.write.saveAsTable(name=table_name)

In [0]:
# Magnesium: pick rows by result name, accept a missing unit or a unit
# containing mg/dl, then summarise each (pat_id, instance, pat_enc_csn_id)
# group as mean and last value.
dataframe_mg = (
    initial
    .filter(F.lower(F.col("resultname")).contains("magnesium"))
    # Accepted units are tagged "standard"; any other unit is carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").isNull() | F.lower(F.col("unit")).contains("mg/dl"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "mg_cleaned")
        .otherwise(F.col("resultname")),
    )
)

mg_df = dataframe_mg.where(F.col("resultname_cleaned") == "mg_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_mg_1 = mg_df.withColumn('resultvalue_mg', F.mean('resultvalue').over(w))
final_mg_2 = final_mg_1.withColumn('Last_mg', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_mg', 'Last_mg']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_mg = (
    final_mg_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous magnesium output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_mg_feb;

In [0]:
# Persist the magnesium summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_mg_feb'
saved_mg.write.saveAsTable(name=table_name)

In [0]:
# Procalcitonin: pick rows by result name, accept a missing unit or a unit
# containing ng/ml, then summarise each (pat_id, instance, pat_enc_csn_id)
# group as mean and last value.
dataframe_procalcitonin = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("procalcitonin")
        | F.lower(F.col("resultname")).contains("pct")
    )
    # Accepted units are tagged "standard"; any other unit is carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").isNull() | F.lower(F.col("unit")).contains("ng/ml"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "pct_cleaned")
        .otherwise(F.col("resultname")),
    )
)

pct_df = dataframe_procalcitonin.where(F.col("resultname_cleaned") == "pct_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_pct_1 = pct_df.withColumn('resultvalue_pct', F.mean('resultvalue').over(w))
final_pct_2 = final_pct_1.withColumn('Last_pct', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_pct', 'Last_pct']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_pct = (
    final_pct_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous procalcitonin output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_pct_feb;

In [0]:
# Persist the procalcitonin summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_pct_feb'
saved_pct.write.saveAsTable(name=table_name)

In [0]:
# LDH: pick rows by result name, accept a missing unit or a unit containing
# iu/l or u/l, then summarise each (pat_id, instance, pat_enc_csn_id) group
# as mean and last value.
dataframe_ldh = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("lactate dehydrogenase")
        | F.lower(F.col("resultname")).contains("ldh")
    )
    # Accepted units are tagged "standard"; any other unit is carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").isNull()
            | F.lower(F.col("unit")).contains("iu/l")
            | F.lower(F.col("unit")).contains("u/l"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "ldh_cleaned")
        .otherwise(F.col("resultname")),
    )
)

ldh_df = dataframe_ldh.where(F.col("resultname_cleaned") == "ldh_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_ldh_1 = ldh_df.withColumn('resultvalue_ldh', F.mean('resultvalue').over(w))
final_ldh_2 = final_ldh_1.withColumn('Last_ldh', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_ldh', 'Last_ldh']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_ldh = (
    final_ldh_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous LDH output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_ldh_feb;

In [0]:
# Persist the LDH summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_ldh_feb'
saved_ldh.write.saveAsTable(name=table_name)

In [0]:
# Base excess: pick rows by result name, accept a missing unit or a unit
# containing mmol/l or meq/l, then summarise each
# (pat_id, instance, pat_enc_csn_id) group as mean and last value.
dataframe_be = (
    initial
    .filter(F.lower(F.col("resultname")).contains("base excess"))
    # Accepted units are tagged "standard"; any other unit is carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").isNull()
            | F.lower(F.col("unit")).contains("mmol/l")
            | F.lower(F.col("unit")).contains("meq/l"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "BE_cleaned")
        .otherwise(F.col("resultname")),
    )
)

BE_df = dataframe_be.where(F.col("resultname_cleaned") == "BE_cleaned")

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_be_1 = BE_df.withColumn('resultvalue_be', F.mean('resultvalue').over(w))
final_be_2 = final_be_1.withColumn('Last_be', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_be', 'Last_be']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_be = (
    final_be_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous base-excess output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_be_feb;

In [0]:
# Persist the base-excess summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_be_feb'
saved_be.write.saveAsTable(name=table_name)

In [0]:
## WBC
# Pick white-blood-cell rows by result name, remove rows with unusable units
# or unrelated result names, keep accepted units, drop rarely used units, and
# summarise each (pat_id, instance, pat_enc_csn_id) group as mean and last value.
dataframe_wbc = initial.filter(
    F.lower(F.col("resultname")).contains("wbc")
    | F.lower(F.col("resultname")).contains('white blood cell')
)

# Rows without a unit are removed (the alternative of filling them with a
# placeholder unit is not used here).
dataframe_wbc = dataframe_wbc.filter(F.col("unit").isNotNull())

# Units dropped by exact match. mg/L and mg/dL are removed because no
# conversion to k/ul is known.
for _wbc_unit in ("nmol/min/mg", "g/dL", "mg/L", "mg/dL", "%"):
    dataframe_wbc = dataframe_wbc.filter(F.col("unit") != _wbc_unit)

# Units mentioning "cell" are dropped (marked "for now??" by the author).
dataframe_wbc = dataframe_wbc.filter(~F.lower(F.col("unit")).contains("cell"))

# Case-insensitive result-name exclusions: "cell" (for now??), urine,
# "wbc corr" (meaning unclear to the author), "ua" (stands for urine),
# body fluid, stool and percentage results.
# NOTE(review): "ua" is a plain substring test, so it also removes any name
# that merely contains those two letters (e.g. "manual") -- confirm intended.
for _wbc_fragment in ("cell", "urine", "wbc corr", "ua", "fluid", "stool", "%"):
    dataframe_wbc = dataframe_wbc.filter(
        ~F.lower(F.col("resultname")).contains(_wbc_fragment)
    )

# Case-sensitive result-name exclusions.
for _wbc_fragment in (
    "CD4/CD8 RATIO.WBC.QN (REF)",
    "WBC CSF",
    "WBC COMMENT",
    "ADJUSTED",
    "MORPHOLOGY",
):
    dataframe_wbc = dataframe_wbc.filter(~F.col("resultname").contains(_wbc_fragment))

# A row is tagged "standard" when either its unit or its result name matches
# one of the accepted forms; any other unit is carried through unchanged.
dataframe_wbc = dataframe_wbc.withColumn(
    'unit_cleaned',
    F.when(
        (F.col("unit") == "ul")
        | F.lower(F.col("unit")).contains("k/ul")
        | F.lower(F.col("unit")).contains("mcl")
        | F.lower(F.col("unit")).contains("mm3")
        | F.lower(F.col("unit")).contains("cumm")
        | F.lower(F.col("unit")).contains("cmm")
        | F.lower(F.col("unit")).contains("10*3/ul")
        | F.col("resultname").contains("(10*3/UL)")
        | F.col("resultname").contains("(K/UL)")
        | (F.col("resultname") == "WBC (REF)")
        | (F.col("resultname") == "WBC")
        | F.col("resultname").contains("EXTERNAL"),
        "standard",
    ).otherwise(F.col("unit")),
).withColumn(
    'resultname_cleaned',
    F.when(F.col("unit_cleaned").contains("standard"), "WBC_cleaned")
    .otherwise(F.col("resultname")),
)

WBC_df = (
    dataframe_wbc
    .where(F.col("resultname_cleaned") == "WBC_cleaned")
    # The textual result "Normal" is replaced by the fixed value 7.5.
    .withColumn(
        "resultvalue",
        F.when(F.col("resultvalue") == "Normal", 7.5).otherwise(F.col("resultvalue")),
    )
)

# Keep only units that occur more than 5 times among the accepted rows.
count = WBC_df.groupBy("unit").count()
final_wbc = WBC_df.join(count, on="unit").where(F.col("count") > 5)

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_wbc_1 = final_wbc.withColumn('resultvalue_WBC', F.mean('resultvalue').over(w))
final_wbc_2 = final_wbc_1.withColumn('Last_wbc', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_WBC', 'Last_wbc']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_wbc = (
    final_wbc_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous WBC output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_wbc_feb;

In [0]:
# Persist the WBC summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_wbc_feb'
saved_wbc.write.saveAsTable(name=table_name)

In [0]:
# Lymphocytes: pick rows by result name, remove unusable units and unrelated
# result names, keep accepted units, drop rarely used units, and summarise
# each (pat_id, instance, pat_enc_csn_id) group as mean and last value.
dataframe_lymphocyte = initial.filter(
    F.lower(F.col("resultname")).contains("lymphocyte")
    | F.lower(F.col("resultname")).contains('lymphs')
)

# Missing units are replaced by the placeholder "missing" (accepted below).
dataframe_lymphocyte = dataframe_lymphocyte.fillna("missing", subset=["unit"])

# Units dropped by exact match.
for _lymph_unit in ("CEL/UL", "%", "L", "`", "/L"):
    dataframe_lymphocyte = dataframe_lymphocyte.filter(F.col("unit") != _lymph_unit)

# Units dropped by case-insensitive substring.
for _lymph_fragment in ("cell", "cel", "%"):
    dataframe_lymphocyte = dataframe_lymphocyte.filter(
        ~F.lower(F.col("unit")).contains(_lymph_fragment)
    )

# Result names dropped by exact match.
for _lymph_name in (
    "ATYPICAL LYMPHOCYTES.BLD.QN.CALC.MAN (K/UL) (BEAKER)",
    "VARIANT LYMPHS, CSF",
    "ABS.CD8-CD57+ LYMPHS",
    "LYMPHS CSFA",
    "ABS CD8 + CD57 + LYMPHS",
    "ABS.CD8+CD38+ LYMPHS.BLD",
    "ABS.CD8+HLA-DR+LYMPHS.BLD.QN (REF)",
    "ABS CD3 + CD25 + LYMPHS",
):
    dataframe_lymphocyte = dataframe_lymphocyte.filter(F.col("resultname") != _lymph_name)

# Result names dropped by case-insensitive substring.
# NOTE(review): "ua" is a plain substring test and therefore also removes
# names containing "manual" -- confirm intended.
for _lymph_fragment in (
    "cell",
    "%",
    "fluid",
    "stool",
    "urine",
    "ua",
    "abnormal",
    "variant",
    "atypical",
    "reactive",
    "prolymphocytes",
):
    dataframe_lymphocyte = dataframe_lymphocyte.filter(
        ~F.lower(F.col("resultname")).contains(_lymph_fragment)
    )

# A row is tagged "standard" when either its unit or its result name matches
# one of the accepted forms; any other unit is carried through unchanged.
# Rows named "LYMPHOCYTES MANUAL COUNT.BLD.QN (BEAKER)" never get here because
# the "ua" exclusion above has already removed them.
dataframe_lymphocyte = dataframe_lymphocyte.withColumn(
    'unit_cleaned',
    F.when(
        (F.col("unit") == "missing")
        | F.lower(F.col("unit")).contains("k/ul")
        | F.lower(F.col("unit")).contains("ul")
        | F.lower(F.col("unit")).contains("mcl")
        | F.lower(F.col("unit")).contains("mm3")
        | F.lower(F.col("unit")).contains("cumm")
        | F.lower(F.col("unit")).contains("cmm")
        | F.lower(F.col("unit")).contains("10*3/ul")
        | F.col("resultname").contains("(10*3/UL)")
        | F.col("resultname").contains("(K/UL)")
        | (F.col("resultname") == "LYMPHOCYTES ABS (REF)")
        | (F.col("resultname") == "LYMPHOCYTES BL")
        | (F.col("resultname") == "LYMPHOCYTES MANUAL COUNT.BLD.QN (BEAKER)")
        | (F.col("resultname") == "EXTERNAL: LYMPHOCYTES, ABSOLUTE")
        | F.lower(F.col("resultname")).contains("ul")
        | (F.col("unit") == "THOU/u"),
        "standard",
    ).otherwise(F.col("unit")),
).withColumn(
    'resultname_cleaned',
    F.when(F.col("unit_cleaned").contains("standard"), "lymabs_cleaned")
    .otherwise(F.col("resultname")),
)

lymphocyte_df = dataframe_lymphocyte.where(F.col("resultname_cleaned") == "lymabs_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = lymphocyte_df.groupBy("unit").count()
final_lymphocyte = lymphocyte_df.join(count, on="unit").where(F.col("count") > 5)

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_lymphocyte_1 = final_lymphocyte.withColumn('resultvalue_lymph', F.mean('resultvalue').over(w))
final_lymphocyte_2 = final_lymphocyte_1.withColumn('Last_lymph', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_lymph', 'Last_lymph']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_lymph = (
    final_lymphocyte_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous lymphocyte output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_lymphs_feb;

In [0]:
# Persist the lymphocyte summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_lymphs_feb'
saved_lymph.write.saveAsTable(name=table_name)

In [0]:
# Hematocrit: pick rows by result name, require a unit, drop "adjusted"
# results, accept units containing "%" or "percent", drop rarely used units,
# and summarise each (pat_id, instance, pat_enc_csn_id) group as mean and
# last value.
dataframe_hematocrit = (
    initial
    .filter(
        F.lower(F.col("resultname")).contains("hematocrit")
        | F.lower(F.col("resultname")).contains("hct")
    )
    .filter(F.col("unit").isNotNull())
    # Adjusted HCT results are removed.
    .filter(~F.lower(F.col("resultname")).contains("adjusted"))
    # Both unit tests are case-sensitive; other units are carried through.
    .withColumn(
        'unit_cleaned',
        F.when(
            F.col("unit").contains("%") | F.col("unit").contains("percent"),
            "standard",
        ).otherwise(F.col("unit")),
    )
    .withColumn(
        'resultname_cleaned',
        F.when(F.col("unit_cleaned").contains("standard"), "hct_cleaned")
        .otherwise(F.col("resultname")),
    )
)

hematocrit_df = dataframe_hematocrit.where(F.col("resultname_cleaned") == "hct_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = hematocrit_df.groupBy("unit").count()
final_hematocrit = hematocrit_df.join(count, on="unit").where(F.col("count") > 5)

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_hematocrit_1 = final_hematocrit.withColumn('resultvalue_HCT', F.mean('resultvalue').over(w))
final_hematocrit_2 = final_hematocrit_1.withColumn('Last_HCT', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_HCT', 'Last_HCT']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_hct = (
    final_hematocrit_2
    .select(*columns)
    .drop_duplicates(subset=['pat_id', 'instance', 'pat_enc_csn_id'])
)

In [0]:
%sql
-- Remove the previous hematocrit output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_hct_feb;

In [0]:
# Persist the hematocrit summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_hct_feb'
saved_hct.write.saveAsTable(name=table_name)

In [0]:
# Hemoglobin: pick rows by result name, require a unit, drop mg/dl units and
# percentage/deleted results, accept g/dl-style units, drop rarely used units,
# and summarise each (pat_id, instance, pat_enc_csn_id) group as mean and
# last value.
#
# Fix: the stand-alone "hb" pattern is a regular expression, but it was passed
# to Column.contains(), which performs a literal substring test and therefore
# never matched. It is now evaluated with Column.rlike().
# NOTE(review): any name containing "hemoglobin" that is reported in g/dl
# passes the unit test below -- confirm no derived indices slip in.
dataframe_hemoglobin = initial.where(
    lower(col("resultname")).contains("hemoglobin")
    | lower(col("resultname")).contains("hgb")
    | lower(col("resultname")).rlike(r'(?:\s|^)hb(?:\s|$)')
)

dataframe_hemoglobin = dataframe_hemoglobin.where(dataframe_hemoglobin.unit.isNotNull())
dataframe_hemoglobin = dataframe_hemoglobin.where(~(lower(col("unit")).contains("mg/dl")))
dataframe_hemoglobin = dataframe_hemoglobin.where(~(lower(col("resultname")).contains("%")))
dataframe_hemoglobin = dataframe_hemoglobin.where(~(lower(col("resultname")).contains("deleted")))

# Accepted units are tagged "standard"; any other unit is carried through.
dataframe_hemoglobin = dataframe_hemoglobin.withColumn(
    'unit_cleaned',
    when(lower(dataframe_hemoglobin.unit).contains("g/dl"), "standard")
    .when(lower(dataframe_hemoglobin.unit).contains("gm/dl"), "standard")
    .when(lower(dataframe_hemoglobin.unit).contains("grams/dl"), "standard")
    .when(dataframe_hemoglobin.unit == "grams per deciliter", "standard")
    .otherwise(col("unit")),
)

dataframe_hemoglobin = dataframe_hemoglobin.withColumn(
    'resultname_cleaned',
    when(dataframe_hemoglobin.unit_cleaned.contains("standard"), "hgb_cleaned")
    .otherwise(col("resultname")),
)

hemoglobin_df = dataframe_hemoglobin.filter(dataframe_hemoglobin.resultname_cleaned == "hgb_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = hemoglobin_df.groupBy(["unit"]).count()
final_hemoglobin = hemoglobin_df.join(count, "unit")
final_hemoglobin = final_hemoglobin.filter(col("count") > 5)

# The frame spans the whole group ordered by observationdatetime, so mean()
# sees every row and last() returns the value on the final row in that order.
w = (
    Window
    .partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_hemoglobin_1 = final_hemoglobin.withColumn('resultvalue_HGB', F.mean('resultvalue').over(w))
final_hemoglobin_2 = final_hemoglobin_1.withColumn('Last_HGB', F.last('resultvalue').over(w))

columns = ['pat_id', 'instance', 'pat_enc_csn_id', 'resultvalue_HGB', 'Last_HGB']
# All rows of a group carry the same two aggregates, so keep one row per group.
saved_hemoglobin = final_hemoglobin_2[columns]
saved_hemoglobin = saved_hemoglobin.dropDuplicates(['pat_id', 'instance', 'pat_enc_csn_id'])

In [0]:
%sql
-- Remove the previous hemoglobin output so the following cell can recreate it.
-- IF EXISTS keeps the cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_hgb_feb;

In [0]:
# Persist the hemoglobin summary (mean and last value per group) as a table.
table_name = 'rdp_phi_sandbox.sm_patt_hgb_feb'
saved_hemoglobin.write.saveAsTable(name=table_name)

In [0]:
# Sodium: pick rows by result name, then remove rows whose unit or result
# name marks them as unusable.
#
# Fix: the stand-alone "na" pattern is a regular expression, but it was passed
# to Column.contains(), which performs a literal substring test and therefore
# never matched. It is now evaluated with Column.rlike().
dataframe_sodium = initial.where(
    lower(col("resultname")).contains("sodium")
    | lower(col("resultname")).rlike(r'(?:\s|^)na(?:\s|$)')
)

# Missing units are replaced by the placeholder "missing" rather than dropped.
dataframe_sodium = dataframe_sodium.na.fill("missing", ["unit"])

# Units dropped by exact match (numeric-looking entries, 24-hour units and
# other non-unit strings).
for _sodium_unit in (
    "145",
    "`",
    "MMOL/24HRS",
    "70",
    "32",
    "5.2",
    "146",
    "134-144",
    "4.3",
    "143",
    "Other",
    "units",
    "mmol/24h",
):
    dataframe_sodium = dataframe_sodium.where(dataframe_sodium.unit != _sodium_unit)

# Result names dropped: the 24-hour volume test, anything mentioning urine
# (case-insensitive), and names containing "24 HOUR" or "VENOUS" (case-sensitive).
dataframe_sodium = dataframe_sodium.where(dataframe_sodium.resultname != "SODIUM 24 HOUR VOLUME")
dataframe_sodium = dataframe_sodium.where(~(lower(col("resultname")).contains("urine")))
dataframe_sodium = dataframe_sodium.where(~col("resultname").contains("24 HOUR"))
dataframe_sodium = dataframe_sodium.where(~col("resultname").contains("VENOUS"))

dataframe_sodium = dataframe_sodium.withColumn('unit_cleaned', 
                                               when((dataframe_sodium.unit=="missing"),"standard").\
                                               when(lower(dataframe_sodium.unit).contains("mmol/l"),"standard").\
                                               when(lower(dataframe_sodium.unit).contains("mmol"),"standard").\
                                               when(lower(dataframe_sodium.unit).contains("mmoi/l"),"standard").\
                                               when(lower(dataframe_sodium.unit).contains("mm/l"),"standard").\
                                               when(lower(dataframe_sodium.unit).contains("meq/l"),"standard").\
                                               when(lower(dataframe_sodium.unit).contains("meq"),"standard").\
                                               when(dataframe_sodium.unit == "mmol per liter","standard").\
                                               otherwise(col("unit")))

dataframe_sodium = dataframe_sodium.withColumn('resultname_cleaned', 
                                         when(dataframe_sodium.unit_cleaned.contains("standard"),"na_cleaned").\
                                         otherwise(col("resultname")))

sodium_df = dataframe_sodium.filter(dataframe_sodium.resultname_cleaned =="na_cleaned")

count = sodium_df.groupBy(["unit"]).count()
final_sodium = sodium_df.join(count,"unit")
final_sodium = final_sodium.filter(col("count")>5)

##############
w=Window.partitionBy("pat_id","instance","pat_enc_csn_id").orderBy("observationdatetime").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
final_sodium_1 = final_sodium.withColumn('resultvalue_sodium', F.mean('resultvalue').over(w))
final_sodium_2 = final_sodium_1.withColumn('Last_sodium', F.last('resultvalue').over(w))
    
columns = ['pat_id','instance','pat_enc_csn_id','resultvalue_sodium','Last_sodium']
saved_sodium = final_sodium_2[columns]
saved_sodium = saved_sodium.dropDuplicates(['pat_id','instance','pat_enc_csn_id'])

In [0]:
%sql
-- Remove any previous sodium summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_sodium_feb;

In [0]:
# Persist the per-encounter sodium summary as rdp_phi_sandbox.sm_patt_sodium_feb.
table_name = "rdp_phi_sandbox.sm_patt_sodium_feb"
saved_sodium.write.saveAsTable(name=table_name)

In [0]:
# Albumin: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

# Select names containing "albumin", or "alb" as a whitespace-delimited token.
# rlike() with a raw string is required for the token test: Column.contains()
# is a literal substring match (the regular expression never matched), and the
# non-raw '\s' was an invalid Python escape sequence.
# NOTE(review): this admits rows the literal match never selected -- spot-check
# the distinct result names it brings in before rebuilding the table.
dataframe_alb = initial.where(
    _name_lc.contains("albumin") | _name_lc.rlike(r'(?:\s|^)alb(?:\s|$)')
)

# Rows without a unit are discarded for albumin.
dataframe_alb = dataframe_alb.where(col("unit").isNotNull())

# Drop ratio and mg/dl units, then result names that are not plain albumin.
for _token in ("ratio", "mg/dl"):
    dataframe_alb = dataframe_alb.where(~_unit_lc.contains(_token))
for _token in ("ratio", "urine", "stool", "ua", "adjusted"):
    dataframe_alb = dataframe_alb.where(~_name_lc.contains(_token))

# Exact-match exclusions on unit and on result name.
dataframe_alb = dataframe_alb.where(~col("unit").isin("%", "-"))
dataframe_alb = dataframe_alb.where(
    ~col("resultname").isin(
        "ALBUMIN/GLOBULIN.SER/PLAS.QN.ELP (BEAKER)",
        "PREALBUMIN.SER/PLAS.QN (MG/DL)(BEAKER)",
    )
)

# A unit is "standard" when it matches one of the accepted g/dL spellings.
_standard_unit = col("unit") == "grams per deciliter"
for _token in ("g/dl", "gm/dl", "grams/dl"):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)

dataframe_alb = dataframe_alb.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_alb = dataframe_alb.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "alb_cleaned").otherwise(col("resultname")),
)

alb_df = dataframe_alb.where(col("resultname_cleaned") == "alb_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = alb_df.groupBy("unit").count()
final_alb = alb_df.join(count, "unit").where(col("count") > 5)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_alb_1 = final_alb.withColumn("resultvalue_alb", F.mean("resultvalue").over(w))
final_alb_2 = final_alb_1.withColumn("Last_alb", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_alb", "Last_alb"]
saved_alb = final_alb_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous albumin summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_alb_feb;

In [0]:
# Persist the per-encounter albumin summary as rdp_phi_sandbox.sm_patt_alb_feb.
table_name = "rdp_phi_sandbox.sm_patt_alb_feb"
saved_alb.write.saveAsTable(name=table_name)

In [0]:
# Potassium: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_potassium = initial.where(_name_lc.contains("potassium"))

# Null units are relabelled "missing"; the unit rule below accepts them as standard.
dataframe_potassium = dataframe_potassium.na.fill("missing", ["unit"])

# Drop ratio/percent units and urine, stool or "ua" result names.
for _token in ("ratio", "%"):
    dataframe_potassium = dataframe_potassium.where(~_unit_lc.contains(_token))
for _token in ("urine", "stool", "ua"):
    dataframe_potassium = dataframe_potassium.where(~_name_lc.contains(_token))

# A unit is "standard" when it is missing or matches one of the accepted spellings.
# NOTE(review): "nmol/l" is accepted without rescaling -- presumably a typo for
# "mmol/l" in the source data; confirm.
_standard_unit = (col("unit") == "missing") | (col("unit") == "mmol per liter")
for _token in ("mm/l", "mmol/l", "mmoi/l", "meq/l", "nmol/l"):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)

dataframe_potassium = dataframe_potassium.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_potassium = dataframe_potassium.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "pot_cleaned").otherwise(col("resultname")),
)

potassium_df = dataframe_potassium.where(col("resultname_cleaned") == "pot_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = potassium_df.groupBy("unit").count()
final_potassium = potassium_df.join(count, "unit").where(col("count") > 5)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_potassium_1 = final_potassium.withColumn("resultvalue_potassium", F.mean("resultvalue").over(w))
final_potassium_2 = final_potassium_1.withColumn("Last_potassium", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_potassium", "Last_potassium"]
saved_potassium = final_potassium_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous potassium summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_potassium_feb;

In [0]:
# Persist the per-encounter potassium summary as rdp_phi_sandbox.sm_patt_potassium_feb.
table_name = "rdp_phi_sandbox.sm_patt_potassium_feb"
saved_potassium.write.saveAsTable(name=table_name)

In [0]:
# Creatinine: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_creatinine = initial.where(
    _name_lc.contains("creatinine") | _name_lc.contains('creat')
)

# Null units are relabelled "missing"; the unit rule below accepts them as standard.
dataframe_creatinine = dataframe_creatinine.na.fill("missing", ["unit"])

# Drop ratio/percent units, then ratio, percent, urine, stool, "ua", kinase and
# BUN result names.
for _token in ("ratio", "%"):
    dataframe_creatinine = dataframe_creatinine.where(~_unit_lc.contains(_token))
for _token in ("ratio", "%", "urine", "stool", "ua", "kinase", "bun"):
    dataframe_creatinine = dataframe_creatinine.where(~_name_lc.contains(_token))

# A unit is "standard" when it is missing or matches one of the accepted spellings.
_standard_unit = (col("unit") == "missing") | (col("unit") == "milligrams per deciliter")
for _token in ("mg/dl", "mg/l", "mg.l"):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)

dataframe_creatinine = dataframe_creatinine.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_creatinine = dataframe_creatinine.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "creat_cleaned").otherwise(col("resultname")),
)

creatinine_df = dataframe_creatinine.where(col("resultname_cleaned") == "creat_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = creatinine_df.groupBy("unit").count()
final_creatinine = creatinine_df.join(count, "unit").where(col("count") > 5)

# Values reported per litre ("mg/l" or "mg.l") are scaled by 0.1; all other
# values are left unchanged.
_per_litre = _unit_lc.contains("mg/l") | _unit_lc.contains("mg.l")
final_creatinine = final_creatinine.withColumn(
    "resultvalue",
    when(_per_litre, col("resultvalue") * 0.1).otherwise(col("resultvalue")),
)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_creatinine_1 = final_creatinine.withColumn("resultvalue_creatinine", F.mean("resultvalue").over(w))
final_creatinine_2 = final_creatinine_1.withColumn("Last_creatinine", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_creatinine", "Last_creatinine"]
saved_creatinine = final_creatinine_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous creatinine summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_creatinine_feb;

In [0]:
# Persist the per-encounter creatinine summary as rdp_phi_sandbox.sm_patt_creatinine_feb.
table_name = "rdp_phi_sandbox.sm_patt_creatinine_feb"
saved_creatinine.write.saveAsTable(name=table_name)

In [0]:
# Calcium: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

# Select names containing "calcium", or "ca" as a whitespace-delimited token.
# rlike() with a raw string is required for the token test: Column.contains()
# is a literal substring match (the regular expression never matched), and the
# non-raw '\s' was an invalid Python escape sequence.
# NOTE(review): this admits rows the literal match never selected -- spot-check
# the distinct result names it brings in before rebuilding the table.
dataframe_calcium = initial.where(
    _name_lc.contains("calcium") | _name_lc.rlike(r'(?:\s|^)ca(?:\s|$)')
)

# Rows without a unit are discarded for calcium.
dataframe_calcium = dataframe_calcium.where(col("unit").isNotNull())

# Drop ratio units, then derived or non-serum result names.
dataframe_calcium = dataframe_calcium.where(~_unit_lc.contains("ratio"))
for _token in ("adjusted", "corrected", "normalized", "urine", "stool", "ua", "ionized"):
    dataframe_calcium = dataframe_calcium.where(~_name_lc.contains(_token))
dataframe_calcium = dataframe_calcium.where(col("unit") != "%")

# A unit is "standard" when it matches one of the accepted spellings.
# NOTE(review): mmol-based and mg-based units are all accepted here and
# resultvalue is averaged below without any conversion -- confirm whether the
# values need to be brought onto one scale first.
_standard_unit = col("unit") == "milligrams per deciliter"
for _token in ("mmol/l", "mmol", "mg/dl", "nmol/l", "mg/l"):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)

dataframe_calcium = dataframe_calcium.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_calcium = dataframe_calcium.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "cal_cleaned").otherwise(col("resultname")),
)

calcium_df = dataframe_calcium.where(col("resultname_cleaned") == "cal_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = calcium_df.groupBy("unit").count()
final_calcium = calcium_df.join(count, "unit").where(col("count") > 5)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_calcium_1 = final_calcium.withColumn("resultvalue_calcium", F.mean("resultvalue").over(w))
final_calcium_2 = final_calcium_1.withColumn("Last_calcium", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_calcium", "Last_calcium"]
saved_calcium = final_calcium_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous calcium summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_calcium_feb;

In [0]:
# Persist the per-encounter calcium summary as rdp_phi_sandbox.sm_patt_calcium_feb.
table_name = "rdp_phi_sandbox.sm_patt_calcium_feb"
saved_calcium.write.saveAsTable(name=table_name)

In [0]:
# ALT (SGPT): per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_alt = initial.where(_name_lc.contains("alt (sgpt)"))

# Null units are relabelled "missing"; the unit rule below accepts them as standard.
dataframe_alt = dataframe_alt.na.fill("missing", ["unit"])

# Drop ratio units and ratio/urine result names.
dataframe_alt = dataframe_alt.where(
    ~_unit_lc.contains("ratio")
    & ~_name_lc.contains("ratio")
    & ~_name_lc.contains("urine")
)

# A unit is "standard" when it is missing or matches one of the accepted
# spellings. "[iU]/L" is tested case-sensitively and as a literal substring.
_standard_unit = (col("unit") == "missing") | col("unit").contains("[iU]/L")
for _token in ("iu/l", "u/l", "unit/l", "intunit/l", "intlunit/l", "units/l", "units per liter"):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)

dataframe_alt = dataframe_alt.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_alt = dataframe_alt.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "alt_cleaned").otherwise(col("resultname")),
)

alt_df = dataframe_alt.where(col("resultname_cleaned") == "alt_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = alt_df.groupBy("unit").count()
final_alt = alt_df.join(count, "unit").where(col("count") > 5)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_alt_1 = final_alt.withColumn("resultvalue_alt", F.mean("resultvalue").over(w))  # mean ALT
final_alt_2 = final_alt_1.withColumn("Last_alt", F.last("resultvalue").over(w))  # last ALT

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_alt", "Last_alt"]
saved_alt = final_alt_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous ALT summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_alt_feb;

In [0]:
# Persist the per-encounter ALT summary as rdp_phi_sandbox.sm_patt_alt_feb.
table_name = "rdp_phi_sandbox.sm_patt_alt_feb"
saved_alt.write.saveAsTable(name=table_name)

In [0]:
# AST (SGOT): per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_ast = initial.where(_name_lc.contains("ast (sgot)"))

# Null units are relabelled "missing"; the unit rule below accepts them as standard.
dataframe_ast = dataframe_ast.na.fill("missing", ["unit"])

# Drop ratio units and ratio/urine result names.
dataframe_ast = dataframe_ast.where(
    ~_unit_lc.contains("ratio")
    & ~_name_lc.contains("ratio")
    & ~_name_lc.contains("urine")
)

# A unit is "standard" when it is missing or matches one of the accepted
# spellings. "[iU]/L" is tested case-sensitively and as a literal substring.
_standard_unit = (col("unit") == "missing") | col("unit").contains("[iU]/L")
for _token in ("iu/l", "u/l", "unit/l", "intunit/l", "intlunit/l", "units/l", "units per liter"):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)

dataframe_ast = dataframe_ast.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_ast = dataframe_ast.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "ast_cleaned").otherwise(col("resultname")),
)

ast_df = dataframe_ast.where(col("resultname_cleaned") == "ast_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = ast_df.groupBy("unit").count()
final_ast = ast_df.join(count, "unit").where(col("count") > 5)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_ast_1 = final_ast.withColumn("resultvalue_ast", F.mean("resultvalue").over(w))
final_ast_2 = final_ast_1.withColumn("Last_ast", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_ast", "Last_ast"]
saved_ast = final_ast_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous AST summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_ast_feb;

In [0]:
# Persist the per-encounter AST summary as rdp_phi_sandbox.sm_patt_ast_feb.
table_name = "rdp_phi_sandbox.sm_patt_ast_feb"
saved_ast.write.saveAsTable(name=table_name)

In [0]:
# Absolute neutrophils: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_neuabs = initial.where(
    _name_lc.contains("neutrophils") | _name_lc.contains("pmns")
)

# Null units are relabelled "missing"; the unit rule below accepts them as standard.
dataframe_neuabs = dataframe_neuabs.na.fill("missing", ["unit"])

# Drop cell-count/percent units and cell, urine or percent result names.
# NOTE: the original author marked the "cell" exclusions as uncertain.
for _token in ("cell", "cel", "%"):
    dataframe_neuabs = dataframe_neuabs.where(~_unit_lc.contains(_token))
for _token in ("cell", "urine", "%"):
    dataframe_neuabs = dataframe_neuabs.where(~_name_lc.contains(_token))

# Case-sensitive exclusions on the raw result name and unit.
for _token in ("VACUOLATED", "HYPERSEGMENTED"):
    dataframe_neuabs = dataframe_neuabs.where(~col("resultname").contains(_token))
dataframe_neuabs = dataframe_neuabs.where(~col("unit").isin("nmol/min/mg", "%"))

# A row is "standard" when its unit is missing or matches an accepted spelling,
# or when its result name carries the unit or is one of the listed names.
_standard_unit = (col("unit") == "missing") | col("resultname").isin(
    "NEUTROPHILS BL",
    "CBC ABS NEUTROPHILS (REF)",
    "NEUTROPHILS ABSOLUTE.BLD.QN (REF)",
)
for _token in (
    "k/ul", "ul", "mcl", "mm3", "cumm", "cmm", "10*3/ul", "x10e3/ul",
    "thous/mcl", "thousand/mcl", "/ul",
):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)
for _token in ("(10*3/UL)", "(K/UL)", "EXTERNAL"):
    _standard_unit = _standard_unit | col("resultname").contains(_token)

dataframe_neuabs = dataframe_neuabs.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_neuabs = dataframe_neuabs.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "neuabs_cleaned").otherwise(col("resultname")),
)

neuabs_df = dataframe_neuabs.where(col("resultname_cleaned") == "neuabs_cleaned")

# Keep only units that occur more than 4 times among the accepted rows.
count = neuabs_df.groupBy("unit").count()
final_neuabs = neuabs_df.join(count, "unit").where(col("count") > 4)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_neuabs_1 = final_neuabs.withColumn("resultvalue_neuabs", F.mean("resultvalue").over(w))
final_neuabs_2 = final_neuabs_1.withColumn("Last_neuabs", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_neuabs", "Last_neuabs"]
saved_neuabst = final_neuabs_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous neutrophil summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_neuabs_feb;

In [0]:
# Persist the per-encounter neutrophil summary as rdp_phi_sandbox.sm_patt_neuabs_feb.
table_name = "rdp_phi_sandbox.sm_patt_neuabs_feb"
saved_neuabst.write.saveAsTable(name=table_name)

In [0]:
# Absolute eosinophils: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_eosabs = initial.where(
    _name_lc.contains("eosinophils") | _name_lc.contains("eosinophiles")
)

# Null units are relabelled "missing"; the unit rule below accepts them as standard.
dataframe_eosabs = dataframe_eosabs.na.fill("missing", ["unit"])

# Drop cell-count/percent units and cell, urine or percent result names.
# NOTE: the original author marked the "cell" exclusions as uncertain.
for _token in ("cell", "cel", "%"):
    dataframe_eosabs = dataframe_eosabs.where(~_unit_lc.contains(_token))
for _token in ("cell", "urine", "%"):
    dataframe_eosabs = dataframe_eosabs.where(~_name_lc.contains(_token))

# Case-sensitive exact exclusions on the raw unit.
dataframe_eosabs = dataframe_eosabs.where(~col("unit").isin("nmol/min/mg", "/L", "%"))

# A row is "standard" when its unit is missing or matches an accepted spelling,
# or when its result name carries the unit or is one of the listed names.
_standard_unit = (col("unit") == "missing") | col("resultname").isin(
    "EOSINOPHILS ABSOLUTE (REF)",
    "EOSINOPHILS.XXX.ORD (REF)",
    "CBC ABS EOSINOPHILS (REF)",
)
for _token in (
    "k/ul", "ul", "mcl", "mm3", "cumm", "cmm", "10*3/ul", "x10e3/ul",
    "thous/mcl", "thousand/mcl",
):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)
for _token in ("(10*3/UL)", "(K/UL)", "EXTERNAL"):
    _standard_unit = _standard_unit | col("resultname").contains(_token)

dataframe_eosabs = dataframe_eosabs.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_eosabs = dataframe_eosabs.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "eosabs_cleaned").otherwise(col("resultname")),
)

eosabs_df = dataframe_eosabs.where(col("resultname_cleaned") == "eosabs_cleaned")

# Keep only units that occur more than 4 times among the accepted rows.
count = eosabs_df.groupBy("unit").count()
final_eosabs = eosabs_df.join(count, "unit").where(col("count") > 4)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_eosabs_1 = final_eosabs.withColumn("resultvalue_eosabs", F.mean("resultvalue").over(w))
final_eosabs_2 = final_eosabs_1.withColumn("Last_eosabs", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_eosabs", "Last_eosabs"]
saved_eosabs = final_eosabs_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous eosinophil summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_eosabs_feb;

In [0]:
# Persist the per-encounter eosinophil summary as rdp_phi_sandbox.sm_patt_eosabs_feb.
table_name = "rdp_phi_sandbox.sm_patt_eosabs_feb"
saved_eosabs.write.saveAsTable(name=table_name)

In [0]:
# Platelets: per-encounter mean and last result value.
_name_lc = lower(col("resultname"))
_unit_lc = lower(col("unit"))

dataframe_plt = initial.where(
    _name_lc.contains("platelet") | _name_lc.contains("plt")
)

# Rows without a unit are discarded for platelets.
dataframe_plt = dataframe_plt.where(col("unit").isNotNull())

# Drop cell-count/percent units and cell, urine or percent result names.
# NOTE: the original author marked the "cell" exclusions as uncertain.
for _token in ("cell", "cel", "%"):
    dataframe_plt = dataframe_plt.where(~_unit_lc.contains(_token))
for _token in ("cell", "urine", "%"):
    dataframe_plt = dataframe_plt.where(~_name_lc.contains(_token))

# Case-sensitive exclusions on the raw result name and unit.
dataframe_plt = dataframe_plt.where(~col("resultname").contains("CITRATED"))
dataframe_plt = dataframe_plt.where(
    ~col("unit").isin("nmol/min/mg", "%", "PRU", "Base PRU", "ARU")
)

# A row is "standard" when its unit matches an accepted spelling, or when its
# result name carries the unit or is one of the listed names.
_standard_unit = col("resultname").isin(
    "HIGH TITER O PHERESIS PLATELETS",
    "PLT BASELINE",
    "(ZZZ)PLT",
    "PLT, POC",
    "LARGE PLATELETS (REF)",
)
for _token in (
    "k/ul", "ul", "mcl", "mm3", "cumm", "cmm", "10*3/ul",
    "thous/mcl", "thousand/mcl", "/ul",
):
    _standard_unit = _standard_unit | _unit_lc.contains(_token)
for _token in ("(10*3/UL)", "(K/UL)", "EXTERNAL"):
    _standard_unit = _standard_unit | col("resultname").contains(_token)

dataframe_plt = dataframe_plt.withColumn(
    "unit_cleaned", when(_standard_unit, "standard").otherwise(col("unit"))
)
dataframe_plt = dataframe_plt.withColumn(
    "resultname_cleaned",
    when(col("unit_cleaned").contains("standard"), "plt_cleaned").otherwise(col("resultname")),
)

plt_df = dataframe_plt.where(col("resultname_cleaned") == "plt_cleaned")

# Keep only units that occur more than 5 times among the accepted rows.
count = plt_df.groupBy("unit").count()
final_plt = plt_df.join(count, "unit").where(col("count") > 5)

# Whole-encounter window ordered by observation time; the frame covers the full
# partition, so mean and last are repeated on every row of an encounter.
w = (
    Window.partitionBy("pat_id", "instance", "pat_enc_csn_id")
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
final_plt_1 = final_plt.withColumn("resultvalue_PLT", F.mean("resultvalue").over(w))
final_plt_2 = final_plt_1.withColumn("Last_PLT", F.last("resultvalue").over(w))

# Reduce to one row per (pat_id, instance, pat_enc_csn_id).
columns = ["pat_id", "instance", "pat_enc_csn_id", "resultvalue_PLT", "Last_PLT"]
saved_plt = final_plt_2.select(*columns).dropDuplicates(
    ["pat_id", "instance", "pat_enc_csn_id"]
)

In [0]:
%sql
-- Remove any previous platelet summary table before it is rewritten.
-- IF EXISTS keeps this cell from failing when the table is not there yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_plt_feb;

In [0]:
# Persist the per-encounter platelet summary as rdp_phi_sandbox.sm_patt_plt_feb.
table_name = "rdp_phi_sandbox.sm_patt_plt_feb"
saved_plt.write.saveAsTable(name=table_name)

In [0]:
# Candidate BUN rows: resultname mentions "blood urea nitrogen", "bun" or "urea nitrogen"
# (case-insensitive substring match).
bun_name = F.lower(F.col("resultname"))
dataframe_bun = initial.where(
    bun_name.contains("blood urea nitrogen")
    | bun_name.contains("bun")
    | bun_name.contains("urea nitrogen")
)

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_bun = dataframe_bun.na.fill("missing", ["unit"])

# Drop rows whose unit or name marks them as something other than a plain BUN value.
for bun_unit_excluded in ("ratio", "(calc)"):
    dataframe_bun = dataframe_bun.where(~F.lower(F.col("unit")).contains(bun_unit_excluded))
for bun_name_excluded in ("ratio", "urine", "bun/creatinine"):
    dataframe_bun = dataframe_bun.where(~F.lower(F.col("resultname")).contains(bun_name_excluded))
# These two exclusions are case-sensitive, unlike the ones above.
for bun_exact_excluded in ("SUBUNIT", "BUN, POST DIALYSIS"):
    dataframe_bun = dataframe_bun.where(~F.col("resultname").contains(bun_exact_excluded))

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
bun_unit = F.col("unit")
bun_is_standard = (
    (bun_unit == "missing")
    | F.lower(bun_unit).contains("mg/dl")
    | (bun_unit == "milligrams per deciliter")
)
dataframe_bun = dataframe_bun.withColumn(
    "unit_cleaned", F.when(bun_is_standard, "standard").otherwise(bun_unit)
)

# Standard-unit rows get the common name "bun_cleaned"; only those rows are kept.
dataframe_bun = dataframe_bun.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "bun_cleaned").otherwise(F.col("resultname")),
)
bun_df = dataframe_bun.where(F.col("resultname_cleaned") == "bun_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = bun_df.groupBy("unit").count()
final_bun = bun_df.join(count, on="unit").where(F.col("count") > 5)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
bun_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*bun_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_bun_1 = final_bun.withColumn("resultvalue_BUN", F.mean("resultvalue").over(w))
final_bun_2 = final_bun_1.withColumn("Last_BUN", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = bun_keys + ["resultvalue_BUN", "Last_BUN"]
saved_bun = final_bun_2[columns].dropDuplicates(bun_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_bun_feb;

In [0]:
# Write saved_bun out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_bun_feb'
saved_bun.write.saveAsTable(name=table_name)

In [0]:
# Candidate chloride rows: resultname contains "chloride", or has "ci" as a
# standalone whitespace-delimited token (both case-insensitive).
# Column.contains() is a literal substring test, so the token pattern must go
# through rlike(); with contains() that clause could never match.
# NOTE(review): chloride is normally abbreviated "Cl"; the token is kept as "ci"
# as originally written. Confirm which spelling the source data uses, and check
# which extra rows the now-active pattern brings in.
ci_name = F.lower(F.col("resultname"))
dataframe_CI = initial.where(
    ci_name.contains("chloride") | ci_name.rlike(r"(?:\s|^)ci(?:\s|$)")
)

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_CI = dataframe_CI.na.fill("missing", ["unit"])

# Drop ratio and urine results.
dataframe_CI = dataframe_CI.where(~F.lower(F.col("unit")).contains("ratio"))
for ci_name_excluded in ("ratio", "urine"):
    dataframe_CI = dataframe_CI.where(~F.lower(F.col("resultname")).contains(ci_name_excluded))

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
# "mm\\l" is the literal text mm\l (backslash); it is written with an escaped
# backslash so Python does not see the invalid escape sequence "\l".
# NOTE(review): "nmol/l" is accepted here without any rescaling - confirm it is
# a data-entry variant of mmol/l rather than a genuinely different unit.
ci_unit = F.col("unit")
ci_unit_lower = F.lower(ci_unit)
ci_is_standard = ci_unit == "missing"
for ci_unit_token in ("mmol/l", "mmoi/l", "mm\\l", "meq/l", "nmol/l", "mm/l"):
    ci_is_standard = ci_is_standard | ci_unit_lower.contains(ci_unit_token)
ci_is_standard = ci_is_standard | (ci_unit == "mmol per liter")
dataframe_CI = dataframe_CI.withColumn(
    "unit_cleaned", F.when(ci_is_standard, "standard").otherwise(ci_unit)
)

# Standard-unit rows get the common name "CI_cleaned"; only those rows are kept.
dataframe_CI = dataframe_CI.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "CI_cleaned").otherwise(F.col("resultname")),
)
CI_df = dataframe_CI.where(F.col("resultname_cleaned") == "CI_cleaned")

# Keep only raw unit strings that occur more than once.
count = CI_df.groupBy("unit").count()
final_CI = CI_df.join(count, on="unit").where(F.col("count") > 1)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
ci_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*ci_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_CI_1 = final_CI.withColumn("resultvalue_CI", F.mean("resultvalue").over(w))
final_CI_2 = final_CI_1.withColumn("Last_CI", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = ci_keys + ["resultvalue_CI", "Last_CI"]
saved_CI = final_CI_2[columns].dropDuplicates(ci_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_clt_feb;

In [0]:
# Write saved_CI out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_clt_feb'
saved_CI.write.saveAsTable(name=table_name)

In [0]:
# Candidate bicarbonate rows: resultname mentions "bicarbonate", "hco3", "co2"
# or "carbon dioxide" (case-insensitive substring match).
hco3_name = F.lower(F.col("resultname"))
dataframe_HCO3 = initial.where(
    hco3_name.contains("bicarbonate")
    | hco3_name.contains("hco3")
    | hco3_name.contains("co2")
    | hco3_name.contains("carbon dioxide")
)

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_HCO3 = dataframe_HCO3.na.fill("missing", ["unit"])

# Drop ratio, urine, venous, pCO2 and end-tidal results.
dataframe_HCO3 = dataframe_HCO3.where(~F.lower(F.col("unit")).contains("ratio"))
for hco3_name_excluded in ("ratio", "urine", "venous", "ven", "pco2", "end tidal"):
    dataframe_HCO3 = dataframe_HCO3.where(
        ~F.lower(F.col("resultname")).contains(hco3_name_excluded)
    )

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
# "mm\\l" is the literal text mm\l (backslash); it is written with an escaped
# backslash so Python does not see the invalid escape sequence "\l".
# NOTE(review): "nmol/l" is accepted here without any rescaling - confirm it is
# a data-entry variant of mmol/l rather than a genuinely different unit.
hco3_unit = F.col("unit")
hco3_unit_lower = F.lower(hco3_unit)
hco3_is_standard = hco3_unit == "missing"
for hco3_unit_token in ("mmol/l", "mmoi/l", "mm\\l", "meq/l", "nmol/l"):
    hco3_is_standard = hco3_is_standard | hco3_unit_lower.contains(hco3_unit_token)
hco3_is_standard = hco3_is_standard | (hco3_unit == "mmol per liter")
dataframe_HCO3 = dataframe_HCO3.withColumn(
    "unit_cleaned", F.when(hco3_is_standard, "standard").otherwise(hco3_unit)
)

# Standard-unit rows get the common name "HCO3_cleaned"; only those rows are kept.
dataframe_HCO3 = dataframe_HCO3.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "HCO3_cleaned").otherwise(F.col("resultname")),
)
HCO3_df = dataframe_HCO3.where(F.col("resultname_cleaned") == "HCO3_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = HCO3_df.groupBy("unit").count()
final_HCO3 = HCO3_df.join(count, on="unit").where(F.col("count") > 5)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
hco3_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*hco3_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_HCO3_1 = final_HCO3.withColumn("resultvalue_HCO3", F.mean("resultvalue").over(w))
final_HCO3_2 = final_HCO3_1.withColumn("Last_HCO3", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = hco3_keys + ["resultvalue_HCO3", "Last_HCO3"]
saved_HCO3 = final_HCO3_2[columns].dropDuplicates(hco3_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_HCO3_feb;

In [0]:
# Write saved_HCO3 out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_HCO3_feb'
saved_HCO3.write.saveAsTable(name=table_name)

In [0]:
# Candidate RBC rows: resultname mentions "rbc" or "red blood cell"
# (case-insensitive substring match).
rbc_name = F.lower(F.col("resultname"))
dataframe_rbc = initial.where(rbc_name.contains("rbc") | rbc_name.contains("red blood cell"))

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_rbc = dataframe_rbc.na.fill("missing", ["unit"])

# Drop percentage results and units that mention "cell".
dataframe_rbc = dataframe_rbc.where(F.col("unit") != "%")
dataframe_rbc = dataframe_rbc.where(~F.lower(F.col("unit")).contains("cell"))

# Drop names that indicate a different specimen or a non-count result.
# NOTE(review): the "cell" exclusion also removes every name matched through
# "red blood cell" above, and "ua" matches any name containing those two
# letters - confirm both are intended.
for rbc_name_excluded in (
    "cell", "urine", "ua", "fluid", "stool", "%",
    "nrbc", "comment", "fragments", "error",
):
    dataframe_rbc = dataframe_rbc.where(~F.lower(F.col("resultname")).contains(rbc_name_excluded))
# These exclusions are case-sensitive, unlike the ones above.
for rbc_exact_excluded in ("AGGLUTINATION", "RBC INCLUSION BODIES", "MORPHOLOGY"):
    dataframe_rbc = dataframe_rbc.where(~F.col("resultname").contains(rbc_exact_excluded))

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
rbc_unit = F.col("unit")
rbc_unit_lower = F.lower(rbc_unit)
rbc_is_standard = rbc_unit_lower.contains("missing") | (rbc_unit == "ul")
for rbc_unit_token in (
    "10e6/ul", "10^12/l", "million/ul", "m/ul",
    "mil/mm3", "ul", "m/mm^3", "m/cmm",
):
    rbc_is_standard = rbc_is_standard | rbc_unit_lower.contains(rbc_unit_token)
dataframe_rbc = dataframe_rbc.withColumn(
    "unit_cleaned", F.when(rbc_is_standard, "standard").otherwise(rbc_unit)
)

# Standard-unit rows get the common name "rbc_cleaned"; only those rows are kept.
dataframe_rbc = dataframe_rbc.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "rbc_cleaned").otherwise(F.col("resultname")),
)
RBC_df = dataframe_rbc.where(F.col("resultname_cleaned") == "rbc_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = RBC_df.groupBy("unit").count()
final_RBC = RBC_df.join(count, on="unit").where(F.col("count") > 5)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
rbc_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*rbc_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_RBC_1 = final_RBC.withColumn("resultvalue_RBC", F.mean("resultvalue").over(w))
final_RBC_2 = final_RBC_1.withColumn("Last_RBC", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = rbc_keys + ["resultvalue_RBC", "Last_RBC"]
saved_RBC = final_RBC_2[columns].dropDuplicates(rbc_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_RBC_feb;

In [0]:
# Write saved_RBC out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_RBC_feb'
saved_RBC.write.saveAsTable(name=table_name)

In [0]:
# Candidate basophil rows: resultname contains "basophil" (case-insensitive).
dataframe_baso = initial.where(F.lower(F.col("resultname")).contains("basophil"))

# Rows without a unit are discarded, as are percentage results and units mentioning "cell".
dataframe_baso = dataframe_baso.where(F.col("unit").isNotNull())
dataframe_baso = dataframe_baso.where(F.col("unit") != "%")
dataframe_baso = dataframe_baso.where(~F.lower(F.col("unit")).contains("cell"))

# Drop names that indicate a different specimen or a percentage.
# NOTE(review): "ua" matches any name containing those two letters - confirm intended.
for baso_name_excluded in ("cell", "urine", "ua", "fluid", "stool", "%"):
    dataframe_baso = dataframe_baso.where(
        ~F.lower(F.col("resultname")).contains(baso_name_excluded)
    )

# A row is "standard" when its unit matches one of the accepted spellings, or
# when the resultname itself carries the unit in parentheses (case-sensitive).
baso_unit = F.col("unit")
baso_unit_lower = F.lower(baso_unit)
baso_is_standard = baso_unit == "ul"
for baso_unit_token in (
    "k/ul", "10^9/l", "mcl", "mm3", "cumm", "cmm", "10*3/ul", "10e3/ul",
):
    baso_is_standard = baso_is_standard | baso_unit_lower.contains(baso_unit_token)
for baso_name_token in ("(10*3/UL)", "(K/UL)"):
    baso_is_standard = baso_is_standard | F.col("resultname").contains(baso_name_token)
dataframe_baso = dataframe_baso.withColumn(
    "unit_cleaned", F.when(baso_is_standard, "standard").otherwise(baso_unit)
)

# Standard-unit rows get the common name "baso_cleaned"; only those rows are kept.
dataframe_baso = dataframe_baso.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "baso_cleaned").otherwise(F.col("resultname")),
)
Baso_df = dataframe_baso.where(F.col("resultname_cleaned") == "baso_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = Baso_df.groupBy("unit").count()
final_Baso = Baso_df.join(count, on="unit").where(F.col("count") > 5)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
baso_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*baso_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Baso_1 = final_Baso.withColumn("resultvalue_Baso", F.mean("resultvalue").over(w))
final_Baso_2 = final_Baso_1.withColumn("Last_Baso", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = baso_keys + ["resultvalue_Baso", "Last_Baso"]
saved_Baso = final_Baso_2[columns].dropDuplicates(baso_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Basophil_feb;

In [0]:
# Write saved_Baso out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Basophil_feb'
saved_Baso.write.saveAsTable(name=table_name)

In [0]:
# Candidate monocyte rows: resultname contains "monocyte" (case-insensitive).
dataframe_mono = initial.where(F.lower(F.col("resultname")).contains("monocyte"))

# Rows without a unit are discarded, as are percentage results and units mentioning "cell".
dataframe_mono = dataframe_mono.where(F.col("unit").isNotNull())
dataframe_mono = dataframe_mono.where(F.col("unit") != "%")
dataframe_mono = dataframe_mono.where(~F.lower(F.col("unit")).contains("cell"))

# Drop names that indicate a different specimen or a percentage.
# NOTE(review): "ua" matches any name containing those two letters - confirm intended.
for mono_name_excluded in ("cell", "urine", "ua", "fluid", "stool", "%"):
    dataframe_mono = dataframe_mono.where(
        ~F.lower(F.col("resultname")).contains(mono_name_excluded)
    )

# A row is "standard" when its unit matches one of the accepted spellings, or
# when the resultname itself carries the unit in parentheses (case-sensitive).
mono_unit = F.col("unit")
mono_unit_lower = F.lower(mono_unit)
mono_is_standard = mono_unit == "ul"
for mono_unit_token in (
    "k/ul", "10^9/l", "mcl", "mm3", "cumm", "cmm", "10*3/ul", "10e3/ul",
):
    mono_is_standard = mono_is_standard | mono_unit_lower.contains(mono_unit_token)
for mono_name_token in ("(10*3/UL)", "(K/UL)"):
    mono_is_standard = mono_is_standard | F.col("resultname").contains(mono_name_token)
dataframe_mono = dataframe_mono.withColumn(
    "unit_cleaned", F.when(mono_is_standard, "standard").otherwise(mono_unit)
)

# Standard-unit rows get the common name "mono_cleaned"; only those rows are kept.
dataframe_mono = dataframe_mono.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "mono_cleaned").otherwise(F.col("resultname")),
)
Mono_df = dataframe_mono.where(F.col("resultname_cleaned") == "mono_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = Mono_df.groupBy("unit").count()
final_Mono = Mono_df.join(count, on="unit").where(F.col("count") > 5)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
mono_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*mono_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Mono_1 = final_Mono.withColumn("resultvalue_Mono", F.mean("resultvalue").over(w))
final_Mono_2 = final_Mono_1.withColumn("Last_Mono", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = mono_keys + ["resultvalue_Mono", "Last_Mono"]
saved_Mono = final_Mono_2[columns].dropDuplicates(mono_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Monocyte_feb;

In [0]:
# Write saved_Mono out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Monocyte_feb'
saved_Mono.write.saveAsTable(name=table_name)

In [0]:
# Candidate rows: resultname contains "alkaline" (case-insensitive).
dataframe_alkaline = initial.where(F.lower(F.col("resultname")).contains("alkaline"))

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_alkaline = dataframe_alkaline.na.fill("missing", ["unit"])

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
alkaline_unit_lower = F.lower(F.col("unit"))
alkaline_is_standard = (
    alkaline_unit_lower.contains("missing")
    | alkaline_unit_lower.contains("iu/l")
    | alkaline_unit_lower.contains("u/l")
)
dataframe_alkaline = dataframe_alkaline.withColumn(
    "unit_cleaned", F.when(alkaline_is_standard, "standard").otherwise(F.col("unit"))
)

# Standard-unit rows get the common name "alkaline_cleaned"; only those rows are kept.
dataframe_alkaline = dataframe_alkaline.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "alkaline_cleaned").otherwise(F.col("resultname")),
)
Alkaline_df = dataframe_alkaline.where(F.col("resultname_cleaned") == "alkaline_cleaned")

# Per-unit row counts are attached, but no minimum-count filter is applied in this cell.
count = Alkaline_df.groupBy("unit").count()
final_Alkaline = Alkaline_df.join(count, on="unit")

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
alkaline_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*alkaline_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Alkaline_1 = final_Alkaline.withColumn("resultvalue_Alkaline", F.mean("resultvalue").over(w))
final_Alkaline_2 = final_Alkaline_1.withColumn("Last_Alkaline", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = alkaline_keys + ["resultvalue_Alkaline", "Last_Alkaline"]
saved_Alkaline = final_Alkaline_2[columns].dropDuplicates(alkaline_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Alk_feb;

In [0]:
# Write saved_Alkaline out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Alk_feb'
saved_Alkaline.write.saveAsTable(name=table_name)

In [0]:
# Candidate rows: resultname contains "anion gap" (case-insensitive).
dataframe_anion = initial.where(F.lower(F.col("resultname")).contains("anion gap"))

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_anion = dataframe_anion.na.fill("missing", ["unit"])

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
anion_unit_lower = F.lower(F.col("unit"))
anion_is_standard = (
    anion_unit_lower.contains("missing")
    | anion_unit_lower.contains("mmol/l")
    | anion_unit_lower.contains("meq/l")
)
dataframe_anion = dataframe_anion.withColumn(
    "unit_cleaned", F.when(anion_is_standard, "standard").otherwise(F.col("unit"))
)

# Standard-unit rows get the common name "anion_cleaned"; only those rows are kept.
dataframe_anion = dataframe_anion.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "anion_cleaned").otherwise(F.col("resultname")),
)
Anion_df = dataframe_anion.where(F.col("resultname_cleaned") == "anion_cleaned")

# Per-unit row counts are attached, but no minimum-count filter is applied in this cell.
count = Anion_df.groupBy("unit").count()
final_Anion = Anion_df.join(count, on="unit")

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
anion_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*anion_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Anion_1 = final_Anion.withColumn("resultvalue_Anion", F.mean("resultvalue").over(w))
final_Anion_2 = final_Anion_1.withColumn("Last_Anion", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = anion_keys + ["resultvalue_Anion", "Last_Anion"]
saved_Anion = final_Anion_2[columns].dropDuplicates(anion_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Anion_feb;

In [0]:
# Write saved_Anion out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Anion_feb'
saved_Anion.write.saveAsTable(name=table_name)

In [0]:
# Candidate rows: resultname contains both "total" and "bilirubin" (case-insensitive).
bilirubin_name = F.lower(F.col("resultname"))
dataframe_bilirubin = initial.where(
    bilirubin_name.contains("total") & bilirubin_name.contains("bilirubin")
)

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_bilirubin = dataframe_bilirubin.na.fill("missing", ["unit"])

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
bilirubin_unit_lower = F.lower(F.col("unit"))
bilirubin_is_standard = (
    bilirubin_unit_lower.contains("missing") | bilirubin_unit_lower.contains("mg/dl")
)
dataframe_bilirubin = dataframe_bilirubin.withColumn(
    "unit_cleaned", F.when(bilirubin_is_standard, "standard").otherwise(F.col("unit"))
)

# Standard-unit rows get the common name "bilirubin_cleaned"; only those rows are kept.
dataframe_bilirubin = dataframe_bilirubin.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "bilirubin_cleaned").otherwise(F.col("resultname")),
)
Bilirubin_df = dataframe_bilirubin.where(F.col("resultname_cleaned") == "bilirubin_cleaned")

# Per-unit row counts are attached, but no minimum-count filter is applied in this cell.
count = Bilirubin_df.groupBy("unit").count()
final_Bilirubin = Bilirubin_df.join(count, on="unit")

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
bilirubin_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*bilirubin_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Bilirubin_1 = final_Bilirubin.withColumn("resultvalue_Bilirubin", F.mean("resultvalue").over(w))
final_Bilirubin_2 = final_Bilirubin_1.withColumn("Last_Bilirubin", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = bilirubin_keys + ["resultvalue_Bilirubin", "Last_Bilirubin"]
saved_Bilirubin = final_Bilirubin_2[columns].dropDuplicates(bilirubin_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Bilirubin_feb;

In [0]:
# Write saved_Bilirubin out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Bilirubin_feb'
saved_Bilirubin.write.saveAsTable(name=table_name)

In [0]:
# Candidate rows: resultname contains "globulin" (case-insensitive).
dataframe_globulin = initial.where(F.lower(F.col("resultname")).contains("globulin"))

# Null units become the placeholder "missing" so they can be treated as standard below.
# (No separate null filter is needed afterwards: the fill leaves no null units.)
dataframe_globulin = dataframe_globulin.na.fill("missing", ["unit"])

# Drop ratio / calculated units and names that point at a different analyte or specimen.
for globulin_unit_excluded in ("ratio", "calc"):
    dataframe_globulin = dataframe_globulin.where(
        ~F.lower(F.col("unit")).contains(globulin_unit_excluded)
    )
# NOTE(review): "ua" matches any name containing those two letters - confirm intended.
for globulin_name_excluded in ("albumin", "urine", "ua", "stool", "%"):
    dataframe_globulin = dataframe_globulin.where(
        ~F.lower(F.col("resultname")).contains(globulin_name_excluded)
    )
# These exclusions are case-sensitive, unlike the ones above.
for globulin_exact_excluded in ("BETA", "GAMMA", "ALPHA ", "IMMUNOGLOBULIN", "ALPHA-"):
    dataframe_globulin = dataframe_globulin.where(
        ~F.col("resultname").contains(globulin_exact_excluded)
    )

# Units that are kept (some need rescaling further down) are collapsed to
# "standard"; anything else keeps its raw unit.
globulin_unit = F.col("unit")
globulin_unit_lower = F.lower(globulin_unit)
globulin_is_standard = globulin_unit_lower.contains("missing")
for globulin_unit_token in ("g/dl", "gm/dl", "grams/dl", "mg/dl", "mg/l"):
    globulin_is_standard = globulin_is_standard | globulin_unit_lower.contains(globulin_unit_token)
globulin_is_standard = globulin_is_standard | (globulin_unit == "grams per deciliter")
dataframe_globulin = dataframe_globulin.withColumn(
    "unit_cleaned", F.when(globulin_is_standard, "standard").otherwise(globulin_unit)
)

# Standard-unit rows get the common name "globulin_cleaned"; only those rows are kept.
dataframe_globulin = dataframe_globulin.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "globulin_cleaned").otherwise(F.col("resultname")),
)
Globulin_df = dataframe_globulin.where(F.col("resultname_cleaned") == "globulin_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = Globulin_df.groupBy("unit").count()
final_Globulin = Globulin_df.join(count, on="unit").where(F.col("count") > 5)

# Bring every kept row onto the g/dl scale before aggregating.
# mg/dl -> g/dl is a factor of 1/1000; mg/l -> g/dl is 1/10000
# (1 mg/l = 0.001 g/l = 0.0001 g/dl). Previously mg/l rows were accepted as
# "standard" but left unscaled, so they were averaged with g/dl values.
final_Globulin = final_Globulin.withColumn(
    "resultvalue",
    F.when(F.lower(F.col("unit")).contains("mg/dl"), F.col("resultvalue") / 1000)
    .when(F.lower(F.col("unit")).contains("mg/l"), F.col("resultvalue") / 10000)
    .otherwise(F.col("resultvalue")),
)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
globulin_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*globulin_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Globulin_1 = final_Globulin.withColumn("resultvalue_Globulin", F.mean("resultvalue").over(w))
final_Globulin_2 = final_Globulin_1.withColumn("Last_Globulin", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = globulin_keys + ["resultvalue_Globulin", "Last_Globulin"]
saved_Globulin = final_Globulin_2[columns].dropDuplicates(globulin_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Globulin_feb;

In [0]:
# Write saved_Globulin out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Globulin_feb'
saved_Globulin.write.saveAsTable(name=table_name)

In [0]:
# Candidate rows: resultname contains both "protein" and "total" (case-insensitive).
protein_name = F.lower(F.col("resultname"))
dataframe_protein = initial.where(
    protein_name.contains("protein") & protein_name.contains("total")
)

# Rows without a unit are discarded, as are percentage and ratio units.
dataframe_protein = dataframe_protein.where(F.col("unit").isNotNull())
for protein_unit_excluded in ("%", "ratio"):
    dataframe_protein = dataframe_protein.where(
        ~F.lower(F.col("unit")).contains(protein_unit_excluded)
    )
# Drop names that indicate a different specimen or a percentage.
# NOTE(review): "ua" matches any name containing those two letters - confirm intended.
for protein_name_excluded in ("urine", "ua", "stool", "%"):
    dataframe_protein = dataframe_protein.where(
        ~F.lower(F.col("resultname")).contains(protein_name_excluded)
    )

# Units that are kept (some need rescaling further down) are collapsed to
# "standard"; anything else keeps its raw unit.
protein_unit_lower = F.lower(F.col("unit"))
protein_is_standard = protein_unit_lower.contains("g/dl")
for protein_unit_token in ("gm/dl", "grams/dl", "mg/dl", "mg/l"):
    protein_is_standard = protein_is_standard | protein_unit_lower.contains(protein_unit_token)
dataframe_protein = dataframe_protein.withColumn(
    "unit_cleaned", F.when(protein_is_standard, "standard").otherwise(F.col("unit"))
)

# Standard-unit rows get the common name "protein_cleaned"; only those rows are kept.
dataframe_protein = dataframe_protein.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "protein_cleaned").otherwise(F.col("resultname")),
)
Protein_df = dataframe_protein.where(F.col("resultname_cleaned") == "protein_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = Protein_df.groupBy("unit").count()
final_Protein = Protein_df.join(count, on="unit").where(F.col("count") > 5)

# Bring every kept row onto the g/dl scale before aggregating.
# mg/dl -> g/dl is a factor of 1/1000; mg/l -> g/dl is 1/10000
# (1 mg/l = 0.001 g/l = 0.0001 g/dl). Previously mg/l rows were accepted as
# "standard" but left unscaled, so they were averaged with g/dl values.
final_Protein = final_Protein.withColumn(
    "resultvalue",
    F.when(F.lower(F.col("unit")).contains("mg/dl"), F.col("resultvalue") / 1000)
    .when(F.lower(F.col("unit")).contains("mg/l"), F.col("resultvalue") / 10000)
    .otherwise(F.col("resultvalue")),
)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
protein_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*protein_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Protein_1 = final_Protein.withColumn("resultvalue_Protein", F.mean("resultvalue").over(w))
final_Protein_2 = final_Protein_1.withColumn("Last_Protein", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = protein_keys + ["resultvalue_Protein", "Last_Protein"]
saved_Protein = final_Protein_2[columns].dropDuplicates(protein_keys)

In [0]:
%sql
-- IF EXISTS keeps this cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Protein_feb;

In [0]:
# Write saved_Protein out under the table name below.
table_name = 'rdp_phi_sandbox.sm_patt_Protein_feb'
saved_Protein.write.saveAsTable(name=table_name)

In [0]:
# Candidate rows: resultname contains "glucose" (case-insensitive).
dataframe_glucose = initial.where(F.lower(F.col("resultname")).contains("glucose"))

# Null units become the placeholder "missing" so they can be treated as standard below.
dataframe_glucose = dataframe_glucose.na.fill("missing", ["unit"])

# Drop ratio, percentage and calculated units.
for glucose_unit_excluded in ("ratio", "%", "calc", "cal"):
    dataframe_glucose = dataframe_glucose.where(
        ~F.lower(F.col("unit")).contains(glucose_unit_excluded)
    )
# Drop names that indicate a different specimen, a timed or fasting test, or a non-mg/dl result.
# NOTE(review): "ua" and "hr" match any name containing those letters - confirm intended.
for glucose_name_excluded in (
    "ratio", "%", "ua", "stool", "urine", "mmol", "hour", "hr", "fasting",
):
    dataframe_glucose = dataframe_glucose.where(
        ~F.lower(F.col("resultname")).contains(glucose_name_excluded)
    )
# These exclusions are case-sensitive, unlike the ones above.
for glucose_exact_excluded in ("GESTATIONAL GLUCOSE", "ESTIMATED AVERAGE"):
    dataframe_glucose = dataframe_glucose.where(
        ~F.col("resultname").contains(glucose_exact_excluded)
    )

# Units accepted as-is are collapsed to "standard"; anything else keeps its raw unit.
glucose_unit = F.col("unit")
glucose_unit_lower = F.lower(glucose_unit)
glucose_is_standard = (
    glucose_unit_lower.contains("missing")
    | glucose_unit_lower.contains("mg/dl")
    | (glucose_unit == "milligrams per deciliter")
)
dataframe_glucose = dataframe_glucose.withColumn(
    "unit_cleaned", F.when(glucose_is_standard, "standard").otherwise(glucose_unit)
)

# Standard-unit rows get the common name "glu_cleaned"; only those rows are kept.
dataframe_glucose = dataframe_glucose.withColumn(
    "resultname_cleaned",
    F.when(F.col("unit_cleaned").contains("standard"), "glu_cleaned").otherwise(F.col("resultname")),
)
glucose_df = dataframe_glucose.where(F.col("resultname_cleaned") == "glu_cleaned")

# Keep only raw unit strings that occur more than 5 times.
count = glucose_df.groupBy("unit").count()
final_Glucose = glucose_df.join(count, on="unit").where(F.col("count") > 5)

# Window spanning every row of one (pat_id, instance, pat_enc_csn_id) group,
# ordered by observationdatetime.
glucose_keys = ["pat_id", "instance", "pat_enc_csn_id"]
w = (
    Window.partitionBy(*glucose_keys)
    .orderBy("observationdatetime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# Group mean, and the value from the last row in observationdatetime order.
final_Glucose_1 = final_Glucose.withColumn("resultvalue_Glucose", F.mean("resultvalue").over(w))
final_Glucose_2 = final_Glucose_1.withColumn("Last_Glucose", F.last("resultvalue").over(w))

# Reduce to one summary row per group.
columns = glucose_keys + ["resultvalue_Glucose", "Last_Glucose"]
saved_Glucose = final_Glucose_2[columns].dropDuplicates(glucose_keys)

In [0]:
%sql
-- Remove any previous copy of the glucose feature table before it is rewritten.
-- IF EXISTS keeps the cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_Glucose_feb;

In [0]:
# Persist the per-encounter glucose summary as a table in the sandbox schema.
# No save mode is set, so the write relies on the table not existing yet
# (presumably why it is dropped in the preceding cell).
table_name = 'rdp_phi_sandbox.sm_patt_Glucose_feb'
glucose_writer = saved_Glucose.write
glucose_writer.saveAsTable(table_name)

In [0]:
# Assemble one wide row per encounter by left-joining every per-lab feature
# table onto the encounter keys of the base table. The column and table lists
# below are the single source for the SELECT list, the joins and the null
# filter, so they cannot drift apart.

# NOTE(review): the lab tables are derived from
# rdp_phi_sandbox.sm_initial_delta_table_feb, while this join starts from
# sm_initial_delta_table (no _feb suffix) -- confirm this is intended.
base_table = 'rdp_phi_sandbox.sm_initial_delta_table'
encounter_keys = ['pat_id', 'instance', 'pat_enc_csn_id']

# Per-lab feature tables, in join order.
lab_tables = [
    'sm_patt_wbc_feb', 'sm_patt_lymphs_feb', 'sm_patt_hct_feb', 'sm_patt_hgb_feb',
    'sm_patt_sodium_feb', 'sm_patt_alb_feb', 'sm_patt_potassium_feb',
    'sm_patt_creatinine_feb', 'sm_patt_alt_feb', 'sm_patt_ast_feb',
    'sm_patt_neuabs_feb', 'sm_patt_eosabs_feb', 'sm_patt_plt_feb', 'sm_patt_bun_feb',
    'sm_patt_clt_feb', 'sm_patt_HCO3_feb', 'sm_patt_calcium_feb', 'sm_patt_Glucose_feb',
    'sm_patt_RBC_feb', 'sm_patt_Basophil_feb', 'sm_patt_Monocyte_feb', 'sm_patt_Alk_feb',
    'sm_patt_Anion_feb', 'sm_patt_Bilirubin_feb', 'sm_patt_Globulin_feb',
    'sm_patt_Protein_feb', 'sm_patt_ddimer_feb', 'sm_patt_be_feb', 'sm_patt_ldh_feb',
    'sm_patt_pct_feb', 'sm_patt_mg_feb', 'sm_patt_inr_feb', 'sm_patt_fr_feb',
    'sm_patt_buncreat_feb', 'sm_patt_Proth_feb', 'sm_patt_egfr_feb', 'sm_patt_crp_feb',
]

# Encounter-mean columns, in output order.
mean_columns = ['resultvalue_' + lab for lab in [
    'HCO3', 'CI', 'BUN', 'PLT', 'eosabs', 'neuabs', 'ast', 'alt', 'creatinine',
    'potassium', 'alb', 'sodium', 'HGB', 'HCT', 'lymph', 'WBC', 'calcium', 'RBC',
    'Baso', 'Mono', 'Alkaline', 'Anion', 'Bilirubin', 'Globulin', 'Protein',
    'Glucose', 'ddimer', 'be', 'ldh', 'pct', 'mg', 'inr', 'fr', 'buncreat',
    'Proth', 'egfr', 'crp',
]]

# Last-value columns, in output order (this order differs from mean_columns).
last_columns = ['Last_' + lab for lab in [
    'Glucose', 'Protein', 'Globulin', 'Bilirubin', 'Anion', 'Alkaline', 'Mono',
    'Baso', 'RBC', 'HCO3', 'CI', 'BUN', 'PLT', 'eosabs', 'neuabs', 'ast', 'alt',
    'calcium', 'creatinine', 'potassium', 'alb', 'sodium', 'HGB', 'HCT', 'lymph',
    'wbc', 'ddimer', 'be', 'ldh', 'pct', 'mg', 'inr', 'fr', 'buncreat', 'Proth',
    'egfr', 'crp',
]]

feature_columns = mean_columns + last_columns

select_list = ",\n".join(["initial." + key for key in encounter_keys] + feature_columns)

# One LEFT JOIN per lab table, matched on all three encounter keys.
join_clauses = []
for lab_table in lab_tables:
    on_clause = " AND ".join(
        "{0}.{1} = initial.{1}".format(lab_table, key) for key in encounter_keys
    )
    join_clauses.append(
        "  LEFT JOIN rdp_phi_sandbox.{0}\n       ON {1}".format(lab_table, on_clause)
    )

# SELECT DISTINCT yields the same rows as grouping by every selected column
# with no aggregates, and makes a separate dropDuplicates() pass unnecessary.
joined_sql = (
    "SELECT DISTINCT\n" + select_list + "\n"
    + "FROM " + base_table + " AS initial\n"
    + "\n".join(join_clauses)
)

Joined_table = spark.sql(joined_sql)

# Discard encounters for which every lab feature is null.
Joined_table = Joined_table.dropna(subset=feature_columns, how="all")

Joined_table.select("pat_id").distinct().count()

In [0]:
%sql
-- Remove any previous copy of the joined lab feature table before it is rewritten.
-- IF EXISTS keeps the cell from failing when the table has not been created yet.
DROP TABLE IF EXISTS rdp_phi_sandbox.sm_patt_delta_labs_feb;

In [0]:
# Persist the joined per-encounter lab features as a table in the sandbox schema.
# No save mode is set, so the write relies on the table not existing yet
# (presumably why it is dropped in the preceding cell).
table_name = 'rdp_phi_sandbox.sm_patt_delta_labs_feb'
labs_writer = Joined_table.write
labs_writer.saveAsTable(table_name)

In [0]:
%sql
-- Invalidate cached entries for the freshly written table, then preview it.
-- The table is named explicitly: a bare identifier after REFRESH is read as a
-- resource path, not as the Python variable table_name, so it would not
-- refresh this table.
REFRESH TABLE rdp_phi_sandbox.sm_patt_delta_labs_feb;
SELECT * FROM rdp_phi_sandbox.sm_patt_delta_labs_feb;