# Gold Layer NLP - Addition of Business Logic

In [2]:
from pyspark.sql.functions import expr, col, to_date

# Read a capped slice of the silver table, parse eval_date into a DATE, and keep
# only the rows dated on or after last_added_date (the rows new for the week).
# NOTE(review): the LIMIT 1000 is applied before the date filter, so the filter
# only sees whichever 1000 rows the query returns — confirm this cap is intended.
# NOTE(review): last_added_date is not defined in the cells visible here;
# presumably it is a 'yyyy-MM-dd' string set by an earlier/parameter cell — verify.
df = (
    spark.sql("SELECT * FROM PT_evals_lakehouse.silver_layer_nlp LIMIT 1000")
    .withColumn('eval_date', to_date(col('eval_date'), 'yyyy-MM-dd'))
    .where(expr(f"eval_date >= TO_DATE('{last_added_date}', 'yyyy-MM-dd')"))
)

StatementMeta(, 29fdca0d-03f2-4032-a3f3-67929928e730, 6, Finished, Available, Finished)

In [3]:
# Pain, while 0-10, is not necessarily linear. It is likely better to categorize into three groups
def categorize_pain(num):
    """Bucket a pain score into 'mild', 'moderate' or 'severe'.

    Args:
        num: Pain score. Values below 4 map to 'mild', values from 4 up to
            (but not including) 8 map to 'moderate', anything else to 'severe'.
            May be None or NaN when the score is missing.

    Returns:
        The category label, or None when the score is missing so that the
        resulting column stays null instead of failing or being mislabelled.
    """
    # `None < 4` raises TypeError in Python 3, and NaN fails every comparison
    # (it would otherwise fall through to 'severe'). `num != num` is only true
    # for NaN, so this guard needs no extra import.
    if num is None or num != num:
        return None
    if num < 4:
        return 'mild'
    if num < 8:
        return 'moderate'
    return 'severe'
    
# Register the function under the SQL name 'categorize_pain'; register() also
# returns a UDF handle that can be applied through the DataFrame API.
pain_category_udf = spark.udf.register('categorize_pain', categorize_pain)

# Replace the pain column with the label produced by the UDF.
df = df.withColumn('pain', pain_category_udf(col('pain')))

StatementMeta(, 29fdca0d-03f2-4032-a3f3-67929928e730, 7, Finished, Available, Finished)

In [4]:
# Append this run's rows to the gold_layer_nlp table.
df.write.saveAsTable('gold_layer_nlp', mode='append')

StatementMeta(, 29fdca0d-03f2-4032-a3f3-67929928e730, 8, Finished, Available, Finished)

In [5]:
# Add the tx_prior_loc label column only when it is missing. The gold table is
# appended to on each run, and ALTER TABLE ... ADD COLUMN errors once the column
# already exists, which would fail every run after the first.
existing_columns = {name.lower() for name in spark.table('gold_layer_nlp').columns}
if 'tx_prior_loc' not in existing_columns:
    spark.sql("ALTER TABLE gold_layer_nlp ADD COLUMN tx_prior_loc STRING")

# Translate the prior_loc code into its text label. Rows appended since the last
# run have a NULL tx_prior_loc, and the ELSE branch guarantees every processed
# row ends up non-NULL, so the WHERE clause touches only unlabelled rows instead
# of rewriting the whole table each time.
spark.sql("""
UPDATE gold_layer_nlp
SET tx_prior_loc = CASE
    WHEN prior_loc = 0 THEN 'Ranch home'
    WHEN prior_loc = 1 THEN 'Two story home'
    WHEN prior_loc = 2 THEN 'Apartment'
    WHEN prior_loc = 3 THEN 'Independent Living Facility'
    WHEN prior_loc = 4 THEN 'Assisted Living Facility'
    WHEN prior_loc = 5 THEN 'Skilled Nursing Facility'
    WHEN prior_loc = 6 THEN 'Inpatient Rehabilitation Hospital'
    WHEN prior_loc = 7 THEN 'Long-term Care Facility'
    ELSE 'Unknown'
END
WHERE tx_prior_loc IS NULL
""")

StatementMeta(, 29fdca0d-03f2-4032-a3f3-67929928e730, 10, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 1 rows and 1 fields>