In [0]:
# Notebook parameter that selects the target environment; the value is used
# as the prefix of the catalog name built later in this notebook.
dbutils.widgets.text(name="env", defaultValue="", label="Enter environment")
env = dbutils.widgets.get("env").strip()
if not env:
    # Fail fast: the widget default is "", which would otherwise produce the
    # catalog name "_catalog" and only fail later when a table is resolved.
    raise ValueError(
        "Widget 'env' is empty; enter an environment (e.g. 'dev') before running."
    )
env

'dev'

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

In [0]:
%run "./paths"

('abfss://landing@dlsunitycat.dfs.core.windows.net/',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/bronze',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/silver',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/gold')

In [0]:
# Catalog name is the environment followed by the fixed "_catalog" suffix.
catalog = "{}_catalog".format(env)

# The table-name suffixes and the storage path are looked up in the
# silver_tables / gold_tables / gold_paths mappings (presumably defined by the
# ./paths notebook run above -- confirm there).
upstream_table = "{}{}".format(catalog, silver_tables["rates"])
downstream_table = "{}{}".format(catalog, gold_tables["rates_fact"])
downstream_path = gold_paths["rates_fact"]

# Show the resolved values as the cell output.
upstream_table, downstream_table, downstream_path

('dev_catalog.silver.rates_silver',
 'dev_catalog.gold.rates_fact',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/gold/rates_fact')

In [0]:
def cululative_query(upstream_table, downstream_table, downstream_path, year):
    """Append the cumulative rate facts for one business year.

    Builds a single ``INSERT INTO`` statement and runs it through the
    notebook-provided ``spark`` session. The statement:

    1. ``last_year``: reads the rows already stored in ``downstream_table``
       whose ``current_year`` is ``year - 1``.
    2. ``this_year_raw``: averages ``rate`` from ``upstream_table`` for
       ``business_year = year`` per plan, state and age category, cast to
       ``DECIMAL(10,2)``.
    3. ``this_year_pivoted`` / ``this_year``: pivots the six age categories
       into columns and wraps them, together with the business year, in a
       one-element array of structs.
    4. Full-outer-joins ``last_year`` with ``this_year``. When both sides
       exist the two arrays are concatenated; when only one side exists its
       array is carried over unchanged. ``current_year`` falls back to
       ``last_year.current_year + 1`` for plans with no data this year.

    Args:
        upstream_table: Name of the table to read the yearly rates from.
        downstream_table: Name of the table holding previously written
            cumulative rows.
        downstream_path: Storage path of the Delta table that receives the
            inserted rows.
        year: Business year to process. Must support ``year - 1``
            (an int in practice).

    Returns:
        None. The statement is executed only for its side effect.

    Notes:
        * The statement is a plain ``insert into``; nothing in this function
          removes existing rows, so running it twice for the same year
          inserts that year's rows twice.
        * Rows are read via ``downstream_table`` but written via
          ``downstream_path``; this presumably relies on both referring to
          the same Delta table -- TODO confirm.
    """
    previous_year = year - 1
    query = f"""
insert into delta.`{downstream_path}`
with last_year as (
  select * 
  from {downstream_table}
  where current_year = {previous_year}
),
this_year_raw as (
  select 
    plan_id, 
    state_code, 
    age_category,
    business_year,
    cast(avg(rate) as DECIMAL(10,2)) rate
  from {upstream_table}
  where business_year = {year}
  group by plan_id, state_code, age_category, business_year
),
this_year_pivoted AS (
    SELECT 
        plan_id,
        state_code,
        business_year,
        MAX(CASE WHEN age_category = '0 - 19' THEN rate END) age_0_19,
        MAX(CASE WHEN age_category = '20 - 29' THEN rate END) age_20_29,
        MAX(CASE WHEN age_category = '30 - 39' THEN rate END) age_30_39,
        MAX(CASE WHEN age_category = '40 - 49' THEN rate END) age_40_49,
        MAX(CASE WHEN age_category = '50 - 59' THEN rate END) age_50_59,
        MAX(CASE WHEN age_category = '60+' THEN rate END) age_60_plus
    FROM this_year_raw
    GROUP BY plan_id, state_code, business_year
),
this_year as (
select 
  plan_id,
  state_code,
  business_year,
  ARRAY(
    NAMED_STRUCT(
      'age_0_19', age_0_19,
      'age_20_29', age_20_29,
      'age_30_39', age_30_39,
      'age_40_49', age_40_49,
      'age_50_59', age_50_59,
      'age_60_plus', age_60_plus,
      'business_year', business_year
      )
    ) age_category_and_year
FROM this_year_pivoted
)
select 
  coalesce(ly.plan_id, ty.plan_id) plan_id,
  coalesce(ly.state_code, ty.state_code) state_code,
  coalesce(ty.business_year, ly.current_year + 1) current_year,
  case
    WHEN ly.age_category_and_year IS NULL THEN ty.age_category_and_year
    WHEN ty.age_category_and_year IS NULL THEN ly.age_category_and_year
    ELSE ly.age_category_and_year || ty.age_category_and_year
  END age_category_and_year,
  CURRENT_TIMESTAMP() AS date_ingested
from last_year ly
full outer join this_year ty
  on ly.plan_id = ty.plan_id
  and ly.state_code <=> ty.state_code
"""
    # The join key mirrors the (plan_id, state_code) grain produced by the
    # GROUP BY above. Matching on plan_id alone would pair a plan's history
    # from one state with its current rates from another. `<=>` is the
    # null-safe equality, so rows with a NULL state_code still match as they
    # did when only plan_id was compared.
    spark.sql(query)

In [0]:
# Build the fact table one business year at a time, oldest first: each call
# reads the rows stored for the preceding year, so ascending order matters.
for year in range(2018, 2025):  # 2018 through 2024 inclusive
    cululative_query(
        upstream_table=upstream_table,
        downstream_table=downstream_table,
        downstream_path=downstream_path,
        year=year,
    )