In [0]:
# Register a free-text notebook widget called "env" (blank by default) and
# read back whatever value is currently set on it.
dbutils.widgets.text(
    name="env",
    defaultValue="",
    label="Enter environment",
)
env = dbutils.widgets.get("env")

# Echo the selected value as the cell output.
env

'dev'

In [0]:
from pyspark.sql import DataFrame
from functools import reduce
from dataclasses import dataclass
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import current_timestamp


In [0]:
%run "./include"

End


In [0]:
%run "./paths"

('abfss://landing@dlsunitycat.dfs.core.windows.net/',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/bronze',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/silver',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/gold')

In [0]:
# Source CSV header -> snake_case column name used for the written table.
columns_to_rename = {
    "BusinessYear": "business_year",
    "StateCode": "state_code",
    "DentalOnlyPlan": "dental_only_plan",
    "StandardComponentId": "plan_id",
    "MetalLevel": "metal_level",
    "IsNewPlan": "new_plan",
    "PlanType": "plan_type",
}

# Declarative description of the columns written for the plans table.
# Field names are the renamed (snake_case) columns from `columns_to_rename`;
# each annotation is the PySpark type the column should have.
# `SchemaDefiner` is not defined in this notebook -- presumably it comes from
# the "./include" notebook; confirm there how the decorator consumes the fields.
@SchemaDefiner.get_pyspark_schema
@dataclass
class Write_Schema:
    business_year: IntegerType  # from "BusinessYear"; the only non-string column
    state_code: StringType  # from "StateCode"
    dental_only_plan: StringType  # from "DentalOnlyPlan"
    plan_id: StringType  # from "StandardComponentId"
    metal_level: StringType  # from "MetalLevel"
    new_plan: StringType  # from "IsNewPlan"
    plan_type: StringType  # from "PlanType"

# NOTE(review): called with no arguments even though the dataclass fields have
# no defaults, so the decorator presumably replaces the class with something
# that builds the schema object directly -- verify in SchemaDefiner.
write_schema = Write_Schema()

In [0]:
# Build the CSV reader for the plans files from the rename map and the target
# schema. `csv_reader` is defined outside this notebook (presumably in
# "./include" -- TODO confirm).
plans_csv_reader = csv_reader(
    columns_to_rename,
    write_schema,
)

In [0]:
# Name of the table being ingested; used as the key into the path lookups.
table = 'plans'

# List every entry in the landing folder for this table.
# `raw_paths` is not defined in this notebook -- presumably it is provided by
# the "./paths" notebook; confirm there.
raw_plans_files = dbutils.fs.ls(raw_paths[table])

# Keep only CSV files. The listing can also contain sub-directories or
# marker/non-CSV files, which must not be handed to the CSV reader.
# Sorting makes the processing order deterministic across runs.
plans_csv_pths = sorted(
    file.path
    for file in raw_plans_files
    if file.path.lower().endswith(".csv")
)

# Fail fast with a clear message instead of letting a later cell fail on an
# empty list of DataFrames.
if not plans_csv_pths:
    raise FileNotFoundError(f"No CSV files found under {raw_paths[table]}")

# Show the files that will be ingested.
plans_csv_pths

['abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2018_plans.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2019_plans.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2020_plans.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2021_plans.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2022_plans.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2023_plans.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_plans/2024_plans.csv']

In [0]:
# Load each listed CSV path into its own DataFrame, keeping the same order as
# `plans_csv_pths`. The reading logic lives in `csv_to_df`, defined outside
# this notebook.
plans_dfs = list(map(plans_csv_reader.csv_to_df, plans_csv_pths))

# Show the resulting DataFrames (schemas only) as the cell output.
plans_dfs

[DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string],
 DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string],
 DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string],
 DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string],
 DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string],
 DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string],
 DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_

In [0]:
# Fold the per-file DataFrames into a single DataFrame, left to right.
# unionByName lines columns up by name rather than by position.
plans_df = reduce(
    lambda combined, nxt: combined.unionByName(nxt),
    plans_dfs,
)

# Show the combined DataFrame's schema as the cell output.
plans_df

DataFrame[business_year: int, state_code: string, dental_only_plan: string, plan_id: string, metal_level: string, new_plan: string, plan_type: string]

In [0]:
# Stamp every row with the time of this ingestion run.
plans_df = plans_df.withColumn("date_ingested", current_timestamp())

# Persist as a Delta table at the bronze location for this table.
# "overwrite" replaces whatever data is already stored at that path.
(
    plans_df.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_paths[table])
)
