In [0]:
# Declare a free-text notebook widget called "env" (empty default, labelled
# "Enter environment"), then read its current value back into `env`.
dbutils.widgets.text("env", "", "Enter environment")
env = dbutils.widgets.get("env")

# Echo the selected environment as the cell output.
env

'dev'

In [0]:
from pyspark.sql import DataFrame
from functools import reduce
from dataclasses import dataclass
from pyspark.sql.types import IntegerType, DoubleType, StringType
from pyspark.sql.functions import current_timestamp


In [0]:
%run "./include"

In [0]:
%run "./paths"

('abfss://landing@dlsunitycat.dfs.core.windows.net/',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/bronze',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/silver',
 'abfss://medallion@dlsunitycat.dfs.core.windows.net/gold')

End


In [0]:
# Rename map: keys are the original column names, values are the snake_case
# names the resulting DataFrames carry (note "IndividualRate" becomes "rate").
# NOTE(review): the keys are presumably the header names of the raw rates CSVs
# -- confirm against the landed files.
columns_to_rename = {"BusinessYear": "business_year",
                     "StateCode": "state_code",
                     "PlanId": "plan_id",
                     "RatingAreaId": "rating_area_id",
                     "Age": "age",      
                     "IndividualRate": "rate"}

# Target schema for the rates data, declared as dataclass fields annotated with
# PySpark types. The field names match the values of columns_to_rename.
# NOTE(review): SchemaDefiner is not defined in this notebook (presumably it is
# provided by the "./include" notebook). What get_pyspark_schema returns, and
# why Write_Schema() can be called with no arguments even though the fields
# have no defaults, must be confirmed there.
@SchemaDefiner.get_pyspark_schema
@dataclass
class Write_Schema:
    business_year: IntegerType
    state_code: StringType
    plan_id: StringType
    rating_area_id: StringType
    # Typed as a string rather than a number.
    # NOTE(review): presumably because the source holds non-numeric age bands -- confirm.
    age: StringType
    rate: DoubleType

# Schema object handed to the CSV reader further down.
write_schema = Write_Schema()

In [0]:
# Build the reader that is reused for every rates file, giving it the rename
# map and the target schema.
# NOTE(review): csv_reader is not defined in this notebook -- presumably it is
# brought in by %run "./include"; confirm its constructor contract there.
rates_csv_reader = csv_reader(columns_to_rename, write_schema)

In [0]:
# Key used to look up this dataset's locations in the path dictionaries.
table = "rates"

# List everything under the raw location for this table and keep just the
# full path of each entry.
raw_rates_files = dbutils.fs.ls(raw_paths[table])
rates_csv_pths = [entry.path for entry in raw_rates_files]

# Echo the paths so the set of input files is visible in the output.
rates_csv_pths

['abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2018_rates.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2019_rates.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2020_rates.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2021_rates.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2022_rates.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2023_rates.csv',
 'abfss://landing@dlsunitycat.dfs.core.windows.net/raw_rates/2024_rates.csv']

In [0]:
# Load each listed file into its own DataFrame through the shared reader.
rates_dfs = list(map(rates_csv_reader.csv_to_df, rates_csv_pths))

# Echo the list so the per-file schemas can be compared at a glance.
rates_dfs

[DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double],
 DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double],
 DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double],
 DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double],
 DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double],
 DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double],
 DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double]]

In [0]:
# Combine the per-file DataFrames into a single DataFrame. unionByName lines
# columns up by name rather than by position.
#
# functools.reduce without an initial value raises an opaque
# "reduce() of empty sequence" TypeError on an empty list, so fail early with
# a message that states the actual problem (no input files were read).
if not rates_dfs:
    raise ValueError(
        "rates_dfs is empty: no rates files were read, so there is nothing to union."
    )

rates_df = reduce(DataFrame.unionByName, rates_dfs)

# Echo the combined DataFrame's schema.
rates_df

DataFrame[business_year: int, state_code: string, plan_id: string, rating_area_id: string, age: string, rate: double]

In [0]:
# Stamp every row with the timestamp of this load in a "date_ingested" column.
rates_df = rates_df.withColumn("date_ingested", current_timestamp())

# Persist the result in Delta format at the bronze path for this table,
# overwriting whatever was previously stored there.
(
    rates_df.write
    .format("delta")
    .mode("overwrite")
    .save(bronze_paths[table])
)
