In [None]:
import os

# Container names arrive as a single comma-separated environment variable and
# are consumed positionally: first bronze, then silver, then gold.
# Surrounding whitespace is trimmed so "bronze, silver, gold" is accepted too.
_raw_container_names = os.environ["container_names"]
CONTAINER_NAMES = [name.strip() for name in _raw_container_names.split(",")]

# Fail early with a readable message instead of an IndexError (or an empty
# container name) surfacing later when the layer URIs are built.
if len(CONTAINER_NAMES) < 3 or not all(CONTAINER_NAMES[:3]):
  raise ValueError(
    "container_names must hold the bronze, silver and gold container names "
    f"separated by commas; got {_raw_container_names!r}"
  )

# Any names beyond the first three are ignored.
# (BRONZ_CONTAINER keeps its original spelling because other cells reference it.)
BRONZ_CONTAINER, SILVER_CONTAINER, GOLD_CONTAINER = CONTAINER_NAMES[:3]

STORAGE_ACCOUNT_NAME = os.environ["storage_account_name"]

# Remove any "?" found at either end of the token before it is handed to Spark.
SAS_TOKEN = os.environ["sas_token"].strip('?')

Configure access to Azure Data Lake Storage Gen2

In [None]:
# Configure SAS authentication with a fixed token for the storage account's
# DFS endpoint. Every setting key is suffixed with the endpoint host name.
for _setting, _value in (
  ("fs.azure.account.auth.type", "SAS"),
  ("fs.azure.sas.token.provider.type", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"),
  ("fs.azure.sas.fixed.token", SAS_TOKEN),
):
  spark.conf.set(f"{_setting}.{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net", _value)

In [None]:
# Root URI of each layer's container on the storage account's DFS endpoint.
# Every root ends with "/".
_ABFSS_ROOT = "abfss://{container}@{account}.dfs.core.windows.net/"

BRONZE_FS = _ABFSS_ROOT.format(container=BRONZ_CONTAINER, account=STORAGE_ACCOUNT_NAME)
SILVER_FS = _ABFSS_ROOT.format(container=SILVER_CONTAINER, account=STORAGE_ACCOUNT_NAME)
GOLD_FS = _ABFSS_ROOT.format(container=GOLD_CONTAINER, account=STORAGE_ACCOUNT_NAME)

Reformat the date fields of every bronze table as `yyyy-MM-dd` and write the result to the silver layer

In [None]:
# Names of the entries found at the root of the bronze container.
filenames = [entry.name for entry in dbutils.fs.ls(BRONZE_FS)]

filenames

In [None]:
from pyspark.sql.functions import from_utc_timestamp, date_format
from pyspark.sql.types import TimestampType

# Copy every bronze entry to the silver layer as a Delta table, rewriting the
# date-like columns as "yyyy-MM-dd" strings along the way.

# The layer roots already end with "/"; trim it so each path below is joined
# on exactly one separator instead of producing "...net//name".
bronze_root = BRONZE_FS.rstrip("/")
silver_root = SILVER_FS.rstrip("/")

for filename in filenames:
  # A directory entry may be listed with a trailing "/"; drop it so it does not
  # leak into the source path or the table name.
  entry_name = filename.rstrip("/")
  # The silver table is named after the part of the entry name before the first ".".
  table_name = entry_name.split(".")[0]

  df = spark.read.format("parquet").load(f"{bronze_root}/{entry_name}")

  # NOTE(review): this is a plain substring match, so it also selects names such
  # as "update" or "candidate" and skips all-caps "DATE" — confirm against the
  # bronze schemas before tightening or loosening it.
  date_columns = [name for name in df.columns if "Date" in name or "date" in name]

  # `column_name` rather than `col`, to avoid shadowing the commonly imported
  # pyspark.sql.functions.col.
  for column_name in date_columns:
    # Cast to timestamp, then render as a "yyyy-MM-dd" string.
    df = df.withColumn(column_name, date_format(from_utc_timestamp(df[column_name].cast(TimestampType()), "UTC"), "yyyy-MM-dd"))

  # Overwrite so re-running the notebook replaces the table instead of appending to it.
  df.write.format("delta").mode("overwrite").save(f"{silver_root}/{table_name}")