In [None]:
import os
from dotenv import load_dotenv
# Load key=value pairs from a local .env file (python-dotenv) into the process
# environment so the os.getenv(...) lookups in the next cell can find the Azure settings.
load_dotenv()

In [0]:
# Mount the ADLS Gen2 container on DBFS using a service principal (OAuth client
# credentials). All identifiers and secrets come from environment variables so
# nothing sensitive is hard-coded in the notebook.


def _require_env(*names):
    """Return the first non-empty environment variable among ``names``.

    Args:
        *names: One or more environment variable names, tried in order.

    Returns:
        The value of the first variable that is set to a non-empty string.

    Raises:
        EnvironmentError: If none of the variables is set. Failing here avoids
            building a mount configuration that contains the string "None".
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    raise EnvironmentError(
        "Missing required environment variable: " + " or ".join(names)
    )


# "AZURE_TERNANT_ID" is the misspelled name this notebook originally read; it is
# still accepted as a fallback so existing .env files keep working.
tenant_id = _require_env("AZURE_TENANT_ID", "AZURE_TERNANT_ID")
container = _require_env("AZURE_CONTAINER_NAME")
storage_account = _require_env("AZURE_STORAGE_ACCOUNT")

configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": _require_env("AZURE_CLIENT_ID"),
    "fs.azure.account.oauth2.client.secret": _require_env("AZURE_CLIENT_SECRET"),
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token",
}

# The mount point is named after the container (reuse the value already read).
mount_point = f"/mnt/{container}"

# Drop any existing mount at this path first so the mount below always uses the
# current credentials instead of failing because the path is already mounted.
existing_mounts = [mnt.mountPoint for mnt in dbutils.fs.mounts()]
if mount_point in existing_mounts:
    dbutils.fs.unmount(mount_point)

dbutils.fs.mount(
    source=f"abfss://{container}@{storage_account}.dfs.core.windows.net/",
    mount_point=mount_point,
    extra_configs=configs
)

print(f"Mounted {mount_point} successfully")


/mnt/testsynapsehieum has been unmounted.
Mounted /mnt/testsynapsehieum successfully


In [0]:
# Databricks Notebook: Incremental Load from Bronze to Silver (Parquet) with Data Transformations

# Import required libraries
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
from pyspark.sql.functions import col, max, upper, lit, when

# Obtain the Spark session for this job (an already-active session is reused).
_session_builder = SparkSession.builder.appName("Bronze_to_Silver")
spark = _session_builder.getOrCreate()

# Mounted locations of the two lake layers used by the cells below.
bronze_path, silver_path = (
    "/mnt/testsynapsehieum/bronze",
    "/mnt/testsynapsehieum/silver",
)


In [0]:
# Collect the names of the Bronze entries that carry the .parquet extension.
bronze_tables = []
for entry in dbutils.fs.ls(bronze_path):
    if entry.name.endswith(".parquet"):
        bronze_tables.append(entry.name)

# Report what was discovered so the run log shows the work list.
print("Tables found in Bronze:", bronze_tables)


Tables found in Bronze: ['HR.BusinessTravel.parquet', 'HR.Department.parquet', 'HR.Employee.parquet', 'HR.Job.parquet', 'HR.Location.parquet', 'HR.Shift.parquet', 'HR.Training.parquet']


In [0]:
# Process each Bronze table: load it, keep only the rows newer than what Silver
# already holds, apply the column clean-ups, and append the result to Silver.

# Imported here so this cell does not depend on edits to the import cell.
from pyspark.sql.utils import AnalysisException

PARQUET_SUFFIX = ".parquet"
WATERMARK_COLUMN = "modified_date"  # column compared to decide which rows are new

for table in bronze_tables:
    # Strip only the trailing extension; str.replace would also remove a
    # ".parquet" that happens to appear in the middle of a name.
    table_name = table[: -len(PARQUET_SUFFIX)] if table.endswith(PARQUET_SUFFIX) else table
    bronze_table_path = f"{bronze_path}/{table}"         # source file in Bronze
    silver_table_folder = f"{silver_path}/{table_name}"  # one folder per table in Silver

    print(f"Processing table: {table_name}")

    # Load Bronze data
    df_bronze = spark.read.format("parquet").load(bronze_table_path)

    # Check whether the Silver table exists. Only AnalysisException (missing
    # path / no readable Parquet) means "not there yet"; any other failure is
    # re-raised so a transient read error cannot trigger a duplicate full load.
    try:
        df_silver = spark.read.format("parquet").load(silver_table_folder)
        silver_exists = True
    except AnalysisException:
        silver_exists = False

    # Determine the watermark: the newest modified_date already in Silver.
    # NOTE: `max` here is pyspark.sql.functions.max from the import cell, not the builtin.
    latest_timestamp = None
    if silver_exists:
        latest_timestamp = df_silver.select(max(col(WATERMARK_COLUMN))).collect()[0][0]

        if latest_timestamp is None and len(df_silver.take(1)) > 0:
            # Silver has rows but none carries a watermark value, so new rows
            # cannot be told apart; appending everything would duplicate data.
            print(
                f"Silver table {table_name} has no non-null {WATERMARK_COLUMN}. "
                "Skipping update to avoid duplicates."
            )
            continue

    if latest_timestamp is not None:
        # Keep only the Bronze records whose modified_date is newer than the watermark.
        df_new_data = df_bronze.filter(col(WATERMARK_COLUMN) > latest_timestamp)

        # Nothing new: skip this table. take(1) stops at the first row instead
        # of counting every row just to test for emptiness.
        if len(df_new_data.take(1)) == 0:
            print(f"No new data found for {table_name}. Skipping update.")
            continue
    else:
        # First execution, or Silver exists but is still empty: full load.
        df_new_data = df_bronze

    # APPLY DATA TRANSFORMATIONS (only if the column exists)
    if "name" in df_new_data.columns:
        df_new_data = df_new_data.withColumn("name", upper(col("name")))

    if "is_deleted" in df_new_data.columns:
        # Null becomes 0; every other value is cast to an integer.
        df_new_data = df_new_data.withColumn(
            "is_deleted",
            when(col("is_deleted").isNull(), lit(0)).otherwise(col("is_deleted").cast("int"))
        )

    # Append new data to Silver layer (Parquet format)
    df_new_data.write.format("parquet").mode("append").save(silver_table_folder)
    print(f"Table {table_name} processed successfully with incremental data.")

print("Incremental Load from Bronze to Silver completed successfully.")

Processing table: HR.BusinessTravel
No new data found for HR.BusinessTravel. Skipping update.
Processing table: HR.Department
No new data found for HR.Department. Skipping update.
Processing table: HR.Employee
No new data found for HR.Employee. Skipping update.
Processing table: HR.Job
No new data found for HR.Job. Skipping update.
Processing table: HR.Location
No new data found for HR.Location. Skipping update.
Processing table: HR.Shift
No new data found for HR.Shift. Skipping update.
Processing table: HR.Training
No new data found for HR.Training. Skipping update.
Incremental Load from Bronze to Silver completed successfully.
