In [None]:
%run oeai_py

In [None]:
# Instantiate the OEAI helper (expected to be defined by the `%run oeai_py` cell above).
# NOTE(review): no platform argument is passed here, so whichever platform ("Synapse" or
# "Fabric") OEAI() selects on its own is used — confirm how the platform is chosen.
oeai = OEAI()

In [14]:
# EDIT THESE TWO VALUES SO THEY POINT AT YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"  # On Fabric, give the full URL, e.g. "https://key_vault_name.vault.azure.net/"
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"  # Only needed on Synapse; Fabric does not use it.

# Fetch the OEA environment settings from Key Vault. The secret names are listed
# in the same order as the variables they populate.
(
    silver_path,
    gold_path,
    storage_account_name,
    storage_account_access_key,
) = (
    oeai.get_secret(spark, secret_name, keyvault_linked_service, keyvault)
    for secret_name in ("wonde-silver", "gold-path", "storage-account", "storage-accesskey")
)

In [None]:
def process_delta_tables_to_parquet(spark, storage_account_name, storage_account_access_key, silver_path, gold_path):
    """
    Convert every Delta table directly under ``silver_path`` into partitioned Parquet under ``gold_path``.

    The storage account key is registered on the Spark session (``dfs`` endpoint) and on the
    Hadoop configuration (``blob`` endpoint). The immediate subdirectories of ``silver_path``
    are then listed through the Hadoop FileSystem API and each one is loaded as a Delta table.
    A ``partitionkey`` column is added as a copy of ``organisationkey`` and the result is
    written with mode ``overwrite``, partitioned by ``partitionkey``, to
    ``<gold_path>/<subdirectory name>``.

    Args:
        spark (SparkSession): Active Spark session.
        storage_account_name (str): Azure storage account name.
        storage_account_access_key (str): Access key for the Azure storage account.
        silver_path (str): URI of the silver layer directory (source Delta tables).
        gold_path (str): URI of the gold layer directory (destination for Parquet files).

    Returns:
        None. Progress is reported only through printed error messages.

    Raises:
        Any exception other than ``AnalysisException`` propagates to the caller. An
        ``AnalysisException`` for one table is printed and processing continues with the
        next table.

    NOTE(review): ``col`` and ``AnalysisException`` are not imported in this notebook;
    presumably they are brought into scope by ``%run oeai_py`` — confirm.
    """
    # Storage URIs always use '/', so join them with posixpath rather than the OS-specific os.path.
    import posixpath

    # Register the account key for the dfs endpoint on the Spark session...
    spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_account_access_key)

    # ...and for the blob endpoint on the underlying Hadoop configuration.
    sc = spark.sparkContext
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
    hadoop_conf.set("fs.azure.account.key." + storage_account_name + ".blob.core.windows.net", storage_account_access_key)

    # Reach the Hadoop FileSystem API through the py4j gateway.
    jvm = sc._gateway.jvm
    Path = jvm.org.apache.hadoop.fs.Path
    FileSystem = jvm.org.apache.hadoop.fs.FileSystem
    fs = FileSystem.get(jvm.java.net.URI(silver_path), hadoop_conf)

    # Only immediate subdirectories are treated as candidate Delta tables.
    table_dirs = [entry.getPath() for entry in fs.listStatus(Path(silver_path)) if entry.isDirectory()]

    for table_dir in table_dirs:
        table_path = table_dir.toString()
        # Hadoop's own final path component; avoids URL-parsing the stringified path,
        # which would cut the name at characters such as '?' or '#'.
        table_name = table_dir.getName()
        parquet_output_folder_path = posixpath.join(gold_path, table_name)

        try:
            df = spark.read.format("delta").load(table_path)
        except AnalysisException as e:
            print(f"Error reading Delta table at {table_path}: ", e)
            continue

        # Reported separately so that a missing 'organisationkey' column or a failed
        # write is not mislabelled as a read error.
        try:
            df = df.withColumn("partitionkey", col("organisationkey"))
            df.write.partitionBy("partitionkey").mode("overwrite").format("parquet").save(parquet_output_folder_path)
        except AnalysisException as e:
            print(f"Error writing Parquet for {table_path} to {parquet_output_folder_path}: ", e)

In [None]:
# Run the silver -> gold conversion with the paths and storage credentials retrieved from Key Vault.
process_delta_tables_to_parquet(spark, storage_account_name, storage_account_access_key, silver_path, gold_path)