In [0]:
# Service-principal (OAuth client-credentials) setup so Spark can reach the ADLS account and its data.


def _set_adls_conf(setting, value):
    """Set one `fs.azure.account.<setting>` Spark conf entry for the hpadlsacc storage account."""
    spark.conf.set(f"fs.azure.account.{setting}.hpadlsacc.dfs.core.windows.net", value)


def _scope_secret(key):
    """Fetch `key` from the hc-secret-scope secret scope."""
    return dbutils.secrets.get("hc-secret-scope", key)


# Authenticate with OAuth, using the client-credentials token provider
_set_adls_conf("auth.type", "OAuth")
_set_adls_conf("oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")

# Client id and client secret are read from the secret scope, never written in the notebook
_set_adls_conf("oauth2.client.id", _scope_secret("app-key"))
_set_adls_conf("oauth2.client.secret", _scope_secret("service-cred"))

# The token endpoint is specific to the tenant, whose id is also kept in the secret scope
tenant_id = _scope_secret("dir-id")
_set_adls_conf("oauth2.client.endpoint", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")

## Loading the data from the landing container into the bronze layer

In [0]:
# importing the required spark session and functions
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.functions import col, when

# Glob matching every CSV claim file in the landing container
src_path = "abfss://landing@hpadlsacc.dfs.core.windows.net/claims/*.csv"

# Read all claim files, taking column names from each file's header row.
# NOTE(review): "includeMetadata" does not appear to be a documented CSV reader option;
# the hidden _metadata column is normally available without it - confirm before removing.
claims_reader = (
    spark.read
    .format('csv')
    .option('header', True)
    .option("includeMetadata", "true")
)
claims_df = claims_reader.load(src_path)

# Derive the datasource tag from the path of the file each row was read from:
# "hospital1" -> "hosa", "hospital2" -> "hosb", anything else -> null.
source_file_path = col("_metadata.file_path")
datasource_tag = (
    when(source_file_path.contains("hospital1"), "hosa")
    .when(source_file_path.contains("hospital2"), "hosb")
    .otherwise(None)
)
claims_df = claims_df.withColumn("datasource", datasource_tag)

In [0]:
# Preview a handful of rows tagged "hosa" to sanity-check the derived datasource column.
# The ten-row cap is applied with limit(10) rather than as an argument to display(),
# so only a small sample is rendered in the notebook output.
claims_df.filter(col('datasource') == 'hosa').limit(10).display()


In [0]:
# Bronze-layer destination folder for the claims data
target_path = "abfss://bronze@hpadlsacc.dfs.core.windows.net/claims/"

# Persist the tagged claims as Parquet, replacing any data already at the destination
bronze_writer = claims_df.write.mode("overwrite")
bronze_writer.parquet(target_path)