In [0]:
# Ask the Databricks secrets utility for the secret scopes it can list and
# render the result, as a quick check before any secret is read.
scopes = dbutils.secrets.listScopes()
display(scopes)


Connecting the ADLS account to Databricks using a service principal and Azure Key Vault

In [0]:
# --- Source location: which storage account, container and file to read ----
adls_account_name = "aprdemoadls"
adls_container_name = "bronze"
adls_file_path = "Transactions 2"

# --- Service-principal credentials, fetched from the secret scope ----------
# Nothing sensitive is hard-coded here; only the scope and key names appear.
secret_scope = "keyvaultscopeap"
tenant_id = dbutils.secrets.get(scope=secret_scope, key="tenantid")
client_id = dbutils.secrets.get(scope=secret_scope, key="clientid")
client_secret = dbutils.secrets.get(scope=secret_scope, key="secret")

# --- OAuth settings for this storage account -------------------------------
# Every Spark setting is keyed as "<prefix>.<account>.dfs.core.windows.net",
# so the account host is built once and the settings are applied in a loop.
account_host = f"{adls_account_name}.dfs.core.windows.net"
oauth_settings = [
    ("fs.azure.account.auth.type", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    ("fs.azure.account.oauth2.client.id", client_id),
    ("fs.azure.account.oauth2.client.secret", client_secret),
    (
        "fs.azure.account.oauth2.client.endpoint",
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
]
for setting_prefix, setting_value in oauth_settings:
    spark.conf.set(f"{setting_prefix}.{account_host}", setting_value)

# --- Read the CSV from the bronze container --------------------------------
# The first line of the file is treated as the header row. No schema is
# supplied or inferred, so columns keep the reader's default types.
file_path = f"abfss://{adls_container_name}@{account_host}/{adls_file_path}.csv"
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(file_path)
)

# Render the loaded data for inspection.
display(df)


**Checking for Null Values**

In [0]:
# Spark's `sum` is imported under an alias so that Python's built-in `sum`
# is not shadowed for the rest of the notebook session. `col` keeps its plain
# name because the duplicate-check cell further down uses it as-is.
from pyspark.sql.functions import col, sum as spark_sum

# For every column, turn "is this value null?" into 1/0 and add it up.
# The result is a single-row DataFrame with one null count per source column,
# each count carrying the name of the column it describes.
null_counts = df.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
)

# Display the null counts
display(null_counts)

Checking for duplicate values in the data

In [0]:
# Group on every column so that fully identical rows fall into one group,
# and count how many rows each group holds.
rows_per_group = df.groupBy(*df.columns).count()

# Keep only the groups that occur more than once. Note that the final number
# is the count of such groups (distinct repeated rows), not the count of
# surplus rows.
repeated_groups = rows_per_group.where(col("count") > 1)
duplicate_count = repeated_groups.count()

# Wrap the scalar in a one-row DataFrame so it renders through display().
display(spark.createDataFrame([(duplicate_count,)], schema=["duplicate_count"]))

Storing the preprocessed data as a Parquet file in the silver layer

In [0]:
# Destination: the "silver" container of the same storage account.
adls_account_name = "aprdemoadls"
adls_container_name = "silver"
adls_file_name = "transactions.parquet"

# Build the abfss:// target and write the DataFrame there as Parquet,
# replacing whatever already exists at that path.
save_path = "abfss://{}@{}.dfs.core.windows.net/{}".format(
    adls_container_name, adls_account_name, adls_file_name
)
df.write.parquet(save_path, mode="overwrite")

print("Data saved to " + save_path)


In [0]:
# Destination: the "silver" container of the same storage account.
adls_account_name = "aprdemoadls"
adls_container_name = "silver"
adls_file_name = "transactions.csv"

# Define the file path for saving the data
save_path = f"abfss://{adls_container_name}@{adls_account_name}.dfs.core.windows.net/{adls_file_name}"

# Write the header row explicitly. Spark's CSV writer omits it by default,
# which would drop the column names that were read from the source file
# (loaded with header=true) and leave the silver copy without a schema line.
# Existing output at the path is replaced.
df.write.mode("overwrite").option("header", "true").csv(save_path)

print(f"Data saved to {save_path}")