In [0]:
# Enumerate the secret scopes this notebook can see; handy for confirming the
# scope name used by the authentication cell actually exists.
secret_scopes = dbutils.secrets.listScopes()
secret_scopes

In [0]:
pip install databricks-cli

In [0]:
# Configure the Spark session to reach the ADLS Gen2 account through the ABFS
# driver, authenticating with OAuth via ClientCredsTokenProvider.
storage_account_name = "aprdemoadls"
scope_name = "keyvaultscopeap"

# Fetch the credentials from the secret scope so that nothing sensitive is
# hard-coded in the notebook (lookup order: client id, tenant id, secret).
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope=scope_name, key=secret_key)
    for secret_key in ("clientid", "tenantid", "secret")
)

# Per-account Hadoop/ABFS settings, keyed by the fully qualified option name.
oauth_settings = {
    f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net": "OAuth",
    f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net": client_id,
    f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Apply the settings in the order they are declared above.
for conf_key, conf_value in oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# Load the raw inventory extract from the bronze container.
storage_account_name = "aprdemoadls"
container_name = "bronze"
file_name = "Inventory 2"
file_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{file_name}.csv"

# The first row is treated as the header. No schema or inferSchema option is
# set, so Spark's CSV defaults decide the column types
# (NOTE(review): columns presumably load as strings -- confirm if numeric types matter).
csv_reader = spark.read.format("csv").option("header", "true")
df = csv_reader.load(file_path)
display(df)

In [0]:
from pyspark.sql.functions import count, when, col

# Build one aggregate per column: when() produces a non-null marker only for
# null cells, and count() tallies those markers, giving the null count.
null_count_exprs = []
for column_name in df.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

# Single-row summary: each column holds the number of nulls in that column.
missing_values_df = df.select(null_count_exprs)
display(missing_values_df)

In [0]:
# `col` is imported here as well so this cell does not depend on an import
# made in a different cell.
from pyspark.sql.functions import col, mode

# Impute missing ReorderLevel values with the column's most frequent value.
mode_value = df.select(mode(col("ReorderLevel"))).collect()[0][0]

# mode() returns None when the column has no non-null values (or the frame is
# empty), and na.fill() does not accept None as a replacement, so only fill
# when a usable value was found.
if mode_value is not None:
    df = df.na.fill({"ReorderLevel": mode_value})
else:
    print("ReorderLevel has no non-null values; skipping mode imputation.")
display(df)

In [0]:
from pyspark.sql.functions import count, when, col


def _null_count(column_name):
    """Return an aggregate expression counting the null cells of `column_name`."""
    # when() yields a non-null marker for null cells only; count() tallies them.
    return count(when(col(column_name).isNull(), column_name)).alias(column_name)


# Re-run the null check on the current `df` to confirm the imputation result.
missing_values_df = df.select([_null_count(name) for name in df.columns])
display(missing_values_df)

In [0]:
# Drop rows that are duplicated across every column (no subset is given),
# then rebind `df` so that later cells work on the de-duplicated data.
deduplicated_df = df.dropDuplicates()
df = deduplicated_df
display(df)

In [0]:
#df.write.mode("overwrite").parquet("abfss://silver@aprdemoadls.dfs.core.windows.net/inventory_parquet")

In [0]:
#df.write.mode("overwrite").csv("abfss://silver@aprdemoadls.dfs.core.windows.net/inventory_csv")

In [0]:
%python
# Save the cleaned DataFrame to a single Parquet file
output_path2 = "abfss://silver@aprdemoadls.dfs.core.windows.net/Inventory.parquet"
temp_output_path = "abfss://silver@aprdemoadls.dfs.core.windows.net/temp_Inventory.parquet"

# Write the DataFrame to a temporary path
df.coalesce(1).write.mode("overwrite").parquet(temp_output_path)

# List the files in the temporary directory
files = dbutils.fs.ls(temp_output_path)

# Find the part file
part_file = [file.path for file in files if file.path.endswith(".parquet")][0]

# Rename the part file to the desired output path
dbutils.fs.mv(part_file, output_path2 + "/Inventory.parquet", True)

# Remove the temporary directory
dbutils.fs.rm(temp_output_path, True)

In [0]:
%python
# Read the Parquet file to verify its contents
df_parquet = spark.read.parquet(output_path2 + "/Inventory.parquet")
display(df_parquet)