In [0]:
# Exploratory check: ask the Databricks secrets utility for its secret scopes.
# As the last expression in the cell, the returned value is rendered as output.
dbutils.secrets.listScopes()

In [0]:
# --- Credentials -----------------------------------------------------------
# Pull the service-principal details from the secret scope, in the order
# client id -> tenant id -> client secret.
scope_name = "keyvaultscopeap"
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope=scope_name, key=secret_key)
    for secret_key in ("clientid", "tenantid", "secret")
)

# --- Storage target --------------------------------------------------------
storage_account_name = "aprdemoadls"
container_name = "bronze"
account_host = f"{storage_account_name}.dfs.core.windows.net"

# --- Spark OAuth settings for the storage account ---------------------------
# Each key is suffixed with the account host so the setting applies to this
# storage account only. Dict order is insertion order, so the settings are
# applied in the sequence listed here.
oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for conf_prefix, conf_value in oauth_settings.items():
    spark.conf.set(f"{conf_prefix}.{account_host}", conf_value)

# --- Read and preview the raw suppliers file --------------------------------
suppliers_csv_path = f"abfss://{container_name}@{account_host}/Suppliers 2.csv"
df = spark.read.csv(suppliers_csv_path, header=True, inferSchema=True)
display(df)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, when, initcap, udf
from pyspark.sql.types import StringType
import re

# Get (or create) the Spark session under the "SuppliersCleaning" app name
spark = SparkSession.builder.appName("SuppliersCleaning").getOrCreate()

# Where the raw suppliers CSV lives
container_name = "bronze"
storage_account_name = "aprdemoadls"
bronze_suppliers_path = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/Suppliers 2.csv"
)

# Load the CSV (first row is the header, column types inferred), then:
#   1. drop every row that contains a null in any column
#   2. keep a single row per SupplierID
df = (
    spark.read.csv(bronze_suppliers_path, header=True, inferSchema=True)
    .dropna()
    .dropDuplicates(["SupplierID"])
)

# Define UDF to preprocess contact numbers
def preprocess_contact_number(number):
    """Normalise a contact number to the ``+1-XXX-XXX-XXXX`` layout.

    Every non-digit character is stripped first; the remaining digits are then
    handled as follows:

    * exactly 10 digits -> formatted with a ``+1`` prefix;
    * exactly 11 digits starting with ``1`` -> the leading country code is
      dropped and the remaining 10 digits are formatted;
    * anything else -> the digits-only string is returned unchanged, because
      it cannot be safely read as a 10-digit number (slicing it would
      silently produce a wrong number).

    Args:
        number: Raw contact number. May be ``None`` (a null cell) or a numeric
            value when the column type was inferred as a number.

    Returns:
        The formatted string, the digits-only string when the length is not
        recognised, or ``None`` when ``number`` is ``None``.
    """
    # A null cell reaches a Python UDF as None; pass it through instead of
    # letting re.sub raise TypeError.
    if number is None:
        return None

    # A whole-valued float would stringify with a trailing ".0", which would
    # add a spurious digit; collapse it to an int first.
    if isinstance(number, float) and number.is_integer():
        number = int(number)

    # str() makes numeric inputs acceptable to re.sub.
    digits = re.sub(r'\D', '', str(number))

    # Only an 11-digit value with a leading "1" carries a country code that
    # can be dropped safely.
    if len(digits) == 11 and digits[0] == "1":
        digits = digits[1:]

    if len(digits) == 10:
        return f"+1-{digits[:3]}-{digits[3:6]}-{digits[6:]}"

    # Too short, too long, or a different country code: return the cleaned
    # digits rather than inventing a number.
    return digits

# Expose the Python cleaner to Spark as a string-returning UDF
preprocess_contact_number_udf = udf(preprocess_contact_number, StringType())

# Simple structural check for e-mail addresses: local part, "@", domain, and a
# top-level domain of at least two letters
email_pattern = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"

# Destination for the cleaned data in the silver container
silver_suppliers_path = f"abfss://silver@{storage_account_name}.dfs.core.windows.net/Suppliers.parquet"

# Cleaning pipeline, applied in order:
#   1. normalise ContactNumber through the UDF
#   2. keep only rows whose Email matches the pattern
#   3. capitalise the first letter of each word in City
df = (
    df.withColumn("ContactNumber", preprocess_contact_number_udf(col("ContactNumber")))
    .filter(col("Email").rlike(email_pattern))
    .withColumn("City", initcap(col("City")))
)

# Preview the cleaned rows
df.show()

# Persist as Parquet, replacing any previous output at the same path
df.write.mode('overwrite').parquet(silver_suppliers_path)

print("Data cleaned and saved to Parquet format successfully.")
