### Access ADLS Gen2 Using a Service Principal



In [0]:
# OAuth (service principal) settings for the ADLS Gen2 account "azssdatalake".
#
# The client secret is read from a Databricks secret scope at run time instead of
# being embedded in the notebook. A secret that has been committed in plain text
# should be treated as compromised and rotated.
#
# NOTE(review): the scope and key names below are placeholders; point them at the
# secret scope/key where this service principal's secret is actually stored.
SECRET_SCOPE = "adls-sp-scope"
SECRET_KEY = "adls-sp-client-secret"

configs = {
    "fs.azure.account.auth.type.azssdatalake.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.azssdatalake.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.azssdatalake.dfs.core.windows.net": "ffaa1217-3787-4607-babf-9690297b95e7",
    # Resolved at run time; Databricks redacts secret values in notebook output.
    "fs.azure.account.oauth2.client.secret.azssdatalake.dfs.core.windows.net": dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY),
    "fs.azure.account.oauth2.client.endpoint.azssdatalake.dfs.core.windows.net": "https://login.microsoftonline.com/851bbc80-7ce3-49a4-b345-17c81b8d1813/oauth2/token"
}


In [0]:
# Apply every OAuth setting from `configs` to the current Spark session.
for setting_name in configs:
    spark.conf.set(setting_name, configs[setting_name])

## Display Files in ADLS Container

In [0]:
# List the top-level contents of the "silver" container.
silver_root = 'abfss://silver@azssdatalake.dfs.core.windows.net'
display(dbutils.fs.ls(silver_root))

## Reading File With Defined Schema

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Bind `spark` to a session with app name "proj". getOrCreate() returns the
# already-active session when one exists and only builds a new one otherwise.
spark = SparkSession.builder.appName("proj").getOrCreate()

# Explicit schema for the product CSV files, built field by field.
# Each add() call is (column name, Spark type, nullable); only ProductID is
# declared non-nullable.
my_Schema = (
    StructType()
    .add("ProductID", IntegerType(), False)
    .add("ProductName", StringType(), True)
    .add("SupplierID", IntegerType(), True)
    .add("CategoryID", IntegerType(), True)
    .add("QuantityPerUnit", StringType(), True)
    .add("UnitPrice", DoubleType(), True)
    .add("UnitsInStock", IntegerType(), True)
    .add("UnitsOnOrder", IntegerType(), True)
    .add("ReOrderLevel", IntegerType(), True)
    .add("Discontinued", BooleanType(), True)
)
# Read the CSV data under the Project folder, treating the first line as a
# header and applying the explicit schema rather than inferring types.
project_path = "abfss://silver@azssdatalake.dfs.core.windows.net/Project/"
csv_reader = spark.read.format("csv").option("header", "true").schema(my_Schema)
df = csv_reader.load(project_path)
display(df)

## Formatting Column

In [0]:
# Split the free-text QuantityPerUnit column into two derived columns:
#   UnitSize       - the text captured after the leading digits and the
#                    non-digit run that follows them, trimmed
#   QuantityOfUnit - the leading run of digits, cast to double
# The final select() fixes the column order and drops QuantityPerUnit.
quantity_text = col("QuantityPerUnit")
unit_size = trim(regexp_extract(quantity_text, r"^\d+\D*(.*)", 1)).cast(StringType())
quantity_of_unit = regexp_extract(quantity_text, r"^(\d+)", 1).cast(DoubleType())

df = (
    df
    .withColumn("UnitSize", unit_size)
    .withColumn("QuantityOfUnit", quantity_of_unit)
    .select(
        "ProductID",
        "ProductName",
        "SupplierID",
        "CategoryID",
        "QuantityOfUnit",
        "UnitSize",
        "UnitPrice",
        "UnitsInStock",
        "UnitsOnOrder",
        "ReOrderLevel",
        "Discontinued",
    )
)

display(df)

## Creating Price Segment

In [0]:
# Price segmentation: compare each product's price with the average price of
# its category (rounded to 2 decimals).
# `round` and `avg` here are the Spark column functions brought in by the
# wildcard import from pyspark.sql.functions, not Python built-ins.
df_avg = df.groupBy("CategoryID").agg(
    round(avg("UnitPrice"), 2).alias("AvPricePerCat")
)

# Left join keeps every product row. A product is "Premium" only when its price
# is strictly above the category average; everything else is "Economy".
is_premium = col("UnitPrice") > col("AvPricePerCat")
df = df.join(df_avg, on="CategoryID", how="left")
df = df.withColumn("PriceSegment", when(is_premium, "Premium").otherwise("Economy"))
display(df)

## Products to Re-Order

In [0]:
# Flag products whose stock on hand plus stock already on order does not
# exceed the re-order level.
units_available = col("UnitsInStock") + col("UnitsOnOrder")
needs_reorder = units_available <= col("ReOrderLevel")
df = df.withColumn("ReOrder", when(needs_reorder, "Yes").otherwise("No"))
display(df)
#df.filter(col("ReOrder") == "Yes").select("ProductId","ProductName", "ReOrder").show()

## Products: In Stock or Out of Stock

In [0]:

# Label each product by whether any units are currently in stock.
has_stock = col("UnitsInStock") > 0
stock_status = when(has_stock, "InStock").otherwise("OutOfStock")
df = df.withColumn("StockStatus", stock_status)
display(df)

## Stock Value

In [0]:

# Value of the stock on hand: units in stock times unit price, to 2 decimals.
stock_value = round(col("UnitsInStock") * col("UnitPrice"), 2)
df = df.withColumn("StockValue", stock_value)
display(df)

## Trimmed Product Names

In [0]:

# Strip leading/trailing whitespace from product names.
# Spark DataFrames are immutable: withColumn() returns a new DataFrame, so the
# result must be assigned back, otherwise the trim is silently discarded and
# display() shows the untrimmed names.
df = df.withColumn("ProductName", trim(col("ProductName")))
display(df)