In [0]:
# List the mount points currently registered, to see what is already mounted
# before creating a new mount further down.
dbutils.fs.mounts()

In [0]:
# Manual cleanup (intentionally disabled): uncomment to detach the "/mnt/rainbowstorage" mount.
# dbutils.fs.unmount("/mnt/rainbowstorage")

In [0]:
# Show the built-in help text for the dbutils.fs utility.
dbutils.fs.help()

In [0]:
# List the registered mount points again (same call as in the first cell).
dbutils.fs.mounts()

In [0]:
# Credentials are read from a Databricks secret scope rather than hard-coded.
# The key names (including the "tenent" spelling) are reproduced verbatim;
# presumably they must match the keys stored in the scope -- verify before renaming.
secret_scope = "j2dtech-secrets3"
client_id = dbutils.secrets.get(scope=secret_scope, key="client")
tenent_id = dbutils.secrets.get(scope=secret_scope, key="tenent-id")
secret = dbutils.secrets.get(scope=secret_scope, key="secret-id")

# Storage account, container, and the path later used as the mount point.
storage_acc_name, container_name = "aldlsj2dtechstorage", "j2dadlscontainer"
mount_path = "/mnt/j2dadlscontainer"

In [0]:
# Service principal: OAuth settings used to mount the storage container

In [0]:
# OAuth settings handed to dbutils.fs.mount as extra_configs.
# The token endpoint is built from the tenant id read from the secret scope.
oauth_provider_class = "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
oauth_token_endpoint = f"https://login.microsoftonline.com/{tenent_id}/oauth2/token"

configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": oauth_provider_class,
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": secret,
    "fs.azure.account.oauth2.client.endpoint": oauth_token_endpoint,
}

In [0]:
# Mount the storage container at `mount_path`.
#
# The source URI is assembled from `container_name` and `storage_acc_name`
# (defined in the configuration cell) instead of repeating the literals, so
# the names only need to be changed in one place.
#
# dbutils.fs.mount raises if the mount point is already in use, which would
# make the notebook fail on every re-run. The existing mounts are checked
# first and the mount is skipped when `mount_path` is already registered.
source_uri = f"abfss://{container_name}@{storage_acc_name}.dfs.core.windows.net/"

if any(mount.mountPoint == mount_path for mount in dbutils.fs.mounts()):
    print(f"{mount_path} is already mounted; skipping mount.")
else:
    dbutils.fs.mount(
        source=source_uri,
        mount_point=mount_path,
        extra_configs=configs,
    )

In [0]:
# List the root of the mounted container (Python form of the `%fs ls` shorthand).
display(dbutils.fs.ls("/mnt/j2dadlscontainer"))

In [0]:
# List the contents of the raw_datasets folder under the container mount.
dbutils.fs.ls("dbfs:/mnt/j2dadlscontainer/raw_datasets/")

In [0]:
# Same raw_datasets listing, rendered as a table (Python form of `%fs ls`).
display(dbutils.fs.ls("dbfs:/mnt/j2dadlscontainer/raw_datasets/"))

In [0]:
# Load the car price CSV from the mounted container.
# The first row is treated as the header and column types are inferred.
cars_df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("/mnt/j2dadlscontainer/raw_datasets/car_price_dataset.csv")
)
display(cars_df)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window 

In [0]:
# Number of rows per fuel type, counting a constant so every row is included.
rows_per_group = F.count(F.lit(1)).alias("no_of_cars")
grouped_df = cars_df.groupBy("Fuel_Type").agg(rows_per_group)
display(grouped_df)

In [0]:
# Same aggregation, but counting the Fuel_Type column itself.
# NOTE(review): counting a column skips nulls in Spark, so a null Fuel_Type
# group would presumably report 0 here -- confirm against the data.
fuel_type_count = F.count(F.col("Fuel_Type")).alias("no_of_cars")
grouped_df = cars_df.groupBy("Fuel_Type").agg(fuel_type_count)
display(grouped_df)

In [0]:
# Shortest form of the per-fuel-type row count; the result column is named "count".
display(cars_df.groupBy("Fuel_Type").count())

In [0]:
# Keep only diesel cars with an automatic transmission.
# Note: this rebinds `cars_df`, so any earlier cell re-run after this one
# sees the filtered frame rather than the full dataset.
is_diesel = F.col("Fuel_Type") == "Diesel"
is_automatic = F.col("Transmission") == "Automatic"

cars_df = cars_df.filter(is_diesel & is_automatic)
display(cars_df)

In [0]:
# Pick one row per Brand: newest Year first, then lowest Mileage, fewest
# owners, and lowest Price as successive tie-breakers.
ranking_order = [
    F.desc("Year"),
    F.asc("Mileage"),
    F.asc("Owner_Count"),
    F.asc("Price"),
]
windowSpec = Window.partitionBy("Brand").orderBy(*ranking_order)

# Number the rows inside each brand, keep the first, and drop the helper column.
row_position = F.row_number().over(windowSpec)
ranked_cars_df = (
    cars_df
    .withColumn("rn", row_position)
    .where(F.col("rn") == 1)
    .drop("rn")
)

ranked_cars_df.display()

In [0]:
# Persist the top-ranked row per brand as Parquet, partitioned by Brand.
# Without an explicit save mode the writer fails when the target path already
# exists, so the notebook could only be run once. Overwrite makes the cell
# re-runnable; each run replaces the previous output at this path.
(
    ranked_cars_df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("Brand")
    .save("/mnt/j2dadlscontainer/processed_datasets/BestCars")
)