Use the following Azure Databricks storage setup block only if you are using Azure Databricks. You can refer to the instructions here to get started:
https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/azure-datalake-gen2-sp-access

If you are using Synapse Spark and your data resides in the storage account attached to the Synapse Spark workspace, you can skip the storage setup section below.

In [None]:
%scala
// ADLS Gen2 account and container (file system) that the rest of the notebook reads from and writes to.
val storageAccountName = "<INSERT STORAGE ACCOUNT>"
val fileSystemName = "<INSERT CONTAINER NAME>"

// DFS endpoint host of the storage account; it is the suffix of every per-account Spark config key below.
val accountHost = storageAccountName + ".dfs.core.windows.net"

// Root abfss:// URI of the container.
val commonPath = "abfss://" + fileSystemName + "@" + accountHost

// AAD Application Details
// NOTE(review): do not commit a real secret in this cell; on Databricks consider
// reading it from a secret scope with dbutils.secrets.get instead of a literal.
val appID = "<INSERT APP ID>"
val secret = "<INSERT SECRET>"
val tenantID = "<INSERT TENANT ID>"

// Configure OAuth client-credentials (service principal) authentication for this storage account.
spark.conf.set("fs.azure.account.auth.type." + accountHost, "OAuth")
spark.conf.set("fs.azure.account.oauth.provider.type." + accountHost, "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set("fs.azure.account.oauth2.client.id." + accountHost, appID)
spark.conf.set("fs.azure.account.oauth2.client.secret." + accountHost, secret)
spark.conf.set("fs.azure.account.oauth2.client.endpoint." + accountHost, "https://login.microsoftonline.com/" + tenantID + "/oauth2/token")

// Turn the create-file-system-during-initialization flag on only around the
// listing of the container root, then switch it back off.
spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true")
dbutils.fs.ls(commonPath + "/")
spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "false")

In [None]:
from pyspark.sql.functions import *

# Sample trip records; every field, including tripDate (yyyyMMdd), is a string.
columnNames = ["tripId", "driverId", "customerId", "cabId", "tripDate", "startLocation", "endLocation"]
tripData = [
    ('100', '200', '300', '400', '20240101', 'New York', 'New Jersey'),
    ('101', '201', '301', '401', '20240102', 'Tempe', 'Phoenix'),
    ('102', '202', '302', '402', '20240103', 'San Jose', 'San Franciso'),
    ('103', '203', '303', '403', '20240102', 'New York', 'Boston'),
    ('104', '204', '304', '404', '20240103', 'New York', 'Washington'),
    ('105', '205', '305', '405', '20240201', 'Miami', 'Fort Lauderdale'),
    ('106', '206', '306', '406', '20240202', 'Seattle', 'Redmond'),
    ('107', '207', '307', '407', '20240203', 'Los Angeles', 'San Diego'),
    ('108', '208', '308', '408', '20240301', 'Phoenix', 'Las Vegas'),
    ('109', '209', '309', '409', '20240302', 'Washington', 'Baltimore'),
    ('110', '210', '310', '410', '20240303', 'Dallas', 'Austin'),
    ('111', '211', '311', '411', '20240303', 'New York', 'New Jersey'),
    ('112', '212', '312', '412', '20240304', 'New York', 'Boston'),
    ('113', '212', '312', '412', '20240401', 'San Jose', 'San Ramon'),
    ('114', '212', '312', '412', '20240404', 'San Jose', 'Oakland'),
    ('115', '212', '312', '412', '20240404', 'Tempe', 'Scottsdale'),
    ('116', '212', '312', '412', '20240405', 'Washington', 'Atlanta'),
    ('117', '212', '312', '412', '20240405', 'Seattle', 'Portland'),
    ('118', '212', '312', '412', '20240405', 'Miami', 'Tampa'),
]
df = spark.createDataFrame(data=tripData, schema=columnNames)

# Parse the yyyyMMdd tripDate string into a timestamp, then derive year / month /
# day columns from each row's trip date; these become the partition columns.
dftripDate = (
    df.withColumn("tripDate", to_timestamp(col("tripDate"), "yyyyMMdd"))
    .withColumn("year", date_format(col("tripDate"), "yyyy"))
    .withColumn("month", date_format(col("tripDate"), "MM"))
    .withColumn("day", date_format(col("tripDate"), "dd"))
)

dftripDate.show(truncate=False)

# Write the data as parquet files, one folder level per partition column
# (year=.../month=.../day=...), replacing any previous output at this path.
# NOTE(review): in the visible cells commonPath is only assigned as a Scala val in
# the storage setup cell - confirm it is also defined in the Python session.
dftripDate.write.partitionBy("year", "month", "day").mode("overwrite").parquet(commonPath + "/partition/")

In [None]:
# Partition pruning demo: read from the year=2024 folder and filter on the month
# partition column, so only the month=01 folder needs to be scanned and the
# other month folders are skipped.
readDF = (
    spark.read
    .parquet(commonPath + "/partition/year=2024")
    .where("month=01")
)
readDF.show(truncate=False)