In [0]:
%sql
-- Preview the raw source table that the rest of the notebook builds on.
SELECT *
FROM workspace.default.sample_dataset;

order_id,order_timestamp,customer_id,country,amount,currency,status
O-1001,2025-11-01T10:00:00.000Z,C-0001,US,66.77,USD,CANCELLED
O-1002,2025-11-01T10:07:00.000Z,C-0002,AU,358.12,AUD,CREATED
O-1003,2025-11-01T10:14:00.000Z,C-0003,US,1024.75,USD,CREATED
O-1004,2025-11-01T10:21:00.000Z,C-0004,US,898.02,USD,PAID
O-1005,2025-11-01T10:28:00.000Z,C-0005,US,167.73,USD,PAID
O-1006,2025-11-01T10:35:00.000Z,C-0006,GB,914.97,GBP,CREATED
O-1007,2025-11-01T10:42:00.000Z,C-0007,AU,1082.55,AUD,CREATED
O-1008,2025-11-01T10:49:00.000Z,C-0008,GB,646.69,GBP,CANCELLED
O-1009,2025-11-01T10:56:00.000Z,C-0009,GB,438.94,GBP,PAID
O-1010,2025-11-01T11:03:00.000Z,C-0010,AU,1056.26,AUD,CANCELLED


In [0]:
# Load the source dataset from the catalog into a DataFrame.
# `df` is the raw input that the following cell derives `order_date` from.
df = spark.read.table("workspace.default.sample_dataset")
display(df)


order_id,order_timestamp,customer_id,country,amount,currency,status
O-1001,2025-11-01T10:00:00.000Z,C-0001,US,66.77,USD,CANCELLED
O-1002,2025-11-01T10:07:00.000Z,C-0002,AU,358.12,AUD,CREATED
O-1003,2025-11-01T10:14:00.000Z,C-0003,US,1024.75,USD,CREATED
O-1004,2025-11-01T10:21:00.000Z,C-0004,US,898.02,USD,PAID
O-1005,2025-11-01T10:28:00.000Z,C-0005,US,167.73,USD,PAID
O-1006,2025-11-01T10:35:00.000Z,C-0006,GB,914.97,GBP,CREATED
O-1007,2025-11-01T10:42:00.000Z,C-0007,AU,1082.55,AUD,CREATED
O-1008,2025-11-01T10:49:00.000Z,C-0008,GB,646.69,GBP,CANCELLED
O-1009,2025-11-01T10:56:00.000Z,C-0009,GB,438.94,GBP,PAID
O-1010,2025-11-01T11:03:00.000Z,C-0010,AU,1056.26,AUD,CANCELLED


In [0]:
from pyspark.sql.functions import to_date, col
# Derive order_date (DATE) from order_timestamp; it is used as a partition
# column when the table is written.
order_date_expr = to_date(col("order_timestamp"))
df2 = df.withColumn("order_date", order_date_expr)
display(df2)


order_id,order_timestamp,customer_id,country,amount,currency,status,order_date
O-1001,2025-11-01T10:00:00.000Z,C-0001,US,66.77,USD,CANCELLED,2025-11-01
O-1002,2025-11-01T10:07:00.000Z,C-0002,AU,358.12,AUD,CREATED,2025-11-01
O-1003,2025-11-01T10:14:00.000Z,C-0003,US,1024.75,USD,CREATED,2025-11-01
O-1004,2025-11-01T10:21:00.000Z,C-0004,US,898.02,USD,PAID,2025-11-01
O-1005,2025-11-01T10:28:00.000Z,C-0005,US,167.73,USD,PAID,2025-11-01
O-1006,2025-11-01T10:35:00.000Z,C-0006,GB,914.97,GBP,CREATED,2025-11-01
O-1007,2025-11-01T10:42:00.000Z,C-0007,AU,1082.55,AUD,CREATED,2025-11-01
O-1008,2025-11-01T10:49:00.000Z,C-0008,GB,646.69,GBP,CANCELLED,2025-11-01
O-1009,2025-11-01T10:56:00.000Z,C-0009,GB,438.94,GBP,PAID,2025-11-01
O-1010,2025-11-01T11:03:00.000Z,C-0010,AU,1056.26,AUD,CANCELLED,2025-11-01


In [0]:
# Persist the enriched orders as a managed Delta table, partitioned by
# country and order_date. mode("overwrite") replaces any existing table.
partition_cols = ["country", "order_date"]
(
    df2.write
    .format("delta")
    .mode("overwrite")
    .partitionBy(*partition_cols)
    .saveAsTable("workspace.default.shopez_orders")
)




In [0]:
%sql
--  Verify the partition structure
-- SHOW PARTITIONS returns one row per (country, order_date) partition.
-- It is the only statement in this cell because a %sql cell renders just the
-- result of its last statement; an extra SELECT * here would scan the whole
-- table and its rows would never be displayed.
SHOW PARTITIONS workspace.default.shopez_orders;


country,order_date
GB,2025-11-12
AU,2025-11-13
CA,2025-11-24
US,2025-11-13
GB,2025-11-04
US,2025-11-08
GB,2025-11-03
IN,2025-11-15
GB,2025-11-16
CA,2025-11-09


In [0]:
#Run queries that demonstrate partition pruning
# Rebuild the partitioned table directly from the source dataset so this cell
# can run on its own (it performs the same overwrite as the earlier write cell).
spark.table("workspace.default.sample_dataset") \
    .withColumn("order_date", to_date("order_timestamp")) \
    .write.format("delta") \
    .mode("overwrite") \
    .partitionBy("country", "order_date") \
    .saveAsTable("workspace.default.shopez_orders")

# Verify the pruning
# Use a dedicated name instead of rebinding `df`: `df` holds the raw source
# dataset, and re-running the earlier cells after it had been pointed at
# shopez_orders would make them read from the table they overwrite.
orders_partitioned = spark.read.table("workspace.default.shopez_orders")

# Filter on the partition column `country` and print the parsed, analyzed,
# optimized and physical plans so the partition filter can be inspected.
orders_partitioned.filter("country = 'IN'") \
  .explain(True)



== Parsed Logical Plan ==
'Filter ('country = IN)
+- 'UnresolvedRelation [workspace, default, shopez_orders], [], false

== Analyzed Logical Plan ==
order_id: string, order_timestamp: timestamp, customer_id: string, country: string, amount: double, currency: string, status: string, order_date: date
Filter (country#13215 = IN)
+- SubqueryAlias workspace.default.shopez_orders
   +- Relation workspace.default.shopez_orders[order_id#13212,order_timestamp#13213,customer_id#13214,country#13215,amount#13216,currency#13217,status#13218,order_date#13219] parquet

== Optimized Logical Plan ==
Filter (isnotnull(country#13215) AND (country#13215 = IN))
+- Relation workspace.default.shopez_orders[order_id#13212,order_timestamp#13213,customer_id#13214,country#13215,amount#13216,currency#13217,status#13218,order_date#13219] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- PhotonResultStage
   +- PhotonProject [order_id#13212, order_timestamp#13213, customer_id#13214, country#13215, amount#13216, cu

In [0]:
%sql
-- Demonstrate Delta Lake Time Travel
-- List every committed version of the table with its operation and metrics;
-- the `version` column holds the numbers a time-travel read can target.
DESCRIBE HISTORY workspace.default.shopez_orders;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2025-12-01T18:17:50.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
1,2025-12-01T18:16:24.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-12-01T18:09:53.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
# Time travel: read the table as it was at version 0 and print the first rows.
orders_v0 = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .table("workspace.default.shopez_orders")
)
orders_v0.show()


+--------+-------------------+-----------+-------+-------+--------+---------+----------+
|order_id|    order_timestamp|customer_id|country| amount|currency|   status|order_date|
+--------+-------------------+-----------+-------+-------+--------+---------+----------+
|  O-2562|2025-11-09 00:07:00|     C-1562|     GB| 958.49|     GBP|  CREATED|2025-11-09|
|  O-2563|2025-11-09 00:14:00|     C-1563|     GB| 625.55|     GBP|  CREATED|2025-11-09|
|  O-2569|2025-11-09 00:56:00|     C-1569|     GB| 163.11|     GBP|CANCELLED|2025-11-09|
|  O-2570|2025-11-09 01:03:00|     C-1570|     GB|  45.57|     GBP|     PAID|2025-11-09|
|  O-2571|2025-11-09 01:10:00|     C-1571|     GB|1035.72|     GBP|  CREATED|2025-11-09|
|  O-2573|2025-11-09 01:24:00|     C-1573|     GB|1424.61|     GBP|  CREATED|2025-11-09|
|  O-2578|2025-11-09 01:59:00|     C-1578|     GB|1364.36|     GBP|CANCELLED|2025-11-09|
|  O-2583|2025-11-09 02:34:00|     C-1583|     GB| 830.19|     GBP|     PAID|2025-11-09|
|  O-2585|2025-11-09 

In [0]:
# Append new data
from pyspark.sql.functions import to_timestamp, to_date, lit, col

# Timestamps arrive as strings and are cast below, so the input schema
# declares order_timestamp as STRING.
schema = "order_id STRING, order_timestamp STRING, customer_id STRING, country STRING, amount DOUBLE, currency STRING, status STRING"

# Two temporary orders (ids prefixed "tt-") that the next cell updates.
new_data = [
    ("tt-o-9001", "2025-12-01 19:00:00", "c-9001", "US", 250.0, "USD", "CREATED"),
    ("tt-o-9002", "2025-12-01 19:05:00", "c-9002", "IN", 75.0, "INR", "CREATED"),
]

df_new = (
    spark.createDataFrame(new_data, schema=schema)
    .withColumn("order_timestamp", to_timestamp("order_timestamp"))
    .withColumn("order_date", to_date(col("order_timestamp")))
)

# Append to the existing table using the same partition columns it was created with.
(
    df_new.write
    .format("delta")
    .mode("append")
    .partitionBy("country", "order_date")
    .saveAsTable("workspace.default.shopez_orders")
)


In [0]:
# Update some rows
from delta.tables import DeltaTable

dt = DeltaTable.forName(spark, "workspace.default.shopez_orders")

# Mark the two temporary orders appended above as PAID.
# Values in the `set` mapping are SQL expressions, hence the quoted literal.
temp_orders_predicate = "order_id IN ('tt-o-9001','tt-o-9002')"
status_assignment = {"status": "'PAID'"}
dt.update(condition=temp_orders_predicate, set=status_assignment)


DataFrame[num_affected_rows: bigint]

In [0]:
%sql
-- View all versions again
-- The append and the update should now appear as new WRITE and UPDATE
-- entries on top of the earlier CREATE OR REPLACE versions.
DESCRIBE HISTORY workspace.default.shopez_orders;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2025-12-01T18:27:54.000Z,74725293153598,theoryofnumbers123@gmail.com,UPDATE,"Map(predicate -> [""order_id#14469 IN (tt-o-9001,tt-o-9002)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,3.0,WriteSerializable,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 3518, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 4068, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1539, numAddedFiles -> 2, numUpdatedRows -> 2, numAddedBytes -> 3486, rewriteTimeMs -> 2500)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
3,2025-12-01T18:27:26.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> true, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,2.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 3518)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
2,2025-12-01T18:17:50.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
1,2025-12-01T18:16:24.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-12-01T18:09:53.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
# Schema Evolution — add payment_method & coupon_code and append with mergeSchema
from pyspark.sql.functions import lit

# Same columns as before plus the two new trailing STRING columns.
schema2 = "order_id STRING, order_timestamp STRING, customer_id STRING, country STRING, amount DOUBLE, currency STRING, status STRING, payment_method STRING, coupon_code STRING"

# Two orders (ids prefixed "se-"); the first has no coupon (NULL coupon_code).
data_with_new_cols = [
    ("se-o-1001", "2025-12-02 08:00:00", "c-1001", "IN", 120.0, "INR", "PAID", "UPI", None),
    ("se-o-1002", "2025-12-02 09:30:00", "c-1002", "US", 410.0, "USD", "PAID", "CARD", "SALE50"),
]

df_se = (
    spark.createDataFrame(data_with_new_cols, schema=schema2)
    .withColumn("order_timestamp", to_timestamp("order_timestamp"))
    .withColumn("order_date", to_date(col("order_timestamp")))
)

# mergeSchema lets this append add the new columns to the table schema.
(
    df_se.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .partitionBy("country", "order_date")
    .saveAsTable("workspace.default.shopez_orders")
)


In [0]:
%sql
-- Verify schema
-- After the mergeSchema append, payment_method and coupon_code should be
-- listed as additional string columns.
DESCRIBE TABLE workspace.default.shopez_orders;


col_name,data_type,comment
order_id,string,
order_timestamp,timestamp,
customer_id,string,
country,string,
amount,double,
currency,string,
status,string,
order_date,date,
payment_method,string,
coupon_code,string,


In [0]:
%sql
-- Verify data: the two schema-evolution orders carry values in the new columns
SELECT
  order_id,
  payment_method,
  coupon_code
FROM workspace.default.shopez_orders
WHERE order_id IN ('se-o-1001', 'se-o-1002');


order_id,payment_method,coupon_code
se-o-1002,CARD,SALE50
se-o-1001,UPI,


In [0]:
%sql
-- Updates & Deletes using Delta
-- Statements run in order; each UPDATE/DELETE commits its own table version.
-- update: cancel every order below 50, whatever its current status
UPDATE workspace.default.shopez_orders
SET status = 'CANCELLED'
WHERE amount < 50;
-- delete: remove orders below 5
DELETE FROM workspace.default.shopez_orders WHERE amount < 5;
-- verify
-- NOTE(review): only the last statement's result (DESCRIBE HISTORY) is rendered
-- for this cell, so the rows returned by this SELECT are not shown; consider
-- moving it to its own cell. It also samples the 50 latest orders rather than
-- the rows with amount < 50 that the UPDATE touched.
SELECT order_id, amount, status FROM workspace.default.shopez_orders ORDER BY order_timestamp DESC LIMIT 50;
DESCRIBE HISTORY workspace.default.shopez_orders;



version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
7,2025-12-01T18:33:05.000Z,74725293153598,theoryofnumbers123@gmail.com,DELETE,"Map(predicate -> [""(amount#16893 < 5.0)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,6.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 367, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 364, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
6,2025-12-01T18:33:03.000Z,74725293153598,theoryofnumbers123@gmail.com,UPDATE,"Map(predicate -> [""(amount#16156 < 50.0)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,5.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 54, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 5632, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1144, numAddedFiles -> 54, numUpdatedRows -> 65, numAddedBytes -> 115568, rewriteTimeMs -> 4487)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
5,2025-12-01T18:30:02.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,4.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4313)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
4,2025-12-01T18:27:54.000Z,74725293153598,theoryofnumbers123@gmail.com,UPDATE,"Map(predicate -> [""order_id#14469 IN (tt-o-9001,tt-o-9002)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,3.0,WriteSerializable,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 3518, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 4068, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1539, numAddedFiles -> 2, numUpdatedRows -> 2, numAddedBytes -> 3486, rewriteTimeMs -> 2500)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
3,2025-12-01T18:27:26.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> true, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,2.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 3518)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
2,2025-12-01T18:17:50.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
1,2025-12-01T18:16:24.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-12-01T18:09:53.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
%sql
-- Optimize the table: Use OPTIMIZE and optionally ZORDER on customer_id or order_date.
-- Current history / numFiles (before optimize)
-- NOTE(review): only the last statement's result is rendered for this cell, so
-- this "before" history is not visible; run it in its own cell to compare.
DESCRIBE HISTORY workspace.default.shopez_orders;
-- Optimize entire table
OPTIMIZE workspace.default.shopez_orders;
-- Using ZORDER
-- NOTE(review): in the recorded history the newest entry is the plain OPTIMIZE
-- (zOrderBy -> []) and no ZORDER commit follows it, presumably because the
-- files had just been compacted by the statement above; confirm.
OPTIMIZE workspace.default.shopez_orders ZORDER BY (customer_id);
-- Verify optimization
DESCRIBE HISTORY workspace.default.shopez_orders;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
8,2025-12-01T18:35:45.000Z,74725293153598,theoryofnumbers123@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,7.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 108, numRemovedBytes -> 253839, p25FileSize -> 2885, numDeletionVectorsRemoved -> 54, minFileSize -> 2523, numAddedFiles -> 54, maxFileSize -> 3102, p75FileSize -> 2965, p50FileSize -> 2928, numAddedBytes -> 157953)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
7,2025-12-01T18:33:05.000Z,74725293153598,theoryofnumbers123@gmail.com,DELETE,"Map(predicate -> [""(amount#16893 < 5.0)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,6.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 367, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 364, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
6,2025-12-01T18:33:03.000Z,74725293153598,theoryofnumbers123@gmail.com,UPDATE,"Map(predicate -> [""(amount#16156 < 50.0)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,5.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 54, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 5632, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1144, numAddedFiles -> 54, numUpdatedRows -> 65, numAddedBytes -> 115568, rewriteTimeMs -> 4487)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
5,2025-12-01T18:30:02.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,4.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4313)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
4,2025-12-01T18:27:54.000Z,74725293153598,theoryofnumbers123@gmail.com,UPDATE,"Map(predicate -> [""order_id#14469 IN (tt-o-9001,tt-o-9002)""])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,3.0,WriteSerializable,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 3518, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 4068, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1539, numAddedFiles -> 2, numUpdatedRows -> 2, numAddedBytes -> 3486, rewriteTimeMs -> 2500)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
3,2025-12-01T18:27:26.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> true, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,2.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 3518)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
2,2025-12-01T18:17:50.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
1,2025-12-01T18:16:24.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 125, numRemovedBytes -> 319354, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
0,2025-12-01T18:09:53.000Z,74725293153598,theoryofnumbers123@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [""country"",""order_date""], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,,WriteSerializable,False,"Map(numFiles -> 125, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 5000, numOutputBytes -> 319354)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
#Show small-file problem and how OPTIMIZE helps
from datetime import datetime
import random

# Every loop iteration commits its own tiny append, so the table accumulates
# one new Delta version (and one or two very small files) per batch.
N_BATCHES = 150
ROWS_PER_BATCH = 2

# Country codes must match the values already stored in the table ("GB", not
# "UK"), otherwise a stray partition is created. Each country is paired with
# the currency used for it elsewhere in this notebook.
COUNTRY_CURRENCY = {"US": "USD", "IN": "INR", "GB": "GBP"}
COUNTRIES = list(COUNTRY_CURRENCY)

# Loop-invariant values, built once instead of on every iteration.
s_schema = "order_id STRING, order_timestamp timestamp, customer_id STRING, country STRING, amount DOUBLE, currency STRING, status STRING"
batch_ts = datetime(2025, 12, 3, 10, 0, 0)

# Dedicated seeded generator: re-running the cell assigns the same countries
# and the global `random` state is left untouched.
rng = random.Random(42)

for i in range(N_BATCHES):
    tiny = []
    for j in range(ROWS_PER_BATCH):
        country = rng.choice(COUNTRIES)
        tiny.append(
            (f"sf-{i}-{j}", batch_ts, f"csf-{j%20}", country, float(j+1), COUNTRY_CURRENCY[country], "CREATED")
        )
    s_df = spark.createDataFrame(tiny, s_schema).withColumn("order_date", to_date(col("order_timestamp")))
    # append many tiny files into multiple partitions
    s_df.write.format("delta").mode("append").partitionBy("country","order_date").saveAsTable("workspace.default.shopez_orders")


In [0]:
%sql
-- Observing small-file problem
-- Each loop iteration shows up as its own WRITE version whose operationMetrics
-- report only a couple of rows spread over one or two files.
DESCRIBE HISTORY workspace.default.shopez_orders;


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
315,2025-12-01T18:47:19.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,314.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
314,2025-12-01T18:47:17.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,313.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
313,2025-12-01T18:47:15.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,312.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
312,2025-12-01T18:47:13.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,311.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
311,2025-12-01T18:47:11.001Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,309.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223, conflictDetectionTimeMs -> 32)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
310,2025-12-01T18:47:11.000Z,74725293153598,theoryofnumbers123@gmail.com,OPTIMIZE,"Map(predicate -> [""(('country <=> cast(IN as string)) AND ('order_date <=> cast(2025-12-03 as date)))""], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,308.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 16, numRemovedBytes -> 35083, p25FileSize -> 2949, numDeletionVectorsRemoved -> 0, conflictDetectionTimeMs -> 31, minFileSize -> 2949, numAddedFiles -> 1, maxFileSize -> 2949, p75FileSize -> 2949, p50FileSize -> 2949, numAddedBytes -> 2949)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
309,2025-12-01T18:47:09.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,308.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 2244)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
308,2025-12-01T18:47:07.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,307.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 2245)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
307,2025-12-01T18:47:06.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,306.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 2245)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
306,2025-12-01T18:47:04.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,305.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13


In [0]:
%sql
-- The small-file cell was run twice, which produced about 300 new versions
-- with many small (and possibly duplicate) files.
-- Compact them now and co-locate rows by customer_id.
OPTIMIZE workspace.default.shopez_orders ZORDER BY (customer_id);



path,metrics
,"List(3, 30, List(2870, 2976, 2921.6666666666665, 3, 8765), List(2111, 2949, 2214.5666666666666, 30, 66437), 132, List(minCubeSize(107374182400), List(0, 0), List(159, 413272), 0, List(30, 66437), 3, null), null, 0, 1, 159, 129, false, 0, 0, 1764615130055, 1764615134420, 8, 3, null, List(0, 0), null, 10, 10, 710, 0, null)"


In [0]:
%sql
-- Let's see the optimized version
DESCRIBE HISTORY workspace.default.shopez_orders;
-- The latest version has operation = OPTIMIZE. Several earlier OPTIMIZE entries happened automatically (their operationParameters show auto -> true); the latest one shows auto -> false because it was a manual optimization.


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
316,2025-12-01T18:52:14.000Z,74725293153598,theoryofnumbers123@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> false, clusterBy -> [], zOrderBy -> [""customer_id""], batchId -> 0)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,315.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 30, numRemovedBytes -> 66437, p25FileSize -> 2870, numDeletionVectorsRemoved -> 0, minFileSize -> 2870, numAddedFiles -> 3, maxFileSize -> 2976, p75FileSize -> 2976, p50FileSize -> 2919, numAddedBytes -> 8765)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
315,2025-12-01T18:47:19.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,314.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
314,2025-12-01T18:47:17.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,313.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
313,2025-12-01T18:47:15.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,312.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
312,2025-12-01T18:47:13.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,311.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
311,2025-12-01T18:47:11.001Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,309.0,WriteSerializable,True,"Map(numFiles -> 2, numOutputRows -> 2, numOutputBytes -> 4223, conflictDetectionTimeMs -> 32)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
310,2025-12-01T18:47:11.000Z,74725293153598,theoryofnumbers123@gmail.com,OPTIMIZE,"Map(predicate -> [""(('country <=> cast(IN as string)) AND ('order_date <=> cast(2025-12-03 as date)))""], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,308.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 16, numRemovedBytes -> 35083, p25FileSize -> 2949, numDeletionVectorsRemoved -> 0, conflictDetectionTimeMs -> 31, minFileSize -> 2949, numAddedFiles -> 1, maxFileSize -> 2949, p75FileSize -> 2949, p50FileSize -> 2949, numAddedBytes -> 2949)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
309,2025-12-01T18:47:09.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,308.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 2244)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
308,2025-12-01T18:47:07.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,307.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 2245)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
307,2025-12-01T18:47:06.000Z,74725293153598,theoryofnumbers123@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(2456404513182676),1201-180437-6bbf1vw0-v2n,306.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 2245)",,Databricks-Runtime/17.2.x-aarch64-photon-scala2.13
