### Analyze the query plan

In [0]:
spark.sql("""
SELECT *
FROM workspace.ecommerce.ecommerce_events_delta
WHERE event_type = 'purchase'
""").explain(mode="extended")

### Collect table statistics

In [0]:
%sql
ANALYZE TABLE workspace.ecommerce.ecommerce_events_delta
COMPUTE STATISTICS;


### Partitioned Delta table

In [0]:
%sql
CREATE TABLE workspace.ecommerce.ecommerce_events_delta_part
USING DELTA
PARTITIONED BY (event_date)
AS
SELECT *, DATE(event_time) AS event_date
FROM workspace.ecommerce.ecommerce_events_delta;

In [0]:
%sql
SHOW PARTITIONS workspace.ecommerce.ecommerce_events_delta_part;


### Optimize file layout + apply ZORDER

In [0]:
%sql
OPTIMIZE workspace.ecommerce.ecommerce_events_delta
ZORDER BY (user_id, product_id);

### Benchmark performance properly (before vs after)

In [0]:
import time

start = time.time()
spark.sql("""
SELECT COUNT(*)
FROM workspace.ecommerce.ecommerce_events_delta
WHERE user_id = 12345
""").collect()

print(f"Baseline time: {time.time() - start:.2f}s")

In [0]:
start = time.time()
spark.sql("""
SELECT COUNT(*)
FROM workspace.ecommerce.ecommerce_events_delta
WHERE user_id = 12345
""").collect()

print(f"Optimized time: {time.time() - start:.2f}s")


### Cache

In [0]:
events_cached = spark.table("workspace.ecommerce.ecommerce_events_delta").cache()

# Materialize cache
events_cached.count()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4898009473685540>, line 1[0m
[0;32m----> 1[0m events_cached [38;5;241m=[39m spark[38;5;241m.[39mtable([38;5;124m"[39m[38;5;124mworkspace.ecommerce.ecommerce_events_delta[39m[38;5;124m"[39m)[38;5;241m.[39mcache()
[1;32m      3[0m [38;5;66;03m# Materialize cache[39;00m
[1;32m      4[0m events_cached[38;5;241m.[39mcount()

File [0;32m/databricks/python/lib/python3.12/site-packages/pyspark/sql/connect/dataframe.py:2093[0m, in [0;36mDataFrame.cache[0;34m(self)[0m
[1;32m   2092[0m [38;5;28;01mdef[39;00m [38;5;21mcache[39m([38;5;28mself[39m) [38;5;241m-[39m[38;5;241m>[39m ParentDataFrame:
[0;32m-> 2093[0m     [38;5;28;01mreturn[39;00m [38;5;28mself[39m[38;5;241m.[39mpersist()

File [0;32m/databricks/python/lib/python3.12/site-packages/pyspark/sql/connec