## ZORDER IN DATABRICKS (DELTA LAKE)
### ZORDER:
- ZORDER is an optional feature used with OPTIMIZE to colocate related data physically in the same set of files.
- It improves query performance for range or equality filters on the specified columns.
### USAGE:
- OPTIMIZE table_name [WHERE predicate] ZORDER BY (col1, col2, ...)
- Reduces the number of files scanned (via data skipping) for queries that filter on the ZORDER columns.
- Works best for columns used frequently in WHERE clauses.
- Only reorganizes existing data; does not add or remove rows.
### EXAMPLE USE CASE:
- Periodically optimize large Delta tables with frequent writes/updates.
- Use ZORDER on high-cardinality columns that are frequently used in query filters to improve read performance.

In [0]:
%sql
-- Step 1 – Create the Delta table
--
-- The table is partitioned by region so that each INSERT batch writes one small
-- file into every region partition it touches. This produces the region=<value>
-- file layout illustrated in the notes further down in this notebook.
-- ZORDER columns must not be partition columns, so transaction_date is left
-- unpartitioned and is used for ZORDER later.
-- NOTE(review): txn_amount is DOUBLE; a DECIMAL type would be exact for monetary
-- values. Left unchanged to keep the table schema as is.

CREATE TABLE sales1_catalog.inputdb.customer_txn (
    txn_id INT,
    customer_id INT,
    region STRING,
    txn_amount DOUBLE,
    txn_type STRING,
    transaction_date DATE
)
USING DELTA
PARTITIONED BY (region);

In [0]:
%sql
-- Step 2 – Insert multiple small batches
--
-- Each insert writes a few small Parquet files. The batches are deliberately
-- kept as separate statements so that the table ends up fragmented.
-- Columns are listed explicitly so the statements do not depend on the
-- physical column order, and dates are typed DATE literals rather than
-- strings that rely on an implicit cast.

-- Batch 1
INSERT INTO sales1_catalog.inputdb.customer_txn
    (txn_id, customer_id, region, txn_amount, txn_type, transaction_date)
VALUES
    (1, 1001, 'North', 250.00, 'Online',  DATE '2025-10-01'),
    (2, 1002, 'South', 400.00, 'Offline', DATE '2025-10-02'),
    (3, 1003, 'West',  600.00, 'Online',  DATE '2025-10-03');

-- Batch 2
INSERT INTO sales1_catalog.inputdb.customer_txn
    (txn_id, customer_id, region, txn_amount, txn_type, transaction_date)
VALUES
    (4, 1001, 'North', 300.00, 'Offline', DATE '2025-10-01'),
    (5, 1004, 'East',  750.00, 'Online',  DATE '2025-10-02'),
    (6, 1005, 'South', 180.00, 'Online',  DATE '2025-10-03');

-- Batch 3
INSERT INTO sales1_catalog.inputdb.customer_txn
    (txn_id, customer_id, region, txn_amount, txn_type, transaction_date)
VALUES
    (7, 1001, 'North', 270.00, 'Online',  DATE '2025-10-01'),
    (8, 1003, 'West',  500.00, 'Offline', DATE '2025-10-02'),
    (9, 1002, 'South', 900.00, 'Online',  DATE '2025-10-03');

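/*
Illustrative file layout after the three inserts (assumes the table is partitioned by region;
each batch adds one small file to every region it touches):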
region=North
    - part-0
    - part-1
    - part-2
region=South
    - part-0
    - part-1
    - part-2
region=West
    - part-0
    - part-1
region=East
    - part-0

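A query that filters on region only reads that partition, but it still has to open every small file in it:
select * from sales1_catalog.inputdb.customer_txn where region='North';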

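Plain OPTIMIZE compacts the small files of each partition into a new, larger file.
The old files stay on disk until VACUUM, but the current table version no longer reads them:
optimize sales1_catalog.inputdb.customer_txn;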
region=North
    - part-0
    - part-1
    - part-2
    - part-3 - after optimize
region=South
    - part-0
    - part-1
    - part-2
    - part-3 - after optimize
region=West
    - part-0
    - part-1
    - part-2 - after optimize
region=East
    - part-0
    - part-1 - after optimize


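OPTIMIZE with ZORDER BY compacts the files in the same way and additionally clusters the rows by transaction_date:
optimize sales1_catalog.inputdb.customer_txn zorder by (transaction_date);

Layout after OPTIMIZE ... ZORDER BY (transaction_date):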
region=North
    - part-0
    - part-1
    - part-2
    - part-3 - optimize & sort the data rows in transaction_date
region=South
    - part-0
    - part-1
    - part-2
    - part-3 - optimize & sort the data rows in transaction_date
region=West
    - part-0
    - part-1
    - part-2 - optimize & sort the data rows in transaction_date
region=East
    - part-0
    - part-1 - optimize & sort the data rows in transaction_date
*/



In [0]:
%sql
-- Step 3 – Inspect fragmentation
--
-- DESCRIBE DETAIL returns table-level metadata. Note the numFiles value here and
-- compare it with the output after OPTIMIZE to see how many small files the
-- batch inserts created.

DESCRIBE DETAIL sales1_catalog.inputdb.customer_txn;

In [0]:
%sql
-- Step 4 – Run OPTIMIZE ZORDER

-- Now compact and physically order data.
-- OPTIMIZE rewrites the small files produced by the batch inserts into larger
-- ones; ZORDER BY co-locates rows with nearby transaction_date values so that
-- filters on that column can skip files.

OPTIMIZE sales1_catalog.inputdb.customer_txn ZORDER BY (transaction_date);

In [0]:
%sql
-- Step 5 – Re-inspect the table after OPTIMIZE
--
-- Same command as the earlier inspection step; run it again so the file count
-- and size can be compared with the pre-OPTIMIZE output.

DESCRIBE DETAIL sales1_catalog.inputdb.customer_txn;

In [0]:
%sql
-- Verify the result: filter on region and on the Z-ordered column
-- transaction_date. Columns are listed explicitly (same order as the table
-- definition) so the output shape stays stable if the schema changes.
-- BETWEEN is inclusive on both ends, which is the intent for calendar dates.
SELECT
    txn_id,
    customer_id,
    region,
    txn_amount,
    txn_type,
    transaction_date
FROM sales1_catalog.inputdb.customer_txn
WHERE region = 'North'
  AND transaction_date BETWEEN DATE '2025-10-01' AND DATE '2025-10-03';