# Utility function to generate data
* Requirement: pip install faker

In [0]:
%pip install faker

In [0]:
from faker import Faker
from faker.providers import BaseProvider

import random

# Column names applied to the generated DataFrames; the order matches the
# positions of the values in each tuple produced by gen_data_v1.
cols = ["COMP_CODE", "GL_ACCOUNT", "FISC_YEAR", "BALANCE", "CURRENCY", "CURRENCY_ISO", "CODE", "MESSAGE"]
# Custom Faker provider for the random ledger fields: company code,
# G/L account, fiscal year and balance.
class CustomProvider(BaseProvider):
    """Faker provider that generates the random ledger column values.

    Every value is drawn with ``BaseProvider.random_int`` so it comes from the
    Faker generator's own random source. That keeps the generated data
    reproducible through ``Faker.seed()`` / ``seed_instance()``, which drawing
    from the global ``random`` module would bypass.
    """

    def COMP_CODE(self):
        """Return a random company code in 1000..1005 (inclusive)."""
        return self.random_int(min=1000, max=1005)

    def GL_ACCOUNT(self):
        """Return a G/L account string: '000000' followed by a number in 1000..9998."""
        return '000000' + str(self.random_int(min=1000, max=9998))

    def FISC_YEAR(self):
        """Return a random fiscal year in 2010..2018 (inclusive)."""
        return self.random_int(min=2010, max=2018)

    def BALANCE(self):
        """Return a random integer balance in -10..999999 (inclusive)."""
        return self.random_int(min=-10, max=999999)

# Shared Faker generator with CustomProvider registered, so the provider's
# methods can be called directly on it (faker.COMP_CODE(), faker.GL_ACCOUNT(), ...).
faker = Faker()
faker.add_provider(CustomProvider)


# Generate rows for the 8 ledger columns listed in `cols`.
def gen_data_v1(num: int) -> list:
    """Return a list of ``num`` randomly generated ledger rows.

    Each row is an 8-tuple: company code, G/L account, fiscal year (int),
    balance (float), followed by the fixed values 'USD', '$', 'FN028' and
    '..notes..'. A ``num`` of zero or less yields an empty list.
    """
    rows = []
    for _ in range(num):
        company_code = faker.COMP_CODE()
        gl_account = faker.GL_ACCOUNT()
        fiscal_year = int(faker.FISC_YEAR())
        balance = float(faker.BALANCE())
        rows.append(
            (company_code, gl_account, fiscal_year, balance, 'USD', '$', 'FN028', '..notes..')
        )
    return rows

# Clean prior run data files

In [0]:
# Remove everything a previous run left under the chapter's scratch directory.
dbutils.fs.rm('/tmp/ch-12/', True)

# Drop & recreate the ch_12 database, then make it the current database.
for ddl in (
    "DROP DATABASE IF EXISTS ch_12 CASCADE",
    "CREATE DATABASE ch_12 ",
    "USE ch_12",
):
    spark.sql(ddl)

# Location where the Delta table's files are written.
DELTALAKE_PATH = "/tmp/ch-12/data"

# Make sure nothing is left at the table location before it is written.
dbutils.fs.rm(DELTALAKE_PATH, recurse=True)

## Create a Delta table

In [0]:
# Initial load: 1000 generated rows, with columns named after `cols`.
df_0 = spark.createDataFrame(gen_data_v1(1000)).toDF(*cols)

# Write the rows as a Delta table partitioned by fiscal year.
(
    df_0.write
    .format('delta')
    .partitionBy('FISC_YEAR')
    .save(DELTALAKE_PATH)
)

# Register the files at DELTALAKE_PATH as the table perf_test.
s_sql = f"CREATE TABLE IF NOT EXISTS perf_test USING delta LOCATION '{DELTALAKE_PATH}'"
spark.sql(s_sql)

## Simulate new data coming in

In [0]:
# Append five more generated batches of 1000 rows each to the same Delta location.
for batch_no in range(5):
    df = spark.createDataFrame(gen_data_v1(1000)).toDF(*cols)
    (
        df.write
        .format('delta')
        .mode('append')
        .save(DELTALAKE_PATH)
    )

# Tune Table Properties & spark settings

## Optimize Writes
* combines multiple small files to reduce the number of disk I/O operations

In [0]:
%sql
-- Turn on the delta.autoOptimize.optimizeWrite table property for perf_test.
ALTER TABLE perf_test SET TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true');

## Randomize File Prefixes to avoid hotspots
  * ALTER TABLE <delta_table> SET TBLPROPERTIES ('delta.randomizeFilePrefixes' = 'true')

In [0]:
%sql
-- Turn on the delta.randomizeFilePrefixes table property for perf_test.
ALTER TABLE perf_test SET TBLPROPERTIES ('delta.randomizeFilePrefixes' = 'true');

## Other specialized settings (specific to Databricks Runtime)

### Dynamic File Pruning
SET spark.databricks.optimizer.dynamicFilePruning = true;
* Useful for non-partitioned tables, or for joins on non-partitioned columns. <br>

#### Max file size on disk
SET spark.databricks.delta.optimize.maxFileSize = 1610612736;

#### Join
SET spark.databricks.optimizer.dynamicFilePruning = true;
* `spark.databricks.optimizer.deltaTableFilesThreshold`: number of files of the Delta table on the probe side of the join required to trigger dynamic file pruning (DFP)
* `spark.databricks.optimizer.deltaTableSizeThreshold`: minimum table size on the probe side of the join required to trigger DFP
  * default is 10 GB <br>

### IO Caching (Delta Cache)
SET spark.databricks.io.cache.enabled = true; <br>
SET spark.databricks.io.cache.maxDiskUsage = <> ; <br>
SET spark.databricks.io.cache.maxMetaDataCache = <> ; <br>
SET spark.databricks.io.cache.compression.enabled = true; <br>
CACHE SELECT * FROM perf_test;

### Optimize Join performance (Range & Skew Joins using hints)
SET spark.databricks.optimizer.rangeJoin.binSize=5;<br>
 
SELECT /*+ RANGE_JOIN(points, 10) */ * <br>
FROM points JOIN ranges ON points.p >= ranges.start AND points.p < ranges.end;

### Enable Low Shuffle Merge
SET spark.databricks.delta.merge.enableLowShuffle = true;

# Optimize (file management)
* OPTIMIZE <delta_table> [WHERE <partition_filter>] ZORDER BY (<column>[, …]) 
* Combines small files into larger ones on disk
* Aids file skipping
* Bin packing is idempotent meaning 2nd run without any new data does not have any impact on data layout

In [0]:
%sql
-- Optimize the entire perf_test table (no WHERE filter, so every partition is considered).
OPTIMIZE perf_test;

In [0]:
%sql
-- Optimize only a subset of the table, e.g. recent data: here the rows with
-- FISC_YEAR >= 2015 (FISC_YEAR is the column the table was partitioned by).
OPTIMIZE perf_test WHERE FISC_YEAR >= 2015;

# ZORDER
* Best applied to incremental data
* It is not idempotent

In [0]:
%sql
-- Optimize the FISC_YEAR >= 2015 subset of perf_test and Z-order its data by Comp_Code.
OPTIMIZE perf_test
WHERE FISC_YEAR >= 2015 
ZORDER BY (Comp_Code);

# Bloom Filter
* Databricks specific feature

In [0]:
%sql
-- Session setting: enable the Bloom filter index capability.
SET spark.databricks.io.skipping.bloomFilter.enabled = true; 

-- Create a Bloom filter index on perf_test's balance column, configured with
-- fpp=0.1 (false-positive probability) and numItems=50000000.
CREATE BLOOMFILTER INDEX 
ON TABLE perf_test
FOR COLUMNS(balance OPTIONS (fpp=0.1, numItems=50000000));

# Use Delta APIs

In [0]:
from delta.tables import * 

# Bind a DeltaTable handle to the table stored at DELTALAKE_PATH; the cells
# below call the Delta API through this handle.
deltaTable = DeltaTable.forPath(spark, DELTALAKE_PATH) 

In [0]:
# List the public methods/attributes available on the DeltaTable handle.
# A bare `deltaTable.` is only useful for interactive tab-completion and is a
# SyntaxError when the cell is actually executed.
[name for name in dir(deltaTable) if not name.startswith('_')]

## Optimize

In [0]:
# optimize() only returns a builder; executeCompaction() is the call that
# actually runs the compaction, here over the whole table.
deltaTable.optimize().executeCompaction()

In [0]:
# Run compaction only on the data matching the FISC_YEAR='2011' filter
# (FISC_YEAR is the column the table was partitioned by when first written).
deltaTable.optimize().where("FISC_YEAR='2011'").executeCompaction()

## Vacuum

In [0]:
# Vacuum with no argument, i.e. using the default retention period:
# targets files not required by versions older than that period
# (NOTE(review): the effective default depends on the table's retention settings — confirm).
deltaTable.vacuum() 

In [0]:
# Vacuum files not required by versions more than 100 hours old.
# 100 hours is below Delta's default 168-hour (7-day) safety threshold, so
# vacuum(100) raises unless the retention-duration check is switched off.
# The check is disabled only around this call and restored afterwards.
# Only do this when no reader or writer still needs the older versions.
retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"
previous_retention_check = spark.conf.get(retention_check_key, "true")
spark.conf.set(retention_check_key, "false")
try:
    deltaTable.vacuum(100)
finally:
    spark.conf.set(retention_check_key, previous_retention_check)