

A Bloom filter index is a space-efficient data structure that enables data skipping on chosen columns, particularly for fields containing **arbitrary text**.



The catch is that you want to have a reasonable idea of the number of distinct values
that need to be indexed, because this will determine the length of hashes needed to
avoid collisions if that number is set too small or to avoid wasting space if it is set too
large.

[Reference Link](https://docs.databricks.com/aws/en/optimizations/bloom-filters)

In [0]:
access_key =  "{}"
secret_key = "{}"
encoded_secret_key = secret_key.replace("/", "%2F")
aws_bucket_name = "{}"
mount_name = "databricks_external_stage_bloom"

dbutils.fs.mount(f"s3a://{access_key}:{encoded_secret_key}@{aws_bucket_name}", f"/mnt/{mount_name}")

[0;31m---------------------------------------------------------------------------[0m
[0;31mExecutionError[0m                            Traceback (most recent call last)
File [0;32m<command-2709764810692339>, line 7[0m
[1;32m      4[0m aws_bucket_name [38;5;241m=[39m [38;5;124m"[39m[38;5;124mdeltasparktesting[39m[38;5;124m"[39m
[1;32m      5[0m mount_name [38;5;241m=[39m [38;5;124m"[39m[38;5;124mdatabricks_external_stage_bloom[39m[38;5;124m"[39m
[0;32m----> 7[0m dbutils[38;5;241m.[39mfs[38;5;241m.[39mmount([38;5;124mf[39m[38;5;124m"[39m[38;5;124ms3a://[39m[38;5;132;01m{[39;00maccess_key[38;5;132;01m}[39;00m[38;5;124m:[39m[38;5;132;01m{[39;00mencoded_secret_key[38;5;132;01m}[39;00m[38;5;124m@[39m[38;5;132;01m{[39;00maws_bucket_name[38;5;132;01m}[39;00m[38;5;124m"[39m, [38;5;124mf[39m[38;5;124m"[39m[38;5;124m/mnt/[39m[38;5;132;01m{[39;00mmount_name[38;5;132;01m}[39;00m[38;5;124m"[39m)

File [0;32m/databricks/python_shel

In [0]:
%sql
drop table if exists hive_metastore.default.bloom_filter_demo;

In [0]:
%sql
CREATE OR REPLACE TABLE hive_metastore.default.bloom_filter_demo (
  record_id BIGINT NOT NULL,
  key_with_bloom STRING NOT NULL,
  key_without_bloom STRING NOT NULL
)
USING DELTA
LOCATION '/mnt/databricks_external_stage_bloom/bloom_filter_demo/';

In [0]:
%sql
CREATE BLOOMFILTER INDEX
ON TABLE hive_metastore.default.bloom_filter_demo
FOR COLUMNS(key_with_bloom OPTIONS (fpp=0.1, numItems=100000));

The fpp value in the parameters is short for false positive probability. This number
sets a limit on what rate of false positives is acceptable during reads. A lower value
increases the accuracy of the index but takes a little bit of a performance hit. This is
because the fpp value determines how many bits are required for each element to be
stored, so increasing the accuracy increases the size of the index itself.

In [0]:
%sql
WITH data_gen AS (
  SELECT
    monotonically_increasing_id() AS record_id,
    md5(cast(record_id as string)) AS key_with_bloom,  -- Using MD5 hash of a UUID
    md5(cast(record_id as string)) AS key_without_bloom -- Generating the same value again
  FROM RANGE(0, 100000, 1, 100)
)
INSERT INTO hive_metastore.default.bloom_filter_demo
SELECT record_id, key_with_bloom, key_without_bloom
FROM data_gen

num_affected_rows,num_inserted_rows
100000,100000


In [0]:
%sql 
SELECT * FROM hive_metastore.default.bloom_filter_demo limit 10;

record_id,key_with_bloom,key_without_bloom
0,cfcd208495d565ef66e7dff9f98764da,cfcd208495d565ef66e7dff9f98764da
1,c4ca4238a0b923820dcc509a6f75849b,c4ca4238a0b923820dcc509a6f75849b
2,c81e728d9d4c2f636f067f89cc14862c,c81e728d9d4c2f636f067f89cc14862c
3,eccbc87e4b5ce2fe28308fd9f2a7baf3,eccbc87e4b5ce2fe28308fd9f2a7baf3
4,a87ff679a2f3e71d9181a67b7542122c,a87ff679a2f3e71d9181a67b7542122c
5,e4da3b7fbbce2345d7772b0674a318d5,e4da3b7fbbce2345d7772b0674a318d5
6,1679091c5a880faf6fb5e6087eb1b2dc,1679091c5a880faf6fb5e6087eb1b2dc
7,8f14e45fceea167a5a36dedd4bea2543,8f14e45fceea167a5a36dedd4bea2543
8,c9f0f895fb98ab9159f51fd0297e236d,c9f0f895fb98ab9159f51fd0297e236d
9,45c48cce2e2d7fbdea1afc51c7c6ad26,45c48cce2e2d7fbdea1afc51c7c6ad26


In [0]:
%sql 
SELECT * FROM hive_metastore.default.bloom_filter_demo WHERE key_with_bloom IN ('cfcd208495d565ef66e7dff9f98764da','c4ca4238a0b923820dcc509a6f75849b','45c48cce2e2d7fbdea1afc51c7c6ad26')

record_id,key_with_bloom,key_without_bloom
0,cfcd208495d565ef66e7dff9f98764da,cfcd208495d565ef66e7dff9f98764da
1,c4ca4238a0b923820dcc509a6f75849b,c4ca4238a0b923820dcc509a6f75849b
9,45c48cce2e2d7fbdea1afc51c7c6ad26,45c48cce2e2d7fbdea1afc51c7c6ad26


In [0]:
%sql 
SELECT * FROM hive_metastore.default.bloom_filter_demo WHERE key_without_bloom IN ('cfcd208495d565ef66e7dff9f98764da','c4ca4238a0b923820dcc509a6f75849b','45c48cce2e2d7fbdea1afc51c7c6ad26')

record_id,key_with_bloom,key_without_bloom
0,cfcd208495d565ef66e7dff9f98764da,cfcd208495d565ef66e7dff9f98764da
1,c4ca4238a0b923820dcc509a6f75849b,c4ca4238a0b923820dcc509a6f75849b
9,45c48cce2e2d7fbdea1afc51c7c6ad26,45c48cce2e2d7fbdea1afc51c7c6ad26
