In [1]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import timeit

In [2]:
# Configure the session one setting at a time: application name, cluster
# master URL, executor memory, then the two settings that register Delta
# Lake's SQL extension and catalog implementation.
builder = SparkSession.builder.appName("compression-delta-table")
builder = builder.master("spark://spark-master:7077")
builder = builder.config("spark.executor.memory", "512m")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# configure_spark_with_delta_pip wraps the builder before the session is
# created (the Ivy log printed by this cell shows delta-core being resolved).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep the notebook output readable: log ERROR-level messages only.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-039c90a5-6b76-416c-a380-08c58b2a6e42;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 399ms :: artifacts dl 14ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0 

In [3]:
# Load the sparksql_magic IPython extension into this kernel.
%load_ext sparksql_magic
# Set the extension's SparkSql.limit option to 20.
# NOTE(review): presumably the maximum number of rows displayed per query
# result; confirm against the sparksql-magic documentation.
%config SparkSql.limit=20

In [4]:
# Synthetic employee table: 1,000,000 ids plus three randomly generated
# attribute columns. Every rand() call below is its own column expression.

# salary: rand() scaled to 0..99, truncated to int, then multiplied by 100.
salary_col = 100 * (rand() * 100).cast("int")

# gender: "M" when the truncated draw is 0, otherwise "F".
gender_col = when((rand() * 2).cast("int") == 0, "M").otherwise("F")

# country_code: each branch tests a *separate* rand() draw against its own
# target value, so "RU" is produced whenever none of the four branches match.
country_code_col = (
    when((rand() * 4).cast("int") == 0, "US")
    .when((rand() * 4).cast("int") == 1, "CN")
    .when((rand() * 4).cast("int") == 2, "IN")
    .when((rand() * 4).cast("int") == 3, "BR")
    .otherwise("RU")
)

df = (
    spark.range(0, 1000000)
    .withColumn("salary", salary_col)
    .withColumn("gender", gender_col)
    .withColumn("country_code", country_code_col)
)

# Preview the first five rows.
df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+------+------+------------+
| id|salary|gender|country_code|
+---+------+------+------------+
|  0|  2000|     M|          BR|
|  1|  4000|     M|          RU|
|  2|  2400|     F|          IN|
|  3|  6200|     M|          US|
|  4|  7900|     M|          RU|
+---+------+------+------------+
only showing top 5 rows



                                                                                

In [5]:
# Persist df as a Delta table without setting a compression option, so the
# session's default Parquet codec applies (snappy unless overridden).
# mode("overwrite") replaces any table already present at this path.
snappy_writer = df.write.format("delta").mode("overwrite")
snappy_writer.save("../data/tmp/employee_salary_snappy")

                                                                                

In [6]:
# Time one full read of the snappy-compressed table.
# The statement loads the Delta table and writes it to Spark's "noop" sink,
# so the measurement is about reading the data, not about its size on disk.
# number=1 means this is a single run, not an average.
query = (
    '(spark.read.format("delta")'
    '.load("../data/tmp/employee_salary_snappy")'
    '.write.mode("overwrite").format("noop").save())'
)
# globals() lets the timed statement see the notebook's `spark` session.
snappy_time = timeit.timeit(stmt=query, globals=globals(), number=1)
print(f"Snappy Compression query time: {snappy_time} seconds")

[Stage 20:>                                                         (0 + 2) / 2]

Snappy Compression query time: 2.3627631140006997 seconds


                                                                                

In [7]:
# Persist the same DataFrame as a second Delta table, this time requesting
# zstd compression through the writer's "compression" option.
# mode("overwrite") replaces any table already present at this path.
zstd_writer = (
    df.write.format("delta")
    .mode("overwrite")
    .option("compression", "zstd")
)
zstd_writer.save("../data/tmp/employee_salary_zstd")

                                                                                

In [8]:
# Time one full read of the zstd-compressed table.
# The statement loads the Delta table and writes it to Spark's "noop" sink,
# so the measurement is about reading the data, not about its size on disk.
# number=1 means this is a single run, not an average.
query = (
    '(spark.read.format("delta")'
    '.load("../data/tmp/employee_salary_zstd")'
    '.write.mode("overwrite").format("noop").save())'
)
# globals() lets the timed statement see the notebook's `spark` session.
zstd_time = timeit.timeit(stmt=query, globals=globals(), number=1)
print(f"zstd Compression query time: {zstd_time} seconds")

                                                                                

zstd Compression query time: 1.8800181449987576 seconds


                                                                                

In [10]:
# Stop the SparkSession now that both tables have been written and timed.
spark.stop()