In [27]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import timeit

In [2]:
# Begin with the application name and the standalone master URL, then add
# the remaining settings one call at a time.
builder = SparkSession.builder.appName("z-order-delta-table").master("spark://spark-master:7077")

# Memory setting applied to executors for this session.
builder = builder.config("spark.executor.memory", "512m")

# Register the Delta Lake SQL extension and the Delta catalog implementation.
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Pass the builder through the delta helper before creating (or reusing) the
# session; the captured output below shows the Delta jars being resolved.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Only log messages at ERROR level are requested from the Spark context.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-233a37d2-006e-4ab2-a07f-581a415f3f6f;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 296ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0 

In [24]:
# Load the sparksql_magic IPython extension into this kernel.
%load_ext sparksql_magic
# Set the extension's SparkSql.limit option to 20
# (presumably the row cap for displayed query results — confirm in the sparksql_magic docs).
%config SparkSql.limit=20

The sparksql_magic extension is already loaded. To reload it, use:
  %reload_ext sparksql_magic


In [5]:
# Read the CSV file into a Spark DataFrame.
# The "header" and "inferSchema" reader options are both enabled, so column
# names and types come from the file rather than from an explicit schema.
df = (spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("../data/Online_Retail.csv"))

# Write the DataFrame into a Delta table, overwriting any previous contents.
# The target is the same absolute location that the later query/optimize
# cells read from, so the table they open is the one written here regardless
# of the kernel's current working directory.
(df.write
 .format("delta")
 .mode("overwrite")
 .save("/opt/workspace/data/delta_lake/online_retail"))

                                                                                

In [31]:
# Query the original table
# The SQL text is kept as plain SQL; the timed work is passed to timeit as a
# zero-argument callable, so no Python source string, escaped quotes or
# globals() hand-off is needed.
query = (
    "SELECT StockCode, CustomerID, SUM(Quantity) AS TotalQuantity "
    "FROM delta.`/opt/workspace/data/delta_lake/online_retail` "
    "GROUP BY StockCode, CustomerID"
)

# number=1: the query is executed (and its first rows shown) exactly once.
# NOTE(review): a single run may include one-off start-up cost — treat the
# figure as a rough indication rather than a stable benchmark.
query_time = timeit.timeit(lambda: spark.sql(query).show(), number=1)
print(f"Time taken for original table: {query_time} seconds")



+---------+----------+-------------+
|StockCode|CustomerID|TotalQuantity|
+---------+----------+-------------+
|    22637|     15311|           15|
|    22141|     17920|            4|
|    22242|     17920|            5|
|    21257|     14849|            8|
|    21670|     17841|           85|
|   85123A|     15235|           12|
|    21042|     13715|            3|
|    22752|     12471|           11|
|    22726|     13418|           72|
|   85099B|     14388|           70|
|    22094|     18041|           12|
|    21211|     16916|            1|
|    22335|     14449|           15|
|    84347|     15061|         1200|
|    20752|     15574|            1|
|    22593|     16546|         -144|
|    71477|     13295|           -1|
|   85014B|     18239|            3|
|    21916|     15093|           24|
|    79321|     12841|           33|
+---------+----------+-------------+
only showing top 20 rows

Time taken for original table: 5.887500449000072 seconds


                                                                                

In [35]:
# Open the existing Delta table at the online_retail location.
deltaTable = DeltaTable.forPath(spark, "/opt/workspace/data/delta_lake/online_retail")

# Run OPTIMIZE with Z-Ordering on the two grouping columns used by the
# benchmark query; the returned metrics DataFrame is the cell's output.
(deltaTable
 .optimize()
 .executeZOrderBy("StockCode", "CustomerID"))

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [37]:
# Query the Z-Ordered table
# The SQL text is kept as plain SQL; the timed work is passed to timeit as a
# zero-argument callable, so no Python source string, escaped quotes or
# globals() hand-off is needed.
query = (
    "SELECT StockCode, CustomerID, SUM(Quantity) AS TotalQuantity "
    "FROM delta.`/opt/workspace/data/delta_lake/online_retail` "
    "GROUP BY StockCode, CustomerID"
)

# number=1: the query is executed (and its first rows shown) exactly once.
# NOTE(review): this query has no filter on the Z-Order columns, so a timing
# difference versus the pre-optimize run may also reflect file compaction or
# an already warmed-up session — confirm before attributing it to Z-Ordering.
query_time = timeit.timeit(lambda: spark.sql(query).show(), number=1)
print(f"Time taken for z-ordered table: {query_time} seconds")

+---------+----------+-------------+
|StockCode|CustomerID|TotalQuantity|
+---------+----------+-------------+
|    22637|     15311|           15|
|    22141|     17920|            4|
|    22242|     17920|            5|
|    21257|     14849|            8|
|    21670|     17841|           85|
|   85123A|     15235|           12|
|    21042|     13715|            3|
|    22752|     12471|           11|
|    22726|     13418|           72|
|   85099B|     14388|           70|
|    22094|     18041|           12|
|    21211|     16916|            1|
|    22335|     14449|           15|
|    84347|     15061|         1200|
|    20752|     15574|            1|
|    22593|     16546|         -144|
|    71477|     13295|           -1|
|   85014B|     18239|            3|
|    21916|     15093|           24|
|    79321|     12841|           33|
+---------+----------+-------------+
only showing top 20 rows

Time taken for z-ordered table: 1.648877622999862 seconds


                                                                                

In [38]:
# Stop the Spark session used by this notebook now that the comparison is finished.
spark.stop()