In [6]:
from pyspark.sql import SparkSession 
from pyspark import StorageLevel 
from pyspark.sql.functions import rand, current_date, date_sub

In [2]:
spark = (SparkSession.builder
         .appName("cache-and-persist")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/21 13:44:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# Define a function to measure the execution time of a query
import time

def measure_time(query):
    start = time.time()
    query.collect() # Force the query execution by calling an action
    end = time.time()
    print(f"Execution time: {end - start} seconds")

In [3]:
# Create some sample data frames
# A large data frame with 10 million rows and two columns: id and value
large_df = (spark.range(0, 10000000)
            .withColumn("date", date_sub(current_date(), (rand() * 365).cast("int")))
            .withColumn("ProductId", (rand() * 100).cast("int")))
large_df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+---+----------+---------+
| id|      date|ProductId|
+---+----------+---------+
|  0|2024-02-10|       67|
|  1|2023-07-12|       39|
|  2|2023-08-10|        8|
|  3|2023-05-22|       29|
|  4|2023-06-22|       63|
+---+----------+---------+
only showing top 5 rows



                                                                                

In [4]:
# Cache the DataFrame using cache() method
large_df.cache()
# Check the storage level of the cached DataFrame
print(large_df.storageLevel)

Disk Memory Deserialized 1x Replicated


In [7]:
# Persist the DataFrame using persist() method with a different storage level
large_df.persist(StorageLevel.MEMORY_AND_DISK_DESER)
# Check the storage level of the persisted DataFrame
print(large_df.storageLevel)

Disk Memory Deserialized 1x Replicated


In [10]:
results_df = large_df.groupBy("ProductId").agg({"Id": "count"}) 
measure_time(results_df)
# Show the result
results_df.show(5)

                                                                                

Execution time: 8.600075006484985 seconds
+---------+---------+
|ProductId|count(Id)|
+---------+---------+
|       31|    99961|
|       85|    99746|
|       65|   100023|
|       53|   100615|
|       78|    99985|
+---------+---------+
only showing top 5 rows



In [11]:
results_df = large_df.groupBy("ProductId").agg({"Id": "count"}) 
measure_time(results_df)
# Show the result
results_df.show(5)

Execution time: 0.984121561050415 seconds




+---------+---------+
|ProductId|count(Id)|
+---------+---------+
|       31|    99961|
|       85|    99746|
|       65|   100023|
|       53|   100615|
|       78|    99985|
+---------+---------+
only showing top 5 rows



                                                                                

In [12]:
# Unpersist the DataFrame using unpersist() method
large_df.unpersist()
# Check the storage level of the unpersisted DataFrame
print(large_df.storageLevel)

Serialized 1x Replicated


In [13]:
spark.stop()