In [1]:
# Spark Session
# Build (or reuse, via getOrCreate) a local SparkSession for the caching experiments.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Run in-process, using all cores available on this machine.
    .master("local[*]")
    .appName("Understand Caching")
    # NOTE(review): with a local master the executor presumably lives inside the
    # driver JVM, so this setting may not bound the heap -- confirm whether
    # spark.driver.memory is the knob that matters here.
    .config(key="spark.executor.memory", value="512M")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

In [3]:
# Read Sales CSV Data - 752MB Size ~ 7.2M Records
import os

# The schema is supplied explicitly (DDL string), so Spark does not have to infer
# column types from the file.
_schema = "transacted_at string, trx_id string, retailer_id string, description string, amount double, city_id string"

# Location of the sales CSV. Set the SALES_CSV_PATH environment variable to run the
# notebook on another machine; the default is the original author's local path.
_sales_csv_path = os.environ.get(
    "SALES_CSV_PATH",
    r"C:\Users\vivek\Downloads\pyspark_lecture_tutorial\data\emp_sales.csv",
)

df = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)  # first line of the file holds the column names
    .load(_sales_csv_path)
)

In [4]:
# Preview the raw data: first 20 rows, long cell values truncated (the show() defaults).
df.show(n=20, truncate=True)

+----------------+----------+-----------+--------------------+-------+----------+
|   transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+----------------+----------+-----------+--------------------+-------+----------+
|25-11-2017 00:30|1995601912| 2077350195|Walgreen       11-25| 197.23| 216510442|
|25-11-2017 00:30|1734117021|  644879053|unkn    ppd id: 7...|   8.58| 930259917|
|25-11-2017 00:30|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|25-11-2017 00:30|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|25-11-2017 00:30|1734117089| 1898522855| Target        11-25|  66.33|1855530529|
|25-11-2017 00:30|1734117117|  997626433|Sears  ppd id: 85...| 298.87| 957346984|
|25-11-2017 00:30|1734117123| 1953761884|unkn   ppd id: 15...|  19.55|  45522086|
|25-11-2017 00:30|1734117152| 1429095612|Ikea     arc id: ...|   9.39|1268541279|
|25-11-2017 00:30|1734117153|  847200066|unkn        Kings...|2907.57|1483931123|
|25-11-2017 00:3

In [5]:
# Baseline query, run before anything is cached: transactions above 300.
df.filter("amount > 300").show()

+----------------+----------+-----------+--------------------+-------+----------+
|   transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+----------------+----------+-----------+--------------------+-------+----------+
|25-11-2017 00:30|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|25-11-2017 00:30|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|25-11-2017 00:30|1734117153|  847200066|unkn        Kings...|2907.57|1483931123|
|25-11-2017 00:30|1734117241|  486576507|              iTunes|2912.67|1663872965|
|25-11-2017 00:30|2076947146|  511877722|unkn     ccd id: ...|1915.35|1698762556|
|25-11-2017 00:30|2076947113| 1996661856|AutoZone  arc id:...| 1523.6|1759612211|
|25-11-2017 00:30|2076946994| 1898522855|Target    ppd id:...|2589.93|2074005445|
|25-11-2017 00:30|2076946121|  562903918|unkn    ccd id: 5...| 315.86|1773943669|
|25-11-2017 00:30|2076946063| 1070485878|Amazon.com   arc ...| 785.27|1126623009|
|25-11-2017 00:3

In [21]:
# Cache DataFrame (cache or persist)
#
# Only the filtered subset (amount > 100) is marked for caching here.
# Alternative kept for reference: df.cache() would mark the whole DataFrame
# instead (memory and disk).
df_cache = df.filter("amount > 100").cache()

In [22]:
# cache() is lazy; count() is an action, so running it executes the job that
# actually populates the cache for df_cache. The result is the number of cached rows.
df_cache.count()

371162

In [19]:
# Same "amount > 300" query as the baseline cell, re-run after the cache was populated.
# NOTE(review): this filters `df`, not `df_cache` -- presumably the point is to check
# (query plan / Spark UI) whether the cached subset gets reused; confirm.
df.filter("amount > 300").show()

+----------------+----------+-----------+--------------------+-------+----------+
|   transacted_at|    trx_id|retailer_id|         description| amount|   city_id|
+----------------+----------+-----------+--------------------+-------+----------+
|25-11-2017 00:30|1734117022|  847200066|Wal-Mart  ppd id:...|1737.26|1646415505|
|25-11-2017 00:30|1734117030| 1953761884|Home Depot     pp...|  384.5| 287177635|
|25-11-2017 00:30|1734117153|  847200066|unkn        Kings...|2907.57|1483931123|
|25-11-2017 00:30|1734117241|  486576507|              iTunes|2912.67|1663872965|
|25-11-2017 00:30|2076947146|  511877722|unkn     ccd id: ...|1915.35|1698762556|
|25-11-2017 00:30|2076947113| 1996661856|AutoZone  arc id:...| 1523.6|1759612211|
|25-11-2017 00:30|2076946994| 1898522855|Target    ppd id:...|2589.93|2074005445|
|25-11-2017 00:30|2076946121|  562903918|unkn    ccd id: 5...| 315.86|1773943669|
|25-11-2017 00:30|2076946063| 1070485878|Amazon.com   arc ...| 785.27|1126623009|
|25-11-2017 00:3

In [23]:
# Remove Cache
#
# Release the cached "amount > 100" subset held by df_cache. The previous version of
# this cell was fully commented out (so it did nothing) and targeted `df`, which is
# not the DataFrame that was cached. Freeing it here also keeps it from competing for
# memory with the persist() experiment that follows.
df_cache.unpersist()

DataFrame[transacted_at: string, trx_id: string, retailer_id: string, description: string, amount: double, city_id: string]

In [25]:
# Persist with an explicitly chosen storage level instead of the cache() default.
# pyspark.StorageLevel offers, among others: MEMORY_ONLY, MEMORY_AND_DISK, DISK_ONLY,
# MEMORY_ONLY_2, MEMORY_AND_DISK_2 (the _2 suffix requests two replicas of each block).
# NOTE(review): MEMORY_ONLY_SER / MEMORY_AND_DISK_SER are names from the Scala/Java
# API -- confirm whether the installed PySpark version exposes them.
import pyspark

df_persist = df.persist(storageLevel=pyspark.StorageLevel.MEMORY_ONLY_2)

In [26]:
# Force a full evaluation of df_persist (which fills the persisted storage) without
# producing any output: the "noop" format runs the write path but saves nothing.
(
    df_persist.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [27]:
# Remove Cache
# clearCache() drops everything cached in this Spark session in one call, rather
# than unpersisting DataFrames one at a time.

spark.catalog.clearCache()

In [28]:
# Shut down the Spark session now that the caching walkthrough is finished.
# Cells above must be re-run from the session-creation cell after this.
spark.stop()