In [38]:
# Spark Session
from pyspark.sql import SparkSession

# Build a session for the caching walkthrough, or reuse one that already exists.
# master("local[*]") runs Spark inside this machine rather than on a cluster.
# NOTE(review): with a local master the executor presumably lives in the driver
# JVM, so "spark.executor.memory" may have no effect here -- confirm whether
# "spark.driver.memory" was intended.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Understand Caching")
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary as cell output.
spark

In [39]:
# Read the employee records CSV; the first row is treated as the header.
# No schema is supplied, so the selected columns come back as strings
# (printSchema() below reports salary as string). Chain .schema(...) onto the
# reader if typed columns are needed.
# NOTE(review): an earlier size note (752MB, ~7.2M records) described a sales
# file and has not been verified against this dataset.
df_emp = spark.read.option("header", True).csv("data/input/employee_records.csv")

# Keep only the columns used by the caching examples.
df_emp1 = df_emp.select(["first_name", "last_name", "job_title", "salary"])

df_emp1.printSchema()
df_emp1.show()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- salary: string (nullable = true)

+----------+----------+--------------------+------+
|first_name| last_name|           job_title|salary|
+----------+----------+--------------------+------+
|   Richard|  Morrison|Public relations ...|512653|
|     Bobby|  Mccarthy|   Barrister's clerk|999836|
|    Dennis|    Norman|Land/geomatics su...|131900|
|      John|    Monroe|        Retail buyer|485506|
|  Michelle|   Elliott|      Air cabin crew|604738|
|    Ashley|   Montoya|        Cartographer|483339|
| Nathaniel|     Smith|     Quality manager|419644|
|     Faith|  Cummings|Industrial/produc...|205939|
|  Margaret|    Sutton|Administrator, ed...|671167|
|      Mary|    Sutton|   Freight forwarder|993829|
|      Jake|      King|       Lexicographer|702101|
|   Heather|     Haley|         Music tutor|570960|
|    Thomas|    Thomas|Chartered managem...|339441|


In [42]:
# Cache DataFrame (cache or persist)
# DataFrame/Dataset cache() default: deserialized, MEMORY_AND_DISK.
# RDD cache() default: memory only.
df_cache = df_emp1.filter("salary > 60000")
df_cache.cache()  # only marks the DataFrame for caching; nothing is computed yet
df_cache.count()  # action that actually runs the query and populates the cache

                                                                                

949640

In [43]:
# Count from the base DataFrame with a different threshold (50000) than the
# cached filter (60000).
# NOTE(review): presumably meant to contrast with the cached query -- check the
# Spark UI to see whether the cached data is read for this plan.
df_emp1.filter("salary > 50000").count()

959664

In [48]:
# Persist with an explicit storage level instead of the cache() default.
# Level names: MEMORY_ONLY, MEMORY_AND_DISK, MEMORY_ONLY_SER, MEMORY_AND_DISK_SER,
# DISK_ONLY, MEMORY_ONLY_2, MEMORY_AND_DISK_2
# NOTE(review): the *_SER names may not be exposed by pyspark.StorageLevel --
# verify against the installed PySpark version before using them.
import pyspark

# df_persist = df_emp1.persist(pyspark.StorageLevel.MEMORY_ONLY) # Memory Serialized 1x Replicated
# The "_2" level asks for a second replica of every block. The session uses a
# local[*] master, so there is no peer to replicate to and Spark logs
# "replicated to only 0 peer(s)" warnings for each block.
df_persist = df_emp1.persist(storageLevel=pyspark.StorageLevel.MEMORY_ONLY_2)  # Memory Serialized 2x Replicated

# Writing to the "noop" format runs the whole DataFrame without producing any
# output, which forces every partition to be persisted.
(
    df_persist.write
    .mode("overwrite")
    .format("noop")
    .save()
)

25/01/12 22:52:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
25/01/12 22:52:54 WARN BlockManager: Block rdd_63_7 replicated to only 0 peer(s) instead of 1 peers
25/01/12 22:52:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
25/01/12 22:52:54 WARN BlockManager: Block rdd_63_1 replicated to only 0 peer(s) instead of 1 peers
25/01/12 22:52:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
25/01/12 22:52:54 WARN BlockManager: Block rdd_63_3 replicated to only 0 peer(s) instead of 1 peers
25/01/12 22:52:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
25/01/12 22:52:54 WARN BlockManager: Block rdd_63_0 replicated to only 0 peer(s) instead of 1 peers
25/01/12 22:52:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
25/01/12 22:52:54 WARN BlockManager: Block rdd_63_6 replicated to only 0 peer(s) instead of 1 peers
25/01/12 22:52:54 WARN RandomB

In [47]:
# Remove Cache
# Alternative that releases a single DataFrame only:
# df_cache.unpersist()
# clearCache() drops everything cached in this session in one call.
spark.catalog.clearCache()

In [49]:
# Shut down the Spark session; `spark` cannot be used for further queries after this.
spark.stop()