# Caching in SQL -- Part 2
Understand Spark SQL caching


## Step 1 : Read JSON data

In [None]:
import time

# Time only the call that builds the DataFrame from the JSON files.
read_started = time.perf_counter()
clickstreamDF = spark.read.json("/data/click-stream/json")
read_elapsed_ms = (time.perf_counter() - read_started) * 1000
print("Read JSON in {:,.2f} ms ".format(read_elapsed_ms))

# Register the DataFrame under the name the SQL queries in this notebook use.
clickstreamDF.createOrReplaceTempView("clickstream")
print("registered temp table clickstream")

# Last expression of the cell, so the notebook displays the catalog's table list.
spark.catalog.listTables()

## Step 2 : Query without caching


In [None]:
import time

# Drop anything cached earlier so this run measures the uncached query.
spark.catalog.clearCache()

# Top 10 domains by number of rows. The text is assembled from adjacent
# literals so it is identical to the original triple-quoted string,
# including the trailing spaces.
sql = (
    "\n"
    "select domain, count(*) as total from clickstream\n"
    "group by domain \n"
    "order by total \n"
    "desc limit 10\n"
)

# The timed region covers building the query and show(), which prints the result.
query_started = time.perf_counter()
top10_domains = spark.sql(sql)
top10_domains.show()
query_elapsed_ms = (time.perf_counter() - query_started) * 1000
print("query took {:,.2f} ms ".format(query_elapsed_ms))



## Step 3 : Explain Query

In [None]:
# Print the execution plan for the top-10 query built in the previous step.
top10_domains.explain()

# Uncomment to print the extended output (logical plans in addition to the physical plan).
#top10_domains.explain(extended=True)

## Step 4 : Cache

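There are 3 ways to cache:
1. `dataframe.cache()` : non-blocking (lazy — the data is cached when the first action runs)
2. `spark.sql("cache table TABLE_NAME")` : blocking (eager — the table is cached before the statement returns)
3. `spark.catalog.cacheTable('tableName')` : non-blocking (lazy — the data is cached when the first action runs)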

Try all these options and see the performance implications.

In [None]:
import time

# Start with nothing cached.
spark.catalog.clearCache()  # drops every cached table
# spark.catalog.uncacheTable("clickstream")  # alternative: drop only this one table

print("is 'clickstream' cached : " , spark.catalog.isCached('clickstream'))

cache_started = time.perf_counter()
# Choose exactly one caching approach: uncomment it and comment out the others.
# spark.sql("cache table clickstream")    # SQL statement
# clickstreamDF.cache()                   # DataFrame API
spark.catalog.cacheTable("clickstream")   # catalog API
cache_elapsed_ms = (time.perf_counter() - cache_started) * 1000
print("caching took {:,.2f} ms ".format(cache_elapsed_ms))

print("is 'clickstream' cached : " , spark.catalog.isCached('clickstream'))

## Step 5 : Query after caching
Run the following cell to measure query time after caching.

In [None]:
## Query after caching

import time

# Same top-10-domains query as the uncached run. The text is assembled from
# adjacent literals so it is identical to the original triple-quoted string,
# including the trailing spaces.
sql = (
    "\n"
    "select domain, count(*) as total from clickstream\n"
    "group by domain \n"
    "order by total \n"
    "desc limit 10\n"
)

# The timed region covers building the query and show(), which prints the result.
query_started = time.perf_counter()
top10_domains = spark.sql(sql)
top10_domains.show()
query_elapsed_ms = (time.perf_counter() - query_started) * 1000
print("query took {:,.2f} ms ".format(query_elapsed_ms))

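## Step 6 : Explain Query
You will see caching in effect! Look for an in-memory scan of the table (for example `InMemoryTableScan` / `InMemoryRelation`) in the plan instead of a scan of the JSON files.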

In [None]:
# Print the plan of the query that was re-run after caching.
# NOTE(review): with the cache in use, the read of 'clickstream' is expected to
# appear as an in-memory scan rather than a JSON file scan — confirm in the output.
top10_domains.explain()

## Clear Cache
Try the following ways to clear the cache:

1. `spark.sql("CLEAR CACHE")` - removes all cached tables
2. `spark.sql("UNCACHE TABLE tableName")` - removes one cached table
3. `spark.catalog.uncacheTable('tableName')` - removes one cached table
4. `spark.catalog.clearCache()` - clears all caches
5. `dataframe.unpersist()` - removes the cache of that one DataFrame

In [None]:
# Remove every cached table / DataFrame from this Spark session.
spark.sql("clear cache")
# Alternatives (uncomment one to try it):
# spark.catalog.clearCache()                  # catalog API, clears everything
# spark.catalog.uncacheTable("clickstream")   # only the 'clickstream' table
# clickstreamDF.unpersist()                   # only this DataFrame's cached data