# Caching Tables in Spark

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Configure a Hive-enabled session named 'Spark Table Caching', then
# obtain it via getOrCreate().
session_builder = SparkSession.builder.appName('Spark Table Caching').enableHiveSupport()
spark = session_builder.getOrCreate()

In [3]:
# List the tables visible in the current database before anything is created.
show_tables_sql = 'show tables'
spark.sql(show_tables_sql).show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [6]:
# Load the sample customers CSV; the first line is treated as the header.
# No schema is supplied or inferred here.
customers_csv_path = '/tmp/customers_100.csv'
csv_reader = spark.read.format('csv').option('header', 'true')
df = csv_reader.load(customers_csv_path)

                                                                                

In [7]:
# Preview the loaded customers (show() defaults spelled out explicitly).
df.show(n=20, truncate=True)

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    False|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    False|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    False|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    False|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     True|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [None]:
# Persist the DataFrame as the CSV-backed table default.customers_100.
# mode('overwrite') keeps this cell idempotent: without an explicit save mode
# the writer errors out when the table already exists, so a second run of the
# notebook (Restart & Run All) would fail here.
(
    df.write
    .format('csv')
    .mode('overwrite')
    .saveAsTable('default.customers_100')
)

In [None]:
# List tables again; this runs after the saveAsTable cell above.
tables_after_save_sql = 'show tables'
spark.sql(tables_after_save_sql).show()

In [None]:
# Print the extended description of customers_100 without truncating the
# cell values.
describe_extended_sql = 'describe extended customers_100'
table_details = spark.sql(describe_extended_sql)
table_details.show(truncate=False)

In [None]:
# List the files under the customers_100 directory on the Hadoop file system.
# NOTE(review): the path assumes the warehouse lives at /user/hive/warehouse;
# confirm against the Location reported by 'describe extended customers_100'.
!hadoop fs -ls /user/hive/warehouse/customers_100

In [None]:
# Peek at five rows of the newly saved table.
preview_sql = 'select * from customers_100 limit 5'
preview_rows = spark.sql(preview_sql)
preview_rows.show()

In [None]:
# Show the column-level description of customers_100.
describe_sql = 'describe customers_100'
spark.sql(describe_sql).show()

In [None]:
# Five-row sample of customers_100, run before the table is cached
# (the 'cache table' cell comes next).
before_cache_sql = 'select * from customers_100 limit 5'
before_cache_rows = spark.sql(before_cache_sql)
before_cache_rows.show()

In [None]:
# Cache customers_100 through SQL. The call is left as the cell's last
# expression so the returned DataFrame is still displayed as the cell result.
cache_customers_sql = 'cache table customers_100'
spark.sql(cache_customers_sql)

In [None]:
# Same five-row sample as before, now issued after the 'cache table' cell.
after_cache_sql = 'select * from customers_100 limit 5'
after_cache_rows = spark.sql(after_cache_sql)
after_cache_rows.show()

In [None]:
# List tables once more, after caching customers_100.
tables_after_cache_sql = 'show tables'
spark.sql(tables_after_cache_sql).show()

In [5]:
# Print the extended description of customers_500mb without truncation.
# NOTE(review): customers_500mb is not created anywhere in the visible cells;
# presumably it already exists in the metastore -- confirm.
describe_500mb_sql = 'describe extended customers_500mb'
details_500mb = spark.sql(describe_500mb_sql)
details_500mb.show(truncate=False)

+----------------------------+-------------------------------------------------------+-------+
|col_name                    |data_type                                              |comment|
+----------------------------+-------------------------------------------------------+-------+
|customers_id                |int                                                    |null   |
|name                        |string                                                 |null   |
|city                        |string                                                 |null   |
|state                       |string                                                 |null   |
|country                     |string                                                 |null   |
|registration_date           |string                                                 |null   |
|is_active                   |boolean                                                |null   |
|                            |                    

In [8]:
# Sample five Hyderabad customers before the table is cached.
# The string literal uses single quotes (standard SQL): a double-quoted
# "Hyderabad" is only read as a string while double-quoted identifiers are
# disabled, and would otherwise be parsed as a column name.
spark.sql("select * from customers_500mb where city ='Hyderabad' limit 5").show()



+------------+-----------+---------+-----------+-------+-----------------+---------+
|customers_id|       name|     city|      state|country|registration_date|is_active|
+------------+-----------+---------+-----------+-------+-----------------+---------+
|           3| Customer_3|Hyderabad|    Gujarat|  India|       2023-11-11|    false|
|           6| Customer_6|Hyderabad| Tamil Nadu|  India|       2023-07-17|    false|
|           7| Customer_7|Hyderabad| Tamil Nadu|  India|       2023-08-18|     true|
|          20|Customer_20|Hyderabad| Tamil Nadu|  India|       2023-02-19|     true|
|          26|Customer_26|Hyderabad|Maharashtra|  India|       2023-12-13|     true|
+------------+-----------+---------+-----------+-------+-----------------+---------+



                                                                                

In [9]:
# Cache customers_500mb via SQL (no LAZY keyword here, unlike the later
# 'cache lazy table' cell). Kept as the last expression so the returned
# DataFrame is still echoed as the cell result.
cache_500mb_sql = 'cache table customers_500mb'
spark.sql(cache_500mb_sql)

                                                                                

DataFrame[]

In [10]:
# Same Hyderabad sample, now issued after 'cache table customers_500mb'.
# The string literal uses single quotes (standard SQL): a double-quoted
# "Hyderabad" is only read as a string while double-quoted identifiers are
# disabled, and would otherwise be parsed as a column name.
spark.sql("select * from customers_500mb where city ='Hyderabad' limit 5").show()

+------------+-----------+---------+-----------+-------+-----------------+---------+
|customers_id|       name|     city|      state|country|registration_date|is_active|
+------------+-----------+---------+-----------+-------+-----------------+---------+
|           3| Customer_3|Hyderabad|    Gujarat|  India|       2023-11-11|    false|
|           6| Customer_6|Hyderabad| Tamil Nadu|  India|       2023-07-17|    false|
|           7| Customer_7|Hyderabad| Tamil Nadu|  India|       2023-08-18|     true|
|          20|Customer_20|Hyderabad| Tamil Nadu|  India|       2023-02-19|     true|
|          26|Customer_26|Hyderabad|Maharashtra|  India|       2023-12-13|     true|
+------------+-----------+---------+-----------+-------+-----------------+---------+



In [11]:
# Drop customers_500mb from the cache. Kept as the last expression so the
# returned DataFrame is still echoed as the cell result.
uncache_500mb_sql = 'uncache table customers_500mb'
spark.sql(uncache_500mb_sql)

DataFrame[]

In [12]:
# Register customers_500mb for caching with the LAZY keyword, so the cache is
# populated when the table is first used rather than by this statement.
# Kept as the last expression so the returned DataFrame is still echoed.
cache_lazy_sql = 'cache lazy table customers_500mb'
spark.sql(cache_lazy_sql)

DataFrame[]

In [13]:
# First query against customers_500mb after 'cache lazy table'.
# NOTE(review): the recorded output starts with a row holding the column names
# (customers_id/is_active null, name='name', ...), i.e. the CSV header line is
# being read as data -- the table definition presumably does not skip the
# header; confirm and fix where the table is created.
lazy_preview_sql = 'select * from customers_500mb limit 5'
lazy_preview_rows = spark.sql(lazy_preview_sql)
lazy_preview_rows.show()

[Stage 15:>                                                         (0 + 1) / 1]

+------------+----------+---------+-----------+-------+-----------------+---------+
|customers_id|      name|     city|      state|country|registration_date|is_active|
+------------+----------+---------+-----------+-------+-----------------+---------+
|        null|      name|     city|      state|country|registration_date|     null|
|           0|Customer_0|   Mumbai|  Telangana|  India|       2023-03-21|     true|
|           1|Customer_1|  Chennai|West Bengal|  India|       2023-05-27|    false|
|           2|Customer_2|     Pune|  Karnataka|  India|       2023-10-11|    false|
|           3|Customer_3|Hyderabad|    Gujarat|  India|       2023-11-11|    false|
+------------+----------+---------+-----------+-------+-----------------+---------+



                                                                                

In [15]:
# Count customers per city.
# NOTE(review): the recorded output contains a group 'city' with count 1,
# which looks like the CSV header line counted as a data row -- confirm the
# table definition skips the header.
city_counts_sql = 'select city, count (*) from customers_500mb group by city'
city_counts = spark.sql(city_counts_sql)
city_counts.show()



+---------+--------+
|     city|count(1)|
+---------+--------+
|Bangalore| 1094195|
|  Chennai| 1095052|
|   Mumbai| 1095815|
|Ahmedabad| 1097162|
|  Kolkata| 1096777|
|     city|       1|
|     Pune| 1095748|
|    Delhi| 1096183|
|Hyderabad| 1096426|
+---------+--------+



                                                                                

In [17]:
# Print the extended plan for the per-city count; the recorded optimized plan
# reads from an InMemoryRelation, i.e. the cached copy of the table.
city_counts_plan_sql = 'select city, count (*) from customers_500mb group by city'
city_counts_plan = spark.sql(city_counts_plan_sql)
city_counts_plan.explain(mode='extended')

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('count(1), None)]
+- 'UnresolvedRelation [customers_500mb], [], false

== Analyzed Logical Plan ==
city: string, count(1): bigint
Aggregate [city#151], [city#151, count(1) AS count(1)#1202L]
+- SubqueryAlias spark_catalog.default.customers_500mb
   +- Relation default.customers_500mb[customers_id#149,name#150,city#151,state#152,country#153,registration_date#154,is_active#155] csv

== Optimized Logical Plan ==
Aggregate [city#151], [city#151, count(1) AS count(1)#1202L]
+- Project [city#151]
   +- InMemoryRelation [customers_id#149, name#150, city#151, state#152, country#153, registration_date#154, is_active#155], StorageLevel(disk, memory, deserialized, 1 replicas)
         +- FileScan csv default.customers_500mb[customers_id#149,name#150,city#151,state#152,country#153,registration_date#154,is_active#155] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://my-cluster-m/user/hive/

In [22]:
# Count customers in Hyderabad only.
# The string literal uses single quotes (standard SQL): a double-quoted
# "Hyderabad" is only read as a string while double-quoted identifiers are
# disabled, and would otherwise be parsed as a column name.
spark.sql("select city, count (*) from customers_500mb where city ='Hyderabad' group by city").show()

+---------+--------+
|     city|count(1)|
+---------+--------+
|Hyderabad| 1096426|
+---------+--------+



In [23]:
# Print the extended plan for the Hyderabad-only count; the recorded optimized
# plan applies the city filter on top of the InMemoryRelation (cached table).
# The string literal uses single quotes (standard SQL): a double-quoted
# "Hyderabad" is only read as a string while double-quoted identifiers are
# disabled, and would otherwise be parsed as a column name.
spark.sql("select city, count (*) from customers_500mb where city ='Hyderabad' group by city").explain(mode='extended')

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('count(1), None)]
+- 'Filter ('city = Hyderabad)
   +- 'UnresolvedRelation [customers_500mb], [], false

== Analyzed Logical Plan ==
city: string, count(1): bigint
Aggregate [city#151], [city#151, count(1) AS count(1)#1614L]
+- Filter (city#151 = Hyderabad)
   +- SubqueryAlias spark_catalog.default.customers_500mb
      +- Relation default.customers_500mb[customers_id#149,name#150,city#151,state#152,country#153,registration_date#154,is_active#155] csv

== Optimized Logical Plan ==
Aggregate [city#151], [city#151, count(1) AS count(1)#1614L]
+- Project [city#151]
   +- Filter (isnotnull(city#151) AND (city#151 = Hyderabad))
      +- InMemoryRelation [customers_id#149, name#150, city#151, state#152, country#153, registration_date#154, is_active#155], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- FileScan csv default.customers_500mb[customers_id#149,name#150,city#151,state#152,country#153,registrat

In [25]:
# Print the extended description of external_customers_2 without truncation.
# NOTE(review): external_customers_2 is not created anywhere in the visible
# cells; presumably it already exists in the metastore -- confirm.
describe_external_sql = 'describe extended external_customers_2'
external_details = spark.sql(describe_external_sql)
external_details.show(truncate=False)

+----------------------------+--------------------------------------------------+-------+
|col_name                    |data_type                                         |comment|
+----------------------------+--------------------------------------------------+-------+
|customer_id                 |int                                               |null   |
|name                        |string                                            |null   |
|city                        |string                                            |null   |
|state                       |string                                            |null   |
|country                     |string                                            |null   |
|registration_date           |string                                            |null   |
|is_active                   |boolean                                           |null   |
|                            |                                                  |       |
|# Detaile

In [26]:
# Cache external_customers_2 via SQL and show the statement's result.
# The recorded output is an empty grid: the result has no columns or rows.
cache_external_sql = 'cache table external_customers_2'
cache_external_result = spark.sql(cache_external_sql)
cache_external_result.show(truncate=False)

++
||
++
++



In [27]:
# Display df again (show() defaults spelled out explicitly).
df.show(n=20, truncate=True)

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    False|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    False|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    False|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    False|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     True|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

In [28]:
# Stop the Spark session now that the notebook is finished.
spark.stop()