In [17]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) the local Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName('caching2')
    .master('local[*]')
    .getOrCreate()
)

In [18]:
# Declare the schema up front. A read is not an action, but without a schema
# Spark would still trigger a job just to infer it from the data. Supplying the
# schema here means no job is triggered by this cell.
_schema = 'Year Double, Flood Double'
df = (
    spark.read
    .format('csv')
    .schema(_schema)
    .option('Header', 'true')
    .load('nile.csv')
)

In [19]:
df.where('Year>100').show()

# The Spark UI shows that the first job is triggered here: the data is read from the CSV and then the filter is applied.

+-----+-------+
| Year|  Flood|
+-----+-------+
|101.0| 11.037|
|102.0|10.9986|
|103.0| 11.037|
|104.0| 10.151|
|105.0|11.1906|
|106.0|10.7284|
|107.0|10.9218|
|108.0|  10.69|
|109.0|10.5748|
|110.0|10.9986|
|111.0| 10.941|
|112.0|10.4596|
|113.0|10.9602|
|114.0| 10.151|
|115.0|10.0742|
|116.0|10.1126|
|117.0|  9.767|
|118.0|10.4212|
|119.0|10.1894|
|120.0|10.4212|
+-----+-------+
only showing top 20 rows



In [21]:
df.cache().count()
# cache() is lazily evaluated, so nothing is stored until an action is triggered.
# Use an action that processes the whole DataFrame (such as count()) so that all of it gets cached.
# show() does not read the whole DataFrame, so it would not cache the whole DataFrame.

# After this action, the Storage tab of the Spark UI shows this DataFrame cached in memory.

570

In [22]:
df.where('Year>100').show()
# The same filter as before now runs faster: it reads the data from the cache instead of scanning the CSV again.
# The DAG in the Spark UI can be checked to confirm this.

+-----+-------+
| Year|  Flood|
+-----+-------+
|101.0| 11.037|
|102.0|10.9986|
|103.0| 11.037|
|104.0| 10.151|
|105.0|11.1906|
|106.0|10.7284|
|107.0|10.9218|
|108.0|  10.69|
|109.0|10.5748|
|110.0|10.9986|
|111.0| 10.941|
|112.0|10.4596|
|113.0|10.9602|
|114.0| 10.151|
|115.0|10.0742|
|116.0|10.1126|
|117.0|  9.767|
|118.0|10.4212|
|119.0|10.1894|
|120.0|10.4212|
+-----+-------+
only showing top 20 rows



In [23]:
# To remove this DataFrame's data from the cache, use unpersist().
df.unpersist()

# The Storage tab is now empty, and any transformation on df reads from the CSV again.

DataFrame[Year: double, Flood: double]

In [24]:
# Spark uses the lineage of a cached DataFrame to decide whether a query is served from the cache or from the source.
# Let's cache only a part of df.

df.where('Year>100').cache().count()

470

In [26]:
# Let's try to get a subset of the data cached above.

df.where('Year > 150').count()

# Even though every row selected by this filter is already present in the cached DataFrame, Spark reads the CSV again.
# This is because the lineage of this DataFrame is not the same as the lineage of the cached DataFrame.

420

In [27]:
# To clear the entire cache at once, use clearCache().

spark.catalog.clearCache()

In [30]:
# Now let's look at persist(), which takes an explicit storage level.
import pyspark

df_persist = df.persist(pyspark.StorageLevel.MEMORY_ONLY)

# An action is needed for the persist to actually happen. The 'noop' format
# performs a dummy write: the action runs, but no data is written anywhere.
(
    df_persist.write
    .format('noop')
    .mode('overwrite')
    .save()
)

# Storage tab: Memory Serialized 1x Replicated
# The data is now stored serialized, so it takes less space, but it is slower
# to work with than deserialized data.
# cache() stores the data deserialized, whereas persist() here stores it serialized.

In [31]:
# In PySpark, persist() already stores the data serialized, so the *_SER storage
# levels are not applicable and StorageLevel has no MEMORY_ONLY_SER attribute.
# The AttributeError is the point of this cell, so it is caught and printed
# instead of being left to fail the cell; this keeps "Restart & Run All" working.
# Only the attribute lookup is guarded, so unrelated errors are not hidden.
try:
    ser_level = pyspark.StorageLevel.MEMORY_ONLY_SER
except AttributeError as exc:
    print(f'{type(exc).__name__}: {exc}')
else:
    # Reached only on a PySpark build that still exposes the level.
    df_persist_ser = df.persist(ser_level)
    df_persist_ser.write.format('noop').mode('overwrite').save()

AttributeError: type object 'StorageLevel' has no attribute 'MEMORY_ONLY_SER'

In [33]:
# Start from an empty cache, then persist the DataFrame to disk only.
spark.catalog.clearCache()
df_persist_disk = df.persist(pyspark.StorageLevel.DISK_ONLY)

# Dummy 'noop' write: an action that triggers the persist without writing data.
(
    df_persist_disk.write
    .format('noop')
    .mode('overwrite')
    .save()
)

# Storage tab: Disk Serialized 1x Replicated. The data is stored on disk only, not in memory.

In [34]:
# Let's try replication: start from an empty cache and persist with a 2x replicated level.
spark.catalog.clearCache()
df_persist_2 = df.persist(pyspark.StorageLevel.MEMORY_ONLY_2)

# Dummy 'noop' write: an action that triggers the persist without writing data.
(
    df_persist_2.write
    .format('noop')
    .mode('overwrite')
    .save()
)

# Storage tab: Memory Serialized 2x Replicated

In [35]:
# Clear the entire cache again before moving on.
spark.catalog.clearCache()

In [42]:
# Raw string for the Windows path: in a normal string literal '\A' is an invalid
# escape sequence, which current Python versions warn about. The path value
# itself is unchanged.
spark.sparkContext.setCheckpointDir(r'D:\AWS Udemy')
df121 = df.checkpoint()

# On this machine the call fails with the Py4JJavaError shown in the output, yet
# a file is still created in the checkpoint directory (noted as an ORC file;
# NOTE(review): confirm the actual file type).
# NOTE(review): an UnsatisfiedLinkError in NativeIO$Windows.access0 usually points
# to missing or mismatched Hadoop native Windows binaries (hadoop.dll /
# winutils.exe). TODO confirm the HADOOP_HOME setup.

Py4JJavaError: An error occurred while calling o137.checkpoint.
: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.spark.rdd.ReliableCheckpointRDD.getPartitions(ReliableCheckpointRDD.scala:74)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.rdd.ReliableCheckpointRDD$.writeRDDToCheckpointDirectory(ReliableCheckpointRDD.scala:179)
	at org.apache.spark.rdd.ReliableRDDCheckpointData.doCheckpoint(ReliableRDDCheckpointData.scala:60)
	at org.apache.spark.rdd.RDDCheckpointData.checkpoint(RDDCheckpointData.scala:75)
	at org.apache.spark.rdd.RDD.$anonfun$doCheckpoint$1(RDD.scala:1927)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDD.doCheckpoint(RDD.scala:1917)
	at org.apache.spark.sql.Dataset.$anonfun$checkpoint$1(Dataset.scala:736)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.checkpoint(Dataset.scala:727)
	at org.apache.spark.sql.Dataset.checkpoint(Dataset.scala:690)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
