In [0]:

# Spark Session
from pyspark.sql import SparkSession
# Name the application and pick the master first, then layer the resource
# settings on top one by one before creating (or reusing) the session.
# NOTE(review): the master is local[*]; confirm that the executor-level
# settings below are actually honoured in that mode.
session_builder = SparkSession.builder.appName("Spark Memory Management").master("local[*]")
for conf_key, conf_value in (
    ("spark.executor.cores", 4),
    ("spark.cores.max", 8),
    ("spark.executor.memory", "512M"),
):
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()
spark

### Spark Memory Calculation per Executor

In [0]:
# JVM on-heap usable memory: this walkthrough takes it as 89% of the
# configured executor memory (spark.executor.memory = 512 MB).
on_heap_usable_mb = 512 * 0.89
print(f"The On-Heap Usable Memory is {on_heap_usable_mb} MB")

The On-Heap Usuable Memory is 455.68 MB


In [0]:
# Take the fixed 300 MB of reserved memory off the usable on-heap
# memory (455.68 MB) to get what is left for Spark and user data.
non_reserved_mb = 455.68 - 300
print(f"The On-Heap Non-Reserved Memory is {non_reserved_mb} MB")


The On-Heap Non-Reserved Memory is 155.68 MB


In [0]:
# Unified Spark memory (storage + execution): spark.memory.fraction,
# 0.6 by default, of the non-reserved on-heap memory (155.68 MB).
unified_memory_mb = 155.68 * 0.6
print(f"Total Spark Memory is {unified_memory_mb} MB")

Total Spark Memory is 93.408 MB


In [0]:
# User / undefined memory (not controlled by Spark): the remaining 40% of the
# non-reserved on-heap memory (155.68 MB) when spark.memory.fraction is 0.6.
# Rounded to 3 decimals so binary floating-point noise
# (62.272000000000006) does not leak into the printed figure.
undefined_memory_mb = round(155.68 * 0.4, 3)
print(f"Total Undefined Memory is {undefined_memory_mb} MB")

Total Undefined Memory is 62.272000000000006 MB


In [0]:
# Storage share of the unified Spark memory (93.408 MB), using
# spark.memory.storageFraction = 0.5.
storage_memory_mb = 93.408 * 0.5
print(f"Total Storage Memory is {storage_memory_mb} MB")


Total Storage Memory is 46.704 MB


In [0]:
# Memory per core inside one executor: the 46.704 MB pool split across that
# executor's 4 cores (spark.executor.cores = 4).
# With spark.cores.max = 8 this configuration corresponds to 8 / 4 = 2
# executors of 4 cores each, not 8 executors.
# NOTE(review): the session uses master local[*]; confirm how many executors
# actually exist in that mode.
cores_per_executor = 4
memory_per_core_mb = 46.704 / cores_per_executor
print(f"Each Core Memory is {memory_per_core_mb} MB")

Each Core Memory is 11.676 MB


### Out of Memory(OOM) Errors on Execution

In [0]:
# Disable Adaptive Query Execution (AQE) and broadcast joins
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
# The regular threshold is the one the planner uses when AQE is off; the
# adaptive-scoped key below is only consulted by the adaptive framework,
# which is disabled above, so on its own it would leave broadcast joins enabled.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.adaptive.autoBroadcastJoinThreshold", -1)

In [0]:
# Echo the three settings back, one per line, to confirm they were applied.
for conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.sql.adaptive.autoBroadcastJoinThreshold",
):
    print(spark.conf.get(conf_key))

false
false
-1


In [0]:
%%sh
# List the sample input files in long format, oldest first, with human-readable sizes.
ls -ltrh /data/datasets/oom_example/

total 118M
-rw-r--r-- 1 root root 12M Jul 21 09:00 file_xs.txt
-rw-r--r-- 1 root root 12M Jul 21 09:00 file_singleline_xs.txt
-rw-r--r-- 1 root root 22M Jul 21 09:00 file_s.txt
-rw-r--r-- 1 root root 75M Jul 21 09:00 cities.txt


In [0]:
# Read the single-line sample file (about 12 MB on disk) as a text DataFrame,
# addressing it explicitly through the file:// scheme.
df = spark.read.load(
    "file:///data/datasets/oom_example/file_singleline_xs.txt",
    format="text",
)

In [0]:
# Mark the DataFrame for caching, then run an action so the data is
# actually materialised; cached data is held in storage memory.
df.cache()
df.count()  # the file is a single line, so the count is 1

Out[13]: 1

In the above execution, we read a 12 MB file from disk (serialized). After the 'count' action, the data was deserialized and expanded to about 300 MB in memory. Check the Executors tab in the Spark UI for more detail; the data is also skewed.

In [0]:
# Print the column layout of the text DataFrame.
df.printSchema()

root
 |-- value: string (nullable = true)



In [0]:
# Word count over the file: lower-case each line, split it on single spaces,
# emit one row per token, then count how often each token occurs.
from pyspark.sql.functions import lower, split, explode, count, lit
dffinal = (
    df.select(explode(split(lower("value"), " ")).alias("explodedval"))
    .groupBy("explodedval")
    .agg(count(lit(1)).alias("cnt"))
)

- The 'explode' operation above multiplies the number of records, which can cause the memory to overflow!
- When that happens, the executor starts crashing and you will see an 'Out Of Memory' error in the Jobs tab of the Spark UI.

In [0]:
dffinal.show() # Action that runs the job; expected to fail here if it hits an 'Out Of Memory' error.

+-----------+------+
|explodedval|   cnt|
+-----------+------+
| keyboards.| 44193|
|       used| 44193|
|       lazy| 44193|
| landscape.| 44193|
|      green| 44193|
|        for| 44193|
|      jumps| 44193|
|  sparkling| 44193|
|   contains| 44193|
|     letter| 44193|
|    pangram| 44193|
|      water| 44193|
|        fox| 44193|
|    testing| 44193|
|   sentence| 44193|
|         it| 44193|
|      flows| 44193|
|        the|176772|
|       dog.| 44193|
|      clear| 44193|
+-----------+------+
only showing top 20 rows



In [0]:
# Now read the multi-line version of the same text and preview the first rows.
df1 = spark.read.load("file:///data/datasets/oom_example/file_xs.txt", format="text")
df1.show()

+--------------------+
|               value|
+--------------------+
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
+--------------------+
only showing top 20 rows



In [0]:
# Print the column layout of the multi-line text DataFrame.
df1.printSchema()

root
 |-- value: string (nullable = true)



In [0]:
# Same word count as before, this time over the multi-line file: lower-case,
# split on single spaces, one row per token, then count per token.
from pyspark.sql.functions import lower, split, explode, count, lit
dffinal1 = (
    df1.select(explode(split(lower("value"), " ")).alias("explodedval"))
    .groupBy("explodedval")
    .agg(count(lit(1)).alias("cnt"))
)

In [0]:
# Action that runs the word count over the multi-line file and shows the first rows.
dffinal1.show()

+-----------+------+
|explodedval|   cnt|
+-----------+------+
| keyboards.| 44193|
|       used| 44193|
|       lazy| 44193|
| landscape.| 44193|
|      green| 44193|
|        for| 44193|
|      jumps| 44193|
|  sparkling| 44193|
|   contains| 44193|
|     letter| 44193|
|    pangram| 44193|
|      water| 44193|
|        fox| 44193|
|    testing| 44193|
|   sentence| 44193|
|         it| 44193|
|      flows| 44193|
|        the|176772|
|       dog.| 44193|
|      clear| 44193|
+-----------+------+
only showing top 20 rows



In [0]:
# Read the larger multi-line sample file and preview the first rows.
df2 = spark.read.load("file:///data/datasets/oom_example/file_s.txt", format="text")
df2.show()

+--------------------+
|               value|
+--------------------+
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
|The quick brown f...|
+--------------------+
only showing top 20 rows



In [0]:
# Word count over the larger file: lower-case, split on single spaces,
# one row per token, then count per token.
from pyspark.sql.functions import lower, split, explode, count, lit
dffinal2 = (
    df2.select(explode(split(lower("value"), " ")).alias("explodedval"))
    .groupBy("explodedval")
    .agg(count(lit(1)).alias("cnt"))
)

In [0]:
# Action that runs the word count over the larger file and shows the first rows.
dffinal2.show()

+-----------+------+
|explodedval|   cnt|
+-----------+------+
| keyboards.| 84369|
|       used| 84369|
|       lazy| 84369|
| landscape.| 84369|
|      green| 84369|
|        for| 84369|
|      jumps| 84369|
|  sparkling| 84369|
|   contains| 84369|
|     letter| 84369|
|    pangram| 84369|
|      water| 84369|
|        fox| 84369|
|    testing| 84369|
|   sentence| 84369|
|         it| 84369|
|      flows| 84369|
|        the|337476|
|       dog.| 84369|
|      clear| 84369|
+-----------+------+
only showing top 20 rows



In [0]:
# Run the whole job end to end against the 'noop' sink, which simulates a
# write without persisting any output.
dffinal2.write.save(format="noop", mode="overwrite")

- The option `wholetext=True` reads the contents of the whole file as a single record, as shown below.
- It loaded the multiline file as a single record! So now, if we run it with explode, it will again fail with OOM errors!
- So, you have to be careful when using options!

In [0]:
# Read the multi-line file again with wholetext enabled and preview the result.
df4 = spark.read.load(
    "file:///data/datasets/oom_example/file_xs.txt",
    format="text",
    wholetext=True,
)
df4.show()

+--------------------+
|               value|
+--------------------+
|The quick brown f...|
+--------------------+



NOTE:
- Sometimes '**Bad Code**' can also cause an '**OOM (Out Of Memory)**' error.
- So, expanding your memory is not always the way to work it out.
- Also, OOM errors can still be encountered even if 'AQE' is enabled (because the underlying logic remains the same in the background).
- It is also very important to understand how memory works and how BAD CODE can lead to OOM errors.