In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MySparkApp").master("local[*]").getOrCreate()
from pyspark.sql.functions import col,count,avg, month, year, to_date, current_date, when
spark

1. Enable Spark Logging

In [4]:
spark.sparkContext.setLogLevel("INFO") # Options : All, DEBUG, INFO, WARN, ERROR
# Helps reduce or increase log verbosity for better debugging

2. Check Spark Configuration at Runtime

In [7]:
spark.sparkContext.getConf().getAll()
# USed to verify memory, cores, shuffle config

[('spark.app.id', 'local-1747392811311'),
 ('spark.driver.port', '57770'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.sql.warehouse.dir',

3. Executor Memory Debug

In [6]:
--conf spark.executor.memory = 2g
--conf spark.driver.memory = 1g
# Used when facing OutOfMemory errors

SyntaxError: invalid decimal literal (1039121640.py, line 1)

4. Shuffle Debugging - Too much Shuffle

In [8]:
df = df.repartition(100,"key") # beofre groupBy or join
# Avoids default small partitions that cause skew

NameError: name 'df' is not defined

5. Handling Skewed Join Keys

In [9]:
from pyspark.sql.functions import broadcast
result = big_df.join(broadcast(small_df),"key")
#Fixes long tasks dues to skew

NameError: name 'big_df' is not defined

6. Catch Read Failures

In [10]:
try:
    df = spark.read.csv("path/does/not/exist")
except Exception as e:
    print("Failed to read : ", e)
# Good for detecting bad paths or file formats

Failed to read :  [PATH_NOT_FOUND] Path does not exist: file:/C:/Users/Administrator/Big Data Class/Python/Pyspark/path/does/not/exist.


7. Memory Usage via Executors tab

Use Spark UI at localhost:4040 > Executors tab
Diagnoses executor memory issues visually

8. Monitor Job Execution Time

In [12]:
from time import time 
start = time()
df.count()
print("Execution Time : ", time() - start)
# Useful to measure slow operations

NameError: name 'df' is not defined

9. Use .persist() Wisely

In [13]:
df = df.filter("status = 'active'").persist()
#Avoid caching raw or large unfiltered dataset

NameError: name 'df' is not defined

10. Analyze Task and Duration

View task skew in Spark UI>Stages>Tasks

Useful to detect long-runnning tasks

11. Avoid Exploding Memory in Collect

In [14]:
#Bad 
all_rows = df.collect()

# Better
df.show(10)

#Prevents memory issues on the driver

NameError: name 'df' is not defined

12. Check Number of Partitions 

In [15]:
print(df.rdd.getNUMPartitions())
# Too few partitions cause bottlenecks. Too many = overhead

NameError: name 'df' is not defined

13. Debug Join Type Issues

In [16]:
df1.join(df2,"id","inner").explain(True)
# Check for broadcast hint or sort-merge join issues

NameError: name 'df1' is not defined

14. Investigate Lazy Evaluation Problems

In [18]:
df.printSchema() #Doesnt Trigger job
df.count() #Triggers full execution
# Use actions to debug transformations

NameError: name 'df' is not defined

15. Track Failed Jobs

Use Spark UI>Jobs tab>click failed job>review stderr logs

Best for tracking OOM, file read, schema mismatch errors