In [None]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel

spark = SparkSession.builder.appName("PersistExample").getOrCreate()

# Load the CSV into a DataFrame; the first row supplies the column names
# and column types are inferred from the data.
df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)

# Keep only rows with age > 25 and the three columns used below.
filtered_df = df.filter(df.age > 25).select("name", "age", "salary")

# Persist the DataFrame since several actions reuse it. persist() only marks
# the DataFrame for caching; the data is materialized by the first action.
filtered_df.persist(StorageLevel.MEMORY_AND_DISK)

try:
    # Multiple actions on the same DataFrame.
    count = filtered_df.count()  # First computation - data gets cached

    # NOTE: collect() returns a list of Row objects, so each of these is a
    # one-element list holding the aggregate, not a bare number.
    avg_salary = filtered_df.agg({"salary": "avg"}).collect()  # Uses cached data
    max_age = filtered_df.agg({"age": "max"}).collect()  # Uses cached data
finally:
    # Release the cache even when one of the actions above raises; otherwise
    # the cached blocks stay allocated for the remainder of the Spark session.
    filtered_df.unpersist()