# Spark SQL DataFrames - Learn by Coding Tutorial

## Setup and Environment

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [10]:
# 🔧 SOLUTION 1: Check if Spark session exists, create if needed
def get_spark_session():
    """Return the active SparkSession, building one if none is active.

    The active session for the current thread is reused when it exists.
    Otherwise a session named "SparkSQL_Learning" is obtained through
    ``SparkSession.builder...getOrCreate()`` with adaptive query execution
    enabled.

    Returns:
        SparkSession: the reused or newly obtained session.

    Raises:
        Exception: whatever ``getOrCreate()`` raises when a session cannot be
            built. The failure is propagated rather than being retried with
            the identical builder call.
    """

    def _build_session():
        # Single definition of the builder chain so both code paths agree.
        return (
            SparkSession.builder
            .appName("SparkSQL_Learning")
            .config("spark.sql.adaptive.enabled", "true")
            .getOrCreate()
        )

    try:
        active_session = SparkSession.getActiveSession()
    except Exception:
        # Best-effort probe: if looking up the active session fails, fall back
        # to the builder. `Exception` (not a bare `except`) keeps
        # KeyboardInterrupt and SystemExit from being swallowed.
        active_session = None

    if active_session is not None:
        return active_session
    return _build_session()

In [11]:
# Get or create Spark session.
# `spark` is the notebook-wide session handle used by every later cell.
spark = get_spark_session()
# Shorthand for the SparkContext that backs the session.
sc = spark.sparkContext

25/06/26 14:05:08 INFO SparkEnv: Registering MapOutputTracker
25/06/26 14:05:08 INFO SparkEnv: Registering BlockManagerMaster
25/06/26 14:05:08 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/06/26 14:05:09 INFO SparkEnv: Registering OutputCommitCoordinator


In [12]:
# Report the key facts about the session the rest of the notebook relies on.
spark_context = spark.sparkContext

print("✅ Spark Session Status:")
print(f"   App Name: {spark_context.appName}")
print(f"   Master: {spark_context.master}")
print(f"   Spark Version: {spark.version}")
# `_jsc` is an underscore-prefixed (non-public) handle; it is used here only
# to ask the underlying context whether it has been stopped.
print(f"   Active: {not spark_context._jsc.sc().isStopped()}")

✅ Spark Session Status:
   App Name: SparkSQL_Learning
   Master: yarn
   Spark Version: 3.5.3
   Active: True


## 1. Creating DataFrames

### From Lists/Tuples

In [13]:
# Sample rows, one tuple per employee:
# (id, name, department, salary, hire_date)
employees_data = [
    (1, "John", "Engineering", 75000, "2020-01-15"),
    (2, "Sarah", "Marketing", 65000, "2019-03-22"),
    (3, "Mike", "Engineering", 80000, "2021-06-10"),
    (4, "Lisa", "HR", 60000, "2018-11-05"),
    (5, "David", "Engineering", 85000, "2022-02-28"),
]

# Explicit schema, assembled one field at a time; every column is nullable.
# hire_date is kept as a string at this stage.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("department", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("hire_date", StringType(), True)
)

# Build the DataFrame from the rows and the schema, then display it.
df_employees = spark.createDataFrame(data=employees_data, schema=schema)
df_employees.show()

                                                                                

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  2|Sarah|  Marketing| 65000|2019-03-22|
|  3| Mike|Engineering| 80000|2021-06-10|
|  4| Lisa|         HR| 60000|2018-11-05|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+



### From Dictionary

In [14]:
# Alternative: one dictionary per row; column names and types are inferred
# from the dictionaries instead of being declared in a schema.
employees_dict = [
    dict(id=1, name="John", department="Engineering", salary=75000),
    dict(id=2, name="Sarah", department="Marketing", salary=65000),
    dict(id=3, name="Mike", department="Engineering", salary=80000),
]

df_from_dict = spark.createDataFrame(data=employees_dict)
df_from_dict.show()

+-----------+---+-----+------+
| department| id| name|salary|
+-----------+---+-----+------+
|Engineering|  1| John| 75000|
|  Marketing|  2|Sarah| 65000|
|Engineering|  3| Mike| 80000|
+-----------+---+-----+------+



## 2. Basic DataFrame Operations

### Viewing Data

In [15]:
# Display the rows: default view first, then limited to three rows.
df_employees.show()
df_employees.show(n=3)

# Column names, types and nullability.
df_employees.printSchema()

# Column names as a plain Python list.
column_names = df_employees.columns
print(column_names)

# Number of rows (runs a Spark job).
row_count = df_employees.count()
print(f"Total rows: {row_count}")

# Summary statistics (count / mean / stddev / min / max) for each column.
df_employees.describe().show()

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  2|Sarah|  Marketing| 65000|2019-03-22|
|  3| Mike|Engineering| 80000|2021-06-10|
|  4| Lisa|         HR| 60000|2018-11-05|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  2|Sarah|  Marketing| 65000|2019-03-22|
|  3| Mike|Engineering| 80000|2021-06-10|
+---+-----+-----------+------+----------+
only showing top 3 rows

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: string (nullable = true)

['id', 'name', 'department', 'salary', 'hire_date']
Total rows: 5


25/06/26 14:08:17 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------------+-----+-----------+------------------+----------+
|summary|                id| name| department|            salary| hire_date|
+-------+------------------+-----+-----------+------------------+----------+
|  count|                 5|    5|          5|                 5|         5|
|   mean|               3.0| NULL|       NULL|           73000.0|      NULL|
| stddev|1.5811388300841898| NULL|       NULL|10368.220676663861|      NULL|
|    min|                 1|David|Engineering|             60000|2018-11-05|
|    max|                 5|Sarah|  Marketing|             85000|2022-02-28|
+-------+------------------+-----+-----------+------------------+----------+



### Selecting Columns

In [16]:
# Project two columns by name.
df_employees.select(["name", "salary"]).show()

# Project with column expressions: the bonus is derived as 10% of salary.
df_employees.select(
    df_employees["name"],
    df_employees["salary"],
    (df_employees["salary"] * 0.1).alias("bonus"),
).show()

                                                                                

+-----+------+
| name|salary|
+-----+------+
| John| 75000|
|Sarah| 65000|
| Mike| 80000|
| Lisa| 60000|
|David| 85000|
+-----+------+

+-----+------+------+
| name|salary| bonus|
+-----+------+------+
| John| 75000|7500.0|
|Sarah| 65000|6500.0|
| Mike| 80000|8000.0|
| Lisa| 60000|6000.0|
|David| 85000|8500.0|
+-----+------+------+



## 3. Filtering Data

In [17]:
# Single condition: salary above 70000.
df_employees.filter(df_employees["salary"] > 70000).show()

# Two conditions, both required: chained filters are applied one after the
# other, so only rows passing both remain.
(
    df_employees
    .filter(df_employees["salary"] > 70000)
    .filter(df_employees["department"] == "Engineering")
    .show()
)

# `where` is an alias for `filter`; keep rows whose department is in the set.
df_employees.where(df_employees["department"].isin("Engineering", "HR")).show()

# String predicate: names beginning with "S".
df_employees.filter(df_employees["name"].startswith("S")).show()

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  3| Mike|Engineering| 80000|2021-06-10|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  3| Mike|Engineering| 80000|2021-06-10|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  3| Mike|Engineering| 80000|2021-06-10|
|  4| Lisa|         HR| 60000|2018-11-05|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+

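+---+-----+----------+------+----------+
| id| name|department|salary| hire_date|
+---+-----+----------+------+----------+
|  2|Sarah| Marketing| 65000|2019-03-22|
+---+-----+----------+------+----------+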

## 4. Adding and Modifying Columns

In [18]:
# One derived column: bonus is 10% of salary.
df_with_bonus = df_employees.withColumn("bonus", col("salary") * 0.1)
df_with_bonus.show()

# Several derived columns. The bonus column is the one already computed
# above, so the chain continues from df_with_bonus.
df_enhanced = (
    df_with_bonus
    .withColumn("total_compensation", col("salary") + col("bonus"))
    .withColumn("is_senior", col("salary") > 70000)
)
df_enhanced.show()

# Rename a column without touching its values.
df_renamed = df_employees.withColumnRenamed("department", "dept")
df_renamed.show()

+---+-----+-----------+------+----------+------+
| id| name| department|salary| hire_date| bonus|
+---+-----+-----------+------+----------+------+
|  1| John|Engineering| 75000|2020-01-15|7500.0|
|  2|Sarah|  Marketing| 65000|2019-03-22|6500.0|
|  3| Mike|Engineering| 80000|2021-06-10|8000.0|
|  4| Lisa|         HR| 60000|2018-11-05|6000.0|
|  5|David|Engineering| 85000|2022-02-28|8500.0|
+---+-----+-----------+------+----------+------+

+---+-----+-----------+------+----------+------+------------------+---------+
| id| name| department|salary| hire_date| bonus|total_compensation|is_senior|
+---+-----+-----------+------+----------+------+------------------+---------+
|  1| John|Engineering| 75000|2020-01-15|7500.0|           82500.0|     true|
|  2|Sarah|  Marketing| 65000|2019-03-22|6500.0|           71500.0|    false|
|  3| Mike|Engineering| 80000|2021-06-10|8000.0|           88000.0|     true|
|  4| Lisa|         HR| 60000|2018-11-05|6000.0|           66000.0|    false|
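|  5|David|Engineering| 85000|2022-02-28|8500.0|           93500.0|     true|
+---+-----+-----------+------+----------+------+------------------+---------+

+---+-----+-----------+------+----------+
| id| name|       dept|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  2|Sarah|  Marketing| 65000|2019-03-22|
|  3| Mike|Engineering| 80000|2021-06-10|
|  4| Lisa|         HR| 60000|2018-11-05|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+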

## 5. Grouping and Aggregations

In [19]:
# Per-department statistics. count/avg/max/min here are the Spark column
# functions brought in by the star import, not the Python builtins.
dept_stats = (
    df_employees
    .groupBy("department")
    .agg(
        count("*").alias("employee_count"),
        avg("salary").alias("avg_salary"),
        max("salary").alias("max_salary"),
        min("salary").alias("min_salary"),
    )
)
dept_stats.show()

# Sample project assignments for grouping on more than one column:
# (id, name, department, project, hours)
projects_data = [
    (1, "John", "Engineering", "ProjectA", 40),
    (1, "John", "Engineering", "ProjectB", 35),
    (2, "Sarah", "Marketing", "ProjectC", 45),
    (3, "Mike", "Engineering", "ProjectA", 50),
]

df_projects = spark.createDataFrame(
    projects_data,
    schema=["id", "name", "department", "project", "hours"],
)

# Total hours and number of rows for each (department, project) pair.
project_summary = (
    df_projects
    .groupBy("department", "project")
    .agg(
        sum("hours").alias("total_hours"),
        count("*").alias("people_count"),
    )
)
project_summary.show()

                                                                                

+-----------+--------------+----------+----------+----------+
| department|employee_count|avg_salary|max_salary|min_salary|
+-----------+--------------+----------+----------+----------+
|Engineering|             3|   80000.0|     85000|     75000|
|  Marketing|             1|   65000.0|     65000|     65000|
|         HR|             1|   60000.0|     60000|     60000|
+-----------+--------------+----------+----------+----------+





+-----------+--------+-----------+------------+
| department| project|total_hours|people_count|
+-----------+--------+-----------+------------+
|Engineering|ProjectB|         35|           1|
|Engineering|ProjectA|         90|           2|
|  Marketing|ProjectC|         45|           1|
+-----------+--------+-----------+------------+



                                                                                

## 7. Joins

In [20]:
# Lookup table to join against: (department, manager, location)
departments_data = [
    ("Engineering", "John Smith", "Building A"),
    ("Marketing", "Jane Doe", "Building B"),
    ("HR", "Bob Wilson", "Building C"),
]

df_departments = spark.createDataFrame(
    departments_data,
    schema=["department", "manager", "location"],
)

# Inner join on the shared column name.
joined_df = df_employees.join(df_departments, on="department", how="inner")
joined_df.show()

# Left join: every employee row is kept.
left_joined = df_employees.join(df_departments, on="department", how="left")
left_joined.show()

# Join on an explicit column-equality condition, then pick the output columns
# by hand: everything from employees plus manager and location.
explicit_join = (
    df_employees
    .join(
        df_departments,
        on=df_employees["department"] == df_departments["department"],
        how="inner",
    )
    .select(
        df_employees["*"],
        df_departments["manager"],
        df_departments["location"],
    )
)
explicit_join.show()

                                                                                

+-----------+---+-----+------+----------+----------+----------+
| department| id| name|salary| hire_date|   manager|  location|
+-----------+---+-----+------+----------+----------+----------+
|Engineering|  3| Mike| 80000|2021-06-10|John Smith|Building A|
|Engineering|  5|David| 85000|2022-02-28|John Smith|Building A|
|Engineering|  1| John| 75000|2020-01-15|John Smith|Building A|
|         HR|  4| Lisa| 60000|2018-11-05|Bob Wilson|Building C|
|  Marketing|  2|Sarah| 65000|2019-03-22|  Jane Doe|Building B|
+-----------+---+-----+------+----------+----------+----------+



                                                                                

+-----------+---+-----+------+----------+----------+----------+
| department| id| name|salary| hire_date|   manager|  location|
+-----------+---+-----+------+----------+----------+----------+
|Engineering|  1| John| 75000|2020-01-15|John Smith|Building A|
|  Marketing|  2|Sarah| 65000|2019-03-22|  Jane Doe|Building B|
|Engineering|  3| Mike| 80000|2021-06-10|John Smith|Building A|
|Engineering|  5|David| 85000|2022-02-28|John Smith|Building A|
|         HR|  4| Lisa| 60000|2018-11-05|Bob Wilson|Building C|
+-----------+---+-----+------+----------+----------+----------+



[Stage 50:>                                                         (0 + 2) / 2]

+---+-----+-----------+------+----------+----------+----------+
| id| name| department|salary| hire_date|   manager|  location|
+---+-----+-----------+------+----------+----------+----------+
|  3| Mike|Engineering| 80000|2021-06-10|John Smith|Building A|
|  5|David|Engineering| 85000|2022-02-28|John Smith|Building A|
|  1| John|Engineering| 75000|2020-01-15|John Smith|Building A|
|  4| Lisa|         HR| 60000|2018-11-05|Bob Wilson|Building C|
|  2|Sarah|  Marketing| 65000|2019-03-22|  Jane Doe|Building B|
+---+-----+-----------+------+----------+----------+----------+



                                                                                

## 8. Window Functions

In [21]:
from pyspark.sql.window import Window

# Window per department, highest salary first.
window_spec = Window.partitionBy("department").orderBy(desc("salary"))

# Three ranking functions evaluated over the same window.
df_with_window = (
    df_employees
    .withColumn("row_number", row_number().over(window_spec))
    .withColumn("rank", rank().over(window_spec))
    .withColumn("dense_rank", dense_rank().over(window_spec))
)
df_with_window.show()

# Running total: within each department, ordered by ascending salary, the
# frame spans from the first row up to and including the current row.
window_running = (
    Window
    .partitionBy("department")
    .orderBy("salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df_running_total = df_employees.withColumn(
    "running_total",
    sum("salary").over(window_running),
)
df_running_total.show()

+---+-----+-----------+------+----------+----------+----+----------+
| id| name| department|salary| hire_date|row_number|rank|dense_rank|
+---+-----+-----------+------+----------+----------+----+----------+
|  5|David|Engineering| 85000|2022-02-28|         1|   1|         1|
|  3| Mike|Engineering| 80000|2021-06-10|         2|   2|         2|
|  1| John|Engineering| 75000|2020-01-15|         3|   3|         3|
|  4| Lisa|         HR| 60000|2018-11-05|         1|   1|         1|
|  2|Sarah|  Marketing| 65000|2019-03-22|         1|   1|         1|
+---+-----+-----------+------+----------+----------+----+----------+



[Stage 59:>                                                         (0 + 1) / 1]

+---+-----+-----------+------+----------+-------------+
| id| name| department|salary| hire_date|running_total|
+---+-----+-----------+------+----------+-------------+
|  1| John|Engineering| 75000|2020-01-15|        75000|
|  3| Mike|Engineering| 80000|2021-06-10|       155000|
|  5|David|Engineering| 85000|2022-02-28|       240000|
|  4| Lisa|         HR| 60000|2018-11-05|        60000|
|  2|Sarah|  Marketing| 65000|2019-03-22|        65000|
+---+-----+-----------+------+----------+-------------+



                                                                                

## 9. SQL Queries on DataFrames

In [22]:
# Register both DataFrames as temporary views so they can be referenced by
# name ("employees", "departments") inside spark.sql(...) queries.
# createOrReplaceTempView overwrites a view of the same name, which keeps this
# cell safe to re-run.
df_employees.createOrReplaceTempView("employees")
df_departments.createOrReplaceTempView("departments")

# Aggregate query: head count and average salary per department, with the
# highest average first.
result1 = spark.sql("""
    SELECT department, 
           COUNT(*) as employee_count,
           AVG(salary) as avg_salary
    FROM employees 
    GROUP BY department
    ORDER BY avg_salary DESC
""")
result1.show()

# Join query: employees earning more than 70000, together with their
# department's manager and location, highest salary first.
result2 = spark.sql("""
    SELECT e.name, e.salary, e.department, d.manager, d.location
    FROM employees e
    JOIN departments d ON e.department = d.department
    WHERE e.salary > 70000
    ORDER BY e.salary DESC
""")
result2.show()

# Scalar subquery: employees whose salary exceeds the overall average salary.
result3 = spark.sql("""
    SELECT * FROM employees
    WHERE salary > (
        SELECT AVG(salary) FROM employees
    )
""")
result3.show()

                                                                                

+-----------+--------------+----------+
| department|employee_count|avg_salary|
+-----------+--------------+----------+
|Engineering|             3|   80000.0|
|  Marketing|             1|   65000.0|
|         HR|             1|   60000.0|
+-----------+--------------+----------+



                                                                                

+-----+------+-----------+----------+----------+
| name|salary| department|   manager|  location|
+-----+------+-----------+----------+----------+
|David| 85000|Engineering|John Smith|Building A|
| Mike| 80000|Engineering|John Smith|Building A|
| John| 75000|Engineering|John Smith|Building A|
+-----+------+-----------+----------+----------+

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  1| John|Engineering| 75000|2020-01-15|
|  3| Mike|Engineering| 80000|2021-06-10|
|  5|David|Engineering| 85000|2022-02-28|
+---+-----+-----------+------+----------+



## 10. Working with Dates

In [23]:
# Replace the string hire_date column with a real date column, parsed with
# the yyyy-MM-dd pattern.
df_with_dates = df_employees.withColumn(
    "hire_date",
    to_date(col("hire_date"), "yyyy-MM-dd"),
)
df_with_dates.printSchema()

# Derive year, month and elapsed days from the parsed date.
# days_since_hire is relative to the date the cell is run, so it changes
# from one run to the next.
df_date_ops = (
    df_with_dates
    .withColumn("hire_year", year("hire_date"))
    .withColumn("hire_month", month("hire_date"))
    .withColumn("days_since_hire", datediff(current_date(), "hire_date"))
)
df_date_ops.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: date (nullable = true)

+---+-----+-----------+------+----------+---------+----------+---------------+
| id| name| department|salary| hire_date|hire_year|hire_month|days_since_hire|
+---+-----+-----------+------+----------+---------+----------+---------------+
|  1| John|Engineering| 75000|2020-01-15|     2020|         1|           1989|
|  2|Sarah|  Marketing| 65000|2019-03-22|     2019|         3|           2288|
|  3| Mike|Engineering| 80000|2021-06-10|     2021|         6|           1477|
|  4| Lisa|         HR| 60000|2018-11-05|     2018|        11|           2425|
|  5|David|Engineering| 85000|2022-02-28|     2022|         2|           1214|
+---+-----+-----------+------+----------+---------+----------+---------------+



## 11. Handling Null Values

In [24]:
# Sample rows with missing values: (id, name, department, salary)
data_with_nulls = [
    (1, "John", "Engineering", 75000),
    (2, "Sarah", None, 65000),
    (3, None, "Engineering", None),
    (4, "Lisa", "HR", 60000),
]

df_nulls = spark.createDataFrame(
    data_with_nulls,
    schema=["id", "name", "department", "salary"],
)

# Inspect the raw rows, nulls included.
df_nulls.show()

# Keep only rows that have no null in any column.
df_nulls.na.drop().show()

# Keep only rows whose name is not null.
df_nulls.na.drop(subset=["name"]).show()

# Replace nulls with a per-column default.
null_defaults = {
    "name": "Unknown",
    "department": "Unassigned",
    "salary": 0,
}
df_filled = df_nulls.na.fill(null_defaults)
df_filled.show()

+---+-----+-----------+------+
| id| name| department|salary|
+---+-----+-----------+------+
|  1| John|Engineering| 75000|
|  2|Sarah|       NULL| 65000|
|  3| NULL|Engineering|  NULL|
|  4| Lisa|         HR| 60000|
+---+-----+-----------+------+

+---+----+-----------+------+
| id|name| department|salary|
+---+----+-----------+------+
|  1|John|Engineering| 75000|
|  4|Lisa|         HR| 60000|
+---+----+-----------+------+

+---+-----+-----------+------+
| id| name| department|salary|
+---+-----+-----------+------+
|  1| John|Engineering| 75000|
|  2|Sarah|       NULL| 65000|
|  4| Lisa|         HR| 60000|
+---+-----+-----------+------+

+---+-------+-----------+------+
| id|   name| department|salary|
+---+-------+-----------+------+
|  1|   John|Engineering| 75000|
|  2|  Sarah| Unassigned| 65000|
|  3|Unknown|Engineering|     0|
|  4|   Lisa|         HR| 60000|
+---+-------+-----------+------+



## 12. Advanced Operations

### Pivot Tables

In [25]:
# Sample sales figures: (quarter, region, product, sales)
sales_data = [
    ("Q1", "North", "Product A", 100),
    ("Q1", "South", "Product A", 80),
    ("Q2", "North", "Product A", 120),
    ("Q2", "South", "Product A", 90),
    ("Q1", "North", "Product B", 150),
    ("Q1", "South", "Product B", 130),
]

df_sales = spark.createDataFrame(
    sales_data,
    schema=["quarter", "region", "product", "sales"],
)

# One output row per (quarter, product); each distinct region becomes its own
# column holding the summed sales.
pivoted = (
    df_sales
    .groupBy("quarter", "product")
    .pivot("region")
    .sum("sales")
)
pivoted.show()

+-------+---------+-----+-----+
|quarter|  product|North|South|
+-------+---------+-----+-----+
|     Q1|Product B|  150|  130|
|     Q2|Product A|  120|   90|
|     Q1|Product A|  100|   80|
+-------+---------+-----+-----+



### User Defined Functions (UDFs)

In [26]:
from pyspark.sql.functions import udf

# Define UDF
def categorize_salary(salary):
    """Map a salary to a coarse band label.

    Args:
        salary: numeric salary, or None for a missing value.

    Returns:
        "High" for salaries of at least 80000, "Medium" for at least 70000,
        "Low" for anything smaller, and None when salary is None.
    """
    # A nullable column hands None to a Python UDF for null cells. Comparing
    # None with an int raises TypeError, so pass the null through instead.
    if salary is None:
        return None
    if salary >= 80000:
        return "High"
    elif salary >= 70000:
        return "Medium"
    else:
        return "Low"

# Register UDF
# Wrap the Python function as a Spark UDF that returns a string column.
categorize_udf = udf(categorize_salary, returnType=StringType())

# Apply the UDF to the salary column to derive a category label per row.
df_categorized = df_employees.withColumn(
    "salary_category",
    categorize_udf(col("salary")),
)

df_categorized.show()

                                                                                

+---+-----+-----------+------+----------+---------------+
| id| name| department|salary| hire_date|salary_category|
+---+-----+-----------+------+----------+---------------+
|  1| John|Engineering| 75000|2020-01-15|         Medium|
|  2|Sarah|  Marketing| 65000|2019-03-22|            Low|
|  3| Mike|Engineering| 80000|2021-06-10|           High|
|  4| Lisa|         HR| 60000|2018-11-05|            Low|
|  5|David|Engineering| 85000|2022-02-28|           High|
+---+-----+-----------+------+----------+---------------+



## Practice Exercises
### Try these exercises to reinforce your learning:

#### Exercise 1: Find the employee with the highest salary in each department
#### Exercise 2: Calculate the percentage of total salary each employee represents within their department
#### Exercise 3: Find departments where the average salary is above the company average
#### Exercise 4: Create a report showing month-over-month hiring trends
#### Exercise 5: Find employees whose names contain specific patterns

In [27]:
# Exercise 1: highest salary per department.
# Attach each department's maximum salary to every row, keep the rows that
# match it, then drop the helper column. Tied top earners are all kept.
window_max = Window.partitionBy("department")
exercise1 = (
    df_employees
    .withColumn("max_dept_salary", max("salary").over(window_max))
    .where(col("salary") == col("max_dept_salary"))
    .drop("max_dept_salary")
)
exercise1.show()

# Exercise 2: each employee's share of their department's total salary,
# expressed as a percentage rounded to two decimals.
window_dept = Window.partitionBy("department")
exercise2 = (
    df_employees
    .withColumn("dept_total", sum("salary").over(window_dept))
    .withColumn(
        "salary_percentage",
        round((col("salary") / col("dept_total")) * 100, 2),
    )
)
exercise2.show()

+---+-----+-----------+------+----------+
| id| name| department|salary| hire_date|
+---+-----+-----------+------+----------+
|  5|David|Engineering| 85000|2022-02-28|
|  4| Lisa|         HR| 60000|2018-11-05|
|  2|Sarah|  Marketing| 65000|2019-03-22|
+---+-----+-----------+------+----------+

+---+-----+-----------+------+----------+----------+-----------------+
| id| name| department|salary| hire_date|dept_total|salary_percentage|
+---+-----+-----------+------+----------+----------+-----------------+
|  1| John|Engineering| 75000|2020-01-15|    240000|            31.25|
|  3| Mike|Engineering| 80000|2021-06-10|    240000|            33.33|
|  5|David|Engineering| 85000|2022-02-28|    240000|            35.42|
|  4| Lisa|         HR| 60000|2018-11-05|     60000|            100.0|
|  2|Sarah|  Marketing| 65000|2019-03-22|     65000|            100.0|
+---+-----+-----------+------+----------+----------+-----------------+



                                                                                

## Performance Tips

### 1 - Use caching for DataFrames you'll reuse:

In [None]:
# Mark the DataFrame for caching; this call alone does not run a job.
df_employees.cache()
df_employees.count()  # Triggers caching: count() is an action, so the data is computed and stored

### 2 - Partition your data when reading from files:

In [None]:
# When reading from files
# "path/to/files" is a placeholder; point it at real JSON input before running.
# NOTE(review): this snippet only sets the JSON reader's "multiline" option and
# does not itself control partitioning, although the heading is about
# partitioning. Presumably a repartition or partitioned source layout was
# intended; confirm and extend the example.
df = spark.read.option("multiline", "true").json("path/to/files")

### 3 - Use appropriate file formats (Parquet is usually best):

In [None]:
# Persist the employees DataFrame as Parquet, replacing any earlier output at
# the same path.
df_employees.write.parquet("employees.parquet", mode="overwrite")

# Load it back from the same path.
df_read = spark.read.format("parquet").load("employees.parquet")

In [None]:
# Stop the Spark session. After this, `spark` (and every DataFrame created
# from it) can no longer be used; re-run the setup cells to obtain a new one.
spark.stop()