In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,when

| Scenario                     | Code Example                                                                                                                                | Description                                       | Expected Output                          |                                                 |
| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------- | ---------------------------------------- | ----------------------------------------------- |
| **Basic Condition**          | `python df.withColumn("result", when(col("score") >= 50, "Pass").otherwise("Fail")) `                                                       | Assign value based on a simple condition.         | `"Pass"` if score ≥ 50, else `"Fail"`    |                                                 |
| **Multiple Conditions**      | `python df.withColumn("grade", when(col("score") >= 85, "A").when(col("score") >= 70, "B").otherwise("C")) `                                | Chain multiple `when` conditions.                 | `"A"` for ≥85, `"B"` for ≥70, else `"C"` |                                                 |
| **Without `.otherwise()`**   | `python df.withColumn("status", when(col("score") >= 50, "Pass")) `                                                                         | No `.otherwise()` — unmatched rows return `null`. | `"Pass"` for score ≥ 50, else `null`     |                                                 |
| **Using Expressions**        | `python df.withColumn("bonus", when(col("score") > 80, col("score") * 0.1).otherwise(0)) `                                                  | Perform calculations inside `when`.               | Bonus = 10% of score if >80, else 0      |                                                 |
| **Handling Null Values**     | `python df.withColumn("status", when(col("score").isNull(), "No Score").otherwise("Checked")) `                                             | Explicitly handle nulls.                          | `"No Score"` if null, else `"Checked"`   |                                                 |
| **Complex Condition**        | `python df.withColumn("category", when((col("score") >= 50) & (col("score") < 70), "Average").otherwise("Other")) `                         | Combine multiple conditions using `&`, `\|`.      | `"Average"` if score in [50,70), else `"Other"` |                                                 |
| **SQL Equivalent**           | `python df.withColumn("result", when(col("score") >= 50, "Pass").otherwise("Fail")) `                                                       | Mimics SQL `CASE WHEN`.                           | `"Pass"` or `"Fail"` based on condition  |                                                 |
| **Chaining Multiple `when`** | `python df.withColumn("grade", when(col("score") >= 90, "A+").when(col("score") >= 80, "A").when(col("score") >= 70, "B").otherwise("C")) ` | Multiple checks, first match used.                | `"A+"`, `"A"`, `"B"`, or `"C"`           |                                                 |
| **Default Null Handling**    | `python df.withColumn("result", when(col("score") >= 50, "Pass")) `                                                                         | Without `.otherwise()`, unmatched rows = null.    | Null values for unmatched cases          |                                                 |


In [0]:
# Sample people used throughout the `when` examples.
# Each row is a (name, age, gender) tuple, matching `columns` below.
data = list(
    zip(
        ["Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona"],
        [15, 22, 35, 29, 42, 17],
        ["F", "M", "M", "F", "M", "F"],
    )
)

# Column names, in the same order as the tuple fields above.
columns = ["name", "age", "gender"]


In [0]:
# Build the demo DataFrame from the sample rows (column names come from
# `columns`; types are inferred from the data) and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+---+------+
|   name|age|gender|
+-------+---+------+
|  Alice| 15|     F|
|    Bob| 22|     M|
|Charlie| 35|     M|
|  Diana| 29|     F|
|  Ethan| 42|     M|
|  Fiona| 17|     F|
+-------+---+------+



In [0]:
# Add an `age_classification` label to every row.
#
# `when` branches are evaluated top to bottom and the first match wins, so
# each branch only needs an upper bound. The previous second branch was
# `age >= 15`, the exact complement of the first, which meant 'old' could
# never be assigned to any non-null age. Bounding 'adult' from above makes
# the `.otherwise('old')` branch reachable.
#
# NOTE(review): 60 is an illustrative cutoff for 'old' -- adjust to the
# intended business rule.
# NOTE: a null age matches neither comparison, so it also falls through to
# `.otherwise('old')`; add an explicit `isNull()` branch first if nulls can
# occur and need a different label.
df = df.withColumn(
    'age_classification',
    when(col('age') < 15, 'child')
    .when(col('age') < 60, 'adult')
    .otherwise('old')
)
display(df)

name,age,gender,age_classification
Alice,15,F,adult
Bob,22,M,adult
Charlie,35,M,adult
Diana,29,F,adult
Ethan,42,M,adult
Fiona,17,F,adult


In [0]:
# Show the rows whose `age` is missing, then the rows where it is present.
# `where` is an alias of `filter`.
df.where(col("age").isNull()).show()
df.where(col("age").isNotNull()).show()

+----+---+------+------------------+
|name|age|gender|age_classification|
+----+---+------+------------------+
+----+---+------+------------------+

+-------+---+------+------------------+
|   name|age|gender|age_classification|
+-------+---+------+------------------+
|  Alice| 15|     F|             adult|
|    Bob| 22|     M|             adult|
|Charlie| 35|     M|             adult|
|  Diana| 29|     F|             adult|
|  Ethan| 42|     M|             adult|
|  Fiona| 17|     F|             adult|
+-------+---+------+------------------+



| Function & Parameter              | Description                                           | Example Code                                        | Output Explanation                                                                                                      |
| --------------------------------- | ----------------------------------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
| **`fillna(value)`**               | Replace nulls with a single value in every column whose type matches the value | `df.fillna("Unknown").show()`                       | All `NULL` values in **all string columns** become `"Unknown"`. Numeric columns are left unchanged because the fill value is a string; pass a numeric value to fill numeric columns instead. |
| **`fillna(value, subset=[...])`** | Replace nulls with a single value in specific columns | `df.fillna("N/A", subset=["Department"]).show()`    | Only nulls in **Department** column are replaced with `"N/A"`.                                                          |
| **`fillna({col: val, ...})`**     | Replace nulls with different values per column        | `df.fillna({"Department": "N/A", "Age": 0}).show()` | Nulls in **Department** → `"N/A"`, nulls in **Age** → `0`.                                                              |
| **`na.fill(...)`**                | Same as `fillna`, alternate syntax                    | `df.na.fill("Unassigned", ["Department"]).show()`   | Equivalent to `fillna`. Preferred when chaining `.na` operations.                                                       |
| **`dropna(how="any")`**           | Drop row if **any column** is null                    | `df.dropna(how="any").show()`                       | Rows with **at least one null** are removed.                                                                            |
| **`dropna(how="all")`**           | Drop row if **all columns** are null                  | `df.dropna(how="all").show()`                       | Keeps rows unless **every column is null**.                                                                             |
| **`dropna(thresh=n)`**            | Keep rows with **at least n non-nulls**               | `df.dropna(thresh=2).show()`                        | Drops rows that have **less than 2 non-null values**.                                                                   |
| **`dropna(subset=[...])`**        | Drop rows if subset columns contain null              | `df.dropna(subset=["Department"]).show()`           | Removes rows where **Department is null**, ignores nulls in other cols.                                                 |
| **`na.drop(...)`**                | Same as `dropna`, alternate syntax                    | `df.na.drop(how="any", subset=["Age"]).show()`      | Equivalent to `dropna`.                                                                                                 |


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain (or reuse) a session for the fillna / na.fill walkthrough.
spark = SparkSession.builder.appName("FillNaExample").getOrCreate()

# Sample rows with gaps in both the string column (Department) and the
# numeric column (Age).
columns = ["Name", "Department", "Age"]
data = [
    ("Alice", None, 30),
    ("Bob", "HR", None),
    ("Charlie", "IT", 25),
    ("David", None, None),
]
df = spark.createDataFrame(data, schema=columns)

# fillna with one value, restricted to the Department column; Age is untouched.
df_filled_single = df.fillna(value="Unknown", subset=["Department"])
df_filled_single.show()

# fillna with a {column: replacement} mapping, so each column gets its own value.
df_filled_dict = df.fillna(value={"Department": "N/A", "Age": 0})
df_filled_dict.show()

# na.fill is the same operation reached through the `.na` accessor.
df_na_filled = df.na.fill(value="Unassigned", subset=["Department"])
df_na_filled.show()

# Shut the session down at the end of the example.
spark.stop()

+-------+----------+----+
|   Name|Department| Age|
+-------+----------+----+
|  Alice|   Unknown|  30|
|    Bob|        HR|NULL|
|Charlie|        IT|  25|
|  David|   Unknown|NULL|
+-------+----------+----+

+-------+----------+---+
|   Name|Department|Age|
+-------+----------+---+
|  Alice|       N/A| 30|
|    Bob|        HR|  0|
|Charlie|        IT| 25|
|  David|       N/A|  0|
+-------+----------+---+

+-------+----------+----+
|   Name|Department| Age|
+-------+----------+----+
|  Alice|Unassigned|  30|
|    Bob|        HR|NULL|
|Charlie|        IT|  25|
|  David|Unassigned|NULL|
+-------+----------+----+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain (or reuse) a session for the dropna walkthrough.
spark = SparkSession.builder.appName("DropNaExample").getOrCreate()

# Sample rows: some are complete, some miss Department, Age, or both.
columns = ["Name", "Department", "Age"]
data = [
    ("Alice", None, 30),
    ("Bob", "HR", None),
    ("Charlie", "IT", 25),
    ("David", None, None),
    ("Eve", "Finance", 28),
]

df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
df.show()

# (1) how="any": a single null anywhere in the row is enough to drop it.
df_drop_any = df.dropna(how="any")
print("Drop rows with ANY nulls:")
df_drop_any.show()

# (2) how="all": a row is dropped only when every column is null.
df_drop_all = df.dropna(how="all")
print("Drop rows where ALL columns are null (none in this case):")
df_drop_all.show()

# (3) subset: only the listed column is inspected for nulls.
df_drop_subset = df.dropna(subset=["Department"])
print("Drop rows with nulls in Department column:")
df_drop_subset.show()

# (4) thresh=2: a row survives only if it has at least 2 non-null values.
df_drop_thresh = df.dropna(thresh=2)
print("Drop rows with less than 2 non-null values:")
df_drop_thresh.show()

# (5) subset with several columns: with the default how="any", a null in
#     either listed column drops the row.
df_drop_multi_subset = df.dropna(how="any", subset=["Department", "Age"])
print("Drop rows with nulls in Department OR Age columns:")
df_drop_multi_subset.show()

# Shut the session down at the end of the example.
spark.stop()


Original DataFrame:
+-------+----------+----+
|   Name|Department| Age|
+-------+----------+----+
|  Alice|      NULL|  30|
|    Bob|        HR|NULL|
|Charlie|        IT|  25|
|  David|      NULL|NULL|
|    Eve|   Finance|  28|
+-------+----------+----+

Drop rows with ANY nulls:
+-------+----------+---+
|   Name|Department|Age|
+-------+----------+---+
|Charlie|        IT| 25|
|    Eve|   Finance| 28|
+-------+----------+---+

Drop rows where ALL columns are null (none in this case):
+-------+----------+----+
|   Name|Department| Age|
+-------+----------+----+
|  Alice|      NULL|  30|
|    Bob|        HR|NULL|
|Charlie|        IT|  25|
|  David|      NULL|NULL|
|    Eve|   Finance|  28|
+-------+----------+----+

Drop rows with nulls in Department column:
+-------+----------+----+
|   Name|Department| Age|
+-------+----------+----+
|    Bob|        HR|NULL|
|Charlie|        IT|  25|
|    Eve|   Finance|  28|
+-------+----------+----+

Drop rows with less than 2 non-null values:
+----