In [1]:
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T

# Create (or reuse) a Spark session named "nulls" with master "local[*]".
spark = (
    SparkSession.builder
    .appName("nulls")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Sample rows with missing values (None) in both the Name and Age columns.
data = [
    ("Alice", 30),
    ("Bob", None),
    ("Catherine", 25),
    (None, 35),
    ("Eve", None),
]
columns = ["Name", "Age"]

# Build the DataFrame from the rows and the column names, then print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+----+
|     Name| Age|
+---------+----+
|    Alice|  30|
|      Bob|null|
|Catherine|  25|
|     null|  35|
|      Eve|null|
+---------+----+



 # Dropping Null Values:

In [7]:
# Drop every row that has a null in at least one column ("any" is the default mode).
df.na.drop(how="any").show()

# Drop a row only when all of its columns are null (no row here qualifies).
df.na.drop(how="all").show()

# Drop rows that have a null in the listed columns only (here: Age).
df.na.drop(subset=["Age"]).show()

+---------+---+
|     Name|Age|
+---------+---+
|    Alice| 30|
|Catherine| 25|
+---------+---+

+---------+----+
|     Name| Age|
+---------+----+
|    Alice|  30|
|      Bob|null|
|Catherine|  25|
|     null|  35|
|      Eve|null|
+---------+----+

+---------+---+
|     Name|Age|
+---------+---+
|    Alice| 30|
|Catherine| 25|
|     null| 35|
+---------+---+



# Filling Null Values
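- `df.na.fill(value)`: Replaces null values with the given value in every column whose data type matches the value's type (for example, a string value only fills string columns); columns of other types are left unchanged.
- `df.na.fill(value, subset=['column1'])`: Replaces null values only in the listed columns; listed columns whose data type does not match the value are ignored.
- `df.na.fill({'column1': value1, 'column2': value2})`: Fills nulls in different columns with different values using a dictionary that maps column names to replacement values.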

In [8]:
# Fill nulls with the string "yo". Only string columns (Name) are filled;
# the numeric Age column keeps its nulls because the value type does not match.
df.na.fill(value="yo").show()

+---------+----+
|     Name| Age|
+---------+----+
|    Alice|  30|
|      Bob|null|
|Catherine|  25|
|       yo|  35|
|      Eve|null|
+---------+----+



In [13]:
# Restrict the fill to a subset of columns. Age is an integer column, so the
# string "yo" cannot be written into it and the nulls are left as they are.
df.na.fill("yo", subset=["Age"]).show()

# A dict maps each column name to its own replacement value.
df.na.fill({"Name": "yo", "Age": 999}).show()

+---------+----+
|     Name| Age|
+---------+----+
|    Alice|  30|
|      Bob|null|
|Catherine|  25|
|     null|  35|
|      Eve|null|
+---------+----+

+---------+---+
|     Name|Age|
+---------+---+
|    Alice| 30|
|      Bob|999|
|Catherine| 25|
|       yo| 35|
|      Eve|999|
+---------+---+



# Filtering Null Values:
Use where or filter with isNotNull() or isNull() to select rows based on null presence.

In [16]:
# Rows where Age is missing.
df.where(df["Age"].isNull()).show()

# Rows where Age is present.
df.where(df["Age"].isNotNull()).show()

+----+----+
|Name| Age|
+----+----+
| Bob|null|
| Eve|null|
+----+----+

+---------+---+
|     Name|Age|
+---------+---+
|    Alice| 30|
|Catherine| 25|
|     null| 35|
+---------+---+



In [17]:
# Stop the Spark session.
spark.stop()