### Create a Sample DataFrame

In [0]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["name", "subject", "marks", "status", "attendance"]

# Sample student records. None stands for a missing value, so the rows cover
# NULLs in individual columns as well as one row that is NULL in every column.
data = [
    ("Alice", "Math", 85, "Pass", 92),        # complete row
    ("Bob", "Science", None, "Fail", 80),     # marks missing
    ("Charlie", "English", 78, None, 88),     # status missing
    ("David", "Math", 90, "Pass", None),      # attendance missing
    ("Eva", "History", 65, "Fail", 75),       # complete row
    ("Frank", "Science", 55, "Fail", None),   # attendance missing
    ("Grace", "English", None, None, 95),     # marks and status missing
    (None, None, None, None, None),           # every column missing
]

# Only column names are supplied as the schema; no column types are given.
student_df = spark.createDataFrame(data, schema=columns)
student_df.show()

+-------+-------+-----+------+----------+
|   name|subject|marks|status|attendance|
+-------+-------+-----+------+----------+
|  Alice|   Math|   85|  Pass|        92|
|    Bob|Science| null|  Fail|        80|
|Charlie|English|   78|  null|        88|
|  David|   Math|   90|  Pass|      null|
|    Eva|History|   65|  Fail|        75|
|  Frank|Science|   55|  Fail|      null|
|  Grace|English| null|  null|        95|
|   null|   null| null|  null|      null|
+-------+-------+-----+------+----------+



### Filter records with NULL / NOT NULL values using `isNull()` and `isNotNull()`

In [0]:

# Keep only the rows whose "marks" value is present (NOT NULL).
#
# Other spellings of the same filter:
#   display(student_df.filter("marks IS NOT NULL"))        # SQL expression string
#
# The opposite selection (rows where "marks" IS NULL) uses isNull():
#   from pyspark.sql.functions import col
#   display(student_df.filter(col("marks").isNull()))
#   display(student_df.filter(student_df.marks.isNull()))
display(student_df.where(student_df.marks.isNotNull()))


name,subject,marks,status,attendance
Alice,Math,85,Pass,92.0
Charlie,English,78,,88.0
David,Math,90,Pass,
Eva,,65,Fail,75.0
Frank,Science,55,Fail,
Helen,Math,88,Pass,98.0


### Drop records with NULL values: ANY vs. ALL

In [0]:
# Remove every row that has a NULL in at least one column.
#
# Other spellings that give the same result:
#   display(student_df.dropna("any"))
#   display(student_df.na.drop("any"))
#   display(student_df.na.drop())      # also drops rows with ANY NULL
display(student_df.na.drop(how="any"))

name,subject,marks,status,attendance
Alice,Math,85,Pass,92
Eva,History,65,Fail,75


In [0]:
# how="all" removes a row only when every one of its columns is NULL.
display(student_df.dropna(how="all"))

name,subject,marks,status,attendance
Alice,Math,85.0,Pass,92.0
Bob,Science,,Fail,80.0
Charlie,English,78.0,,88.0
David,Math,90.0,Pass,
Eva,History,65.0,Fail,75.0
Frank,Science,55.0,Fail,
Grace,English,,,95.0


### Drop records with NULL in selected columns

In [0]:
# Limit the NULL check to the listed columns (one column or a combination);
# a row is removed when any of the listed columns is NULL.
display(student_df.na.drop(subset=["marks", "attendance"]))

name,subject,marks,status,attendance
Alice,Math,85,Pass,92
Charlie,English,78,,88
Eva,History,65,Fail,75


In [0]:
# Remove only the rows whose "status" is NULL; NULLs in other columns are kept.
display(student_df.na.drop(subset=["status"]))

name,subject,marks,status,attendance
Alice,Math,85.0,Pass,92.0
Bob,Science,,Fail,80.0
David,Math,90.0,Pass,
Eva,History,65.0,Fail,75.0
Frank,Science,55.0,Fail,


### Fill Value for All columns if NULL is present

In [0]:
# A numeric fill value is applied to the numeric columns only
# (marks, attendance); NULLs in the string columns are left as they are.
display(student_df.na.fill(0))

name,subject,marks,status,attendance
Alice,Math,85,Pass,92
Bob,Science,0,Fail,80
Charlie,English,78,,88
David,Math,90,Pass,0
Eva,History,65,Fail,75
Frank,Science,55,Fail,0
Grace,English,0,,95
,,0,,0


In [0]:
# A string fill value is applied to the string columns only
# (name, subject, status); NULLs in the numeric columns are left as they are.
# NOTE(review): the recorded output shows the string columns blank rather than
# "NA" -- possibly the export treats the literal "NA" as missing; confirm.
display(student_df.na.fill("NA"))

name,subject,marks,status,attendance
Alice,Math,85.0,Pass,92.0
Bob,Science,,Fail,80.0
Charlie,English,78.0,,88.0
David,Math,90.0,Pass,
Eva,History,65.0,Fail,75.0
Frank,Science,55.0,Fail,
Grace,English,,,95.0
,,,,


### Fill Value for Specific columns if NULL is present

In [0]:
# Replace NULLs with 0, but only in the two listed numeric columns.
display(student_df.na.fill(0, subset=["marks", "attendance"]))

name,subject,marks,status,attendance
Alice,Math,85,Pass,92
Bob,Science,0,Fail,80
Charlie,English,78,,88
David,Math,90,Pass,0
Eva,History,65,Fail,75
Frank,Science,55,Fail,0
Grace,English,0,,95
,,0,,0


In [0]:
# Replace NULLs with a placeholder text, but only in the listed string columns;
# marks and attendance keep their NULLs.
display(
    student_df.na.fill("Not Applicable", subset=["name", "subject", "status"])
)

name,subject,marks,status,attendance
Alice,Math,85.0,Pass,92.0
Bob,Science,,Fail,80.0
Charlie,English,78.0,Not Applicable,88.0
David,Math,90.0,Pass,
Eva,History,65.0,Fail,75.0
Frank,Science,55.0,Fail,
Grace,English,,Not Applicable,95.0
Not Applicable,Not Applicable,,Not Applicable,


In [0]:
# Per-column replacements: each key is a column name and each value is what
# replaces NULL in that column. "attendance" is not listed, so its NULLs stay.
display(
    student_df.na.fill(
        {
            "name": "UNKNOWN",
            "subject": "NO_SUBJECT",
            "marks": 0,
            "status": "NA",
        }
    )
)

name,subject,marks,status,attendance
Alice,Math,85,Pass,92.0
Bob,Science,0,Fail,80.0
Charlie,English,78,,88.0
David,Math,90,Pass,
Eva,History,65,Fail,75.0
Frank,Science,55,Fail,
Grace,English,0,,95.0
UNKNOWN,NO_SUBJECT,0,,
