## Filter Columns with None or Null Values


DataFrames often contain NULL/None values in their columns. In many cases these values
must be handled before any other operation is performed on the DataFrame; to get the
desired result, we first have to filter the NULL values out of the DataFrame.
- `df.filter(condition)`: returns a new DataFrame containing only the rows that
  satisfy the given condition.
- `df.column_name.isNotNull()`: builds a condition that is true for rows whose value in
  that column is not NULL/None; pass it to `df.filter` to keep only those rows.


In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import col,lit
from pyspark import SQLContext
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook, with the driver
# host pinned to localhost.
spark = (
    SparkSession.builder
    .config("spark.driver.host", "localhost")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# A default SparkConf passed through the session builder a second time.
# NOTE(review): despite its name, `spark_context` holds whatever
# getOrCreate() returns (a SparkSession, presumably the same one as `spark`),
# not a SparkContext. Confirm whether any later cell relies on it.
conf = pyspark.SparkConf()
spark_context = SparkSession.builder.config(conf=conf).getOrCreate()

In [15]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import StructType,StructField,StringType,IntegerType


In [16]:
# Sample student records; None marks a missing "Class ID" or
# "Extracurricular" value.
data2 = [
    ("Pulkit", 12, "CS32", 82, "Programming"),
    ("Ritika", 20, "CS32", 94, "Writing"),
    ("Atirikt", 4, "BB21", 78, None),
    ("Reshav", 18, None, 56, None),
]

# NOTE(review): the first cell already creates a session, so getOrCreate()
# presumably hands back that same session and the appName given here does not
# rename the running application. Kept so this cell can run on its own.
spark = SparkSession.builder.appName("Student_Info").getOrCreate()

# Explicit schema; the third StructField argument (True) marks every column
# as nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Name", StringType()),
        ("Roll Number", IntegerType()),
        ("Class ID", StringType()),
        ("Marks", IntegerType()),
        ("Extracurricular", StringType()),
    ]
])
df = spark.createDataFrame(data=data2, schema=schema)

In [17]:
# Keep only the rows with no NULL in any column (how="any" drops a row as
# soon as one of its values is NULL) and print them untruncated.
df.dropna(how="any").show(truncate=False)

+------+-----------+--------+-----+---------------+
|Name  |Roll Number|Class ID|Marks|Extracurricular|
+------+-----------+--------+-----+---------------+
|Pulkit|12         |CS32    |82   |Programming    |
|Ritika|20         |CS32    |94   |Writing        |
+------+-----------+--------+-----+---------------+



In [18]:
# Sample actor records: "middlename" is None in every row and "lastname"
# is None in one row.
actor_data = [
    ("James", None, "Bond", "M", 6000),
    ("Michael", None, None, "M", 4000),
    ("Robert", None, "Pattinson", "M", 4000),
    ("Natalie", None, "Portman", "F", 4000),
    ("Julia", None, "Roberts", "F", 1000),
]

# Explicit schema; every column is declared nullable (third argument True).
actor_schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("firstname", T.StringType()),
        ("middlename", T.StringType()),
        ("lastname", T.StringType()),
        ("gender", T.StringType()),
        ("salary", T.IntegerType()),
    ]
])

# Rebinds `df`: from here on it refers to the actor DataFrame, not the
# student DataFrame built in the earlier cell.
df = spark.createDataFrame(data=actor_data, schema=actor_schema)
df.show(truncate=False)

+---------+----------+---------+------+------+
|firstname|middlename|lastname |gender|salary|
+---------+----------+---------+------+------+
|James    |NULL      |Bond     |M     |6000  |
|Michael  |NULL      |NULL     |M     |4000  |
|Robert   |NULL      |Pattinson|M     |4000  |
|Natalie  |NULL      |Portman  |F     |4000  |
|Julia    |NULL      |Roberts  |F     |1000  |
+---------+----------+---------+------+------+



In [19]:
import pyspark.sql.functions as F
# Count the NULLs in every column with a single aggregation pass.
# when(...) without otherwise() yields NULL for rows where the condition is
# false, and count() skips NULLs, so each aggregate counts exactly the rows
# in which that column is NULL. The single result row is turned into a
# {column name: null count} dict.
null_counts = (
    df.select([
        F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in df.columns
    ])
    .collect()[0]
    .asDict()
)
print(null_counts)

{'firstname': 0, 'middlename': 5, 'lastname': 1, 'gender': 0, 'salary': 0}


In [20]:
# Total number of rows in `df`; compared against the per-column NULL counts
# in the next cell to find columns that are NULL in every row.
df_size = df.count()

In [21]:
# A column whose NULL count equals the row count holds nothing but NULLs;
# collect the names of all such columns.
to_drop = [
    column_name
    for column_name, null_count in null_counts.items()
    if null_count == df_size
]
print(to_drop)

['middlename']


In [22]:
# Drop the columns listed in `to_drop` (unpacked into separate arguments)
# and bind the result to a new name, leaving `df` itself unchanged.
output_df = df.drop(*to_drop)
# Print the remaining columns without truncating cell values.
output_df.show(truncate=False)

+---------+---------+------+------+
|firstname|lastname |gender|salary|
+---------+---------+------+------+
|James    |Bond     |M     |6000  |
|Michael  |NULL     |M     |4000  |
|Robert   |Pattinson|M     |4000  |
|Natalie  |Portman  |F     |4000  |
|Julia    |Roberts  |F     |1000  |
+---------+---------+------+------+

