In [0]:
#Handling nulls and missing values effectively is crucial when reading CSV files into Spark DataFrames to ensure data integrity and prevent errors during data processing.

#1. Identifying Null Values
# A CSV file can represent missing data in several ways: NA, NULL, empty strings, or other placeholders. Spark's CSV reader provides options for mapping these representations to proper nulls.

#2. Using nullValue Option
# The nullValue option lets you specify which string in the CSV file should be interpreted as a null value.
# Example:

# SparkSession must be imported explicitly; without this the cell raises
# NameError on a fresh kernel.
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse the active one if a
# session already exists (getOrCreate never starts a second session).
spark = SparkSession.builder.appName("HandleNulls").getOrCreate()

# Read the CSV:
#   header=True      -> the first line supplies the column names
#   nullValue="NA"   -> the literal string NA is loaded as null
#   inferSchema=True -> column types are inferred from the data
df = spark.read.csv(
    "path/to/csvfile.csv",
    header=True,
    nullValue="NA",
    inferSchema=True,
)

df.show()

#In this example, any occurrence of NA in the CSV file will be treated as a null value.

In [0]:
# Using the nanValue option
# The nanValue option specifies which string in the CSV file should be interpreted as NaN (Not a Number).

# Load the same CSV, declaring both the null token and the NaN token.
# The reader options are collected in one mapping and unpacked into the call.
csv_options = {
    "header": True,
    "nullValue": "NA",
    "nanValue": "NaN",
    "inferSchema": True,
}
df = spark.read.csv("path/to/csvfile.csv", **csv_options)

df.show()
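#In this example, NA in the CSV file is treated as a null value, while NaN is parsed as the floating-point NaN (Not a Number) value in numeric columns. NaN is not the same as null: detect it with isnan() rather than isNull().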


In [0]:
# Dealing with missing values post read.

#Once the DataFrame is loaded, Spark provides various methods to handle missing values, such as 'fillna', 'dropna', and 'replace'.

# Per-column defaults: each key is a column name, each value is what replaces
# nulls in that column.
fill_defaults = {
    "age": 0,
    "name": "Unknown",
    "email": "no_email@example.com",
}
df_filled = df.fillna(value=fill_defaults)

df_filled.show()

#In this example, null values in the age column are replaced with 0, in the name column with "Unknown", and in the email column with "no_email@example.com".

In [0]:
# Remove every row that contains at least one null (how="any" is the default).
df_dropped = df.dropna(how="any")
df_dropped.show()

# Remove only the rows in which every column is null.
df_dropped_all = df.dropna(how="all")
df_dropped_all.show()

# Keep rows with at least 2 non-null values; rows with fewer are removed.
df_dropped_thresh = df.dropna(thresh=2)
df_dropped_thresh.show()

In [0]:
#Replacing Specific Values
#Sometimes, you might want to replace specific values, such as empty strings or specific placeholders, with nulls.

# Example:
# col and when are required by the expression below; the import was
# previously commented out, which made this cell fail with NameError.
from pyspark.sql.functions import col, when

# Replace empty strings in "name" with null; every other value (including
# existing nulls) passes through unchanged via otherwise().
df_replaced = df.withColumn(
    "name",
    when(col("name") == "", None).otherwise(col("name")),
)

df_replaced.show()
#In this example, empty strings in the name column are replaced with null.