# PySpark fillna() & fill() – Replace NULL/None Values

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException

# Reuse the active SparkSession if there is one; otherwise create it under this app name.
spark = SparkSession.builder.appName("example-fill-na").getOrCreate()

In [None]:
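# Example shell command for launching PySpark on YARN with the CSV reader jars.
# --jars takes ONE comma-separated list; repeating the flag keeps only the last value.
#
# pyspark --master yarn \
#         --deploy-mode client \
#         --num-executors 5 \
#         --executor-cores 1 \
#         --executor-memory 1G \
#         --jars ./spark-csv_2.11-1.4.0.jar,./commons-csv-1.4.jar,./univocity-parsers-2.2.1.jar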

In [None]:

# Location of the sample CSV, relative to the notebook's working directory.
filePath = "../files/small_zipcode.csv"

# header=true      -> take column names from the first line of the file.
# inferSchema=true -> let Spark derive column types from the data. Without it the
#                     CSV reader types every column as string, and the numeric
#                     na.fill(0) examples that follow would silently change nothing.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(filePath)
)

# Show the resulting schema and the untruncated rows so the nulls are visible.
df.printSchema()
df.show(truncate=False)


## PySpark fillna() & fill() Syntax

In [None]:
# Replace null with 0 in every numeric column (columns of other types are left as-is).
df.na.fill(value=0).show()

# Replace null with 0 in the population column only.
df.na.fill(value=0, subset=["population"]).show()

# Replace null with an empty string in every string column.
# Python needs truncate=False here; a bare `false` is Scala syntax and raises NameError.
df.na.fill("").show(truncate=False)

# Chain two fills to give each column its own replacement value.
df.na.fill("unknown", ["city"]).na.fill("", ["type"]).show()

# Same per-column replacement in a single call, using a column -> value mapping.
df.na.fill({"city": "unknown", "type": ""}).show()



In [10]:
# Small in-memory sample: None marks a missing value (one missing lname, one missing department).
data = [
    (101, 'prashant', 'pandey', 'accounts'),
    (102, 'abdul', None, 'support'),
    (103, 'M David', 'turner', None),
]

# Column names are supplied explicitly; column types are inferred from the tuples.
df = spark.createDataFrame(
    data=data,
    schema=['id', 'fname', 'lname', 'department'],
)
df.show()

+---+--------+------+----------+
| id|   fname| lname|department|
+---+--------+------+----------+
|101|prashant|pandey|  accounts|
|102|   abdul|  null|   support|
|103| M David|turner|      null|
+---+--------+------+----------+



In [11]:
# Fill nulls through the DataFrame's `.na` accessor; no subset is given,
# so the replacement is not restricted to particular columns.
df1 = df.na.fill(value="Unknown")
df1.show()


+---+--------+-------+----------+
| id|   fname|  lname|department|
+---+--------+-------+----------+
|101|prashant| pandey|  accounts|
|102|   abdul|Unknown|   support|
|103| M David| turner|   Unknown|
+---+--------+-------+----------+



In [12]:
# Deliberate failure demo: fill() is only reachable through the `.na` accessor
# (df.na.fill); a DataFrame itself has no `fill` attribute.
# The error is caught and printed so the notebook still completes under "Run All".
try:
    df1 = df.fill("Unknown")
    df1.show()
except AttributeError as err:
    print("AttributeError: {}".format(err))


AttributeError: 'DataFrame' object has no attribute 'fill'

In [13]:
# fillna() is called directly on the DataFrame, with no `.na` accessor in between.
df1 = df.fillna(value="Unknown")
df1.show()

+---+--------+-------+----------+
| id|   fname|  lname|department|
+---+--------+-------+----------+
|101|prashant| pandey|  accounts|
|102|   abdul|Unknown|   support|
|103| M David| turner|   Unknown|
+---+--------+-------+----------+



In [14]:
# Deliberate failure demo: the second positional argument of fillna() is `subset`
# (the column names to fill), so 'all' is looked up as a column name and cannot be resolved.
# The error is caught and printed so the notebook still completes under "Run All".
try:
    df1 = df.fillna("Unknown", 'all')
    df1.show()
except AnalysisException as err:
    print("AnalysisException: {}".format(err))

AnalysisException: Cannot resolve column name "all" among (id, fname, lname, department);

In [16]:
# Pack fname and lname into a single array column named PersonalDetails,
# keeping id and department alongside it.
df1 = df.selectExpr(
    "id",
    "Array(fname,lname) as PersonalDetails",
    "department",
)
# truncate=0 prints the cell values at full width.
df1.show(truncate=0)

+---+------------------+----------+
|id |PersonalDetails   |department|
+---+------------------+----------+
|101|[prashant, pandey]|accounts  |
|102|[abdul,]          |support   |
|103|[M David, turner] |null      |
+---+------------------+----------+



In [19]:
# Pack fname and lname into a single struct column named PersonalDetails,
# keeping id and department alongside it.
df1 = df.selectExpr(
    "id",
    "Struct(fname,lname) as PersonalDetails",
    "department",
)
# truncate=0 prints the cell values at full width.
df1.show(truncate=0)


+---+------------------+----------+
|id |PersonalDetails   |department|
+---+------------------+----------+
|101|[prashant, pandey]|accounts  |
|102|[abdul,]          |support   |
|103|[M David, turner] |null      |
+---+------------------+----------+

