# Spark DataFrame Missing Data ~~ na

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName('Missing_Data')
    .getOrCreate()
)

In [3]:
# Load the sample CSV: column names come from the header row and
# column types are inferred from the data.
df = spark.read.csv(
    'ContainsNull.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded data, including its null entries.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# drop all rows with missing data

In [7]:
# With no arguments, every row that contains a null value is dropped.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



# keep only rows with at least 2 non-null values (drop the rest)

In [8]:
# thresh=2 keeps rows that have at least 2 non-null values;
# rows with fewer non-null values are dropped.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# drop rows based on `how` (any / all nulls in a row) and `subset` (specific columns)

In [9]:
# `how` is a keyword argument: 'any' drops a row if at least one of its values is null.
df.na.drop(how='any').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# how='all' drops a row only when every one of its values is null;
# no row in this data is entirely null, so nothing is removed.
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [11]:
# subset limits the null check to the listed columns:
# only rows with a null in Sales are dropped.
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



# fill missing values

In [12]:
# Inspect the column types before filling: Id and Name are strings, Sales is a double.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [13]:
# A string fill value only replaces nulls in string columns (Name here);
# the numeric Sales column keeps its nulls.
df.na.fill('Fill Values').show()

+----+-----------+-----+
|  Id|       Name|Sales|
+----+-----------+-----+
|emp1|       John| null|
|emp2|Fill Values| null|
|emp3|Fill Values|345.0|
|emp4|      Cindy|456.0|
+----+-----------+-----+



In [14]:
# A numeric fill value only replaces nulls in numeric columns (Sales here);
# the string Name column keeps its nulls.
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [15]:
# subset restricts the fill to the listed columns.
df.na.fill('No Name', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



# fill missing values in a column with that column's mean

In [16]:
from pyspark.sql.functions import mean

In [18]:
# Aggregate the Sales column to its mean; collect() returns the
# result as a list of Row objects (a single Row here).
mean_val = (
    df
    .select(mean(df['Sales']))
    .collect()
)

mean_val

[Row(avg(Sales)=400.5)]

In [19]:
# The first (and only) Row of the aggregate result.
mean_val[0]

Row(avg(Sales)=400.5)

In [25]:
# A Row can be converted to a dict keyed by column name.
new_mean_val = mean_val[0].asDict()
new_mean_val

{'avg(Sales)': 400.5}

In [26]:
# Positional indexing into the Row yields the bare mean value as a float.
mean_sales = mean_val[0][0]
mean_sales

400.5

In [27]:
# Replace the nulls in Sales with the column mean computed above.
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [28]:
# All at once: compute the Sales mean and use it as the fill value
# in a single expression.
df.na.fill(
    df.select(mean(df['Sales'])).collect()[0][0],
    subset=['Sales'],
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [None]:
# Reference: types of the intermediate values used above.
#   type(mean_val)        -> list
#   type(mean_val[0])     -> pyspark.sql.types.Row
#   type(mean_val[0][0])  -> float