# Spark DataFrame
## Missing Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean

In [2]:
# Create the Spark session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("missing")
    .getOrCreate()
)

In [3]:
# Load the sample CSV: the first line holds the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    "../data/ContainsNull.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded rows; both Name and Sales contain nulls.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| NULL|
|emp2| NULL| NULL|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Different ways to drop NA values

In [5]:
# `thresh` is the minimum number of non-null values a row must have to be kept.
# With thresh=2, emp2 is dropped because only its Id is non-null.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| NULL|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# how="all" drops a row only when every value in it is null.
# No row here is entirely null (Id is always set), so nothing is dropped.
df.na.drop(how="all").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| NULL|
|emp2| NULL| NULL|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [7]:
# `subset` restricts the null check to the listed columns:
# drop every row whose Sales value is null.
df.na.drop(subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [8]:
# Inspect the column types that were inferred when the CSV was read.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



## Spark fills NA values based on each column's data type

In [9]:
# A numeric fill value is applied only to numeric columns (here Sales, a double);
# string columns such as Name keep their nulls.
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| NULL|  0.0|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# A string fill value is applied only to string columns;
# `subset` limits the fill to the listed columns (here just Name).
df.na.fill("John Doe", subset=["Name"]).show()

+----+--------+-----+
|  Id|    Name|Sales|
+----+--------+-----+
|emp1|    John| NULL|
|emp2|John Doe| NULL|
|emp3|John Doe|345.0|
|emp4|   Cindy|456.0|
+----+--------+-----+



### Using the mean to fill the NA values in the Sales column

In [11]:
# Aggregate the mean of Sales; collect() brings the result back as a list of rows.
mean_val = df.select(mean(df["Sales"])).collect()

In [12]:
# First row, first column of the aggregate result: the mean itself.
mean_sales = mean_val[0][0]

In [13]:
# Replace the null Sales values with the mean extracted in the previous cell.
df.na.fill(mean_sales, subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| NULL|400.5|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [14]:
# The same mean fill as the previous three cells, as a single expression.
# The aggregate yields exactly one row, so first()[0] is the mean of Sales.
df.na.fill(
    value=df.select(mean("Sales")).first()[0],
    subset=["Sales"],
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| NULL|400.5|
|emp3| NULL|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

