# PySpark Handling Missing Values

- Dropping Columns
- Dropping Rows
- Parameters of the row-dropping function (`how`, `thresh`, `subset`)
- Handling missing values with the mean (imputation)

In [1]:
from pyspark.sql import SparkSession
# Get the active Spark session, or start a new one named 'Practical'.
spark = (
    SparkSession.builder
    .appName('Practical')
    .getOrCreate()
)

In [2]:
# Load the CSV: the first line supplies the column names, and Spark infers
# each column's type from the data instead of reading everything as strings.
df_pyspark = spark.read.csv(
    'test3.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded data; the last three rows contain null values.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|  Shubham|  27|         5| 50000|
|   Manish|  25|         1| 25000|
|Suryakant|  58|        34| 45000|
|  Saurabh|  18|         0| 15000|
|    Seeta|  45|        25| 15000|
|   Shweta|  45|        15| 85000|
|     null|null|      null| 40000|
|     null|  36|        10| 38000|
|     null|  34|      null|  null|
+---------+----+----------+------+



In [4]:
# Drop the 'Name' column. drop() returns a new DataFrame, so df_pyspark
# itself still has all four columns afterwards.
(
    df_pyspark
    .drop('Name')
    .show()
)

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  27|         5| 50000|
|  25|         1| 25000|
|  58|        34| 45000|
|  18|         0| 15000|
|  45|        25| 15000|
|  45|        15| 85000|
|null|      null| 40000|
|  36|        10| 38000|
|  34|      null|  null|
+----+----------+------+



In [5]:
# Show df_pyspark again: the earlier drop('Name') did not modify it, so the
# 'Name' column is still present.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|  Shubham|  27|         5| 50000|
|   Manish|  25|         1| 25000|
|Suryakant|  58|        34| 45000|
|  Saurabh|  18|         0| 15000|
|    Seeta|  45|        25| 15000|
|   Shweta|  45|        15| 85000|
|     null|null|      null| 40000|
|     null|  36|        10| 38000|
|     null|  34|      null|  null|
+---------+----+----------+------+



In [6]:
# Drop rows with missing values. Called without arguments, na.drop() removes
# every row that has a null in any column.
(
    df_pyspark
    .na.drop()
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|  Shubham| 27|         5| 50000|
|   Manish| 25|         1| 25000|
|Suryakant| 58|        34| 45000|
|  Saurabh| 18|         0| 15000|
|    Seeta| 45|        25| 15000|
|   Shweta| 45|        15| 85000|
+---------+---+----------+------+



In [7]:
# how='any' drops a row when any of its columns is null (same result as the
# no-argument call); how='all' would drop only rows whose columns are all null.
(
    df_pyspark
    .na.drop(how='any')
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|  Shubham| 27|         5| 50000|
|   Manish| 25|         1| 25000|
|Suryakant| 58|        34| 45000|
|  Saurabh| 18|         0| 15000|
|    Seeta| 45|        25| 15000|
|   Shweta| 45|        15| 85000|
+---------+---+----------+------+



In [16]:
# Threshold: keep only rows that have at least `thresh` non-null values.
# `how` is ignored whenever `thresh` is given, so it is not passed here.
# With 4 columns, thresh=4 keeps only fully populated rows; a lower value
# (e.g. thresh=3) would also keep rows that have a single null.
df_pyspark.na.drop(thresh=4).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|  Shubham| 27|         5| 50000|
|   Manish| 25|         1| 25000|
|Suryakant| 58|        34| 45000|
|  Saurabh| 18|         0| 15000|
|    Seeta| 45|        25| 15000|
|   Shweta| 45|        15| 85000|
+---------+---+----------+------+



In [18]:
# Subset: only look for nulls in the listed columns. Here a row is dropped
# when its 'Name' is null.
(
    df_pyspark
    .na.drop(how='any', subset=["Name"])
    .show()
)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|  Shubham| 27|         5| 50000|
|   Manish| 25|         1| 25000|
|Suryakant| 58|        34| 45000|
|  Saurabh| 18|         0| 15000|
|    Seeta| 45|        25| 15000|
|   Shweta| 45|        15| 85000|
+---------+---+----------+------+



In [19]:
# Fill missing values. A string replacement is applied only to string
# columns, so 'Name' is filled while the numeric columns keep their nulls.
(
    df_pyspark
    .na.fill('Missing values')
    .show()
)

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|       Shubham|  27|         5| 50000|
|        Manish|  25|         1| 25000|
|     Suryakant|  58|        34| 45000|
|       Saurabh|  18|         0| 15000|
|         Seeta|  45|        25| 15000|
|        Shweta|  45|        15| 85000|
|Missing values|null|      null| 40000|
|Missing values|  36|        10| 38000|
|Missing values|  34|      null|  null|
+--------------+----+----------+------+



In [20]:
from pyspark.ml.feature import Imputer

In [21]:
# Numeric columns whose nulls should be replaced. Keeping one list ensures
# the input and output column lists always line up.
impute_cols = ['Age', 'Experience', 'Salary']

# Each input column gets a companion '<column>_impute' output column in which
# nulls are replaced by that column's mean.
imputer = Imputer(
    strategy="mean",
    inputCols=impute_cols,
    outputCols=[f"{c}_impute" for c in impute_cols],
)

In [23]:
# Fit the imputer on df_pyspark (computes the per-column means), then append
# the *_impute columns. The result is only displayed, not stored.
# Imputed values appear as integers in the output, so the Experience mean
# (90 / 7 ~= 12.86) is shown as 12.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+---------+----+----------+------+----------+-----------------+-------------+
|     Name| Age|Experience|Salary|Age_impute|Experience_impute|Salary_impute|
+---------+----+----------+------+----------+-----------------+-------------+
|  Shubham|  27|         5| 50000|        27|                5|        50000|
|   Manish|  25|         1| 25000|        25|                1|        25000|
|Suryakant|  58|        34| 45000|        58|               34|        45000|
|  Saurabh|  18|         0| 15000|        18|                0|        15000|
|    Seeta|  45|        25| 15000|        45|               25|        15000|
|   Shweta|  45|        15| 85000|        45|               15|        85000|
|     null|null|      null| 40000|        36|               12|        40000|
|     null|  36|        10| 38000|        36|               10|        38000|
|     null|  34|      null|  null|        34|               12|        39125|
+---------+----+----------+------+----------+-----------------+-