## ***In this tutorial we will learn to handle missing values by using the following methods:***



* Dropping Columns
* Dropping Rows
* Filling Missing Values
* Handling Missing Values by Mean, Median and Mode

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 55.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=f27aa2d631691f998439f95fe31e12bce9409ca457bb74b12fbc0b6eb6110d40
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the current Spark session, or create one registered under the app name 'missing'.
spark = (
    SparkSession.builder
    .appName('missing')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [29]:
# Load the sample data set used throughout the notebook.
# NOTE(review): the DataFrame is bound to the name `pyspark`, which is also the
# package name; it is kept here because every later cell refers to it.
pyspark = spark.read.csv(
    'missing_values.csv',
    header=True,       # first line of the file holds the column names
    inferSchema=True,  # ask Spark to infer column types from the data
)
pyspark.show()

+-------+----+----------+-------+
|   name| age|experience| salary|
+-------+----+----------+-------+
| Uttam |  21|        10|1000000|
| Karan |  23|         4| 400000|
|Krishna|  28|        50|5000000|
|  Arjun|null|         9| 900000|
|  Bhim |null|      null| 700000|
+-------+----+----------+-------+



# Dropping Columns

In [8]:
# Show the data without the 'age' column. `drop` returns a new DataFrame, and the
# result is deliberately NOT assigned back to `pyspark`: the following cells
# (row dropping, filling, imputation) still need the 'age' column to be present.
pyspark.drop('age').show()

+-------+----------+-------+
|   name|experience| salary|
+-------+----------+-------+
| Uttam |        10|1000000|
| Karan |         4| 400000|
|Krishna|        50|5000000|
|  Arjun|         9| 900000|
|  Bhim |      null| 700000|
+-------+----------+-------+



# Dropping Rows


### ***Dropping Parameters:***

* how : 'any' or 'all'.

If 'any', drop a row if it contains any nulls. (default)

If 'all', drop a row only if all its values are null.


* thresh: int, optional, default None

If specified, drop rows that have fewer than `thresh` non-null values.

When specified, this overrides the `how` parameter.

* subset : str, tuple or list, optional
    
optional list of column names to consider.

In [22]:
# how='any': discard every row that contains a null in at least one column.
(
    pyspark.na
    .drop(how='any')
    .show()
)

+-------+---+----------+-------+
|   name|age|experience| salary|
+-------+---+----------+-------+
| Uttam | 21|        10|1000000|
| Karan | 23|         4| 400000|
|Krishna| 28|        50|5000000|
+-------+---+----------+-------+



In [24]:
# thresh=3: keep only the rows that have at least three non-null values.
(
    pyspark.na
    .drop(thresh=3)
    .show()
)

+-------+----+----------+-------+
|   name| age|experience| salary|
+-------+----+----------+-------+
| Uttam |  21|        10|1000000|
| Karan |  23|         4| 400000|
|Krishna|  28|        50|5000000|
|  Arjun|null|         9| 900000|
+-------+----+----------+-------+



In [26]:
# subset=['age']: only nulls in the 'age' column cause a row to be dropped.
(
    pyspark.na
    .drop(subset=['age'])
    .show()
)

+-------+---+----------+-------+
|   name|age|experience| salary|
+-------+---+----------+-------+
| Uttam | 21|        10|1000000|
| Karan | 23|         4| 400000|
|Krishna| 28|        50|5000000|
+-------+---+----------+-------+



# Filling Missing Values

In [33]:
# Fill nulls with 100 without restricting the columns. In this data set that
# affects 'age' and 'experience'; 'name' and 'salary' contain no nulls.
(
    pyspark.na
    .fill(value=100)
    .show()
)

+-------+---+----------+-------+
|   name|age|experience| salary|
+-------+---+----------+-------+
| Uttam | 21|        10|1000000|
| Karan | 23|         4| 400000|
|Krishna| 28|        50|5000000|
|  Arjun|100|         9| 900000|
|  Bhim |100|       100| 700000|
+-------+---+----------+-------+



In [34]:
# Fill nulls with 100 in the 'age' column only; nulls elsewhere are left untouched.
(
    pyspark.na
    .fill(value=100, subset='age')
    .show()
)

+-------+---+----------+-------+
|   name|age|experience| salary|
+-------+---+----------+-------+
| Uttam | 21|        10|1000000|
| Karan | 23|         4| 400000|
|Krishna| 28|        50|5000000|
|  Arjun|100|         9| 900000|
|  Bhim |100|      null| 700000|
+-------+---+----------+-------+



In [35]:
# Fill nulls with 100 in each of the listed columns ('age' and 'experience').
(
    pyspark.na
    .fill(value=100, subset=['age', 'experience'])
    .show()
)

+-------+---+----------+-------+
|   name|age|experience| salary|
+-------+---+----------+-------+
| Uttam | 21|        10|1000000|
| Karan | 23|         4| 400000|
|Krishna| 28|        50|5000000|
|  Arjun|100|         9| 900000|
|  Bhim |100|       100| 700000|
+-------+---+----------+-------+



# Handling Missing Values by Mean, Median and Mode

In [40]:
from pyspark.ml.feature import Imputer

# Mean imputation for 'age' and 'experience'. Results are written to new
# '<column>_imputed' columns, so the original columns stay as they are.
imputer_mean = Imputer(
    strategy='mean',
    inputCols=['age', 'experience'],
    outputCols=[f"{name}_imputed" for name in ('age', 'experience')],
)

In [41]:
# Fit the mean imputer on the data, then add the *_imputed columns and display them.
(
    imputer_mean
    .fit(pyspark)
    .transform(pyspark)
    .show()
)

+-------+----+----------+-------+-----------+------------------+
|   name| age|experience| salary|age_imputed|experience_imputed|
+-------+----+----------+-------+-----------+------------------+
| Uttam |  21|        10|1000000|         21|                10|
| Karan |  23|         4| 400000|         23|                 4|
|Krishna|  28|        50|5000000|         28|                50|
|  Arjun|null|         9| 900000|         24|                 9|
|  Bhim |null|      null| 700000|         24|                18|
+-------+----+----------+-------+-----------+------------------+



In [42]:
# Median imputation for 'age' and 'experience'. Results are written to new
# '<column>_imputed' columns, so the original columns stay as they are.
imputer_median = Imputer(
    strategy='median',
    inputCols=['age', 'experience'],
    outputCols=[f"{name}_imputed" for name in ('age', 'experience')],
)

In [43]:
# Fit the median imputer on the data, then add the *_imputed columns and display them.
(
    imputer_median
    .fit(pyspark)
    .transform(pyspark)
    .show()
)

+-------+----+----------+-------+-----------+------------------+
|   name| age|experience| salary|age_imputed|experience_imputed|
+-------+----+----------+-------+-----------+------------------+
| Uttam |  21|        10|1000000|         21|                10|
| Karan |  23|         4| 400000|         23|                 4|
|Krishna|  28|        50|5000000|         28|                50|
|  Arjun|null|         9| 900000|         23|                 9|
|  Bhim |null|      null| 700000|         23|                 9|
+-------+----+----------+-------+-----------+------------------+



In [45]:
# Mode (most frequent value) imputation for 'age' and 'experience'. Results are
# written to new '<column>_imputed' columns, so the original columns stay as they are.
imputer_mode = Imputer(
    strategy='mode',
    inputCols=['age', 'experience'],
    outputCols=[f"{name}_imputed" for name in ('age', 'experience')],
)

In [46]:
# Fit the mode imputer on the data, then add the *_imputed columns and display them.
(
    imputer_mode
    .fit(pyspark)
    .transform(pyspark)
    .show()
)

+-------+----+----------+-------+-----------+------------------+
|   name| age|experience| salary|age_imputed|experience_imputed|
+-------+----+----------+-------+-----------+------------------+
| Uttam |  21|        10|1000000|         21|                10|
| Karan |  23|         4| 400000|         23|                 4|
|Krishna|  28|        50|5000000|         28|                50|
|  Arjun|null|         9| 900000|         21|                 9|
|  Bhim |null|      null| 700000|         21|                 4|
+-------+----+----------+-------+-----------+------------------+

