In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [3]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [4]:
# Read the employee CSV: the first line is the header and column types are inferred.
# Ending the cell on the DataFrame itself displays its schema, not its rows.
spark.read.csv(
    'employee-details.csv',
    header=True,
    inferSchema=True,
)

DataFrame[Name: string, Age: int, Salary: int, Experience: int]

In [5]:
# Read the employee CSV and print its rows with show().
spark.read.csv(
    'employee-details.csv',
    header=True,
    inferSchema=True,
).show()

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|  NULL|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|  NULL|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [6]:
# Keep the loaded DataFrame in a variable so the following cells can reuse it.
df_pyspark = spark.read.csv(
    'employee-details.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Drop a column. drop() returns a new DataFrame; the result is not assigned here,
# so df_pyspark itself keeps the 'Name' column.
df_pyspark.drop('Name').show()

+----+------+----------+
| Age|Salary|Experience|
+----+------+----------+
|  25| 50000|         5|
|NULL| 60000|         3|
|  30|  NULL|         7|
|  28| 55000|      NULL|
|  35| 70000|        10|
|NULL| 45000|         2|
|  40|  NULL|        12|
|  22| 40000|         1|
|  45| 80000|      NULL|
+----+------+----------+



In [13]:
# Show df_pyspark itself: it still has every column, including 'Name'.
df_pyspark.show()

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|  NULL|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|  NULL|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



### Dropping rows that contain missing/null values in any column

In [12]:
# With no arguments, na.drop() removes every row that has a null in any column.
df_pyspark.na.drop().show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|   Oliver| 35| 70000|        10|
|Charlotte| 22| 40000|         1|
+---------+---+------+----------+



In [14]:
## How to drop rows selectively, rather than every row that contains a null

In [15]:
## how='any' (the default) vs. how='all'


# how='all' drops a row only when every column in that row is null.
df_pyspark.na.drop(how='all').show()

## No row here is null in all of its columns, so nothing is dropped.

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|  NULL|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|  NULL|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [16]:
# how='any' drops a row as soon as any one of its columns is null.
df_pyspark.na.drop(how='any').show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|   Oliver| 35| 70000|        10|
|Charlotte| 22| 40000|         1|
+---------+---+------+----------+



In [17]:
## thresh: the minimum number of non-null values a row needs in order to be kept

# `how` is not passed because it is ignored whenever `thresh` is given.
df_pyspark.na.drop(thresh=2).show()

## For each row, count the values that are not null:
##   - fewer than `thresh` non-null values -> the whole row is dropped
##   - `thresh` or more non-null values    -> the row is kept


# Every row here has at least 2 non-null values, so none of them are dropped.

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|  NULL|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|  NULL|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [20]:
# `how` is not passed because it is ignored whenever `thresh` is given.
df_pyspark.na.drop(thresh=3).show()

# thresh=3 drops a row only when it has 2 or fewer non-null values.
## Every row here has 3 or more non-null values, so none of them are dropped.

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|  NULL|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|  NULL|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [22]:
# thresh=4 equals the number of columns, so only rows with no nulls at all are kept.
# `how` is not passed because it is ignored whenever `thresh` is given.
df_pyspark.na.drop(thresh=4).show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|   Oliver| 35| 70000|        10|
|Charlotte| 22| 40000|         1|
+---------+---+------+----------+



In [23]:
## Subset -- drop only the rows that have a null in specific columns

In [24]:
# subset limits the null check to the listed columns.
df_pyspark.na.drop(how='any', subset=['Salary']).show()

# Rows whose Salary is null are dropped; the remaining rows are shown,
# even when they have nulls in other columns.

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [25]:
## Filling the missing values

In [30]:
# df_pyspark.na.fill("missing values").show()
#   ---> fills nothing here: every null is in an integer column,
#        and a string fill value is not applied to integer columns.

# An integer fill value works because it matches the integer columns that hold the nulls.
df_pyspark.na.fill(0).show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|     Emma|  0| 60000|         3|
|  Michael| 30|     0|         7|
|   Sophia| 28| 55000|         0|
|   Oliver| 35| 70000|        10|
|      Ava|  0| 45000|         2|
|  William| 40|     0|        12|
|Charlotte| 22| 40000|         1|
|    David| 45| 80000|         0|
+---------+---+------+----------+



In [33]:
# Fill nulls in one specific column only; the column name is passed as `subset`.
df_pyspark.na.fill(0, subset='Salary').show()

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|     0|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|     0|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [37]:
# The columns to fill can also be given as a list, here a one-element list.
df_pyspark.na.fill(0, subset=['Age']).show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|     Emma|  0| 60000|         3|
|  Michael| 30|  NULL|         7|
|   Sophia| 28| 55000|      NULL|
|   Oliver| 35| 70000|        10|
|      Ava|  0| 45000|         2|
|  William| 40|  NULL|        12|
|Charlotte| 22| 40000|         1|
|    David| 45| 80000|      NULL|
+---------+---+------+----------+



In [38]:
## Fill nulls in several columns at once

df_pyspark.na.fill(0, subset=['Age', 'Salary']).show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|     Emma|  0| 60000|         3|
|  Michael| 30|     0|         7|
|   Sophia| 28| 55000|      NULL|
|   Oliver| 35| 70000|        10|
|      Ava|  0| 45000|         2|
|  William| 40|     0|        12|
|Charlotte| 22| 40000|         1|
|    David| 45| 80000|      NULL|
+---------+---+------+----------+



In [40]:
# Another way: fillna() with a dict that maps each column to its own fill value

df_pyspark.fillna({
    'Name': 'Missing values',
    'Age': 0,
    'Salary': 0,
    'Experience': 0,
}).show()

+---------+---+------+----------+
|     Name|Age|Salary|Experience|
+---------+---+------+----------+
|     John| 25| 50000|         5|
|     Emma|  0| 60000|         3|
|  Michael| 30|     0|         7|
|   Sophia| 28| 55000|         0|
|   Oliver| 35| 70000|        10|
|      Ava|  0| 45000|         2|
|  William| 40|     0|        12|
|Charlotte| 22| 40000|         1|
|    David| 45| 80000|         0|
+---------+---+------+----------+



In [39]:
# df_pyspark still contains its nulls: none of the fill results were assigned back to it.
df_pyspark.show()

+---------+----+------+----------+
|     Name| Age|Salary|Experience|
+---------+----+------+----------+
|     John|  25| 50000|         5|
|     Emma|NULL| 60000|         3|
|  Michael|  30|  NULL|         7|
|   Sophia|  28| 55000|      NULL|
|   Oliver|  35| 70000|        10|
|      Ava|NULL| 45000|         2|
|  William|  40|  NULL|        12|
|Charlotte|  22| 40000|         1|
|    David|  45| 80000|      NULL|
+---------+----+------+----------+



In [41]:
from pyspark.ml.feature import Imputer

# Columns whose nulls will be imputed
imputer_cols = ['Age', 'Salary', 'Experience']

# Build an Imputer that writes each result to a new '<column>_imputed' column
# and uses the "mean" strategy to replace nulls.
imputer = Imputer(
    strategy="mean",
    inputCols=imputer_cols,
    outputCols=[f'{name}_imputed' for name in imputer_cols],
)


In [42]:
# Fit the imputer on the data, then transform the same data and show the result
imputer.fit(df_pyspark).transform(df_pyspark).show()


# Nulls in the chosen columns are replaced by that column's mean in the new *_imputed columns;
# the original columns are left as they were.

+---------+----+------+----------+-----------+--------------+------------------+
|     Name| Age|Salary|Experience|Age_imputed|Salary_imputed|Experience_imputed|
+---------+----+------+----------+-----------+--------------+------------------+
|     John|  25| 50000|         5|         25|         50000|                 5|
|     Emma|NULL| 60000|         3|         32|         60000|                 3|
|  Michael|  30|  NULL|         7|         30|         57142|                 7|
|   Sophia|  28| 55000|      NULL|         28|         55000|                 5|
|   Oliver|  35| 70000|        10|         35|         70000|                10|
|      Ava|NULL| 45000|         2|         32|         45000|                 2|
|  William|  40|  NULL|        12|         40|         57142|                12|
|Charlotte|  22| 40000|         1|         22|         40000|                 1|
|    David|  45| 80000|      NULL|         45|         80000|                 5|
+---------+----+------+----------+-----------+--------------+------------------+

In [43]:
## Filling nulls with the median

from pyspark.ml.feature import Imputer

# Columns whose nulls will be imputed
imputer_cols = ['Age', 'Salary', 'Experience']

# Same Imputer setup as for the mean; only the strategy is changed to "median".
imputer = Imputer(
    strategy="median",
    inputCols=imputer_cols,
    outputCols=[f'{name}_imputed' for name in imputer_cols],
)

In [44]:
# Fit the imputer on the data, then transform the same data and show the result
imputer.fit(df_pyspark).transform(df_pyspark).show()

# Nulls in the chosen columns are replaced by that column's median in the new *_imputed columns

+---------+----+------+----------+-----------+--------------+------------------+
|     Name| Age|Salary|Experience|Age_imputed|Salary_imputed|Experience_imputed|
+---------+----+------+----------+-----------+--------------+------------------+
|     John|  25| 50000|         5|         25|         50000|                 5|
|     Emma|NULL| 60000|         3|         30|         60000|                 3|
|  Michael|  30|  NULL|         7|         30|         55000|                 7|
|   Sophia|  28| 55000|      NULL|         28|         55000|                 5|
|   Oliver|  35| 70000|        10|         35|         70000|                10|
|      Ava|NULL| 45000|         2|         30|         45000|                 2|
|  William|  40|  NULL|        12|         40|         55000|                12|
|Charlotte|  22| 40000|         1|         22|         40000|                 1|
|    David|  45| 80000|      NULL|         45|         80000|                 5|
+---------+----+------+----------+-----------+--------------+------------------+