## Handling Missing Values

In [1]:
from pyspark.sql import SparkSession


In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session.
spark

In [39]:
# Location of the employee CSV. Kept in one named constant so the path can be
# changed in a single place when the notebook is run on another machine.
EMP_CSV_PATH = r"C:\Users\bhupe\OneDrive\Desktop\Portfolio Project\Python Project\Emp_pyspark.csv"

# header=True takes the column names from the first row of the file;
# inferSchema=True lets Spark pick column types instead of reading every column as string.
df_pysp = spark.read.csv(EMP_CSV_PATH, header=True, inferSchema=True)
df_pysp.show()

+---------+----+-----------+------+
|     Name| Age|Experience |Salary|
+---------+----+-----------+------+
|    sohag|  25|          2| 25000|
|    argha|  26|          1| 30000|
|    majhi|  25|          3| 35000|
|   pratik|  23|       null| 27000|
|souradeep|null|       null| 28000|
|     null|  28|          5| 26000|
|     null|  27|          2|  null|
+---------+----+-----------+------+



### Drop Missing Values

In [4]:
# Drop every row that contains at least one null (how="any" is the default).
# The result is a new DataFrame; df_pysp itself is not modified.
df_pysp.na.drop(how="any").show()

+-----+----+-----------+------+
| Name| Age|Experience |Salary|
+-----+----+-----------+------+
|sohag|  25|          2| 25000|
|argha|  26|          1| 30000|
|majhi|  25|          3| 35000|
+-----+----+-----------+------+



In [5]:
# Evaluating the DataFrame itself displays only its column names and types, not its rows.
df_pysp

DataFrame[Name: string,  Age: int, Experience : int, Salary: int]

In [7]:
# how="all" removes a row only when every one of its columns is null.
# No row in this data is entirely null, so all rows are kept.
df_pysp.na.drop(how="all").show()

+---------+----+-----------+------+
|     Name| Age|Experience |Salary|
+---------+----+-----------+------+
|    sohag|  25|          2| 25000|
|    argha|  26|          1| 30000|
|    majhi|  25|          3| 35000|
|   pratik|  23|       null| 27000|
|souradeep|null|       null| 28000|
|     null|  28|          5| 26000|
|     null|  27|          2|  null|
+---------+----+-----------+------+



In [17]:
# Restrict the null check to the Salary column: only rows whose Salary is null are dropped;
# nulls in the other columns are left alone.
df_pysp.na.drop(how="any", subset=["Salary"]).show()

+---------+----+-----------+------+
|     Name| Age|Experience |Salary|
+---------+----+-----------+------+
|    sohag|  25|          2| 25000|
|    argha|  26|          1| 30000|
|    majhi|  25|          3| 35000|
|   pratik|  23|       null| 27000|
|souradeep|null|       null| 28000|
|     null|  28|          5| 26000|
+---------+----+-----------+------+



In [21]:
# Display df_pysp again: the na.drop() calls above returned new DataFrames,
# so the original still contains all of its rows, nulls included.
df_pysp.show()

+---------+----+-----------+------+
|     Name| Age|Experience |Salary|
+---------+----+-----------+------+
|    sohag|  25|          2| 25000|
|    argha|  26|          1| 30000|
|    majhi|  25|          3| 35000|
|   pratik|  23|       null| 27000|
|souradeep|null|       null| 28000|
|     null|  28|          5| 26000|
|     null|  27|          2|  null|
+---------+----+-----------+------+



In [30]:
# Replace nulls in the Name and " Age" columns with the placeholder "unknown".
# A string fill value is applied to string columns only; a non-string column listed
# in `subset` is silently ignored. " Age" is therefore cast to string first, so the
# placeholder is filled in whether the column was loaded as int or as string.
# (The column name really does start with a space.)
(
    df_pysp
    .withColumn(" Age", df_pysp[" Age"].cast("string"))
    .na.fill("unknown", subset=["Name", " Age"])
    .show()
)

+---------+-------+-----------+------+
|     Name|    Age|Experience |Salary|
+---------+-------+-----------+------+
|    sohag|     25|          2| 25000|
|    argha|     26|          1| 30000|
|    majhi|     25|          3| 35000|
|   pratik|     23|       null| 27000|
|souradeep|unknown|       null| 28000|
|  unknown|     28|          5| 26000|
|  unknown|     27|          2|  null|
+---------+-------+-----------+------+



In [26]:
# Print the column names, types and nullability as a tree.
# NOTE(review): the saved output below lists every column as string, which disagrees
# with the int types shown earlier for the inferSchema=True load. It was presumably
# captured from an earlier run without schema inference; re-run the notebook to refresh it.
df_pysp.printSchema()

root
 |-- Name: string (nullable = true)
 |--  Age: string (nullable = true)
 |-- Experience : string (nullable = true)
 |-- Salary: string (nullable = true)



In [49]:
# Impute missing values with a column statistic such as the mean or the median.
from pyspark.ml.feature import Imputer

In [53]:
# Columns to impute. The stray spaces are part of the actual column names in the data.
impute_cols = [" Age", "Experience ", "Salary"]

# Each input column gets a companion "<name>_imputed" output column in which
# nulls are replaced by that column's mean.
imputer = Imputer(
    strategy="mean",
    inputCols=impute_cols,
    outputCols=[col_name + "_imputed" for col_name in impute_cols],
)

In [54]:
# Fit the imputer on df_pysp to learn the fill value for each input column,
# then append the *_imputed columns and display the result.
imputer_model = imputer.fit(df_pysp)
imputer_model.transform(df_pysp).show()

+---------+----+-----------+------+------------+-------------------+--------------+
|     Name| Age|Experience |Salary| Age_imputed|Experience _imputed|Salary_imputed|
+---------+----+-----------+------+------------+-------------------+--------------+
|    sohag|  25|          2| 25000|          25|                  2|         25000|
|    argha|  26|          1| 30000|          26|                  1|         30000|
|    majhi|  25|          3| 35000|          25|                  3|         35000|
|   pratik|  23|       null| 27000|          23|                  2|         27000|
|souradeep|null|       null| 28000|          25|                  2|         28000|
|     null|  28|          5| 26000|          28|                  5|         26000|
|     null|  27|          2|  null|          27|                  2|         28500|
+---------+----+-----------+------+------------+-------------------+--------------+



In [48]:
# List the column names exactly as loaded. Note the leading space in ' Age' and the
# trailing space in 'Experience ': these must be typed verbatim when referencing the columns.
df_pysp.columns

['Name', ' Age', 'Experience ', 'Salary']