In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (getOrCreate returns an existing
# session if there is one, otherwise builds a new one with this app name).
spark = (
    SparkSession.builder
    .appName("PysparkDataCleaning")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [22]:
# Location of the input CSV (raw string so the backslashes are taken literally).
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
csv_path = r"C:\Users\opandey2\Desktop\test1.csv"

# header=True: the first line supplies the column names.
# inferSchema=True: column types are inferred from the data instead of all being strings.
df_pyspark = spark.read.csv(csv_path, header=True, inferSchema=True)

In [27]:
# Print the loaded frame as a text table; n=20 and truncate=True are show()'s
# defaults, spelled out here so the display limits are visible to the reader.
df_pyspark.show(n=20, truncate=True)

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|      Om|  23|         2| 43000|
|   2dwaf|  23|         3| 65888|
|     asf|  32|         3| 60000|
|sudhansu|  36|         7|300000|
|   krish|null|         8|  null|
|    herp|  27|      null| 53576|
|  sumyiy|  24|         3|170000|
|    null|null|      null|  null|
|    null|  31|         4|573547|
|   HIQWQ|null|      null|  null|
+--------+----+----------+------+



In [16]:
# Drop a column: drop() returns a new DataFrame without 'Name'. The result is
# not assigned back to df_pyspark, so df_pyspark keeps all of its columns.
without_name = df_pyspark.drop('Name')
without_name.show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  23|         2| 43000|
|  23|         3| 65888|
|  32|         3| 60000|
|  36|         7|300000|
|null|         8|340594|
|  27|      null| 53576|
|  24|         3|170000|
|  29|      null|  null|
|  31|         4|573547|
+----+----------+------+



In [24]:
# Drop every row that contains at least one null value
# (na.drop() with no arguments uses how="any").
complete_rows = df_pyspark.na.drop()
complete_rows.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|      Om| 23|         2| 43000|
|   2dwaf| 23|         3| 65888|
|     asf| 32|         3| 60000|
|sudhansu| 36|         7|300000|
|  sumyiy| 24|         3|170000|
+--------+---+----------+------+



In [26]:
# Drop only the rows in which every column is null; rows with at least one
# non-null value are kept.
not_entirely_null = df_pyspark.na.drop(how="all")
not_entirely_null.show()

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|      Om|  23|         2| 43000|
|   2dwaf|  23|         3| 65888|
|     asf|  32|         3| 60000|
|sudhansu|  36|         7|300000|
|   krish|null|         8|  null|
|    herp|  27|      null| 53576|
|  sumyiy|  24|         3|170000|
|    null|  31|         4|573547|
|   HIQWQ|null|      null|  null|
+--------+----+----------+------+



In [21]:
# Threshold = 2: keep only rows that have at least 2 non-null values.
# `thresh` overrides `how`, so `how` is omitted rather than passed and silently
# ignored (rows that still contain some nulls are kept, as the output shows).
df_pyspark.na.drop(thresh=2).show()


+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|      Om|  23|         2| 43000|
|   2dwaf|  23|         3| 65888|
|     asf|  32|         3| 60000|
|sudhansu|  36|         7|300000|
|   krish|null|         8|340594|
|    herp|  27|      null| 53576|
|  sumyiy|  24|         3|170000|
|    null|  31|         4|573547|
+--------+----+----------+------+



In [30]:
# Threshold = 3: keep only rows that have at least 3 non-null values.
# `thresh` overrides `how`, so `how` is omitted rather than passed and ignored.
df_pyspark.na.drop(thresh=3).show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|      Om| 23|         2| 43000|
|   2dwaf| 23|         3| 65888|
|     asf| 32|         3| 60000|
|sudhansu| 36|         7|300000|
|    herp| 27|      null| 53576|
|  sumyiy| 24|         3|170000|
|    null| 31|         4|573547|
+--------+---+----------+------+



In [32]:
# Threshold = 4: keep only rows that have at least 4 non-null values; with the
# four columns of this frame that leaves only rows without any null.
# `thresh` overrides `how`, so `how` is omitted rather than passed and ignored.
df_pyspark.na.drop(thresh=4).show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|      Om| 23|         2| 43000|
|   2dwaf| 23|         3| 65888|
|     asf| 32|         3| 60000|
|sudhansu| 36|         7|300000|
|  sumyiy| 24|         3|170000|
+--------+---+----------+------+



In [34]:
## Subset
# `subset` restricts the null check to the listed columns: a row is dropped
# only when 'Experience' is null; nulls in the other columns are ignored.
experience_present = df_pyspark.na.drop(how="any", subset=['Experience'])
experience_present.show()

+--------+----+----------+------+
|    Name| age|Experience|Salary|
+--------+----+----------+------+
|      Om|  23|         2| 43000|
|   2dwaf|  23|         3| 65888|
|     asf|  32|         3| 60000|
|sudhansu|  36|         7|300000|
|   krish|null|         8|  null|
|  sumyiy|  24|         3|170000|
|    null|  31|         4|573547|
+--------+----+----------+------+



In [35]:
## Subset with two columns: a row is dropped when either 'Experience' or 'age'
## is null; nulls in the remaining columns do not matter.
experience_and_age_present = df_pyspark.na.drop(how="any", subset=['Experience', 'age'])
experience_and_age_present.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|      Om| 23|         2| 43000|
|   2dwaf| 23|         3| 65888|
|     asf| 32|         3| 60000|
|sudhansu| 36|         7|300000|
|  sumyiy| 24|         3|170000|
|    null| 31|         4|573547|
+--------+---+----------+------+



In [42]:
## Filling the missing values:
# A string fill value only replaces nulls in string columns (here 'Name');
# the numeric columns keep their nulls, as the output below shows.
names_filled = df_pyspark.na.fill('Missing Values')
names_filled.show()

+--------------+----+----------+------+
|          Name| age|Experience|Salary|
+--------------+----+----------+------+
|            Om|  23|         2| 43000|
|         2dwaf|  23|         3| 65888|
|           asf|  32|         3| 60000|
|      sudhansu|  36|         7|300000|
|         krish|null|         8|  null|
|          herp|  27|      null| 53576|
|        sumyiy|  24|         3|170000|
|Missing Values|null|      null|  null|
|Missing Values|  31|         4|573547|
|         HIQWQ|null|      null|  null|
+--------------+----+----------+------+



In [43]:
# Filling missing values with the Imputer from pyspark.ml.feature

from pyspark.ml.feature import Imputer


In [47]:
# Columns to impute, listed once so inputCols and outputCols cannot drift apart.
impute_cols = ['age', 'Experience', 'Salary']

# Mean imputation: every listed column gets a companion "<name>_imputed" output
# column; the original columns are left as they are.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols]
).setStrategy("mean")


In [48]:
# Fit the imputer on df_pyspark, then use the fitted model to append the
# *_imputed columns to that same frame and display the result.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+--------+----+----------+------+-----------+------------------+--------------+
|    Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|      Om|  23|         2| 43000|         23|                 2|         43000|
|   2dwaf|  23|         3| 65888|         23|                 3|         65888|
|     asf|  32|         3| 60000|         32|                 3|         60000|
|sudhansu|  36|         7|300000|         36|                 7|        300000|
|   krish|null|         8|  null|         28|                 8|        180858|
|    herp|  27|      null| 53576|         27|                 4|         53576|
|  sumyiy|  24|         3|170000|         24|                 3|        170000|
|    null|null|      null|  null|         28|                 4|        180858|
|    null|  31|         4|573547|         31|                 4|        573547|
|   HIQWQ|null|      null|  null|       