## PySpark Session-3:

In [1]:
# Create pyspark session
from pyspark.sql import SparkSession

In [24]:
# Reuse the active Spark session if one exists; otherwise start one named "Practice"
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [25]:
# Read Test3.csv: the first row supplies the column names and Spark infers each column's type
df_pyspark = spark.read.csv(
    path="Test3.csv",
    inferSchema=True,
    header=True,
)

In [26]:
# describe() returns a new DataFrame of summary statistics rather than printing them,
# so this cell only echoes that DataFrame's schema (every column is reported as string).
# Chain .show() onto the call to see the statistics themselves.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Exp: string, Salary: string]

In [27]:
# Print the loaded rows as a text table; missing values are displayed as "null"
df_pyspark.show()

+----+----+----+------+
|Name| Age| Exp|Salary|
+----+----+----+------+
|   A|  21|   3| 30000|
|   B|  23|   4| 45000|
|   C|  22|   3| 38000|
|   D|  24|   4| 55000|
|   E|null|null| 60000|
|   F|  24|   3| 58000|
|null|  25|null| 62000|
+----+----+----+------+



In [28]:
# Drop every row that contains at least one null value.
# na.drop() returns a new DataFrame and leaves df_pyspark untouched, so the
# result has to be shown (or assigned to a name) to be of any use.
df_pyspark.na.drop().show()

+----+---+---+------+
|Name|Age|Exp|Salary|
+----+---+---+------+
|   A| 21|  3| 30000|
|   B| 23|  4| 45000|
|   C| 22|  3| 38000|
|   D| 24|  4| 55000|
|   F| 24|  3| 58000|
+----+---+---+------+



In [29]:
# The `how` parameter accepts two values:
#   "any" - drop a row if it contains at least one null value (the default)
#   "all" - drop a row only if every value in it is null

# No row in this dataset is entirely null, so how="all" keeps every row.
df_pyspark.na.drop(how="all").show()

# Row E and the row with the missing Name both contain nulls, so how="any" removes them.
df_pyspark.na.drop(how="any").show()

+----+----+----+------+
|Name| Age| Exp|Salary|
+----+----+----+------+
|   A|  21|   3| 30000|
|   B|  23|   4| 45000|
|   C|  22|   3| 38000|
|   D|  24|   4| 55000|
|   E|null|null| 60000|
|   F|  24|   3| 58000|
|null|  25|null| 62000|
+----+----+----+------+

+----+---+---+------+
|Name|Age|Exp|Salary|
+----+---+---+------+
|   A| 21|  3| 30000|
|   B| 23|  4| 45000|
|   C| 22|  3| 38000|
|   D| 24|  4| 55000|
|   F| 24|  3| 58000|
+----+---+---+------+



In [30]:
# The `thresh` parameter:
# Keep a row only if it has at least `thresh` non-null values; rows with fewer are dropped.
# When `thresh` is given it takes precedence over `how`, so `how` is left out here.
#   thresh=2: every row has at least two non-null values, so nothing is dropped
#   thresh=3: the two rows that have only two non-null values are dropped

df_pyspark.na.drop(thresh=2).show()
df_pyspark.na.drop(thresh=3).show()

+----+----+----+------+
|Name| Age| Exp|Salary|
+----+----+----+------+
|   A|  21|   3| 30000|
|   B|  23|   4| 45000|
|   C|  22|   3| 38000|
|   D|  24|   4| 55000|
|   E|null|null| 60000|
|   F|  24|   3| 58000|
|null|  25|null| 62000|
+----+----+----+------+

+----+---+---+------+
|Name|Age|Exp|Salary|
+----+---+---+------+
|   A| 21|  3| 30000|
|   B| 23|  4| 45000|
|   C| 22|  3| 38000|
|   D| 24|  4| 55000|
|   F| 24|  3| 58000|
+----+---+---+------+



In [31]:
# The `subset` parameter:
# Restricts the null check to the listed columns. A row is dropped only if it
# has a null in one of those columns; nulls in other columns are ignored.

# Only a null in the Age column causes a row to be dropped.
df_pyspark.na.drop(how='any', subset=['Age']).show()

# Only a null in the Name column causes a row to be dropped.
df_pyspark.na.drop(how='any', subset=['Name']).show()


+----+---+----+------+
|Name|Age| Exp|Salary|
+----+---+----+------+
|   A| 21|   3| 30000|
|   B| 23|   4| 45000|
|   C| 22|   3| 38000|
|   D| 24|   4| 55000|
|   F| 24|   3| 58000|
|null| 25|null| 62000|
+----+---+----+------+

+----+----+----+------+
|Name| Age| Exp|Salary|
+----+----+----+------+
|   A|  21|   3| 30000|
|   B|  23|   4| 45000|
|   C|  22|   3| 38000|
|   D|  24|   4| 55000|
|   E|null|null| 60000|
|   F|  24|   3| 58000|
+----+----+----+------+



In [33]:
# Filling null/missing values
# na.fill() only touches columns whose type matches the fill value. With a string
# value, only string columns are filled; numeric columns keep their nulls.

# Name is the only column filled here; the nulls in Age and Exp are left as they are.
df_pyspark.na.fill(value='Val_Miss').show()

# Nothing changes here: Age is not a string column, so the string fill value is ignored for it.
# To fill Age, pass a numeric fill value instead.
df_pyspark.na.fill(value='Val_Miss',subset='Age').show()

+--------+----+----+------+
|    Name| Age| Exp|Salary|
+--------+----+----+------+
|       A|  21|   3| 30000|
|       B|  23|   4| 45000|
|       C|  22|   3| 38000|
|       D|  24|   4| 55000|
|       E|null|null| 60000|
|       F|  24|   3| 58000|
|Val_Miss|  25|null| 62000|
+--------+----+----+------+

+----+----+----+------+
|Name| Age| Exp|Salary|
+----+----+----+------+
|   A|  21|   3| 30000|
|   B|  23|   4| 45000|
|   C|  22|   3| 38000|
|   D|  24|   4| 55000|
|   E|null|null| 60000|
|   F|  24|   3| 58000|
|null|  25|null| 62000|
+----+----+----+------+



In [36]:
# Imputing the null values:

from pyspark.ml.feature import Imputer

# Columns to impute, listed once so the input and output names cannot drift apart.
impute_cols = ['Age', 'Exp', 'Salary']

# Each column gets a companion "<column>_imputed" column in which nulls are
# replaced using the 'mean' strategy.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
).setStrategy('mean')

In [37]:
# Fit the imputer on the data, then append the *_imputed columns and display the result
imputer_model = imputer.fit(df_pyspark)
df_imputed = imputer_model.transform(df_pyspark)
df_imputed.show()

+----+----+----+------+-----------+-----------+--------------+
|Name| Age| Exp|Salary|Age_imputed|Exp_imputed|Salary_imputed|
+----+----+----+------+-----------+-----------+--------------+
|   A|  21|   3| 30000|         21|          3|         30000|
|   B|  23|   4| 45000|         23|          4|         45000|
|   C|  22|   3| 38000|         22|          3|         38000|
|   D|  24|   4| 55000|         24|          4|         55000|
|   E|null|null| 60000|         23|          3|         60000|
|   F|  24|   3| 58000|         24|          3|         58000|
|null|  25|null| 62000|         25|          3|         62000|
+----+----+----+------+-----------+-----------+--------------+

