In [50]:
#Handling missing values
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('professions.csv')
    .getOrCreate()
)

In [51]:
# Read the CSV with its first line as the header and let Spark infer column types.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('professions.csv', inferSchema=True)
)

In [52]:
# Preview the raw data: Age and Profession contain nulls, which the cells below handle.
df_pyspark.show()

+---------+----+--------------------+
|     Name| Age|          Profession|
+---------+----+--------------------+
|    Ilhan|  29|Principal Softwar...|
|    Shoie|  35|          CEO - BASF|
|  Nishadh|  41|   Head of analytics|
|   Rifadh|  41|         CEO - ONINE|
|    Azhar|  34|   Surgeon - Abdomen|
|  Arshadh|  42|         CEO - EXXON|
|  Arshiya|  25|   Software Engineer|
|Chandhini|  30|  Dance School owner|
|    Suraj|null|    Marine Architect|
|Arundathi|null|                null|
|   Aparna|null|                null|
|   Aadith|null|          Footballer|
|   Kaveri|  29|                null|
+---------+----+--------------------+



In [53]:
# Drop every row that has at least one null value in any column.
rows_without_nulls = df_pyspark.dropna(how='any')
rows_without_nulls.show()

+---------+---+--------------------+
|     Name|Age|          Profession|
+---------+---+--------------------+
|    Ilhan| 29|Principal Softwar...|
|    Shoie| 35|          CEO - BASF|
|  Nishadh| 41|   Head of analytics|
|   Rifadh| 41|         CEO - ONINE|
|    Azhar| 34|   Surgeon - Abdomen|
|  Arshadh| 42|         CEO - EXXON|
|  Arshiya| 25|   Software Engineer|
|Chandhini| 30|  Dance School owner|
+---------+---+--------------------+



In [54]:
# Drop only the rows in which every column is null.
# (Name is never null in this data set, so no row is removed here.)
rows_not_entirely_null = df_pyspark.dropna(how='all')
rows_not_entirely_null.show()

+---------+----+--------------------+
|     Name| Age|          Profession|
+---------+----+--------------------+
|    Ilhan|  29|Principal Softwar...|
|    Shoie|  35|          CEO - BASF|
|  Nishadh|  41|   Head of analytics|
|   Rifadh|  41|         CEO - ONINE|
|    Azhar|  34|   Surgeon - Abdomen|
|  Arshadh|  42|         CEO - EXXON|
|  Arshiya|  25|   Software Engineer|
|Chandhini|  30|  Dance School owner|
|    Suraj|null|    Marine Architect|
|Arundathi|null|                null|
|   Aparna|null|                null|
|   Aadith|null|          Footballer|
|   Kaveri|  29|                null|
+---------+----+--------------------+



In [55]:
# Remove every row that contains fewer than two non-null values.
# `how` is deliberately omitted: Spark ignores it whenever `thresh` is given,
# so passing how='any' alongside thresh=2 was misleading.
df_pyspark.na.drop(thresh=2).show()

+---------+----+--------------------+
|     Name| Age|          Profession|
+---------+----+--------------------+
|    Ilhan|  29|Principal Softwar...|
|    Shoie|  35|          CEO - BASF|
|  Nishadh|  41|   Head of analytics|
|   Rifadh|  41|         CEO - ONINE|
|    Azhar|  34|   Surgeon - Abdomen|
|  Arshadh|  42|         CEO - EXXON|
|  Arshiya|  25|   Software Engineer|
|Chandhini|  30|  Dance School owner|
|    Suraj|null|    Marine Architect|
|   Aadith|null|          Footballer|
|   Kaveri|  29|                null|
+---------+----+--------------------+



In [56]:
# Drop the rows whose Age is null; nulls in other columns are tolerated.
rows_with_age = df_pyspark.dropna(how='any', subset=['Age'])
rows_with_age.show()

+---------+---+--------------------+
|     Name|Age|          Profession|
+---------+---+--------------------+
|    Ilhan| 29|Principal Softwar...|
|    Shoie| 35|          CEO - BASF|
|  Nishadh| 41|   Head of analytics|
|   Rifadh| 41|         CEO - ONINE|
|    Azhar| 34|   Surgeon - Abdomen|
|  Arshadh| 42|         CEO - EXXON|
|  Arshiya| 25|   Software Engineer|
|Chandhini| 30|  Dance School owner|
|   Kaveri| 29|                null|
+---------+---+--------------------+



In [57]:
# Replace missing values.
# na.fill() returns a new DataFrame and never modifies df_pyspark, so the result
# must be bound to a name to be usable. df_pyspark itself is left unchanged for
# the cells that follow.
# Only Profession is filled here: Age is an integer column, and Spark silently
# ignores a string replacement value for non-string columns. To label missing
# ages with text, Age would first have to be cast to string.
df_filled = df_pyspark.na.fill('Check Linkedin', subset=['Profession'])
df_filled


DataFrame[Name: string, Age: int, Profession: string]

In [58]:
# Show df_pyspark again: its nulls are still present, because na.fill returns
# a new DataFrame instead of modifying df_pyspark.
df_pyspark.show()

+---------+----+--------------------+
|     Name| Age|          Profession|
+---------+----+--------------------+
|    Ilhan|  29|Principal Softwar...|
|    Shoie|  35|          CEO - BASF|
|  Nishadh|  41|   Head of analytics|
|   Rifadh|  41|         CEO - ONINE|
|    Azhar|  34|   Surgeon - Abdomen|
|  Arshadh|  42|         CEO - EXXON|
|  Arshiya|  25|   Software Engineer|
|Chandhini|  30|  Dance School owner|
|    Suraj|null|    Marine Architect|
|Arundathi|null|                null|
|   Aparna|null|                null|
|   Aadith|null|          Footballer|
|   Kaveri|  29|                null|
+---------+----+--------------------+



In [59]:
# Check the inferred column types: Age is an integer column, Name and Profession are strings.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Profession: string (nullable = true)



In [60]:
# Replace null values in the Age column with the median age.
# The imputed values are written to a separate Age_imputed column; Age itself
# is only overwritten in a later cell.

from pyspark.ml.feature import Imputer
imputer = Imputer(
    strategy="median",
    inputCols=["Age"],
    outputCols=["Age_imputed"],
)

In [63]:
# Fit the imputer on the data, then apply it to add the imputed output column.
imputer_model = imputer.fit(df_pyspark)
df_pyspark = imputer_model.transform(df_pyspark)

In [65]:
# Overwrite Age with the values from the imputed column.
imputed_age = df_pyspark["Age_imputed"]
df_pyspark = df_pyspark.withColumn("Age", imputed_age)

In [68]:
# Age now carries the imputed values, so the helper column can be removed.
helper_column = "Age_imputed"
df_pyspark = df_pyspark.drop(helper_column)

In [69]:
# Final check: the Age nulls have been replaced by the imputed value, while
# the Profession nulls are still present.
df_pyspark.show()

+---------+---+--------------------+
|     Name|Age|          Profession|
+---------+---+--------------------+
|    Ilhan| 29|Principal Softwar...|
|    Shoie| 35|          CEO - BASF|
|  Nishadh| 41|   Head of analytics|
|   Rifadh| 41|         CEO - ONINE|
|    Azhar| 34|   Surgeon - Abdomen|
|  Arshadh| 42|         CEO - EXXON|
|  Arshiya| 25|   Software Engineer|
|Chandhini| 30|  Dance School owner|
|    Suraj| 34|    Marine Architect|
|Arundathi| 34|                null|
|   Aparna| 34|                null|
|   Aadith| 34|          Footballer|
|   Kaveri| 29|                null|
+---------+---+--------------------+

