In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or fetch) the SparkSession used by every cell below,
# registered under the application name 'Pyspark'.
spark = (
    SparkSession.builder
    .appName('Pyspark')
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook renders its summary.
spark

### Create PySpark Dataframe

In [43]:
# Load test2.csv into a DataFrame: header=True takes the column names from
# the first line, inferSchema=True lets Spark infer each column's type.
df_spark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [44]:
# Render the DataFrame as a text table to inspect the loaded rows.
df_spark.show()

+------+----+----------+------+
|  Name| age|Experience|Salary|
+------+----+----------+------+
|   Sam|  31|        10| 30000|
| Carry|  30|         8| 25000|
| Jimmy|  29|         4| 20000|
|  Paul|  24|         3| 20000|
|Harsha|  21|         1| 15000|
|  Andy|  23|         2| 18000|
|Alisha|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [45]:
# Print each column's name, inferred type and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



### Drop the column

In [46]:
# drop() returns a new DataFrame without 'age'; df_spark itself is not
# modified (the later cells still show the age column).
df_spark.drop('age').show()

+------+----------+------+
|  Name|Experience|Salary|
+------+----------+------+
|   Sam|        10| 30000|
| Carry|         8| 25000|
| Jimmy|         4| 20000|
|  Paul|         3| 20000|
|Harsha|         1| 15000|
|  Andy|         2| 18000|
|Alisha|      null| 40000|
|  null|        10| 38000|
|  null|      null|  null|
+------+----------+------+



### Dropping rows with null values

In [48]:
# na.drop(how='any') --> drops every row that contains at least one null value.

df_spark.na.drop(how='any').show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|   Sam| 31|        10| 30000|
| Carry| 30|         8| 25000|
| Jimmy| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Harsha| 21|         1| 15000|
|  Andy| 23|         2| 18000|
+------+---+----------+------+



In [49]:
# na.drop(how='all') --> drops a row only when ALL of its values are null.
# No row in this data is entirely null, so every row is kept.

df_spark.na.drop(how='all').show()

+------+----+----------+------+
|  Name| age|Experience|Salary|
+------+----+----------+------+
|   Sam|  31|        10| 30000|
| Carry|  30|         8| 25000|
| Jimmy|  29|         4| 20000|
|  Paul|  24|         3| 20000|
|Harsha|  21|         1| 15000|
|  Andy|  23|         2| 18000|
|Alisha|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [50]:
# thresh=2 keeps only the rows that have at least 2 non-null values.
# `how` is not passed: when thresh is given it overrides `how`, so
# spelling out how='any' here would be misleading.

df_spark.na.drop(thresh=2).show()

+------+----+----------+------+
|  Name| age|Experience|Salary|
+------+----+----------+------+
|   Sam|  31|        10| 30000|
| Carry|  30|         8| 25000|
| Jimmy|  29|         4| 20000|
|  Paul|  24|         3| 20000|
|Harsha|  21|         1| 15000|
|  Andy|  23|         2| 18000|
|Alisha|null|      null| 40000|
|  null|  34|        10| 38000|
+------+----+----------+------+



- With `thresh=2`, a row is kept only if it has at least 2 non-null values; rows with fewer than 2 non-null values are dropped (here, the last row, where only `age` is set).

In [52]:
# subset restricts the null check to the listed columns: only rows whose
# `age` is null are dropped; nulls in other columns are ignored.
# The name is written exactly as it appears in the schema so the call does
# not rely on case-insensitive column resolution.

df_spark.na.drop(how="any", subset=['age']).show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|   Sam| 31|        10| 30000|
| Carry| 30|         8| 25000|
| Jimmy| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Harsha| 21|         1| 15000|
|  Andy| 23|         2| 18000|
|  null| 34|        10| 38000|
|  null| 36|      null|  null|
+------+---+----------+------+



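- By using `subset` in the drop function, only the listed columns are checked for nulls: rows with a null in one of those columns are dropped, while nulls in the other columns are ignored.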

### Filling the missing values

In [55]:
# Replace nulls in the Name column with the placeholder 'missing';
# subset limits the fill to that column, so other nulls are left as-is.
df_spark.na.fill(
    value='missing',
    subset=['Name'],
).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|    Sam|  31|        10| 30000|
|  Carry|  30|         8| 25000|
|  Jimmy|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|   Andy|  23|         2| 18000|
| Alisha|null|      null| 40000|
|missing|  34|        10| 38000|
|missing|  36|      null|  null|
+-------+----+----------+------+



In [64]:
# Per-column fill values: the dict maps each column to its own replacement.
# Columns that are not listed (age, Salary) keep their nulls.
df_spark.na.fill(
    {
        'Name': 'Missing',
        'Experience': 5,
    }
).show()

+-------+----+----------+------+
|   Name| age|Experience|Salary|
+-------+----+----------+------+
|    Sam|  31|        10| 30000|
|  Carry|  30|         8| 25000|
|  Jimmy|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Harsha|  21|         1| 15000|
|   Andy|  23|         2| 18000|
| Alisha|null|         5| 40000|
|Missing|  34|        10| 38000|
|Missing|  36|         5|  null|
+-------+----+----------+------+



### Fill missing values using PySpark ML library

In [66]:
from pyspark.ml.feature import Imputer

In [70]:
# Impute missing values in the numeric columns with each column's mean.
# The column list is defined once so that inputCols and outputCols cannot
# drift apart; each output column is named "<input>_imputed".

numeric_cols = ['age', 'Experience', 'Salary']

imputer = Imputer(
    strategy='mean',
    inputCols=numeric_cols,
    outputCols=["{}_imputed".format(col) for col in numeric_cols],
)


In [71]:
# Fit the imputer on df_spark, then apply it to the same frame.
# The *_imputed columns are appended next to the originals, which keep
# their nulls.
(
    imputer
    .fit(df_spark)
    .transform(df_spark)
    .show()
)

+------+----+----------+------+-----------+------------------+--------------+
|  Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
|   Sam|  31|        10| 30000|         31|                10|         30000|
| Carry|  30|         8| 25000|         30|                 8|         25000|
| Jimmy|  29|         4| 20000|         29|                 4|         20000|
|  Paul|  24|         3| 20000|         24|                 3|         20000|
|Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Andy|  23|         2| 18000|         23|                 2|         18000|
|Alisha|null|      null| 40000|         28|                 5|         40000|
|  null|  34|        10| 38000|         34|                10|         38000|
|  null|  36|      null|  null|         36|                 5|         25750|
+------+----+----------+------+-----------+------------------+--