In [19]:
!pip install pyspark
import pyspark



In [20]:
from pyspark.sql import SparkSession

In [21]:
# Start a Spark session named 'Dataframe', or reuse the one that is already
# running in this kernel (getOrCreate makes re-running the cell safe).
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [22]:
# Load the sample CSV: the first line supplies the column names and Spark
# infers each column's type from the data instead of reading all as strings.
df_spark = spark.read.csv(
    '/content/drive/MyDrive/Data_Science/Pyspark/test2.csv',
    header=True,
    inferSchema=True,
)

In [23]:
# Preview the freshly loaded rows, including the ones that contain nulls.
df_spark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [24]:
# Inspect the column types that inferSchema=True derived from the CSV.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [25]:
# Add a derived column: each row's Experience plus two years.
# Nulls propagate through the arithmetic, so rows without an Experience
# value get null in the new column as well.

df_spark = df_spark.withColumn(
    'Experience after 2 years',
    df_spark['Experience'] + 2,
)

In [26]:
# Check that the derived column was appended.
df_spark.show()

+---------+----+----------+------+------------------------+
|     Name| age|Experience|Salary|Experience after 2 years|
+---------+----+----------+------+------------------------+
|    Krish|  31|        10| 30000|                      12|
|Sudhanshu|  30|         8| 25000|                      10|
|    Sunny|  29|         4| 20000|                       6|
|     Paul|  24|         3| 20000|                       5|
|   Harsha|  21|         1| 15000|                       3|
|  Shubham|  23|         2| 18000|                       4|
|   Mahesh|null|      null| 40000|                    null|
|     null|  34|        10| 38000|                      12|
|     null|  36|      null|  null|                    null|
+---------+----+----------+------+------------------------+



In [27]:
# Remove the derived column again, leaving the original four columns.
df_spark = df_spark.drop('Experience after 2 years')

In [28]:
# Check that the derived column is gone.
df_spark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [29]:
# Drop every row that has a null in at least one column.
# how='any' is the default; it is spelled out here for clarity.
# The result is only displayed - df_spark itself is not modified.

df_spark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [30]:
# Options of DataFrame.na.drop:
#   how='any'  -> drop a row if it contains any null (the default).
#   how='all'  -> drop a row only if all of its values are null.
#   thresh=N   -> keep only rows that have at least N non-null values.
#                 When thresh is given it overrides `how`, so `how` is
#                 deliberately not passed here.
# With thresh=2 a row survives as long as two of its four columns are
# populated; only the row with a single non-null value is removed.

df_spark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
+---------+----+----------+------+



In [31]:
# Subset: only look for nulls in the listed columns.
# Drop the rows whose `age` is null; nulls in other columns are left alone.
# The column is referenced by its exact schema name (`age`) so the cell does
# not depend on Spark's case-insensitive column resolution.
# NOTE: this reassigns df_spark, so every cell below works on the filtered frame.

df_spark = df_spark.na.drop(how='any', subset=['age'])

In [31]:
# Echo the filtered DataFrame object; call .show() to print its rows.
df_spark

In [37]:
# Filling null values
# A string fill value is only applied to string columns: integer columns
# listed in the subset (Experience, Salary) are silently ignored, so only
# `Name` is listed here. Numeric nulls need a numeric fill value (or the
# Imputer used further down).
# The result is only displayed - df_spark itself is not modified.

df_spark.na.fill('Missing Values', subset=['Name']).show()

+--------------+---+----------+------+
|          Name|age|Experience|Salary|
+--------------+---+----------+------+
|         Krish| 31|        10| 30000|
|     Sudhanshu| 30|         8| 25000|
|         Sunny| 29|         4| 20000|
|          Paul| 24|         3| 20000|
|        Harsha| 21|         1| 15000|
|       Shubham| 23|         2| 18000|
|Missing Values| 34|        10| 38000|
|Missing Values| 36|      null|  null|
+--------------+---+----------+------+



In [38]:
# na.fill returned a new DataFrame; df_spark itself still contains the nulls.
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     null| 34|        10| 38000|
|     null| 36|      null|  null|
+---------+---+----------+------+



In [39]:
from pyspark.ml.feature import Imputer

# Imputation estimator for completing missing values, using the mean, median
# or mode of the column in which the missing values are located.
# All null values in the input columns are treated as missing and are imputed.

# Single source of truth for the columns to impute, so the input and output
# column lists cannot drift out of sync.
impute_cols = ['age', 'Experience', 'Salary']

imputer = Imputer(
    strategy='median',
    inputCols=impute_cols,
    # Results go to new "<column>_imputed" columns; the originals are kept.
    outputCols=["{}_imputed".format(c) for c in impute_cols],
)

In [40]:
# Learn the per-column medians from df_spark, then append the *_imputed
# columns to it. The result is only displayed - df_spark is not modified.

imputer_model = imputer.fit(df_spark)
imputer_model.transform(df_spark).show()

+---------+---+----------+------+-----------+------------------+--------------+
|     Name|age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+---+----------+------+-----------+------------------+--------------+
|    Krish| 31|        10| 30000|         31|                10|         30000|
|Sudhanshu| 30|         8| 25000|         30|                 8|         25000|
|    Sunny| 29|         4| 20000|         29|                 4|         20000|
|     Paul| 24|         3| 20000|         24|                 3|         20000|
|   Harsha| 21|         1| 15000|         21|                 1|         15000|
|  Shubham| 23|         2| 18000|         23|                 2|         18000|
|     null| 34|        10| 38000|         34|                10|         38000|
|     null| 36|      null|  null|         36|                 4|         20000|
+---------+---+----------+------+-----------+------------------+--------------+

