In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [5]:
# Load the CSV: first line supplies the column names, and Spark infers the column types.
df_pyspark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Show the loaded data; NULL marks the missing entries that the following cells work with.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [7]:
## With no arguments, na.drop() removes every row that contains at least one NULL value.
df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [8]:
## how='any' drops a row as soon as any one of its columns is NULL
## (same result as calling na.drop() with no arguments).
df_pyspark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
## how='all' drops a row only when every one of its columns is NULL.
## No row in this data is entirely NULL, so all 9 rows are kept.
df_pyspark.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [10]:
## Threshold
## thresh=2 KEEPS rows that have at least 2 non-NULL values; rows with fewer are dropped.
## When thresh is given it takes precedence over `how`, so how='any' has no effect here
## (the Mahesh row, which has two NULLs, is still present in the output).
df_pyspark.na.drop(how='any', thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [11]:
## Subset
## Only the listed columns are checked: rows whose Experience is NULL are dropped,
## while NULLs in other columns (e.g. Name) do not cause a row to be removed.
df_pyspark.na.drop(how='any',subset=['Experience']).show()

+---------+---+----------+------+
|     Name|age|Experience|salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [12]:
## Filling the NULL values
## Replaces NULLs with the given value. Because the value is a string, only the
## string column (Name) is filled; NULLs in age/Experience/salary stay as they are.
df_pyspark.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| age|Experience|salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|     Sudhanshu|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|NULL|      NULL| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      NULL|  NULL|
+--------------+----+----------+------+



In [13]:
## Filling the NULL values in specific columns
## NOTE(review): this call changes nothing — the output still shows NULL in age and
## Experience. The fill value is a string, and Spark ignores subset columns whose data
## type does not match the value. Presumably these columns were inferred as integers;
## a numeric fill value would be needed to actually replace their NULLs — TODO confirm.
df_pyspark.na.fill('Missing Value',['Experience','age']).show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [14]:
from pyspark.ml.feature import Imputer

# Imputer that replaces NULLs with the column mean, writing each result
# to a new "<column>_imputed" column next to the original.
imputer = Imputer(
    inputCols=['age', 'Experience', 'salary'],
    outputCols=["{}_imputed".format(name) for name in ['age', 'Experience', 'salary']],
)
imputer = imputer.setStrategy('mean')

In [15]:
# Fit the imputer on df_pyspark, then add the *_imputed columns and display the result.
# In the output the NULLs are replaced by 28 / 5 / 25750 (shown as whole numbers).
imputer.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|salary|age_imputed|Experience_imputed|salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         28|                 5|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+---------+----+----------+------+-----------+------------------+--------------+

In [16]:
from pyspark.ml.feature import Imputer

# Imputer that replaces NULLs with the column median, writing each result
# to a new "<column>_imputed" column next to the original.
imputer = Imputer(
    inputCols=['age', 'Experience', 'salary'],
    outputCols=["{}_imputed".format(name) for name in ['age', 'Experience', 'salary']],
)
imputer = imputer.setStrategy('median')

In [17]:
# Fit the imputer on df_pyspark, then add the *_imputed columns and display the result.
# In the output the NULLs are replaced by 29 / 4 / 20000.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|salary|age_imputed|Experience_imputed|salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         20000|
+---------+----+----------+------+-----------+------------------+--------------+

In [18]:
# df_pyspark itself is unchanged: the *_imputed columns exist only on the DataFrame returned by transform().
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [19]:
# Summary statistics for Experience; count is 7 (not 9) because the two NULL entries are not counted.
df_pyspark.describe('Experience').show()

+-------+------------------+
|summary|        Experience|
+-------+------------------+
|  count|                 7|
|   mean| 5.428571428571429|
| stddev|3.8234863173611093|
|    min|                 1|
|    max|                10|
+-------+------------------+



In [20]:
# Re-display the source DataFrame (9 rows) for comparison with the distinct() result in the next cell.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [21]:
# distinct() returns a new DataFrame with duplicate rows removed, comparing all columns.
# Every row here is already unique, so all 9 remain; note the output order differs from the source.
df_pyspark.distinct().show()

+---------+----+----------+------+
|     Name| age|Experience|salary|
+---------+----+----------+------+
|  Shubham|  23|         2| 18000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|    Krish|  31|        10| 30000|
|   Harsha|  21|         1| 15000|
|Sudhanshu|  30|         8| 25000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [None]:
from pyspark.sql.functions import when

# Add an "Age Category" column: age below 30 -> "Young", everything else -> "Adult".
# Fixes two NameErrors in the original cell: `when` was never imported, and the
# column was read from an undefined `df` instead of `df_pyspark`.
# NOTE: a NULL age does not satisfy `age < 30`, so those rows fall through to
# otherwise() and are labelled "Adult".
df_new = df_pyspark.withColumn(
    "Age Category",
    when(df_pyspark["age"] < 30, "Young").otherwise("Adult"),
)