In [15]:
# Install PySpark with the %pip magic so the package lands in the environment of the
# running kernel (a bare `!pip` uses whichever pip happens to be first on the shell PATH).
# NOTE(review): consider pinning a version (pyspark==X.Y.Z) for reproducible runs.
%pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
import pyspark
from pyspark.sql import SparkSession

In [17]:
# Create the Spark session for this notebook (or reuse one that already exists),
# registered under the application name 'dataframe'.
spark = (
    SparkSession.builder
    .appName('dataframe')
    .getOrCreate()
)

In [18]:
# Load the CSV: the first row supplies the column names and Spark infers each column's type.
# NOTE(review): the path is absolute; the file must exist at /content/data2.csv before running.
df_spark = spark.read.csv(
    '/content/data2.csv',
    header=True,
    inferSchema=True,
)
df_spark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [19]:
from pyspark.ml.feature import Imputer

In [20]:
# Numeric columns whose missing values should be filled in.
cols = ['age', 'Experience', 'Salary']

# Every input column gets a sibling "<name>_imputed" output column;
# missing entries are filled using the 'mean' strategy.
imputed_cols = [f"{name}_imputed" for name in cols]
imputer = Imputer(strategy='mean', inputCols=cols, outputCols=imputed_cols)

In [21]:
# Fit the imputer on the current frame, then apply the fitted model to that same frame;
# the result (original columns plus the *_imputed ones) replaces df_spark.
imputer_model = imputer.fit(df_spark)
df_spark = imputer_model.transform(df_spark)

In [22]:
# Display the frame after imputation: the *_imputed columns sit alongside the originals.
df_spark.show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|null|      null| 40000|         28|                 5|         40000|
|     null|  34|        10| 38000|         34|                10|         38000|
|     null|  36|      null|  null|         36|                 5|         25750|
+---------+----+----------+------+-----------+------------------+--------------+

In [29]:
# Remove the three original columns in a single call, leaving Name plus the *_imputed columns.
# NOTE: run this cell once, before the renaming cells further down; once 'age_imputed'
# has been renamed to 'age', re-running this drop would discard the imputed data.
df_spark = df_spark.drop('age', 'Experience', 'Salary')
df_spark.show()

+---------+-----------+------------------+--------------+
|     Name|age_imputed|Experience_imputed|Salary_imputed|
+---------+-----------+------------------+--------------+
|    Krish|         31|                10|         30000|
|Sudhanshu|         30|                 8|         25000|
|    Sunny|         29|                 4|         20000|
|     Paul|         24|                 3|         20000|
|   Harsha|         21|                 1|         15000|
|  Shubham|         23|                 2|         18000|
|   Mahesh|         28|                 5|         40000|
|     null|         34|                10|         38000|
|     null|         36|                 5|         25750|
+---------+-----------+------------------+--------------+



In [30]:
# Give the imputed age column its original name back.
df_spark = df_spark.withColumnRenamed(existing='age_imputed', new='age')
df_spark.show()

+---------+---+------------------+--------------+
|     Name|age|Experience_imputed|Salary_imputed|
+---------+---+------------------+--------------+
|    Krish| 31|                10|         30000|
|Sudhanshu| 30|                 8|         25000|
|    Sunny| 29|                 4|         20000|
|     Paul| 24|                 3|         20000|
|   Harsha| 21|                 1|         15000|
|  Shubham| 23|                 2|         18000|
|   Mahesh| 28|                 5|         40000|
|     null| 34|                10|         38000|
|     null| 36|                 5|         25750|
+---------+---+------------------+--------------+



In [31]:
# Restore the remaining original column names, then discard every row that still
# contains a null (only Name can be null at this point, as it was never imputed).
df_spark = (
    df_spark
    .withColumnRenamed('Experience_imputed', 'Experience')
    .withColumnRenamed('Salary_imputed', 'Salary')
    .dropna()
)
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|   Mahesh| 28|         5| 40000|
+---------+---+----------+------+



## Filter

In [32]:
# Keep only the rows whose Salary is at least 20000.
df_spark.where(df_spark['Salary'] >= 20000).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Mahesh| 28|         5| 40000|
+---------+---+----------+------+



In [34]:
# Negate the salary condition with ~ and show only Name and age for the rows that fail it.
salary_at_least_20k = df_spark['Salary'] >= 20000
df_spark.filter(~salary_at_least_20k).select('Name', 'age').show()

+-------+---+
|   Name|age|
+-------+---+
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [36]:
# Rows must satisfy both conditions (& is the column-wise AND); show only Name and age.
well_paid = df_spark['Salary'] >= 20000
experienced = df_spark['Experience'] >= 8
df_spark.filter(well_paid & experienced).select('Name', 'age').show()

+---------+---+
|     Name|age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
+---------+---+

