In [107]:
import pyspark

In [108]:
# Create the Spark session for this notebook (an already-running session is reused)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)
spark

In [109]:
# Read the CSV file; the first line of the file supplies the column names.
# No schema is inferred here - the numeric columns are cast explicitly in a later cell.
spark_df = (
    spark.read
    .option("header", True)
    .option("enforceSchema", True)
    .csv('test_file.csv')
)
spark_df.show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|NULL|      NULL|NULL|      NULL|  NULL|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



In [110]:
#Cast Columns to the Correct Data Types:

from pyspark.sql.types import IntegerType, DoubleType

# Columns that hold whole numbers and should be stored as integers
cols=["Age","experience","salary"]

# The loop variable is deliberately not named `col`: every name in a notebook is global,
# so `col` would clobber pyspark.sql.functions.col if that helper is imported anywhere.
for column_name in cols:
    # Replace each column with an integer-typed version of itself (same column name)
    spark_df = spark_df.withColumn(column_name, spark_df[column_name].cast(IntegerType()))

# Print the schema to confirm the three columns are now integers
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [111]:
# Display the rows again now that Age, experience and salary have been cast to integers
spark_df.show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|NULL|      NULL|NULL|      NULL|  NULL|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



**Drop NaN/Null values**

In [112]:
# Drop every row that contains at least one null (default how='any').
# The result is only displayed; spark_df itself is not modified.
spark_df.na.drop().show()


# Equivalent spelling:
#spark_df.dropna().show()

+----+----------+---+----------+------+
|Name|Department|Age|experience|salary|
+----+----------+---+----------+------+
|  AA|        D1| 25|         1|    70|
|  BB|        D1| 30|         3|   100|
|  CC|        D2| 35|         4|   110|
|  DD|        D3| 22|         1|    60|
|  ZZ|        D3| 23|         2|    46|
+----+----------+---+----------+------+



In [113]:
# `how` accepts 'any' (drop a row if any value is null) or 'all' (drop it only if every value is null)

spark_df.na.drop(how='all').show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



In [114]:
# `subset` restricts the null check to the listed columns:
# only rows whose Department is null are dropped here
spark_df.na.drop(subset=['Department']).show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



In [115]:
# The dropna() results above were only displayed, never assigned back,
# so spark_df still contains every original row
spark_df.show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|NULL|      NULL|NULL|      NULL|  NULL|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



**Filling Null values**

In [116]:
#1) filling with a constant value
# Replace nulls in the Age column with 30; other columns keep their nulls
spark_df.na.fill(30, subset=['Age']).show()

+----+----------+---+----------+------+
|Name|Department|Age|experience|salary|
+----+----------+---+----------+------+
|  AA|        D1| 25|         1|    70|
|  BB|        D1| 30|         3|   100|
|  CC|        D2| 35|         4|   110|
|  DD|        D3| 22|         1|    60|
|  EE|        D2| 30|         3|    75|
|NULL|      NULL| 30|      NULL|  NULL|
|  FF|      NULL| 44|         3|    73|
|  ZZ|        D3| 23|         2|    46|
|  KK|        D4| 30|         2|    80|
+----+----------+---+----------+------+



In [117]:
# Single-column alternatives:
# spark_df.fillna('Unknown',['Department']).show()
# spark_df.fillna('30',['Age']).show()

# A dict gives each column its own replacement value.
# NOTE: spark_df is overwritten here, so every later cell sees Age and Department without nulls.
fill_values = {'Age':30,'Department':'Unknown'}
spark_df = spark_df.fillna(fill_values)

spark_df.show()


+----+----------+---+----------+------+
|Name|Department|Age|experience|salary|
+----+----------+---+----------+------+
|  AA|        D1| 25|         1|    70|
|  BB|        D1| 30|         3|   100|
|  CC|        D2| 35|         4|   110|
|  DD|        D3| 22|         1|    60|
|  EE|        D2| 30|         3|    75|
|NULL|   Unknown| 30|      NULL|  NULL|
|  FF|   Unknown| 44|         3|    73|
|  ZZ|        D3| 23|         2|    46|
|  KK|        D4| 30|         2|    80|
+----+----------+---+----------+------+



In [118]:
# spark_df now holds the fillna() result: Age and Department no longer contain nulls
spark_df.show()

+----+----------+---+----------+------+
|Name|Department|Age|experience|salary|
+----+----------+---+----------+------+
|  AA|        D1| 25|         1|    70|
|  BB|        D1| 30|         3|   100|
|  CC|        D2| 35|         4|   110|
|  DD|        D3| 22|         1|    60|
|  EE|        D2| 30|         3|    75|
|NULL|   Unknown| 30|      NULL|  NULL|
|  FF|   Unknown| 44|         3|    73|
|  ZZ|        D3| 23|         2|    46|
|  KK|        D4| 30|         2|    80|
+----+----------+---+----------+------+



In [119]:
#2) filling numeric columns with a statistic using Imputer

from pyspark.ml.feature import Imputer

# strategy can be 'mean', 'median' or 'mode'; the imputed values are written back into Age
imputer = Imputer(strategy='mean', inputCols=['Age'], outputCols=['Age'])

# NOTE(review): spark_df was reassigned earlier with Age nulls already replaced by 30,
# so there is nothing left to impute here and Age comes out unchanged.
imputer_model = imputer.fit(spark_df)
df_imputed = imputer_model.transform(spark_df)

df_imputed.show()


+----+----------+---+----------+------+
|Name|Department|Age|experience|salary|
+----+----------+---+----------+------+
|  AA|        D1| 25|         1|    70|
|  BB|        D1| 30|         3|   100|
|  CC|        D2| 35|         4|   110|
|  DD|        D3| 22|         1|    60|
|  EE|        D2| 30|         3|    75|
|NULL|   Unknown| 30|      NULL|  NULL|
|  FF|   Unknown| 44|         3|    73|
|  ZZ|        D3| 23|         2|    46|
|  KK|        D4| 30|         2|    80|
+----+----------+---+----------+------+



In [120]:
#filling categorical features with mode
# Step 1: Calculate the mode (most frequent value) of Department.
#  - nulls are excluded so that "missing" can never be chosen as the mode
#  - ties are broken alphabetically so the result is the same on every run
#  - placeholder strings such as 'Unknown' are ordinary values and are counted like any other
mode_row = (
    df_imputed
    .filter(df_imputed["Department"].isNotNull())
    .groupBy("Department")
    .count()
    .orderBy(["count", "Department"], ascending=[False, True])
    .first()
)
# first() returns None when there is no non-null Department at all
mode_value = mode_row["Department"] if mode_row is not None else None
print(mode_value)

# Step 2: Use fillna() to replace nulls with the mode.
# The fill is applied to df_imputed (not spark_df) so the Imputer result is kept.
if mode_value is not None:
    df_imputed = df_imputed.fillna({'Department': mode_value})

# Show the imputed DataFrame
df_imputed.show()


D1
+----+----------+---+----------+------+
|Name|Department|Age|experience|salary|
+----+----------+---+----------+------+
|  AA|        D1| 25|         1|    70|
|  BB|        D1| 30|         3|   100|
|  CC|        D2| 35|         4|   110|
|  DD|        D3| 22|         1|    60|
|  EE|        D2| 30|         3|    75|
|NULL|   Unknown| 30|      NULL|  NULL|
|  FF|   Unknown| 44|         3|    73|
|  ZZ|        D3| 23|         2|    46|
|  KK|        D4| 30|         2|    80|
+----+----------+---+----------+------+

