<a href="https://colab.research.google.com/github/Vasugi2003/Big-Data-Analytics/blob/main/Manipulation_of_Null_Values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=9e7d351920254ad599c9dfedf32cd913902326759a52f3539514806eb41bd5cd
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql import SparkSession
# Step 2: obtain the Spark session for this notebook.
# getOrCreate() hands back the already-running session if there is one,
# otherwise it starts a new one under the application name given below.
spark = (
    SparkSession.builder
    .appName("PySpark_NullValue_Manipulation_Example")
    .getOrCreate()
)

In [None]:
# Column names for the sample plant readings, in the same order as the tuple fields.
columns = ["PlantName", "Temperature", "Moisture", "HealthStatus", "Size"]

# Sample rows; None marks a missing value and is shown as NULL by Spark.
data = [
    ("PlantA", 100, None, "Healthy", "Tall"),
    ("PlantB", None, "Moderate", "Healthy", None),
    ("PlantC", 50, "Low", None, "Short"),
    ("PlantD", None, None, "Unhealthy", None),
    ("PlantE", 80, "Moderate", "Healthy", "Medium"),
]

# Build the DataFrame (column types are inferred from the row values) and display it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+-----------+--------+------------+------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|
+---------+-----------+--------+------------+------+
|   PlantA|        100|    NULL|     Healthy|  Tall|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|
|   PlantC|         50|     Low|        NULL| Short|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|
|   PlantE|         80|Moderate|     Healthy|Medium|
+---------+-----------+--------+------------+------+



In [None]:
# The PySpark SQL function lit() wraps a literal or constant value in a Column, so it can be used to add a constant-valued column to a DataFrame (typedLit() is available only in the Scala API, not in PySpark)

# 1. lit
from pyspark.sql.functions import col, when, coalesce, lit

# Append a constant "Temperature_default" column to every row.
default_temperature = 25  # fallback temperature; change this to use another default
# lit() turns the plain Python value into a Column expression that withColumn() accepts.
temperature_default_column = lit(default_temperature)
df = df.withColumn("Temperature_default", temperature_default_column)
df.show()

+---------+-----------+--------+------------+------+-------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|
+---------+-----------+--------+------------+------+-------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|
|   PlantC|         50|     Low|        NULL| Short|                 25|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|
+---------+-----------+--------+------------+------+-------------------+



In [None]:
# coalesce() evaluates its arguments left to right and, for each row, returns the
# first one that is not null: the recorded Temperature when present, otherwise
# the value from the Temperature_default column.
temperature_or_default = coalesce(col("Temperature"), col("Temperature_default"))
df_coalesce = df.withColumn("Temperature_filled", temperature_or_default)

# Show the results
df_coalesce.show()

+---------+-----------+--------+------------+------+-------------------+------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|Temperature_filled|
+---------+-----------+--------+------------+------+-------------------+------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|               100|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|                25|
|   PlantC|         50|     Low|        NULL| Short|                 25|                50|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|                25|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|                80|
+---------+-----------+--------+------------+------+-------------------+------------------+



In [None]:
# 2. when
# Conditional column: rows with a NULL Moisture get the text "Unknown",
# every other row keeps its existing Moisture value.
df_when = df.withColumn(
    "Moisture_modified",
    when(col("Moisture").isNull(), "Unknown").otherwise(col("Moisture")),
)
df_when.show()

+---------+-----------+--------+------------+------+-------------------+-----------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|Moisture_modified|
+---------+-----------+--------+------------+------+-------------------+-----------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|          Unknown|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|         Moderate|
|   PlantC|         50|     Low|        NULL| Short|                 25|              Low|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|          Unknown|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|         Moderate|
+---------+-----------+--------+------------+------+-------------------+-----------------+



In [None]:
# 3. isNull
# Keep only the rows whose Size is NULL (where() is an alias of filter()).
size_is_missing = col("Size").isNull()
df_isnull = df.where(size_is_missing)
df_isnull.show()

+---------+-----------+--------+------------+----+-------------------+
|PlantName|Temperature|Moisture|HealthStatus|Size|Temperature_default|
+---------+-----------+--------+------------+----+-------------------+
|   PlantB|       NULL|Moderate|     Healthy|NULL|                 25|
|   PlantD|       NULL|    NULL|   Unhealthy|NULL|                 25|
+---------+-----------+--------+------------+----+-------------------+



In [None]:
# 4. isNotNull
# Print the full DataFrame first for comparison, then only the rows that
# actually carry a HealthStatus value.
df.show()
health_status_present = col("HealthStatus").isNotNull()
df_isnotnull = df.where(health_status_present)
df_isnotnull.show()

+---------+-----------+--------+------------+------+-------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|
+---------+-----------+--------+------------+------+-------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|
|   PlantC|         50|     Low|        NULL| Short|                 25|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|
+---------+-----------+--------+------------+------+-------------------+

+---------+-----------+--------+------------+------+-------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|
+---------+-----------+--------+------------+------+-------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|
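|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|
+---------+-----------+--------+------------+------+-------------------+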

In [None]:
# DataFrame.fillna(value, subset=None)
# DataFrame.na.fill(value, subset=None)   (alias of fillna)
#
# value  - the replacement written in place of NULL/None entries; per the PySpark
#          documentation it may be an int, float, string, bool, or a dict.
# subset - optional list of column names; when given, only NULL/None values in
#          those columns are replaced.

# 5. fillna
# Replace missing Size entries with the text "Not Specified"; other columns are untouched.
df_nvl = df.na.fill("Not Specified", subset=["Size"])
df_nvl.show()

+---------+-----------+--------+------------+-------------+-------------------+
|PlantName|Temperature|Moisture|HealthStatus|         Size|Temperature_default|
+---------+-----------+--------+------------+-------------+-------------------+
|   PlantA|        100|    NULL|     Healthy|         Tall|                 25|
|   PlantB|       NULL|Moderate|     Healthy|Not Specified|                 25|
|   PlantC|         50|     Low|        NULL|        Short|                 25|
|   PlantD|       NULL|    NULL|   Unhealthy|Not Specified|                 25|
|   PlantE|         80|Moderate|     Healthy|       Medium|                 25|
+---------+-----------+--------+------------+-------------+-------------------+



In [None]:
# 6. ifnull
# Emulates SQL IFNULL(Size, 'Not Specified'): Size_filled holds Size when it is
# present and the literal "Not Specified" when Size is NULL.
size_or_placeholder = coalesce(col("Size"), lit("Not Specified"))
df_ifnull = df.withColumn("Size_filled", size_or_placeholder)
df_ifnull.show()

+---------+-----------+--------+------------+------+-------------------+-------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|  Size_filled|
+---------+-----------+--------+------------+------+-------------------+-------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|         Tall|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|Not Specified|
|   PlantC|         50|     Low|        NULL| Short|                 25|        Short|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|Not Specified|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|       Medium|
+---------+-----------+--------+------------+------+-------------------+-------------+



In [None]:
# 7. nullif
# Emulates SQL NULLIF(HealthStatus, Size): the new column is NULL when the two
# columns are equal and HealthStatus otherwise. A comparison involving NULL is
# not true, so such rows fall through to the otherwise() branch.
df_nullif = df.withColumn(
    "HealthStatus_differs_from_Size",
    when(col("HealthStatus") == col("Size"), lit(None)).otherwise(col("HealthStatus")),
)
df_nullif.show()

+---------+-----------+--------+------------+------+-------------------+------------------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|HealthStatus_differs_from_Size|
+---------+-----------+--------+------------+------+-------------------+------------------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|                       Healthy|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|                       Healthy|
|   PlantC|         50|     Low|        NULL| Short|                 25|                          NULL|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|                     Unhealthy|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|                       Healthy|
+---------+-----------+--------+------------+------+-------------------+------------------------------+



In [None]:
# 8. nanvl
# NOTE: nanvl() itself targets NaN values in floating-point columns. Temperature
# here holds integers with NULLs (not NaN), so this cell shows the NULL
# counterpart instead: missing Temperature readings are replaced with 0.
df_filled = df.fillna({"Temperature": 0})
df_filled.show()

+---------+-----------+--------+------------+------+-------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|
+---------+-----------+--------+------------+------+-------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|
|   PlantB|          0|Moderate|     Healthy|  NULL|                 25|
|   PlantC|         50|     Low|        NULL| Short|                 25|
|   PlantD|          0|    NULL|   Unhealthy|  NULL|                 25|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|
+---------+-----------+--------+------------+------+-------------------+



In [None]:
# 9. ifnan
# Temperature contains NULLs rather than NaN, so the test is on NULL: existing
# readings are kept and missing ones become 0.0. Combining the integer column
# with the 0.0 literal makes the resulting column floating-point.
df_ifnan = df.withColumn(
    "Temperature_filled_nan",
    when(col("Temperature").isNotNull(), col("Temperature")).otherwise(0.0),
)
df_ifnan.show()

+---------+-----------+--------+------------+------+-------------------+----------------------+
|PlantName|Temperature|Moisture|HealthStatus|  Size|Temperature_default|Temperature_filled_nan|
+---------+-----------+--------+------------+------+-------------------+----------------------+
|   PlantA|        100|    NULL|     Healthy|  Tall|                 25|                 100.0|
|   PlantB|       NULL|Moderate|     Healthy|  NULL|                 25|                   0.0|
|   PlantC|         50|     Low|        NULL| Short|                 25|                  50.0|
|   PlantD|       NULL|    NULL|   Unhealthy|  NULL|                 25|                   0.0|
|   PlantE|         80|Moderate|     Healthy|Medium|                 25|                  80.0|
+---------+-----------+--------+------------+------+-------------------+----------------------+

