In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, when

# Get or create the session shared by every cell below. The master URL and
# the executor memory are fixed here, so this cell expects the Spark master
# at spark-master:7077 to be reachable.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("handle-nulls")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Restrict Spark's logging to ERROR so the cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")

In [12]:
# Read the Nobel prize file; multiLine is switched on so that a JSON record
# may span several lines of the file.
df = spark.read.json("../data/nobel_prizes.json", multiLine=True)

# One row per laureate: explode() turns every element of the `laureates`
# array into its own row, after which the nested laureate fields are selected
# next to the prize-level columns.
# NOTE(review): per the Spark docs, explode() produces no row for a null or
# empty array, so prizes without laureates would be missing from
# df_flattened - confirm that this is intended.
df_flattened = df.withColumn("laureates", explode("laureates")).select(
    "category",
    "year",
    "overallMotivation",
    "laureates.id",
    "laureates.firstname",
    "laureates.surname",
    "laureates.share",
    "laureates.motivation",
)

                                                                                

In [13]:
# Keep only fully populated rows: with how="any" a row is removed as soon as
# a single one of its columns is null.
df_dropna = df_flattened.na.drop(how="any")

# Preview the rows that remain.
df_dropna.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---------+----+--------------------+----+----------+----------+-----+--------------------+
| category|year|   overallMotivation|  id| firstname|   surname|share|          motivation|
+---------+----+--------------------+----+----------+----------+-----+--------------------+
|  physics|2021|"for groundbreaki...| 999|   Syukuro|    Manabe|    4|"for the physical...|
|  physics|2021|"for groundbreaki...|1000|     Klaus|Hasselmann|    4|"for the physical...|
|  physics|2021|"for groundbreaki...|1001|   Giorgio|    Parisi|    2|"for the discover...|
|  physics|2019|"for contribution...| 973|     James|   Peebles|    2|"for theoretical ...|
|  physics|2019|"for contribution...| 974|    Michel|     Mayor|    4|"for the discover...|
|  physics|2019|"for contribution...| 975|    Didier|    Queloz|    4|"for the discover...|
|  physics|2018|"for groundbreaki...| 960|    Arthur|    Ashkin|    2|"for the optical ...|
|  physics|2018|"for groundbreaki...| 961|    Gérard|    Mourou|    4|"for their

                                                                                

In [14]:
# Substitute the placeholder "N/A" for nulls. Per the Spark docs a string
# fill value is applied to string columns only; other column types are left
# untouched.
df_fillna = df_flattened.na.fill("N/A")

# Preview the result with the placeholder in place.
df_fillna.show()


[Stage 2:>                                                          (0 + 1) / 1]

+----------+----+-----------------+----+--------------------+-----------+-----+--------------------+
|  category|year|overallMotivation|  id|           firstname|    surname|share|          motivation|
+----------+----+-----------------+----+--------------------+-----------+-----+--------------------+
| chemistry|2022|              N/A|1015|             Carolyn|   Bertozzi|    3|"for the developm...|
| chemistry|2022|              N/A|1016|              Morten|     Meldal|    3|"for the developm...|
| chemistry|2022|              N/A| 743|               Barry|  Sharpless|    3|"for the developm...|
| economics|2022|              N/A|1021|                 Ben|   Bernanke|    3|"for research on ...|
| economics|2022|              N/A|1022|             Douglas|    Diamond|    3|"for research on ...|
| economics|2022|              N/A|1023|              Philip|     Dybvig|    3|"for research on ...|
|literature|2022|              N/A|1017|               Annie|     Ernaux|    1|"for the cou

                                                                                

In [15]:
# Per-column conditional replacement: each listed column keeps its own value
# when one is present and otherwise falls back to a column-specific default
# ("" for the text columns, 9999 for the year).
df_replace = (
    df_flattened
    .withColumn(
        "category",
        when(col("category").isNotNull(), col("category")).otherwise(""),
    )
    .withColumn(
        "overallMotivation",
        when(col("overallMotivation").isNotNull(), col("overallMotivation")).otherwise(""),
    )
    .withColumn(
        "firstname",
        when(col("firstname").isNotNull(), col("firstname")).otherwise(""),
    )
    .withColumn(
        "surname",
        when(col("surname").isNotNull(), col("surname")).otherwise(""),
    )
    .withColumn(
        "year",
        when(col("year").isNotNull(), col("year")).otherwise(9999),
    )
)

# Preview the frame with the defaults applied.
df_replace.show()


+----------+----+-----------------+----+--------------------+-----------+-----+--------------------+
|  category|year|overallMotivation|  id|           firstname|    surname|share|          motivation|
+----------+----+-----------------+----+--------------------+-----------+-----+--------------------+
| chemistry|2022|                 |1015|             Carolyn|   Bertozzi|    3|"for the developm...|
| chemistry|2022|                 |1016|              Morten|     Meldal|    3|"for the developm...|
| chemistry|2022|                 | 743|               Barry|  Sharpless|    3|"for the developm...|
| economics|2022|                 |1021|                 Ben|   Bernanke|    3|"for research on ...|
| economics|2022|                 |1022|             Douglas|    Diamond|    3|"for research on ...|
| economics|2022|                 |1023|              Philip|     Dybvig|    3|"for research on ...|
|literature|2022|                 |1017|               Annie|     Ernaux|    1|"for the cou

### Handling null values in user-defined functions (UDFs)

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Small (name, age) sample used to try out the UDF.
# Note: the only null in this sample is Alice's age; every name is populated,
# so process_name's None branch is not triggered by this data.
data = [
    ("John", 25),
    ("Alice", None),
    ("Bob", 30),
]
df = spark.createDataFrame(data, schema=["name", "age"])

# Define a UDF to handle null values
def process_name(name):
    """Return ``name`` in upper case, or "Unknown" when ``name`` is None.

    The explicit None check is what makes the function null-safe: without it,
    calling ``.upper()`` on a null input would raise.
    """
    return "Unknown" if name is None else name.upper()

# Wrap the plain Python function as a Spark UDF that returns a string column.
process_name_udf = udf(process_name, returnType=StringType())

# Derive `processed_name` by running the UDF over the `name` column.
df_with_processed_names = df.withColumn(
    "processed_name",
    process_name_udf(col("name")),
)

# Preview the frame with the derived column.
df_with_processed_names.show()


[Stage 5:>                                                          (0 + 1) / 1]

+-----+----+--------------+
| name| age|processed_name|
+-----+----+--------------+
| John|  25|          JOHN|
|Alice|null|         ALICE|
|  Bob|  30|           BOB|
+-----+----+--------------+



                                                                                

### Handling null values in machine learning pipelines

In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

# Five (id, value) rows; ids 2 and 4 have no value.
data = [(1, 2.0), (2, None), (3, 5.0), (4, None), (5, 7.0)]
df = spark.createDataFrame(data, schema=["id", "value"])

# Imputer writes a filled copy of `value` into `imputed_value`; "mean" is the
# default strategy and is spelled out here for clarity.
imputer = Imputer(
    strategy="mean",
    inputCols=["value"],
    outputCols=["imputed_value"],
)

# fit() learns the replacement value from the data, and transform() adds the
# output column using that fitted model.
imputer_model = imputer.fit(df)
imputed_df = imputer_model.transform(df)

# Preview the original and imputed columns side by side.
imputed_df.show()

                                                                                

+---+-----+-----------------+
| id|value|    imputed_value|
+---+-----+-----------------+
|  1|  2.0|              2.0|
|  2| null|4.666666666666667|
|  3|  5.0|              5.0|
|  4| null|4.666666666666667|
|  5|  7.0|              7.0|
+---+-----+-----------------+



In [18]:
# Shut the Spark session down now that the notebook is finished with it.
spark.stop()