In [0]:
# Imports
from pyspark.sql.functions import *

In [0]:
# Load the raw (bronze-layer) AQI records from the "aqi_bronze" table
bronze_df = spark.table("aqi_bronze")

In [0]:
# The pollutant_min / pollutant_max / pollutant_avg columns can contain the literal "NA".
# try_cast yields NULL instead of failing when a value is not a valid double,
# so every "NA" entry ends up as null after the conversion.
for pollutant_col in ("pollutant_min", "pollutant_max", "pollutant_avg"):
    # Same column name on both sides: the string column is replaced by its double version
    bronze_df = bronze_df.withColumn(pollutant_col, expr(f"try_cast({pollutant_col} as double)"))

In [0]:
# Parse the string column last_update (day-month-year followed by 24-hour time)
# into a real timestamp, stored in the new column last_update_timestamp
parsed_last_update = to_timestamp(col("last_update"), format="dd-MM-yyyy HH:mm:ss")
bronze_df = bronze_df.withColumn("last_update_timestamp", parsed_last_update)

In [0]:
# Reshape from one row per pollutant reading to one row per grouping key,
# with one column per pollutant id holding the mean of pollutant_avg.
grouping_keys = [
    "station",
    "city",
    "state",
    "country",
    "last_update_timestamp",
    "ingestion_timestamp",
    "latitude",
    "longitude",
]
# Only the pollutant ids listed here become output columns, in this order
pollutant_ids = ["SO2", "CO", "NO2", "OZONE", "PM2.5", "PM10", "NH3"]

pollutants_by_key = bronze_df.groupBy(*grouping_keys).pivot("pollutant_id", pollutant_ids)
bronze_df = pollutants_by_key.agg(avg("pollutant_avg"))

In [0]:
# Persist the cleaned, pivoted frame as the Delta table "aqi_silver";
# "overwrite" mode replaces whatever the table currently holds
silver_writer = bronze_df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("aqi_silver")