In [1]:
import os
import boto3

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark
import datetime


In [2]:
# Resolve the SageMaker Spark jars once and reuse the list; the original
# called classpath_jars() a second time and left `jars` unused.
jars = sagemaker_pyspark.classpath_jars()

# The driver classpath is passed to Spark as a single ":"-separated string.
classpath = ":".join(jars)

# See the SageMaker Spark Github to learn how to connect to EMR from a notebook instance
# Local-mode session using every available core, with the SageMaker jars on
# the driver classpath. getOrCreate() returns the already-running session
# when this cell is re-executed.
spark = (
    SparkSession.builder.config("spark.driver.extraClassPath", classpath)
    .master("local[*]")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [3]:
# Point the S3A connector at the S3 endpoint of this session's AWS region.
region = boto3.Session().region_name
endpoint_domain = "com"
spark._jsc.hadoopConfiguration().set(
    "fs.s3a.endpoint", f"s3.{region}.amazonaws.{endpoint_domain}"
)

# Today's date in UTC as YYYY-MM-DD; it selects the daily prefix to read.
# Uses an aware "now" because datetime.utcnow() is deprecated.
date = datetime.datetime.now(datetime.timezone.utc).date().isoformat()

# Read the day's JSON records and let Spark infer the schema.
# Two no-ops from the original were dropped:
#   * .option("numFeatures", "784") is a libsvm reader option, not a JSON one;
#   * .format(region) was applied to an f-string with no placeholder left, so
#     the path never depended on the region.
df = spark.read.format("json").load(f"s3a://openaq-fetches/realtime/{date}")

df.show()

+---------------+--------------------+---------------+-----+--------------------+-------+--------------------+--------------------+------+---------+--------------+----------+----+-----+
|_corrupt_record|         attribution|averagingPeriod| city|         coordinates|country|                date|            location|mobile|parameter|    sourceName|sourceType|unit|value|
+---------------+--------------------+---------------+-----+--------------------+-------+--------------------+--------------------+------+---------+--------------+----------+----+-----+
|           null|[[EPA AirNow DOS,...|   [hours, 1.0]|Dubai|[25.25848, 55.309...|     AE|[2022-04-15T10:00...|US Diplomatic Pos...| false|       o3|StateAir_Dubai|government| ppm|0.013|
|           null|[[EPA AirNow DOS,...|   [hours, 1.0]|Dubai|[25.25848, 55.309...|     AE|[2022-04-15T11:00...|US Diplomatic Pos...| false|       o3|StateAir_Dubai|government| ppm|0.022|
|           null|[[EPA AirNow DOS,...|   [hours, 1.0]|Dubai|[25.25848,

In [4]:
# Show the first record as a Row so the nested fields can be inspected.
df.head()

Row(_corrupt_record=None, attribution=[Row(name='EPA AirNow DOS', url='http://airnow.gov/index.cfm?action=airnow.global_summary')], averagingPeriod=Row(unit='hours', value=1.0), city='Dubai', coordinates=Row(latitude=25.25848, longitude=55.309166), country='AE', date=Row(local='2022-04-15T10:00:00+04:00', utc='2022-04-15T06:00:00.000Z'), location='US Diplomatic Post: Dubai', mobile=False, parameter='o3', sourceName='StateAir_Dubai', sourceType='government', unit='ppm', value=0.013)

In [14]:
# Total number of rows in df.
df.count()

3125589

In [6]:
# Print summary statistics (count, mean, stddev, min, quartiles, ...) per column.
# NOTE(review): the minimum of `value` is -9999.0, which looks like a
# missing-data sentinel — confirm before relying on the mean/stddev.
df.summary().show()

+-------+--------------------+------------------+-------+--------------------+---------+----------+----------+-------+------------------+
|summary|     _corrupt_record|              city|country|            location|parameter|sourceName|sourceType|   unit|             value|
+-------+--------------------+------------------+-------+--------------------+---------+----------+----------+-------+------------------+
|  count|                   1|           3125588|3125588|             3125588|  3125588|   3125588|   3125588|3125588|           3125588|
|   mean|                null|29.429906542056074|   null|              7004.0|     null|      null|      null|   null|36.280697162836255|
| stddev|                null|17.964028820023188|   null|                 0.0|     null|      null|      null|   null| 963.8705733297246|
|    min|{"date":{"utc":"2...|               007|     AD| Gołdap, ul. Jaćw...|       bc| ARPALAZIO|government|    ppm|           -9999.0|
|    25%|                null|    

In [7]:
# NOTE(review): describe() only returns a new DataFrame; without .show() this
# cell displays just the DataFrame's column listing, not the statistics.
# Consider appending .show(), or dropping the cell since summary() already
# prints the same kind of statistics.
df.describe()

DataFrame[summary: string, _corrupt_record: string, city: string, country: string, location: string, parameter: string, sourceName: string, sourceType: string, unit: string, value: string]

In [8]:
# List the distinct values found in the `parameter` column (null included).
(
    df.select("parameter")
    .distinct()
    .show()
)

+---------+
|parameter|
+---------+
|       bc|
|      so2|
|     null|
|       co|
|       o3|
|     pm10|
|      no2|
|     pm25|
+---------+



In [9]:
# Number of rows per `parameter` value, sorted by ascending count.
(
    df.groupBy("parameter")
    .count()
    .orderBy("count")
    .show()
)

+---------+------+
|parameter| count|
+---------+------+
|     null|     1|
|       bc| 12020|
|       co|196251|
|      so2|414756|
|       o3|550762|
|     pm10|569387|
|     pm25|665468|
|      no2|716944|
+---------+------+



In [10]:
# Print the schema Spark inferred from the JSON, including nested fields.
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- attribution: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- averagingPeriod: struct (nullable = true)
 |    |-- unit: string (nullable = true)
 |    |-- value: double (nullable = true)
 |-- city: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- country: string (nullable = true)
 |-- date: struct (nullable = true)
 |    |-- local: string (nullable = true)
 |    |-- utc: string (nullable = true)
 |-- location: string (nullable = true)
 |-- mobile: boolean (nullable = true)
 |-- parameter: string (nullable = true)
 |-- sourceName: string (nullable = true)
 |-- sourceType: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- value: double (nullable = true)



In [11]:
# Replace null `parameter` values with the placeholder string "NA"; only that
# column is touched.
# NOTE(review): there is exactly one null `parameter` row and exactly one
# non-null `_corrupt_record`, so this is presumably the unparseable record —
# confirm whether it should be dropped rather than relabelled.
filter_df = df.fillna(value="NA", subset=["parameter"])
filter_df.show()

+---------------+--------------------+---------------+-----+--------------------+-------+--------------------+--------------------+------+---------+--------------+----------+----+-----+
|_corrupt_record|         attribution|averagingPeriod| city|         coordinates|country|                date|            location|mobile|parameter|    sourceName|sourceType|unit|value|
+---------------+--------------------+---------------+-----+--------------------+-------+--------------------+--------------------+------+---------+--------------+----------+----+-----+
|           null|[[EPA AirNow DOS,...|   [hours, 1.0]|Dubai|[25.25848, 55.309...|     AE|[2022-04-15T10:00...|US Diplomatic Pos...| false|       o3|StateAir_Dubai|government| ppm|0.013|
|           null|[[EPA AirNow DOS,...|   [hours, 1.0]|Dubai|[25.25848, 55.309...|     AE|[2022-04-15T11:00...|US Diplomatic Pos...| false|       o3|StateAir_Dubai|government| ppm|0.022|
|           null|[[EPA AirNow DOS,...|   [hours, 1.0]|Dubai|[25.25848,

In [12]:
# Re-run the per-parameter row count on the filled frame; the former null
# group now appears under "NA".
(
    filter_df.groupBy("parameter")
    .count()
    .orderBy("count")
    .show()
)

+---------+------+
|parameter| count|
+---------+------+
|       NA|     1|
|       bc| 12020|
|       co|196251|
|      so2|414756|
|       o3|550762|
|     pm10|569387|
|     pm25|665468|
|      no2|716944|
+---------+------+

