In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

#
#    These are standard python modules
#
#import json, time, urllib.parse
import json, time
import numpy as np
import pandas as pd
#
#    The 'requests' module is a distribution module for making web requests.
#
import requests
from tqdm import tqdm

In [4]:
#########
#
#    CONSTANTS
#

#
#    This is the root of all AQS API URLs
#
API_REQUEST_URL = 'https://aqs.epa.gov/data/api'

#
#    These are 'actions' we can ask the API to take or requests that we can make of the API
#
#    Sign-up request - generally only performed once - unless you lose your key
API_ACTION_SIGNUP = '/signup?email={email}'
#
#    List actions provide information on API parameter values that are required by some other actions/requests
API_ACTION_LIST_CLASSES = '/list/classes?email={email}&key={key}'
API_ACTION_LIST_PARAMS = '/list/parametersByClass?email={email}&key={key}&pc={pclass}'
API_ACTION_LIST_SITES = '/list/sitesByCounty?email={email}&key={key}&state={state}&county={county}'
#
#    Monitor actions are requests for monitoring stations that meet specific criteria
API_ACTION_MONITORS_COUNTY = '/monitors/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_MONITORS_BOX = '/monitors/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    Summary actions are requests for summary data. These are for daily summaries
API_ACTION_DAILY_SUMMARY_COUNTY = '/dailyData/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_DAILY_SUMMARY_BOX = '/dailyData/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    It is always nice to be respectful of a free data resource.
#    We're going to observe a 100 requests per minute limit - which is fairly nice
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED
#
#
#    This is a template that covers most of the parameters for the actions we might take, from the set of actions
#    above. In the examples below, most of the time parameters can either be supplied as individual values to a
#    function - or they can be set in a copy of the template and passed in with the template.
#
AQS_REQUEST_TEMPLATE = {
    "email":      "",
    "key":        "",
    "state":      "",     # the two digit state FIPS # as a string
    "county":     "",     # the three digit county FIPS # as a string
    "begin_date": "",     # the start of a time window in YYYYMMDD format
    "end_date":   "",     # the end of a time window in YYYYMMDD format, begin_date and end_date must be in the same year
    "minlat":    0.0,
    "maxlat":    0.0,
    "minlon":    0.0,
    "maxlon":    0.0,
    "param":     "",     # a list of comma separated 5 digit codes, max 5 codes requested
    "pclass":    ""      # parameter class is only used by the List calls
}

In [35]:
USERNAME = "voreddy@uw.edu"
APIKEY = "ecruosprey95"

In [36]:
#
#    This implements the list request. There are several versions of the list request that only require email and key.
#    This code sets the default action/requests to list the groups or parameter class descriptors. Having those descriptors
#    allows one to request the individual (proprietary) 5 digit codes for individual air quality measures by using the
#    param request. Some code in later cells will illustrate those requests.
#
def request_list_info(email_address = None, key = None,
                      endpoint_url = API_REQUEST_URL,
                      endpoint_action = API_ACTION_LIST_CLASSES,
                      request_template = AQS_REQUEST_TEMPLATE,
                      headers = None):
    """
    Request information about available data classes using the Air Quality System API.

    This function sends a request to the Air Quality System API to retrieve a list of available data classes.
    Data classes are categories of air quality data, and this request provides information about what data is
    available for retrieval.

    Args:
        email_address (str): Your email address for API access.
        key (str): Your API key for authentication.
        endpoint_url (str): The base URL for the API (default is API_REQUEST_URL).
        endpoint_action (str): The specific API action for requesting data class information
            (default is API_ACTION_LIST_CLASSES).
        request_template (dict): A dictionary containing request parameters.
            Used to set up the request (default is AQS_REQUEST_TEMPLATE).
        headers (dict): Optional headers to include in the HTTP request.

    Returns:
        dict or None: A dictionary containing information about available data classes retrieved from the API, or None if the request fails.

    Raises:
        Exception: If either email_address or key is missing in the request template.

    Example:
        To retrieve information about available data classes, you can use this function as follows:
        ```
        data_info = request_list_info(email_address='your_email@example.com', key='your_api_key')
        if data_info:
            print(data_info)
        else:
            print("Request failed.")
        ```
    """
    #  Make sure we have email and key - at least
    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key

    # For the basic request we need an email address and a key
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_list_info()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_list_info()'")

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [37]:
#
#   The default should get us a list of the various groups or classes of sensors. These classes are user defined names for clustors of
#   sensors that might be part of a package or default air quality sensing station. We need a class name to start getting down to the
#   a sensor ID. Each sensor type has an ID number. We'll eventually need those ID numbers to be able to request values that come from
#   that specific sensor.
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY

response = request_list_info(request_template=request_data)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))


[
    {
        "code": "AIRNOW MAPS",
        "value_represented": "The parameters represented on AirNow maps (88101, 88502, and 44201)"
    },
    {
        "code": "ALL",
        "value_represented": "Select all Parameters Available"
    },
    {
        "code": "AQI POLLUTANTS",
        "value_represented": "Pollutants that have an AQI Defined"
    },
    {
        "code": "CORE_HAPS",
        "value_represented": "Urban Air Toxic Pollutants"
    },
    {
        "code": "CRITERIA",
        "value_represented": "Criteria Pollutants"
    },
    {
        "code": "CSN DART",
        "value_represented": "List of CSN speciation parameters to populate the STI DART tool"
    },
    {
        "code": "FORECAST",
        "value_represented": "Parameters routinely extracted by AirNow (STI)"
    },
    {
        "code": "HAPS",
        "value_represented": "Hazardous Air Pollutants"
    },
    {
        "code": "IMPROVE CARBON",
        "value_represented": "IMPROVE Carbon Parameters"
    }

We're looking to calculate the Air Quality Index (AQI) by identifying and gathering data from sensors under the "AQI POLLUTANTS" category as per our 'list/Classes' query.

In [38]:
AQI_PARAM_CLASS = "AQI POLLUTANTS"

In [39]:
#
#   Structure a request to get the sensor IDs associated with the AQI
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['pclass'] = AQI_PARAM_CLASS  # here we specify that we want this 'pclass' or parameter classs

response = request_list_info(request_template=request_data, endpoint_action=API_ACTION_LIST_PARAMS)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))


[
    {
        "code": "42101",
        "value_represented": "Carbon monoxide"
    },
    {
        "code": "42401",
        "value_represented": "Sulfur dioxide"
    },
    {
        "code": "42602",
        "value_represented": "Nitrogen dioxide (NO2)"
    },
    {
        "code": "44201",
        "value_represented": "Ozone"
    },
    {
        "code": "81102",
        "value_represented": "PM10 Total 0-10um STP"
    },
    {
        "code": "88101",
        "value_represented": "PM2.5 - Local Conditions"
    },
    {
        "code": "88502",
        "value_represented": "Acceptable PM2.5 AQI & Speciation Mass"
    }
]


We've compiled a list of sensor IDs and their descriptions that track air quality. Due to the EPA AQS API's restrictions on data requests, we cannot query all AQI sensor data simultaneously as it limits us to five sensor values per request. To manage this, we've categorized the sensors into two groups: one for gas sampling sensors and another for particulate matter sensors, allowing us to query the data in two separate batches.

In [40]:
#   Gaseous AQI pollutants CO, SO2, NO2, and O2
AQI_PARAMS_GASEOUS = "42101,42401,42602,44201"
#
#   Particulate AQI pollutants PM10, PM2.5, and Acceptable PM2.5
AQI_PARAMS_PARTICULATES = "81102,88101,88502"
#
#

Choosing the city assigned. Air quality monitoring stations are dispersed across the US, each with a unique location. For testing, we'll need a few sample locations to examine the data variations from different sensors. Our list uses the FIPS codes for states and counties, which is a 5-digit identifier still prevalent despite newer codes being available. We will use these FIPS codes, as they are the standard identifiers in the AQS database, to explore data from two example cities.


In [43]:
CITY_LOCATIONS = {
    'Yakima' :       {'city'   : 'Yakima',
                  'county' : 'Yakima',
                  'state'  : 'Washington',
                  'fips'   : '53077',
                  'latlon' : [46.6021, -120.505898] }
}

In [44]:
#
#    This implements the daily summary request. Daily summary provides a daily summary value for each sensor being requested
#    from the start date to the end date.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_daily_summary(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_DAILY_SUMMARY_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):

    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]
    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_daily_summary()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_daily_summary()'")
    if not request_template['param']:
        raise Exception("Must supply param values to call 'request_daily_summary()'")
    if not request_template['begin_date']:
        raise Exception("Must supply a begin_date to call 'request_daily_summary()'")
    if not request_template['end_date']:
        raise Exception("Must supply an end_date to call 'request_daily_summary()'")
    # Note we're not validating FIPS fields because not all of the daily summary actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [45]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = CITY_LOCATIONS['Yakima']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Yakima']['fips'][2:]

# request daily summary data for 2020
gaseous_aqi = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20201231")
print("Response for the gaseous pollutants ...")
#
if gaseous_aqi["Header"][0]['status'] == "Success":
    print(json.dumps(gaseous_aqi['Data'],indent=4))
elif gaseous_aqi["Header"][0]['status'].startswith("No data "):
    print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
else:
    print(json.dumps(gaseous_aqi,indent=4))

request_data['param'] = AQI_PARAMS_PARTICULATES
# request daily summary data for 2020
particulate_aqi = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20201231")
print("Response for the particulate pollutants ...")
#
if particulate_aqi["Header"][0]['status'] == "Success":
    print(json.dumps(particulate_aqi['Data'],indent=4))
elif particulate_aqi["Header"][0]['status'].startswith("No data "):
    print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
else:
    print(json.dumps(particulate_aqi,indent=4))


Response for the gaseous pollutants ...
Looks like the response generated no data. You might take a closer look at your request and the response data.
Response for the particulate pollutants ...


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [46]:
#
#    This implements the monitors request. This requests monitoring stations. This can be done by state, county, or bounding box.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_monitors(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_MONITORS_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):

    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_monitors()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_monitors()'")
    if not request_template['param']:
        raise Exception("Must supply param values to call 'request_monitors()'")
    if not request_template['begin_date']:
        raise Exception("Must supply a begin_date to call 'request_monitors()'")
    if not request_template['end_date']:
        raise Exception("Must supply an end_date to call 'request_monitors()'")
    # Note we're not validating FIPS fields because not all of the monitors actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [47]:
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES     # remember we have both gaseous and particulates
#
#   We got the monitoring stations for Bend OR above (Deschutes county) - let's work with that one again
request_data['state'] = CITY_LOCATIONS['Yakima']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Yakima']['fips'][2:]
#
# the first example uses the default - request monitors by county, we'll just use a recent date for now
response = request_monitors(request_template=request_data, begin_date="20210701", end_date="20210731")
#
# the response should be similar to the 'list' request above - but in this case we should only get monitors that
# monitor the AQI_PARAMS_PARTICULATES set of params.
#
if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

[
    {
        "state_code": "53",
        "county_code": "077",
        "site_number": "0016",
        "parameter_code": "88502",
        "poc": 4,
        "parameter_name": "Acceptable PM2.5 AQI & Speciation Mass",
        "open_date": "2009-10-27",
        "close_date": "2021-08-31",
        "concurred_exclusions": null,
        "dominant_source": null,
        "measurement_scale": "NEIGHBORHOOD",
        "measurement_scale_def": "500 M TO 4KM",
        "monitoring_objective": "POPULATION EXPOSURE",
        "last_method_code": "771",
        "last_method_description": "Correlated Radiance Research M903 With Heated Inlet - Nephelometry",
        "last_method_begin_date": "2009-10-27",
        "naaqs_primary_monitor": null,
        "qa_primary_monitor": null,
        "monitor_type": null,
        "networks": null,
        "monitoring_agency_code": "1136",
        "monitoring_agency": "Washington State Department Of Ecology",
        "si_id": 95288,
        "latitude": 46.37543,
     

In [48]:
#
#    This implements the daily summary request. Daily summary provides a daily summary value for each sensor being requested
#    from the start date to the end date.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_daily_summary(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_DAILY_SUMMARY_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):
    """
    Make a request to obtain daily summary data for a specific set of parameters and time window.

    Args:
        email_address (str): Email address for API access.
        key (str): API key for authentication.
        param (str): Comma-separated list of parameter codes to request.
        begin_date (str): Start date for the data request (YYYYMMDD).
        end_date (str): End date for the data request (YYYYMMDD).
        fips (str): FIPS code for state and county. Must be 5 characters (e.g., '06089' for Shasta County, California).
        endpoint_url (str): URL for the API endpoint.
        endpoint_action (str): Specific action to perform for the request.
        request_template (dict): Template for request parameters.
        headers (dict): Additional headers for the request.

    Returns:
        dict: JSON response containing daily summary data.
    """

    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_daily_summary()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_daily_summary()'")
    if not request_template['param']:
        raise Exception("Must supply param values to call 'request_daily_summary()'")
    if not request_template['begin_date']:
        raise Exception("Must supply a begin_date to call 'request_daily_summary()'")
    if not request_template['end_date']:
        raise Exception("Must supply an end_date to call 'request_daily_summary()'")
    # Note we're not validating FIPS fields because not all of the daily summary actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        # if API_THROTTLE_WAIT > 0.0:
        #     time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

#
#    This is a list of field names - data - that will be extracted from each record
#
EXTRACTION_FIELDS = ['sample_duration','observation_count','arithmetic_mean','aqi']


#
#    The function creates a summary record
def extract_summary_from_response(r=None, fields=EXTRACTION_FIELDS):
    """
    Extract and structure summary data from the API response.

    Args:
        r (dict): JSON response data from the API request.
        fields (list): List of field names to extract from each record.

    Returns:
        dict: Structured summary data organized by monitoring site, parameter, and date.
    """
    ## the result will be structured around monitoring site, parameter, and then date
    result = dict()
    data = r["Data"]
    for record in data:
        # make sure the record is set up
        site = record['site_number']
        param = record['parameter_code']
        #date = record['date_local']    # this version keeps the respnse value YYYY-
        date = record['date_local'].replace('-','') # this puts it in YYYYMMDD format
        if site not in result:
            result[site] = dict()
            result[site]['local_site_name'] = record['local_site_name']
            result[site]['site_address'] = record['site_address']
            result[site]['state'] = record['state']
            result[site]['county'] = record['county']
            result[site]['city'] = record['city']
            result[site]['pollutant_type'] = dict()
        if param not in result[site]['pollutant_type']:
            result[site]['pollutant_type'][param] = dict()
            result[site]['pollutant_type'][param]['parameter_name'] = record['parameter']
            result[site]['pollutant_type'][param]['units_of_measure'] = record['units_of_measure']
            result[site]['pollutant_type'][param]['method'] = record['method']
            result[site]['pollutant_type'][param]['data'] = dict()
        if date not in result[site]['pollutant_type'][param]['data']:
            result[site]['pollutant_type'][param]['data'][date] = list()

        # now extract the specified fields
        extract = dict()
        for k in fields:
            if str(k) in record:
                extract[str(k)] = record[k]
            else:
                # this makes sure we always have the requested fields, even if
                # we have a missing value for a given day/month
                extract[str(k)] = None

        # add this extraction to the list for the day
        result[site]['pollutant_type'][param]['data'][date].append(extract)

    return result

In [49]:
def process_pollutant_data(data):
    """
    Process raw pollutant data and calculate the daily average AQI.

    Args:
        data (dict): Raw pollutant data in a nested dictionary format.

    Returns:
        pandas.DataFrame: DataFrame containing Date, Pollutant Name, and Daily Average AQI.
    """
    records = []

    for station_id, station_data in data.items():
        for pollutant_id, pollutant_data in station_data['pollutant_type'].items():
            pollutant_name = pollutant_data['parameter_name']
            pollutant_data = pollutant_data['data']

            for date, date_data in pollutant_data.items():
                for entry in date_data:
                    aqi = entry['aqi']
                    if aqi is not None:
                        records.append((date, pollutant_name, aqi))

    df = pd.DataFrame(records, columns=['Date', 'Pollutant Name', 'Avg AQI'])
    daily_avg_aqi = df.groupby(['Date', 'Pollutant Name'])['Avg AQI'].mean().reset_index()

    return daily_avg_aqi

In [50]:
# Define the columns
columns = ['Date', 'Pollutant Name', 'Avg AQI']

# Create empty DataFrames with the specified columns
gas_data = pd.DataFrame(columns=columns)
particulate_data = pd.DataFrame(columns=columns)

In [52]:
# Iterate through a range of years
for year in tqdm(range(1963, 2024), position=0, leave=True, desc="Processing Years"):
    # Define the start and end dates for the current year
    begin_date = str(year) + "0101"
    end_date = str(year) + "1231"

    # Prepare a request template for gaseous pollutants
    request_data = AQS_REQUEST_TEMPLATE.copy()
    request_data['email'] = USERNAME
    request_data['key'] = APIKEY
    request_data['param'] = AQI_PARAMS_GASEOUS
    request_data['state'] = CITY_LOCATIONS['Yakima']['fips'][:2]
    request_data['county'] = CITY_LOCATIONS['Yakima']['fips'][2:]

    # Request daily summary data for gaseous pollutants
    gaseous_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)

    # Extract and process the gaseous pollutant data
    extract_gaseous = extract_summary_from_response(gaseous_aqi)

    # Update the request template for particulate pollutants
    request_data['param'] = AQI_PARAMS_PARTICULATES

    # Request daily summary data for particulate pollutants
    particulate_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)

    # Extract and process the particulate pollutant data
    extract_particulate = extract_summary_from_response(particulate_aqi)

    # Convert the data into dataframes
    gaseous_df = process_pollutant_data(extract_gaseous)
    particulate_df = process_pollutant_data(extract_particulate)

    # Concatenate the dataframes
    gas_data = pd.concat([gas_data, gaseous_df])
    particulate_data = pd.concat([particulate_data, particulate_df])


Processing Years: 100%|██████████| 61/61 [01:27<00:00,  1.44s/it]


In [53]:
combined_data = pd.concat([gas_data, particulate_data], ignore_index=True)
combined_data

Unnamed: 0,Date,Pollutant Name,Avg AQI
0,19790101,Carbon monoxide,28.000000
1,19790102,Carbon monoxide,43.000000
2,19790103,Carbon monoxide,50.000000
3,19790104,Carbon monoxide,40.000000
4,19790105,Carbon monoxide,54.000000
...,...,...,...
27327,20230729,PM2.5 - Local Conditions,25.000000
27328,20230730,PM10 Total 0-10um STP,11.000000
27329,20230730,PM2.5 - Local Conditions,20.333333
27330,20230731,PM10 Total 0-10um STP,13.000000


In [54]:
# Convert the 'Date' column to a datetime format
combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%Y%m%d')

# Sort the data by 'Date' in ascending order and 'Avg AQI' in descending order
combined_data = combined_data.sort_values(['Date', 'Avg AQI'], ascending=[True, False])

# Group the data by 'Date' and keep the first entry (max AQI) for each date
combined_data = combined_data.groupby('Date').first().reset_index()

# Extract the 'Year' from the 'Date' column
combined_data['Year'] = combined_data['Date'].dt.year

# Group the data by 'Year' and select the top 10 AQI values for each year
top_10_aqi = combined_data.groupby('Year').apply(lambda x: x.nlargest(10, 'Avg AQI')).reset_index(drop=True)

# Calculate the mean of the top 10 AQI values for each year and rename the column
std_aqi = top_10_aqi.groupby('Year')['Avg AQI'].mean().reset_index()
std_aqi = std_aqi.rename(columns={'Avg AQI': 'Calculated AQI'})

In [55]:
# Convert the DataFrame to a JSON object
std_aqi_json = std_aqi.to_json(orient='records')

# Specify the path to the output file
output_file_path = 'std_aqi.json'

# Write the JSON data to the file
with open(output_file_path, 'w') as json_file:
    json_file.write(std_aqi_json)

# Close the file
json_file.close()


In [33]:
#
#    Create a copy of the AQS_REQUEST_TEMPLATE
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES     # same particulate request as the one abover
#
#   Not going to use these - comment them out
#request_data['state'] = CITY_LOCATIONS['bend']['fips'][:2]
#request_data['county'] = CITY_LOCATIONS['bend']['fips'][2:]
#
#   Now, we need bounding box parameters

#   50 mile box
#bbox = bounding_latlon(CITY_LOCATIONS['Yakima'],scale=1.0)
#   100 mile box
#bbox = bounding_latlon(CITY_LOCATIONS['Yakima'],scale=2.0)
#   150 mile box
#bbox = bounding_latlon(CITY_LOCATIONS['Yakima'],scale=3.0)
#   200 mile box
#bbox = bounding_latlon(CITY_LOCATIONS['Yakima'],scale=4.0)
#   250 mile box
bbox = bounding_latlon(CITY_LOCATIONS['Yakima'],scale=5.0)

# the bbox response comes back as a list - [minlat,maxlat,minlon,maxlon]

#   put our bounding box into the request_data
request_data['minlat'] = bbox[0]
request_data['maxlat'] = bbox[1]
request_data['minlon'] = bbox[2]
request_data['maxlon'] = bbox[3]

#
#   we need to change the action for the API from the default to the bounding box - same recent date for now
response = request_monitors(request_template=request_data, begin_date="20020602", end_date="20020603",
                            endpoint_action = API_ACTION_MONITORS_BOX)
#
#
#
if response["Header"][0]['status'] == "Success":
    for station in response['Data']:
      print("state_code: ",station["state_code"])
      print("county_code: ",station["county_code"])
      print("open_date: ",station["open_date"])
      print("close_date: ",station["close_date"])
      print(" ")
    #print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

{
    "Header": [
        {
            "status": "No data matched your selection",
            "request_time": "2023-11-09T02:44:04-05:00",
            "url": "https://aqs.epa.gov/data/api/monitors/byBox?email=andixit@uw.edu&key=ecruosprey95&param=81102,88101,88502&bdate=20020602&edate=20020603&minlat=44.790505797101446&maxlat=48.413694202898554&minlon=118.21652271062271&maxlon=122.79527728937728",
            "rows": 0
        }
    ],
    "Data": []
}


In [34]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = 31
request_data['county'] = 171

# request daily summary data for the month of Jan in 2020
gaseous_aqi = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20200102")
print("Response for the gaseous pollutants ...")
#
if gaseous_aqi["Header"][0]['status'] == "Success":
    print(json.dumps(gaseous_aqi['Data'],indent=4))
elif gaseous_aqi["Header"][0]['status'].startswith("No data "):
    print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
else:
    print(json.dumps(gaseous_aqi,indent=4))

request_data['param'] = AQI_PARAMS_PARTICULATES
# request daily summary data for the month of Jan in 2020
particulate_aqi = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20200102")
print("Response for the particulate pollutants ...")
#
if particulate_aqi["Header"][0]['status'] == "Success":
    print(json.dumps(particulate_aqi['Data'],indent=4))
elif particulate_aqi["Header"][0]['status'].startswith("No data "):
    print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
else:
    print(json.dumps(particulate_aqi,indent=4))


Response for the gaseous pollutants ...
Looks like the response generated no data. You might take a closer look at your request and the response data.
Response for the particulate pollutants ...
[
    {
        "state_code": "31",
        "county_code": "171",
        "site_number": "9000",
        "parameter_code": "88502",
        "poc": 1,
        "latitude": 41.888789,
        "longitude": -100.339141,
        "datum": "WGS84",
        "parameter": "Acceptable PM2.5 AQI & Speciation Mass",
        "sample_duration_code": "7",
        "sample_duration": "24 HOUR",
        "pollutant_standard": null,
        "date_local": "2020-01-01",
        "units_of_measure": "Micrograms/cubic meter (LC)",
        "event_type": "No Events",
        "observation_count": 1,
        "observation_percent": 100.0,
        "validity_indicator": "Y",
        "arithmetic_mean": 1.2,
        "first_max_value": 1.2,
        "first_max_hour": 0,
        "aqi": 5,
        "method_code": "707",
        "metho