In [2]:
import requests 
import pandas as pd
import numpy as np
import json
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Run only once
# response = requests.get("https://aqs.epa.gov/data/api/signup?email=suchiths@vt.edu")

In [3]:
def get_request(endpoint, parameters=dict()):
    api_url = 'https://aqs.epa.gov/data/api/'
    parameters['email'] = 'suchiths@vt.edu'
    parameters['key'] = 'silvermallard37'
    
    response = requests.get(api_url + endpoint, params=parameters)
    if response.status_code != 200:
        print("Request to {} failed. Error code {}:{}".format(api_url + endpoint, response.status_code, response.text))
    
    response = json.loads(response.text)
    return response

In [4]:
# Retrieve all states 
response = get_request('list/states')
states = dict()
for state in response['Data']:
    skip = ['Country Of Mexico', 'Canada']
    if state['value_represented'] not in skip:
        states[state['code']] = state['value_represented']

In [5]:

# Retrieve pollution metrics
response = get_request('list/parametersByClass', {'pc':'AQI POLLUTANTS'})
pollutants = dict()
for pollutant in response['Data']:
    if pollutant['code'] not in ['81102', '88101']:
        pollutants[pollutant['code']] = pollutant['value_represented']

pollutants_str = ','.join(pollutants.keys())
print(pollutants_str)

42101,42401,42602,44201,88502


In [6]:
print(pollutants)

{'42101': 'Carbon monoxide', '42401': 'Sulfur dioxide', '42602': 'Nitrogen dioxide (NO2)', '44201': 'Ozone', '88502': 'Acceptable PM2.5 AQI & Speciation Mass'}


In [7]:
#Determine cbsa search space (Census has the smaller amount)
census = pd.read_csv('./census_acs.csv')
cbsa_possibilites = census['cbsa_code'].unique()

In [8]:
# attributes = ['state_code', 'county_code', 'cbsa_code', 'site_number', 'year', '
data = list()
raw_df = pd.DataFrame()
for cbsa in cbsa_possibilites:
    for state in states.keys():
        for year in range(2005, 2020, 1): #2014 - 2019
            year_str = '{}0101'.format(year)
            params = {'bdate':year_str,
                      'edate':year_str,
                      'param':pollutants_str,
                      'cbsa':cbsa}
            response = get_request('annualData/byCBSA', params)
            time.sleep(6) #ensure rate/limit of 10 calls per min is adhered

            # Add to data frame
            data.extend(response['Data'])

ConnectionError: HTTPSConnectionPool(host='aqs.epa.gov', port=443): Max retries exceeded with url: /data/api/annualData/byCBSA?bdate=20080101&edate=20080101&param=42101%2C42401%2C42602%2C44201%2C88502&cbsa=10740&email=suchiths%40vt.edu&key=silvermallard37 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001FC61650460>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [29]:
# convert data for dataframe creation
data_df = [list(item.values()) for item in data]
    
df = pd.DataFrame(data_df, columns=data[0].keys())

# features = ['latitude', 'longitude', 'datum', 'parameter', 'sample_duration', 'pollutant_standard', 'metric_used', 'year', 'units_of_measure', 'observation_count', 'validity_indicator', 'required_day_count', 'null_observation_count', 'arithmetic_mean', 'standard_deviation', 'first_max_value', 'first_max_date
remove = ['local_site_name', 'state_code', 'county_code', 'event_type', 'site_number', 'parameter_code', 'poc', 'pollutant_standard', 'method', 'event-type', 'observation_percent', 'required_day_count', 'exceptional_data_count', 'primary_exceedance_count', 'secondary_exceedance_count', 'certification_indicator', 'first_max_nonoverlap_value', 'first_max_n_o_datetime', 'second_max_nonoverlap_value', 'second_max_n_o_datetime', 'site_address', 'cbsa', 'date_of_last_change']
features = [col for col in df.columns if col not in remove]

df[features].to_csv('epa_pollution.csv', index=False)

In [27]:
df.shape

(159367, 55)

In [24]:
read = pd.read_csv('raw_epa.csv')

In [66]:
# df['location'] = [(i, j) for i, j in zip(df['latitude'], df['longitude'])]
df_complete = df[df['cbsa_code'].isnull() == False]
cbsa = df_complete['cbsa_code'].unique()
poll = df_complete['parameter'].unique()

df_complete[(df_complete['cbsa_code'] == cbsa[0]) & (df_complete['parameter'] == poll[0])].to_csv('test.csv')


# AQI Index Report

In [63]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Create driver for the EPA website
driver = webdriver.Edge(r'C:\Users\Suchi\Documents\edgedriver_win32\msedgedriver.exe')
driver.get("https://www.epa.gov/outdoor-air-quality-data/air-quality-index-report")

# Retrive drop down options
driver.implicitly_wait(10)
year_select = Select(driver.find_element_by_id("year"))
year_options = [str(i) for i in range(2018, 2020)]
year_select.select_by_visible_text('2021') #dummy text to unlock state drop down
state_select = Select(driver.find_element_by_id("state"))
state_options = [o.text for o in state_select.options]
state_options.pop(0)


skip = ['Country Of Mexico', 'Canada', 'Virgin Islands']
for year in year_options:
    time.sleep(5)
    for state in state_options:
        if state in skip: break
        #Select year
        time.sleep(.11)
        year_select.select_by_visible_text(year)
        #Select state
        driver.implicitly_wait(1)
        state_select.select_by_visible_text(state)
        #Generate csv
        driver.find_element_by_id("launch").click()
        WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="results"]/p[2]/a[2]')))
        #download csv
        driver.find_element_by_xpath('//*[@id="results"]/p[2]/a[2]').click()
    
driver.quit()

#### Combine all csv into one dataframe
            

In [67]:
import os
import re
import pandas as pd

directory = os.fsencode("./aqi_data")
   
aqi_df = pd.DataFrame()
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    curr_df = pd.read_csv("./aqi_data/" + filename, na_values='.')
    
    #add the year
    year = re.search(r"aqireport([0-9]*) .+", filename)
    if not year:
        year = re.search(r"aqireport([0-9]*).csv", filename)
    curr_df['year'] = [year[1]] * curr_df.shape[0]  
        
    #append df to master df
    aqi_df = pd.concat([aqi_df, curr_df], ignore_index=True)
    

In [68]:
aqi_df.head()

Unnamed: 0,CBSA Code,CBSA,# Days with AQI,Good,Moderate,Unhealthy for Sensitive Groups,Unhealthy,Very Unhealthy,AQI Maximum,AQI 90th Percentile,AQI Median,# Days CO,# Days NO2,# Days O3,# Days SO2,# Days PM2.5,# Days PM10,year
0,11260,"Anchorage, AK",303,252.0,48.0,2.0,1.0,,151,57.0,23.0,107.0,,,,96.0,100.0,2005
1,21820,"Fairbanks, AK",225,188.0,34.0,2.0,1.0,,153,69.0,23.0,134.0,,,,91.0,,2005
2,27940,"Juneau, AK",115,99.0,14.0,2.0,,,125,57.0,20.0,,,,,110.0,5.0,2005
3,10500,"Albany, GA",116,53.0,62.0,1.0,,,107,75.0,52.0,,,,,111.0,5.0,2005
4,11140,"Americus, GA",244,175.0,65.0,4.0,,,140,80.0,42.0,,,244.0,,,,2005


In [69]:
aqi_df['cbsa_code'] = aqi_df['CBSA Code']
aqi_df.fillna(0, inplace=True)
aqi_df.drop(columns=["CBSA", "CBSA Code"]).to_csv("aqi_report.csv", index=False)