In [16]:
import json
import os
import time

import numpy as np
import pandas as pd
import requests

In [17]:
# Run only once
# response = requests.get("https://aqs.epa.gov/data/api/signup?email=suchiths@vt.edu")

In [18]:
def get_request(endpoint, parameters=None):
    """Send a GET request to the EPA AQS data API and return the decoded JSON body.

    Args:
        endpoint: Path appended to the API base URL, e.g. 'list/states'.
        parameters: Optional dict of query parameters. The dict is copied and
            never modified; the account 'email' and 'key' entries are added
            to the copy and override any caller-supplied values.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    api_url = 'https://aqs.epa.gov/data/api/'

    # Work on a private copy: a shared default dict or the caller's own dict
    # must not pick up the credentials as a side effect of the call.
    query = dict(parameters) if parameters else {}

    # NOTE(review): credentials should not live in the notebook. The
    # AQS_EMAIL / AQS_KEY environment variables take precedence; the literals
    # remain only as a fallback so existing runs keep working.
    query['email'] = os.environ.get('AQS_EMAIL', 'suchiths@vt.edu')
    query['key'] = os.environ.get('AQS_KEY', 'silvermallard37')

    response = requests.get(api_url + endpoint, params=query)
    if response.status_code != 200:
        # Best effort: report the failure, then still try to decode the body.
        print("Request to {} failed. Error code {}:{}".format(api_url + endpoint, response.status_code, response.text))

    return json.loads(response.text)

In [19]:
# Build the state code -> state name lookup, leaving out the
# 'Country Of Mexico' and 'Canada' entries returned by the endpoint.
response = get_request('list/states')
skip = ['Country Of Mexico', 'Canada']
states = {
    entry['code']: entry['value_represented']
    for entry in response['Data']
    if entry['value_represented'] not in skip
}

In [20]:

# Fetch the parameters of the 'AQI POLLUTANTS' class, dropping codes
# 81102 and 88101, and keep them as a code -> name lookup.
response = get_request('list/parametersByClass', {'pc':'AQI POLLUTANTS'})
excluded_codes = ('81102', '88101')
pollutants = {
    entry['code']: entry['value_represented']
    for entry in response['Data']
    if entry['code'] not in excluded_codes
}

# Comma-separated list of the remaining codes (used as the 'param' request value).
pollutants_str = ','.join(pollutants)
print(pollutants_str)

42101,42401,42602,44201,88502


In [21]:
# Show the pollutant code -> name mapping that was kept.
print(pollutants)

{'42101': 'Carbon monoxide', '42401': 'Sulfur dioxide', '42602': 'Nitrogen dioxide (NO2)', '44201': 'Ozone', '88502': 'Acceptable PM2.5 AQI & Speciation Mass'}


In [22]:
# Download the annual summaries for every state and year, one request per
# (state, year) pair covering all selected pollutants.
# The 6 s pause per request means each state takes at least 90 seconds.
data = list()
raw_df = pd.DataFrame()
failed_requests = list()  # (state, year) pairs whose response had no 'Data' list
for state in states.keys():
    for year in range(2005, 2020, 1):  # 2005 - 2019 inclusive
        # NOTE(review): bdate and edate are both Jan 1 of the year; presumably the
        # annual endpoint only looks at the year part - confirm against the AQS docs.
        year_str = '{}0101'.format(year)
        params = {'bdate':year_str,
                  'edate':year_str,
                  'state':state,
                  'param':pollutants_str}
        response = get_request('annualData/byState', params)
        time.sleep(6) #ensure rate/limit of 10 calls per min is adhered

        # A response without a 'Data' list must not abort the whole download:
        # record the pair so it can be retried, and carry on.
        records = response.get('Data') if isinstance(response, dict) else None
        if records is None:
            failed_requests.append((state, year))
            print("No data returned for state {} in {}; skipping.".format(state, year))
            continue

        # Accumulate the raw records; the data frame is built in a later cell.
        data.extend(records)

In [28]:
# Build the frame directly from the list of record dicts so every value is
# matched to its column by key, not by its position within the record.
if not data:
    raise ValueError('No records were downloaded; nothing to export.')
df = pd.DataFrame(data)

# Columns left out of the exported CSV; everything else is kept.
remove = ['local_site_name', 'state_code', 'county_code', 'event_type', 'site_number', 'parameter_code', 'poc', 'pollutant_standard', 'method', 'event-type', 'observation_percent', 'required_day_count', 'exceptional_data_count', 'primary_exceedance_count', 'secondary_exceedance_count', 'certification_indicator', 'first_max_nonoverlap_value', 'first_max_n_o_datetime', 'second_max_nonoverlap_value', 'second_max_n_o_datetime', 'site_address', 'cbsa', 'date_of_last_change']
features = [col for col in df.columns if col not in remove]

df[features].to_csv('epa_pollution.csv', index=False)

In [27]:
# Dimensions (rows, columns) of the assembled frame.
df.shape

(159367, 55)

In [24]:
# NOTE(review): 'raw_epa.csv' is not written by any cell visible here, and
# `read` is not used by the visible cells that follow - confirm this cell is still needed.
read = pd.read_csv('raw_epa.csv')

In [66]:
# Keep only the rows that carry a CBSA code, then list the distinct
# CBSA codes and pollutant parameters found in them.
df_complete = df.loc[df['cbsa_code'].notnull()]
cbsa = df_complete['cbsa_code'].unique()
poll = df_complete['parameter'].unique()

# Write the slice for the first CBSA / first pollutant pair to a CSV file.
first_pair_mask = df_complete['cbsa_code'].eq(cbsa[0]) & df_complete['parameter'].eq(poll[0])
df_complete[first_pair_mask].to_csv('test.csv')
