In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import json
import os
import pprint
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

# Local imports
from data_fetcher import DataFetcher
from preprocessing import Processor

In [3]:
# Define some string constants for easy typing
# NOTE(review): these look like API endpoint paths consumed by DataFetcher — confirm.

# Sample-data request types, one per geographic granularity.
# (None of these is referenced by the cells visible in this notebook.)
SAMPLE_DATA_BY_SITE = 'sampleData/bySite'
SAMPLE_DATA_BY_COUNTY = 'sampleData/byCounty'
SAMPLE_DATA_BY_STATE = 'sampleData/byState'
SAMPLE_DATA_BY_BOX = 'sampleData/byBox'
SAMPLE_DATA_BY_CBSA = 'sampleData/byCBSA'

# Code-list request types; several are passed as the first argument to
# datafetcher.get_codes in the cells below.
LIST_STATES = 'list/states'
LIST_COUNTIES_BY_STATE = 'list/countiesByState'
LIST_SITES_BY_COUNTY = 'list/sitesByCounty'
LIST_CBSAs = 'list/cbsas'
LIST_PARAM_CLASSES = 'list/classes'
LIST_PARAM_IN_CLASS = 'list/parametersByClass'

In [4]:
# Single DataFetcher instance shared by every lookup and dataset call below.
datafetcher = DataFetcher()

In [5]:
# Example lookups: resolve human-readable names to their codes.

# State name -> state code.
cali_code = datafetcher.get_codes(
    LIST_STATES,
    all=False,
    value='California',
)
print('California state code:', cali_code)

# Parameter class name -> class code.
criteria_code = datafetcher.get_codes(
    LIST_PARAM_CLASSES,
    all=False,
    value='Criteria Pollutants',
)
print('Criteria polutants code:', criteria_code)

# Parameter name -> parameter code; the class code found above narrows the search.
carbon_monoxide_code = datafetcher.get_codes(
    LIST_PARAM_IN_CLASS,
    all=False,
    value='Carbon monoxide',
    nparams={'pc': criteria_code},
)
print('Carbon monoxide code:', carbon_monoxide_code)

California state code: 06
Criteria polutants code: CRITERIA
Carbon monoxide code: 42101


In [6]:
datafetcher.all_codes

Unnamed: 0_level_0,value_represented
code,Unnamed: 1_level_1
11101,Suspended particulate (TSP)
11102,Suspended particulate (TSP) LC
11103,Benzene soluble organics (TSP)
11104,Total polynuclear hydrocarbons
11114,Windblown particulate
...,...
88500,PM2.5 Total Atmospheric
88501,PM2.5 Raw Data
88502,Acceptable PM2.5 AQI & Speciation Mass
88503,PM2.5 Volatile Channel


In [7]:
_ = datafetcher.find_code('Cristabalite', verbose=True)

Cristabalite code is: 11122


## Explore data by area codes to find good location for modelling

We want to find a site, or a small group of sites, that has enough data for us to train a useful model. We need this set to contain meteorological, ozone, particulate matter, and pollutant data.

We sample a day per year for 5 of the last 20 years and find the site with the best data for this particular county/state pair. We do this sampling because the API will lock us out if we try to get too much yearly data.

This code takes roughly 30 minutes to run, and it finds that Los Angeles-North Main Street had the most data.

In [8]:
# find_best_location() takes roughly 30 minutes and the API rate-limits heavy use,
# so its result is cached in data.json. The cached copy is used whenever it exists;
# the search is only re-run (and re-saved) when the file is missing.
try:
    with open('data.json', 'r') as f:
        r = json.load(f)
except FileNotFoundError:
    r = datafetcher.find_best_location()
    with open('data.json', 'w') as fp:
        json.dump(r, fp)

Let's find which sites have the most data

In [23]:
data = r['Data']
metadata = r['Metadata']

if not data:
    raise ValueError("r['Data'] contains no sites to rank")

# Every site is expected to hold a grid with the same layout (one row per parameter
# code, one entry per sampled date), so the dimensions are read from whichever site
# comes first rather than from a hard-coded site name.
reference_grid = next(iter(data.values()))
num_codes = len(reference_grid)
num_years = len(reference_grid[0])

# For each site: how many parameter codes had data on each sampled date.
yearly_results = {
    site: [sum(grid[code][year] for code in range(num_codes)) for year in range(num_years)]
    for site, grid in data.items()
}

# For each site: (indices of the sampled dates with the highest count, that count).
# max() is evaluated once per site instead of once per sampled date.
yearly_best = {}
for site, totals in yearly_results.items():
    best_total = max(totals)
    best_indices = [idx for idx, total in enumerate(totals) if total == best_total]
    yearly_best[site] = (best_indices, best_total)

# Rank sites by their best count, highest first.
yearly_best_sorted = sorted(yearly_best.items(), key=lambda item: item[1][1], reverse=True)
yearly_best_sorted[:5]  # NOTE: (site, (indices of the sampled dates with the most measurements, number of measurements))

[('Burbank', ([2], 13)),
 ('Los Angeles-North Main Street', ([3, 4], 13)),
 ('Azusa', ([1], 12)),
 ('Pico Rivera #2', ([2], 12)),
 ('Santa Clarita', ([2], 12))]

In [10]:
# Column labels: "start:end" for every sampled date range.
dates = [start + ':' + end for start, end in metadata['dates']]

# Row labels: the descriptive name of every parameter code that was sampled.
codes = [
    datafetcher.all_codes.loc[code, 'value_represented']
    for code in metadata['codes']
]

# Availability grid for the chosen station, labelled on both axes.
ndf = pd.DataFrame(
    data['Los Angeles-North Main Street'],
    index=codes,
    columns=dates,
)
ndf  # NOTE: table of which parameters have values at the chosen station on each sampled date

Unnamed: 0,20000528:20000529,20051030:20051031,20100301:20100302,20150413:20150414,20200702:20200703
Carbon monoxide,True,True,True,True,True
Nitrogen dioxide (NO2),True,True,True,True,True
Ozone,True,True,True,True,True
PM2.5 - Local Conditions,True,True,True,True,True
Wind Direction - Resultant,True,True,False,True,True
Wind Speed - Resultant,True,True,False,True,True
Outdoor Temperature,True,True,True,True,True
Relative Humidity,True,True,True,True,True
Solar radiation,True,False,False,True,True
Ultraviolet radiation,False,False,False,True,True


From the results above we will proceed with <b>Los Angeles-North Main Street</b> as our primary location to model. Notice that it has data on all our criteria pollutants and MET variables. We now proceed to find the amount of VOC data we have for these sites with the same sampling as before.

<b>SAVE CODES AND YEARS IN DICT TOO, USE DATES TO SEARCH FOR VOC</b>

In [25]:
# Keep the five highest-ranked sites from the ranking above.
top_ranked = yearly_best_sorted[:5]

# Site names, in ranking order.
best_sites = [site_name for site_name, _ in top_ranked]

# Site codes, looked up by name within state '06' / county '037'.
best_sites_codes = [
    datafetcher.get_codes(
        LIST_SITES_BY_COUNTY,
        all=False,
        value=site_name,
        nparams={'state': '06', 'county': '037'},
    )
    for site_name in best_sites
]

# For each site, the sampled date ranges on which it had the most measurements.
best_sites_dates = [
    [metadata['dates'][date_idx] for date_idx in best_indices]
    for _, (best_indices, _) in top_ranked
]
best_sites

['Burbank',
 'Los Angeles-North Main Street',
 'Azusa',
 'Pico Rivera #2',
 'Santa Clarita']

In [29]:
# find_voc_availability() is slow (it reports progress site by site), so its result is
# cached in voc_data.json. The cache is reused only when it covers exactly the sites
# selected above; if the file is missing or was built for a different set of sites,
# the availability is fetched again and the cache is rewritten.
voc_r = None
try:
    with open('voc_data.json', 'r') as f:
        cached_voc_r = json.load(f)
    # The 'Data' entry is keyed by site name, so its keys identify what the cache covers.
    if set(cached_voc_r['Data']) == set(best_sites):
        voc_r = cached_voc_r
except FileNotFoundError:
    # No cache yet: fall through to the fetch below.
    pass

if voc_r is None:
    voc_r = datafetcher.find_voc_availability(best_sites, best_sites_codes, best_sites_dates)
    with open('voc_data.json', 'w') as f:
        json.dump(voc_r, f)

Finished site 1002, Burbank
Finished site 1103, Los Angeles-North Main Street
Finished site 0002, Azusa
Finished site 1602, Pico Rivera #2
Finished site 6012, Santa Clarita


In [30]:
voc_by_site = voc_r['Data']

# NOTE(review): voc_by_site is a dict keyed by site name, so np.array() presumably wraps
# it in a 0-d object array rather than building a numeric grid; voc_data is not used in
# the cells visible here — confirm whether it is still needed.
voc_data = np.array(voc_by_site)

# One column per site, one row per VOC parameter code; each cell lists the
# availability flags for that site's sampled dates.
voc_df = pd.DataFrame(voc_by_site)
voc_df.index = voc_r['Metadata']['codes']
voc_df.head(5)

Unnamed: 0,Burbank,Los Angeles-North Main Street,Azusa,Pico Rivera #2,Santa Clarita
43000,[False],"[False, True]",[True],[False],[False]
43102,[False],"[False, True]",[True],[False],[False]
43202,[False],"[False, True]",[True],[False],[False]
43203,[False],"[False, True]",[True],[False],[False]
43204,[False],"[False, True]",[True],[False],[False]


In [80]:
# For each site, count how many VOC codes had data on each of its sampled dates
# (summing the per-code availability flags down the code axis).
voc_site_results = {
    site_name: np.array(availability).sum(axis=0)
    for site_name, availability in voc_r['Data'].items()
}
voc_site_results

{'Burbank': array([0]),
 'Los Angeles-North Main Street': array([ 0, 59]),
 'Azusa': array([56]),
 'Pico Rivera #2': array([0]),
 'Santa Clarita': array([0])}

In [79]:
# Date ranges recorded alongside the VOC results; judging by the output, one list of
# [start, end] pairs per site, in the same order as best_sites.
voc_r['Metadata']['dates']

[[['20100301', '20100302']],
 [['20150413', '20150414'], ['20200702', '20200703']],
 [['20051030', '20051031']],
 [['20100301', '20100302']],
 [['20100301', '20100302']]]

### Explore dataset for chosen site with VOC data

Los Angeles-North Main Street had the most CRITERIA, MET, and VOC data (almost all the PAMS_VOCS are in this data set) <b>for the sampled date in 2020</b>

In [83]:
# Index 1 is Los Angeles-North Main Street in both lists (see the ranking output above).
print(yearly_best_sorted[1])  # (site name, (best sampled-date indices, measurement count))
print(best_sites_codes[1])  # site code for that station

('Los Angeles-North Main Street', ([3, 4], 13))
1103


In [90]:
# NOTE(review): scratch check of in-place list extension with +=; it is unrelated to
# the analysis and looks like a candidate for removal.
a = [2]
a += [1,1,1]
a

[2, 1, 1, 1]

In [93]:
# Full-year 2020 dataset for site 1103, including VOC measurements.
# NOTE: this call crashes the author's local kernel, which cannot handle the computation.
df = datafetcher.create_dataset(
    20200101,
    20201231,
    site='1103',
    county='037',
    state='06',
    processed=True,
    verbose=False,
    vocs=True,
)
df

### Explore dataset for chosen site without VOC data

In [None]:
# Resolve the station's site code from its name within state '06' / county '037'.
site_code = datafetcher.get_codes(
    LIST_SITES_BY_COUNTY,
    all=False,
    value='Los Angeles-North Main Street',
    nparams={'state': '06', 'county': '037'},
)

# Example of site data using Los Angeles-North Main Street, Los Angeles, California.
# NOTE(review): the end date is 12 Dec 2018, not 31 Dec — confirm this is intended.
df = datafetcher.create_dataset(
    20180101,
    20181212,
    site=site_code,
    county='037',
    state='06',
    processed=True,
    verbose=False,
)
df

Unnamed: 0_level_0,Carbon monoxide,Nitrogen dioxide (NO2),Ozone,PM2.5 - Local Conditions,Wind Direction - Resultant,Wind Speed - Resultant,Outdoor Temperature,Relative Humidity,Solar radiation,Ultraviolet radiation,Barometric pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 00:00:00,1.398,27.2,0.002,61.4,49.0,3.2,51.8,87.0,0.0,0.0,1009.0
2018-01-01 00:00:00,1.500,27.2,0.002,61.4,49.0,3.2,51.8,87.0,0.0,0.0,1009.0
2018-01-01 01:00:00,1.460,27.8,0.001,,35.0,2.9,51.4,84.0,0.0,0.0,1009.0
2018-01-01 01:00:00,1.600,27.8,0.001,,35.0,2.9,51.4,84.0,0.0,0.0,1009.0
2018-01-01 02:00:00,1.436,27.9,0.002,,43.0,3.8,50.9,81.0,0.0,0.0,1009.0
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-12 21:00:00,1.200,33.3,0.001,,27.0,2.7,55.6,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,,30.1,0.001,,41.0,2.3,54.8,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,1.300,30.1,0.001,,41.0,2.3,54.8,88.0,0.0,0.0,1010.0
2018-12-12 23:00:00,,27.7,0.001,,44.0,3.1,53.9,89.0,0.0,0.0,1010.0


In [None]:
# Show every row in which at least one measurement column is missing (NaN).
has_missing_value = df.isna().any(axis=1)
df[has_missing_value]

# TODO: interpolate

Unnamed: 0_level_0,Carbon monoxide,Nitrogen dioxide (NO2),Ozone,PM2.5 - Local Conditions,Wind Direction - Resultant,Wind Speed - Resultant,Outdoor Temperature,Relative Humidity,Solar radiation,Ultraviolet radiation,Barometric pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 01:00:00,1.460,27.8,0.001,,35.0,2.9,51.4,84.0,0.0,0.0,1009.0
2018-01-01 01:00:00,1.600,27.8,0.001,,35.0,2.9,51.4,84.0,0.0,0.0,1009.0
2018-01-01 02:00:00,1.436,27.9,0.002,,43.0,3.8,50.9,81.0,0.0,0.0,1009.0
2018-01-01 02:00:00,1.500,27.9,0.002,,43.0,3.8,50.9,81.0,0.0,0.0,1009.0
2018-01-01 03:00:00,1.418,28.7,0.001,,38.0,3.9,50.3,81.0,0.0,0.0,1009.0
...,...,...,...,...,...,...,...,...,...,...,...
2018-12-12 21:00:00,1.200,33.3,0.001,,27.0,2.7,55.6,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,,30.1,0.001,,41.0,2.3,54.8,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,1.300,30.1,0.001,,41.0,2.3,54.8,88.0,0.0,0.0,1010.0
2018-12-12 23:00:00,,27.7,0.001,,44.0,3.1,53.9,89.0,0.0,0.0,1010.0


In [None]:
# Create the output folder first: to_csv fails if ./data does not exist yet,
# e.g. on a fresh checkout.
os.makedirs('./data', exist_ok=True)
df.to_csv('./data/sample.csv')