In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported modules
# (e.g. the local data_fetcher / preprocessing files) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import pprint
from pathlib import Path
from random import randint

import matplotlib.pyplot as plt
import pandas as pd
from dateutil.relativedelta import relativedelta

# Local imports
from data_fetcher import DataFetcher
from preprocessing import Processor

In [3]:
# Named string constants for the query endpoints, so the cells below can pass a
# readable name (e.g. LIST_STATES) instead of retyping the raw path.

# --- Lookup ("list") endpoints; these are handed to datafetcher.get_codes below ---
LIST_STATES = 'list/states'
LIST_COUNTIES_BY_STATE = 'list/countiesByState'
LIST_SITES_BY_COUNTY = 'list/sitesByCounty'
LIST_CBSAs = 'list/cbsas'
LIST_PARAM_CLASSES = 'list/classes'
LIST_PARAM_IN_CLASS = 'list/parametersByClass'

# --- Sample-data endpoints, one per geographic granularity ---
SAMPLE_DATA_BY_SITE = 'sampleData/bySite'
SAMPLE_DATA_BY_COUNTY = 'sampleData/byCounty'
SAMPLE_DATA_BY_STATE = 'sampleData/byState'
SAMPLE_DATA_BY_BOX = 'sampleData/byBox'
SAMPLE_DATA_BY_CBSA = 'sampleData/byCBSA'

In [4]:
# Single DataFetcher instance (from the local data_fetcher module) shared by every query cell below.
datafetcher = DataFetcher()

In [5]:
# Example code lookups: resolve a human-readable name to its code via get_codes.

# 1) State name -> state code
cali_code = datafetcher.get_codes(
    LIST_STATES,
    all=False,
    value='California',
)
print('California state code:', cali_code)

# 2) Parameter-class name -> class code
criteria_code = datafetcher.get_codes(
    LIST_PARAM_CLASSES,
    all=False,
    value='Criteria Pollutants',
)
print('Criteria polutants code:', criteria_code)

# 3) Parameter name -> parameter code; this lookup is scoped to the class code
#    found in step 2, which is forwarded through nparams as 'pc'.
carbon_monoxide_code = datafetcher.get_codes(
    LIST_PARAM_IN_CLASS,
    all=False,
    value='Carbon monoxide',
    nparams={'pc': criteria_code},
)
print('Carbon monoxide code:', carbon_monoxide_code)

California state code: 06
Criteria polutants code: CRITERIA
Carbon monoxide code: 42101


In [6]:
# Display the fetcher's code table; the output below shows it indexed by `code`
# with a `value_represented` column.
datafetcher.all_codes

Unnamed: 0_level_0,value_represented
code,Unnamed: 1_level_1
11101,Suspended particulate (TSP)
11102,Suspended particulate (TSP) LC
11103,Benzene soluble organics (TSP)
11104,Total polynuclear hydrocarbons
11114,Windblown particulate
...,...
88500,PM2.5 Total Atmospheric
88501,PM2.5 Raw Data
88502,Acceptable PM2.5 AQI & Speciation Mass
88503,PM2.5 Volatile Channel


In [7]:
# Look up a parameter code by name. The result line shown below is presumably printed
# because verbose=True; binding the return value to `_` keeps it from also being echoed
# as the cell's output.
_ = datafetcher.find_code('Cristabalite', verbose=True)

Cristabalite code is: 11122


## Explore data by area codes to find a good location for modelling

We want to find a site, or a small group of sites, that has enough data for us to train a useful model. We need this set to contain meteorological, ozone, particulate matter, and VOC data.

In [19]:
# Survey the candidate sites. The result (shown below) maps each site name to a list
# of 0/1 flags; it is kept in `r` because the filtering cell further down reads it.
r = datafetcher.find_best_location()
r

Searching county 037 in state 06... Found 51 sites.


{'Azusa': [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0],
 'Glendora': [1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0],
 'El Monte': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'SB25 trailer at Hollenbeck School': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'Wilmington-N. Mahar Ave': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'West Los Angeles': [1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0],
 'Carson': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'Commerce-Ayers Ave': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'City of Industry-Volkswagon': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'City of Industry-Whitco': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'Commerce-AT&SF RR': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'SITE IS LOCATED ONE HALF MILE EAST OF THE I-57/I-60 INTERCHANGE': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'Burbank': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'UNKNOWN COORDINATE LOCATION': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'ON BUILDING': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'Los Angeles-North Main Street': [1, 1, 

Right now the most important parameters we need are Ozone (i = 2) and PM2.5 (i = 3), where i is the position in each site's list above: i < 2 are the other criteria pollutants and i > 3 are meteorological variables.

In [20]:

# Shortlist the sites worth modelling: a site qualifies when at least 4 of its flags
# are set AND the flags at positions 2 and 3 (Ozone and PM2.5, per the note above)
# are both 1. Each match is stored as a single-entry {site: flags} dict.
best = []
for site_name, flags in r.items():
    if not sum(flags) >= 4:
        continue
    if flags[2] != 1 or flags[3] != 1:
        continue
    best.append({site_name: flags})
best

[{'Azusa': [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0]},
 {'Los Angeles-North Main Street': [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]},
 {'Reseda': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]},
 {'Compton': [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0]},
 {'Pico Rivera #2': [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0]},
 {'Pasadena': [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0]},
 {'Lancaster-Division Street': [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0]}]

From the results above we will proceed with <b>Los Angeles-North Main Street</b> as our primary location to model. Notice that it has data on all our criteria pollutants, but no data on Mixing Height and Rain/Melt precipitation, although I believe I might have the wrong parameter for the latter (there are multiple rain params?).

In [21]:
# Example of Site data using Los Angeles-North Main Street, Los Angeles, California.
# The state / county codes are named once so the lookup and the dataset request
# are guaranteed to target the same area.
la_state_code, la_county_code = '06', '037'

site_code = datafetcher.get_codes(
    LIST_SITES_BY_COUNTY,
    all=False,
    value='Los Angeles-North Main Street',
    nparams={'state': la_state_code, 'county': la_county_code},
)

# NOTE(review): the end date is 20181212 (12 Dec 2018) — confirm that 20181231 was not intended.
# NOTE(review): the output below contains repeated timestamps whose Carbon monoxide
# values differ — presumably several monitors report for the same hour; confirm
# whether these rows should be aggregated or de-duplicated before modelling.
df = datafetcher.create_dataset(
    20180101,
    20181212,
    site=site_code,
    county=la_county_code,
    state=la_state_code,
    processed=True,
    verbose=False,
)
df

No data for Mixing Height
No data for Rain/melt precipitation


Unnamed: 0_level_0,Carbon monoxide,Nitrogen dioxide (NO2),Ozone,PM2.5 - Local Conditions,Wind Direction - Resultant,Outdoor Temperature,Relative Humidity,Solar radiation,Ultraviolet radiation,Barometric pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01 00:00:00,1.398,27.2,0.002,61.4,49.0,51.8,87.0,0.0,0.0,1009.0
2018-01-01 00:00:00,1.500,27.2,0.002,61.4,49.0,51.8,87.0,0.0,0.0,1009.0
2018-01-01 01:00:00,1.460,27.8,0.001,,35.0,51.4,84.0,0.0,0.0,1009.0
2018-01-01 01:00:00,1.600,27.8,0.001,,35.0,51.4,84.0,0.0,0.0,1009.0
2018-01-01 02:00:00,1.436,27.9,0.002,,43.0,50.9,81.0,0.0,0.0,1009.0
...,...,...,...,...,...,...,...,...,...,...
2018-12-12 21:00:00,1.200,33.3,0.001,,27.0,55.6,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,,30.1,0.001,,41.0,54.8,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,1.300,30.1,0.001,,41.0,54.8,88.0,0.0,0.0,1010.0
2018-12-12 23:00:00,,27.7,0.001,,44.0,53.9,89.0,0.0,0.0,1010.0


In [23]:
# Check if we are missing data for any measurement: build a per-row mask that is
# True when at least one column is NaN, then show only those rows.
has_missing_value = df.isna().any(axis=1)
df[has_missing_value]

# TODO: interpolate

Unnamed: 0_level_0,Carbon monoxide,Nitrogen dioxide (NO2),Ozone,PM2.5 - Local Conditions,Wind Direction - Resultant,Outdoor Temperature,Relative Humidity,Solar radiation,Ultraviolet radiation,Barometric pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01 01:00:00,1.460,27.8,0.001,,35.0,51.4,84.0,0.0,0.0,1009.0
2018-01-01 01:00:00,1.600,27.8,0.001,,35.0,51.4,84.0,0.0,0.0,1009.0
2018-01-01 02:00:00,1.436,27.9,0.002,,43.0,50.9,81.0,0.0,0.0,1009.0
2018-01-01 02:00:00,1.500,27.9,0.002,,43.0,50.9,81.0,0.0,0.0,1009.0
2018-01-01 03:00:00,1.418,28.7,0.001,,38.0,50.3,81.0,0.0,0.0,1009.0
...,...,...,...,...,...,...,...,...,...,...
2018-12-12 21:00:00,1.200,33.3,0.001,,27.0,55.6,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,,30.1,0.001,,41.0,54.8,88.0,0.0,0.0,1010.0
2018-12-12 22:00:00,1.300,30.1,0.001,,41.0,54.8,88.0,0.0,0.0,1010.0
2018-12-12 23:00:00,,27.7,0.001,,44.0,53.9,89.0,0.0,0.0,1010.0


In [24]:
# Persist the assembled dataset to ./data/sample.csv for later use.
# DataFrame.to_csv does not create missing directories, so make sure the target
# folder exists first; otherwise this cell fails on a fresh checkout that has no
# ./data directory yet. exist_ok=True keeps re-runs of the cell harmless.
sample_csv_path = Path('./data/sample.csv')
sample_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(sample_csv_path)