In [1]:
import numpy as np
import pandas as pd

In [2]:
!pwd

/apps/waq/analysis/air/hourly


In [3]:
# calculate the air quality, according to this document:
# https://www3.epa.gov/airnow/aqi-technical-assistance-document-dec2013.pdf,
# PM 2.5 and PM 10 should be added, they were not present in the first dataset


def Air_Quality (CO, SO2, NO2, O3):        
    if (CO > 12.4 or SO2 > 185 or NO2 > 360 or O3 > 0.164):
        return 'red'
    if (CO > 4.4 or SO2 > 35 or NO2 > 53 or O3 > 0):
        return 'orange'
    else:
        return 'green'

Processing data for year 2016

In [5]:
# in the next step, this will be extended to all years;
# for figuring out current exposure, current year is enough
#years = ('10', '11', '12', '13', '14', '15', '16')

no_files = "42101", "42401", "42602", "44201"

dataset = pd.DataFrame()

for j in no_files:
    file_name = "processed/hourly_%s_2016.dat" %j # %(j, i) 
    #when we will need more data, we will take the data from multiple years
            
    ## here, see which files are read in and how is the table growing now
    print file_name
    frame = pd.read_table(file_name, names = ['State_Code', 'County_Code', 'Site_Number', 'Pollutant_Code', 'Date', 'Time', 'Measurement_Value'])
    print frame.size
    dataset = pd.concat([dataset, frame])
    print dataset.size

results_2016 = dataset

processed/hourly_42101_2016.dat
518427
518427
processed/hourly_42401_2016.dat
303961
822388
processed/hourly_42602_2016.dat
720657
1543045
processed/hourly_44201_2016.dat
1402247
2945292


In [6]:
# pivot the table in order to rearrange it as needed

# important note: we are filling missing data with 0s.
# in the next step, we need to make sure no data is missing

results_2016 = results_2016.pivot_table(
                                         index = ['State_Code', 'County_Code', 'Site_Number', 'Date', 'Time'], 
                                         columns = ['Pollutant_Code'], values = 'Measurement_Value', 
                                         aggfunc = [np.sum], fill_value=0 )

In [7]:
# see how it looks now

results_2016

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Pollutant_Code,42101,42401,42602,44201
State_Code,County_Code,Site_Number,Date,Time,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
6,1,7,2016-01-01,08:00,0.0,0.0,16.8,0.007
6,1,7,2016-01-01,09:00,0.0,0.0,17.0,0.010
6,1,7,2016-01-01,10:00,0.0,0.0,15.9,0.012
6,1,7,2016-01-01,12:00,0.0,0.0,8.8,0.023
6,1,7,2016-01-01,13:00,0.0,0.0,3.4,0.030
6,1,7,2016-01-01,14:00,0.0,0.0,6.2,0.026
6,1,7,2016-01-01,15:00,0.0,0.0,14.3,0.017
6,1,7,2016-01-01,16:00,0.0,0.0,11.1,0.021
6,1,7,2016-01-01,17:00,0.0,0.0,2.2,0.031
6,1,7,2016-01-01,18:00,0.0,0.0,2.0,0.031


In [8]:
# reset the column indexing to one level
results_2016.columns = results_2016.columns.droplevel()
results_2016 = results_2016.rename_axis(None, axis = 1)
results_2016

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,42101,42401,42602,44201
State_Code,County_Code,Site_Number,Date,Time,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,1,7,2016-01-01,08:00,0.0,0.0,16.8,0.007
6,1,7,2016-01-01,09:00,0.0,0.0,17.0,0.010
6,1,7,2016-01-01,10:00,0.0,0.0,15.9,0.012
6,1,7,2016-01-01,12:00,0.0,0.0,8.8,0.023
6,1,7,2016-01-01,13:00,0.0,0.0,3.4,0.030
6,1,7,2016-01-01,14:00,0.0,0.0,6.2,0.026
6,1,7,2016-01-01,15:00,0.0,0.0,14.3,0.017
6,1,7,2016-01-01,16:00,0.0,0.0,11.1,0.021
6,1,7,2016-01-01,17:00,0.0,0.0,2.2,0.031
6,1,7,2016-01-01,18:00,0.0,0.0,2.0,0.031


In [9]:
# also, reset row indexing
results_2016 = results_2016.reset_index()
results_2016

Unnamed: 0,State_Code,County_Code,Site_Number,Date,Time,42101,42401,42602,44201
0,6,1,7,2016-01-01,08:00,0.0,0.0,16.8,0.007
1,6,1,7,2016-01-01,09:00,0.0,0.0,17.0,0.010
2,6,1,7,2016-01-01,10:00,0.0,0.0,15.9,0.012
3,6,1,7,2016-01-01,12:00,0.0,0.0,8.8,0.023
4,6,1,7,2016-01-01,13:00,0.0,0.0,3.4,0.030
5,6,1,7,2016-01-01,14:00,0.0,0.0,6.2,0.026
6,6,1,7,2016-01-01,15:00,0.0,0.0,14.3,0.017
7,6,1,7,2016-01-01,16:00,0.0,0.0,11.1,0.021
8,6,1,7,2016-01-01,17:00,0.0,0.0,2.2,0.031
9,6,1,7,2016-01-01,18:00,0.0,0.0,2.0,0.031


In [10]:
# rename the columns to more meaningful names

results_2016.columns = ['State_Code', 'County_Code', 'Site_Number', 'Date', 'Time','CO_measurement', 'SO2_measurement', 'NO2_measuerment', 'O3_measurement']


In [11]:
results_2016

Unnamed: 0,State_Code,County_Code,Site_Number,Date,Time,CO_measurement,SO2_measurement,NO2_measuerment,O3_measurement
0,6,1,7,2016-01-01,08:00,0.0,0.0,16.8,0.007
1,6,1,7,2016-01-01,09:00,0.0,0.0,17.0,0.010
2,6,1,7,2016-01-01,10:00,0.0,0.0,15.9,0.012
3,6,1,7,2016-01-01,12:00,0.0,0.0,8.8,0.023
4,6,1,7,2016-01-01,13:00,0.0,0.0,3.4,0.030
5,6,1,7,2016-01-01,14:00,0.0,0.0,6.2,0.026
6,6,1,7,2016-01-01,15:00,0.0,0.0,14.3,0.017
7,6,1,7,2016-01-01,16:00,0.0,0.0,11.1,0.021
8,6,1,7,2016-01-01,17:00,0.0,0.0,2.2,0.031
9,6,1,7,2016-01-01,18:00,0.0,0.0,2.0,0.031


In [12]:
# calculate the degree of pollution

results_2016['Category'] = map(Air_Quality, results_2016['CO_measurement'], results_2016['SO2_measurement'], results_2016['NO2_measuerment'],results_2016['O3_measurement'])

In [13]:
results_2016

Unnamed: 0,State_Code,County_Code,Site_Number,Date,Time,CO_measurement,SO2_measurement,NO2_measuerment,O3_measurement,Category
0,6,1,7,2016-01-01,08:00,0.0,0.0,16.8,0.007,orange
1,6,1,7,2016-01-01,09:00,0.0,0.0,17.0,0.010,orange
2,6,1,7,2016-01-01,10:00,0.0,0.0,15.9,0.012,orange
3,6,1,7,2016-01-01,12:00,0.0,0.0,8.8,0.023,orange
4,6,1,7,2016-01-01,13:00,0.0,0.0,3.4,0.030,orange
5,6,1,7,2016-01-01,14:00,0.0,0.0,6.2,0.026,orange
6,6,1,7,2016-01-01,15:00,0.0,0.0,14.3,0.017,orange
7,6,1,7,2016-01-01,16:00,0.0,0.0,11.1,0.021,orange
8,6,1,7,2016-01-01,17:00,0.0,0.0,2.2,0.031,orange
9,6,1,7,2016-01-01,18:00,0.0,0.0,2.0,0.031,orange


In [14]:
# convert to csv
results_2016.to_csv('results_2016')