# Looking at measurement coverage patterns

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.display.max_columns = None

In [12]:
%%time
# Load the measurement results and keep only samples from 2010 onwards.
# StartDate is still a string here, so this is a lexicographic comparison.
# NOTE(review): that is only correct if the CSV stores ISO dates (YYYY-MM-DD) — confirm.
result = pd.read_csv('../data/water/CA-result-withmeasuregroup.csv', low_memory=False)
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
result = result.loc[result.StartDate >= '2010-01-01']

# Attach the station metadata to every result row. pd.merge defaults to an
# inner join, so results without a matching station are dropped.
station = pd.read_csv('../data/water/CA-station-clean.csv', low_memory=False)
data = pd.merge(result, station, left_on='LocationIdentifier', right_on='MonitoringLocationId')

# Parse the sample date and derive the calendar year used for grouping below.
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['Year'] = data['StartDate'].dt.year

# Only the merged frame is needed from here on; drop the intermediates.
del result
del station

CPU times: user 472 ms, sys: 82.5 ms, total: 555 ms
Wall time: 577 ms


In [13]:
# List the columns available on the merged result/station table.
data.columns

Index(['LocationIdentifier', 'Medium', 'MediumSubdivision', 'StartDate',
       'StartTime', 'TimeZone', 'Category', 'Pollutant', 'Unit', 'Mclg', 'Mcl',
       'Organization', 'MonitoringLocationId', 'MonitoringLocationName',
       'MonitoringLocationType', 'MonitoringLocationDescription', 'HUC',
       'DrainageArea', 'DrainageAreaUnit', 'ContributingDrainageArea',
       'ContributingDrainageAreaUnit', 'Latitude', 'Longitude',
       'VerticalMeasure', 'VerticalMeasureUnit', 'StateCode', 'CountyCode',
       'CountyName', 'AquiferName', 'FormationType', 'AquiferType', 'Provider',
       'Edits', 'Year'],
      dtype='object')

## Getting annual per-county mean values to use for filling missing values

The idea is, when there are NA values at specific sites, we can fill them in with the averages for that county.

In [27]:
# Mean measured value per (county, year, pollutant), pivoted so that every
# pollutant becomes its own column and each row is one county-year.
values = data[['CountyName', 'Year', 'Pollutant', 'Value']]
countyMeans = (
    values
    .groupby(['CountyName', 'Year', 'Pollutant'])
    .mean()
    .unstack()
)
print("Length: {}".format(len(countyMeans)))
countyMeans.describe()

Length: 350


Unnamed: 0_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Pollutant,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
count,167.0,301.0,200.0,139.0,257.0,263.0,301.0,181.0,27.0,271.0,209.0,304.0,196.0,53.0,282.0,113.0,127.0,60.0
mean,0.00014,0.007462,0.073205,5.7e-05,6.4e-05,0.003444,0.004005,0.347093,0.116912,0.001109,1.138397e-05,5.95926,0.093966,-3.69438e-06,0.001869,0.000205,0.043635,3.5e-05
std,0.000306,0.051854,0.089056,0.000273,0.000237,0.017085,0.007333,0.405766,0.043379,0.004266,3.489223e-05,10.640079,0.201315,0.0003173399,0.004813,0.001379,0.101061,8.7e-05
min,0.0,9e-05,0.0,0.0,-0.0001,0.0,0.00025,0.0,0.0595,0.0,0.0,0.0,-0.0065,-0.0021045,0.0,0.0,0.0,0.0
25%,2e-06,0.0015,0.028758,0.0,0.0,0.000445,0.001194,0.104615,0.086205,4.8e-05,3.373333e-07,0.491375,0.005098,0.0,0.000105,7e-06,1e-05,0.0
50%,5.5e-05,0.002534,0.047341,2e-06,1.3e-05,0.000902,0.002073,0.23125,0.105213,0.00019,1.15e-06,2.069624,0.017945,0.0,0.000359,1.7e-05,0.000197,0.0
75%,0.000134,0.004179,0.0884,2e-05,5.7e-05,0.002013,0.003939,0.4334,0.150736,0.000509,5.210926e-06,6.93957,0.08906,1.903e-09,0.001309,3.7e-05,0.001549,2e-05
max,0.002091,0.8935,0.719,0.0024,0.002883,0.221879,0.07945,2.757628,0.212892,0.0422,0.000353,80.95,1.4,0.00064677,0.038229,0.014497,0.474391,0.00052


In [28]:
# Show the county-year means table (one column per pollutant).
countyMeans

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Pollutant,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
CountyName,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Alameda County,2010,0.000000,0.001852,0.029556,0.000000,8.950000e-06,0.000111,0.002100,,0.069650,0.000017,2.731579e-07,1.525562,0.010667,2.664000e-05,0.000365,0.002200,0.017185,0.0
Alameda County,2011,0.000000,0.001542,0.024880,0.000000,1.895161e-05,0.000320,0.003939,,0.108417,0.000797,2.475000e-07,1.839956,0.028929,1.605800e-08,0.000087,,0.100969,0.0
Alameda County,2012,0.000000,0.002000,0.042333,0.000000,0.000000e+00,0.000296,0.001815,,0.082136,0.000000,0.000000e+00,1.878019,0.028667,,0.000259,,0.087402,0.0
Alameda County,2013,0.000000,0.002539,0.037636,0.000000,4.132653e-05,0.000333,0.006513,,,0.000078,5.210926e-06,2.242636,,,0.000584,,0.000000,0.0
Alameda County,2014,0.000000,0.003704,0.048311,0.000000,9.306122e-06,0.000044,0.003494,,,0.000336,0.000000e+00,2.406897,,,0.000753,,0.000000,0.0
Alameda County,2015,0.000011,0.002960,0.051480,0.000000,0.000000e+00,0.000363,0.012135,,,0.000000,0.000000e+00,2.357143,,,0.000638,,0.000000,0.0
Alameda County,2016,,,,,,,,,,,,4.000000,,,,,,
Alpine County,2010,,,,,,,,,,,,,0.001188,,,,,
Alpine County,2012,,,,,,,,0.050900,,,,0.005800,,,,,,
Alpine County,2013,,,,,,,,0.036667,,,,,,,,,,


There are still quite a lot of NAs - 349 rows, and none of them are full for all the pollutants.

Will need to think if there's a way to fill that in. Based on the pattern of missing values, I'm not sure interpolation is appropriate, because frequently it's not that in-between values are missing. Forward-filling and back-filling might work. If values are NaN all the way through, maybe we just take the average from all neighboring counties.  For now, though, I'm thinking filling with the state-wide averages for that year will do for a first pass.

In [48]:
# Statewide mean per (year, pollutant), pivoted to one row per year and one
# column per pollutant. Used as the fallback when a county has no data.
stateMeans = (
    values[['Year', 'Pollutant', 'Value']]
    .groupby(['Year', 'Pollutant'])
    .mean()
    .unstack()
)
print("Length: {}".format(len(stateMeans)))
stateMeans.describe()

Length: 7


Unnamed: 0_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Pollutant,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,6.0,7.0,7.0,7.0,7.0,6.0,7.0,7.0,6.0,6.0
mean,0.000163,0.007312,0.060953,3.2e-05,7.5e-05,0.004533,0.003587,0.764661,0.121194,0.000976,2.5e-05,7.687445,0.152921,1.813945e-05,0.004223,0.000109,0.111534,1.9e-05
std,0.00014,0.003829,0.02851,4.5e-05,0.000106,0.004437,0.000728,0.19723,0.030517,0.000892,2.9e-05,1.428797,0.086498,5.321911e-05,0.004079,9.6e-05,0.033307,2e-05
min,3.9e-05,0.003716,0.040964,5e-06,1.4e-05,0.00121,0.002686,0.535266,0.074044,0.000135,2e-06,5.805821,0.034022,-6.516647e-05,0.00047,1.7e-05,0.059553,0.0
25%,6.3e-05,0.004707,0.044343,7e-06,2.8e-05,0.001422,0.003102,0.616235,0.106292,0.000489,7e-06,6.633025,0.102136,3.125e-08,0.000709,4.9e-05,0.099668,2e-06
50%,8.7e-05,0.005467,0.053983,1.5e-05,3.6e-05,0.002514,0.003383,0.685155,0.122375,0.000558,1.3e-05,7.954041,0.158898,1.373491e-05,0.003986,0.000103,0.10859,1.2e-05
75%,0.000251,0.009097,0.059967,2.9e-05,5.4e-05,0.006212,0.004072,0.950588,0.143458,0.001182,3e-05,8.514417,0.187245,5.111121e-05,0.006051,0.000119,0.136684,3.6e-05
max,0.000387,0.014395,0.123107,0.000132,0.000314,0.012736,0.004695,0.998562,0.156952,0.002794,8.3e-05,9.757366,0.298761,8.75e-05,0.011586,0.000308,0.149731,4.5e-05


In [51]:
# Show the statewide per-year means table.
stateMeans

Unnamed: 0_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Pollutant,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2010,8.7e-05,0.004634,0.040964,1.5e-05,3.6e-05,0.002514,0.002832,0.535266,0.147462,0.000558,3.9e-05,7.954688,0.078402,-6.516647e-05,0.011586,0.000103,0.097179,4.2e-05
2011,0.000387,0.008232,0.043418,9e-06,2.4e-05,0.008527,0.003383,0.685155,0.156952,0.000501,1.3e-05,9.074146,0.207345,2.734482e-05,0.006273,0.000308,0.149731,4.5e-05
2012,0.000325,0.009962,0.053983,5e-06,3.1e-05,0.012736,0.004146,0.656946,0.103955,0.001307,2e-05,7.954041,0.158898,0.0,0.005828,4.4e-05,0.107136,1e-06
2013,7.9e-05,0.003716,0.045267,3.3e-05,4e-05,0.00121,0.003998,0.910391,0.074044,0.000477,8e-06,9.757366,0.298761,5.903333e-05,0.003986,0.000104,0.059553,6e-06
2014,4.7e-05,0.005467,0.057711,2.5e-05,6.7e-05,0.001503,0.004695,0.998562,0.113303,0.001057,6e-06,5.805821,0.167146,1.25e-07,0.000752,0.000133,0.145564,1.8e-05
2015,3.9e-05,0.004781,0.062223,6e-06,1.4e-05,0.001341,0.002686,0.575524,0.131447,0.000135,2e-06,6.8925,0.125869,8.75e-05,0.00047,1.7e-05,0.110043,0.0
2016,0.000177,0.014395,0.123107,0.000132,0.000314,0.003897,0.003372,0.990785,,0.002794,8.3e-05,6.373551,0.034022,,0.000666,5.5e-05,,


In [100]:
# Carry the previous year's statewide mean forward wherever a year has no
# value (in the table above only 2016 has gaps).
# `ffill()` replaces the deprecated `fillna(method='pad', inplace=True)`;
# rebinding instead of mutating in place keeps the cell safe to re-run.
stateMeans = stateMeans.ffill()
stateMeans

Unnamed: 0_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Pollutant,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2010,8.7e-05,0.004634,0.040964,1.5e-05,3.6e-05,0.002514,0.002832,0.535266,0.147462,0.000558,3.9e-05,7.954688,0.078402,-6.516647e-05,0.011586,0.000103,0.097179,4.2e-05
2011,0.000387,0.008232,0.043418,9e-06,2.4e-05,0.008527,0.003383,0.685155,0.156952,0.000501,1.3e-05,9.074146,0.207345,2.734482e-05,0.006273,0.000308,0.149731,4.5e-05
2012,0.000325,0.009962,0.053983,5e-06,3.1e-05,0.012736,0.004146,0.656946,0.103955,0.001307,2e-05,7.954041,0.158898,0.0,0.005828,4.4e-05,0.107136,1e-06
2013,7.9e-05,0.003716,0.045267,3.3e-05,4e-05,0.00121,0.003998,0.910391,0.074044,0.000477,8e-06,9.757366,0.298761,5.903333e-05,0.003986,0.000104,0.059553,6e-06
2014,4.7e-05,0.005467,0.057711,2.5e-05,6.7e-05,0.001503,0.004695,0.998562,0.113303,0.001057,6e-06,5.805821,0.167146,1.25e-07,0.000752,0.000133,0.145564,1.8e-05
2015,3.9e-05,0.004781,0.062223,6e-06,1.4e-05,0.001341,0.002686,0.575524,0.131447,0.000135,2e-06,6.8925,0.125869,8.75e-05,0.00047,1.7e-05,0.110043,0.0
2016,0.000177,0.014395,0.123107,0.000132,0.000314,0.003897,0.003372,0.990785,0.131447,0.002794,8.3e-05,6.373551,0.034022,8.75e-05,0.000666,5.5e-05,0.110043,0.0


## Padding county-level data

In [102]:
%%time
def fillCounty(df, county):
    """Pad one county's rows over time and return them as a flat frame.

    Parameters
    ----------
    df : DataFrame
        Table whose first index level is the county name; the remaining
        index level(s) are moved into columns by ``reset_index``.
    county : str
        County label to select from the first index level.

    Returns
    -------
    DataFrame
        The county's rows with gaps filled from the previous row, then from
        the next row, plus a ``CountyName`` column. Columns that are NaN for
        every row of the county stay NaN.

    Raises
    ------
    KeyError
        If ``county`` is not present in the index of ``df``.
    """
    # `.loc` replaces `.ix` (removed in pandas 1.0); `ffill`/`bfill` replace
    # the deprecated `fillna(method=...)` spelling.
    countydf = df.loc[county]\
                 .ffill()\
                 .bfill()\
                 .reset_index()
    # Selecting by county dropped that index level, so add it back as a column.
    countydf['CountyName'] = county
    return countydf

# Pad every county that appears in the data and stack the results.
filled = pd.concat(
    [fillCounty(countyMeans, name) for name in data['CountyName'].unique()]
)

CPU times: user 216 ms, sys: 6.72 ms, total: 223 ms
Wall time: 224 ms


In [103]:
# Show the county-level table after padding within each county.
filled

Unnamed: 0_level_0,Year,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,CountyName
Pollutant,Unnamed: 1_level_1,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene,Unnamed: 20_level_1
0,2010,0.000000,0.003728,0.023000,0.000000e+00,0.000000,0.000250,0.003406,0.260000,0.071660,0.000313,0.000000e+00,0.616371,0.012987,0.000000e+00,0.000125,0.000007,0.000000e+00,0.000000,Solano County
1,2011,0.000000,0.001693,0.031931,0.000000e+00,0.000018,0.001723,0.002787,0.260000,0.071660,0.000372,6.674898e-06,0.293798,0.004880,0.000000e+00,0.000246,0.000004,0.000000e+00,0.000000,Solano County
2,2012,0.000007,0.002653,0.045909,2.727273e-07,0.000031,0.003888,0.004237,0.260000,0.071660,0.000903,0.000000e+00,0.711607,0.014059,0.000000e+00,0.000005,0.000022,7.692308e-07,0.000000,Solano County
3,2013,0.000000,0.002404,0.041813,0.000000e+00,0.000005,0.000781,0.002470,0.260000,0.071660,0.000109,1.985789e-06,1.279766,0.014059,0.000000e+00,0.000098,0.000013,1.097477e-01,0.000000,Solano County
4,2014,0.000000,0.003250,0.036292,0.000000e+00,0.000000,0.000667,0.002327,0.260000,0.103139,0.000083,0.000000e+00,1.935407,0.014059,0.000000e+00,0.000106,0.000000,1.779180e-01,0.000000,Solano County
5,2015,0.000000,0.003474,0.044000,0.000000e+00,0.000000,0.000684,0.002194,0.260000,0.124329,0.000000,0.000000e+00,1.675984,0.066000,0.000000e+00,0.000063,0.000013,2.049354e-01,0.000000,Solano County
6,2016,0.000000,0.003474,0.044000,0.000000e+00,0.000000,0.000684,0.003300,0.260000,0.124329,0.000000,0.000000e+00,1.105815,0.027708,0.000000e+00,0.000063,0.000070,2.049354e-01,0.000000,Solano County
0,2010,0.000100,0.004190,0.020332,,0.000000,0.001148,0.001443,0.120000,,0.000092,9.500000e-07,1.754100,0.005600,,0.000204,0.000006,3.000000e-05,,Tehama County
1,2011,0.000084,0.005375,0.030153,,0.000000,0.000984,0.000889,0.136667,,0.000016,9.500000e-07,2.228333,0.005600,,0.000068,0.000006,3.000000e-05,,Tehama County
2,2012,0.000084,0.005301,0.030153,,0.000000,0.000748,0.000713,0.136667,,0.000002,9.500000e-07,2.228333,0.005600,,0.000042,0.000006,3.000000e-05,,Tehama County


## Filling statewide annual averages for missing county data.

In [111]:
def fillnas(year, df=None, defaults=None):
    """Fill remaining NaNs for one year with that year's default values.

    Parameters
    ----------
    year : int
        Year to select; must be a label in the index of ``defaults``.
    df : DataFrame, optional
        Table with a ``Year`` column. Defaults to the notebook-level
        ``filled`` frame, which is what the original one-argument call used.
    defaults : DataFrame, optional
        Table indexed by year whose columns match the value columns of
        ``df``. Defaults to the notebook-level ``stateMeans`` frame.

    Returns
    -------
    DataFrame
        The rows of ``df`` for ``year`` with each NaN replaced by the value
        in the same-named column of ``defaults`` for that year.
    """
    if df is None:
        df = filled
    if defaults is None:
        defaults = stateMeans
    yeardf = df.loc[df.Year == year]
    # `.loc` replaces `.ix` (removed in pandas 1.0). The selected row is a
    # Series keyed by column, so fillna matches it against columns.
    yeardefaults = defaults.loc[year]
    return yeardf.fillna(yeardefaults)

# Replace whatever is still missing with the statewide mean for that year.
filled = pd.concat([fillnas(year) for year in filled.Year.unique()])

# Keep the county-level result under its own name. `filled` is reassigned to
# station-level data further down, and the county-means fill step reads
# `imputed`, which was otherwise never assigned on a fresh kernel.
imputed = filled

filled

Unnamed: 0_level_0,Year,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,CountyName
Pollutant,Unnamed: 1_level_1,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene,Unnamed: 20_level_1
0,2010,0.000000,0.003728,0.023000,0.000000e+00,0.000000e+00,0.000250,0.003406,0.260000,0.071660,0.000313,0.000000e+00,0.616371,0.012987,0.000000e+00,0.000125,0.000007,0.000000,0.000000,Solano County
0,2010,0.000100,0.004190,0.020332,1.460182e-05,0.000000e+00,0.001148,0.001443,0.120000,0.147462,0.000092,9.500000e-07,1.754100,0.005600,-6.516647e-05,0.000204,0.000006,0.000030,0.000042,Tehama County
0,2010,0.000000,0.003312,0.000000,0.000000e+00,6.290000e-05,0.000505,0.003521,0.535266,0.090667,0.000606,5.027059e-06,1.468052,0.014194,1.662500e-05,0.001354,0.000027,0.033852,0.000000,Contra Costa County
0,2010,0.000028,0.029077,0.004647,6.263158e-06,1.015714e-04,0.000212,0.001399,0.314286,0.147462,0.000115,1.079223e-04,28.694226,0.055000,0.000000e+00,0.008024,0.000030,0.000000,0.000000,Kings County
0,2010,0.000180,0.006315,0.031996,9.926471e-05,2.430435e-05,0.003301,0.003102,0.757933,0.147462,0.000418,3.530000e-04,6.033726,0.159962,0.000000e+00,0.000698,0.000014,0.000409,0.000290,San Bernardino County
0,2010,0.000093,0.000805,0.009900,2.400000e-05,1.002000e-04,0.001800,0.022000,0.305556,0.147462,0.000970,6.100000e-07,5.116072,0.034383,-6.516647e-05,0.002500,0.000017,0.000860,0.000520,San Luis Obispo County
0,2010,0.000061,0.002619,0.094059,1.900000e-05,3.320000e-04,0.005389,0.011977,0.202067,0.147462,0.001780,2.500000e-05,7.806943,0.105864,3.500000e-04,0.002707,0.000006,0.000325,0.000042,Monterey County
0,2010,0.000087,0.001796,0.040964,1.460182e-05,0.000000e+00,0.001038,0.002128,0.050000,0.147462,0.000141,4.300000e-07,0.162119,0.002806,-6.516647e-05,0.000053,0.000103,0.097179,0.000042,Nevada County
0,2010,0.000050,0.006779,0.018179,2.880000e-05,1.473913e-04,0.001471,0.002712,0.211786,0.147462,0.000943,7.376000e-07,2.602463,0.002500,-6.516647e-05,0.003742,0.000012,0.000236,0.000042,Siskiyou County
0,2010,0.000327,0.004915,0.062136,8.000000e-06,9.570000e-05,0.000372,0.001428,0.979773,0.147462,0.000116,8.174286e-07,5.804907,0.170393,2.100000e-05,0.002557,0.000103,0.097179,0.000042,Imperial County


# Grouping by day

In [192]:
%%time
# Mean value per station and sampling date for every pollutant, pivoted so
# each pollutant is a column. Rows are keyed by county, station, year and date.
values = data[['CountyName', 'LocationIdentifier', 'Year', 'StartDate', 'Pollutant', 'Value']]
dailies = (
    values
    .groupby(['CountyName', 'LocationIdentifier', 'Year', 'StartDate', 'Pollutant'])
    .mean()
    .unstack()
)

CPU times: user 115 ms, sys: 24.1 ms, total: 139 ms
Wall time: 137 ms


In [193]:
# Show the per-station, per-date means table.
dailies

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Pollutant,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
CountyName,LocationIdentifier,Year,StartDate,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Alameda County,CALWR_WQX-B9C74701355,2010,2010-03-17,,,,,,,,,,,,4.400,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-04-14,,,,,,,,,,,,4.100,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-06-23,,,,,,,,,,,,1.400,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-07-21,,,,,,,,,,,,1.300,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-08-30,,,,,,,,,,,,2.500,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-09-15,,,,,,,,,,,,1.700,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-10-13,,,,,,,,,,,,2.400,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-10-26,,,,,,,,,,,,5.300,,,,,,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-11-03,,,,,,,,,0.0806,,,,,,,,0.1780,
Alameda County,CALWR_WQX-B9C74701355,2010,2010-11-09,,,,,,,,,,,,2.600,,,,,,


## Padding station-level data

In [194]:
%%time
def f(x):
    """Fill gaps in ``x`` from the previous value, then from the next value.

    Forward fill runs first, so a gap takes the value before it; the backward
    fill only reaches leading gaps that had nothing before them. Input that is
    entirely NaN is returned unchanged.
    """
    # `ffill`/`bfill` replace the deprecated `fillna(method=...)` spelling.
    return x.ffill().bfill()
# Pad within each group formed by the first three index levels of `dailies`
# (county, station, year), then move the index levels back into columns.
filled = (
    dailies
    .groupby(level=[0, 1, 2])
    .transform(f)
    .reset_index()
)

CPU times: user 7.29 s, sys: 78.2 ms, total: 7.37 s
Wall time: 7.47 s


In [195]:
# Show the station-level table after padding within each station-year.
filled

Unnamed: 0_level_0,CountyName,LocationIdentifier,Year,StartDate,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
0,Alameda County,CALWR_WQX-B9C74701355,2010,2010-03-17,,,,,,,,,0.0806,,,4.400,,,,,0.1780,
1,Alameda County,CALWR_WQX-B9C74701355,2010,2010-04-14,,,,,,,,,0.0806,,,4.100,,,,,0.1780,
2,Alameda County,CALWR_WQX-B9C74701355,2010,2010-06-23,,,,,,,,,0.0806,,,1.400,,,,,0.1780,
3,Alameda County,CALWR_WQX-B9C74701355,2010,2010-07-21,,,,,,,,,0.0806,,,1.300,,,,,0.1780,
4,Alameda County,CALWR_WQX-B9C74701355,2010,2010-08-30,,,,,,,,,0.0806,,,2.500,,,,,0.1780,
5,Alameda County,CALWR_WQX-B9C74701355,2010,2010-09-15,,,,,,,,,0.0806,,,1.700,,,,,0.1780,
6,Alameda County,CALWR_WQX-B9C74701355,2010,2010-10-13,,,,,,,,,0.0806,,,2.400,,,,,0.1780,
7,Alameda County,CALWR_WQX-B9C74701355,2010,2010-10-26,,,,,,,,,0.0806,,,5.300,,,,,0.1780,
8,Alameda County,CALWR_WQX-B9C74701355,2010,2010-11-03,,,,,,,,,0.0806,,,5.300,,,,,0.1780,
9,Alameda County,CALWR_WQX-B9C74701355,2010,2010-11-09,,,,,,,,,0.0806,,,2.600,,,,,0.1780,


## Filling county means for missing station data

In [198]:
# Re-key the table by (Year, CountyName) so the helpers below can look up one
# year, then one county, by label.
# NOTE(review): `imputed` must already exist when this cell runs, with `Year`
# and `CountyName` columns. It is presumably the county-level table after the
# statewide-mean fill — confirm where it is assigned.
countyMeans = (
    imputed
    .sort_values(['Year', 'CountyName'])
    .set_index(['Year', 'CountyName'])
)

def fillnasForCounty(county, df, defaults):
    """Fill NaNs in one county's rows with that county's default values.

    Parameters
    ----------
    county : str
        County to select; must be a label in the index of ``defaults``.
    df : DataFrame
        Table with a ``CountyName`` column.
    defaults : DataFrame
        Table indexed by county name whose columns match the value columns
        of ``df``.

    Returns
    -------
    DataFrame
        The rows of ``df`` for ``county`` with each NaN replaced by the value
        in the same-named column of ``defaults`` for that county.

    Raises
    ------
    KeyError
        If ``county`` is not present in the index of ``defaults``.
    """
    countydf = df.loc[df.CountyName == county]
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    countydefaults = defaults.loc[county]
    return countydf.fillna(countydefaults)

def fillnasForYear(year, df, defaults=None):
    """Fill NaNs in one year's rows using per-county defaults for that year.

    Parameters
    ----------
    year : int
        Year to select; must be a label in the first index level of
        ``defaults``.
    df : DataFrame
        Table with ``Year`` and ``CountyName`` columns.
    defaults : DataFrame, optional
        Table indexed by (year, county). Defaults to the notebook-level
        ``countyMeans`` frame, which is what the original two-argument call
        used.

    Returns
    -------
    DataFrame
        The rows of ``df`` for ``year``, filled county by county and
        concatenated in order of each county's first appearance.
    """
    if defaults is None:
        defaults = countyMeans
    yeardf = df.loc[df.Year == year]
    # `.loc` replaces `.ix` (removed in pandas 1.0). Selecting the year leaves
    # a table indexed by county.
    yeardefaults = defaults.loc[year]
    yeardf = pd.concat([
            fillnasForCounty(county, yeardf, yeardefaults)
            for county in yeardf.CountyName.unique()
        ])
    return yeardf

# Apply the county-mean fallback year by year and stack the results.
reallyFilled = pd.concat(
    [fillnasForYear(yr, filled) for yr in filled['Year'].unique()]
)

In [199]:
# Summary statistics of the filled table. The `count` row shows whether any
# column still has missing values.
reallyFilled.describe()

Unnamed: 0_level_0,Year,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Pollutant,Unnamed: 1_level_1,Antimony,Arsenic,Barium,Beryllium,Cadmium,Chromium,Copper,Fluoride,HAA5,Lead,Mercury,Nitrate,Nitrite,PCBs,Selenium,Simazine,TTHMs,Xylene
count,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0,25692.0
mean,2012.174296,0.000137,0.004931,0.078691,9.7e-05,9e-05,0.00606,0.004347,0.379536,0.122011,0.000921,2.287923e-05,8.398591,0.122208,2.2e-05,0.004352,0.000149,0.042751,3.6e-05
std,1.72345,0.000326,0.023611,0.106453,0.000363,0.000305,0.026986,0.009771,1.187736,0.034499,0.003267,6.89079e-05,28.167796,0.447555,0.000141,0.016524,0.000453,0.097561,9.7e-05
min,2010.0,0.0,-0.0002,0.0,0.0,-0.00015,-0.00017,-0.00015,-0.023,0.0013,-0.00015,-2e-07,-0.6175,-0.996,-0.002806,-0.00056,0.0,0.0,0.0
25%,2011.0,3e-06,0.00165,0.030218,0.0,1e-05,0.0005,0.001555,0.16,0.103955,0.000115,0.0,0.4,0.01,0.0,0.00019,9e-06,3e-05,0.0
50%,2012.0,7.2e-05,0.002692,0.052,7e-06,3.1e-05,0.001341,0.002542,0.224286,0.113303,0.000354,6.733333e-07,2.2,0.029583,0.0,0.0007,3.4e-05,0.000325,6e-06
75%,2013.0,0.000134,0.0038,0.09745,2.5e-05,8.3e-05,0.003401,0.004288,0.382899,0.147462,0.0008,1.53175e-05,8.887351,0.12,2.7e-05,0.002072,0.000159,0.0112,4.2e-05
max,2016.0,0.01565,1.04,4.93,0.00321,0.0126,0.611017,0.62,150.0,0.7,0.14,0.005,3035.0,33.2,0.000647,0.548,0.0201,1.851,0.001159
