In [1]:
import pandas as pd
import csv

# Render every column of wide DataFrames instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Importing measure translation list

In [2]:
# Pipe-delimited lookup table. Its 'Measure' column is the key later joined
# against the results' 'CharacteristicName'.
measures = pd.read_csv(
    '../data/measure-rosetta-nomicrobes.txt',
    sep='|',
)
measures

Unnamed: 0,Measure,Limit,MeasureGroup
0,"1,1,1-Trichloroethane",.2 mg/L,"1,1,1-Trichloroethane"
1,"1,1,2-Trichloroethane",.005 mg/L,"1,1,2-Trichloroethane"
2,"1,1-Dichloroethylene",.007 mg/L,"1,1-Dichloroethylene"
3,"1,2,4-Trichlorobenzene",.07 mg/L,"1,2,4-Trichlorobenzene"
4,"1,2-Dibromo-3-chloropropane",.0002 mg/L,"1,2-Dibromo-3-chloropropane"
5,"1,2-Dichloroethane",.005 mg/L,"1,2-Dichloroethane"
6,"1,2-Dichloropropane",.005 mg/L,"1,2-Dichloropropane"
7,"2,2',3,3',4,4',5,5',6-Nonachlorobiphenyl",.0005 mg/L,PCBs
8,"2,2',3,3',4,4',5,5'-Octachlorobiphenyl",.0005 mg/L,PCBs
9,"2,2',3,3',4,4',5,6'-Octachlorobiphenyl",.0005 mg/L,PCBs


# County Codes

In [3]:
# Column names are supplied explicitly; with `names=` and no `header=`,
# pandas treats the first line of the file as data rather than as a header.
county_columns = ['State', 'StateCode', 'CountyCode', 'CountyName', 'Class']
counties = pd.read_csv('../data/ca_county_codes.csv', names=county_columns)

counties

Unnamed: 0,State,StateCode,CountyCode,CountyName,Class
0,CA,6,1,Alameda County,H1
1,CA,6,3,Alpine County,H1
2,CA,6,5,Amador County,H1
3,CA,6,7,Butte County,H1
4,CA,6,9,Calaveras County,H1
5,CA,6,11,Colusa County,H1
6,CA,6,13,Contra Costa County,H1
7,CA,6,15,Del Norte County,H1
8,CA,6,17,El Dorado County,H1
9,CA,6,19,Fresno County,H1


# Loading station data

In [4]:
# Only the station identifier, its coordinates and its county code are read;
# every other column in the station file is ignored.
station_columns = [
    'MonitoringLocationIdentifier',
    'LatitudeMeasure',
    'LongitudeMeasure',
    'CountyCode',
]
station = pd.read_csv('../data/station-clean.csv.bz2', usecols=station_columns)
station

Unnamed: 0,MonitoringLocationIdentifier,LatitudeMeasure,LongitudeMeasure,CountyCode
0,USBR-324903114320201,32.817711,-114.534672,25.0
1,USGS-09423500,34.851671,-114.609965,71.0
2,USGS-09424150,34.316126,-114.157170,71.0
3,USGS-09424170,34.300015,-114.163281,71.0
4,USGS-09427500,34.316126,-114.157170,71.0
5,USGS-09427520,34.295570,-114.140225,71.0
6,USGS-09427524,34.295570,-114.140225,71.0
7,USGS-09427600,34.291681,-114.157725,71.0
8,USGS-09427800,34.279182,-114.148003,71.0
9,USGS-09428000,34.258348,-114.150780,71.0


# Loading Results

In [5]:
# Columns of the raw results file that the rest of the notebook uses.
result_columns = [
    'ActivityMediaName', 'ActivityMediaSubdivisionName',
    'ActivityStartDate', 'ActivityStartTime/Time',
    'ActivityStartTime/TimeZoneCode', 'ActivityEndDate',
    'ActivityEndTime/Time', 'ActivityEndTime/TimeZoneCode',
    'MonitoringLocationIdentifier', 'CharacteristicName',
    'ResultMeasureValue', 'ResultMeasure/MeasureUnitCode',
    'ResultCommentText',
]

waterresults = pd.read_csv(
    '../data/CA-result.csv.bz2',
    usecols=result_columns,
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # Its replacement is `on_bad_lines`; 'warn' matches the old behaviour of
    # skipping malformed rows while still reporting them.
    on_bad_lines='warn',
)

# Keep water samples only. The explicit .copy() makes the filtered frame
# independent of the one read from disk, so the column assignment below
# cannot raise SettingWithCopyWarning.
waterresults = waterresults[waterresults.ActivityMediaName == 'Water'].copy()

# Values that cannot be parsed as numbers become NaN (errors='coerce').
waterresults["ResultMeasureValue"] = pd.to_numeric(
    waterresults.ResultMeasureValue, errors='coerce')

waterresults.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ActivityMediaName,ActivityMediaSubdivisionName,ActivityStartDate,ActivityStartTime/Time,ActivityStartTime/TimeZoneCode,ActivityEndDate,ActivityEndTime/Time,ActivityEndTime/TimeZoneCode,MonitoringLocationIdentifier,CharacteristicName,ResultMeasureValue,ResultMeasure/MeasureUnitCode,ResultCommentText
9,Water,,2009-10-10,00:00:00,,,,,MATTOLE_SALMON-MAT_EST_6B_09,"Temperature, water",15.16,deg C,daily average
10,Water,,2009-10-10,00:00:00,,,,,MATTOLE_SALMON-MAT_EST_6B_09,Dissolved oxygen (DO),10.06,mg/l,daily minimum
11,Water,,2009-10-06,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,pH,7.4,,
12,Water,,2009-10-16,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,Turbidity,20.0,NTU,
14,Water,,2009-07-12,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,Total Coliform,4.0,MPN/100ml,
15,Water,,2009-07-12,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,pH,7.6,,
16,Water,,2009-07-18,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,"Temperature, water",22.0,deg C,
26,Water,,2009-10-22,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,Turbidity,26.0,NTU,
30,Water,,2009-08-11,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,"Biochemical oxygen demand, standard conditions",31.0,mg/l,
31,Water,,2009-10-30,00:00:00,PDT,,,,MRSENVMB_WQX-M-001,Total suspended solids,15.0,mg/l,


# Join 'er up

In [6]:
# Chain the three joins. `merge` defaults to an inner join, so a result row
# survives only if it has a matching measure, station and county.
results = (
    waterresults
    .merge(measures, left_on='CharacteristicName', right_on='Measure')
    .merge(station, on='MonitoringLocationIdentifier')
    .merge(counties, on='CountyCode')
)

len(results)

1337122

# Distinct units

In [7]:
# NOTE: this cell scales ResultMeasureValue in place while re-deriving Unit
# from the raw unit code. Re-running it without first re-running the join
# cell would therefore apply the conversions a second time.

# String cleanup
results["Unit"] = results["ResultMeasure/MeasureUnitCode"].astype('str').str.strip()

# Nix the NaNs. The .copy() yields an independent frame so the .loc
# assignments further down cannot raise SettingWithCopyWarning.
results = results.loc[pd.notnull(results.ResultMeasureValue)].copy()

# Units that just aren't useful (the name says "measures", but the members
# are unit strings matched against the Unit column):
measuresToSkip = frozenset([
        # % recovery is % of an agent that is filtered out. Not quite what we're after.
        '%', '% recovery',
        # These measure turbidity, which I'm not worrying about for now.
        'FNU', 'FTU', 'JTU', 'NTU', 'None', 'm', 'mg/l SiO2',
        # 'code' measures turbidity severity. Same there.
        'code',
        # ppm and ppb are tricky to convert to mg/l, since you need to know
        # molecular weights. Should do, but not yet. Because both are dropped
        # here, no ppb -> ppm conversion step is needed below.
        'ppm', 'ppb',
        # Lead-210. Not dealing with radiologicals (yet)
        'pCi/L',
        # tons/day is a mass per time, not a concentration.
        'tons/day',
        # Not in the mood to deal with micromoles, either. Micromoles per what?
        'umol'
    ])

results = results.loc[~results.Unit.isin(measuresToSkip)].copy()


# Unit mapping

# Ordered (source unit, factor, target unit) steps. Order matters: each step
# may feed the next (pg/l -> ng/l -> ug/l -> mg/l). A factor of None means
# the step only relabels the unit and leaves the value untouched.
unit_conversions = [
    # Water has a density of 1 kg/L, so mg/kg -> mg/l is a gimme:
    ('mg/kg', None, 'mg/l'),
    ('mg/kg as N', None, 'mg/l'),
    ('ug/kg', None, 'ug/l'),
    # Nitrate has an equivalent weight of 62, and everything in the data
    # that is measured using ueq/L is nitrate
    ('ueq/L', 62, 'ug/l'),
    # Unnecessary specification:
    ('mg/l as N', None, 'mg/l'),
    # 1,000 picograms per nanogram
    ('pg/l', 1/1000, 'ng/l'),
    # 1,000 nanograms per microgram
    ('ng/l', 1/1000, 'ug/l'),
    # 1,000 micrograms per milligram
    ('ug/l', 1/1000, 'mg/l'),
]

for source_unit, factor, target_unit in unit_conversions:
    # Compute the mask once, before the unit label is rewritten.
    in_source_unit = results.Unit == source_unit
    if factor is not None:
        results.loc[in_source_unit, 'ResultMeasureValue'] *= factor
    results.loc[in_source_unit, 'Unit'] = target_unit


In [8]:
# Distinct unit labels left after cleanup and conversion, in sorted order.
sorted(results['Unit'].unique())

['fibers/l', 'mg/l']

In [9]:
# Which characteristics, if any, are still reported in ppm?
results.loc[results['Unit'] == 'ppm', 'CharacteristicName'].unique()

array([], dtype=object)

In [10]:
# Rows still labelled with the 'umol' unit, if any.
results.loc[results['Unit'] == 'umol']

Unnamed: 0,ActivityMediaName,ActivityMediaSubdivisionName,ActivityStartDate,ActivityStartTime/Time,ActivityStartTime/TimeZoneCode,ActivityEndDate,ActivityEndTime/Time,ActivityEndTime/TimeZoneCode,MonitoringLocationIdentifier,CharacteristicName,ResultMeasureValue,ResultMeasure/MeasureUnitCode,ResultCommentText,Measure,Limit,MeasureGroup,LatitudeMeasure,LongitudeMeasure,CountyCode,State,StateCode,CountyName,Class,Unit


In [11]:
# The unit-conversion cell has already turned every 'ueq/L' row into 'mg/l'
# (scaling the value as it went). Relabelling such rows here without scaling
# would mislabel them, so check the invariant instead of reassigning.
assert not (results.Unit == 'ueq/L').any(), "unconverted ueq/L rows remain"

# Unit labels of the nitrate rows that are expressed in mg/l.
results.loc[(results.Unit == 'mg/l') & (results.MeasureGroup == 'nitrate'), 'Unit']

4939       mg/l
4940       mg/l
4941       mg/l
4944       mg/l
4947       mg/l
4948       mg/l
4949       mg/l
4951       mg/l
5355       mg/l
5356       mg/l
5357       mg/l
5358       mg/l
5359       mg/l
5360       mg/l
5361       mg/l
5362       mg/l
5363       mg/l
5364       mg/l
5365       mg/l
5366       mg/l
5367       mg/l
5368       mg/l
5369       mg/l
5370       mg/l
5371       mg/l
5372       mg/l
5373       mg/l
5374       mg/l
5375       mg/l
5376       mg/l
           ... 
1336625    mg/l
1336652    mg/l
1336653    mg/l
1336654    mg/l
1336677    mg/l
1336678    mg/l
1336701    mg/l
1336702    mg/l
1336725    mg/l
1336726    mg/l
1336749    mg/l
1336750    mg/l
1336773    mg/l
1336774    mg/l
1336797    mg/l
1336798    mg/l
1336799    mg/l
1336822    mg/l
1336823    mg/l
1336824    mg/l
1336847    mg/l
1336848    mg/l
1336871    mg/l
1336872    mg/l
1336890    mg/l
1336902    mg/l
1336959    mg/l
1336960    mg/l
1337002    mg/l
1337003    mg/l
Name: Unit, dtype: object

# Reordering columns

In [12]:
# Final column order for the export.
# ResultMeasureValue was rescaled during unit conversion, so the normalised
# 'Unit' column must travel with it. 'ResultMeasure/MeasureUnitCode' still
# holds the unit as originally reported and is kept for reference only.
output_columns = [
    'MonitoringLocationIdentifier', 'CountyCode', 'CountyName',
    'LatitudeMeasure', 'LongitudeMeasure',
    'ActivityMediaName', 'ActivityMediaSubdivisionName',
    'ActivityStartDate', 'ActivityStartTime/Time', 'ActivityStartTime/TimeZoneCode',
    'ActivityEndDate', 'ActivityEndTime/Time', 'ActivityEndTime/TimeZoneCode',
    'Measure', 'Limit', 'MeasureGroup',
    'ResultMeasureValue', 'Unit', 'ResultMeasure/MeasureUnitCode',
    'ResultCommentText',
]
results = results[output_columns]

# Save to CSV

In [13]:
# Write the joined results as a bz2-compressed CSV: every field quoted,
# and the DataFrame index left out.
results.to_csv(
    '../data/CA-results-join.csv.bz2',
    compression='bz2',
    quoting=csv.QUOTE_ALL,
    index=False,
)