In [18]:
import pandas as pd
import csv

# Importing measure translation list

In [2]:
# The measure translation list is a pipe-delimited text file.
measures = pd.read_csv(
    '../data/measure-rosetta.txt',
    sep='|',
)

# Show the loaded table as the cell's output.
measures

Unnamed: 0,Measure,Limit,MeasureGroup
0,"1,1,1-Trichloroethane",.2 mg/L,"1,1,1-Trichloroethane"
1,"1,1,2-Trichloroethane",.005 mg/L,"1,1,2-Trichloroethane"
2,"1,1-Dichloroethylene",.007 mg/L,"1,1-Dichloroethylene"
3,"1,2,4-Trichlorobenzene",.07 mg/L,"1,2,4-Trichlorobenzene"
4,"1,2-Dibromo-3-chloropropane",.0002 mg/L,"1,2-Dibromo-3-chloropropane"
5,"1,2-Dichloroethane",.005 mg/L,"1,2-Dichloroethane"
6,"1,2-Dichloropropane",.005 mg/L,"1,2-Dichloropropane"
7,"2,2',3,3',4,4',5,5',6-Nonachlorobiphenyl",.0005 mg/L,PCBs
8,"2,2',3,3',4,4',5,5'-Octachlorobiphenyl",.0005 mg/L,PCBs
9,"2,2',3,3',4,4',5,6'-Octachlorobiphenyl",.0005 mg/L,PCBs


# Loading station data

In [4]:
# Only the station identifier, its coordinates and its county code are read
# from the bz2-compressed station file; all other columns are skipped.
station_columns = [
    'MonitoringLocationIdentifier',
    'LatitudeMeasure',
    'LongitudeMeasure',
    'CountyCode',
]
station = pd.read_csv('../data/station-clean.csv.bz2', usecols=station_columns)

# Show the loaded table as the cell's output.
station

Unnamed: 0,MonitoringLocationIdentifier,LatitudeMeasure,LongitudeMeasure,CountyCode
0,USBR-324903114320201,32.817711,-114.534672,25.0
1,USGS-09423500,34.851671,-114.609965,71.0
2,USGS-09424150,34.316126,-114.157170,71.0
3,USGS-09424170,34.300015,-114.163281,71.0
4,USGS-09427500,34.316126,-114.157170,71.0
5,USGS-09427520,34.295570,-114.140225,71.0
6,USGS-09427524,34.295570,-114.140225,71.0
7,USGS-09427600,34.291681,-114.157725,71.0
8,USGS-09427800,34.279182,-114.148003,71.0
9,USGS-09428000,34.258348,-114.150780,71.0


# Loading results

In [7]:
# Load the California result records, keeping only the activity, location,
# characteristic and result-value columns listed below.
results = pd.read_csv(
    '../data/CA-result.csv.bz2',
    usecols=[
        'ActivityMediaName', 'ActivityMediaSubdivisionName',
        'ActivityStartDate', 'ActivityStartTime/Time',
        'ActivityStartTime/TimeZoneCode', 'ActivityEndDate',
        'ActivityEndTime/Time', 'ActivityEndTime/TimeZoneCode',
        'MonitoringLocationIdentifier', 'CharacteristicName',
        'ResultMeasureValue', 'ResultMeasure/MeasureUnitCode',
        'ResultCommentText',
    ],
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is its replacement: malformed rows are still
    # skipped, and a warning is still emitted for each one.
    on_bad_lines='warn',
    # Infer each column's dtype from the whole file instead of chunk by chunk,
    # so a column cannot end up with mixed types.
    # NOTE(review): the saved cell output looks like the tail of a mixed-type
    # DtypeWarning -- confirm that this silences it.
    low_memory=False,
)

# Values that cannot be parsed as numbers become NaN instead of raising.
results["ResultMeasureValue"] = pd.to_numeric(
    results["ResultMeasureValue"], errors='coerce'
)

results.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ActivityMediaName,ActivityMediaSubdivisionName,ActivityStartDate,ActivityStartTime/Time,ActivityStartTime/TimeZoneCode,ActivityEndDate,ActivityEndTime/Time,ActivityEndTime/TimeZoneCode,MonitoringLocationIdentifier,CharacteristicName,ResultMeasureValue,ResultMeasure/MeasureUnitCode,ResultCommentText
0,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,2.0,count,Chloeia pinnata~ABUNDANCE=2~QUALIFER=
1,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,2.0,count,Onuphis sp A~ABUNDANCE=2~QUALIFER=
2,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,6.0,count,Lumbrineris lingulata~ABUNDANCE=6~QUALIFER=
3,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,1.0,count,Ophiuroconis bispinosa~ABUNDANCE=1~QUALIFER=
4,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,0.0,count,Amphioplus sp A~ABUNDANCE=0~QUALIFER=
5,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,0.0,count,Leptosynapta sp~ABUNDANCE=0~QUALIFER=
6,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,20.0,count,Lumbrineris cruzensis~ABUNDANCE=20~QUALIFER=
7,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,2.0,count,Sigalion spinosus~ABUNDANCE=2~QUALIFER=
8,Biological,,2009-04-08,00:00:00,,,,,21CAOCSD_WQX-4,Count,0.0,count,Nereis sp A~ABUNDANCE=0~QUALIFER=
9,Water,,2009-10-10,00:00:00,,,,,MATTOLE_SALMON-MAT_EST_6B_09,"Temperature, water",15.16,deg C,daily average


# Joining results with measures and stations

In [13]:
# First match each result's CharacteristicName against the Measure column of
# the translation list, then attach the station columns by
# MonitoringLocationIdentifier. Both merges use pandas' default join type.
results = (
    results
    .merge(measures, left_on='CharacteristicName', right_on='Measure')
    .merge(station, on='MonitoringLocationIdentifier')
)

# Row count after both joins.
len(results)

1763085

# Reordering columns

In [17]:
# Final column layout: station columns first, then activity columns, then the
# measure columns from the translation list, then the result value columns.
# Columns not listed here (e.g. CharacteristicName) are dropped.
column_order = [
    # station
    'MonitoringLocationIdentifier',
    'CountyCode',
    'LatitudeMeasure',
    'LongitudeMeasure',
    # activity
    'ActivityMediaName',
    'ActivityMediaSubdivisionName',
    'ActivityStartDate',
    'ActivityStartTime/Time',
    'ActivityStartTime/TimeZoneCode',
    'ActivityEndDate',
    'ActivityEndTime/Time',
    'ActivityEndTime/TimeZoneCode',
    # measure
    'Measure',
    'Limit',
    'MeasureGroup',
    # result
    'ResultMeasureValue',
    'ResultMeasure/MeasureUnitCode',
    'ResultCommentText',
]
results = results[column_order]

# Save to CSV

In [21]:
# Write the joined table as a bz2-compressed CSV, without the index column
# and with every field quoted.
results.to_csv(
    '../data/CA-results-join.csv.bz2',
    index=False,
    quoting=csv.QUOTE_ALL,
    compression='bz2',
)