In [1]:
from urllib import request

import json
import os
import random

In [2]:
# Mapbox access token, read from the environment; this lookup raises KeyError
# when MAPBOX_ACCESS_TOKEN is not set.
# NOTE(review): no other cell visible in this notebook references the token --
# confirm it is still needed here.
MAPBOX_ACCESS_TOKEN = os.environ['MAPBOX_ACCESS_TOKEN']

# Two-letter codes queried one by one: the 50 states plus DC.
STATE_CODES = (
    'AL AK AZ AR CA CO CT DC DE FL GA '
    'HI ID IL IN IA KS KY LA ME MD '
    'MA MI MN MS MO MT NE NV NH NJ '
    'NM NY NC ND OH OK OR PA RI SC '
    'SD TN TX UT VT VA WA WV WI WY'
).split()

# File the collected site data is written to at the end of the notebook.
filepath = 'sites.json'

In [3]:
# Container for the site records: one independent, initially empty list per
# category key.
sites_dict = {category: [] for category in ('unchecked', 'water', 'no_water')}

In [4]:
def get_sites_for_state(state_code):
    """Fetch the USGS instantaneous-values time series for one state.

    Args:
        state_code: Value substituted into the ``stateCd`` query parameter
            (the notebook passes two-letter codes such as ``'AL'``).

    Returns:
        The object stored under ``['value']['timeSeries']`` in the decoded
        JSON response.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        KeyError: If the decoded JSON lacks ``value`` or ``timeSeries``.
    """
    # HTTPS keeps the request off plaintext HTTP.
    url = 'https://waterservices.usgs.gov/nwis/iv/?format=json&stateCd={}'.format(state_code)

    # The context manager closes the HTTP response (and its socket) even when
    # reading or decoding fails; previously the response was never closed.
    with request.urlopen(url) as response:
        payload = json.loads(response.read())

    return payload['value']['timeSeries']


def parse_site_data(site):
    """Pull the identifying fields out of one time-series record.

    Args:
        site: Mapping with a ``sourceInfo`` entry that holds ``siteName``,
            a ``siteCode`` sequence (the first element's ``value`` is used)
            and ``geoLocation`` -> ``geogLocation`` coordinates.

    Returns:
        The tuple ``(site_code, name, lon, lat)``.
    """
    source = site['sourceInfo']
    code = source['siteCode'][0]['value']
    location = source['geoLocation']['geogLocation']

    return code, source['siteName'], location['longitude'], location['latitude']

In [5]:
# Gather every distinct site across all states.  Records are accumulated in a
# fresh list and stored in sites_dict only after the loop finishes, so
# re-running this cell replaces the previous results instead of appending a
# second copy of every site.
total_sites = 0
unique_sites = set()
unchecked_sites = []

for state_code in STATE_CODES:
    sites = get_sites_for_state(state_code)
    num_sites_for_state = 0

    for site in sites:
        site_code, name, lon, lat = parse_site_data(site)

        # Keep only the first record seen for each site code; later records
        # with the same code (in this state or another) are skipped.
        if site_code in unique_sites:
            continue

        unique_sites.add(site_code)
        num_sites_for_state += 1

        unchecked_sites.append({
            'site_code': site_code,
            'name': name,
            'lon': lon,
            'lat': lat
        })

    total_sites += num_sites_for_state
    print('Data recorded for {} {} sites.'.format(num_sites_for_state, state_code))

sites_dict['unchecked'] = unchecked_sites

print('{} total sites.'.format(total_sites))

Data recorded for 224 AL sites.
Data recorded for 269 AK sites.
Data recorded for 425 AZ sites.
Data recorded for 329 AR sites.
Data recorded for 1154 CA sites.
Data recorded for 754 CO sites.
Data recorded for 105 CT sites.
Data recorded for 7 DC sites.
Data recorded for 64 DE sites.
Data recorded for 1300 FL sites.
Data recorded for 579 GA sites.
Data recorded for 281 HI sites.
Data recorded for 453 ID sites.
Data recorded for 513 IL sites.
Data recorded for 483 IN sites.
Data recorded for 307 IA sites.
Data recorded for 303 KS sites.
Data recorded for 269 KY sites.
Data recorded for 320 LA sites.
Data recorded for 153 ME sites.
Data recorded for 263 MD sites.
Data recorded for 224 MA sites.
Data recorded for 351 MI sites.
Data recorded for 312 MN sites.
Data recorded for 244 MS sites.
Data recorded for 533 MO sites.
Data recorded for 362 MT sites.
Data recorded for 307 NE sites.
Data recorded for 472 NV sites.
Data recorded for 79 NH sites.
Data recorded for 474 NJ sites.
Data recor

In [6]:
# Randomise the order of the unchecked sites: sampling as many items as the
# list holds, without replacement, yields a new list with every entry in
# random order.
unchecked_pool = sites_dict['unchecked']
sites_dict['unchecked'] = random.sample(unchecked_pool, len(unchecked_pool))

In [7]:
# Display the collected data for a visual check (this renders the whole dict).
sites_dict

{'unchecked': [{'site_code': '14306900',
   'name': 'BIG CREEK NEAR ROOSEVELT BEACH, OREG.',
   'lon': -124.0665067,
   'lat': 44.167899},
  {'site_code': '15300300',
   'name': 'ILIAMNA R NR PEDRO BAY AK',
   'lon': -153.8468866,
   'lat': 59.7579499},
  {'site_code': '01407760',
   'name': 'Jumping Brook near Neptune City NJ',
   'lon': -74.0658333,
   'lat': 40.2033333},
  {'site_code': '01361000',
   'name': 'KINDERHOOK CREEK AT ROSSMAN NY',
   'lon': -73.7444444,
   'lat': 42.33105556},
  {'site_code': '02312600',
   'name': 'WITHLACOOCHEE RIVER NEAR FLORAL CITY, FL',
   'lon': -82.2200892,
   'lat': 28.74359973},
  {'site_code': '05438137',
   'name': 'UNNAMED TR TO SB KISHWAUKEE C NR HUNTLEY, IL',
   'lon': -88.4222222,
   'lat': 42.195},
  {'site_code': '02444161',
   'name': 'TOMBIGBEE RIVER BEL BEVIL L&D NR PICKENSVILLE, AL.',
   'lon': -88.2886454,
   'lat': 33.21040097},
  {'site_code': '08330830',
   'name': 'RIO GRANDE AT VALLE DE ORO, NM',
   'lon': -106.6865556,
   'lat

In [8]:
# Number of site records stored under the 'unchecked' key.
len(sites_dict.get('unchecked'))

20495

In [9]:
# Persist the collected sites as JSON.  Plain 'w' is sufficient because this
# handle is only written to ('w+' additionally requested read access), and the
# explicit encoding keeps the file independent of the platform's default text
# encoding.
with open(filepath, 'w', encoding='utf-8') as fp:
    json.dump(sites_dict, fp)