In [None]:
import csv
from datetime import datetime, timedelta
import requests
import time

# Setup for NOAA API

In [None]:
# Load the API token from a local file (kept out of the notebook itself)
# and build the headers that are sent with every request.
with open('noaa.secret') as fp:
    TOKEN = fp.read().strip()
headers = dict(token=TOKEN, Accept='application/json')

In [None]:
# Handle pagination
def get_all(endpoint, **params):
    """Fetch every record from a paginated endpoint of the NOAA CDO v2 API.

    Pages of up to 1000 records are requested one after another, passing the
    number of records collected so far as ``offset``, until the total
    reported in the response metadata has been reached.

    Args:
        endpoint: Path appended to the API base URL, e.g. ``'/datasets'``.
        **params: Extra query-string parameters sent with every request.

    Returns:
        A list holding the ``results`` entries of all pages, in the order
        they were received. Empty if the response carries no results.

    Raises:
        requests.HTTPError: If any page comes back with an error status.
    """
    results = []
    while True:
        r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2' + endpoint, headers=headers,
                         params=dict(params,
                                     limit='1000',
                                     offset=len(results)))
        r.raise_for_status()
        obj = r.json()
        # Tolerate a body without 'results'/'metadata' (for instance an empty
        # JSON object when nothing matches the query) instead of raising KeyError.
        page = obj.get('results', [])
        results.extend(page)
        resultset = obj.get('metadata', {}).get('resultset', {})
        count = int(resultset.get('count', len(results)))
        # An empty page means no further progress is possible; stop rather
        # than requesting the same offset forever.
        if not page or len(results) >= count:
            break
        print("{}/{}...".format(len(results), count))
        # Pause between pages to stay gentle on the API
        time.sleep(2)
    return results

# List all datasets

In [None]:
# Download the full dataset catalogue, then print one line per dataset:
# the id left-aligned in a 12-character column, followed by the name.
datasets = get_all('/datasets')
for dataset in datasets:
    dataset_id, dataset_name = dataset['id'], dataset['name']
    print('{: <12} {}'.format(dataset_id, dataset_name))

In [None]:
# Dataset used throughout the rest of the notebook
DATASET_ID = 'GHCND'
# Display the catalogue entry for that dataset (IndexError if it is absent)
list(filter(lambda entry: entry['id'] == DATASET_ID, datasets))[0]

In [None]:
# Peek at the column names of the existing city/station CSV.
# The file is opened in a `with` block so the handle is closed again, and with
# newline='' as the csv module recommends for files handed to a reader.
with open('discovery/noaa/datamart_noaa_discovery/noaa_city_stations.csv', newline='') as city_stations_fp:
    # `fieldnames` reads the header row on first access, so it has to be
    # evaluated while the file is still open.
    city_station_fields = csv.DictReader(city_stations_fp).fieldnames
city_station_fields

# List all datatypes

In [None]:
# Fetch the data types available for the chosen dataset and list them
# ordered by id: id left-aligned in a 5-character column, then the name.
datatypes = get_all('/datatypes', datasetid=DATASET_ID)
datatypes_by_id = sorted(datatypes, key=lambda t: t['id'])
for type_ in datatypes_by_id:
    print('{: <5} {}'.format(type_['id'], type_['name']))

In [None]:
# Data type queried in the data request further down
DATATYPE = 'AWND'
# Display the metadata entry for that data type (IndexError if it is absent)
list(filter(lambda entry: entry['id'] == DATATYPE, datatypes))[0]

# List all locations of type 'CITY'

In [None]:
# Fetch every location of category 'CITY' for the chosen dataset
# (get_all follows the pagination), then display the raw list.
locations = get_all('/locations', datasetid=DATASET_ID, locationcategoryid='CITY')
locations

In [None]:
# Number of city locations returned by the API
len(locations)

# Find the best station for each city

In [None]:
# Maps a location id to the station chosen for it. It is created in its own
# cell, so the loop that fills it can be re-run without discarding entries
# that were already collected.
location_station = {}

In [None]:
# For every city, pick the station with the longest history among those that
# are still reporting, and record it in `location_station`.
now = datetime.today()
# A station only qualifies if its latest data is at most two days old
cutoff = now - timedelta(days=2)
for location in locations:
    if location['id'] in location_station:
        # Already resolved (e.g. by an earlier, interrupted run of this cell)
        continue
    if len(location_station) % 20 == 0:
        print("{}/{}...".format(len(location_station), len(locations)))
    # Small pause between cities to stay gentle on the API
    time.sleep(0.5)
    stations = get_all('/stations', datasetid=DATASET_ID, locationid=location['id'])
    # Find best coverage: the qualifying station with the earliest 'mindate'
    mindate = None
    minstation = None
    for station in stations:
        smax = datetime.strptime(station['maxdate'], '%Y-%m-%d')
        if smax < cutoff:
            # Too old
            continue
        smin = datetime.strptime(station['mindate'], '%Y-%m-%d')
        if mindate is None or mindate > smin:
            mindate = smin
            minstation = station
    if minstation is None:
        # No station qualified: leave this city out instead of recording an
        # arbitrary (or stale, from the previous city) station for it.
        print("no recent station for %r" % (location['id'],))
        continue
    # Store the selected station, not the last one the inner loop looked at
    location_station[location['id']] = minstation

In [None]:
# Number of locations that currently have a station recorded
len(location_station)

In [None]:
# Write one CSV row per selected station, joined with the id and name of the
# city it was chosen for.
with open('stations.csv', 'w', newline='\n') as fp:
    writer = csv.writer(fp)
    writer.writerow(['station_id', 'station_name', 'latitude', 'longitude', 'city_id', 'city_name'])
    for location_id, station in location_station.items():
        # Look the city up by id; anything other than exactly one match is
        # reported, and the first match is used.
        location = list(filter(lambda candidate: candidate['id'] == location_id, locations))
        if len(location) != 1:
            print("locations for %r:\n%r\n" % (location_id, location))
        location = location[0]
        station_columns = [station['id'], station['name'], station['latitude'], station['longitude']]
        city_columns = [location['id'], location['name']]
        writer.writerow(station_columns + city_columns)

# Get data for New York, JFK airport

In [None]:
# Get data for New York, JFK airport
# NOTE(review): no `limit` is passed here, so the API's default page size
# applies — confirm the whole date range fits in a single page.
jfk_params = {
    'datasetid': DATASET_ID,
    'datatypeid': DATATYPE,
    'stationid': 'GHCND:USW00094789',
    'startdate': '2018-01-01',
    'enddate': '2018-02-15',
}
r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data',
                 headers=headers, params=jfk_params)
r.raise_for_status()
# Display the decoded JSON response as the cell output
r.json()