In [1]:
# import libraries
%matplotlib inline
import numpy as np
#import csv
import matplotlib.pyplot as plt
import pandas as pd
import glob
import ulmo
import os
import scipy.spatial

In [2]:
ghcn = pd.read_fwf('data/ghcnd-stations.txt', colspecs = [(0,11), (12,19), (21,29), (31,36),(38,40), (41,70), (72,74),(76,78),(80,85)], header = None) 
colnames = ['GHCN ID', 'lat', 'lon', 'elevation', 'state', 'name', 'gsn flag', 'HCN/CRN FLAG', 'WMO ID']
ghcn.columns = colnames

# append the brightness index 
BI = np.load('data/brightnessGHCN.npy')
ghcn['Brightness'] = BI
# from http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
# FORMAT OF "ghcnd-stations.txt"
#
# ------------------------------
# Variable   Columns   Type
# ------------------------------
# ID            1-11   Character
# LATITUDE     13-20   Real
# LONGITUDE    22-30   Real
# ELEVATION    32-37   Real
# STATE        39-40   Character
# NAME         42-71   Character
# GSN FLAG     73-75   Character
# HCN/CRN FLAG 77-79   Character
# WMO ID       81-85   Character
# ------------------------------

# These variables have the following definitions:

# ID         is the station identification code.  Note that the first two
#            characters denote the FIPS  country code, the third character 
#            is a network code that identifies the station numbering system 
#            used, and the remaining eight characters contain the actual 
#            station ID. 

#            See "ghcnd-countries.txt" for a complete list of country codes.
# 	   See "ghcnd-states.txt" for a list of state/province/territory codes.

#            The network code  has the following five values:

#            0 = unspecified (station identified by up to eight 
# 	       alphanumeric characters)
# 	   1 = Community Collaborative Rain, Hail,and Snow (CoCoRaHS)
# 	       based identification number.  To ensure consistency with
# 	       with GHCN Daily, all numbers in the original CoCoRaHS IDs
# 	       have been left-filled to make them all four digits long. 
# 	       In addition, the characters "-" and "_" have been removed 
# 	       to ensure that the IDs do not exceed 11 characters when 
# 	       preceded by "US1". For example, the CoCoRaHS ID 
# 	       "AZ-MR-156" becomes "US1AZMR0156" in GHCN-Daily
#            C = U.S. Cooperative Network identification number (last six 
#                characters of the GHCN-Daily ID)
# 	   E = Identification number used in the ECA&D non-blended
# 	       dataset
# 	   M = World Meteorological Organization ID (last five
# 	       characters of the GHCN-Daily ID)
# 	   N = Identification number used in data supplied by a 
# 	       National Meteorological or Hydrological Center
# 	   R = U.S. Interagency Remote Automatic Weather Station (RAWS)
# 	       identifier
# 	   S = U.S. Natural Resources Conservation Service SNOwpack
# 	       TELemtry (SNOTEL) station identifier
#            W = WBAN identification number (last five characters of the 
#                GHCN-Daily ID)

# LATITUDE   is latitude of the station (in decimal degrees).

# LONGITUDE  is the longitude of the station (in decimal degrees).

# ELEVATION  is the elevation of the station (in meters, missing = -999.9).


# STATE      is the U.S. postal code for the state (for U.S. stations only).

# NAME       is the name of the station.

# GSN FLAG   is a flag that indicates whether the station is part of the GCOS
#            Surface Network (GSN). The flag is assigned by cross-referencing 
#            the number in the WMOID field with the official list of GSN 
#            stations. There are two possible values:

#            Blank = non-GSN station or WMO Station number not available
#            GSN   = GSN station 

# HCN/      is a flag that indicates whether the station is part of the U.S.
# CRN FLAG  Historical Climatology Network (HCN).  There are three possible 
#           values:

#            Blank = Not a member of the U.S. Historical Climatology 
# 	           or U.S. Climate Reference Networks
#            HCN   = U.S. Historical Climatology Network station
# 	   CRN   = U.S. Climate Reference Network or U.S. Regional Climate 
# 	           Network Station

# WMO ID     is the World Meteorological Organization (WMO) number for the
#            station.  If the station has no WMO number (or one has not yet 
# 	   been matched to this station), then the field is blank.

# --------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
giss = pd.read_fwf('data/v3.temperature.inv.txt',skiprows = 39, header = None,
                  colspecs=[(0,3),(3,8),(8,11), (12,44),(44,49), (52,58), (58,63), (63,67), (67,68), (69,73), (73,75), (75, 77), (78,79), (79,81), (81,82),(82,84), (84,100), (100,102), (103,106)])
colnames = ['icc country code', 'WMO ID', '3 digit modifier', 'name','lat', 'lon', 'elevation', 'TELe', 'P', 'Pop', 'Tp', 'V', 'Lo', 'Co', 'Airport', 'ds', 'Vege', 'bi', 'BI']
giss.columns = colnames

# LEGEND  
# ======
# icc  =3 digit country code; the first digit represents WMO region/continent
# WMO_#=5 digit WMO station number
# ...  =3 digit modifier; 000 means the station is probably the WMO
#       station; 001, etc. mean the station is near that WMO station
# Name =30 character station name
# Lat  =latitude in degrees, negative = South of Equator
# Lon  =longitude in degrees, negative = West of Greenwich (England)
# Elev =station elevation in meters, missing is -999
# TEle =station elevation interpolated from TerrainBase gridded data set
# P    =R if rural (not associated with a town of >10,000 population)
#       S if associated with a small town (10,000-50,000 population)
#       U if associated with an urban area (>50,000 population)
# Pop  =population of the small town or urban area in 1000s
#       If rural, no analysis:  -9.
# Tp   =general topography around the station:  FL flat; HI hilly,
#       MT mountain top; MV mountainous valley or at least not on the top
#       of a mountain.
# V    =general vegetation near the station based on Operational
#       Navigation Charts;  MA marsh; FO forested; IC ice; DE desert;
#       CL clear or open;  xx information not provided
# Lo   =CO if station is within 30 km from the coast
#       LA if station is next to a large (> 25 km**2) lake
#       no if neither of the above
#       Note: Stations which are both CO and LA will be marked CO
# Co   =distance in km to the coast if Lo=CO, else -9
# A    =A if the station is at an airport; else x
# ds   =distance in km from the airport to its associated
#       small town or urban center (not relevant for rural airports
#       or non airport stations in which case ds=-9)
# Vege =gridded vegetation for the 0.5x0.5 degree grid point closest
#       to the station from a gridded vegetation data base. 16 characters.
# bi   =brightness index    A=dark B=dim C=bright   (comment added by R.Ruedy)
# BI   =brightness index    0=dark -> 256 =bright   (based on satellite night light data)

see: http://stackoverflow.com/questions/35296935/python-calculate-lots-of-distances-quickly

In [3]:
# subset the GHCN station list with the list of available stations
currentstations = ulmo.ncdc.ghcn_daily.get_stations(start_year=1985, end_year = 2016, elements = ['TMIN', 'TMAX', 'AWND'], as_dataframe=True, update=False)
currentGHCNstations = np.intersect1d(currentstations.id, ghcn.index.values) #ghcn['GHCN ID'].values)
ghcnSubset = ghcn.set_index('GHCN ID').loc[currentstations.id.values]
# at this point, ghcn must have the station id set as the index 

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [4]:
#ghcn = ghcn[ghcn.index.str.contains('US')] # subset to US stations

In [5]:
ghcn_noairport = ghcn[~ghcn.name.str.contains('INTL')]
ghcn_noairport = ghcn_noairport[~ghcn_noairport.name.str.contains(' AP')]
ghcn_noairport = ghcn_noairport[~ghcn_noairport.name.str.contains('AIRPORT')]

In [7]:
ghcn = ghcn_noairport.set_index('GHCN ID').loc[currentstations.id.values]
ghcn.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':


Unnamed: 0_level_0,lat,lon,elevation,state,name,gsn flag,HCN/CRN FLAG,WMO ID,Brightness
GHCN ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AE000041196,25.333,55.517,34.0,,SHARJAH INTER. AIRP,GS,,41196.0,63.0
AEM00041194,,,,,,,,,
AEM00041217,,,,,,,,,
AEM00041218,,,,,,,,,
AF000040930,35.317,69.017,3366.0,,NORTH-SALANG,GS,,40930.0,4.0


In [5]:
# compute distances between all stations
#tree = scipy.spatial.cKDTree(giss[['lon', 'lat']].values, leafsize=100)
# query the closest point 
#closestInd = tree.query(giss[['lon', 'lat']].values[11,:], k =2, distance_upper_bound=6)[1][1]

In [19]:
atlas = pd.read_csv('data/sampleAtlas.csv') # derived  from http://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-populated-places/
atlas = pd.read_csv('data/world_cities.csv')
tree = scipy.spatial.cKDTree(ghcn[['lon', 'lat']].values, leafsize=100)
#import sys
#sys.path.append('/../cityheat/Bmore/2015/')
#sys.path.append('/Users/annascott2/mountpoint/cityheat/Bmore/2015/')
#import spatialfunctions
atlas = atlas[atlas['country'].str.contains('United States')] # do non-US countries
atlas = atlas[atlas['pop']> 500000]
atlas.shape

(70, 9)

In [None]:
pd.read_csv('data/world_cities.csv')

In [20]:
70-54

16

In [8]:
#atlas = pd.read_csv('data/world_cities.csv')

In [17]:
atlas.shape

(70, 9)

In [10]:
#atlas = atlas[atlas['country'].str.contains('United States')].set_index('city')

In [107]:
brightness_threshold = 25 # this is the urban/rural threshold delimiter, difference between urban and rural
#outFile = 'worldghcnpairs_stationlengths.csv'
outFile = 'USpairs2005-2015.csv'
pairs = []
frames = []
startdate = np.datetime64('2005-06-01')
enddate = np.datetime64('2015-08-30')


In [14]:
atlas

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
6321,Mesa,Mesa,33.423915,-111.736084,762217.5,United States of America,US,USA,Arizona
6393,Arlington,Grand Prairie,32.684761,-97.020238,545107.5,United States of America,US,USA,Texas
6571,Irvine,Irvine,33.680411,-117.829950,1611303.5,United States of America,US,USA,California
6689,New Haven,New Haven,41.330383,-72.900005,707883.0,United States of America,US,USA,Connecticut
6707,Fort Lauderdale,Fort Lauderdale,26.136065,-80.141786,1103781.5,United States of America,US,USA,Florida
6847,Long Beach,Long Beach,33.786967,-118.158044,1249195.5,United States of America,US,USA,California
6870,Hartford,Hartford,41.770020,-72.679967,518509.5,United States of America,US,USA,Connecticut
6871,Providence,Providence,41.821102,-71.414980,663726.5,United States of America,US,USA,Rhode Island
6872,Birmingham,Birmingham,33.530006,-86.824995,670142.0,United States of America,US,USA,Alabama
6875,St. Petersburg,St. Petersburg,27.770539,-82.679383,523314.5,United States of America,US,USA,Florida


In [None]:
# Ask how many urban stations are around 
brightness_threshold = 25
for i in range(1, atlas.shape[0]): #range(31,32): # for baltimore
    lat = atlas.iloc[i]['lat']
    lon = atlas.iloc[i]['lng']
    city = atlas.iloc[i]['city_ascii']
    print(city)
    closeststations = tree.query([lon,lat], k =15, distance_upper_bound=1)

    # Make sure the closest stations are within a finite distance
    closestInds = closeststations[1][~np.isinf(closeststations[0])]
    # make sure that there are stations 
    if closestInds.shape[0] > 0 : 
        urban = ghcn.Brightness[closestInds].argmax()


In [115]:
#for i in range(793, atlas.shape[0]): 

data_availability_thresh = .75
for i in range(1, atlas.shape[0]): #range(31,32): # for baltimore
    lat = atlas.iloc[i]['lat']
    lon = atlas.iloc[i]['lng']
    city = atlas.iloc[i]['city_ascii']
    print(city)
    closeststations = tree.query([lon,lat], k =15, distance_upper_bound=1)

    # Make sure the closest stations are within a finite distance
    closestInds = closeststations[1][~np.isinf(closeststations[0])]
    # make sure that there are stations 
    if closestInds.shape[0] > 0 : 
        urban = ghcn.Brightness[closestInds].argmax()
        rural = ghcn.Brightness[closestInds].argmin()
        #         if urban != rural: 
        urban_data = ulmo.ncdc.ghcn_daily.get_data(urban, as_dataframe=True, update=False)
        rural_data = ulmo.ncdc.ghcn_daily.get_data(rural, as_dataframe=True, update=False)

        number_urban_stations = (ghcn.Brightness[closestInds] >= brightness_threshold).sum() # number of urban stations is equivalent to brightness over 30
        number_rural_stations = (ghcn.Brightness[closestInds] < brightness_threshold).sum() # number of rural stations is equivalent to brightness over 30
        delta_brightness = 30
        break_value = -1

        iii = 1
#        while ('TMAX' not in urban_data.keys() or 'TMIN' not in urban_data.keys()) or (urban_data['TMAX'].index[0].to_timestamp().to_datetime() > startdate) or (urban_data['TMAX'].index[-1].to_timestamp().to_datetime() < enddate) : 
        while ('TMAX' not in urban_data.keys()) or ('TMIN' not in urban_data.keys()) or (urban_data['TMAX'].index[0].to_timestamp() > startdate) or (urban_data['TMAX'].index[-1].to_timestamp() < enddate) or ((~np.isnan(urban_data['TMIN'].set_index(urban_data['TMIN'].index.to_timestamp())[startdate:enddate].value.values.astype(float))).sum()/(enddate-startdate).astype(float) < data_availability_thresh) :  
        # check if there's enough suitable urban stations to check the next one, ignoring the current one 
            if iii > number_urban_stations-1 : 
                print('no suitable urban station for %s'%city) 
                urban = break_value
                break 
            # find the next brightestcity
            print 'finding the next urban station for %s'%city
            urban = ghcn.Brightness[closestInds][(-ghcn.Brightness[closestInds].values).argsort()].index[iii]
            urban_data = ulmo.ncdc.ghcn_daily.get_data(urban, as_dataframe=True, update=False)
            iii = iii+1    

        # if no urban station found, don't bother to pair with rural 
        if urban != break_value : 
            # check rural data station different from urban has TMIN, TMAX, a long enough record, and is rural enough compared to urban (delta_brightness)
            iii = 1
            enlarge_circle = 0 # number of times we can try increasing search radius
            while (urban ==rural) or ('TMAX' not in rural_data.keys() and 'TMIN' not in rural_data.keys()) or (rural_data['TMIN'].index[0].to_timestamp() > startdate) or (rural_data['TMIN'].index[-1].to_timestamp() < enddate) or (ghcn.Brightness[urban]-ghcn.Brightness[rural]<delta_brightness) or ((~np.isnan(rural_data['TMIN'].set_index(rural_data['TMIN'].index.to_timestamp())[startdate:enddate].value.values.astype(float))).sum()/((enddate-startdate).astype(float)) < data_availability_thresh): 
                print 'finding the next rural station for %s'%city

                # if we can't find any suitable stations, try enlarging the search radius, but only try this once
                if iii > number_rural_stations-1 : 
                    if enlarge_circle > 0 : 
                        rural = break_value
                        break #break when we've already used up all the search
                    else: 
                        # find 25 closest stations within 1.5 degree circle
                        closeststations1 = tree.query([lon,lat], k =35, distance_upper_bound=1.5 ) 
                        # eliminate the ones we've already search 
                        new_stations = np.setdiff1d(closeststations1[1], closeststations[1])
                        # Make sure that we've actually found new stations, otherwise break
                        if new_stations.shape[0] >0 : 
                            # Make sure the closest stations are within a finite distance
                            closestInds1 = closeststations1[1][~np.isinf(closeststations1[0])]
                            # Eliminate repeat stations
                            closestInds = np.intersect1d(new_stations, closestInds1)
                            # reset counter, number of rural stations
                            iii = 0
                            number_rural_stations = (ghcn.Brightness[closestInds] < brightness_threshold).sum()
                        else: 
                            rural = break_value
                            break
                        # reset list of rural stations
                    print 'increasing search radius'
                    enlarge_circle = enlarge_circle+1
                    
                # find the next dimmest
                rural = ghcn.Brightness[closestInds][(ghcn.Brightness[closestInds].values).argsort()].index[iii]
                rural_data = ulmo.ncdc.ghcn_daily.get_data(rural, as_dataframe=True, update=False)
                print rural, rural_data['TMIN'].index[0], rural_data['TMIN'].index[-1], ghcn.Brightness[rural], rural_data.keys(), (~np.isnan(rural_data['TMIN'].set_index(rural_data['TMIN'].index.to_timestamp())[startdate:enddate].value.values.astype(float))).sum()/(enddate-startdate).astype(float)
                iii = iii+1
        else: 
            rural = break_value
    # else condition for not being any stations
    else: 
        urban = break_value 
        rural = break_value
            
    # save out if we've found a good pairing
    if (urban != break_value) and (rural != break_value) : 
        print('Found a pair for %s'%city)
        print('Urban start/end is: %s to %s'%(urban_data['TMAX'].index[0].to_timestamp().to_datetime(),  urban_data['TMAX'].index[-1].to_timestamp().to_datetime()))
        print('Rural start/end is: %s to %s'%(rural_data['TMAX'].index[0].to_timestamp().to_datetime(),  rural_data['TMAX'].index[-1].to_timestamp().to_datetime()))
        frames.append([city, urban, #['GHCN ID'],
                       ghcn.loc[urban].lat, 
                       ghcn.loc[urban].lon, 
                       ghcn.loc[urban].Brightness,
                       urban_data['TMAX'].index[0].to_timestamp().to_datetime(), urban_data['TMAX'].index[-1].to_timestamp().to_datetime(), 
                       rural, #['GHCN ID'],
                       ghcn.loc[rural].lat, 
                       ghcn.loc[rural].lon, 
                       ghcn.loc[rural].Brightness, 
                        rural_data['TMAX'].index[0].to_timestamp().to_datetime(), rural_data['TMAX'].index[-1].to_timestamp().to_datetime(),])
    else : 
        print 'No pair found for %s'%city
        frames.append([city, np.nan, #['GHCN ID'],
                       np.nan,
                       np.nan,
                       np.nan, 
                       np.nan, #['GHCN ID'],
                       np.nan,
                       np.nan,
                       np.nan, np.nan, np.nan, np.nan, np.nan])

    # periodiically save out results
    if i%50 ==0: 
        pairs = pd.DataFrame(frames, columns = ['City', 'Urban station', 
                                          'Urban Lat', 'Urban Lon','Urban brightness', 'Urban start', 'Urban end',
                                          'Rural station', 
                                          'Rural Lat', 'Rural Lon','Rural brightness', 'Rural start', 'Rural end'])

        pairs.to_csv(outFile)
        
# pairs = pd.DataFrame(frames, columns = ['City', 'Urban station', 
#                                   'Urban Lat', 'Urban Lon','Urban brightness', 
#                                   'Rural station', 
#                                   'Rural Lat', 'Rural Lon','Rural brightness'])

pairs = pd.DataFrame(frames, columns = ['City', 'Urban station', 
                                          'Urban Lat', 'Urban Lon','Urban brightness', 'Urban start', 'Urban end',
                                          'Rural station', 
                                          'Rural Lat', 'Rural Lon','Rural brightness', 'Rural start', 'Rural end'])

pairs.to_csv(outFile)
pairs[(~np.isnan(pairs['Urban Lat']))].to_csv(outFile)

Grand Prairie
finding the next rural station for Grand Prairie
increasing search radius
USR0000TLBJ 2000-01-01 2016-09-30 8.0 ['TAVG', 'TMAX', 'TMIN'] 1.00026723677
Found a pair for Grand Prairie
Urban start/end is: 1893-01-01 00:00:00 to 2016-09-30 00:00:00
Rural start/end is: 2000-01-01 00:00:00 to 2016-09-30 00:00:00
Irvine
finding the next urban station for Irvine
finding the next rural station for Irvine
USR0000CFRE 1991-07-01 2016-11-30 18.0 ['TAVG', 'TMAX', 'TMIN'] 1.00026723677
Found a pair for Irvine
Urban start/end is: 1916-05-01 00:00:00 to 2016-11-30 00:00:00
Rural start/end is: 1991-07-01 00:00:00 to 2016-11-30 00:00:00
New Haven
finding the next urban station for New Haven
finding the next urban station for New Haven
finding the next urban station for New Haven
finding the next urban station for New Haven
finding the next urban station for New Haven
finding the next urban station for New Haven
finding the next urban station for New Haven
finding the next urban station for

In [118]:
outFile

'USpairs2005-2015.csv'

In [116]:
pairs

Unnamed: 0,City,Urban station,Urban Lat,Urban Lon,Urban brightness,Urban start,Urban end,Rural station,Rural Lat,Rural Lon,Rural brightness,Rural start,Rural end
0,Mesa,USC00022782,33.419,-111.644,63.0,2002-08-01,2016-10-31,USC00020288,33.462,-111.481,13.0,1987-05-01,2016-10-31
1,Grand Prairie,USC00410337,32.757,-97.073,63.0,1893-01-01 00:00:00,2016-09-30,USR0000TLBJ,33.172,-97.375,8.0,2000-01-01,2016-09-30
2,Irvine,USC00047888,33.744,-117.866,63.0,1916-05-01,2016-11-30,USR0000CFRE,33.808,-117.711,18.0,1991-07-01,2016-11-30
3,New Haven,USC00307134,40.961,-72.715,37.0,1938-03-01,2016-09-30,USC00065445,41.972,-73.220,7.0,1884-11-01 00:00:00,2016-12-31
4,Fort Lauderdale,USC00083168,26.140,-80.106,63.0,1952-04-01,2016-09-30,USR0000FCAC,25.390,-80.680,4.0,1999-12-01,2016-11-30
5,Long Beach,USW00023129,33.811,-118.146,63.0,1949-01-01,2016-12-31,USR0000CCHI,34.331,-118.030,6.0,1986-02-01,2016-11-30
6,Hartford,USW00014752,41.736,-72.650,62.0,1920-01-01,2016-12-31,USC00060227,41.841,-73.008,15.0,1990-08-01,2016-10-31
7,Providence,USC00379423,41.984,-71.490,60.0,1967-11-01,2016-10-31,USC00375270,41.856,-71.733,12.0,1974-10-01,2016-10-31
8,Birmingham,USC00013781,33.272,-86.833,61.0,2003-12-01,2016-10-31,USC00010505,33.452,-87.357,7.0,1957-02-01,2016-12-31
9,St. Petersburg,USC00087205,28.023,-82.142,63.0,1892-09-01 00:00:00,2016-10-31,USC00083153,27.570,-82.137,4.0,2001-08-01,2016-09-30


(70, 13)

In [26]:
outFile

'USpairs_stationlengths_stationavailability75_noairport.csv'

In [172]:
pairs[pairs.City.str.contains('Provid')][['Urban Lon', 'Urban Lat']].values[0]

array([-71.49 ,  41.984])

In [161]:
np.array([[-71.414979700000004, 41.821102310000001],
         [-71.49 ,  41.984]])

array([[-71.4149797 ,  41.82110231],
       [-71.49      ,  41.984     ]])

In [175]:
pairs

Unnamed: 0,City,Urban station,Urban Lat,Urban Lon,Urban brightness,Urban start,Urban end,Rural station,Rural Lat,Rural Lon,Rural brightness,Rural start,Rural end
0,Mesa,USC00022782,33.419,-111.644,63.0,2002-08-01,2016-10-31,USC00020288,33.462,-111.481,13.0,1987-05-01,2016-10-31
1,Grand Prairie,USC00410337,32.757,-97.073,63.0,1893-01-01 00:00:00,2016-09-30,USR0000TLBJ,33.172,-97.375,8.0,2000-01-01,2016-09-30
2,Irvine,USC00047888,33.744,-117.866,63.0,1916-05-01,2016-11-30,USR0000CFRE,33.808,-117.711,18.0,1991-07-01,2016-11-30
3,New Haven,USC00307134,40.961,-72.715,37.0,1938-03-01,2016-09-30,USC00065445,41.972,-73.220,7.0,1884-11-01 00:00:00,2016-12-31
4,Fort Lauderdale,USC00083168,26.140,-80.106,63.0,1952-04-01,2016-09-30,USR0000FCAC,25.390,-80.680,4.0,1999-12-01,2016-11-30
5,Long Beach,USW00023129,33.811,-118.146,63.0,1949-01-01,2016-12-31,USR0000CCHI,34.331,-118.030,6.0,1986-02-01,2016-11-30
6,Hartford,USW00014752,41.736,-72.650,62.0,1920-01-01,2016-12-31,USC00060227,41.841,-73.008,15.0,1990-08-01,2016-10-31
7,Providence,USC00379423,41.984,-71.490,60.0,1967-11-01,2016-10-31,USC00375270,41.856,-71.733,12.0,1974-10-01,2016-10-31
8,Birmingham,USC00013781,33.272,-86.833,61.0,2003-12-01,2016-10-31,USC00010505,33.452,-87.357,7.0,1957-02-01,2016-12-31
9,St. Petersburg,USC00087205,28.023,-82.142,63.0,1892-09-01 00:00:00,2016-10-31,USC00083153,27.570,-82.137,4.0,2001-08-01,2016-09-30


In [187]:
# compute distances of urban stations and city center
u_distances = np.ones(pairs.shape[0])*np.nan
r_distances = np.ones(pairs.shape[0])*np.nan
for i in range(0, pairs.shape[0]): #range(31,32): # for baltimore
    lat = atlas.iloc[i]['lat']
    lon = atlas.iloc[i]['lng']
    city = atlas.iloc[i]['city_ascii']
    urban_distance = scipy.spatial.distance.pdist(np.array([[lon, lat],
         pairs[pairs.City.str.contains(city)][['Urban Lon', 'Urban Lat']].values[0] ]))
    rural_distance = scipy.spatial.distance.pdist(np.array([[lon, lat],
         pairs[pairs.City.str.contains(city)][['Rural Lon', 'Rural Lat']].values[0] ]))
    
    u_distances[i] = urban_distance
    r_distances[i] = rural_distance

In [188]:
pairs['Urban distance'] = u_distances
pairs['Rural distance'] = r_distances

In [193]:
pairs[(~np.isnan(pairs['Urban Lat']))].to_csv(outFile)

In [192]:
pairs[pairs['Urban distance'] < .1]

Unnamed: 0,City,Urban station,Urban Lat,Urban Lon,Urban brightness,Urban start,Urban end,Rural station,Rural Lat,Rural Lon,Rural brightness,Rural start,Rural end,Urban distance,Rural distance
0,Mesa,USC00022782,33.419,-111.644,63.0,2002-08-01,2016-10-31,USC00020288,33.462,-111.481,13.0,1987-05-01,2016-10-31,0.092215,0.257912
1,Grand Prairie,USC00410337,32.757,-97.073,63.0,1893-01-01 00:00:00,2016-09-30,USR0000TLBJ,33.172,-97.375,8.0,2000-01-01,2016-09-30,0.089455,0.602709
2,Irvine,USC00047888,33.744,-117.866,63.0,1916-05-01,2016-11-30,USR0000CFRE,33.808,-117.711,18.0,1991-07-01,2016-11-30,0.073097,0.174437
4,Fort Lauderdale,USC00083168,26.14,-80.106,63.0,1952-04-01,2016-09-30,USR0000FCAC,25.39,-80.68,4.0,1999-12-01,2016-11-30,0.036001,0.919939
5,Long Beach,USW00023129,33.811,-118.146,63.0,1949-01-01,2016-12-31,USR0000CCHI,34.331,-118.03,6.0,1986-02-01,2016-11-30,0.026882,0.558898
6,Hartford,USW00014752,41.736,-72.65,62.0,1920-01-01,2016-12-31,USC00060227,41.841,-73.008,15.0,1990-08-01,2016-10-31,0.045336,0.335624
10,Virginia Beach,USW00013769,36.816,-76.033,63.0,1945-03-01,2016-12-31,USC00440385,36.661,-75.911,5.0,1953-12-01,2016-12-31,0.066143,0.203662
11,Oakland,USC00046336,37.798,-122.264,63.0,1970-10-01,2016-12-31,USR0000CTRA,37.833,-122.066,13.0,1994-04-01,2016-09-30,0.051824,0.167819
12,West Palm Beach,USC00086764,26.826,-80.148,63.0,2002-11-01,2016-10-31,USC00088368,26.47,-80.64,4.0,2002-11-01,2016-09-30,0.08457,0.58505
13,Louisville,USW00013810,38.228,-85.663,63.0,2000-09-01,2016-10-31,USC00150955,37.956,-86.116,12.0,1999-10-01,2016-10-31,0.085756,0.455276


In [121]:
atlas[atlas['city'].str.contains('Provid')]

Unnamed: 0,city,city_ascii,lat,lng,pop,country,iso2,iso3,province
6871,Providence,Providence,41.821102,-71.41498,663726.5,United States of America,US,USA,Rhode Island


In [150]:
pairs[pairs.City.str.contains('Provid')][['Urban Lon', 'Urban Lat']].values

array([[-71.49 ,  41.984]])

In [131]:
atlas[atlas['city'].str.contains('Provid')]['lng'].values

array([-71.4149797])

In [132]:
# check providence
lat = atlas[atlas['city'].str.contains('Provid')]['lat'].values[0]
lon = atlas[atlas['city'].str.contains('Provid')]['lng'].values[0]
#     lat = atlas.iloc[i]['lat']
#     lon = atlas.iloc[i]['lng']
#     city = atlas.iloc[i]['city_ascii']
city = 'Providence'
closeststations = tree.query([lon,lat], k =15, distance_upper_bound=1)
# Make sure the closest stations are within a finite distanc
closestInds = closeststations[1][~np.isinf(closeststations[0])]
# make sure that there are stations 
if closestInds.shape[0] > 0 : 
    urban = ghcn.Brightness[closestInds].argmax()
    rural = ghcn.Brightness[closestInds].argmin()
#     #         if urban != rural: 
#     urban_data = ulmo.ncdc.ghcn_daily.get_data(urban, as_dataframe=True, update=False)
#     rural_data = ulmo.ncdc.ghcn_daily.get_data(rural, as_dataframe=True, update=False)

#     number_urban_stations = (ghcn.Brightness[closestInds] >= brightness_threshold).sum() # number of urban stations is equivalent to brightness over 30
#     number_rural_stations = (ghcn.Brightness[closestInds] < brightness_threshold).sum() # number of rural stations is equivalent to brightness over 30


In [133]:
urban

'USC00198367'

In [141]:
closeststations

(array([ 0.17934242,  0.20883027,  0.28371209,  0.30198826,  0.31195871,
         0.31215564,  0.31241783,  0.3199293 ,  0.32761113,  0.35333929,
         0.35499005,  0.35512523,  0.35876265,  0.36551376,  0.36781983]),
 array([16818, 16813, 16817, 13724, 13700, 13738, 13743, 16816, 16815,
        22560, 16814, 13720, 13737, 22561, 16812]))

In [140]:
ghcn.iloc[closestInds]

Unnamed: 0_level_0,lat,lon,elevation,state,name,gsn flag,HCN/CRN FLAG,WMO ID,Brightness
GHCN ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
USC00379423,41.984,-71.49,33.0,RI,WOONSOCKET,,,,60.0
USC00371875,41.668,-71.557,85.0,RI,COVENTRY 2,,,,60.0
USC00377581,41.626,-71.209,27.0,RI,TIVERTON,,,,31.0
USC00195984,41.992,-71.166,29.0,MA,NORTON WEST,,,,48.0
USC00192913,42.078,-71.238,82.0,MA,FOXBORO,,,,59.0
USC00198368,41.955,-71.133,30.0,MA,TAUNTON 4 NW,,,,60.0
USC00199316,42.133,-71.433,64.0,MA,WEST MEDWAY,,HC,,53.0
USC00375270,41.856,-71.733,192.0,RI,NORTH FOSTER 1 E,,,,12.0
USC00375215,41.5,-71.35,4.0,RI,NEWPORT ROSE,,,,43.0
USW00054796,41.491,-71.541,35.0,RI,KINGSTON 1 NW,,CR,72511.0,52.0


In [139]:
closestInds

array([16818, 16813, 16817, 13724, 13700, 13738, 13743, 16816, 16815,
       22560, 16814, 13720, 13737, 22561, 16812])

In [136]:
ghcn.Brightness[closestInds]

GHCN ID
USC00379423    60.0
USC00371875    60.0
USC00377581    31.0
USC00195984    48.0
USC00192913    59.0
USC00198368    60.0
USC00199316    53.0
USC00375270    12.0
USC00375215    43.0
USW00054796    52.0
USC00374266    52.0
USC00194760    57.0
USC00198367    61.0
USW00054797    43.0
USC00370218    15.0
Name: Brightness, dtype: float64

In [34]:
drop_cities = ['Mesa','Long Beach', 'Grand Prairie', 'Virginia Beach', 'St. Paul' ]

In [38]:
pairs[~np.isnan(pairs['Urban Lat'])].set_index(['City']).drop(drop_cities).to_csv(outFile)

In [40]:
outFile

'USpairs_stationlengths_stationavailability75_noairport.csv'