In [91]:
from uszipcode import SearchEngine
import networkx as nx
import pickle
import itertools
import random
from area import area
import math
import folium as f
from scipy.spatial import ConvexHull


search = SearchEngine(simple_zipcode=False, db_file_dir="./zip_code_cache")

In [15]:
g = nx.read_gpickle("01_cambridge.gpickle")
nodes_data_subset = g.nodes().data()

## Populate dictionary of zip codes and node_ids.

### `zip_code_dict` and `nodes_by_zip` are inverses of each other

In [18]:
def get_zip_code_dict(g):

    count = 0
    with open('zip_code_dict.pickle', 'rb') as handle:
        zip_code_dict = pickle.load(handle)

    for node_data in g.nodes().data():
        count += 1
        node_id = node_data[1]["id"]
        lon,lat = node_data[1]['lon'], node_data[1]['lat']
        if node_id not in zip_code_dict:
            print("missing zip codes being added! This might take a while...")
            zip_code_dict[node_id] = search.by_coordinates(lat, lon, returns=1)[0].zipcode
        if count % 100 == 0: 
            with open('zip_code_dict.pickle', 'wb') as handle:
                pickle.dump(zip_code_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)        

    with open('zip_code_dict.pickle', 'wb') as handle:
        pickle.dump(zip_code_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return zip_code_dict

# Generate reverse dictionary——nodes by zip instaed of zips by node
def get_nodes_by_zip(zip_code_dict):
    nodes_by_zip = {}
    for node_id, zip_code in zip_code_dict.items():
        nodes_by_zip.setdefault(zip_code, []).append(node_id)
    return nodes_by_zip

def area_km2_from_map(g, zip_code):
    lat_lons = [[x[1]["lat"], x[1]["lon"]] for x in g.nodes().data() if x[0] in nodes_by_zip[zip_code]]
    if len(lat_lons) < 3:
        return .000000001
    hull = ConvexHull(lat_lons)
    hull_edges = [[hull.points[ix][1], hull.points[ix][0]] for ix in hull.vertices]
    polygon = {'type': 'Polygon', 'coordinates': [hull_edges]}
    return (area(polygon) / 1000000)

# def weights_by_zip(g, zip_code_dict, nodes_by_zip):
#     for node_id in g.nodes().data():
        
    


zip_code_dict = get_zip_code_dict(g)
nodes_by_zip = get_nodes_by_zip(zip_code_dict)



In [103]:
def semi_random_nodes(g, percentage):
    k = percentage * len(g)
    zips_in_g = {zip_code_dict[node_id] for node_id in g}
    
    weight_by_zip = {}
    for zip_code in zips_in_g:
        weight_by_zip[zip_code] = area_km2_from_map(g, zip_code) * search.by_zipcode(zip_code).population_density
    
    sum_of_weights = sum([v for k,v in weight_by_zip.items()])    
    
    num_by_zip = {}
    for zip_code in zips_in_g:
        num_by_zip[zip_code] = math.ceil(weight_by_zip[zip_code]*k/sum_of_weights)
        
    set_of_nodes_g = set({node for node in g})
    return_nodes = {}
    for zip_code,num in num_by_zip.items():
        nodes_in_zip_code = list(set(nodes_by_zip[zip_code]) & set_of_nodes_g)
        if len(nodes_in_zip_code) > num:
            return_nodes[zip_code] = random.sample(nodes_in_zip_code, num)
        else:    
            return_nodes[zip_code] = nodes_in_zip_code
    

          
    return list(itertools.chain.from_iterable([nodes for zip_code,nodes in return_nodes.items()]))
aa = semi_random_nodes(g, 1)
print(len(aa)/ len(g))
## ISSUE HERE: why does it go down down down when the k value goes up?

0.8257428492085532


{'4406735122', '61283345', '7631495276', '2446018258', '61283333', '314502385', '3101702904', '4978289977', '7628615126', '7632047877', '61178893', '61323500', '4330089155', '688404530', '1548276713', '7629147340', '7628615207', '6278307929', '4330089149', '7629493777', '1881894769', '7629147362', '4329091770', '430603928', '2764945153', '64065426', '7746087857', '391540194', '4333397013', '82766883', '7629494062', '7669948666', '7629147757', '1331898210', '7629493729', '64064333', '7628615132', '7628615245', '7626171439', '7629494030', '276247444', '5458830604', '7629493599', '7628787172', '1524379438', '7629493761', '61363608', '7631668579', '71924880', '6868617302', '1881894535', '3748512197', '7058950726', '6540251565', '7658055860', '7818244749', '3622695549', '7598947552', '7631024525', '7629493755', '5458834092', '7729197243', '7628223037', '6124568187', '71922340', '1053541762', '5456384049', '61321196', '1132012105', '5458849677', '7703039100', '7629494114', '71917709', '61323

## Use `area` library to find Polygon's geographic area

### First, simple PoC with Wyoming

In [29]:
wyoming = {'type':'Polygon','coordinates':[[[-111.046768, 40.997963], 
                                            [-111.055196, 45.001320], 
                                            [-104.057691, 44.997377],
                                            [-104.053251, 41.001410],
                                            [-111.046768, 40.997963]]]}

area_km2 = area(wyoming)

area_km2 = area_km2 / 1e+6
# print ('area m2: ' + str(math.floor(area_m2)))
print ('Estimated area of Wyoming: ' + str(math.floor(area_km2)) + 'km^2')
print("Error:                     " + str(253600 - math.floor(area_km2)) + "km^2" )

Estimated area of Wyoming: 253529km^2
Error:                     71km^2


### Now running on entire Cambridge dataset. (Note—outliers skew this)

In [30]:
from scipy.spatial import ConvexHull

lat_lons = [[x[1]["lat"], x[1]["lon"]] for x in g.nodes().data()]
hull = ConvexHull(lat_lons)
hull_edges = [[hull.points[ix][1], hull.points[ix][0]] for ix in hull.vertices]

camb = {'type': 'Polygon', 'coordinates': [hull_edges]}

print("Area of convex map: " + (str (area(camb) / 1000000)) + "km^2")



Area of convex map: 21.747622196450717km^2


In [24]:
ZIP_CODE = '02139'

lat_lons = [[x[1]["lat"], x[1]["lon"]] for x in g.nodes().data() if x[0] in nodes_by_zip[ZIP_CODE]]
hull = ConvexHull(lat_lons)
hull_edges = [[hull.points[ix][1], hull.points[ix][0]] for ix in hull.vertices]
polygon = {'type': 'Polygon', 'coordinates': [hull_edges]}

print("Area of convex map:        " + (str (area(polygon) / 1000000)) + "km^2")
print("Actual land area of " + ZIP_CODE + ": " + str(search.by_zipcode(ZIP_CODE).land_area_in_sqmi * 2.58999) + "km^2")


def weights_by_zip(g, zip_code):
    


    
    


Area of convex map:        3.1954485686585246km^2
Actual land area of 02139: 4.0403844km^2


3.1954485686585246

In [34]:
m = f.Map(location = [42.3611108,-71.1079923], zoom_start=16)

# Show graph with hull edges
for node_data in g.nodes().data():
    node_id = node_data[1]["id"]
    if not node_data[1].get('lon'):
        import pdb; pdb.set_trace()
    lon,lat = node_data[1]['lon'], node_data[1]['lat']  
    if [lon, lat] in hull_edges:
        m.add_child(f.Marker(location=[lat,lon], color="red", radius=1))
    
    m.add_child(f.CircleMarker(location=[lat,lon], color="orange", radius=.5))

m

## Area-producing cruft

In [19]:
import pyproj    
import shapely
import shapely.ops as ops
from shapely.geometry.polygon import Polygon
from functools import partial


# geom = Polygon([[40.997963, -111.046768], 
#                 [45.001320, -111.055196], 
#                 [44.997377, -104.057691],
#                 [41.001410, -104.053251],
#                 [40.997963, -111.046768]])


geom = Polygon([[-111.046768, 40.997963], 
                [-111.055196, 45.001320], 
                [-104.057691, 44.997377],
                [-104.053251, 41.001410],
                [-111.046768, 40.997963]])
# geom = Polygon([[40,-111],
#                 [2,3],
#                 [3,5],
#                 [40,-111]])
geom_area = ops.transform(
    partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        pyproj.Proj(
            proj='aea',
            lat_1=geom.bounds[1],
            lat_2=geom.bounds[3])),
    geom)

# Print the area in m^2
print (math.floor(geom_area.area / 1000000) )

253115


  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))


In [8]:
search.by_zipcode(ZIP_CODE).population_density

23355.0

In [9]:
ZIP_CODE

'02139'