In [155]:
import random
import csv
from math import radians, cos, sin, asin, sqrt

import networkx as nx
import pickle as pkl
import pandas as pd


from vars import DATASET

In [156]:
DATASET

'boston-metro'

# tag nodes with zipcodes, population, and total graph pop

## Known supplied zipcodes

In [157]:
zipcode_dict = {}

In [158]:
# populate zipcode_dict with nodes for each zipcode
with open('./data/zipcode/zipcodes_boston-metro.csv', newline='') as f:
    csvreader = csv.reader(f, delimiter=',')
    next(csvreader) # skip header
    for row in csvreader:
        node_id, zipcode = row[0], row[1]
        
        if zipcode_dict.get(zipcode) is not None:
            zipcode_dict[zipcode]['ids'] = zipcode_dict[zipcode]['ids'] + [node_id]
        else:
            zipcode_dict[zipcode] = {}
            zipcode_dict[zipcode]['population'] = 0
            zipcode_dict[zipcode]['ids'] = [node_id]

In [159]:
# populate zipcode_dict with population for each zipcode
# from https://worldpopulationreview.com/zips/massachusetts
with open('./data/zipcode/population_by_zip_2020.csv', newline='') as f:
    csvreader = csv.reader(f, delimiter=',')
    next(csvreader) # skip header
    for row in csvreader:
        population, zipcode = int(row[3]), row[0].zfill(5) # 5 digit zipcode
        
        if zipcode_dict.get(zipcode):
            zipcode_dict[zipcode]['population'] = population


In [160]:
# compute total population visible in graph
total_pop_dict = {}
all_nodes_set = set(g.nodes())

# populate a dictionary of zipcodes in use, and their populations
for zipcode, inner_dict in zipcode_dict.items():
    population = inner_dict['population']
    nodes = set(inner_dict['ids'])
    
    # if the zipcode hasn't gotten its population yet, and the nodes intersect with the graph
    if total_pop_dict.get(zipcode, True) and len(nodes.intersection(all_nodes_set)) > 0:
        total_pop_dict[zipcode] = population

# sum the values of the dictionary to get the total map population
total_population = sum([v for k, v in total_pop_dict.items()])

In [161]:
# tag nodes with the zipcode, population, and total graph population
for zipcode, inner_dict in zipcode_dict.items():
    population = inner_dict['population']
    nodes = inner_dict['ids']
    
    for node in nodes:
        
        # the zipcode dictionary may have nodes not contained in
        # the particular dataset, so make sure to only try to tag those
        if g.nodes.get(node):
            g.nodes()[node]['zipcode'] = zipcode
            g.nodes()[node]['population'] = population
            g.nodes()[node]['total_graph_area_pop'] = total_population

In [162]:
num_nodes = len(g.nodes())
num_nodes_with_zips = len([x for x in g.nodes() if g.nodes()[x].get('zipcode')])

print('number of nodes in the graph:' + str(num_nodes))
print('number of nodes with zipcodes:' + str(num_nodes_with_zips))
print('ratio-zipcoded: ' + str(num_nodes_with_zips/num_nodes))

number of nodes in the graph:918800
number of nodes with zipcodes:306362
ratio-zipcoded: 0.33343709185894643


## inferring zipcodes

In [185]:
def infer_zipcode_from_neighbors(node_id):
    '''given a node_id, add a zipcode if it can be inferred.
       return whether an inference was made, and the current zipcode'''
    
    # if the node has an assigned zipcode, use that.
    if g.nodes()[node_id].get('zipcode') is not None:
        return (False, g.nodes()[node_id].get('zipcode'))
    
    in_edges = g.in_edges(node_id)
    out_edges = g.out_edges(node_id)
    
    in_neighbors = [x[0] for x in g.in_edges(node_id)]
    out_neighbors = [x[1] for x in g.out_edges(node_id)]
    
    in_distances = [g.edges()[x]['havlen'] for x in in_edges]
    out_distances = [g.edges()[x]['havlen'] for x in out_edges]
    
    in_zipcodes = [g.nodes()[x].get('zipcode', None) for x in in_neighbors]
    out_zipcodes = [g.nodes()[x].get('zipcode', None) for x in out_neighbors]
    
    zip_distances = [x for x in zip(in_zipcodes, in_distances)] + [x for x in zip(out_zipcodes, out_distances)]
    
    # dictionary of zipcodes and their values
    distance_dict = {}
    for combo in zip_distances:
        zipcode = combo[0]
        distance = combo[1]
        existing_distance = distance_dict.get(zipcode, 0)
        
        if zipcode is not None:
            distance_dict[zipcode] = existing_distance + distance
            
    closest_zip = None
    closest_distance = 1000000
    for k, v in distance_dict.items():
        if v < closest_distance:
            closest_distance = v
            closest_zip = k
    
    # categorize if possible
    inference_made = False
    if closest_zip != None:
        g.nodes()[node_id]['zipcode'] = closest_zip
        inference_made = True
    
    return (inference_made, closest_zip)

In [186]:
g.nodes()['30416737']

{'lat': 42.2036233, 'lon': -71.1058422, 'id': '30416737', 'tags': {}}

In [187]:
foo = infer_zipcode_from_neighbors('30416737')
print(foo)

> [0;32m<ipython-input-185-a3aad6f711b3>[0m(42)[0;36minfer_zipcode_from_neighbors[0;34m()[0m
[0;32m     40 [0;31m    [0;31m# categorize if possible[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 42 [0;31m    [0minference_made[0m [0;34m=[0m [0;32mFalse[0m[0;34m[0m[0m
[0m[0;32m     43 [0;31m    [0;32mif[0m [0mclosest_zip[0m [0;34m!=[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m     44 [0;31m        [0mg[0m[0;34m.[0m[0mnodes[0m[0;34m([0m[0;34m)[0m[0;34m[[0m[0mnode_id[0m[0;34m][0m[0;34m[[0m[0;34m'zipcode'[0m[0;34m][0m [0;34m=[0m [0mclosest_zip[0m[0;34m[0m[0m
[0m
ipdb> c
(False, None)


In [175]:
good_node = '542944364'
foo = infer_zipcode_from_neighbors(good_node)
foo

(False, '02021')

In [179]:
def nodes_without_zipcodes():
    uncategorized_nodes = []
    for node in g.nodes():
        if g.nodes()[node].get('zipcode') is None:
            uncategorized_nodes.append(node)
    return uncategorized_nodes

uncategorized_nodes = nodes_without_zipcodes()
# uncategorized_nodes[0:100]

In [184]:
uncategorized_nodes = nodes_without_zipcodes()
num_categorized = 0
nodes_since_last_categorization = 0

# while nodes_since_last_categorization < len(uncategorized_nodes) - num_categorized:

#     print('num uncategorized nodes: ' + len(uncategorized_nodes))
#     print('num_categorized: ' + str(num_categorized))
#     print('nodes_since_last_categorization' + str(nodes_since_last_categorization))
    
for node in uncategorized_nodes:
    inference = infer_zipcode_from_neighbors(node)
    inference_made = inference[0]
    zipcode = inference[1]
    
    if inference_made:
        import pdb; pdb.set_trace()
        num_categorized += 1
        print(node + ' assigned ' + str(zipcode))
        nodes_since_last_categorization = 0
    else:
        nodes_since_last_categorization += 1

> [0;32m<ipython-input-173-a3aad6f711b3>[0m(42)[0;36minfer_zipcode_from_neighbors[0;34m()[0m
[0;32m     40 [0;31m    [0;31m# categorize if possible[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     41 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 42 [0;31m    [0minference_made[0m [0;34m=[0m [0;32mFalse[0m[0;34m[0m[0m
[0m[0;32m     43 [0;31m    [0;32mif[0m [0mclosest_zip[0m [0;34m!=[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m     44 [0;31m        [0mg[0m[0;34m.[0m[0mnodes[0m[0;34m([0m[0;34m)[0m[0;34m[[0m[0mnode_id[0m[0;34m][0m[0;34m[[0m[0;34m'zipcode'[0m[0;34m][0m [0;34m=[0m [0mclosest_zip[0m[0;34m[0m[0m
[0m
ipdb> inference
(True, '')
ipdb> node
'30416737'
ipdb> g.nodes()['30416737']
{'lat': 42.2036233, 'lon': -71.1058422, 'id': '30416737', 'tags': {}}
ipdb> g.nodes()[node_id]
{'lat': 42.2036233, 'lon': -71.1058422, 'i

BdbQuit: 

In [152]:
g.nodes()['61388670']

{'lat': 42.281291,
 'lon': -71.104081,
 'id': '61388670',
 'tags': {'attribution': 'Office of Geographic and Environmental Information (MassGIS)',
  'created_by': 'JOSM',
  'source': 'massgis_import_v0.1_20071008193615'},
 'zipcode': '',
 'population': 0,
 'total_graph_area_pop': 2581490}

In [145]:
leftover = nodes_without_zipcodes()

In [146]:
len(leftover)

308150

In [147]:
len(uncategorized_nodes)

308150

# Dijkstra from Central Nodes

In [21]:
# give every node an empty zipcode dictionary
for node in g.nodes():
    g.nodes()[node]['distance-from-zipcode'] = {}

In [24]:
# in testing, cutoff of 1000 does not affect the boston graph. The max distance represented is 23 km.
# a reasoanble cutoff for a larger graph might be 5000, representing 100 km.
def record_lenghts_from_source(zipcode, node_id, weight='havlen', cutoff=5000):
    """
    given a zipcode and a node_id, compute the distance from the node_id to
    all other nodes on the graph. Then, record this information in nodes,
    keyed to the zipcode.
    
    Returns lengths and paths, which may be convenient for analysis reasons.
    """
    
    lengths, paths = nx.single_source_dijkstra(g, node_id, weight=weight, cutoff=cutoff)
    
    for k, v in lenghts.items():
        g.nodes()[k]['distance-from-zipcode'][zipcode] = {'pop_percent': 0,
                                                          'distance': v}

    return lengths, paths

# Networkx Algos

In [75]:
# %%time
# betweenness - O^2
# betweenness_subset = (nx.algorithms.centrality.betweenness_centrality(g))
# central_nodes = [x[0] for x in list(betweenness_subset.items()) if x[1] > 0.2]
# for node in central_nodes:
#     g.nodes[node]['betweenness'] = True



In [76]:
# %%time 
# K-components: identifies likely subgraphs and subgraphs of subgraphs
# not useful for this project, sadly. It mostly identifies playgrounds
# and stadiums, since these are easy subsets
# from networkx.algorithms import approximation as apxa
# h = g.to_undirected() 

# foo = apxa.k_components(h, min_density=0.95)

# for node in foo[1][0]:
#     g.nodes[node]['k_1'] = True
# for node in foo[2][0]:
#     g.nodes[node]['k_2'] = True
# for node in foo[2][1]:
#     g.nodes[node]['k_2'] = True
# for node in foo[2][2]:
#     g.nodes[node]['k_2'] = True
# for node in foo[3][0]:
#     g.nodes[node]['k_2'] = True

In [77]:
# runs forever. no result
# from networkx.algorithms import approximation as apxa
# apxa.maximum_independent_set(g)

In [78]:
# %%time
# # voterank - not useful. mostly just shows parks
# central_nodes = nx.algorithms.centrality.voterank(g, number_of_nodes=10)
# for node in central_nodes:
#     g.nodes[node]['voterank'] = True


# # betweenness - O^2
# # betweenness_subset = (nx.algorithms.centrality.betweenness_centrality(g))
# # central_nodes = [x[0] for x in list(betweenness_subset.items()) if x[1] > 0.2]
# # for node in central_nodes:
# #     g.nodes[node]['betweenness'] = True


In [79]:
# %%time
# # closeness - again, not very useful.
# closeness_subset = (nx.algorithms.centrality.closeness_centrality(g, distance='havlen'))

# for node in central_nodes:
#     g.nodes[node]['closeness'] = True

In [25]:
pr = nx.pagerank(g)

# Write results to disk

In [28]:
nx.write_gpickle(g, "./data/02_" + DATASET + ".gpickle")