In [20]:
import random
import csv
from math import radians, cos, sin, asin, sqrt

import networkx as nx
import pickle as pkl
import pandas as pd

from modified_betweeness_algo import betweenness_centrality
from vars import DATASET

# use modified betweeness centrality that allows you to pass p, and defaults to 1
nx.betweenness_centrality = betweenness_centrality

In [21]:
DATASET

'boston-metro'

In [22]:
g = nx.read_gpickle("./data/02_" + DATASET + ".gpickle")

In [23]:
num_nodes = len(g.nodes())
num_nodes_missing_zipcodes = len([x for x in g.nodes() if not g.nodes()[x].get('zipcode')])

print('number of nodes: ' + str(num_nodes))
print('number of nodes missing zipcodes: ' + str(num_nodes_missing_zipcodes))
print('percent missing zips: ' + str((num_nodes_missing_zipcodes / num_nodes)) )

number of nodes: 918800
number of nodes missing zipcodes: 6639
percent missing zips: 0.0072257292120156726


# find start-nodes for each zipcode

In [5]:
zipcodes = set()
for node in g.nodes():
    zipcodes.add(g.nodes()[node].get('zipcode'))

zipcodes.remove(None)

In [7]:
central_points_in_zipcodes = {}
for zipcode in zipcodes:
    # get list of nodes
    
    central_points_in_zipcodes[zipcode] = {}
    central_points_in_zipcodes[zipcode]['nodes'] = []
    central_points_in_zipcodes[zipcode]['central'] = {}
    
    nodelist = [x for x in g.nodes() if g.nodes()[x].get('zipcode') == zipcode]
    central_points_in_zipcodes[zipcode] = nodelist
    
    # compute centrality of those nodes
    # spit out most central percent
    

In [24]:
%%time
# find the most central node of each zipcode
temp_subgraphs = []

for zipcode in list(central_points_in_zipcodes.keys()):
    h = g.subgraph(central_points_in_zipcodes[zipcode])

#     pr = nx.pagerank(h, weight='havlen')
#     g.nodes()[pd.Series(pr).idxmax()]['pr'] = True

    voterank = nx.algorithms.centrality.voterank(h, number_of_nodes=1)[0]
    g.nodes()[voterank]['voterank'] = True
    
    v = len(h.nodes())
    e = len(h.edges())
    complexity = len(h.nodes()) * len(h.edges())
    # don't go over complexity 4,500,000,000 for betweenness.
    # only sample 250 nodes per zipcode, max.
    # It's just too expensive. In this case, substitute betweenness
#     if complexity < 4500000000:
    sample_percent = 200/v
    print(str(sample_percent) + " in " + str(zipcode))
    betweenness = nx.betweenness_centrality(h, p=sample_percent, weight='havlen')
    g.nodes()[pd.Series(betweenness).idxmax()]['betweenness'] = True
#     else:
#         print("zipcode skipped: " + zipcode)
#         g.nodes()[voterank]['betweenness']
    
#     temp_subgraphs.append(h)

0.0364232380258605 in 02122
0.010412328196584756 in 02155
0.05282620179609086 in 02144
0.040666937779585195 in 01887
0.014467592592592593 in 01730
0.02737476047084588 in 02150
0.027758501040943788 in 02458
0.03588087549336204 in 02188
0.020167389331451045 in 02474
0.07993605115907274 in 01908
0.04987531172069826 in 02476
0.014349261013057828 in 02492
0.007909515146721505 in 02184
0.04159733777038269 in 02114
0.04396570674873598 in 02126
0.03709886848451122 in 02021
0.034205575508807935 in 02116
1.1627906976744187 in 02163
0.09086778736937756 in 02045
0.09970089730807577 in 01731
0.019849146486701073 in 02420
0.0042324459305032374 in 01801
0.01723246596587972 in 01890
0.14641288433382138 in 02120
0.027750797835437768 in 02136
0.04289084280506112 in 02119
0.04956629491945477 in 02210
0.05711022272986865 in 02462
0.138217000691085 in 02110
0.04423800044238001 in 02141
0.16181229773462782 in 01945
0.013970382788488405 in 01867
0.03460207612456748 in 02118
0.013592496941688188 in 02043
0.02

In [26]:
# map them
import folium as f
import random

m = f.Map(location = [42.4028327,-71.1204947], zoom_start=10)

for graph in temp_subgraphs:
    nodes_data_subset = graph.nodes().data()

    for node_data in nodes_data_subset:

        node_id = node_data[1]["id"]
        lon,lat = node_data[1]['lon'], node_data[1]['lat']  

#         if random.random() < 0.05:
#             m.add_child(f.CircleMarker(location=[lat,lon], color='gray', opacity=0.5, radius=1)) 

        if node_data[1].get('betweenness'):  
            m.add_child(f.CircleMarker(location=[lat,lon], color='blue', radius=10, fill=True, tooltip=node_id))
#         elif node_data[1].get('pr'):
#             m.add_child(f.CircleMarker(location=[lat,lon], color='green', radius=10, fill=True, tooltip=node_id))
        elif node_data[1].get('voterank'):
            m.add_child(f.CircleMarker(location=[lat,lon], color='red', radius=10, fill=True, tooltip=node_id))

m        

In [28]:
nodes_data_subset = g.nodes().data()
for node_data in nodes_data_subset:

    node_id = node_data[1]["id"]
    lon,lat = node_data[1]['lon'], node_data[1]['lat']  

#         if random.random() < 0.05:
#             m.add_child(f.CircleMarker(location=[lat,lon], color='gray', opacity=0.5, radius=1)) 

    if node_data[1].get('betweenness'):  
        m.add_child(f.CircleMarker(location=[lat,lon], color='blue', radius=10, fill=True, tooltip=node_id))
#         elif node_data[1].get('pr'):
#             m.add_child(f.CircleMarker(location=[lat,lon], color='green', radius=10, fill=True, tooltip=node_id))
    elif node_data[1].get('voterank'):
        m.add_child(f.CircleMarker(location=[lat,lon], color='red', radius=10, fill=True, tooltip=node_id))
m

In [25]:
nx.write_gpickle(g, "./data/03_" + DATASET + ".gpickle")

# Dijkstra from Central Nodes

In [21]:
# give every node an empty zipcode dictionary
for node in g.nodes():
    g.nodes()[node]['distance-from-zipcode'] = {}

In [24]:
# in testing, cutoff of 1000 does not affect the boston graph. The max distance represented is 23 km.
# a reasoanble cutoff for a larger graph might be 5000, representing 100 km.
def record_lenghts_from_source(zipcode, node_id, weight='havlen', cutoff=5000):
    """
    given a zipcode and a node_id, compute the distance from the node_id to
    all other nodes on the graph. Then, record this information in nodes,
    keyed to the zipcode.
    
    Returns lengths and paths, which may be convenient for analysis reasons.
    """
    
    lengths, paths = nx.single_source_dijkstra(g, node_id, weight=weight, cutoff=cutoff)
    
    for k, v in lenghts.items():
        g.nodes()[k]['distance-from-zipcode'][zipcode] = {'pop_percent': 0,
                                                          'distance': v}

    return lengths, paths

# Networkx Algos

In [75]:
# %%time
# betweenness - O^2
# betweenness_subset = (nx.algorithms.centrality.betweenness_centrality(g))
# central_nodes = [x[0] for x in list(betweenness_subset.items()) if x[1] > 0.2]
# for node in central_nodes:
#     g.nodes[node]['betweenness'] = True



In [76]:
# %%time 
# K-components: identifies likely subgraphs and subgraphs of subgraphs
# not useful for this project, sadly. It mostly identifies playgrounds
# and stadiums, since these are easy subsets
# from networkx.algorithms import approximation as apxa
# h = g.to_undirected() 

# foo = apxa.k_components(h, min_density=0.95)

# for node in foo[1][0]:
#     g.nodes[node]['k_1'] = True
# for node in foo[2][0]:
#     g.nodes[node]['k_2'] = True
# for node in foo[2][1]:
#     g.nodes[node]['k_2'] = True
# for node in foo[2][2]:
#     g.nodes[node]['k_2'] = True
# for node in foo[3][0]:
#     g.nodes[node]['k_2'] = True

In [77]:
# runs forever. no result
# from networkx.algorithms import approximation as apxa
# apxa.maximum_independent_set(g)

In [78]:
# %%time
# # voterank - not useful. mostly just shows parks
# central_nodes = nx.algorithms.centrality.voterank(g, number_of_nodes=10)
# for node in central_nodes:
#     g.nodes[node]['voterank'] = True


# # betweenness - O^2
# # betweenness_subset = (nx.algorithms.centrality.betweenness_centrality(g))
# # central_nodes = [x[0] for x in list(betweenness_subset.items()) if x[1] > 0.2]
# # for node in central_nodes:
# #     g.nodes[node]['betweenness'] = True


In [79]:
# %%time
# # closeness - again, not very useful.
# closeness_subset = (nx.algorithms.centrality.closeness_centrality(g, distance='havlen'))

# for node in central_nodes:
#     g.nodes[node]['closeness'] = True

In [25]:
pr = nx.pagerank(g)

# Write results to disk

In [240]:
nx.write_gpickle(g, "./data/03_" + DATASET + ".gpickle")