In [9]:
import random
import csv
from math import radians, cos, sin, asin, sqrt

import networkx as nx
import pickle as pkl
import pandas as pd


from vars import DATASET

In [7]:
# Show which dataset this run is configured for (imported from vars).
DATASET

'cambridge-small'

In [8]:
# Load the stage-01 graph for the configured dataset.
# nx.read_gpickle was removed in NetworkX 3.0; a .gpickle file is a plain
# pickle, so the standard-library loader works on every NetworkX version.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this pipeline.
with open("./data/01_" + DATASET + ".gpickle", "rb") as graph_file:
    g = pkl.load(graph_file)

# tag nodes with zipcodes, population, and total graph pop

In [99]:
# zipcode -> {'population': int, 'ids': [node ids]}; filled in by the cells below
zipcode_dict = {}

In [100]:
# Build the zipcode -> node-id lookup from the Boston-metro CSV.
# Each data row starts with a node id followed by the zipcode assigned to it.
with open('./data/zipcode/zipcodes_boston-metro.csv', newline='') as zip_file:
    zip_rows = csv.reader(zip_file, delimiter=',')
    next(zip_rows)  # skip header
    for row in zip_rows:
        node_id, zipcode = row[0], row[1]

        # First sighting of a zipcode creates its entry with a placeholder
        # population of 0 (the real figure is filled in by a later cell).
        # Appending in place avoids rebuilding the id list on every row.
        entry = zipcode_dict.setdefault(zipcode, {'population': 0, 'ids': []})
        entry['ids'].append(node_id)

In [101]:
# Attach population figures to the zipcodes collected so far.
# Source: https://worldpopulationreview.com/zips/massachusetts
with open('./data/zipcode/population_by_zip_2020.csv', newline='') as pop_file:
    pop_rows = csv.reader(pop_file, delimiter=',')
    next(pop_rows)  # the header row carries no data
    for fields in pop_rows:
        population = int(fields[3])
        zipcode = fields[0].zfill(5)  # left-pad with zeros to a 5 digit zipcode

        # zipcodes that are absent from the lookup are ignored
        known_entry = zipcode_dict.get(zipcode)
        if known_entry:
            known_entry['population'] = population


In [102]:
# Tag graph nodes with their zipcode and that zipcode's population.
# (The total population visible in the graph is computed in a later cell.)
for zipcode, inner_dict in zipcode_dict.items():
    population = inner_dict['population']
    nodes = inner_dict['ids']

    for node in nodes:

        # The zipcode dictionary may list nodes that are not contained in
        # this particular dataset. Test membership directly: g.nodes.get(node)
        # returns the node's attribute dict, which is falsy when empty, so a
        # truthiness check would also skip real nodes that carry no attributes.
        if node in g:
            node_attrs = g.nodes[node]
            node_attrs['zipcode'] = zipcode
            node_attrs['population'] = population

            # TODO: total_population is not known yet at this point
#             g.nodes()[node]['total_graph_area_pop'] = total_population

In [103]:
# scratch: small sets for trying out set operations
a = {'a', 'b', 'c'}
b = {'d'}
c = {'c'}

In [110]:
# scratch: how many elements do the two sets share?
len(c & a)

1

In [None]:
# scratch: `c in a` raises TypeError because a set is unhashable and cannot be
# looked up as a member of another set; the set-vs-set check is a subset test.
c.issubset(a)

{'2681390269',
 '2681390273',
 '2681390275',
 '2681390277',
 '2681390278',
 '2681390280',
 '2681408394',
 '2681408404',
 '2681408410',
 '2681408423',
 '2681408448',
 '2681408456',
 '2681408484',
 '2682537240',
 '2682537242',
 '2682537245',
 '276249698',
 '347921313',
 '347921314',
 '347921317',
 '347921321',
 '347921324',
 '347921325',
 '347921329',
 '347921332',
 '347921341',
 '3519311846',
 '3519311847',
 '3519311848',
 '3519311849',
 '3519311850',
 '3519311854',
 '3687642803',
 '4185277099',
 '4965812669',
 '4965812670',
 '4978261262',
 '4978261263',
 '4978261264',
 '4978261265',
 '4978261266',
 '4978261267',
 '540001526',
 '540001527',
 '541336191',
 '541336192',
 '5458232833',
 '5458232834',
 '5458232835',
 '5458232836',
 '5458365624',
 '566889783',
 '566889864',
 '567769806',
 '567769808',
 '567777293',
 '567777297',
 '568128732',
 '5851090174',
 '6114648250',
 '6114648251',
 '6114648252',
 '6114648253',
 '61283119',
 '61283126',
 '61283287',
 '61283300',
 '61283345',
 '61317333'

In [111]:
# scratch: .get() on a missing key hands back the supplied default
total_pop_dict = dict()
total_pop_dict.get('a', True)

True

In [117]:
# Compute the total population visible in the graph: the summed population
# of every zipcode that has at least one node in this dataset.
all_nodes_set = set(g.nodes())

# Dictionary of zipcodes in use and their populations. zipcode_dict keys are
# unique, so each zipcode is visited exactly once and needs no
# "already counted" guard.
total_pop_dict = {
    zipcode: inner_dict['population']
    for zipcode, inner_dict in zipcode_dict.items()
    if not all_nodes_set.isdisjoint(inner_dict['ids'])
}

# sum the values of the dictionary to get the total map population
total_population = sum(total_pop_dict.values())

# TODO: add the total population to graph nodes

# display the result as the cell output
total_population


96531

In [115]:
# Inspect which zipcodes contributed to the total, and their populations.
total_pop_dict

{'02139': 38272, '02138': 38835, '02134': 19424}

# Dijkstra from Central Nodes

In [21]:
# give every node an empty zipcode dictionary
for node in g.nodes():
    g.nodes()[node]['zipcodes'] = {}

In [24]:
# In testing, a cutoff of 1000 does not affect the Boston graph; the max
# distance represented is 23 km. A reasonable cutoff for a larger graph might
# be 5000, representing 100 km.
def record_lenghts_from_source(zipcode, node_id, weight='havlen', cutoff=5000):
    """
    Record the shortest-path distance from one source node on every node it reaches.

    Runs Dijkstra from ``node_id`` over the notebook-level graph ``g`` and, for
    each node reached, stores ``{'pop_percent': 0, 'distance': <length>}`` under
    ``g.nodes[n]['zipcodes'][zipcode]``.

    Parameters
    ----------
    zipcode
        Key under which the result is stored in each node's 'zipcodes' dict.
    node_id
        Source node for the search.
    weight
        Edge attribute passed to Dijkstra as the edge weight.
    cutoff
        Passed to Dijkstra as the search cutoff; nodes beyond it are not
        reached and therefore not recorded.

    Returns
    -------
    (lengths, paths)
        The two dictionaries returned by ``nx.single_source_dijkstra``, which
        may be convenient for analysis.

    Raises
    ------
    KeyError
        If a reached node has no 'zipcodes' entry yet; every node must be
        given one before this function is called.
    """

    lengths, paths = nx.single_source_dijkstra(g, node_id, weight=weight, cutoff=cutoff)

    # iterate the dict returned above (the original misspelled it as
    # `lenghts`, which raised NameError on the first call)
    for reached_node, distance in lengths.items():
        g.nodes[reached_node]['zipcodes'][zipcode] = {'pop_percent': 0,
                                                      'distance': distance}

    return lengths, paths

# Networkx Algos

In [75]:
# %%time
# betweenness - O^2
# betweenness_subset = (nx.algorithms.centrality.betweenness_centrality(g))
# central_nodes = [x[0] for x in list(betweenness_subset.items()) if x[1] > 0.2]
# for node in central_nodes:
#     g.nodes[node]['betweenness'] = True



In [76]:
# %%time 
# K-components: identifies likely subgraphs and subgraphs of subgraphs
# not useful for this project, sadly. It mostly identifies playgrounds
# and stadiums, since these are easy subsets
# from networkx.algorithms import approximation as apxa
# h = g.to_undirected() 

# foo = apxa.k_components(h, min_density=0.95)

# for node in foo[1][0]:
#     g.nodes[node]['k_1'] = True
# for node in foo[2][0]:
#     g.nodes[node]['k_2'] = True
# for node in foo[2][1]:
#     g.nodes[node]['k_2'] = True
# for node in foo[2][2]:
#     g.nodes[node]['k_2'] = True
# for node in foo[3][0]:
#     g.nodes[node]['k_2'] = True

In [77]:
# runs forever. no result
# from networkx.algorithms import approximation as apxa
# apxa.maximum_independent_set(g)

In [78]:
# %%time
# # voterank - not useful. mostly just shows parks
# central_nodes = nx.algorithms.centrality.voterank(g, number_of_nodes=10)
# for node in central_nodes:
#     g.nodes[node]['voterank'] = True


# # betweenness - O^2
# # betweenness_subset = (nx.algorithms.centrality.betweenness_centrality(g))
# # central_nodes = [x[0] for x in list(betweenness_subset.items()) if x[1] > 0.2]
# # for node in central_nodes:
# #     g.nodes[node]['betweenness'] = True


In [79]:
# %%time
# # closeness - again, not very useful.
# closeness_subset = (nx.algorithms.centrality.closeness_centrality(g, distance='havlen'))

# for node in central_nodes:
#     g.nodes[node]['closeness'] = True

In [25]:
# Run networkx PageRank over the graph with default parameters.
# NOTE(review): `pr` is not referenced again in the visible cells and is not
# written onto the nodes before the graph is saved -- confirm it is needed.
pr = nx.pagerank(g)

# Write results to disk

In [88]:
# Persist the tagged graph as the stage-02 artifact.
# nx.write_gpickle was removed in NetworkX 3.0; pickling directly produces the
# same .gpickle file on every NetworkX version.
with open("./data/02_" + DATASET + ".gpickle", "wb") as graph_file:
    pkl.dump(g, graph_file, pkl.HIGHEST_PROTOCOL)