In [84]:
import random
import csv
from math import radians, cos, sin, asin, sqrt

import networkx as nx
import pickle as pkl
import pandas as pd


from vars import DATASET

In [85]:
# Echo the dataset name imported from vars so the target of this run shows in the output.
DATASET

'boston-metro'

In [86]:
# Load the '01_' graph file for the configured dataset.
# nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.load, so reading the file with pickle directly works on old and new versions.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this pipeline.
with open("./data/01_" + DATASET + ".gpickle", "rb") as graph_file:
    g = pkl.load(graph_file)

In [88]:
# Coverage report: total node count and how many nodes carry a truthy 'zipcode' attribute.
num_nodes = g.number_of_nodes()
num_nodes_with_zips = sum(1 for _node, attrs in g.nodes(data=True) if attrs.get('zipcode'))

print('number of nodes in the graph:' + str(num_nodes))
print('number of nodes with zipcodes:' + str(num_nodes_with_zips))
zipcoded_ratio = num_nodes_with_zips / num_nodes
print('ratio-zipcoded: ' + str(zipcoded_ratio))

number of nodes in the graph:918800
number of nodes with zipcodes:0
ratio-zipcoded: 0.0


# Tag nodes with zipcodes, population, and total graph population

## Known supplied zipcodes

In [39]:
# Maps zipcode -> {'population': int, 'ids': [node_id, ...]}; filled in by the cells below.
zipcode_dict = {}

In [89]:
# Populate zipcode_dict with the node ids belonging to each zipcode.
# Resulting shape: {zipcode: {'population': 0, 'ids': [node_id, ...]}}
# ('population' is only a placeholder at this point).
# The dict is re-created here so that re-running this cell does not append
# every node id a second time.
# NOTE(review): the CSV name is hard-coded to boston-metro rather than derived
# from DATASET -- confirm whether other datasets ship an equivalent file.
zipcode_dict = {}
with open('./data/zipcode/zipcodes_boston-metro.csv', newline='') as f:
    csvreader = csv.reader(f, delimiter=',')
    next(csvreader, None)  # skip header (and tolerate a completely empty file)
    for row in csvreader:
        if len(row) < 2:
            continue  # blank or truncated line: nothing to record

        node_id, zipcode = row[0], row[1]

        # Compare by value. The previous `zipcode is not ''` tested object identity,
        # which only works through string interning and emits a SyntaxWarning on 3.8+.
        if zipcode == '':
            continue

        entry = zipcode_dict.setdefault(zipcode, {'population': 0, 'ids': []})
        # Append in place; `ids + [node_id]` copied the whole list for every row.
        entry['ids'].append(node_id)

In [90]:
# Fill in the population of every zipcode already present in zipcode_dict.
# Data source: https://worldpopulationreview.com/zips/massachusetts
with open('./data/zipcode/population_by_zip_2020.csv', newline='') as pop_file:
    pop_rows = csv.reader(pop_file, delimiter=',')
    next(pop_rows)  # skip header
    for record in pop_rows:
        population = int(record[3])
        zipcode = record[0].zfill(5)  # left-pad to a 5 digit zipcode

        # Zipcodes that were never seen in the node CSV are ignored.
        entry = zipcode_dict.get(zipcode)
        if entry:
            entry['population'] = population


In [91]:
# Compute the total population visible in the graph: every zipcode that has at
# least one of its node ids present in the graph contributes its population once.
total_pop_dict = {}
all_nodes_set = set(g.nodes())

for zipcode, inner_dict in zipcode_dict.items():
    # zipcode_dict keys are unique, so each zipcode is visited exactly once. The former
    # `total_pop_dict.get(zipcode, True)` guard was therefore always true and is dropped.
    # isdisjoint stops at the first shared node instead of building a full intersection.
    if not all_nodes_set.isdisjoint(inner_dict['ids']):
        total_pop_dict[zipcode] = inner_dict['population']

# Sum the per-zipcode populations to get the total map population.
total_population = sum(total_pop_dict.values())

In [101]:
# Tag nodes with the zipcode, its population, and the total graph population.
# Nodes that no supplied zipcode claims keep zipcode=None and population=0.

# Start every node from the same defaults (this also resets any earlier tagging).
for _node, attrs in g.nodes(data=True):
    attrs['zipcode'] = None
    attrs['population'] = 0
    attrs['total_graph_area_pop'] = total_population

for zipcode, inner_dict in zipcode_dict.items():
    population = inner_dict['population']

    for node in inner_dict['ids']:
        # The zipcode dictionary may list nodes that are not part of this dataset.
        # Test membership directly instead of relying on the truthiness of the
        # attribute dict returned by g.nodes.get(node).
        if node in g:
            attrs = g.nodes[node]
            attrs['zipcode'] = zipcode
            attrs['population'] = population
            # 'total_graph_area_pop' was already set for every node above.

In [102]:
# Re-run the coverage report now that the supplied zipcodes have been applied.
num_nodes = g.number_of_nodes()
num_nodes_with_zips = sum(1 for _node, attrs in g.nodes(data=True) if attrs.get('zipcode'))

print('number of nodes in the graph:' + str(num_nodes))
print('number of nodes with zipcodes:' + str(num_nodes_with_zips))
zipcoded_ratio = num_nodes_with_zips / num_nodes
print('ratio-zipcoded: ' + str(zipcoded_ratio))

number of nodes in the graph:918800
number of nodes with zipcodes:306361
ratio-zipcoded: 0.33343600348280367


# scratch

In [103]:
# Spot-check the attribute dict of a single node by its id.
g.nodes()['30416737']

{'lat': 42.2036233,
 'lon': -71.1058422,
 'id': '30416737',
 'tags': {},
 'zipcode': None,
 'population': 0,
 'total_graph_area_pop': 2581490}

## inferring zipcodes

In [104]:
def infer_zipcode_from_neighbors(node_id):
    '''Infer a zipcode for ``node_id`` from its directly connected neighbors.

    If the node already has a zipcode nothing is changed. Otherwise the 'havlen'
    values of the edges to and from neighbors that have a zipcode are summed per
    zipcode, and the zipcode with the smallest sum is written onto the node together
    with its population (from ``total_pop_dict``) and ``total_population``.

    Reads the module-level ``g``, ``total_pop_dict`` and ``total_population`` and
    mutates the node attributes stored in ``g``.

    Returns:
        (inference_made, zipcode): ``inference_made`` is True only when this call
        assigned a zipcode; ``zipcode`` is the node's zipcode after the call, or
        None when nothing could be inferred.
    '''
    node_attrs = g.nodes[node_id]

    # If the node has an assigned zipcode, use that.
    current_zip = node_attrs.get('zipcode')
    if current_zip is not None:
        return (False, current_zip)

    # Pair every incident edge with the neighbor at its other end.
    incident = [(u, (u, v)) for u, v in g.in_edges(node_id)]
    incident += [(v, (u, v)) for u, v in g.out_edges(node_id)]

    # Sum edge lengths per neighboring zipcode; neighbors without a zipcode are ignored.
    # NOTE(review): a zipcode reached over several edges accumulates a larger total and
    # is therefore less likely to win -- confirm that summing (not taking the minimum
    # edge) is the intended notion of "closest".
    distance_dict = {}
    for neighbor, edge in incident:
        neighbor_zip = g.nodes[neighbor].get('zipcode')
        if neighbor_zip is None:
            continue
        distance_dict[neighbor_zip] = distance_dict.get(neighbor_zip, 0) + g.edges[edge]['havlen']

    if not distance_dict:
        return (False, None)

    # Smallest total wins; ties keep the first zipcode encountered. min() replaces the
    # old fixed 1000000 starting value, which silently rejected any candidate whose
    # total reached that number.
    closest_zip = min(distance_dict, key=distance_dict.get)

    node_attrs['zipcode'] = closest_zip
    node_attrs['population'] = total_pop_dict[closest_zip]
    node_attrs['total_graph_area_pop'] = total_population

    return (True, closest_zip)

In [105]:
def nodes_without_zipcodes(search_list=None):
    '''Return the graph nodes that have no zipcode.

    With no argument every node of ``g`` is examined, and a node is reported when
    its 'zipcode' attribute is missing or falsy. When ``search_list`` is supplied
    only those nodes are examined, and a node is reported when its 'zipcode'
    attribute is missing or None.'''
    node_view = g.nodes()

    if search_list is not None:
        return [n for n in search_list if node_view[n].get('zipcode') is None]

    return [n for n in node_view if not node_view[n].get('zipcode')]

In [106]:
%%time
# Repeatedly sweep over the nodes that still lack a zipcode and try to infer each
# one from its neighbors. A node tagged in one sweep can unlock its neighbors in
# the next, so keep sweeping until a complete pass tags nothing new.
#
# This replaces the running `nodes_since_last_categorization` counter (whose value
# carried over between passes) with an explicit per-pass flag. The loop stops at the
# same point as before: after the first pass that makes no inference.
uncategorized_nodes = None
made_progress = True

while made_progress:
    # The first call scans the whole graph; later calls only re-check the leftovers.
    uncategorized_nodes = nodes_without_zipcodes(uncategorized_nodes)

    made_progress = False
    for node in uncategorized_nodes:
        # Element 0 of the result says whether this call assigned a zipcode.
        if infer_zipcode_from_neighbors(node)[0]:
            made_progress = True

CPU times: user 3min 10s, sys: 257 ms, total: 3min 10s
Wall time: 3min 10s


In [107]:
# Count the nodes that still have no zipcode (full-graph scan, no search list).
leftover = nodes_without_zipcodes()
len(leftover)

6639

# Write results to disk

In [109]:
# Persist the tagged graph as the '02_' file for the configured dataset.
# nx.write_gpickle was removed in NetworkX 3.0; it pickled the graph with the highest
# protocol, which is reproduced here with the pickle module directly.
with open("./data/02_" + DATASET + ".gpickle", "wb") as graph_file:
    pkl.dump(g, graph_file, pkl.HIGHEST_PROTOCOL)