# Notebook 02-voronoi_in_city_test

This notebook takes the work developed in Notebook 00-voronoi_in_ageb_test and applies it to a whole city (Aguascalientes).

It also helped to get the function `voronoi_points_within_polygon` working properly.

## Import libraries

In [33]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from scipy.spatial import Voronoi, voronoi_plot_2d
import shapely

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the directory three levels up importable so the project package `aup`
# can be resolved from this notebook's location.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally. When the import is nested under the `if`, it is
# skipped whenever the path is already registered (e.g. through PYTHONPATH),
# and every later `aup.*` call fails with NameError.
import aup

## Load city data

### Load mun_gdf (Area of interest)

In [2]:
# Notebook-level input: name of the city to analyse.
city = 'Aguascalientes'

# --------------- CREATE AREA OF INTEREST (aoi)
# Fetch the rows of metropolis.metro_gdf_2020 whose "city" matches `city`
# and tag the result as EPSG:4326.
query = f"SELECT * FROM metropolis.metro_gdf_2020 WHERE \"city\" LIKE \'{city}\'"
mun_gdf = aup.gdf_from_query(query, geometry_col='geometry').set_crs("EPSG:4326")

In [4]:
# --------------- DOWNLOAD POP DATA
# Collapse all rows of mun_gdf into a single area-of-interest geometry.
aoi = mun_gdf.dissolve()

print("Loading AGEBs for area of interest.")
pop_ageb_gdf = aup.gdf_from_polygon(aoi, 'censoageb', 'censoageb_2020')

print("Loading blocks for area of interest.")
# Keep only the blocks flagged as urban ('Urbana') in the AMBITO column.
pop_mza_gdf = (
    aup.gdf_from_polygon(aoi, 'censo_mza', 'censo_mza_2020')
    .loc[lambda blocks: blocks.AMBITO == 'Urbana']
    .copy()
)

Loading AGEBs for area of interest.
Loading blocks for area of interest.


## Methodology

### 1. Select nodes within area of interest

In [None]:
# Build the street network for the area of interest. The helper's three return
# values are unpacked as graph, nodes and edges (presumably GeoDataFrames for
# the last two, since they are plotted below - confirm against aup).
G, nodes, edges = aup.create_osmnx_network(aoi)

# Quick visual check: nodes and edges (both red) drawn over the area of interest.
# Stacking is controlled by zorder: aoi at the bottom, nodes on top.
fig,ax = plt.subplots(figsize=(5,5))
nodes.plot(ax=ax,color='red',zorder=2)
edges.plot(ax=ax,zorder=1,color='red')
aoi.plot(ax=ax,zorder=0)

### 2. Create voronoi polygons using nodes

In [None]:
def voronoi_points_within_polygon(polygon, points, admissible_error=0.01, max_iterations=10):
    """Return the Voronoi tessellation of ``points`` clipped to ``polygon``.

    Auxiliary seed points are placed along the bounding rectangle of the
    buffered polygon so that the cells generated by ``points`` are closed.
    The buffer distance starts at 100 and is multiplied by 10 on each attempt
    until the clipped cells leave at most ``admissible_error`` percent of the
    polygon's area uncovered.

    Parameters
    ----------
    polygon : geopandas.GeoDataFrame
        Area of interest. All rows are used for clipping and for the goal
        area; only the first unique geometry is used to build the bounding
        rectangle.
    points : geopandas.GeoDataFrame
        Seed points of the tessellation.
    admissible_error : float, default 0.01
        Maximum uncovered area, expressed as a percentage of the polygon area.
    max_iterations : int, default 10
        Maximum number of buffer distances to try before giving up.

    Returns
    -------
    geopandas.GeoDataFrame
        Clipped Voronoi polygons in EPSG:4326.

    Raises
    ------
    RuntimeError
        If the admissible error is not reached within ``max_iterations``
        attempts (previously the function would loop forever).
    """
    # Work in the projected CRS EPSG:6372 for buffering and area calculations.
    aoi = polygon.to_crs("EPSG:6372")
    pois = points.to_crs("EPSG:6372")

    # Everything below is independent of the buffer distance, so compute it once.
    goal_area = aoi.geometry.area.sum()
    aoi_geometry = aoi['geometry'].unique()[0]
    coords = np.array(pois.get_coordinates())

    # Buffer distance around the polygon; also the spacing between the
    # auxiliary points placed along the bounding rectangle.
    distance = 100

    for _ in range(max_iterations):
        print(f'Processing area of interest for distance = {distance}.')

        # Rectangular bound around the buffered area of interest
        bound = aoi_geometry.buffer(distance).envelope.boundary

        # Auxiliary points along the rectangular boundary, one every `distance` units
        boundarypoints = [bound.interpolate(distance=d)
                          for d in range(0, int(np.ceil(bound.length)), distance)]
        boundarycoords = np.array([[p.x, p.y] for p in boundarypoints])

        # Boundary points first, then the real seed points
        all_coords = np.concatenate((boundarycoords, coords))

        # Voronoi diagram of all coordinates; ridges that reach infinity
        # (marked with -1) are skipped, so only closed cells are polygonized.
        vor = Voronoi(points=all_coords)
        lines = [shapely.geometry.LineString(vor.vertices[line]) for line in vor.ridge_vertices if -1 not in line]
        polys = shapely.ops.polygonize(lines)
        unbounded_voronois = gpd.GeoDataFrame(geometry=gpd.GeoSeries(polys), crs="epsg:6372")

        # Clip the cells to the area of interest
        bounded_voronois = gpd.overlay(df1=unbounded_voronois, df2=aoi, how="intersection")

        # Area check done directly in the projected CRS (no reprojection round trip)
        voronois_area = bounded_voronois.geometry.area.sum()
        area_diff = ((goal_area - voronois_area)/(goal_area))*100

        if area_diff > admissible_error:
            print(f'Error = {round(area_diff,2)}%. Repeating process.')
            distance = distance * 10
        else:
            print(f'Error = {round(area_diff,2)}%. Admissible.')
            # Hand the result back in geographic coordinates
            return bounded_voronois.to_crs("EPSG:4326")

    raise RuntimeError(
        f'Voronoi polygons did not cover the area of interest within '
        f'{admissible_error}% after {max_iterations} attempts.'
    )


In [None]:
# Tessellate the area of interest using the network nodes as Voronoi seeds.
voronois_gdf = voronoi_points_within_polygon(aoi,nodes)

# Show: Voronoi cells (red, bottom layer), aoi boundary (blue) and nodes (maroon, top layer)
fig, ax = plt.subplots(figsize=(20, 20))
nodes.plot(ax=ax, color="maroon",zorder=2)
aoi.boundary.plot(ax=ax, edgecolor="blue", linewidth=6,zorder=1)
voronois_gdf.plot(ax=ax, color="red", alpha=0.3, edgecolor="black",zorder=0)

In [None]:
# Tag every Voronoi polygon with the osmid of the node(s) it spatially joins to,
# keeping only the identifier and the polygon geometry.
node_ids = nodes[['osmid', 'geometry']]
nodes_voronoi_gdf = voronois_gdf.sjoin(node_ids)[['osmid', 'geometry']]

# Show
print(nodes_voronoi_gdf.shape)
nodes_voronoi_gdf.head(1)

### 3. Spatial intersection of voronoi polygons with blocks. Calculate pct of area that corresponds to each osmid within block.

In [None]:
# Block area, measured in the projected CRS and carried back to EPSG:4326
mza_gdf = pop_mza_gdf.to_crs("EPSG:6372")
mza_gdf = mza_gdf.assign(area_mza=mza_gdf.geometry.area).to_crs("EPSG:4326")

# Split every block by the Voronoi polygons it intersects
mza_voronoi = gpd.overlay(df1=mza_gdf, df2=nodes_voronoi_gdf, how="intersection")
del mza_gdf

# Share of each block's area that falls inside each Voronoi zone.
# The fragment area is measured in the projected CRS; the helper column
# area_mza is discarded before returning to EPSG:4326.
mza_voronoi = mza_voronoi.to_crs("EPSG:6372")
mza_voronoi = mza_voronoi.assign(
    area_pct=mza_voronoi.geometry.area / mza_voronoi['area_mza']
)
mza_voronoi = mza_voronoi.drop(columns=['area_mza']).to_crs("EPSG:4326")

# Show
print(mza_voronoi.shape)
mza_voronoi.head(1)

### 4. Sum of the population data that corresponds to each node

In [None]:
# Number of distinct osmid values present in the block/Voronoi fragments.
len(mza_voronoi.osmid.unique())

In [None]:
# Census columns to distribute from blocks to nodes.
columns_of_interest = ['POBTOT','POBFEM','POBMAS',
                    'P_0A2','P_0A2_F','P_0A2_M',
                    'P_3A5','P_3A5_F','P_3A5_M',
                    'P_6A11','P_6A11_F','P_6A11_M',
                    'P_12A14','P_12A14_F','P_12A14_M',
                    'P_15A17','P_15A17_F','P_15A17_M',
                    'P_18A24','P_18A24_F','P_18A24_M',
                    'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
                    'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
                    'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
                    'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
                    'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
                    'REL_H_M','POB0_14','POB15_64','POB65_MAS']

# Weight every census column by the share of the block's area inside each
# Voronoi zone. This is done for all columns at once on a separate frame, so
# mza_voronoi itself is left untouched and the cell can be re-run safely.
# NOTE(review): REL_H_M is area-weighted and summed like the count columns;
# the later test cell excludes it - confirm this treatment is intended.
weighted_pop = (
    mza_voronoi[columns_of_interest]
    .apply(pd.to_numeric)
    .mul(mza_voronoi['area_pct'], axis=0)
)

# One groupby for all columns instead of one groupby + merge per column
pop_by_osmid = (
    weighted_pop
    .groupby(mza_voronoi['osmid'])
    .sum()
    .reset_index()
)

# Inner merge (pandas default): nodes whose osmid has no block fragment are
# dropped, exactly as in the previous column-by-column merge.
nodes_gdf = nodes.merge(pop_by_osmid, on='osmid')

print(nodes_gdf.shape)
nodes_gdf.head(1)

## Visual test

In [None]:
# Load the node-level population table already stored in the database
# (censo.nodes_pop_2020) for the area of interest; used below as the reference
# to compare against the nodes_gdf computed in this notebook.
nodes_pop_gdf = aup.gdf_from_polygon(aoi,'censo','nodes_pop_2020')

In [None]:
# Show the size of the database table and its first row
print(nodes_pop_gdf.shape)
nodes_pop_gdf.head(1)

In [None]:
# Side-by-side comparison of total population per node:
# left  = Voronoi-based estimate computed in this notebook (nodes_gdf),
# right = values stored in the database (nodes_pop_gdf).
fig, ax = plt.subplots(1,2,figsize=(10, 10))
nodes_gdf.plot('POBTOT',ax=ax[0],markersize=.05,cmap='Reds')
nodes_pop_gdf.plot('pobtot',ax=ax[1],markersize=.05,cmap='Reds')

## Other tests

In [None]:
# Compare column totals across the four sources: blocks, Voronoi-based nodes,
# AGEBs and the node table stored in the database.
# REL_H_M is excluded from the comparison.
test_cols = columns_of_interest.copy()
test_cols.remove('REL_H_M')

# Build one record per attribute and create the DataFrame once, instead of
# enlarging an empty frame cell by cell through .loc.
records = []
for col in test_cols:
    lower_col = col.lower()

    # Turn column to numeric (kept in place: later cells sum these columns)
    pop_mza_gdf[col] = pd.to_numeric(pop_mza_gdf[col])
    pop_ageb_gdf[lower_col] = pd.to_numeric(pop_ageb_gdf[lower_col])

    records.append({
        'atr': col,
        'blocks': pop_mza_gdf[col].sum(),
        'nodes': nodes_gdf[col].sum(),
        'ageb': pop_ageb_gdf[lower_col].sum(),
        'db_nodes': nodes_pop_gdf[lower_col].sum(),
    })

test_df = pd.DataFrame(records, columns=['atr','blocks','nodes','ageb','db_nodes'])

# Find differences in data and methodologies
test_df['diff'] = test_df['nodes'] - test_df['blocks']
test_df['blocks_diff'] = test_df['blocks'] - test_df['ageb']
test_df['diff_db'] = test_df['db_nodes'] - test_df['ageb']

# Sum of differences goes in a final row 'TOTAL'; the remaining columns of
# that row are left empty (NaN).
total_row = pd.DataFrame([{
    'atr': 'TOTAL',
    'diff': test_df['diff'].sum(),
    'blocks_diff': test_df['blocks_diff'].sum(),
    'diff_db': test_df['diff_db'].sum(),
}])
test_df = pd.concat([test_df, total_row], ignore_index=True)

# Format - reorder columns
test_df = test_df[['atr','blocks','nodes','diff','ageb','blocks_diff','db_nodes','diff_db']]

# Show
test_df

In [23]:
# Total population summed over AGEBs, followed by the AGEB rows (if any)
# whose pobtot is missing.
print(pop_ageb_gdf.pobtot.sum())
pop_ageb_gdf.loc[pop_ageb_gdf.pobtot.isna()]

1042295


Unnamed: 0,cve_geo,cve_ent,cve_mun,cve_loc,cve_ageb,geometry,entidad,nom_ent,mun,nom_mun,...,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic,cve_geo_ageb


In [19]:
# Total population summed over nodes, followed by the node rows (if any)
# whose POBTOT is missing.
# Requires nodes_gdf from step 4 of the methodology; running this cell before
# that step raises NameError.
print(nodes_gdf.POBTOT.sum())
nodes_gdf.loc[nodes_gdf.POBTOT.isna()]

NameError: name 'nodes_gdf' is not defined

In [5]:
# Inspect the first AGEB row to see the available columns.
pop_ageb_gdf.head(1)

Unnamed: 0,cve_geo,cve_ent,cve_mun,cve_loc,cve_ageb,geometry,entidad,nom_ent,mun,nom_mun,...,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic,cve_geo_ageb
0,010010001216A,1,1,1,216A,"POLYGON ((-102.27058 21.87363, -102.27083 21.8...",1,Aguascalientes,1,Aguascalientes,...,753.0,609.0,439.0,205.0,146.0,7.0,14.0,174.0,,010010001216A


## Test - Does loading pop_mza_gdf through aup.gdf_from_polygon give the same result as loading it through cve_ent + cve_mun queries?
__Answer:__ The result is the same, but `aup.gdf_from_polygon` (26s/26s for Ags) is faster than the cve_ent + cve_mun queries, which for Ags took 206s/206s using LIKE and = in the query, 503s/212s using = and =, and 207s/352s using LIKE and LIKE.

In [41]:
# Reload the blocks through the polygon-based helper and keep only the rows
# flagged as urban ('Urbana') in the AMBITO column.
pop_mza_gdf = (
    aup.gdf_from_polygon(aoi, 'censo_mza', 'censo_mza_2020')
    .loc[lambda blocks: blocks.AMBITO == 'Urbana']
    .copy()
)

In [26]:
# Reference values for the polygon-based load: shape, total population and
# the rows (if any) whose POBTOT is missing.
print(pop_mza_gdf.shape)
print(pop_mza_gdf.POBTOT.sum())
pop_mza_gdf.loc[pop_mza_gdf.POBTOT.isna()]

(12932, 239)
1041064


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC


In [42]:
mza_schema = 'censo_mza'
mza_table = 'censo_mza_2020'

# Query only the (cve_ent, cve_mun) pairs that actually occur in the AGEB
# data. Crossing the two unique lists independently would also request
# state/municipality combinations that are not part of the area of interest
# whenever more than one state is involved.
ent_mun_pairs = pop_ageb_gdf[['cve_ent','cve_mun']].drop_duplicates()

# Collect one frame per municipality and concatenate once at the end
mza_frames = []
for cve_ent, cve_mun in ent_mun_pairs.itertuples(index=False, name=None):
    query = f"SELECT * FROM {mza_schema}.{mza_table} WHERE (\"CVE_ENT\" LIKE \'{cve_ent}\') AND (\"CVE_MUN\" = \'{cve_mun}\')"
    mza_frames.append(aup.gdf_from_query(query, geometry_col='geometry'))

pop_mza_gdf_2 = pd.concat(mza_frames) if mza_frames else gpd.GeoDataFrame()

# Remove AMBITO == 'Rural' (If exists)
pop_mza_gdf_2 = pop_mza_gdf_2.loc[pop_mza_gdf_2.AMBITO == 'Urbana'].copy()

# Show
print(pop_mza_gdf_2.shape)
pop_mza_gdf_2.head(1)

(12932, 239)


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC
0,100100010286027,1,1,1,286,27,Urbana,Típica,"POLYGON ((-102.31215 21.90182, -102.31220 21.9...",1,...,31,45,39,30,18,18,0,0,5,0


In [43]:
# Same checks for the cve_ent + cve_mun load: shape, total population and
# the rows (if any) whose POBTOT is missing.
print(pop_mza_gdf_2.shape)
print(pop_mza_gdf_2.POBTOT.sum())
pop_mza_gdf_2.loc[pop_mza_gdf_2.POBTOT.isna()]

(12932, 239)
1041064


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC


## Test - And what about querying by LIKE CVEGEO?
__Answer:__ It is still the same but still slower (354s) than aup.gdf_from_polygon.

In [47]:
mza_schema = 'censo_mza'
mza_table = 'censo_mza_2020'

# Load blocks of city: one prefix query per CVEGEO value found in mun_gdf.
# The frames are collected in a list and concatenated once, instead of
# re-concatenating the growing result on every iteration.
mza_frames = []
for cvegeo in mun_gdf.CVEGEO.unique():
    # '%%' is kept exactly as in the original query string
    # (presumably an escaped '%' wildcard for the DB driver - confirm).
    query = f"SELECT * FROM {mza_schema}.{mza_table} WHERE \"CVEGEO\" LIKE \'{cvegeo}%%\'"
    mza_frames.append(aup.gdf_from_query(query, geometry_col='geometry'))

pop_mza_gdf_3 = pd.concat(mza_frames) if mza_frames else gpd.GeoDataFrame()

# Remove AMBITO == 'Rural' (If exists)
pop_mza_gdf_3 = pop_mza_gdf_3.loc[pop_mza_gdf_3.AMBITO == 'Urbana'].copy()

# Show
print(pop_mza_gdf_3.shape)
pop_mza_gdf_3.head(1)

(12932, 239)


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC
0,100100010229001,1,1,1,229,1,Urbana,Típica,"POLYGON ((-102.29582 21.92984, -102.29579 21.9...",1,...,14,21,20,17,16,6,0,0,,0


In [48]:
# Same checks for the CVEGEO-prefix load: shape, total population and
# the rows (if any) whose POBTOT is missing.
print(pop_mza_gdf_3.shape)
print(pop_mza_gdf_3.POBTOT.sum())
pop_mza_gdf_3.loc[pop_mza_gdf_3.POBTOT.isna()]

(12932, 239)
1041064


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC


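## Test - Do AGEBs and blocks have the same POBTOT data?
__Answer:__ No. The AGEB total (1,042,295) is higher than the block total (1,041,064). The cause is still unknown; note that the blocks were filtered to `AMBITO == 'Urbana'` while the AGEBs were not filtered, which may explain part of the gap (to be confirmed).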

In [34]:
# Reload the AGEBs for the area of interest (schema 'censoageb', table 'censoageb_2020').
pop_ageb_gdf = aup.gdf_from_polygon(aoi,'censoageb','censoageb_2020') #Loads in 164 segs

In [31]:
# AGEB-level reference values: shape, total population and the rows (if any)
# whose pobtot is missing.
print(pop_ageb_gdf.shape)
print(pop_ageb_gdf.pobtot.sum())
pop_ageb_gdf.loc[pop_ageb_gdf.pobtot.isna()]

(392, 237)
1042295


Unnamed: 0,cve_geo,cve_ent,cve_mun,cve_loc,cve_ageb,geometry,entidad,nom_ent,mun,nom_mun,...,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic,cve_geo_ageb
