# WORK IN PROGRESS

## Import libraries

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from scipy.spatial import Voronoi, voronoi_plot_2d
import shapely

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the repository root importable so the project-local `aup` package resolves.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: when the path was already registered (e.g. the cell is
# re-run or the path was added elsewhere) the import must still happen.
import aup

  ox.config(


## Function definitions

In [2]:
def voronoi_points_within_polygon(polygon, points, points_id_col, admissible_error=0.01,
                                  projected_crs="EPSG:6372", max_iterations=10):
    """Build the Voronoi tessellation of ``points`` clipped to ``polygon``.

    Helper points are placed along the bounding box of a buffer around the area
    of interest so that the Voronoi cells of the real points are closed. The
    buffer distance starts at 100 and is multiplied by 10 until the clipped
    Voronoi cells cover the area of interest within ``admissible_error``.

    Args:
        polygon (geopandas.GeoDataFrame): Area of interest (Voronoi extent).
            Only the first unique geometry is used to build the helper boundary.
        points (geopandas.GeoDataFrame): Points that generate the Voronoi cells.
        points_id_col (str): Name of the points ID column; it is carried over
            to each output polygon.
        admissible_error (float): Maximum accepted difference, in percent,
            between the area of interest and the summed Voronoi area.
        projected_crs (str): Projected CRS used for buffering and area
            calculations. Defaults to "EPSG:6372".
        max_iterations (int): Maximum number of buffer distances to try before
            giving up (prevents an endless loop when the error never drops).

    Returns:
        geopandas.GeoDataFrame: Voronoi polygons (EPSG:4326) carrying the points
        ID and extending up to the area of interest extent.

    Raises:
        RuntimeError: If the area error is still above ``admissible_error``
            after ``max_iterations`` attempts.
    """
    # Work in a projected CRS so buffer distances and areas are in CRS units.
    aoi = polygon.to_crs(projected_crs)
    pois = points.to_crs(projected_crs)

    # Goal area (area of the whole area of interest).
    goal_area = aoi.geometry.area.sum()

    # Loop-invariant inputs: the geometry used for the helper boundary and the
    # coordinates of the generating points.
    aoi_geometry = aoi['geometry'].unique()[0]
    coords = np.array(pois.get_coordinates())

    # Distance drives both the buffer around the polygon and the spacing of the
    # helper points along the buffer's bounding box.
    distance = 100

    for _ in range(max_iterations):

        print(f'Processing area of interest for distance = {distance}.')

        # Rectangular bound around the buffered area of interest.
        bound = aoi_geometry.buffer(distance).envelope.boundary

        # Helper points along the rectangular boundary, one every `distance` units.
        boundarypoints = [bound.interpolate(distance=d) for d in range(0, np.ceil(bound.length).astype(int), distance)]
        boundarycoords = np.array([[p.x, p.y] for p in boundarypoints])

        # Boundary helper points plus the real points.
        all_coords = np.concatenate((boundarycoords, coords))

        # Voronoi over all coords; ridges touching infinity (-1) are discarded
        # and the remaining ridges are polygonized into cells.
        vor = Voronoi(points=all_coords)
        lines = [shapely.geometry.LineString(vor.vertices[line]) for line in vor.ridge_vertices if -1 not in line]
        polys = shapely.ops.polygonize(lines)
        unbounded_voronois = gpd.GeoDataFrame(geometry=gpd.GeoSeries(polys), crs=projected_crs)

        # Attach the points ID to the cell that contains each point; cells of
        # the helper boundary points contain no real point and are dropped.
        unbounded_voronois = gpd.sjoin(unbounded_voronois, pois[[points_id_col, 'geometry']])

        # Clip the cells to the area of interest.
        bounded_voronois = gpd.overlay(df1=unbounded_voronois, df2=aoi, how="intersection")

        # Area check, done directly in the projected CRS (no round trip
        # through EPSG:4326 is needed to measure area).
        voronois_area = bounded_voronois.geometry.area.sum()
        area_diff = ((goal_area - voronois_area) / goal_area) * 100

        if area_diff > admissible_error:
            print(f'Error = {round(area_diff,2)}%. Repeating process.')
            distance = distance * 10
        else:
            print(f'Error = {round(area_diff,2)}%. Admissible.')
            # Deliver the result in geographic coordinates.
            return bounded_voronois.to_crs("EPSG:4326")

    raise RuntimeError(
        f'Voronoi area error did not drop below {admissible_error}% '
        f'after {max_iterations} iterations.'
    )

In [99]:
def main():
    """Distribute census population from blocks to network nodes and hexagons.

    Reads the notebook-level globals ``metro_gdf``, ``city``, ``year`` and
    ``res_list``; they must be defined before calling.

    Steps:
        1. Build the area of interest for ``city`` and load AGEB/block census data.
        2. Fill NaN population values in blocks (aup.calculate_censo_nan_values_v1).
        3. Split block population among nodes through node Voronoi polygons,
           proportionally to the share of block area inside each polygon.
        4. Aggregate node population to the hexgrid for every resolution in
           ``res_list`` and compute population density per hectare.

    Returns:
        tuple: ``(pop_nodes_gdf, hex_socio_gdf)`` — nodes with population
        columns, and hexagons (all requested resolutions concatenated) with
        population columns plus ``DENS_POB_HA``.

    Raises:
        ValueError: If ``year`` is neither '2010' nor '2020'.
    """

    ##########################################################################################
    # STEP 1: LOAD DATA

    # --------------- 1.1 CREATE AREA OF INTEREST FOR CITY
    city_gdf = metro_gdf.loc[metro_gdf.city == city]
    city_gdf = city_gdf.set_crs("EPSG:4326")
    aoi = city_gdf.dissolve()

    # --------------- 1.2 LOAD POP DATA
    # Needs update for 'year' to work: The col names for pop fields in 2010 and 2020 are identical, but 2010 is .lower and 2020 is .upper
    # Needs update for 'year' to work: In 2010 the ageb key col is "ageb", while in 2020 it is "CVE_AGEB".

    print("Loading AGEBs for area of interest.")
    pop_ageb_gdf = aup.gdf_from_polygon(aoi,'censoageb',f'censoageb_{year}')
    pop_ageb_gdf = pop_ageb_gdf.set_crs("EPSG:4326")

    print("Loading blocks for area of interest.")
    pop_mza_gdf = aup.gdf_from_polygon(aoi,'censo_mza',f'censo_mza_{year}')
    pop_mza_gdf = pop_mza_gdf.set_crs("EPSG:4326")
    # Keep urban blocks only.
    pop_mza_gdf = pop_mza_gdf.loc[pop_mza_gdf.AMBITO == 'Urbana'].copy()

    ##########################################################################################
    # STEP 2: CALCULATE NaN VALUES for pop fields (most of them, check function) of gdf containing blocks.

    print("--"*30)
    print("CALCULATING NAN VALUES FOR POP FIELDS.")

    # --------------- 2.1 CALCULATE_CENSO_NAN_VALUES Function
    pop_mza_gdf_calc = aup.calculate_censo_nan_values_v1(pop_ageb_gdf,pop_mza_gdf,extended_logs=False)

    ##########################################################################################
    # STEP 3: DISTRIBUTE POP BLOCK DATA TO NODES USING VORONOI

    print("--"*30)
    print("DISTRIBUTING POP DATA FROM BLOCKS TO NODES STARTING USING VORONOI.")

    # --------------- 3.0 LOAD NODES
    print("Loading nodes for area of interest.")

    if year == '2010':
        # Needs update for 'year' to work: Will use "osmid" as node ID on function voronoi_points_within_polygon, but this version has no "osmid". ¿Use 'ID'?
        _, nodes, _ = aup.graph_from_hippo(aoi, schema='networks', edges_folder='edges_2011', nodes_folder='nodes_2011')
        # FOR VIALIDADES 2011 ONLY: Drop unnecessary columns from nodes (only present in 2010)
        nodes.drop(['ID', 'TIPOVIA', 'TIPO',
                    'NUMERO', 'DERE_TRAN', 'ADMINISTRA', 'NUME_CARR', 'CONDICION',
                    'ORIGEN', 'CALI_REPR', 'CVEGEO', 'NOMVIAL', 'SENTIDO', 'LONGITUD', 'UNIDAD',
                    'vertex_pos', 'vertex_ind', 'vertex_par', 'vertex_p_1',
                    'distance', 'angle'], inplace = True, axis=1)

    elif year == '2020':
        _, nodes, _ = aup.graph_from_hippo(aoi, schema='osmnx', edges_folder='edges_23_line', nodes_folder='nodes_23_point')

    else:
        # Fail explicitly instead of hitting a NameError on `nodes` below.
        raise ValueError(f"Unsupported year {year!r}: expected '2010' or '2020'.")

    nodes.reset_index(inplace=True)
    nodes = nodes.to_crs("EPSG:4326")

    # --------------- 3.1 CREATE VORONOI POLYGONS USING NODES

    print("Creating voronois with nodes osmid data.")

    # Create voronois
    voronois_gdf = voronoi_points_within_polygon(aoi,nodes,'osmid')
    nodes_voronoi_gdf = voronois_gdf[['osmid','geometry']]

    # --------------- 3.2 SPATIAL INTERSECTION OF POLYGONS WITH BLOCKS

    print("Creating spatial join between voronoi polygons and blocks.")

    # Calculate block area (in projected CRS), then go back to EPSG:4326 for the overlay
    mza_gdf = pop_mza_gdf_calc.to_crs("EPSG:6372")
    mza_gdf['area_mza'] = mza_gdf.geometry.area
    mza_gdf = mza_gdf.to_crs("EPSG:4326")

    # Overlay blocks with voronoi (Spatial intersection)
    mza_voronoi = gpd.overlay(df1=mza_gdf, df2=nodes_voronoi_gdf, how="intersection")
    del mza_gdf

    print("Calculating area_pct that corresponds to each osmid within each block.")

    # Calculate pct of area that corresponds to each osmid within each block
    mza_voronoi = mza_voronoi.to_crs("EPSG:6372")
    mza_voronoi['area_voronoi'] = mza_voronoi.geometry.area
    mza_voronoi = mza_voronoi.to_crs("EPSG:4326")
    mza_voronoi['area_pct'] = mza_voronoi['area_voronoi']/mza_voronoi['area_mza']

    # Drop used columns
    mza_voronoi.drop(columns=['area_mza','area_voronoi'],inplace=True)

    # --------------- 3.3 SUM POB DATA THAT CORRESPONDS TO EACH NODE (Groups mza_voronoi data by osmid)

    print("Adding pob data by node.")

    columns_of_interest = ['POBTOT','POBFEM','POBMAS',
                    'P_0A2','P_0A2_F','P_0A2_M',
                    'P_3A5','P_3A5_F','P_3A5_M',
                    'P_6A11','P_6A11_F','P_6A11_M',
                    'P_12A14','P_12A14_F','P_12A14_M',
                    'P_15A17','P_15A17_F','P_15A17_M',
                    'P_18A24','P_18A24_F','P_18A24_M',
                    'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
                    'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
                    'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
                    'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
                    'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
                    'POB0_14','POB15_64','POB65_MAS'] # Similar to columns_of_interest inside function calculate_censo_nan_values_v1 but with POBTOT and without REL_H_M.

    # Create pop_nodes_gdf (Will store nodes pop output)
    pop_nodes_gdf = nodes.drop(columns=['x','y','street_count','city'])

    # Weight every pop column by the share of the block area inside each voronoi
    # polygon, then sum per osmid in a single pass (instead of one groupby and
    # one merge per column).
    pop_values = mza_voronoi[columns_of_interest].apply(pd.to_numeric)
    weighted_pop = pop_values.mul(mza_voronoi['area_pct'], axis=0)
    weighted_pop['osmid'] = mza_voronoi['osmid']
    osmid_grouped_data = weighted_pop.groupby('osmid', as_index=False)[columns_of_interest].sum()

    # Inner merge: nodes whose voronoi polygon intersects no block are dropped.
    pop_nodes_gdf = pd.merge(pop_nodes_gdf, osmid_grouped_data, on='osmid')

    ##########################################################################################
    # STEP 4: TURN NODES POP DATA TO HEXS

    print("--"*30)
    print("DISTRIBUTING POP DATA FROM NODES TO HEXGRID.")

    # One GeoDataFrame per resolution; concatenated once after the loop.
    hex_frames = []

    for res in res_list:
        # --------------- 4.1 LOAD HEXGRID
        # Load hexgrid from db
        print(f"Loading hexgrid res {res} for area of interest.")
        # NOTE(review): the query is built by string interpolation; `city` and `res`
        # come from notebook variables. Parameterize if they ever come from untrusted input.
        query = f"SELECT * FROM hexgrid.hexgrid_{res}_city_2020 WHERE \"city\" LIKE \'{city}\'"
        hex_res_gdf = aup.gdf_from_query(query, geometry_col='geometry')
        hex_res_gdf = hex_res_gdf.set_crs("EPSG:4326")

        # Format - Remove res from index name and add column with res
        hex_res_gdf.rename(columns={f'hex_id_{res}':'hex_id'},inplace=True)
        hex_res_gdf['res'] = res
        print(f"Created hex_grid with {res} resolution")

        # --------------- 4.2 GROUP POPDATA IN HEXGRID
        # Group pop data
        string_columns = ['osmid'] # Nodes string columns are not used in aup.group_sociodemographic_data. The rest are turned into numeric and processed.
        hex_socio_df = aup.socio_points_to_polygon(hex_res_gdf, pop_nodes_gdf, 'hex_id', string_columns)
        print(f"Agregated socio data to hex with a total of {hex_socio_df.POBTOT.sum()} population for resolution {res}.")

        # Hexagons data to hex_gdf GeoDataFrame
        hex_socio_gdf_tmp = hex_res_gdf.merge(hex_socio_df, on='hex_id')

        # --------------- 4.3 Add additional common fields
        # Calculate population density (area in projected CRS units / 10000 -> hectares)
        hectares = hex_socio_gdf_tmp.to_crs("EPSG:6372").area / 10000
        hex_socio_gdf_tmp['DENS_POB_HA'] = hex_socio_gdf_tmp['POBTOT'] / hectares
        print(f"Calculated an average density of {hex_socio_gdf_tmp.DENS_POB_HA.mean()}")

        # Store this resolution (if more resolutions, they are stacked below)
        hex_frames.append(hex_socio_gdf_tmp)

    # Stack all resolutions; an empty res_list yields an empty GeoDataFrame.
    hex_socio_gdf = pd.concat(hex_frames) if hex_frames else gpd.GeoDataFrame()

    return pop_nodes_gdf, hex_socio_gdf

## Function run test (step by step)

### 0. Get base data for 1 city (Test, Aguascalientes)

In [3]:
# --------------- PARAMETERS
# Year of analysis
year = '2020' # Currently works only for 2020, look for notes that say "Needs update for 'year' to work:"

# List of skip cities (If failed / want to skip city)
skip_city_list = []

# Hexgrid res of output
res_list = [8,9] #Only 8,9,10 and 11 available, will run 8 and 9 for prox. analysis v2.

# Save info
save = True
save_schema = 'censo'
save_table = f'censo_inegi_{year[:2]}_ageb_hex'

# Test (If testing, runs res 8 for Aguascalientes ONLY and does not save it)
test = True
# --------------- 


# --------------- PARAMETERS THAT SHOULDN'T CHANGE
# Cities (2020 unless running for 2010, then metro_gdf_2015?)
metro_schema = 'metropolis'
metro_table = 'metro_gdf_2020'

# To be decided
censo_column_start = 14 #column where numeric data starts in censo (16 for 2010, 14 for 2020)
censo_column_end = 0 #column where numeric data ends in censo (-1 for 2010, all up to the end (0) for 2020)

# Do not modify unless bd names change
pop_schema = 'censoageb'
pop_table = 'censoageb_' + year
# --------------- 


# --------------- SCRIPT
# Load cities (municipalities)
query = f"SELECT * FROM {metro_schema}.{metro_table}"
metro_gdf = aup.gdf_from_query(query, geometry_col='geometry')
metro_gdf = metro_gdf.set_crs("EPSG:4326")

city_list = list(metro_gdf.city.unique())
k = len(city_list)

print(f'Loaded city list with {k} cities.')

# Prevent cities being analyzed several times in case of a crash.
# Best effort: if the output table cannot be read (e.g. it does not exist yet),
# report why and continue as if no city had been processed.
processed_city_list = []
try:
    query = f"SELECT city FROM {save_schema}.{save_table}"
    cities_processed = aup.df_from_query(query)
    processed_city_list = list(cities_processed.city.unique())
except Exception as e:
    print(f'Could not read processed cities from {save_schema}.{save_table} ({e}). Assuming none processed.')

# LOG - Print progress so far
missing_cities_list = [name for name in city_list if name not in processed_city_list]

i = len(processed_city_list)
print(f'Already processed ({i}/{k}) cities.')
print(f'Missing procesing for cities: {missing_cities_list}')

# If test, simplifies:
if test:
    res_list = [8]
    missing_cities_list = ['Aguascalientes']
    save = False

#for city in missing_cities_list:
#    if city not in skip_city_list:
#        print("--"*40)
#        i = i + 1
#        print(f"Starting city {i}/{k}: {city}")
#        pop_mza_gdf_calc = main()

Loaded city list with 71 cities.
Already processed (0/71) cities.
Missing procesing for cities: ['Aguascalientes', 'Ensenada', 'Mexicali', 'Tijuana', 'La Paz', 'Los Cabos', 'Campeche', 'Laguna', 'Monclova', 'Piedras Negras', 'Saltillo', 'Colima', 'Tapachula', 'Tuxtla', 'Chihuahua', 'Delicias', 'Juarez', 'CDMX', 'ZMVM', 'Durango', 'Celaya', 'Guanajuato', 'Leon', 'Irapuato', 'Acapulco', 'Chilpancingo', 'Pachuca', 'Tulancingo', 'Guadalajara', 'Vallarta', 'Piedad', 'Toluca', 'Morelia', 'Zamora', 'Uruapan', 'Cuautla', 'Cuernavaca', 'Tepic', 'Monterrey', 'Oaxaca', 'Puebla', 'San Martin', 'Tehuacan', 'Queretaro', 'Cancun', 'Chetumal', 'Playa', 'SLP', 'Culiacan', 'Los Mochis', 'Mazatlan', 'Guaymas', 'Ciudad Obregon', 'Hermosillo', 'Nogales', 'Villahermosa', 'Victoria', 'Matamoros', 'Nuevo Laredo', 'Reynosa', 'Tampico', 'Tlaxcala', 'Coatzacoalcos', 'Cordoba', 'Minatitlan', 'Orizaba', 'Poza Rica', 'Veracruz', 'Xalapa', 'Merida', 'Zacatecas']


### 1. LOAD DATA

In [4]:
# City analysed by the step-by-step cells below.
city = 'Aguascalientes'

In [5]:
##########################################################################################
# STEP 1: LOAD DATA

# --------------- 1.1 Area of interest: all rows of the selected city, dissolved
city_gdf = metro_gdf[metro_gdf['city'] == city].set_crs("EPSG:4326")
aoi = city_gdf.dissolve()

# --------------- 1.2 Census data (AGEBs and blocks) inside the area of interest
print("Loading AGEBs for area of interest.")
pop_ageb_gdf = aup.gdf_from_polygon(
    aoi, 'censoageb', f'censoageb_{year}'
).set_crs("EPSG:4326")

print("Loading blocks for area of interest.")
pop_mza_gdf = aup.gdf_from_polygon(
    aoi, 'censo_mza', f'censo_mza_{year}'
).set_crs("EPSG:4326")
# Only urban blocks are kept
pop_mza_gdf = pop_mza_gdf[pop_mza_gdf['AMBITO'] == 'Urbana'].copy()

Loading AGEBs for area of interest.
Loading blocks for area of interest.


### 2. CALCULATE NaN VALUES

In [6]:
##########################################################################################
# STEP 2: Fill NaN values in the pop fields (most of them) of the blocks gdf.

print("-" * 60)
print("STARTING nan calculating function for block's pop fields.")

# --------------- 2.1 Blocks are completed using the AGEB data through the aup helper
pop_mza_gdf_calc = aup.calculate_censo_nan_values_v1(
    pop_ageb_gdf,
    pop_mza_gdf,
    extended_logs=False,
)

------------------------------------------------------------
STARTING nan calculating function for block's pop fields.
STARTING NANs calculation.
Calculating NaNs. 10% done.
Calculating NaNs. 20% done.
Calculating NaNs. 30% done.
Calculating NaNs. 40% done.
Calculating NaNs. 50% done.
Calculating NaNs. 60% done.
Calculating NaNs. 70% done.
Calculating NaNs. 80% done.
Calculating NaNs. 90% done.
Calculating NaNs. 100% done.
Finished calculating NaNs.
Percentage of NaNs found using blocks gdf: 79.7%.
Columns which could be solved entirely using equations in block_gdf: 4980.0.
Columns which required AGEB filling: 9916.0.


In [7]:
# Sanity check: frame dimensions and total population after the NaN calculation.
print(pop_mza_gdf_calc.shape)
print(pop_mza_gdf_calc.POBTOT.sum())
# Cell output: blocks whose POBTOT is still NaN.
pop_mza_gdf_calc.loc[pop_mza_gdf_calc.POBTOT.isna()]

(12932, 239)
1041064


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC


### 3. DISTRIBUTE POP BLOCK DATA TO NODES USING VORONOI

In [30]:
##########################################################################################
# STEP 3: DISTRIBUTE POP BLOCK DATA TO NODES USING VORONOI

print("--"*30)
print("STARTING Distribution of block data to nodes.")

# --------------- 3.0 LOAD NODES
print("Loading nodes for area of interest.")

if year == '2010':
    _, nodes, _ = aup.graph_from_hippo(aoi, schema='networks', edges_folder='edges_2011', nodes_folder='nodes_2011')
    # FOR VIALIDADES 2011 ONLY: Drop unnecessary columns from nodes (only present in 2010)
    nodes.drop(['ID', 'TIPOVIA', 'TIPO', 
                'NUMERO', 'DERE_TRAN', 'ADMINISTRA', 'NUME_CARR', 'CONDICION', 
                'ORIGEN', 'CALI_REPR', 'CVEGEO', 'NOMVIAL', 'SENTIDO', 'LONGITUD', 'UNIDAD', 
                'vertex_pos', 'vertex_ind', 'vertex_par', 'vertex_p_1', 
                'distance', 'angle'], inplace = True, axis=1)
    
elif year == '2020':
    _, nodes, _ = aup.graph_from_hippo(aoi, schema='osmnx', edges_folder='edges_23_line', nodes_folder='nodes_23_point')

else:
    # Fail explicitly instead of hitting a NameError on `nodes` below.
    raise ValueError(f"Unsupported year {year!r}: expected '2010' or '2020'.")

nodes.reset_index(inplace=True)
nodes = nodes.to_crs("EPSG:4326")

# --------------- 3.1 CREATE VORONOI POLYGONS USING NODES

print("Creating voronois with nodes osmid data.")

# Create voronois
voronois_gdf = voronoi_points_within_polygon(aoi,nodes,'osmid')
nodes_voronoi_gdf = voronois_gdf[['osmid','geometry']]

# --------------- 3.2 SPATIAL INTERSECTION OF POLYGONS WITH BLOCKS

print("Creating spatial join between voronoi polygons and blocks.")

# Calculate block area (in projected CRS), then go back to EPSG:4326 for the overlay
mza_gdf = pop_mza_gdf_calc.to_crs("EPSG:6372")

print(f'Pob mza_gdf = {mza_gdf.POBTOT.sum()}.')

mza_gdf['area_mza'] = mza_gdf.geometry.area
mza_gdf = mza_gdf.to_crs("EPSG:4326")

# Overlay blocks with voronoi (Spatial intersection)
mza_voronoi = gpd.overlay(df1=mza_gdf, df2=nodes_voronoi_gdf, how="intersection")
del mza_gdf

# Every piece of a split block repeats the block's POBTOT, so this sum is
# expected to exceed the block total printed above.
print(f'Pob mza_voronoi = {mza_voronoi.POBTOT.sum()}.')

print("Calculating area_pct that corresponds to each osmid within each block.")

# Calculate pct of area that corresponds to each osmid within each block
mza_voronoi = mza_voronoi.to_crs("EPSG:6372")
mza_voronoi['area_voronoi'] = mza_voronoi.geometry.area
mza_voronoi = mza_voronoi.to_crs("EPSG:4326")
mza_voronoi['area_pct'] = mza_voronoi['area_voronoi']/mza_voronoi['area_mza']

# Drop used columns
mza_voronoi.drop(columns=['area_mza','area_voronoi'],inplace=True)

# --------------- 3.3 SUM POB DATA THAT CORRESPONDS TO EACH NODE (Groups mza_voronoi data by osmid)

print("Adding pob data by node.")

columns_of_interest = ['POBTOT','POBFEM','POBMAS',
                'P_0A2','P_0A2_F','P_0A2_M',
                'P_3A5','P_3A5_F','P_3A5_M',
                'P_6A11','P_6A11_F','P_6A11_M',
                'P_12A14','P_12A14_F','P_12A14_M',
                'P_15A17','P_15A17_F','P_15A17_M',
                'P_18A24','P_18A24_F','P_18A24_M',
                'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
                'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
                'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
                'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
                'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
                'POB0_14','POB15_64','POB65_MAS']

# Nodes frame that will receive the pop columns
pop_nodes_gdf = nodes.drop(columns=['x','y','street_count','city'])

# Weight every pop column by the share of the block area inside each voronoi
# polygon, then sum per osmid in a single pass (instead of one groupby and one
# merge per column).
pop_values = mza_voronoi[columns_of_interest].apply(pd.to_numeric)
weighted_pop = pop_values.mul(mza_voronoi['area_pct'], axis=0)
weighted_pop['osmid'] = mza_voronoi['osmid']
osmid_grouped_data = weighted_pop.groupby('osmid', as_index=False)[columns_of_interest].sum()

# Inner merge: nodes whose voronoi polygon intersects no block are dropped.
pop_nodes_gdf = pd.merge(pop_nodes_gdf, osmid_grouped_data, on='osmid')

------------------------------------------------------------
STARTING Distribution of block data to nodes.
Loading nodes for area of interest.
Creating voronois with nodes osmid data.
Processing area of interest for distance = 100.
Error = 0.49%. Repeating process.
Processing area of interest for distance = 1000.
Error = 0.23%. Repeating process.
Processing area of interest for distance = 10000.
Error = 0.0%. Admissible.
Creating spatial join between voronoi polygons and blocks.
Pob mza_gdf = 1041064.
Pob mza_voronoi = 7181828.
Calculating area_pct that corresponds to each osmid within each block.
Adding pob data by node.


In [31]:
# Sanity check: frame dimensions and total population assigned to nodes
# (to be compared against the blocks total).
print(pop_nodes_gdf.shape)
print(pop_nodes_gdf.POBTOT.sum())
# Cell output: nodes whose POBTOT is NaN.
pop_nodes_gdf.loc[pop_nodes_gdf.POBTOT.isna()]

(35950, 41)
1041063.9769270906


Unnamed: 0,osmid,geometry,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,...,P_12YMAS_M,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS


In [32]:
# List the node frame's columns and show its first row.
print(list(pop_nodes_gdf.columns))
pop_nodes_gdf.head(1)

['osmid', 'geometry', 'POBTOT', 'POBFEM', 'POBMAS', 'P_0A2', 'P_0A2_F', 'P_0A2_M', 'P_3A5', 'P_3A5_F', 'P_3A5_M', 'P_6A11', 'P_6A11_F', 'P_6A11_M', 'P_12A14', 'P_12A14_F', 'P_12A14_M', 'P_15A17', 'P_15A17_F', 'P_15A17_M', 'P_18A24', 'P_18A24_F', 'P_18A24_M', 'P_60YMAS', 'P_60YMAS_F', 'P_60YMAS_M', 'P_3YMAS', 'P_3YMAS_F', 'P_3YMAS_M', 'P_12YMAS', 'P_12YMAS_F', 'P_12YMAS_M', 'P_15YMAS', 'P_15YMAS_F', 'P_15YMAS_M', 'P_18YMAS', 'P_18YMAS_F', 'P_18YMAS_M', 'POB0_14', 'POB15_64', 'POB65_MAS']


Unnamed: 0,osmid,geometry,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,...,P_12YMAS_M,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS
0,272921360,POINT (-102.29507 21.87288),0.828358,0.393971,0.437472,-0.007713,0.0,-0.003085,0.001732,0.001073,...,0.43593,0.792571,0.373764,0.418807,0.744288,0.373764,0.370525,0.038873,0.683512,0.109059


In [89]:
# VISUAL TEST
#aoi.to_file('../../../data/external/temporal_fromjupyter/voronoi_censo_test/aoi_ags.gpkg', driver='GPKG')
#nodes.to_file('../../../data/external/temporal_fromjupyter/voronoi_censo_test/nodes_ags.gpkg', driver='GPKG')
#voronois_gdf.to_file('../../../data/external/temporal_fromjupyter/voronoi_censo_test/voronois_ags.gpkg', driver='GPKG')

### 4. TURN NODES POP DATA TO HEXS

In [81]:
##########################################################################################
# STEP 4: TURN NODES POP DATA TO HEXS
res_list = [8]

# One GeoDataFrame per resolution; concatenated once after the loop
# (avoids concatenating onto an empty GeoDataFrame on every iteration).
hex_frames = []

for res in res_list:
    # --------------- 4.1 LOAD HEXGRID
    print(f"Loading hexgrid res {res} for area of interest.")
    # NOTE(review): the query is built by string interpolation; `city` and `res`
    # come from notebook variables. Parameterize if they ever come from untrusted input.
    query = f"SELECT * FROM hexgrid.hexgrid_{res}_city_2020 WHERE \"city\" LIKE \'{city}\'"
    hex_res_gdf = aup.gdf_from_query(query, geometry_col='geometry')
    hex_res_gdf = hex_res_gdf.set_crs("EPSG:4326")

    # Format - Remove res from index name and add column with res
    hex_res_gdf.rename(columns={f'hex_id_{res}':'hex_id'},inplace=True)
    hex_res_gdf['res'] = res
    print(f"Created hex_grid with {res} resolution")

    # Group pop data
    string_columns = ['osmid'] # Nodes string columns are not used in aup.group_sociodemographic_data. The rest are turned into numeric and processed.
    hex_socio_df = aup.socio_points_to_polygon(hex_res_gdf, pop_nodes_gdf, 'hex_id', string_columns) 
    print(f"Agregated socio data to hex with a total of {hex_socio_df.POBTOT.sum()} population for resolution {res}.")

    # Hexagons data to hex_gdf GeoDataFrame
    hex_socio_gdf_tmp = hex_res_gdf.merge(hex_socio_df, on='hex_id')
    
    # Calculate population density (area in projected CRS units / 10000 -> hectares)
    hectares = hex_socio_gdf_tmp.to_crs("EPSG:6372").area / 10000
    hex_socio_gdf_tmp['DENS_POB_HA'] = hex_socio_gdf_tmp['POBTOT'] / hectares 
    print(f"Calculated an average density of {hex_socio_gdf_tmp.DENS_POB_HA.mean()}")
    
    # Store this resolution (if more resolutions, they are stacked below).
    hex_frames.append(hex_socio_gdf_tmp)

# Stack all resolutions; an empty res_list yields an empty GeoDataFrame.
hex_socio_gdf = pd.concat(hex_frames) if hex_frames else gpd.GeoDataFrame()

Loading hexgrid res 8 for area of interest.
Created hex_grid with 8 resolution
Agregated socio data to hex with a total of 1040897.625 population for resolution 8.
Calculated an average density of 38.25433057895378


In [90]:
# VISUAL TEST
#hex_res_gdf.to_file('../../../data/external/temporal_fromjupyter/voronoi_censo_test/ags_hex8.gpkg', driver='GPKG')

In [82]:
# List the hexagon frame's columns and show its first row.
print(list(hex_socio_gdf.columns))
hex_socio_gdf.head(1)

['hex_id', 'geometry', 'CVEGEO', 'NOMGEO', 'city', 'type', 'res', 'POBTOT', 'POBFEM', 'POBMAS', 'P_0A2', 'P_0A2_F', 'P_0A2_M', 'P_3A5', 'P_3A5_F', 'P_3A5_M', 'P_6A11', 'P_6A11_F', 'P_6A11_M', 'P_12A14', 'P_12A14_F', 'P_12A14_M', 'P_15A17', 'P_15A17_F', 'P_15A17_M', 'P_18A24', 'P_18A24_F', 'P_18A24_M', 'P_60YMAS', 'P_60YMAS_F', 'P_60YMAS_M', 'P_3YMAS', 'P_3YMAS_F', 'P_3YMAS_M', 'P_12YMAS', 'P_12YMAS_F', 'P_12YMAS_M', 'P_15YMAS', 'P_15YMAS_F', 'P_15YMAS_M', 'P_18YMAS', 'P_18YMAS_F', 'P_18YMAS_M', 'POB0_14', 'POB15_64', 'POB65_MAS', 'DENS_POB_HA']


Unnamed: 0,hex_id,geometry,CVEGEO,NOMGEO,city,type,res,POBTOT,POBFEM,POBMAS,...,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS,DENS_POB_HA
0,88498e3639fffff,"POLYGON ((-102.27184 21.89588, -102.26725 21.8...",1001,Aguascalientes,Aguascalientes,urban,8,6963.374023,3558.997803,3404.375977,...,5352.204102,2759.962891,2592.240967,4911.568359,2551.991455,2359.576904,1611.170166,4845.36084,506.842896,80.614392


#### Data test - Testing aup.socio_points_to_polygon data with POBTOT

In [86]:
# # # TEST # # #
# Independent check of aup.socio_points_to_polygon: sum the nodes' POBTOT per
# hexagon with a plain spatial join and compare against hex_socio_gdf.

for res in [8]:
    # --------------- 4.1 LOAD HEXGRID
    print(f"Loading hexgrid res {res} for area of interest.")
    # NOTE(review): the query is built by string interpolation from notebook variables.
    query = f"SELECT * FROM hexgrid.hexgrid_{res}_city_2020 WHERE \"city\" LIKE \'{city}\'"
    hex_res_gdf = aup.gdf_from_query(query, geometry_col='geometry')
    hex_res_gdf = hex_res_gdf.set_crs("EPSG:4326")

    # Format - Remove res from index name and add column with res
    hex_res_gdf.rename(columns={f'hex_id_{res}':'hex_id'},inplace=True)
    hex_res_gdf['res'] = res
    print(f"Created hex_grid with {res} resolution")
    
    # --------------- 4.2 ADD HEX_ID TO EACH NODE
    gdf_tmp = gpd.sjoin(hex_res_gdf,pop_nodes_gdf)

    # Sum the test value per hex_id in one pass (replaces the per-hexagon
    # filter loop that grew df_summary row by row); sort=False keeps the
    # order in which hex_ids first appear.
    df_summary = pd.DataFrame(gdf_tmp[['hex_id','POBTOT']]).groupby('hex_id', sort=False, as_index=False)['POBTOT'].sum()

# POBTOT_x comes from hex_socio_gdf, POBTOT_y from the independent sum above.
testing = pd.merge(hex_socio_gdf,df_summary, on='hex_id')
testing = testing[['POBTOT_x','POBTOT_y']]
testing['diff'] = testing['POBTOT_x']-testing['POBTOT_y']
print(f'Diff is: {testing["diff"].sum()}.')

Loading hexgrid res 8 for area of interest.
Created hex_grid with 8 resolution
Diff is: 0.004837982474746941.


## Function testing (full `main()` run)

In [100]:
# --------------- PARAMETERS
# Year of analysis
year = '2020' # Currently works only for 2020, look for notes that say "Needs update for 'year' to work:"

# List of skip cities (If failed / want to skip city)
skip_city_list = []

# Hexgrid res of output
res_list = [8,9] #Only 8,9,10 and 11 available, will run 8 and 9 for prox. analysis v2.

# Save info
save = True
save_schema = 'censo'
save_table = f'censo_inegi_{year[:2]}_ageb_hex'

# Test (If testing, runs res 8 for Aguascalientes ONLY and does not save it)
test = True
# --------------- 


# --------------- PARAMETERS THAT SHOULDN'T CHANGE
# Cities (2020 unless running for 2010, then metro_gdf_2015?)
metro_schema = 'metropolis'
metro_table = 'metro_gdf_2020'

# To be decided
censo_column_start = 14 #column where numeric data starts in censo (16 for 2010, 14 for 2020)
censo_column_end = 0 #column where numeric data ends in censo (-1 for 2010, all up to the end (0) for 2020)

# Do not modify unless bd names change
pop_schema = 'censoageb'
pop_table = 'censoageb_' + year
# --------------- 


# --------------- SCRIPT
# Load cities (municipalities)
query = f"SELECT * FROM {metro_schema}.{metro_table}"
metro_gdf = aup.gdf_from_query(query, geometry_col='geometry')
metro_gdf = metro_gdf.set_crs("EPSG:4326")

city_list = list(metro_gdf.city.unique())
k = len(city_list)

print(f'Loaded city list with {k} cities.')

# Prevent cities being analyzed several times in case of a crash.
# Best effort: if the output table cannot be read (e.g. it does not exist yet),
# report why and continue as if no city had been processed.
processed_city_list = []
try:
    query = f"SELECT city FROM {save_schema}.{save_table}"
    cities_processed = aup.df_from_query(query)
    processed_city_list = list(cities_processed.city.unique())
except Exception as e:
    print(f'Could not read processed cities from {save_schema}.{save_table} ({e}). Assuming none processed.')

# LOG - Print progress so far
missing_cities_list = [name for name in city_list if name not in processed_city_list]

i = len(processed_city_list)
print(f'Already processed ({i}/{k}) cities.')
print(f'Missing procesing for cities: {missing_cities_list}')

# If test, simplifies:
if test:
    res_list = [8]
    missing_cities_list = ['Aguascalientes']
    save = False

# main() reads the notebook-level `city`, so the loop variable must keep that name.
for city in missing_cities_list:
    if city not in skip_city_list:
        print("--"*40)
        i = i + 1
        print(f"Starting city {i}/{k}: {city}")
        pop_nodes_gdf_test, hex_socio_gdf_test = main()

Loaded city list with 71 cities.
Already processed (0/71) cities.
Missing procesing for cities: ['Aguascalientes', 'Ensenada', 'Mexicali', 'Tijuana', 'La Paz', 'Los Cabos', 'Campeche', 'Laguna', 'Monclova', 'Piedras Negras', 'Saltillo', 'Colima', 'Tapachula', 'Tuxtla', 'Chihuahua', 'Delicias', 'Juarez', 'CDMX', 'ZMVM', 'Durango', 'Celaya', 'Guanajuato', 'Leon', 'Irapuato', 'Acapulco', 'Chilpancingo', 'Pachuca', 'Tulancingo', 'Guadalajara', 'Vallarta', 'Piedad', 'Toluca', 'Morelia', 'Zamora', 'Uruapan', 'Cuautla', 'Cuernavaca', 'Tepic', 'Monterrey', 'Oaxaca', 'Puebla', 'San Martin', 'Tehuacan', 'Queretaro', 'Cancun', 'Chetumal', 'Playa', 'SLP', 'Culiacan', 'Los Mochis', 'Mazatlan', 'Guaymas', 'Ciudad Obregon', 'Hermosillo', 'Nogales', 'Villahermosa', 'Victoria', 'Matamoros', 'Nuevo Laredo', 'Reynosa', 'Tampico', 'Tlaxcala', 'Coatzacoalcos', 'Cordoba', 'Minatitlan', 'Orizaba', 'Poza Rica', 'Veracruz', 'Xalapa', 'Merida', 'Zacatecas']
------------------------------------------------------

In [103]:
# Sanity check of main()'s node output: dimensions, total population, first row.
print(pop_nodes_gdf_test.shape)
print(pop_nodes_gdf_test.POBTOT.sum())
pop_nodes_gdf_test.head(1)

(35950, 41)
1041063.9769270906


Unnamed: 0,osmid,geometry,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,...,P_12YMAS_M,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS
0,272921360,POINT (-102.29507 21.87288),0.828358,0.393971,0.437472,-0.007713,0.0,-0.003085,0.001732,0.001073,...,0.43593,0.792571,0.373764,0.418807,0.744288,0.373764,0.370525,0.038873,0.683512,0.109059


In [105]:
# Sanity check of main()'s hexagon output: dimensions, total population, first row.
print(hex_socio_gdf_test.shape)
print(hex_socio_gdf_test.POBTOT.sum())
hex_socio_gdf_test.head(1)

(315, 47)
1040897.6


Unnamed: 0,hex_id,geometry,CVEGEO,NOMGEO,city,type,res,POBTOT,POBFEM,POBMAS,...,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS,DENS_POB_HA
0,88498e3639fffff,"POLYGON ((-102.27184 21.89588, -102.26725 21.8...",1001,Aguascalientes,Aguascalientes,urban,8,6963.374023,3558.997803,3404.375977,...,5352.204102,2759.962891,2592.240967,4911.568359,2551.991455,2359.576904,1611.170166,4845.36084,506.842896,80.614392


## Specific tests

In [6]:
# All cities (municipalities) from the metropolis table
print("Loading all cities.")

metro_schema = 'metropolis'
metro_table = 'metro_gdf_2020'
year = '2020'

query = f"SELECT * FROM {metro_schema}.{metro_table}"
metro_gdf = aup.gdf_from_query(query, geometry_col='geometry').set_crs("EPSG:4326")

# --------------- 1.1 Area of interest: rows of the selected city, dissolved
city = 'Aguascalientes'
print(f"Loading city {city}.")

city_gdf = metro_gdf[metro_gdf['city'] == city].set_crs("EPSG:4326")
aoi = city_gdf.dissolve()

# --------------- 1.2 Census data (AGEBs and blocks) inside the area of interest
print("Loading AGEBs for area of interest.")
pop_ageb_gdf = aup.gdf_from_polygon(
    aoi, 'censoageb', f'censoageb_{year}'
).set_crs("EPSG:4326")

print("Loading blocks for area of interest.")
pop_mza_gdf = aup.gdf_from_polygon(
    aoi, 'censo_mza', f'censo_mza_{year}'
).set_crs("EPSG:4326")
# Only urban blocks are kept
pop_mza_gdf = pop_mza_gdf[pop_mza_gdf['AMBITO'] == 'Urbana'].copy()

# Dimensions, total population, and (cell output) blocks with NaN POBTOT
print(pop_mza_gdf.shape)
print(pop_mza_gdf.POBTOT.sum())
pop_mza_gdf.loc[pop_mza_gdf.POBTOT.isna()]

Loading all cities.
Loading city Aguascalientes.
Loading AGEBs for area of interest.
Loading blocks for area of interest.
(12932, 239)
1041064


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC
