# 00-polygon_pop_calc

This notebook takes a __polygon as input__ an calculates various demographic values using INEGI's __2010 and 2020 census data.__

## Import libraries

In [1]:
first_folder_path = '../../../'

In [2]:
import os
import sys

import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import CategoricalDtype

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join(first_folder_path))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup

## Notebook config

In [3]:
# ----- ----- ----- Input polygon ----- ----- ----- 
poly_dir = first_folder_path + "data/external/temporal_todocker/c+lab/Poligono2_Area_de_influencia.gpkg"

# ----- ----- ----- Projection to be used when needed ----- ----- ----- 
projected_crs = "EPSG:32614" #Monterrey

## Load data

In [4]:
# Load polygon
poly_gdf = gpd.read_file(poly_dir)

# Set CRS
if poly_gdf.crs != projected_crs:
    try:
        poly_gdf = poly_gdf.set_crs(projected_crs)
    except:
        poly_gdf = poly_gdf.to_crs(projected_crs)

# Filter and rename data
poly_gdf.rename(columns={'Name':'name'},inplace=True)

# Show
print(poly_gdf.crs)
print(poly_gdf.shape)
poly_gdf.head(2)

EPSG:32614
(1, 2)


Unnamed: 0,name,geometry
0,Poligono 2 Area de Influencia,"POLYGON ((368748.125 2836360.371, 368651.528 2..."


In [10]:
# Load 2020 population
pop_schema = 'censo'
pop_table = 'pobcenso_inegi_20_mzaageb_mza'
city = 'Monterrey'

pob_query = f"SELECT * FROM {pop_schema}.{pop_table} WHERE \"city\" = \'{city}\'"
pob_20_gdf = aup.gdf_from_query(pob_query, geometry_col='geometry')

# Set CRS
if pob_20_gdf.crs != projected_crs:
    try:
        pob_20_gdf = pob_20_gdf.set_crs(projected_crs)
    except:
        pob_20_gdf = pob_20_gdf.to_crs(projected_crs)

# Show
print(pob_20_gdf.crs)
print(pob_20_gdf.shape)
pob_20_gdf.head(2)

EPSG:32614
(64060, 238)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3ymas,...,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,ambito,tipomza,geometry,city
0,Nuevo LeÃ³n,Apodaca,Ciudad Apodaca,2,1.058824,0.941176,0.0,0.0,0.0,2.0,...,507,6,19006,190060001,1900600010507,1900600010507006,Urbana,Típica,"POLYGON ((379763.030 2854976.306, 379800.845 2...",Monterrey
1,Nuevo LeÃ³n,Apodaca,Ciudad Apodaca,1,0.529412,0.470588,0.0,0.0,0.0,1.0,...,507,20,19006,190060001,1900600010507,1900600010507020,Urbana,Típica,"POLYGON ((380148.844 2854109.285, 380162.765 2...",Monterrey


In [11]:
# Load 2010 population
pop_schema = 'censo'
pop_table = 'pobcenso_inegi_10_mzaageb_mza'
city = 'Monterrey'

pob_query = f"SELECT * FROM {pop_schema}.{pop_table} WHERE \"city\" = \'{city}\'"
pob_10_gdf = aup.gdf_from_query(pob_query, geometry_col='geometry')

# Set CRS
if pob_10_gdf.crs != projected_crs:
    try:
        pob_10_gdf = pob_10_gdf.set_crs(projected_crs)
    except:
        pob_10_gdf = pob_10_gdf.to_crs(projected_crs)

# Show
print(pob_10_gdf.crs)
print(pob_10_gdf.shape)
pob_10_gdf.head(2)

EPSG:32614
(50908, 204)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobmas,pobfem,p_0a2,p_0a2_m,p_0a2_f,p_3ymas,...,cve_mun,cve_loc,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,geometry,city
0,Nuevo LeÃ³n,Cadereyta JimÃ©nez,Cadereyta JimÃ©nez,8,2.909091,5.090909,-0.242424,-0.727273,-0.727273,7.272727,...,9,1,458,14,19009,190090001,1900900010458,1900900010458014,"POLYGON ((398140.388 2830867.586, 398256.231 2...",Monterrey
1,Nuevo LeÃ³n,Cadereyta JimÃ©nez,Cadereyta JimÃ©nez,7,2.545455,4.454545,-0.212121,-0.636364,-0.636364,6.363636,...,9,1,458,7,19009,190090001,1900900010458,1900900010458007,"POLYGON ((397639.206 2830934.919, 397618.450 2...",Monterrey


## Clip data to polygon

In [15]:
pob_20_polygon = pob_20_gdf.sjoin(poly_gdf)

# Show
print(pob_20_polygon.shape)
pob_20_polygon.head(2)

(176, 240)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3ymas,...,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,ambito,tipomza,geometry,city,index_right,name
37362,Nuevo LeÃ³n,Monterrey,Monterrey,60,29.0,31.0,2.0,0.0,2.0,58.0,...,19039,190390001,190390001420A,190390001420A015,Urbana,Típica,"POLYGON ((369248.209 2836196.056, 369272.686 2...",Monterrey,0,Poligono 2 Area de Influencia
37413,Nuevo LeÃ³n,Monterrey,Monterrey,30,17.0,13.0,2.0,2.0,0.0,28.0,...,19039,190390001,190390001420A,190390001420A008,Urbana,Típica,"POLYGON ((369254.360 2836251.790, 369248.000 2...",Monterrey,0,Poligono 2 Area de Influencia


In [17]:
list(pob_20_polygon.columns)

['nom_ent',
 'nom_mun',
 'nom_loc',
 'pobtot',
 'pobfem',
 'pobmas',
 'p_0a2',
 'p_0a2_f',
 'p_0a2_m',
 'p_3ymas',
 'p_3ymas_f',
 'p_3ymas_m',
 'p_5ymas',
 'p_5ymas_f',
 'p_5ymas_m',
 'p_12ymas',
 'p_12ymas_f',
 'p_12ymas_m',
 'p_15ymas',
 'p_15ymas_f',
 'p_15ymas_m',
 'p_18ymas',
 'p_18ymas_f',
 'p_18ymas_m',
 'p_3a5',
 'p_3a5_f',
 'p_3a5_m',
 'p_6a11',
 'p_6a11_f',
 'p_6a11_m',
 'p_8a14',
 'p_8a14_f',
 'p_8a14_m',
 'p_12a14',
 'p_12a14_f',
 'p_12a14_m',
 'p_15a17',
 'p_15a17_f',
 'p_15a17_m',
 'p_18a24',
 'p_18a24_f',
 'p_18a24_m',
 'p_15a49_f',
 'p_60ymas',
 'p_60ymas_f',
 'p_60ymas_m',
 'rel_h_m',
 'pob0_14',
 'pob15_64',
 'pob65_mas',
 'prom_hnv',
 'pnacent',
 'pnacent_f',
 'pnacent_m',
 'pnacoe',
 'pnacoe_f',
 'pnacoe_m',
 'pres2015',
 'pres2015_f',
 'pres2015_m',
 'presoe15',
 'presoe15_f',
 'presoe15_m',
 'p3ym_hli',
 'p3ym_hli_f',
 'p3ym_hli_m',
 'p3hlinhe',
 'p3hlinhe_f',
 'p3hlinhe_m',
 'p3hli_he',
 'p3hli_he_f',
 'p3hli_he_m',
 'p5_hli',
 'p5_hli_nhe',
 'p5_hli_he',
 'phog_

In [16]:
pob_10_polygon = pob_10_gdf.sjoin(poly_gdf)

# Show
print(pob_10_polygon.shape)
pob_10_polygon.head(2)

(171, 206)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobmas,pobfem,p_0a2,p_0a2_m,p_0a2_f,p_3ymas,...,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,geometry,city,index_right,name
32742,Nuevo LeÃ³n,Monterrey,Monterrey,154,83.0,71.0,9.0,6.0,3.0,145.0,...,4178,19,19039,190390001,1903900014178,1903900014178019,"POLYGON ((369170.982 2836932.107, 369186.795 2...",Monterrey,0,Poligono 2 Area de Influencia
33369,Nuevo LeÃ³n,Monterrey,Monterrey,209,100.0,109.0,16.0,7.0,9.0,193.0,...,4178,14,19039,190390001,1903900014178,1903900014178014,"POLYGON ((369076.000 2836850.700, 369053.100 2...",Monterrey,0,Poligono 2 Area de Influencia


## Agregate data of interest

In [86]:
columns_of_interest = ['pobtot','pobfem','pobmas',
                       'p_0a2','p_0a2_f','p_0a2_m',
                       'p_3ymas','p_3ymas_f','p_3ymas_m',
                       'p_5ymas','p_5ymas_f','p_5ymas_m',
                       'p_12ymas','p_12ymas_f','p_12ymas_m',
                       'p_15ymas','p_15ymas_f','p_15ymas_m',
                       'p_18ymas','p_18ymas_f','p_18ymas_m',
                       'p_3a5','p_3a5_f','p_3a5_m',
                       'p_6a11','p_6a11_f','p_6a11_m',
                       'p_8a14','p_8a14_f','p_8a14_m',
                       'p_12a14','p_12a14_f','p_12a14_m',
                       'p_15a17','p_15a17_f','p_15a17_m',
                       'p_18a24','p_18a24_f','p_18a24_m',
                       'p_15a49_f',
                       'p_60ymas','p_60ymas_f','p_60ymas_m',
                       'pob0_14','pob15_64','pob65_mas',
                       'p3ym_hli', #Población de 3 años y más que habla una lengua indígena
                       # A PARTIR DE AQUÍ LOS VALORES SON APROXIMADOS PORQUE SON POR MANZANA Y NO ENTRAN EN calculate_censo_nan_values()
                       'p3a5_noa', #Población de 3 a 5 años que no asiste a la escuela
                       'p6a11_noa', #Población de 6 a 11 años que no asiste a la escuela
                       'p12a14noa', #Población de 12 a 14 años que no asiste a la escuela
                       'p15a17a', #Población de 15 a 17 años que SÍ van a la escuela
                       'p18a24a', #Población de 18 a 24 años que SÍ van a la escuela
                       'p15ym_an', #Población de 15 años y más que no saben leer y escribir
                       'p15sec_co', #Población de 15 años y más con secundaria completa
                       'psinder', #Población sin afiliación a servicios de salud (IMSS, ISSSTE, PEMEX, SEDENA, SEMAR, INSABI u otro)
                       'tvivpar', #Total de viviendas particulares
                       'tvivparhab'] #Viviendas particulares habitadas

summary_df = pd.DataFrame()
years = ['2010','2020']
for year in years:
    
    # Set data and columns of interest for current year
    sum_columns = columns_of_interest.copy()
    if year == '2010':
        data_gdf = pob_10_polygon.copy()
        sum_columns.append('pcon_lim')
    elif year == '2020':
        data_gdf = pob_20_polygon.copy()
        sum_columns.append('pcon_disc')

    # Iterate over each column
    idx = 0
    for col in sum_columns:
        current_data = data_gdf[col].sum()        
        summary_df.loc[idx, 'column'] = col
        summary_df.loc[idx, f'{year}_value'] = int(current_data)

        idx+=1

# Data that is already an average (do not sum)
mean_columns = ['graproes',#Grado promedio de escolaridad
                'prom_ocup']#Promedio de ocupantes en viviendas particulares habitadas

for year in years:
    
    if year == '2010':
        data_gdf = pob_10_polygon.copy()
    elif year == '2020':
        data_gdf = pob_20_polygon.copy()
        
    mean_idx = idx
    for col in mean_columns:
        current_data = np.nanmean(data_gdf[col])
        summary_df.loc[mean_idx, 'column'] = col
        summary_df.loc[mean_idx, f'{year}_value'] = round(current_data,2)

        mean_idx+=1


# Calculate changes
summary_df['diff_2020-2010'] = summary_df['2020_value'] - summary_df['2010_value']
summary_df['%diff_2020-2010'] = round((summary_df['diff_2020-2010'] / summary_df['2010_value'])*100,2)

# Calculate distribution data with reference to a parent category
# (e.g. pct_pobfem = pobfem/pobtot)

# Columns whose reference is NOT pobtot
weight_dct = {'p3ym_hli':'p_3ymas',
              'p3a5_noa':'p_3a5',
              'p6a11_noa':'p_6a11',
              'p12a14noa':'p_12a14',
              'p15a17a':'p_15a17',
              'p18a24a':'p_18a24',
              'p15ym_an':'p_15ymas',
              'p15sec_co':'p_15ymas',
              'tvivparhab':'tvivpar'}

ignore_cols = ['psinder','tvivpar']

for col in columns_of_interest:
    # Columns with 'pobtot' as weight
    if (col not in weight_dct.keys()) and (col not in ignore_cols):

        # Register weight column as 'pobtot'
        idx = summary_df['column']==col
        summary_df.loc[idx,'weight'] = 'pobtot'
        
        # Calculate value of interest for 2010 --> (2010 value/2010 pobtot)*100
        pobtot_idx = summary_df['column']=='pobtot'
        value_2010 = (summary_df.loc[idx,'2010_value'] / summary_df.loc[pobtot_idx,'2010_value'][0])*100
        # Register 2010 value
        summary_df.loc[idx,'2010_dist'] = round(value_2010,2)
        
        # Calculate value of interest for 2020 --> (2020 value/2020 pobtot)*100
        pobtot_idx = summary_df['column']=='pobtot'
        value_2020 = (summary_df.loc[idx,'2020_value'] / summary_df.loc[pobtot_idx,'2020_value'][0])*100
        # Register 2020 value
        summary_df.loc[idx,'2020_dist'] = round(value_2020,2)    

    # Columns with another column as weight
    if (col in weight_dct.keys()) and (col not in ignore_cols):

        # Register weight column as indicated in weight_dct
        idx = summary_df['column']==col
        summary_df.loc[idx,'weight'] = weight_dct[col]

        # Calculate value of interest for 2010 --> (2010 value/2010 weight_dct[col])*100
        weight_idx = summary_df['column']==weight_dct[col]
        value_2010 = (summary_df.loc[idx,'2010_value'] / summary_df.loc[weight_idx,'2010_value'].unique()[0])*100
        # Register 2010 value
        summary_df.loc[idx,'2010_dist'] = round(value_2010,2)

        # Calculate value of interest for 2020 --> (2020 value/2020 weight_dct[col])*100
        weight_idx = summary_df['column']==weight_dct[col]
        value_2020 = (summary_df.loc[idx,'2020_value'] / summary_df.loc[weight_idx,'2020_value'].unique()[0])*100
        # Register 2020 value
        summary_df.loc[idx,'2020_dist'] = round(value_2020,2)
        
summary_df

Unnamed: 0,column,2010_value,2020_value,diff_2020-2010,%diff_2020-2010,weight,2010_dist,2020_dist
0,pobtot,14990.0,13136.0,-1854.0,-12.37,pobtot,100.0,100.0
1,pobfem,7445.0,6470.0,-975.0,-13.1,pobtot,49.67,49.25
2,pobmas,7544.0,6665.0,-879.0,-11.65,pobtot,50.33,50.74
3,p_0a2,814.0,625.0,-189.0,-23.22,pobtot,5.43,4.76
4,p_0a2_f,409.0,292.0,-117.0,-28.61,pobtot,2.73,2.22
5,p_0a2_m,405.0,333.0,-72.0,-17.78,pobtot,2.7,2.54
6,p_3ymas,14049.0,12505.0,-1544.0,-10.99,pobtot,93.72,95.2
7,p_3ymas_f,6980.0,6175.0,-805.0,-11.53,pobtot,46.56,47.01
8,p_3ymas_m,7068.0,6329.0,-739.0,-10.46,pobtot,47.15,48.18
9,p_5ymas,13400.0,11979.0,-1421.0,-10.6,pobtot,89.39,91.19
