# 00b-polygon_pop_calc [Using AGEBs]

This notebook takes a __polygon as input__ and calculates various demographic values using INEGI's __2010 and 2020 census data.__

## Import libraries

In [1]:
# Relative path from this notebook's folder to the directory that is appended to
# sys.path in the imports cell so that the project-local `aup` package can be imported.
first_folder_path = '../../../'

In [2]:
import os
import sys

import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import CategoricalDtype

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the directory pointed to by first_folder_path importable.
module_path = os.path.abspath(os.path.join(first_folder_path))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: when the import lived inside the `if` above, re-running
# this cell with module_path already on sys.path skipped it and left `aup` undefined.
import aup

## Notebook config

In [18]:
# City used to select the municipalities whose AGEBs are loaded for the whole-city analysis
city = 'Monterrey'

# AGEBs that make up the polygon under analysis (cvegeo_ageb format)
polygon_agebs = [
    '1903900012133',
    '1903900014182',
    '1903900014197',
    '190390001420A',
]

# Projected CRS to use whenever a projection is needed
projected_crs = "EPSG:32614"  # Monterrey

## Load data

### __Load data__ - Load city's area of interest

In [19]:
metro_schema = 'metropolis'
metro_table = 'metro_gdf_2020'

# Fetch the municipalities registered for the configured city.
# NOTE(review): `city` is interpolated straight into the SQL text; this is fine for the
# notebook-level constant above, but it must not be fed from untrusted input.
metro_query = f"SELECT * FROM {metro_schema}.{metro_table} WHERE \"city\" = \'{city}\'"
metro_gdf = aup.gdf_from_query(metro_query, geometry_col='geometry')
cvegeo_muns = list(metro_gdf.CVEGEO.unique())

# An empty IN () list is invalid SQL, so stop here with a readable message instead.
if not cvegeo_muns:
    raise ValueError(f"No municipalities found in {metro_schema}.{metro_table} for city '{city}'")

# Build the SQL IN-list explicitly. str(tuple(...)) renders a single municipality as
# "('19039',)" (trailing comma -> invalid SQL) and depends on each element's repr.
cvegeo_muns_tpl = "(" + ", ".join(f"'{cvegeo}'" for cvegeo in cvegeo_muns) + ")"

# Show
cvegeo_muns_tpl

"('19006', '19009', '19010', '19012', '19018', '19019', '19021', '19025', '19026', '19031', '19039', '19041', '19045', '19046', '19048', '19049')"

### __Load data__ - INEGI'S CENSO 2010 data

In [20]:
# 2010 census: AGEB-level table restricted to the municipalities selected above
pop_schema = 'censo'
pop_table = 'censo_inegi_10_ageb'

pob_query = (
    f"SELECT * FROM {pop_schema}.{pop_table} "
    f"WHERE \"cvegeo_mun\" IN {cvegeo_muns_tpl}"
)
pob_10_gdf = aup.gdf_from_query(pob_query, geometry_col='geometry')

# Show: total population, frame shape and a two-row preview
print(pob_10_gdf['pobtot'].sum())
print(pob_10_gdf.shape)
pob_10_gdf.head(2)

4131710
(1743, 203)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobmas,pobfem,p_0a2,p_0a2_m,p_0a2_f,p_3ymas,...,cve_ent,cve_mun,cve_loc,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,geometry
0,Nuevo LeÃ³n,GarcÃ­a,Total AGEB urbana,78,36.0,42.0,5.0,,4.0,73.0,...,19,18,1,917,0,19018,190180001,1901800010917,1901800010917000,"POLYGON ((-100.57294 25.79231, -100.57294 25.7..."
1,Nuevo LeÃ³n,GarcÃ­a,Total AGEB urbana,497,254.0,243.0,46.0,23.0,23.0,451.0,...,19,18,212,739,0,19018,190180212,1901802120739,1901802120739000,"POLYGON ((-100.43679 25.78372, -100.43556 25.7..."


### __Load data__ - INEGI'S CENSO 2020 data

In [21]:
# 2020 census: AGEB-level table restricted to the municipalities selected above
pop_schema = 'censo'
pop_table = 'censo_inegi_20_ageb'

pob_query = (
    f"SELECT * FROM {pop_schema}.{pop_table} "
    f"WHERE \"cvegeo_mun\" IN {cvegeo_muns_tpl}"
)
pob_20_gdf = aup.gdf_from_query(pob_query, geometry_col='geometry')

# Show: total population, frame shape and a two-row preview
print(pob_20_gdf['pobtot'].sum())
print(pob_20_gdf.shape)
pob_20_gdf.head(2)

5245560
(2155, 235)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3ymas,...,cve_ent,cve_mun,cve_loc,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,geometry
0,Nuevo LeÃ³n,General Zuazua,Total AGEB urbana,26,9.0,17.0,,0.0,,25.0,...,19,25,1,324,0,19025,190250001,1902500010324,1902500010324000,"MULTIPOLYGON (((-100.12793 25.88570, -100.1280..."
1,Nuevo LeÃ³n,JuÃ¡rez,Total AGEB urbana,22,14.0,8.0,,,,20.0,...,19,31,1,2135,0,19031,190310001,1903100012135,1903100012135000,"MULTIPOLYGON (((-100.08080 25.62775, -100.0813..."


## Clip data to polygon

In [25]:
# Keep only the 2010 AGEBs listed in polygon_agebs (independent copy of the selection)
pob_10_polygon = pob_10_gdf[pob_10_gdf['cvegeo_ageb'].isin(polygon_agebs)].copy()

# Show: polygon population, frame shape and the selected rows
print(pob_10_polygon['pobtot'].sum())
print(pob_10_polygon.shape)
pob_10_polygon

12831
(4, 203)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobmas,pobfem,p_0a2,p_0a2_m,p_0a2_f,p_3ymas,...,cve_ent,cve_mun,cve_loc,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,geometry
1163,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,3558,1765.0,1793.0,181.0,80.0,101.0,3331.0,...,19,39,1,2133,0,19039,190390001,1903900012133,1903900012133000,"POLYGON ((-100.30284 25.64256, -100.30280 25.6..."
1328,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,3104,1584.0,1520.0,166.0,82.0,84.0,2916.0,...,19,39,1,4182,0,19039,190390001,1903900014182,1903900014182000,"POLYGON ((-100.30284 25.64256, -100.30284 25.6..."
1330,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,2610,1321.0,1289.0,134.0,72.0,62.0,2429.0,...,19,39,1,4197,0,19039,190390001,1903900014197,1903900014197000,"POLYGON ((-100.29759 25.64044, -100.29743 25.6..."
1331,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,3559,1755.0,1804.0,218.0,110.0,108.0,3316.0,...,19,39,1,420A,0,19039,190390001,190390001420A,190390001420A000,"POLYGON ((-100.29979 25.63930, -100.29968 25.6..."


In [26]:
# Keep only the 2020 AGEBs listed in polygon_agebs (independent copy of the selection)
pob_20_polygon = pob_20_gdf[pob_20_gdf['cvegeo_ageb'].isin(polygon_agebs)].copy()

# Show: polygon population, frame shape and the selected rows
print(pob_20_polygon['pobtot'].sum())
print(pob_20_polygon.shape)
pob_20_polygon

11292
(4, 235)


Unnamed: 0,nom_ent,nom_mun,nom_loc,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3ymas,...,cve_ent,cve_mun,cve_loc,cve_ageb,cve_mza,cvegeo_mun,cvegeo_loc,cvegeo_ageb,cvegeo_mza,geometry
1483,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,3643,1726.0,1917.0,210.0,90.0,120.0,3429.0,...,19,39,1,2133,0,19039,190390001,1903900012133,1903900012133000,"MULTIPOLYGON (((-100.30284 25.64256, -100.3028..."
1649,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,2164,1050.0,1114.0,87.0,42.0,45.0,2077.0,...,19,39,1,4182,0,19039,190390001,1903900014182,1903900014182000,"MULTIPOLYGON (((-100.29967 25.64462, -100.2997..."
1650,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,2456,1216.0,1240.0,103.0,46.0,57.0,2351.0,...,19,39,1,4197,0,19039,190390001,1903900014197,1903900014197000,"MULTIPOLYGON (((-100.29759 25.64044, -100.2974..."
1652,Nuevo LeÃ³n,Monterrey,Total AGEB urbana,3029,1527.0,1502.0,156.0,72.0,84.0,2873.0,...,19,39,1,420A,0,19039,190390001,190390001420A,190390001420A000,"MULTIPOLYGON (((-100.29968 25.63894, -100.2995..."


## Aggregate data of interest

In [27]:
def agregate_census_data_of_interest(census_2010, census_2020):
    """Summarise selected census fields for 2010 and 2020 and compare both years.

    For every field of interest the function stores the column total for each
    year (truncated to an integer), and for the fields that are already averages
    it stores the NaN-ignoring mean of the column (rounded to 2 decimals). It then
    derives the absolute and percentage change between years and, for fields that
    have a parent category, the share of that parent in each year.

    Parameters
    ----------
    census_2010 : pandas.DataFrame (or GeoDataFrame)
        One row per census unit. Must contain every field of interest plus
        'pcon_lim', 'graproes' and 'prom_ocup'. The frame is only read.
    census_2020 : pandas.DataFrame (or GeoDataFrame)
        Same as ``census_2010`` but with 'pcon_disc' instead of 'pcon_lim'.

    Returns
    -------
    pandas.DataFrame
        One row per field (integer index starting at 0) with columns
        'field', '2010_value', '2020_value', 'diff_values', '%diff_values',
        'weight', '2010_dist', '2020_dist' and 'diff_dist'.
        'weight' and the '*_dist' columns are NaN for fields without a parent
        category ('psinder', 'tvivpar') and for the averaged fields
        ('graproes', 'prom_ocup'), since an average divided by a population
        total is not a share.

    Raises
    ------
    KeyError
        If either input is missing one of the required columns.
    """

    ################################################################## SET FIELDS OF INTEREST
    fields_of_interest = ['pobtot','pobfem','pobmas',
                          'p_0a2','p_0a2_f','p_0a2_m',
                          'p_3ymas','p_3ymas_f','p_3ymas_m',
                          'p_5ymas','p_5ymas_f','p_5ymas_m',
                          'p_12ymas','p_12ymas_f','p_12ymas_m',
                          'p_15ymas','p_15ymas_f','p_15ymas_m',
                          'p_18ymas','p_18ymas_f','p_18ymas_m',
                          'p_3a5','p_3a5_f','p_3a5_m',
                          'p_6a11','p_6a11_f','p_6a11_m',
                          'p_8a14','p_8a14_f','p_8a14_m',
                          'p_12a14','p_12a14_f','p_12a14_m',
                          'p_15a17','p_15a17_f','p_15a17_m',
                          'p_18a24','p_18a24_f','p_18a24_m',
                          'p_15a49_f',
                          'p_60ymas','p_60ymas_f','p_60ymas_m',
                          'pob0_14','pob15_64','pob65_mas',
                           # FROM HERE ON VALUES ARE APPROXIMATE BECAUSE THEY ARE REPORTED BY BLOCK AND DO NOT GO THROUGH calculate_censo_nan_values()
                          'p3ym_hli', #Population aged 3 and over that speaks an indigenous language
                          'p3a5_noa', #Population aged 3 to 5 that does not attend school
                          'p6a11_noa', #Population aged 6 to 11 that does not attend school
                          'p12a14noa', #Population aged 12 to 14 that does not attend school
                          'p15a17a', #Population aged 15 to 17 that DOES attend school
                          'p18a24a', #Population aged 18 to 24 that DOES attend school
                          'p15ym_an', #Population aged 15 and over that cannot read and write
                          'p15sec_co', #Population aged 15 and over with completed secondary education
                          'psinder', #Population not affiliated to health services (IMSS, ISSSTE, PEMEX, SEDENA, SEMAR, INSABI or other)
                          'tvivpar', #Total private dwellings
                          'tvivparhab'] #Inhabited private dwellings

    # One extra summed column whose name differs by year. Both are reported on the
    # same summary row, labelled with the 2020 name ('pcon_lim' never appears in the output).
    year_specific_field = {'2010': 'pcon_lim', '2020': 'pcon_disc'}

    # Data that is already an average (do not sum, calculate mean)
    mean_fields = ['graproes',#Average schooling grade
                   'prom_ocup']#Average occupants in inhabited private dwellings

    years = ['2010','2020']
    # The inputs are only read, so no defensive copies are needed.
    census_by_year = {'2010': census_2010, '2020': census_2020}

    # Fail early, listing every missing column at once.
    for year in years:
        required = fields_of_interest + [year_specific_field[year]] + mean_fields
        missing = [field for field in required if field not in census_by_year[year].columns]
        if missing:
            raise KeyError(f"Census {year} data is missing required columns: {missing}")

    ################################################################## CALCULATE TOTAL VALUES FOR EACH FIELD OF INTEREST
    print("Columns 2010_value and 2020_value show total data in 2010 and 2020 for each field.")
    print("--"*15)

    # Row order: summed fields, the year-specific field (2020 label), then the averaged fields.
    summary_df = pd.DataFrame(
        {'field': fields_of_interest + [year_specific_field['2020']] + mean_fields}
    )

    for year in years:
        data_gdf = census_by_year[year]
        sum_fields = fields_of_interest + [year_specific_field[year]]
        # Totals are truncated to whole numbers and stored as floats (the column also holds means).
        totals = [float(int(data_gdf[field].sum())) for field in sum_fields]
        # NOTE: this is an unweighted mean of the per-row averages, ignoring NaN.
        means = [round(np.nanmean(data_gdf[field]),2) for field in mean_fields]
        summary_df[f'{year}_value'] = totals + means

    ################################################################## CALCULATE CHANGE IN TOTAL VALUES BETWEEN 2010 AND 2020
    print("Column diff_values shows change in total data between 2010 and 2020.")
    print("diff_values = 2020_value - 2010_value")
    print("--"*15)
    print("Column %diff_values shows percentage of change in total data between 2010 and 2020.")
    print("%diff_values = (diff_values / 2010_value)x100")
    print("--"*15)

    # Calculate changes between 2010 values and 2020 values
    summary_df['diff_values'] = summary_df['2020_value'] - summary_df['2010_value']
    summary_df['%diff_values'] = ((summary_df['diff_values'] / summary_df['2010_value'])*100).round(2)

    ################################################################## CALCULATE DISTRIBUTION OF VALUES WITH REFERENCE TO A PARENT CATEGORY
    print("Columns _dist shows the percentage that each field has with respect to it's parent category.")
    print("_dist changes depending on field. e.g.: pct_pobfem = pobfem/pobtot")
    print("--"*15)

    # Fields whose reference is NOT pobtot
    weight_dct = {'p3ym_hli':'p_3ymas',
                  'p3a5_noa':'p_3a5',
                  'p6a11_noa':'p_6a11',
                  'p12a14noa':'p_12a14',
                  'p15a17a':'p_15a17',
                  'p18a24a':'p_18a24',
                  'p15ym_an':'p_15ymas',
                  'p15sec_co':'p_15ymas',
                  'tvivparhab':'tvivpar'}

    # Fields that don't get a distribution: those without a parent category, plus the
    # averaged fields (an average divided by pobtot is not a percentage of anything).
    ignore_fields = ['psinder','tvivpar'] + mean_fields

    # Parent field for every row: explicit entry in weight_dct, otherwise 'pobtot'; NaN when ignored.
    summary_df['weight'] = [
        np.nan if field in ignore_fields else weight_dct.get(field, 'pobtot')
        for field in summary_df['field']
    ]

    for year in years:
        # Look parents up by field name rather than by row position.
        value_by_field = dict(zip(summary_df['field'], summary_df[f'{year}_value']))
        parent_values = summary_df['weight'].map(value_by_field)  # NaN weight -> NaN parent
        summary_df[f'{year}_dist'] = ((summary_df[f'{year}_value'] / parent_values)*100).round(2)

    ################################################################## CALCULATE CHANGE IN DISTRIBUTION OF VALUES
    print("Column diff_dist shows the change in distribution for each field")
    print("diff_dist = 2020_dist - 2010_dist")
    print("--"*15)

    # Calculate changes between 2010 distributions and 2020 distributions
    summary_df['diff_dist'] = summary_df['2020_dist'] - summary_df['2010_dist']

    return summary_df

In [28]:
# Summary restricted to the AGEBs that make up the polygon of interest
polygon_summary = agregate_census_data_of_interest(
    census_2010=pob_10_polygon,
    census_2020=pob_20_polygon,
)
# Show
polygon_summary

Columns 2010_value and 2020_value show total data in 2010 and 2020 for each field.
------------------------------
Column diff_values shows change in total data between 2010 and 2020.
diff_values = 2020_value - 2010_value
------------------------------
Column %diff_values shows percentage of change in total data between 2010 and 2020.
%diff_values = (diff_values / 2010_value)x100
------------------------------
Columns _dist shows the percentage that each field has with respect to it's parent category.
_dist changes depending on field. e.g.: pct_pobfem = pobfem/pobtot
------------------------------
Column diff_dist shows the change in distribution for each field
diff_dist = 2020_dist - 2010_dist
------------------------------


Unnamed: 0,field,2010_value,2020_value,diff_values,%diff_values,weight,2010_dist,2020_dist,diff_dist
0,pobtot,12831.0,11292.0,-1539.0,-11.99,pobtot,100.0,100.0,0.0
1,pobfem,6406.0,5519.0,-887.0,-13.85,pobtot,49.93,48.88,-1.05
2,pobmas,6425.0,5773.0,-652.0,-10.15,pobtot,50.07,51.12,1.05
3,p_0a2,699.0,556.0,-143.0,-20.46,pobtot,5.45,4.92,-0.53
4,p_0a2_f,355.0,250.0,-105.0,-29.58,pobtot,2.77,2.21,-0.56
5,p_0a2_m,344.0,306.0,-38.0,-11.05,pobtot,2.68,2.71,0.03
6,p_3ymas,11992.0,10730.0,-1262.0,-10.52,pobtot,93.46,95.02,1.56
7,p_3ymas_f,5984.0,5266.0,-718.0,-12.0,pobtot,46.64,46.63,-0.01
8,p_3ymas_m,6008.0,5464.0,-544.0,-9.05,pobtot,46.82,48.39,1.57
9,p_5ymas,11517.0,10332.0,-1185.0,-10.29,pobtot,89.76,91.5,1.74


In [29]:
# Summary over every AGEB loaded for the city's municipalities
metropolis_summary = agregate_census_data_of_interest(
    census_2010=pob_10_gdf,
    census_2020=pob_20_gdf,
)
# Show
metropolis_summary

Columns 2010_value and 2020_value show total data in 2010 and 2020 for each field.
------------------------------
Column diff_values shows change in total data between 2010 and 2020.
diff_values = 2020_value - 2010_value
------------------------------
Column %diff_values shows percentage of change in total data between 2010 and 2020.
%diff_values = (diff_values / 2010_value)x100
------------------------------
Columns _dist shows the percentage that each field has with respect to it's parent category.
_dist changes depending on field. e.g.: pct_pobfem = pobfem/pobtot
------------------------------
Column diff_dist shows the change in distribution for each field
diff_dist = 2020_dist - 2010_dist
------------------------------


Unnamed: 0,field,2010_value,2020_value,diff_values,%diff_values,weight,2010_dist,2020_dist,diff_dist
0,pobtot,4131710.0,5245560.0,1113850.0,26.96,pobtot,100.0,100.0,0.0
1,pobfem,2075574.0,2626163.0,550589.0,26.53,pobtot,50.24,50.06,-0.18
2,pobmas,2056104.0,2619323.0,563219.0,27.39,pobtot,49.76,49.93,0.17
3,p_0a2,214231.0,233350.0,19119.0,8.92,pobtot,5.19,4.45,-0.74
4,p_0a2_f,105094.0,115509.0,10415.0,9.91,pobtot,2.54,2.2,-0.34
5,p_0a2_m,109022.0,117721.0,8699.0,7.98,pobtot,2.64,2.24,-0.4
6,p_3ymas,3859803.0,4994353.0,1134550.0,29.39,pobtot,93.42,95.21,1.79
7,p_3ymas_f,1941660.0,2501684.0,560024.0,28.84,pobtot,46.99,47.69,0.7
8,p_3ymas_m,1918120.0,2492665.0,574545.0,29.95,pobtot,46.42,47.52,1.1
9,p_5ymas,3709506.0,4815791.0,1106285.0,29.82,pobtot,89.78,91.81,2.03
