# 00a-polygon_pop_calc (Using blocks, CANCELED)

This notebook takes a __polygon as input__ and calculates various demographic values using INEGI's __2010 and 2020 census data.__

## Import libraries

In [1]:
# Relative path from this notebook to the base folder. It is appended to sys.path
# (so the local `aup` package can be imported) and used as the prefix of data paths.
first_folder_path = '../../../'

In [2]:
import os
import sys

import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import CategoricalDtype

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Register the base folder in sys.path so the project-local `aup` package is importable
module_path = os.path.abspath(os.path.join(first_folder_path))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: when the import lived inside the `if` above, `aup` was
# never imported if the folder was already present in sys.path.
import aup


KeyboardInterrupt



## Notebook config

In [None]:
# ----- ----- ----- Input polygon ----- ----- ----- 
# GeoPackage holding the polygon to analyse, resolved relative to first_folder_path
poly_dir = f"{first_folder_path}data/external/temporal_todocker/c+lab/Poligono2_Area_de_influencia.gpkg"


# ----- ----- ----- Projection to be used when needed ----- ----- ----- 
# Every layer loaded below is set/reprojected to this CRS
projected_crs = "EPSG:32614" #Monterrey

## Load data

In [None]:
# Load polygon
poly_gdf = gpd.read_file(poly_dir)

# Set CRS
# - No CRS metadata: the layer is assumed to already be in projected_crs and is only labelled.
# - Different CRS: the geometries are reprojected.
# Explicit branching replaces a bare `except:` around set_crs(), which also hid
# unrelated errors (and KeyboardInterrupt).
if poly_gdf.crs is None:
    poly_gdf = poly_gdf.set_crs(projected_crs)
elif poly_gdf.crs != projected_crs:
    poly_gdf = poly_gdf.to_crs(projected_crs)

# Filter and rename data (new frame instead of inplace so the cell is safe to re-run)
poly_gdf = poly_gdf.rename(columns={'Name':'name'})

# Show
print(poly_gdf.crs)
print(poly_gdf.shape)
poly_gdf.head(2)

In [None]:
# Load 2020 population
pop_schema = 'censo'
pop_table = 'censo_inegi_20_ageb'
city = 'Monterrey'

# The query is built only from the constants above (no external input is interpolated)
pob_query = f"SELECT * FROM {pop_schema}.{pop_table} WHERE \"city\" = \'{city}\'"
pob_20_gdf = aup.gdf_from_query(pob_query, geometry_col='geometry')

# Set CRS
# - No CRS metadata: the layer is assumed to already be in projected_crs and is only labelled.
# - Different CRS: the geometries are reprojected.
# Explicit branching replaces a bare `except:` around set_crs(), which also hid
# unrelated errors (and KeyboardInterrupt).
if pob_20_gdf.crs is None:
    pob_20_gdf = pob_20_gdf.set_crs(projected_crs)
elif pob_20_gdf.crs != projected_crs:
    pob_20_gdf = pob_20_gdf.to_crs(projected_crs)

# Show
print(pob_20_gdf.crs)
print(pob_20_gdf.shape)
pob_20_gdf.head(2)

In [None]:
# Load 2010 population
pop_schema = 'censo'
pop_table = 'pobcenso_inegi_10_mzaageb_mza'
city = 'Monterrey'

# The query is built only from the constants above (no external input is interpolated)
pob_query = f"SELECT * FROM {pop_schema}.{pop_table} WHERE \"city\" = \'{city}\'"
pob_10_gdf = aup.gdf_from_query(pob_query, geometry_col='geometry')

# Set CRS
# - No CRS metadata: the layer is assumed to already be in projected_crs and is only labelled.
# - Different CRS: the geometries are reprojected.
# Explicit branching replaces a bare `except:` around set_crs(), which also hid
# unrelated errors (and KeyboardInterrupt).
if pob_10_gdf.crs is None:
    pob_10_gdf = pob_10_gdf.set_crs(projected_crs)
elif pob_10_gdf.crs != projected_crs:
    pob_10_gdf = pob_10_gdf.to_crs(projected_crs)

# Show
print(pob_10_gdf.crs)
print(pob_10_gdf.shape)
pob_10_gdf.head(2)

## Clip data to polygon

In [None]:
# Keep the 2020 census features that intersect the input polygon (inner spatial join).
# Geometries are not cut: a feature that touches the polygon is kept whole.
# NOTE(review): a feature matching several rows of poly_gdf would appear once per match —
# confirm the input layer holds a single polygon.
pob_20_polygon = gpd.sjoin(pob_20_gdf, poly_gdf, how="inner", predicate="intersects")

# Show
print(pob_20_polygon.shape)
pob_20_polygon.head(2)

In [None]:
# Keep the 2010 census features that intersect the input polygon (inner spatial join).
# Geometries are not cut: a feature that touches the polygon is kept whole.
# NOTE(review): a feature matching several rows of poly_gdf would appear once per match —
# confirm the input layer holds a single polygon.
pob_10_polygon = gpd.sjoin(pob_10_gdf, poly_gdf, how="inner", predicate="intersects")

# Show
print(pob_10_polygon.shape)
pob_10_polygon.head(2)

## Aggregate data of interest

In [None]:
def agregate_census_data_of_interest(census_2010, census_2020):
    """Summarise and compare 2010 and 2020 census data for one area.

    For every field of interest the function stores the 2010 and 2020 value
    (column sum, or column mean for fields that are already averages), the
    absolute and percentage change between both years, and the share that
    each field represents of its parent category in each year.

    Parameters
    ----------
    census_2010 : pandas.DataFrame or geopandas.GeoDataFrame
        2010 census rows of the area. Must contain every field of interest
        plus 'pcon_lim', 'graproes' and 'prom_ocup'.
    census_2020 : pandas.DataFrame or geopandas.GeoDataFrame
        2020 census rows of the area. Must contain every field of interest
        plus 'pcon_disc', 'graproes' and 'prom_ocup'.

    Returns
    -------
    pandas.DataFrame
        One row per field (default integer index) with columns 'field',
        '2010_value', '2020_value', 'diff_values', '%diff_values', 'weight',
        '2010_dist', '2020_dist' and 'diff_dist'. 'weight' names the parent
        field used as denominator of the '_dist' columns. Fields without a
        parent category ('psinder', 'tvivpar') and averaged fields
        ('graproes', 'prom_ocup') have NaN in 'weight' and in the '_dist'
        columns.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    """

    ################################################################## SET FIELDS OF INTEREST
    fields_of_interest = ['pobtot','pobfem','pobmas',
                          'p_0a2','p_0a2_f','p_0a2_m',
                          'p_3ymas','p_3ymas_f','p_3ymas_m',
                          'p_5ymas','p_5ymas_f','p_5ymas_m',
                          'p_12ymas','p_12ymas_f','p_12ymas_m',
                          'p_15ymas','p_15ymas_f','p_15ymas_m',
                          'p_18ymas','p_18ymas_f','p_18ymas_m',
                          'p_3a5','p_3a5_f','p_3a5_m',
                          'p_6a11','p_6a11_f','p_6a11_m',
                          'p_8a14','p_8a14_f','p_8a14_m',
                          'p_12a14','p_12a14_f','p_12a14_m',
                          'p_15a17','p_15a17_f','p_15a17_m',
                          'p_18a24','p_18a24_f','p_18a24_m',
                          'p_15a49_f',
                          'p_60ymas','p_60ymas_f','p_60ymas_m',
                          'pob0_14','pob15_64','pob65_mas',
                           # FROM HERE ON VALUES ARE APPROXIMATE BECAUSE THEY ARE PER BLOCK AND ARE NOT PROCESSED BY calculate_censo_nan_values()
                          'p3ym_hli', #Population aged 3 and over that speaks an indigenous language
                          'p3a5_noa', #Population aged 3 to 5 that does not attend school
                          'p6a11_noa', #Population aged 6 to 11 that does not attend school
                          'p12a14noa', #Population aged 12 to 14 that does not attend school
                          'p15a17a', #Population aged 15 to 17 that DOES attend school
                          'p18a24a', #Population aged 18 to 24 that DOES attend school
                          'p15ym_an', #Population aged 15 and over that cannot read and write
                          'p15sec_co', #Population aged 15 and over with complete secondary education
                          'psinder', #Population without affiliation to health services (IMSS, ISSSTE, PEMEX, SEDENA, SEMAR, INSABI or other)
                          'tvivpar', #Total private dwellings
                          'tvivparhab'] #Inhabited private dwellings

    # This field has a different name in each census. Both years are reported in
    # the same row, labelled with the 2020 name ('pcon_disc').
    year_specific_field = {'2010': 'pcon_lim', '2020': 'pcon_disc'}

    # Data that is already an average (do not sum, calculate mean)
    mean_fields = ['graproes', #Average years of schooling
                   'prom_ocup'] #Average occupants in inhabited private dwellings

    census_by_year = {'2010': census_2010, '2020': census_2020}

    # Fail early, listing every missing column at once instead of stopping on the first one
    for year, data_gdf in census_by_year.items():
        required = fields_of_interest + [year_specific_field[year]] + mean_fields
        missing = [field for field in required if field not in data_gdf.columns]
        if missing:
            raise KeyError(f"{year} census data is missing required columns: {missing}")

    # Dataframe that will store all processed data (one row per field)
    summary_df = pd.DataFrame({'field': fields_of_interest + ['pcon_disc'] + mean_fields})

    ################################################################## CALCULATE TOTAL VALUES FOR EACH FIELD OF INTEREST
    print("Columns 2010_value and 2020_value show total data in 2010 and 2020 for each field.")
    print("--"*15)

    for year, data_gdf in census_by_year.items():
        sum_fields = fields_of_interest + [year_specific_field[year]]
        # Totals are truncated to whole numbers
        totals = [int(data_gdf[field].sum()) for field in sum_fields]
        # Averages are the NaN-ignoring, unweighted mean of the per-row averages
        means = [round(np.nanmean(data_gdf[field]), 2) for field in mean_fields]
        summary_df[f'{year}_value'] = pd.Series(totals + means, dtype='float64')

    ################################################################## CALCULATE CHANGE IN TOTAL VALUES BETWEEN 2010 AND 2020
    print("Column diff_values shows change in total data between 2010 and 2020.")
    print("diff_values = 2020_value - 2010_value")
    print("--"*15)
    print("Column %diff_values shows percentage of change in total data between 2010 and 2020.")
    print("%diff_values = (diff_values / 2010_value)x100")
    print("--"*15)

    # Calculate changes between 2010 values and 2020 values
    summary_df['diff_values'] = summary_df['2020_value'] - summary_df['2010_value']
    summary_df['%diff_values'] = round((summary_df['diff_values'] / summary_df['2010_value'])*100,2)

    ################################################################## CALCULATE DISTRIBUTION OF VALUES WITH REFERENCE TO A PARENT CATEGORY
    print("Columns _dist shows the percentage that each field has with respect to it's parent category.")
    print("_dist changes depending on field. e.g.: pct_pobfem = pobfem/pobtot")
    print("--"*15)

    # Fields whose reference is NOT pobtot
    weight_dct = {'p3ym_hli':'p_3ymas',
                  'p3a5_noa':'p_3a5',
                  'p6a11_noa':'p_6a11',
                  'p12a14noa':'p_12a14',
                  'p15a17a':'p_15a17',
                  'p18a24a':'p_18a24',
                  'p15ym_an':'p_15ymas',
                  'p15sec_co':'p_15ymas',
                  'tvivparhab':'tvivpar'}

    # Fields that get no distribution: those without a parent category, and the
    # averaged fields (an average divided by total population is not a share).
    ignore_fields = ['psinder','tvivpar'] + mean_fields
    has_parent = ~summary_df['field'].isin(ignore_fields)

    # Parent (denominator) field of every row; 'pobtot' unless weight_dct says otherwise
    parent_field = summary_df['field'].map(lambda field: weight_dct.get(field, 'pobtot'))
    summary_df['weight'] = parent_field.where(has_parent)

    # Look up parent values by field name rather than by row position
    values_by_field = summary_df.set_index('field')
    for year in census_by_year:
        parent_value = parent_field.map(values_by_field[f'{year}_value'])
        share = (summary_df[f'{year}_value'] / parent_value) * 100
        summary_df[f'{year}_dist'] = share.round(2).where(has_parent)

    ################################################################## CALCULATE CHANGE IN DISTRIBUTION OF VALUES
    print("Column diff_dist shows the change in distribution for each field")
    print("diff_dist = 2020_dist - 2010_dist")
    print("--"*15)

    # Calculate changes between 2010 distributions and 2020 distributions
    summary_df['diff_dist'] = summary_df['2020_dist'] - summary_df['2010_dist']

    return summary_df

In [None]:
# Summary restricted to the census features joined to the input polygon
polygon_summary = agregate_census_data_of_interest(
    census_2010=pob_10_polygon,
    census_2020=pob_20_polygon,
)
# Show
polygon_summary

In [None]:
# Summary for every census feature loaded for the city (reference for the polygon summary)
metropolis_summary = agregate_census_data_of_interest(
    census_2010=pob_10_gdf,
    census_2020=pob_20_gdf,
)
# Show
metropolis_summary