# 01c-calculate_censo_nan_values_v1 tests2 (After adding pob_condisc)

This notebook was created after developing the function calculate_censo_nan_values_v1, with the following purposes:
1. Keep an up-to-date notebook for the function to facilitate further tests and updates.
2. Add population with disabilities to the function.

## Import libraries

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from scipy.spatial import Voronoi, voronoi_plot_2d
import shapely

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Register the directory three levels up (presumably the repository root) so that
# the project package `aup` can be imported from this notebook.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: if the path is already on sys.path (e.g. through PYTHONPATH)
# the import must still run, otherwise `aup` would be left undefined.
import aup

## Prep for calculate_censo_nan_values with Aguascalientes

In [2]:
# City under analysis and census year
# (the steps below are based on Script 22-censo_pop_voronoi, 2024 04 18)
city = 'Aguascalientes'
year = '2020'

# Query the rows of metropolis.metro_gdf_2020 that match the city and declare their CRS
query = f"SELECT * FROM metropolis.metro_gdf_2020 WHERE \"city\" LIKE \'{city}\'"
city_gdf = aup.gdf_from_query(query, geometry_col='geometry').set_crs("EPSG:4326")

# STEP 1: LOAD DATA
print("--"*30)
print("--- LOADING CITY POP DATA.")

# 1.1 --------------- CREATE AREA OF INTEREST FOR CITY
# All city geometries dissolved into a single feature
aoi = city_gdf.dissolve()

# 1.2 --------------- LOAD POP DATA (AGEBs and Blocks)
print("--- Loading blocks and AGEBs for area of interest.")

# One query result per state is collected in these lists and concatenated once
# after the loop, instead of growing a GeoDataFrame on every iteration.
ageb_parts = []
mza_parts = []

# States (CVE_ENT) present in the current city
cve_ent_list = list(city_gdf.CVE_ENT.unique())

for cve_ent in cve_ent_list:
    # Municipalities (CVE_MUN) of the city that belong to this state
    cve_mun_list = list(city_gdf.loc[city_gdf.CVE_ENT == cve_ent].CVE_MUN.unique())
    # Build the SQL IN list explicitly. str(tuple(...)) renders a one-element tuple
    # as ('001',), which breaks the query; joining the quoted codes produces a valid
    # list for any length without having to duplicate the municipality.
    cve_mun_tpl = "(" + ", ".join(f"'{cve_mun}'" for cve_mun in cve_mun_list) + ")"
    # Load AGEBs for this state
    query = f"SELECT * FROM censoageb.censoageb_{year} WHERE (\"cve_ent\" = \'{cve_ent}\') AND \"cve_mun\" IN {cve_mun_tpl} "
    ageb_parts.append(aup.gdf_from_query(query, geometry_col='geometry'))
    # Load blocks for this state
    query = f"SELECT * FROM censo_mza.censo_mza_{year} WHERE (\"CVE_ENT\" = \'{cve_ent}\') AND \"CVE_MUN\" IN {cve_mun_tpl} "
    mza_parts.append(aup.gdf_from_query(query, geometry_col='geometry'))

# Keep the previous result (an empty GeoDataFrame) when no state was found
pop_ageb_gdf = pd.concat(ageb_parts) if ageb_parts else gpd.GeoDataFrame()
pop_mza_gdf = pd.concat(mza_parts) if mza_parts else gpd.GeoDataFrame()
    

# For the 2020 dataset, keep only the blocks whose AMBITO is 'Urbana'
if year == '2020':
    is_urban = pop_mza_gdf['AMBITO'] == 'Urbana'
    pop_mza_gdf = pop_mza_gdf[is_urban].copy()

# Declare the CRS of both layers
pop_ageb_gdf = pop_ageb_gdf.set_crs("EPSG:4326")
pop_mza_gdf = pop_mza_gdf.set_crs("EPSG:4326")

------------------------------------------------------------
--- LOADING CITY POP DATA.
--- Loading blocks and AGEBs for area of interest.


In [7]:
# Show the (rows, columns) shape of the AGEB table and preview its first row
print(pop_ageb_gdf.shape)
pop_ageb_gdf.head(1)

(392, 237)


Unnamed: 0,cve_geo,cve_ent,cve_mun,cve_loc,cve_ageb,geometry,entidad,nom_ent,mun,nom_mun,...,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic,cve_geo_ageb
0,010010001216A,1,1,1,216A,"POLYGON ((-102.27058 21.87363, -102.27083 21.8...",1,Aguascalientes,1,Aguascalientes,...,753.0,609.0,439.0,205.0,146.0,7.0,14.0,174.0,,010010001216A


In [4]:
# Show the (rows, columns) shape of the blocks table and preview its first row
print(pop_mza_gdf.shape)
pop_mza_gdf.head(1)

(12932, 239)


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC
0,100100010286027,1,1,1,286,27,Urbana,Típica,"POLYGON ((-102.31215 21.90182, -102.31220 21.9...",1,...,31,45,39,30,18,18,0,0,5,0


## Test after editing function calculate_censo_nan_values_v1

__Edits to function:__
* Added .copy() to the input arguments pop_ageb_gdf and pop_mza_gdf, because applying the method .upper() to the AGEB columns was altering the global variable pop_ageb_gdf when it had the same name.
* Added PCON_DISC to columns_of_interest (this does not add an equation to the blocks calculation; it only includes the column in the AGEB distribution).

### Current result after function

In [5]:
# Run the function under test on the AGEB and block tables loaded above.
# NOTE(review): judging by the log it prints, the function fills NaN values in the
# blocks using block-level equations and AGEB data — confirm against aup's implementation.
pop_mza_gdf_calc = aup.calculate_censo_nan_values_v1(pop_ageb_gdf,pop_mza_gdf,extended_logs=False)

# Show the shape of the result and preview its first row
print(pop_mza_gdf_calc.shape)
pop_mza_gdf_calc.head(1)

INSPECTING AGEBs.
STARTING NANs calculation.
Calculating NaNs. 10% done.
Calculating NaNs. 20% done.
Calculating NaNs. 30% done.
Calculating NaNs. 40% done.
Calculating NaNs. 50% done.
Calculating NaNs. 60% done.
Calculating NaNs. 70% done.
Calculating NaNs. 80% done.
Calculating NaNs. 90% done.
Calculating NaNs. 100% done.
Finished calculating NaNs.
Percentage of NaNs found using blocks gdf: 76.46%.
Columns which could be solved entirely using equations in block_gdf: 4993.0.
Columns which required AGEB filling: 10295.0.
Columns which couldn't be solved: 0.0.
(12932, 239)


Unnamed: 0,cvegeo,cve_ent,cve_mun,cve_loc,cve_ageb,cve_mza,ambito,tipomza,geometry,entidad,...,vph_telef,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sinltc,vph_sincint,vph_sintic
0,100100010286027,1,1,1,286,27,Urbana,Típica,"POLYGON ((-102.31215 21.90182, -102.31220 21.9...",1,...,31,45,39,30,18,18,0,0,5,0


#### Filter blocks and agebs for data of interest

In [13]:
# Columns to verify: the AGEB key plus the population columns
# (total, by sex, by age group and population with disabilities)
columns_of_interest = ['cve_ageb','POBTOT','POBFEM','POBMAS',
                       'P_0A2','P_0A2_F','P_0A2_M',
                       'P_3A5','P_3A5_F','P_3A5_M',
                       'P_6A11','P_6A11_F','P_6A11_M',
                       'P_12A14','P_12A14_F','P_12A14_M',
                       'P_15A17','P_15A17_F','P_15A17_M',
                       'P_18A24','P_18A24_F','P_18A24_M',
                       'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
                       'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
                       'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
                       'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
                       'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
                       'POB0_14','POB15_64','POB65_MAS',
                       'PCON_DISC']

# Same names in lower case, in the same order
columns_of_interest_lower = [name.lower() for name in columns_of_interest]

# Keep only the columns of interest (lower-case names) from the calculated blocks
# and from the AGEB table, so both can be compared column by column
blocks_check = pop_mza_gdf_calc[columns_of_interest_lower]
agebs_check = pop_ageb_gdf[columns_of_interest_lower]

Unnamed: 0,cve_ageb,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3a5,p_3a5_f,p_3a5_m,...,p_15ymas,p_15ymas_f,p_15ymas_m,p_18ymas,p_18ymas_f,p_18ymas_m,pob0_14,pob15_64,pob65_mas,pcon_disc
0,286,138,76.0,62.0,4.0,1.0,3.0,5.0,4.0,1.0,...,117.0,64.0,53.0,113.0,62.0,51.0,21.0,95.0,22.0,2.0369


#### Find values by ageb

In [17]:
# Columns holding data to check (everything of interest except the AGEB key)
analysis_cols = [col for col in columns_of_interest_lower if col != 'cve_ageb']

# One record (dict) per AGEB; the DataFrame is built once at the end instead of
# being filled cell by cell with .loc, which reallocates the frame as it grows.
# Note: integer-valued columns may now keep an integer dtype instead of float.
records = []

# NOTE(review): rows are matched on cve_ageb alone — presumably unique within this
# city; verify that the same AGEB code does not repeat across municipalities.
for ageb in pop_ageb_gdf.cve_ageb.unique():
    current_blocks_df = blocks_check.loc[blocks_check.cve_ageb == ageb]
    current_ageb_df = agebs_check.loc[agebs_check.cve_ageb == ageb]

    record = {'ageb': ageb}
    for col in analysis_cols:
        # Sum over the AGEB's blocks vs. the value reported in the first matching AGEB row
        record[f"{col}_mza"] = current_blocks_df[col].sum()
        record[f"{col}_ageb"] = current_ageb_df[col].iloc[0]
    records.append(record)

all_result_check = pd.DataFrame(records)
all_result_check

Unnamed: 0,ageb,pobtot_mza,pobtot_ageb,pobfem_mza,pobfem_ageb,pobmas_mza,pobmas_ageb,p_0a2_mza,p_0a2_ageb,p_0a2_f_mza,...,p_18ymas_m_mza,p_18ymas_m_ageb,pob0_14_mza,pob0_14_ageb,pob15_64_mza,pob15_64_ageb,pob65_mas_mza,pob65_mas_ageb,pcon_disc_mza,pcon_disc_ageb
0,216A,2657.0,2657.0,1444.0,1444.0,1213.0,1213.0,112.0,112.0,61.0,...,877.0,877.0,572.0,572.0,1752.0,1752.0,333.0,333.0,195.0,195.0
1,2649,3435.0,3435.0,1783.0,1783.0,1652.0,1652.0,131.0,131.0,64.0,...,1189.0,1189.0,723.0,723.0,2536.0,2536.0,176.0,176.0,151.0,151.0
2,383A,2449.0,2449.0,1246.0,1246.0,1203.0,1203.0,151.0,151.0,69.0,...,660.0,660.0,914.0,914.0,1507.0,1507.0,28.0,28.0,62.0,62.0
3,287A,3537.0,3537.0,1829.0,1829.0,1708.0,1708.0,112.0,112.0,55.0,...,1332.0,1332.0,604.0,604.0,2560.0,2560.0,373.0,373.0,124.0,124.0
4,084A,1626.0,1626.0,867.0,867.0,759.0,759.0,33.0,32.0,13.0,...,612.0,612.0,240.0,240.0,1026.0,1026.0,359.0,359.0,122.0,122.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,0225,4396.0,4396.0,2257.0,2257.0,2139.0,2139.0,330.0,330.0,159.0,...,1228.0,1228.0,1648.0,1648.0,2682.0,2682.0,66.0,66.0,93.0,93.0
388,023A,5842.0,5842.0,2914.0,2914.0,2928.0,2928.0,422.0,422.0,215.0,...,1705.0,1705.0,2111.0,2111.0,3668.0,3668.0,63.0,63.0,152.0,152.0
389,016A,933.0,933.0,479.0,479.0,454.0,454.0,80.0,80.0,37.0,...,268.0,268.0,343.0,343.0,582.0,582.0,8.0,8.0,46.0,46.0
390,0189,1222.0,1222.0,609.0,609.0,613.0,613.0,117.0,117.0,62.0,...,342.0,342.0,494.0,494.0,725.0,725.0,3.0,3.0,25.0,25.0


#### Find differences in those values

In [29]:
# Start from the comparison table without its "<col>_mza" / "<col>_ageb" value columns;
# one "<col>_diff" column is added per analysed column instead.
value_cols = [f"{col}_{suffix}" for col in analysis_cols for suffix in ("mza", "ageb")]
diff_check = all_result_check.drop(columns=value_cols)

# Calculate diff by col: blocks sum minus AGEB value, rounded to whole units
diff_cols = []
for col in analysis_cols:
    diff_col = f"{col}_diff"
    diff_cols.append(diff_col)
    diff_check[diff_col] = (all_result_check[f"{col}_mza"] - all_result_check[f"{col}_ageb"]).round(0)

# Signed sum of all differences of the AGEB
diff_check['total_diff'] = diff_check[diff_cols].sum(axis=1)

# Flag an AGEB when ANY column differs. Filtering on total_diff > 0 would miss AGEBs
# whose blocks add up to less than the AGEB value, and AGEBs whose positive and
# negative differences cancel out in the signed sum.
has_diff = diff_check[diff_cols].abs().sum(axis=1) > 0
problem_check = diff_check.loc[has_diff]
problem_check

Unnamed: 0,ageb,pobtot_diff,pobfem_diff,pobmas_diff,p_0a2_diff,p_0a2_f_diff,p_0a2_m_diff,p_3a5_diff,p_3a5_f_diff,p_3a5_m_diff,...,p_15ymas_f_diff,p_15ymas_m_diff,p_18ymas_diff,p_18ymas_f_diff,p_18ymas_m_diff,pob0_14_diff,pob15_64_diff,pob65_mas_diff,pcon_disc_diff,total_diff
4,084A,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19,2884,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
39,2564,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,2.0
57,369A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,6.0
87,0356,0.0,0.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,5.0,0.0,23.0
110,0483,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
125,1956,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
145,0892,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
167,2511,0.0,0.0,0.0,3.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
187,3613,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0


### Test - Why those differences?
#### __Problem:__ There are small differences between the AGEB data and the calculated blocks data.
#### __Result:__ After analysing the first case (AGEB 084A), it was concluded that the errors come from the source (INEGI) data.

#### Analysing case AGEB 084A

In [63]:
# AGEB 084A shows a 1-person difference in p_0a2 between its blocks and the AGEB row.
test_ageb = '084A'
problem_ageb = agebs_check[agebs_check['cve_ageb'] == test_ageb]

# The displayed row shows that p_0a2 is available in that AGEB.
problem_ageb

Unnamed: 0,cve_ageb,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3a5,p_3a5_f,p_3a5_m,...,p_15ymas,p_15ymas_f,p_15ymas_m,p_18ymas,p_18ymas_f,p_18ymas_m,pob0_14,pob15_64,pob65_mas,pcon_disc
4,084A,1626,867.0,759.0,32.0,13.0,19.0,41.0,25.0,16.0,...,1385.0,741.0,644.0,1333.0,721.0,612.0,240.0,1026.0,359.0,122.0


In [65]:
# The blocks of the same AGEB add up to 33 (not 32) persons in p_0a2
problem_blocks = blocks_check[blocks_check['cve_ageb'] == test_ageb]
print(problem_blocks['p_0a2'].sum())
problem_blocks.head(1)

33.0


Unnamed: 0,cve_ageb,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3a5,p_3a5_f,p_3a5_m,...,p_15ymas,p_15ymas_f,p_15ymas_m,p_18ymas,p_18ymas_f,p_18ymas_m,pob0_14,pob15_64,pob65_mas,pcon_disc
1219,084A,44,22.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,...,41.0,21.0,20.0,41.0,21.0,20.0,3.0,30.0,11.0,4.0


In [69]:
# Checking the original data (pop_mza_gdf, before calculate_censo_nan_values)
# shows that the error is already present in the source data.

# Original block columns for this AGEB (the blocks table uses the upper-case key CVE_AGEB)
blocks_columns_of_interest = [name for name in columns_of_interest if name != 'cve_ageb'] + ['CVE_AGEB']
original_blocks = pop_mza_gdf[blocks_columns_of_interest]
ageb_org_blocks = original_blocks[original_blocks['CVE_AGEB'] == test_ageb]

# Equations for row 1193 and how they should add up:
# P_0A2 (None) = POBTOT (56) - P_3YMAS (54) = 2
# P_0A2_F (None) = POBFEM (32) - P_3YMAS_F (31) = 1
# P_0A2_M (0) = POBMAS (24) - P_3YMAS_M (23) = 1 --------------------------> Source data has 0 instead of 1
equation_cols = ['POBTOT','POBFEM','POBMAS','P_0A2','P_0A2_F','P_0A2_M','P_3YMAS','P_3YMAS_F','P_3YMAS_M']
ageb_org_blocks.loc[1193, equation_cols]

POBTOT         56
POBFEM         32
POBMAS         24
P_0A2        None
P_0A2_F      None
P_0A2_M         0
P_3YMAS        54
P_3YMAS_F      31
P_3YMAS_M      23
Name: 1193, dtype: object