# 01b - calculate_censo_nan_values_v1 tests

This Notebook runs more specific tests regarding how calculate_censo_nan_values_v1 works.

* Test 01 - For rows with nan values only (censored blocks), is pobtot always available?
    * __Answer:__ Yes (At least in Aguascalientes).
* Test 02 - Are those completely censored blocks receiving pop data after the function runs?
    * __Answer:__ Yes, but there's a nan there. Will need another test.
* Test 03 - Why are there NaN values on block CVEGEO 0100100010229011?
    * __Answer:__ Still unknown. The AGEB data also has NaN values in those same columns.
 
Side note (not known before this test): when a function modifies an argument without copying it first, the change apparently also affects the caller's (global) variable. For example, the columns of pop_ageb_gdf become upper case after calling calculate_censo_nan_values_v1.

## Import libraries

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from scipy.spatial import Voronoi, voronoi_plot_2d
import shapely

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the directory three levels above this notebook importable so that the
# project package `aup` can be found.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Import unconditionally: when the import lived inside the `if`, `aup` was never
# imported if the path was already present on sys.path.
import aup

## Prep for calculate_censo_nan_values with Aguascalientes

In [114]:
# Load the municipalities that make up the city of interest.
city = 'Aguascalientes'
query = f"SELECT * FROM metropolis.metro_gdf_2020 WHERE \"city\" LIKE \'{city}\'"
city_gdf = aup.gdf_from_query(query, geometry_col='geometry')
city_gdf = city_gdf.set_crs("EPSG:4326")

##########################################################################################
# Based on Script 22-censo_pop_voronoi (2024 04 18)
year = '2020'
##########################################################################################

# STEP 1: LOAD DATA
print("--"*30)
print("--- LOADING CITY POP DATA.")

# 1.1 --------------- CREATE AREA OF INTEREST FOR CITY
aoi = city_gdf.dissolve()

# 1.2 --------------- LOAD POP DATA (AGEBs and Blocks)
print("--- Loading blocks and AGEBs for area of interest.")

# Load states for current city (CVE_ENT)
cve_ent_list = list(city_gdf.CVE_ENT.unique())

# One frame per state; they are concatenated once after the loop instead of
# growing a GeoDataFrame inside it.
ageb_frames = []
mza_frames = []

for cve_ent in cve_ent_list:
    # Load muns in each city state
    cve_mun_list = list(city_gdf.loc[city_gdf.CVE_ENT == cve_ent].CVE_MUN.unique())
    # Build the SQL IN list explicitly. str(tuple(...)) renders a single-element
    # tuple as ('001',), whose trailing comma is invalid SQL; joining the quoted
    # values gives ('001') or ('001', '002') without duplicating any entry.
    cve_mun_tpl = "(" + ", ".join(f"'{cve_mun}'" for cve_mun in cve_mun_list) + ")"
    # Load AGEBs
    query = f"SELECT * FROM censoageb.censoageb_{year} WHERE (\"cve_ent\" = \'{cve_ent}\') AND \"cve_mun\" IN {cve_mun_tpl} "
    ageb_frames.append(aup.gdf_from_query(query, geometry_col='geometry'))
    # Load blocks
    query = f"SELECT * FROM censo_mza.censo_mza_{year} WHERE (\"CVE_ENT\" = \'{cve_ent}\') AND \"CVE_MUN\" IN {cve_mun_tpl} "
    mza_frames.append(aup.gdf_from_query(query, geometry_col='geometry'))

# pd.concat raises on an empty list, so fall back to an empty GeoDataFrame
# when the city has no states.
pop_ageb_gdf = pd.concat(ageb_frames) if ageb_frames else gpd.GeoDataFrame()
pop_mza_gdf = pd.concat(mza_frames) if mza_frames else gpd.GeoDataFrame()

# For 2020 dataset, select urban blocks only
if year == '2020':
    pop_mza_gdf = pop_mza_gdf.loc[pop_mza_gdf.AMBITO == 'Urbana'].copy()

# Set CRS
pop_ageb_gdf = pop_ageb_gdf.set_crs("EPSG:4326")
pop_mza_gdf = pop_mza_gdf.set_crs("EPSG:4326")

------------------------------------------------------------
--- LOADING CITY POP DATA.
--- Loading blocks and AGEBs for area of interest.


In [121]:
# Shape and first row of the AGEB gdf.
# NOTE(review): this cell's execution count (121) is higher than that of the cells
# below it, so the stored output may show pop_ageb_gdf as it was after
# calculate_censo_nan_values_v1 ran — re-run the notebook in order to confirm.
print(pop_ageb_gdf.shape)
pop_ageb_gdf.head(1)

(392, 237)


Unnamed: 0,CVE_GEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,geometry,ENTIDAD,NOM_ENT,MUN,NOM_MUN,...,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINTLC,VPH_SINCINT,VPH_SINTIC,CVE_GEO_AGEB
0,010010001216A,1,1,1,216A,"POLYGON ((-102.27058 21.87363, -102.27083 21.8...",1,Aguascalientes,1,Aguascalientes,...,753.0,609.0,439.0,205.0,146.0,7.0,14.0,174.0,,010010001216A


In [116]:
# Shape and first row of the blocks gdf (urban blocks only for the 2020 dataset).
print(pop_mza_gdf.shape)
pop_mza_gdf.head(1)

(12932, 239)


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC
0,100100010286003,1,1,1,286,3,Urbana,Típica,"POLYGON ((-102.30752 21.90227, -102.30738 21.9...",1,...,,,,,,,,,,


## Calculate censo nan values

In [117]:
# Run the function under test on the AGEB and block gdfs; the result is kept in a
# separate variable so the original blocks gdf can still be compared against it.
# NOTE(review): according to the note at the top of this notebook, this call appears
# to change pop_ageb_gdf's column names in place — confirm inside aup.
pop_mza_gdf_calc = aup.calculate_censo_nan_values_v1(pop_ageb_gdf,pop_mza_gdf,extended_logs=False)

INSPECTING AGEBs.
STARTING NANs calculation.
Calculating NaNs. 10% done.
Calculating NaNs. 20% done.
Calculating NaNs. 30% done.
Calculating NaNs. 40% done.
Calculating NaNs. 50% done.
Calculating NaNs. 60% done.
Calculating NaNs. 70% done.
Calculating NaNs. 80% done.
Calculating NaNs. 90% done.
Calculating NaNs. 100% done.
Finished calculating NaNs.
Percentage of NaNs found using blocks gdf: 79.7%.
Columns which could be solved entirely using equations in block_gdf: 4980.0.
Columns which required AGEB filling: 9916.0.
Columns which couldn't be solved: 0.0.


In [118]:
# Show shape and first row of the calculated blocks gdf
print(pop_mza_gdf_calc.shape)
pop_mza_gdf_calc.head(1)

(12932, 239)


Unnamed: 0,cvegeo,cve_ent,cve_mun,cve_loc,cve_ageb,cve_mza,ambito,tipomza,geometry,entidad,...,vph_telef,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sinltc,vph_sincint,vph_sintic
0,100100010286003,1,1,1,286,3,Urbana,Típica,"POLYGON ((-102.30752 21.90227, -102.30738 21.9...",1,...,,,,,,,,,,


In [119]:
# Inspect the AGEB gdf again, now after calculate_censo_nan_values_v1 has been
# called with it, to see whether the call changed it.
print(pop_ageb_gdf.shape)
pop_ageb_gdf.head(1)

(392, 237)


Unnamed: 0,CVE_GEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,geometry,ENTIDAD,NOM_ENT,MUN,NOM_MUN,...,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINTLC,VPH_SINCINT,VPH_SINTIC,CVE_GEO_AGEB
0,010010001216A,1,1,1,216A,"POLYGON ((-102.27058 21.87363, -102.27083 21.8...",1,Aguascalientes,1,Aguascalientes,...,753.0,609.0,439.0,205.0,146.0,7.0,14.0,174.0,,010010001216A


## Tests

### Test 01 - For rows with nan values only (censored blocks), __is pobtot always available?__
#### __Answer:__ Yes (At least in Aguascalientes).

#### Short answer:

In [81]:
# Blocks whose total population (POBTOT) is missing; an empty result means
# POBTOT is always available.
short_answer = pop_mza_gdf[pop_mza_gdf['POBTOT'].isna()]
short_answer

Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC


#### Long answer:

In [87]:
# Select data of interest: block id, total population and its breakdown columns
columns_of_interest = ['CVEGEO','POBTOT','POBFEM','POBMAS',
                        'P_0A2','P_0A2_F','P_0A2_M',
                        'P_3A5','P_3A5_F','P_3A5_M',
                        'P_6A11','P_6A11_F','P_6A11_M',
                        'P_12A14','P_12A14_F','P_12A14_M',
                        'P_15A17','P_15A17_F','P_15A17_M',
                        'P_18A24','P_18A24_F','P_18A24_M',
                        'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
                        'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
                        'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
                        'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
                        'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
                        'POB0_14','POB15_64','POB65_MAS']
# Subset first, then copy, so only the needed columns are duplicated and later
# column assignments act on an independent frame.
test_1 = pop_mza_gdf[columns_of_interest].copy()

# Every column except the block id and POBTOT takes part in the sums
sum_cols = [col for col in columns_of_interest if col not in ('CVEGEO', 'POBTOT')]
test_1[sum_cols] = test_1[sum_cols].apply(pd.to_numeric)

# Vectorised replacement for the previous row-wise apply, which ran once per
# column and read the loop variable as a global:
#   normal_sum -> NaN as soon as any summed column is NaN (skipna=False)
#   nansum     -> NaNs are skipped; a row with only NaNs sums to 0 (skipna=True)
test_1['normal_sum'] = test_1[sum_cols].sum(axis=1, skipna=False)
test_1['nansum'] = test_1[sum_cols].sum(axis=1, skipna=True)

test_1.head(5)

Unnamed: 0,CVEGEO,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M,...,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS,normal_sum,nansum
0,100100010233003,2,,,,,,,,,...,,,,,,,,,,0.0
1,100100010229001,57,28.0,29.0,0.0,0.0,0.0,,,,...,25.0,28.0,50.0,23.0,27.0,4.0,48.0,5.0,,570.0
2,100100010229002,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100100010229003,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100100010229004,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Result: Blocks with no pop data (0 population)

In [89]:
# Blocks with no population (all data is zero) have normal_sum == 0 and nansum == 0
zero_normal_sum = test_1['normal_sum'].eq(0)
zero_nansum = test_1['nansum'].eq(0)
all_zeros = test_1[zero_normal_sum & zero_nansum]
all_zeros

Unnamed: 0,CVEGEO,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M,...,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS,normal_sum,nansum
2,0100100010229002,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0100100010229003,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0100100010229004,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0100100010286034,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0100100010286038,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12993,0101100110206007,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13008,010110052016A004,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13010,010110052016A038,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13015,0101101380210042,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Result: Blocks with all nan data (But with population)

In [90]:
# Blocks where every summed column is NaN have normal_sum == NaN and nansum == 0
censored_mask = test_1['normal_sum'].isna() & test_1['nansum'].eq(0)
all_nans = test_1[censored_mask]
all_nans

Unnamed: 0,CVEGEO,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M,...,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS,normal_sum,nansum
0,0100100010233003,2,,,,,,,,,...,,,,,,,,,,0.0
28,0100100010290017,5,,,,,,,,,...,,,,,,,,,,0.0
93,0100100010322031,9,,,,,,,,,...,,,,,,,,,,0.0
136,0100100010341021,9,,,,,,,,,...,,,,,,,,,,0.0
199,0100100010233042,4,,,,,,,,,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12883,0100100011227046,5,,,,,,,,,...,,,,,,,,,,0.0
12947,0101101380210017,10,,,,,,,,,...,,,,,,,,,,0.0
12995,0101100010028028,2,,,,,,,,,...,,,,,,,,,,0.0
13013,0101101380210039,5,,,,,,,,,...,,,,,,,,,,0.0


In [80]:
# Censored blocks that also lack POBTOT; an empty result means POBTOT is always present
all_nans[all_nans['POBTOT'].isna()]

Unnamed: 0,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M,P_6A11,...,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS,normal_sum,nansum


### Test 02 - Are those completely __censored blocks receiving pop data__ after the function runs?
#### __Answer:__ Yes, but there's a nan there. Will need another test.

In [97]:
# Ids (CVEGEO) of the blocks selected in all_nans
all_nans_cvegeos = all_nans['CVEGEO'].unique().tolist()

# The calculated gdf uses lower-case column names, so lower-case the selection
columns_of_interest_lower = [name.lower() for name in columns_of_interest]

# Keep only those blocks, and only the columns of interest, from the calculated gdf
is_censored_block = pop_mza_gdf_calc['cvegeo'].isin(all_nans_cvegeos)
redistributed_data_to_nans = pop_mza_gdf_calc.loc[is_censored_block, columns_of_interest_lower]

# Show
redistributed_data_to_nans

Unnamed: 0,cvegeo,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3a5,p_3a5_f,p_3a5_m,...,p_12ymas_m,p_15ymas,p_15ymas_f,p_15ymas_m,p_18ymas,p_18ymas_f,p_18ymas_m,pob0_14,pob15_64,pob65_mas
0,0100100010233003,2,0.857143,1.142857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.142857,2.000000,0.857143,1.142857,2.000000,0.857143,1.142857,0.000000,2.000000,0.000000
21,0100100010233042,4,1.714286,2.285714,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,2.285714,4.000000,1.714286,2.285714,4.000000,1.714286,2.285714,0.000000,4.000000,0.000000
25,0100100010233006,1,0.428571,0.571429,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.571429,1.000000,0.428571,0.571429,1.000000,0.428571,0.571429,0.000000,1.000000,0.000000
43,0100100010229011,1,0.000000,1.000000,0.000000,,,0.035714,0.017241,0.037037,...,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000
56,0100100010286003,2,0.000000,2.000000,0.000000,0.000000,0.000000,0.050420,0.052632,0.022099,...,2.000000,2.000000,0.000000,2.000000,2.000000,0.000000,2.000000,0.000000,0.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12577,0101100110193024,2,4.947368,4.736842,0.315789,0.105263,0.210526,0.188679,0.098765,0.127660,...,3.368421,6.421053,3.473684,2.947368,5.894737,3.157895,2.736842,3.263158,5.578947,0.842105
12581,0101100110193001,8,19.789474,18.947368,1.263158,0.421053,0.842105,0.754717,0.395062,0.510638,...,13.473684,25.684211,13.894737,11.789474,23.578947,12.631579,10.947368,13.052632,22.315789,3.368421
12774,010110052016A033,8,3.000000,5.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000,...,2.000000,5.000000,3.000000,2.000000,5.000000,3.000000,2.000000,3.000000,5.000000,0.000000
12815,0101101380210017,10,4.000000,6.000000,0.666667,0.666667,0.000000,1.333333,0.402685,0.671141,...,4.000000,5.333333,2.666667,2.666667,3.333333,2.000000,1.333333,4.666667,5.333333,0.000000


### Test 03 - __Why are there NaN values__ on block CVEGEO 0100100010229011?
#### __Answer:__ Still unknown. The AGEB data also has NaN values in those same columns.

In [103]:
# AGEB that contains the block with leftover NaN values (CVEGEO 0100100010229011)
ageb_ofinterest = '0229'

# All calculated blocks with that AGEB code, restricted to the columns of interest
in_ageb_ofinterest = pop_mza_gdf_calc['cve_ageb'] == ageb_ofinterest
test_3_blocks = pop_mza_gdf_calc.loc[in_ageb_ofinterest, columns_of_interest_lower]
test_3_blocks

Unnamed: 0,cvegeo,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3a5,p_3a5_f,p_3a5_m,...,p_12ymas_m,p_15ymas,p_15ymas_f,p_15ymas_m,p_18ymas,p_18ymas_f,p_18ymas_m,pob0_14,pob15_64,pob65_mas
32,100100010229001,57,28.0,29.0,0.0,0.0,0.0,2.035714,0.982759,1.0,...,28.0,53.0,25.0,28.0,50.0,23.0,27.0,4.0,48.0,5.0
33,100100010229002,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,100100010229003,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,100100010229004,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,100100010229006,26,9.0,17.0,0.0,0.0,0.0,0.928571,0.0,0.962963,...,15.0,24.0,9.0,15.0,24.0,9.0,15.0,2.0,19.0,5.0
37,100100010229007,56,22.0,34.0,2.0,1.0,1.0,3.0,1.0,2.0,...,27.0,43.0,17.0,26.0,42.0,16.0,26.0,13.0,41.0,2.0
38,100100010229008,46,23.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.0,41.0,20.0,21.0,39.0,19.0,20.0,5.0,34.0,7.0
39,100100010229010,114,65.0,49.0,1.0,1.0,0.0,2.0,1.0,1.0,...,47.0,103.0,57.0,46.0,98.0,54.0,44.0,11.0,93.0,10.0
40,100100010229015,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,100100010229019,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
# The AGEB gdf has no CVEGEO column selected here, so use every other column of interest
ageb_columns_of_interest = [name for name in columns_of_interest if name != 'CVEGEO']

# AGEB-level values for the AGEB of interest
is_ageb_ofinterest = pop_ageb_gdf['CVE_AGEB'] == ageb_ofinterest
test_3_ageb = pop_ageb_gdf.loc[is_ageb_ofinterest, ageb_columns_of_interest]

test_3_ageb

Unnamed: 0,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M,P_6A11,...,P_12YMAS_M,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,POB0_14,POB15_64,POB65_MAS
66,300,147.0,153.0,3.0,,,8.0,3.0,5.0,14.0,...,141.0,265.0,128.0,137.0,254.0,121.0,133.0,35.0,235.0,30.0


In [137]:
# For each column, put the sum over the AGEB's blocks next to the AGEB's own value.
# The frame is built once from lists instead of being enlarged cell by cell with .loc.
compare = pd.DataFrame({
    'col': ageb_columns_of_interest,
    # Block columns are lower case in the calculated gdf
    'blocks': [test_3_blocks[col.lower()].sum() for col in ageb_columns_of_interest],
    # First distinct AGEB value for the column (NaN if the AGEB value is missing)
    'ageb': [test_3_ageb[col].unique()[0] for col in ageb_columns_of_interest],
})
# Find diff: 0 when the blocks add up to the AGEB value, NaN when the AGEB value is NaN
compare['diff'] = compare['blocks'] - compare['ageb']

# Show
compare

Unnamed: 0,col,blocks,ageb,diff
0,POBTOT,300.0,300.0,0.0
1,POBFEM,147.0,147.0,0.0
2,POBMAS,153.0,153.0,0.0
3,P_0A2,3.0,3.0,0.0
4,P_0A2_F,2.0,,
5,P_0A2_M,1.0,,
6,P_3A5,8.0,8.0,0.0
7,P_3A5_F,3.0,3.0,0.0
8,P_3A5_M,5.0,5.0,0.0
9,P_6A11,14.0,14.0,0.0


In [None]:
]