In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from scipy.spatial import Voronoi, voronoi_plot_2d
import shapely

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup

  ox.config(


In [2]:
# OUTSIDE FUNCTION:
city = 'Aguascalientes'

# --------------- CREATE AREA OF INTEREST (aoi)
# Downloads mun_gdf for city and create aoi
query = f"SELECT * FROM metropolis.metro_gdf_2020 WHERE \"city\" LIKE \'{city}\'"
mun_gdf = aup.gdf_from_query(query, geometry_col='geometry')
mun_gdf = mun_gdf.set_crs("EPSG:4326")

In [3]:
# --------------- DOWNLOAD POP DATA
aoi = mun_gdf.dissolve()
print("Loading AGEBs for area of interest.")
pop_ageb_gdf = aup.gdf_from_polygon(aoi,'censoageb','censoageb_2020')
print("Loading blocks for area of interest.")
pop_mza_gdf = aup.gdf_from_polygon(aoi,'censo_mza','censo_mza_2020')
pop_mza_gdf = pop_mza_gdf.loc[pop_mza_gdf.AMBITO == 'Urbana'].copy()

Loading AGEBs for area of interest.
Loading blocks for area of interest.


In [4]:
chosen_ageb = '0515'

In [5]:
pop_mza_gdf_chosen = pop_mza_gdf.loc[pop_mza_gdf.CVE_AGEB == chosen_ageb].copy()
pop_ageb_gdf_chosen = pop_ageb_gdf.loc[pop_ageb_gdf.cve_ageb == chosen_ageb].copy()

In [6]:
pop_mza_gdf_chosen.head(1)

Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,CVE_AGEB,CVE_MZA,AMBITO,TIPOMZA,geometry,ENTIDAD,...,VPH_TELEF,VPH_CEL,VPH_INTER,VPH_STVP,VPH_SPMVPI,VPH_CVJ,VPH_SINRTV,VPH_SINLTC,VPH_SINCINT,VPH_SINTIC
444,100100010515001,1,1,1,515,1,Urbana,Típica,"POLYGON ((-102.29552 21.88697, -102.29516 21.8...",1,...,35,49,41,27,18,3,,,14,0


In [7]:
pop_ageb_gdf_chosen

Unnamed: 0,cve_geo,cve_ent,cve_mun,cve_loc,cve_ageb,geometry,entidad,nom_ent,mun,nom_mun,...,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic,cve_geo_ageb
191,100100010515,1,1,1,515,"POLYGON ((-102.29278 21.89010, -102.29275 21.8...",1,Aguascalientes,1,Aguascalientes,...,759.0,621.0,454.0,325.0,124.0,19.0,24.0,204.0,,100100010515


In [8]:
#pop_mza_gdf_chosen.to_csv('../../../data/external/temporal_fromjupyter/calculate_censo_nan_values/pop_mza_gdf_chosen.csv', index=False)
#pop_ageb_gdf_chosen.to_csv('../../../data/external/temporal_fromjupyter/calculate_censo_nan_values/pop_ageb_gdf_chosen.csv', index=False)

In [9]:
def calculate_censo_nan_values_v1(pop_ageb_gdf, pop_mza_gdf,extended_logs=False):
    ##########################################################################################
	# STEP 1: CHECK FOR DIFFERENCES IN AVAILABLE AGEBs (PREVENTS CRASH)

	# --------------- 1.1 SET COLUMNS TO .UPPER() EXCEPT FOR GEOMETRY
	# (When the equations were written, we used UPPER names, easier to change it this way and then return output with .lower columns)
    pop_ageb_gdf.columns = pop_ageb_gdf.columns.str.upper()
    pop_ageb_gdf.rename(columns={'GEOMETRY':'geometry'},inplace=True)
    
    pop_mza_gdf.columns = pop_mza_gdf.columns.str.upper()
    pop_mza_gdf.rename(columns={'GEOMETRY':'geometry'},inplace=True)

	# --------------- 1.2 CHECK FOR DIFFERENCES IN AGEBs
	# Look for AGEBs in both gdfs
    agebs_in_ageb_gdf = list(pop_ageb_gdf['CVE_AGEB'].unique())
    agebs_in_mza_gdf = list(pop_mza_gdf['CVE_AGEB'].unique())
    
    if (len(agebs_in_ageb_gdf) == 0) and (len(agebs_in_mza_gdf) == 0):
        print("Error: Area of interest has no pop data.")
        intended_crash

	# Test for AGEBs present in mza_gdf but not in AGEB_gdf
    missing_agebs = list(set(agebs_in_mza_gdf) - set(agebs_in_ageb_gdf))
    if len(missing_agebs) > 0:
        print(f'WARNING: AGEBs {missing_agebs} present in mza_gdf but missing from ageb_gdf.')
        print(f'WARNING: Removing AGEBs {missing_agebs} from AGEB analysis.')
    else:
        print("No problem")
    
	##########################################################################################
	# STEP 2: CALCULATE NAN VALUES
    print("STARTING NANs calculation.")

	# STATISTICS - LOG DATA
	# Will create progress logs when progress reaches these percentages:
    progress_logs = [10,20,30,40,50,60,70,80,90,100]
	# This df stores accumulative (All AGEBs) statistics for logs.
    acc_statistics = pd.DataFrame()

	# --------------- NaNs CALCULATION 2.0) Start
    i = 1
    for ageb in agebs_in_mza_gdf: # Most of the code of this function iterates over each AGEB
        if extended_logs:
            print('--'*20)
            print(f'Calculating NaNs for AGEB {ageb} ({i}/{len(agebs_in_mza_gdf)}.)')
		
		# STATISTICS - PROGRESS LOG DATA
		# Measures current progress, prints if passed a checkpoint of progress_logs list.
        current_progress = (i / len(agebs_in_mza_gdf))*100
        for checkpoint in progress_logs:
            if current_progress >= checkpoint:
                print(f'Calculating NaNs. {checkpoint}% done.')
                progress_logs.remove(checkpoint)
                break
        
		# --------------- NaNs CALCULATION 2.1) FIND CURRENT AGEB BLOCK DATA
        mza_ageb_gdf = pop_mza_gdf.loc[pop_mza_gdf['CVE_AGEB'] == ageb].copy()
        
		# --------------- NaNs CALCULATION 2.2) KEEP OUT OF THE PROCESS ROWS WHICH HAVE 0 VALUES (ALL values are NaNs)
		# 2.2a) Set columns to be analysed
        columns_of_interest = ['POBFEM','POBMAS',
							'P_0A2','P_0A2_F','P_0A2_M',
							'P_3A5','P_3A5_F','P_3A5_M',
							'P_6A11','P_6A11_F','P_6A11_M',
							'P_12A14','P_12A14_F','P_12A14_M',
							'P_15A17','P_15A17_F','P_15A17_M',
							'P_18A24','P_18A24_F','P_18A24_M',
							'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
							'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
							'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
							'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
							'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
							'REL_H_M','POB0_14','POB15_64','POB65_MAS']
        blocks = mza_ageb_gdf[['CVEGEO','POBTOT'] + columns_of_interest].copy()
		
		# 2.2b) Set found values to 0
        blocks['found_values'] = 0
		
		# 2.2c) Find rows with nan values and sum of nan values
        for col in columns_of_interest:
			# Turn to numeric
            blocks[col] = pd.to_numeric(blocks[col])
			# Set checker column to 'exist' (1)
            blocks[f'check_{col}'] = 1
			# If it doesn't exist, set that row's check to (0)
            idx = blocks[col].isna()
            blocks.loc[idx, f'check_{col}'] = 0
			# Sum total row nan values
            blocks['found_values'] = blocks['found_values'] + blocks[f'check_{col}']
			# Drop checker column
            blocks.drop(columns=[f'check_{col}'],inplace=True)
		
		# 2.2d) Loc rows with values in columns_of_interest (Can calculate NaNs)
        blocks_values = blocks.loc[blocks['found_values'] > 0].copy()
        blocks_values.drop(columns=['found_values'],inplace=True)
		
		# 2.2e) Save rows with 0 values for later. (Can't calculate NaNs, must distribute values).
        blocks_nans = blocks.loc[blocks['found_values'] == 0].copy()
        blocks_nans.drop(columns=['found_values'],inplace=True)
        
        del blocks
		
		# --------------- NaNs CALCULATION 3) CALCULATE NaN values in blocks
        if extended_logs:
            print(f'Calculating NaNs using block data for AGEB {ageb}.')
        # 2.3a) Count current (original) nan values
        original_nan_values = int(blocks_values.isna().sum().sum())
        # 2.3b) Set a start and finish nan value for while loop and run
        start_nan_values = original_nan_values
        finish_nan_values = start_nan_values - 1
        loop_count = 1
        while start_nan_values > finish_nan_values:
			# ROUND STARTING DATA
            start_nan_values = blocks_values.isna().sum().sum()

			# 2.3c) Set of equation with structure [PARENT] = [SUB] + [SUB]
			# POBTOT = POBFEM + POBMAS
            blocks_values.POBTOT.fillna(blocks_values.POBFEM + blocks_values.POBMAS, inplace=True)
            blocks_values.POBFEM.fillna(blocks_values.POBTOT - blocks_values.POBMAS, inplace=True)
            blocks_values.POBMAS.fillna(blocks_values.POBTOT - blocks_values.POBFEM, inplace=True)
			# P_0A2 = P_0A2_F + P_0A2_M
            blocks_values.P_0A2.fillna(blocks_values.P_0A2_F + blocks_values.P_0A2_M, inplace=True)
            blocks_values.P_0A2_F.fillna(blocks_values.P_0A2 - blocks_values.P_0A2_M, inplace=True)
            blocks_values.P_0A2_M.fillna(blocks_values.P_0A2 - blocks_values.P_0A2_F, inplace=True)
			# P_3A5 = P_3A5_F + P_3A5_M
            blocks_values.P_3A5.fillna(blocks_values.P_3A5_F + blocks_values.P_3A5_M, inplace=True)
            blocks_values.P_3A5_F.fillna(blocks_values.P_3A5 - blocks_values.P_3A5_M, inplace=True)
            blocks_values.P_3A5_M.fillna(blocks_values.P_3A5 - blocks_values.P_3A5_F, inplace=True)
			# P_6A11 = P_6A11_F + P_6A11_M
            blocks_values.P_6A11.fillna(blocks_values.P_6A11_F + blocks_values.P_6A11_M, inplace=True)
            blocks_values.P_6A11_F.fillna(blocks_values.P_6A11 - blocks_values.P_6A11_M, inplace=True)
            blocks_values.P_6A11_M.fillna(blocks_values.P_6A11 - blocks_values.P_6A11_F, inplace=True)
			# P_12A14 = P_12A14_F + P_12A14_M
            blocks_values.P_12A14.fillna(blocks_values.P_12A14_F + blocks_values.P_12A14_M, inplace=True)
            blocks_values.P_12A14_F.fillna(blocks_values.P_12A14 - blocks_values.P_12A14_M, inplace=True)
            blocks_values.P_12A14_M.fillna(blocks_values.P_12A14 - blocks_values.P_12A14_F, inplace=True)
			# P_15A17 = P_15A17_F + P_15A17_M
            blocks_values.P_15A17.fillna(blocks_values.P_15A17_F + blocks_values.P_15A17_M, inplace=True)
            blocks_values.P_15A17_F.fillna(blocks_values.P_15A17 - blocks_values.P_15A17_M, inplace=True)
            blocks_values.P_15A17_M.fillna(blocks_values.P_15A17 - blocks_values.P_15A17_F, inplace=True)
			# P_18A24 = P_18A24_F + P_18A24_M
            blocks_values.P_18A24.fillna(blocks_values.P_18A24_F + blocks_values.P_18A24_M, inplace=True)
            blocks_values.P_18A24_F.fillna(blocks_values.P_18A24 - blocks_values.P_18A24_M, inplace=True)
            blocks_values.P_18A24_M.fillna(blocks_values.P_18A24 - blocks_values.P_18A24_F, inplace=True)
			# P_60YMAS = P_60YMAS_F + P_60YMAS_M
            blocks_values.P_60YMAS.fillna(blocks_values.P_60YMAS_F + blocks_values.P_60YMAS_M, inplace=True)
            blocks_values.P_60YMAS_F.fillna(blocks_values.P_60YMAS - blocks_values.P_60YMAS_M, inplace=True)
            blocks_values.P_60YMAS_M.fillna(blocks_values.P_60YMAS - blocks_values.P_60YMAS_F, inplace=True)
			
			# 2.3d) Set of equation with structure [POBTOT] - [{n}_YMAS] = [group] + [group] + ... + [group]
			# POBTOT - P_3YMAS = P_0A2
			# --> P_0A2 = POBTOT - P_3YMAS
            blocks_values.P_0A2.fillna(blocks_values.POBTOT - blocks_values.P_3YMAS, inplace=True)
            blocks_values.P_0A2_F.fillna(blocks_values.POBFEM - blocks_values.P_3YMAS_F, inplace=True)
            blocks_values.P_0A2_M.fillna(blocks_values.POBMAS - blocks_values.P_3YMAS_M, inplace=True)
			# --> P_3YMAS = POBTOT - P_0A2
            blocks_values.P_3YMAS.fillna(blocks_values.POBTOT - blocks_values.P_0A2, inplace=True)
            blocks_values.P_3YMAS_F.fillna(blocks_values.POBFEM - blocks_values.P_0A2_F, inplace=True)
            blocks_values.P_3YMAS_M.fillna(blocks_values.POBMAS - blocks_values.P_0A2_M, inplace=True)
			# POBTOT - P_12YMAS = (P_0A2 + P_3A5 + P_6A11)
			# --> P_0A2 = POBTOT - P_12YMAS - P_3A5 - P_6A11
            blocks_values.P_0A2.fillna(blocks_values.POBTOT - blocks_values.P_12YMAS - blocks_values.P_3A5 - blocks_values.P_6A11, inplace=True)
            blocks_values.P_0A2_F.fillna(blocks_values.POBFEM - blocks_values.P_12YMAS_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F, inplace=True)
            blocks_values.P_0A2_M.fillna(blocks_values.POBMAS - blocks_values.P_12YMAS_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M, inplace=True)
    		# --> P_3A5 = POBTOT - P_12YMAS - P_0A2 - P_6A11
            blocks_values.P_3A5.fillna(blocks_values.POBTOT - blocks_values.P_12YMAS - blocks_values.P_0A2 - blocks_values.P_6A11, inplace=True)
            blocks_values.P_3A5_F.fillna(blocks_values.POBFEM - blocks_values.P_12YMAS_F - blocks_values.P_0A2_F - blocks_values.P_6A11_F, inplace=True)
            blocks_values.P_3A5_M.fillna(blocks_values.POBMAS - blocks_values.P_12YMAS_M - blocks_values.P_0A2_M - blocks_values.P_6A11_M, inplace=True)
			# --> P_6A11 = POBTOT - P_12YMAS - P_0A2 - P_3A5
            blocks_values.P_6A11.fillna(blocks_values.POBTOT - blocks_values.P_12YMAS - blocks_values.P_0A2 - blocks_values.P_3A5, inplace=True)
            blocks_values.P_6A11_F.fillna(blocks_values.POBFEM - blocks_values.P_12YMAS_F - blocks_values.P_0A2_F - blocks_values.P_3A5_F, inplace=True)
            blocks_values.P_6A11_M.fillna(blocks_values.POBMAS - blocks_values.P_12YMAS_M - blocks_values.P_0A2_M - blocks_values.P_3A5_M, inplace=True)
			# --> P_12YMAS = POBTOT - P_0A2 - P_3A5 -P_6A11
            blocks_values.P_12YMAS.fillna(blocks_values.POBTOT - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11, inplace=True)
            blocks_values.P_12YMAS_F.fillna(blocks_values.POBFEM - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F, inplace=True)
            blocks_values.P_12YMAS_M.fillna(blocks_values.POBMAS - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M, inplace=True)
			# POBTOT - P_15YMAS = (P_0A2 + P_3A5 + P_6A11 + P_12A14)
			# --> P_0A2 = POBTOT - P_15YMAS - P_3A5 - P_6A11 - P_12A14
            blocks_values.P_0A2.fillna(blocks_values.POBTOT - blocks_values.P_15YMAS - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_12A14, inplace=True)
            blocks_values.P_0A2_F.fillna(blocks_values.POBFEM - blocks_values.P_15YMAS_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F, inplace=True)
            blocks_values.P_0A2_M.fillna(blocks_values.POBMAS - blocks_values.P_15YMAS_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M, inplace=True)
			# --> P_3A5 = POBTOT - P_15YMAS - P_0A2 - P_6A11 - P_12A14
            blocks_values.P_3A5.fillna(blocks_values.POBTOT - blocks_values.P_15YMAS - blocks_values.P_0A2 - blocks_values.P_6A11 - blocks_values.P_12A14, inplace=True)
            blocks_values.P_3A5_F.fillna(blocks_values.POBFEM - blocks_values.P_15YMAS_F - blocks_values.P_0A2_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F, inplace=True)
            blocks_values.P_3A5_M.fillna(blocks_values.POBMAS - blocks_values.P_15YMAS_M - blocks_values.P_0A2_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M, inplace=True)
			# --> P_6A11 = POBTOT - P_15YMAS - P_0A2 - P_3A5 - P_12A14
            blocks_values.P_6A11.fillna(blocks_values.POBTOT - blocks_values.P_15YMAS - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_12A14, inplace=True)
            blocks_values.P_6A11_F.fillna(blocks_values.POBFEM - blocks_values.P_15YMAS_F - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_12A14_F, inplace=True)
            blocks_values.P_6A11_M.fillna(blocks_values.POBMAS - blocks_values.P_15YMAS_M - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_12A14_M, inplace=True)
			# --> P_12A14 = POBTOT - P_15YMAS - P_0A2 - P_3A5 - P_6A11
            blocks_values.P_12A14.fillna(blocks_values.POBTOT - blocks_values.P_15YMAS - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11, inplace=True)
            blocks_values.P_12A14_F.fillna(blocks_values.POBFEM - blocks_values.P_15YMAS_F - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F, inplace=True)
            blocks_values.P_12A14_M.fillna(blocks_values.POBMAS - blocks_values.P_15YMAS_M - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M, inplace=True)
            # --> P_15YMAS = POBTOT - P_0A2 - P_3A5 - P_6A11 - P_12A14
            blocks_values.P_15YMAS.fillna(blocks_values.POBTOT - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_12A14, inplace=True)
            blocks_values.P_15YMAS_F.fillna(blocks_values.POBFEM - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F, inplace=True)
            blocks_values.P_15YMAS_M.fillna(blocks_values.POBMAS - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M, inplace=True)
            # POBTOT - P_18YMAS = (P_0A2 + P_3A5 + P_6A11 + P_12A14 + P_15A17)
			# --> P_0A2 = POBTOT - P_18YMAS - P_3A5 - P_6A11 - P_12A14 - P_15A17
            blocks_values.P_0A2.fillna(blocks_values.POBTOT - blocks_values.P_18YMAS - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_12A14 - blocks_values.P_15A17, inplace=True)
            blocks_values.P_0A2_F.fillna(blocks_values.POBFEM - blocks_values.P_18YMAS_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F - blocks_values.P_15A17_F, inplace=True)
            blocks_values.P_0A2_M.fillna(blocks_values.POBMAS - blocks_values.P_18YMAS_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M - blocks_values.P_15A17_M, inplace=True)
			# --> P_3A5 = POBTOT - P_18YMAS - P_0A2 - P_6A11 - P_12A14 - P_15A17
            blocks_values.P_3A5.fillna(blocks_values.POBTOT - blocks_values.P_18YMAS - blocks_values.P_0A2 - blocks_values.P_6A11 - blocks_values.P_12A14 - blocks_values.P_15A17, inplace=True)
            blocks_values.P_3A5_F.fillna(blocks_values.POBFEM - blocks_values.P_18YMAS_F - blocks_values.P_0A2_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F - blocks_values.P_15A17_F, inplace=True)
            blocks_values.P_3A5_M.fillna(blocks_values.POBMAS - blocks_values.P_18YMAS_M - blocks_values.P_0A2_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M - blocks_values.P_15A17_M, inplace=True)
			# --> P_6A11 = POBTOT - P_18YMAS - P_0A2 - P_3A5 - P_12A14 - P_15A17
            blocks_values.P_6A11.fillna(blocks_values.POBTOT - blocks_values.P_18YMAS - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_12A14 - blocks_values.P_15A17, inplace=True)
            blocks_values.P_6A11_F.fillna(blocks_values.POBFEM - blocks_values.P_18YMAS_F - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_12A14_F - blocks_values.P_15A17_F, inplace=True)
            blocks_values.P_6A11_M.fillna(blocks_values.POBMAS - blocks_values.P_18YMAS_M - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_12A14_M - blocks_values.P_15A17_M, inplace=True)
			# --> P_12A14 = POBTOT - P_18YMAS - P_0A2 - P_3A5 - P_6A11 - P_15A17
            blocks_values.P_12A14.fillna(blocks_values.POBTOT - blocks_values.P_18YMAS - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_15A17, inplace=True)
            blocks_values.P_12A14_F.fillna(blocks_values.POBFEM - blocks_values.P_18YMAS_F - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F - blocks_values.P_15A17_F, inplace=True)
            blocks_values.P_12A14_M.fillna(blocks_values.POBMAS - blocks_values.P_18YMAS_M - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M - blocks_values.P_15A17_M, inplace=True)
			# --> P_15A17 = POBTOT - P_18YMAS - P_0A2 - P_3A5 - P_6A11 - P_12A14
            blocks_values.P_15A17.fillna(blocks_values.POBTOT - blocks_values.P_18YMAS - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_12A14, inplace=True)
            blocks_values.P_15A17_F.fillna(blocks_values.POBFEM - blocks_values.P_18YMAS_F - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F, inplace=True)
            blocks_values.P_15A17_M.fillna(blocks_values.POBMAS - blocks_values.P_18YMAS_M - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M, inplace=True)
			# --> P_18YMAS = POBTOT - P_0A2 - P_3A5 - P_6A11 - P_12A14 - P_15A17
            blocks_values.P_18YMAS.fillna(blocks_values.POBTOT - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_12A14 - blocks_values.P_15A17, inplace=True)
            blocks_values.P_18YMAS_F.fillna(blocks_values.POBFEM - blocks_values.P_0A2_F - blocks_values.P_3A5_F - blocks_values.P_6A11_F - blocks_values.P_12A14_F - blocks_values.P_15A17_F, inplace=True)
            blocks_values.P_18YMAS_M.fillna(blocks_values.POBMAS - blocks_values.P_0A2_M - blocks_values.P_3A5_M - blocks_values.P_6A11_M - blocks_values.P_12A14_M - blocks_values.P_15A17_M, inplace=True)

			# 2.3e) Set of complementary equations
			# REL_H_M = (POBMAS/POBFEM)*100
			# --> POBMAS = (REL_H_M/100) * POBFEM
            blocks_values.POBMAS.fillna(round((blocks_values.REL_H_M / 100) * blocks_values.POBFEM,0), inplace=True)
			# --> POBFEM = (POBMAS * 100) / REL_H_M
            blocks_values.POBFEM.fillna(round((blocks_values.POBMAS * 100) / blocks_values.REL_H_M,0), inplace=True)
			# POBTOT = POB0_14 + POB15_64 + POB65_MAS
			# --> POB0_14 = POBTOT - POB15_64 - POB65_MAS
            blocks_values.POB0_14.fillna(blocks_values.POBTOT - blocks_values.POB15_64 - blocks_values.POB65_MAS, inplace=True)
    		# --> POB15_64 = POBTOT - POB0_14 - POB65_MAS
            blocks_values.POB15_64.fillna(blocks_values.POBTOT - blocks_values.POB0_14 - blocks_values.POB65_MAS, inplace=True)
			# --> POB65_MAS = POBTOT - POB0_14 - POB15_64
            blocks_values.POB65_MAS.fillna(blocks_values.POBTOT - blocks_values.POB0_14 - blocks_values.POB15_64, inplace=True)
			# POB0_14 = P_0A2 + P_3A5 + P_6A11 + P_12A14
			# --> POB0_14 = P_0A2 + P_3A5 + P_6A11 + P_12A14
            blocks_values.POB0_14.fillna(blocks_values.P_0A2 + blocks_values.P_3A5 + blocks_values.P_6A11 + blocks_values.P_12A14, inplace=True)
    		# --> P_0A2 = POB0_14 - P_3A5 - P_6A11 - P_12A14
            blocks_values.P_0A2.fillna(blocks_values.POB0_14 - blocks_values.P_3A5 - blocks_values.P_6A11 - blocks_values.P_12A14, inplace=True)
			# --> P_3A5 = POB0_14 - P_0A2 - P_6A11 - P_12A14
            blocks_values.P_3A5.fillna(blocks_values.POB0_14 - blocks_values.P_0A2 - blocks_values.P_6A11 - blocks_values.P_12A14, inplace=True)
			# --> P_6A11 = POB0_14 - P_0A2 - P_3A5 - P_12A14
            blocks_values.P_6A11.fillna(blocks_values.POB0_14 - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_12A14, inplace=True)				
			# --> P_12A14 = POB0_14 - P_0A2 - P_3A5 - P_6A11
            blocks_values.P_12A14.fillna(blocks_values.POB0_14 - blocks_values.P_0A2 - blocks_values.P_3A5 - blocks_values.P_6A11, inplace=True) 
			# P_15YMAS = POBTOT - POB0_14
			# --> P_15YMAS = POBTOT - POB0_14
            blocks_values.P_15YMAS.fillna(blocks_values.POBTOT - blocks_values.POB0_14, inplace=True)
			# --> POB0_14 = POBTOT - P_15YMAS
            blocks_values.POB0_14.fillna(blocks_values.POBTOT - blocks_values.P_15YMAS, inplace=True)
			
			# ROUND FINISHING DATA
            finish_nan_values = blocks_values.isna().sum().sum()
            if extended_logs:
                print(f'Round {loop_count} Starting with {start_nan_values} nan values. Finishing with {finish_nan_values} nan values.')
            loop_count += 1
        nan_reduction = round(((1-(finish_nan_values/original_nan_values))*100),2)
        if extended_logs:
            print(f'Originally had {original_nan_values} nan values, now there are {finish_nan_values}. A {nan_reduction}% reduction.')
		
		# 2.3f) Join back blocks with all nan values
        blocks_calc = pd.concat([blocks_values,blocks_nans])
        
        return blocks_calc, blocks_values

        a = """
        
		# --------------- NaNs CALCULATION 4) FOR THE NAN VALUES THAT COULDN'T BE SOLVED, DISTRIBUTE AGEB VALUES.
		
		# 2.4a) Prepare for second loop
		# Remove masc/fem relation from analysis. 
		# It complicates this and further processes, when needed calculate using (REL_H_M = (POBMAS/POBFEM)*100)
		ageb_filling_cols = columns_of_interest.copy()
		ageb_filling_cols.remove('REL_H_M')
		blocks_calc.drop(columns=['REL_H_M'],inplace=True)

		if ageb not in missing_agebs:

			if extended_logs:
				print(f'Calculating NaNs using AGEB data for AGEB {ageb}.')

			# Locate AGEB data in pop_ageb_gdf
			ageb_gdf = pop_ageb_gdf.loc[pop_ageb_gdf['CVE_AGEB'] == ageb]

			# Solving method used to solve column
			solved_using_blocks = 0 # for log statistics
			solved_using_ageb = 0 # for log statistics
			
			# 2.4b) Fill with AGEB values.
			for col in ageb_filling_cols:
				# Find number of nan values in current col
				col_nan_values = blocks_calc.isna().sum()[col]

				# If there are no nan values left in col, pass.
				if col_nan_values == 0:
					solved_using_blocks += 1 # for log statistics
				
				# Elif there is only one value left, assign missing value directly to cell.
				elif col_nan_values == 1: 
					# Calculate missing value
					ageb_col_value = ageb_gdf[col].unique()[0]
					current_block_sum = blocks_calc[col].sum()
					missing_value = ageb_col_value - current_block_sum
					# Add missing value to na spot in column
					blocks_calc[col].fillna(missing_value,inplace=True)
					solved_using_ageb += 1 # for log statistics
				
				# Elif there are more than one nan in col, distribute using POBTOT of those blocks as distr. method.
				elif col_nan_values > 1:        
					# Locate rows with NaNs in current col
					idx = blocks_calc[col].isna()
					# Set distributing factor to 0
					blocks_calc['dist_factor'] = 0
					# Assign to those rows a distributing factor ==> (POBTOT of each row / sum of POBTOT of those rows)
					blocks_calc.loc[idx,'dist_factor'] = (blocks_calc['POBTOT']) / blocks_calc.loc[idx]['POBTOT'].sum()
					# Calculate missing value
					ageb_col_value = ageb_gdf[col].unique()[0]
					current_block_sum = blocks_calc[col].sum()
					missing_value = ageb_col_value - current_block_sum
					# Distribute missing value in those rows using POBTOT factor
					blocks_calc[col].fillna(missing_value * blocks_calc['dist_factor'], inplace=True)
					blocks_calc.drop(columns=['dist_factor'],inplace=True)
					solved_using_ageb += 1 # for log statistics

			# Logs Statistics - How was this AGEB solved?
			if extended_logs:
				pct_col_byblocks = (solved_using_blocks / len(ageb_filling_cols))*100
				pct_col_byagebs = (solved_using_ageb / len(ageb_filling_cols))*100
				print(f'{pct_col_byblocks}% of columns solved using block data only.')
				print(f'{pct_col_byagebs}% of columns required AGEB filling.')
		
			# Logs Statistics - Add currently examined AGEB statistics to log df
			acc_statistics.loc[i,'ageb'] = ageb
			# Percentage of NaNs found using blocks gdf
			acc_statistics.loc[i,'nans_calculated'] = nan_reduction
			# Columns which could be solved entirely using equations in block_gdf
			acc_statistics.loc[i,'block_calculated'] = solved_using_blocks
			# Columns which required AGEB filling
			acc_statistics.loc[i,'ageb_filling'] = solved_using_ageb
			# All could be solved, so
			acc_statistics.loc[i,'unable_to_solve'] = 0

		else: #current AGEB is in missing_agebs list (Present in mza_gdf, but not in ageb_gdf)
			if extended_logs:
				print(f"NANs on AGEB {ageb} cannot be calculated using AGEB data because it doesn't exist.")

			# Solving method used to solve column
			solved_using_blocks = 0 # for log statistics
			unable_tosolve = 0 # for log statistics
			
			# # Statistical Loop
			for col in ageb_filling_cols:
				# Find number of nan values in current col
				col_nan_values = blocks_calc.isna().sum()[col]
				# If there are no nan values left in col, pass.
				if col_nan_values == 0:
					solved_using_blocks += 1 # for log statistics
				else:
					unable_tosolve += 1 # for log statistics

			# Logs Statistics - How was this AGEB solved?
			if extended_logs:
				pct_col_byblocks = (solved_using_blocks / len(ageb_filling_cols))*100
				pct_col_notsolved = (unable_tosolve / len(ageb_filling_cols))*100
				print(f"{pct_col_byblocks}% of columns solved using block data only.")
				print(f"{pct_col_notsolved}% of columns couldn't be solved.")

			# Logs Statistics - Add currently examined AGEB statistics to log df
			acc_statistics.loc[i,'ageb'] = ageb
			# Percentage of NaNs found using blocks gdf
			acc_statistics.loc[i,'nans_calculated'] = nan_reduction
			# Columns which could be solved entirely using equations in block_gdf
			acc_statistics.loc[i,'block_calculated'] = solved_using_blocks
			# There wasn't AGEB filling, therefore:
			acc_statistics.loc[i,'ageb_filling'] = 0
			# Columns which couldn't be solved because there was no AGEB filling
			acc_statistics.loc[i,'unable_to_solve'] = unable_tosolve

		# --------------- NaNs CALCULATION 5) Return calculated data from this AGEB to original block gdf (mza_ageb_gdf)
		# 2.5a) Change original cols for calculated cols
		calculated_cols = ['POBTOT'] + ageb_filling_cols
		
		mza_ageb_gdf = mza_ageb_gdf.drop(columns=calculated_cols) #Drops current block pop cols
		mza_ageb_gdf = pd.merge(mza_ageb_gdf, blocks_calc, on='CVEGEO') #Replaces with blocks_calc cols

		# 2.5b) Restore original column order
		column_order = list(pop_mza_gdf.columns.values)
		mza_ageb_gdf = mza_ageb_gdf[column_order]

		# 2.5c) Save to mza_calc gdf (Function output)
		if i == 1:
			mza_calc = mza_ageb_gdf.copy()
		else:
			mza_calc = pd.concat([mza_calc,mza_ageb_gdf])

		i += 1

	# Format final output and release final log statistics.
	mza_calc.reset_index(inplace=True)
	mza_calc.drop(columns=['index'],inplace=True)

	# Delivers output cols as .lower()
	mza_calc.columns = mza_calc.columns.str.lower()

	print("Finished calculating NaNs.")
	print(f"Percentage of NaNs found using blocks gdf: {round(acc_statistics['nans_calculated'].mean(),2)}%.")
	print(f"Columns which could be solved entirely using equations in block_gdf: {acc_statistics['block_calculated'].sum()}.")
	print(f"Columns which required AGEB filling: {acc_statistics['ageb_filling'].sum()}.")
	print(f"Columns which couldn't be solved: {acc_statistics['unable_to_solve'].sum()}.")
	
	return mza_calc """

In [87]:
blocks_calc, blocks_values_org = calculate_censo_nan_values_v1(pop_ageb_gdf_chosen, pop_mza_gdf_chosen,extended_logs=True)

# Show
print(blocks_values_org.shape)
blocks_values_org.head(3)

No problem
STARTING NANs calculation.
----------------------------------------
Calculating NaNs for AGEB 0515 (1/1.)
Calculating NaNs. 10% done.
Calculating NaNs using block data for AGEB 0515.
Round 1 Starting with 268 nan values. Finishing with 75 nan values.
Round 2 Starting with 75 nan values. Finishing with 47 nan values.
Round 3 Starting with 47 nan values. Finishing with 47 nan values.
Originally had 268 nan values, now there are 47. A 82.46% reduction.
(44, 41)


Unnamed: 0,CVEGEO,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M,...,P_15YMAS,P_15YMAS_F,P_15YMAS_M,P_18YMAS,P_18YMAS_F,P_18YMAS_M,REL_H_M,POB0_14,POB15_64,POB65_MAS
444,100100010515001,179,117.0,62.0,6.0,4.0,2.0,9.0,7.0,2.0,...,148.0,95.0,53.0,142.0,92.0,50.0,52.99,31.0,114.0,34.0
445,100100010515002,176,95.0,81.0,4.0,1.0,3.0,2.0,1.0,1.0,...,157.0,86.0,71.0,153.0,83.0,70.0,85.26,19.0,123.0,34.0
446,100100010515003,81,47.0,34.0,4.0,4.0,0.0,3.0,1.0,2.0,...,71.0,41.0,30.0,69.0,40.0,29.0,72.34,10.0,60.0,11.0


In [11]:
original_data = pop_mza_gdf_chosen[['CVEGEO','POBTOT','POBFEM','POBMAS','P_0A2','P_0A2_F','P_0A2_M','P_3A5','P_3A5_F','P_3A5_M']]

# Show
print(original_data.shape)
original_data.head(3)

(48, 10)


Unnamed: 0,CVEGEO,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3A5,P_3A5_F,P_3A5_M
444,100100010515001,179,117,62,6,4.0,,9.0,7.0,
445,100100010515002,176,95,81,4,,3.0,,,
446,100100010515003,81,47,34,4,4.0,0.0,3.0,,


## Planteo de solución Chat GPT

### Simplificación del problema para ver si se puede resolver.

In [98]:
blocks_values_simplified = pd.DataFrame( {'P_0A2': [4, None, 3, 4,None],
                                          'P_0A2_F': [4, 5, None, 3,None],
                                          'P_0A2_M': [0, 2, 2, None,None]})
# Respuesta a los unknowns:
# 4 | 4 | 0
# (7) | 5 | 2
# 3 | (1) | 2
# 4 | 3 | (1)
# (10) | (5) | (5)
# Totales: 28 | 18 | 10

total_values = [28,18,10]
blocks_values_simplified

Unnamed: 0,P_0A2,P_0A2_F,P_0A2_M
0,4.0,4.0,0.0
1,,5.0,2.0
2,3.0,,2.0
3,4.0,3.0,
4,,,


In [107]:
blocks_values_simplified_filled

Unnamed: 0,P_0A2,P_0A2_F,P_0A2_M
0,4.0,4.0,0.0
1,7.0,5.0,2.0
2,3.0,1.0,2.0
3,4.0,3.0,1.0
4,7.0,1.0,1.0


In [163]:
from scipy.optimize import minimize

# Esta función recibe y opera en las rows de blocks.
def fill_nans(blocks):

    # Por ahora intentamos solo con el grupo de 0 a 2 años
    # Seleccionar solo las columnas relevantes
    relevant_columns = ['P_0A2', 'P_0A2_F', 'P_0A2_M']
    
    # Aislar la información de las columnas relevantes
    blocks_relevant = blocks[relevant_columns]

    # Cantidad de valores desconocidos (Si dejo la cantidad completa, crashea. 3 es otra).
    print(f"Valores desconocidos: {np.isnan(blocks_relevant.values).sum()}")
    unknown_values = np.isnan(blocks_relevant.values).sum()
    
    # Convertir las columnas relevantes a un tipo de datos numéricos
    blocks_relevant_numeric = blocks_relevant.apply(pd.to_numeric, errors='coerce')

    # Definir la función objetivo (Lo que queremos minimizar con la función minimize): la cantidad de NaNs.
    def objective_function(x, blocks_relevant_numeric):
        return np.isnan(blocks_relevant_numeric.values).sum()

    # Crear las restricciones (Requisitos a cumplir, se pasan a manera de lista de diccionarios)
    def create_constraints(total_values,blocks_relevant_numeric):

        constraints = []

        # EXPLICACIÓN DE LAS CONSTRAINTS: Las constraints se agregan en la función minimize como lista de diccionarios..
        # En cada diccionario de restricciones, 'type':'eq' significa que la restricción es de tipo igualdad. 
        # Esto indica que queremos que una función de igualdad (definida en 'fun') sea igual a cero.
        # La función de igualdad que se debe colocar en 'fun' se crea en las siguientes definiciones. Lo que va después del return debe ser igual a cero.

        #----- RESTRICCIONES QUE SON POR FILA -----
        # En este caso, la restricción debería asegurar que por fila (por .iterrows) P_0A2 sea igual a P_0A2_F + P_0A2_M.
        # El número de variables encontradas (i) corresponderá al número de nans que hay en este caso.

        i = 0
        # Lista de variables desconocidas encontradas (Necesario para las restricciones por columna)
        P_0A2_unknown_vars = []
        P_0A2_F_unknown_vars = []
        P_0A2_M_unknown_vars = []

        unknown_vars = {} # i:(index,col)
        
        for index, row in blocks_relevant_numeric.iterrows():
            P_0A2_val = row['P_0A2']
            print(P_0A2_val)
            P_0A2_F_val = row['P_0A2_F']
            print(P_0A2_F_val)
            P_0A2_M_val = row['P_0A2_M']
            print(P_0A2_M_val)

            # Orden de las variables
            # P_0A2_val - (P_0A2_F_val + P_0A2_M_val) --> x[0] - (x[1] + x[2])

            ### ---------- ---------- CASOS EN DONDE SE AGREGAN 0 VARIABLES DESCONOCIDAS. ---------- ----------
            # Case 0: All values are known (skip)
            if not (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                print("Case 0: Values complete")
                continue
                
            ### ---------- ---------- CASOS EN DONDE SE AGREGA 1 VARIABLE DESCONOCIDA. ---------- ----------
            # Case 1: Unknown P_0A2_val
            elif (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                print("Case 1: Unknown P_0A2_val")
                # Se crea el constraint
                def case_1_row_constraint(x, P_0A2_F_val, P_0A2_M_val,i):
                    print(f"Reading function x[{i}] - ({P_0A2_F_val} + {P_0A2_M_val}).")
                    return x[i] - (P_0A2_F_val + P_0A2_M_val)
                constraints.append({'type': 'eq', 
                                    'fun': case_1_row_constraint, 
                                    'args': (P_0A2_F_val, P_0A2_M_val,i)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                P_0A2_unknown_vars.append(i)
                unknown_vars[i] = (index,'P_0A2')
                print(f"appended {i}")
                
                i += 1

            # Case 2: Unknown P_0A2_F_val
            elif not (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                print("Case 2: Unknown P_0A2_F_val")
                # Se crea el constraint
                def case_2_row_constraint(x, P_0A2_val, P_0A2_M_val,i):
                    print(f"Reading function {P_0A2_val} - (x[{i}] + {P_0A2_M_val}).")
                    return P_0A2_val - (x[i] + P_0A2_M_val)
                constraints.append({'type': 'eq', 
                                    'fun': case_2_row_constraint, 
                                    'args': (P_0A2_val, P_0A2_M_val,i)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                P_0A2_F_unknown_vars.append(i)
                unknown_vars[i] = (index,'P_0A2_F')
                print(f"appended {i}")
                
                i += 1

            # Case 3: Unknown P_0A2_M_val
            elif not (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                print("Case 3: Unknown P_0A2_M_val")
                # Se crea el constraint
                def case_3_row_constraint(x, P_0A2_val, P_0A2_F_val,i):
                    print(f"Reading function {P_0A2_val} - ({P_0A2_F_val} + x[{i}]).")
                    return P_0A2_val - (P_0A2_F_val + x[i])
                constraints.append({'type': 'eq', 
                                    'fun': case_3_row_constraint, 
                                    'args': (P_0A2_val, P_0A2_F_val,i)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                unknown_vars[i] = (index,'P_0A2_M')
                P_0A2_M_unknown_vars.append(i)
                print(f"appended {i}")
                
                i += 1
                
            ### ---------- ---------- CASOS EN DONDE SE AGREGAN 2 VARIABLES DESCONOCIDAS. ---------- ----------
            # Case 4: Unknown P_0A2_F_val and P_0A2_M_val
            elif not (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                print("Case 4: Unknown P_0A2_F_val and P_0A2_M_val")
                # Se crea el constraint
                def case_4_row_constraint(x, P_0A2_val,i):
                    print(f"Reading function {P_0A2_val} - (x[{i}] + x[{i+1}]).")
                    return P_0A2_val - (x[i] + x[i+1])
                constraints.append({'type': 'eq', 
                                    'fun': case_4_row_constraint, 
                                    'args': (P_0A2_val,i)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                P_0A2_F_unknown_vars.append(i)
                unknown_vars[i] = (index,'P_0A2_F')
                print(f"appended {i}")
                
                P_0A2_M_unknown_vars.append(i+1)
                unknown_vars[i+1] = (index,'P_0A2_M')
                print(f"appended {i+1}")
                
                i += 2
                
            # Case 5: Unknown P_0A2_val and P_0A2_M_val
            elif (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                print("Case 5: Unknown P_0A2_val and P_0A2_M_val")
                # Se crea el constraint
                def case_5_row_constraint(x, P_0A2_F_val,i):
                    print(f"Reading function x[{i}] - ({P_0A2_F_val} + x[{i+1}]).")
                    return x[i] - (P_0A2_F_val + x[i+1])
                constraints.append({'type': 'eq', 
                                    'fun': case_5_row_constraint, 
                                    'args': (P_0A2_F_val,i)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                P_0A2_unknown_vars.append(i)
                unknown_vars[i] = (index,'P_0A2')
                print(f"appended {i}")
                
                P_0A2_M_unknown_vars.append(i+1)
                unknown_vars[i+1] = (index,'P_0A2_M')
                print(f"appended {i+1}")
                
                i += 2
                
            # Case 6: Unknown P_0A2_val and P_0A2_F_val
            elif (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                print("Case 6: Unknown P_0A2_val and P_0A2_F_val")
                # Se crea el constraint
                def case_6_row_constraint(x, P_0A2_M_val,i):
                    print(f"Reading function x[{i}] - (x[{i+1}] + {P_0A2_M_val}).")
                    return x[i] - (x[i+1] + P_0A2_M_val)
                constraints.append({'type': 'eq', 
                                    'fun': case_6_row_constraint, 
                                    'args': (P_0A2_M_val,i)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                P_0A2_unknown_vars.append(i)
                unknown_vars[i] = (index,'P_0A2')
                print(f"appended {i}")
                
                P_0A2_F_unknown_vars.append(i+1)
                unknown_vars[i+1] = (index,'P_0A2_F')
                print(f"appended {i+1}")
                
                i += 2
                
            ### ---------- ---------- CASOS EN DONDE SE AGREGAN 3 VARIABLES DESCONOCIDAS. ---------- ----------
            # Case 7: All unknown
            elif (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                print("Case 7: All unknown")
                # Se crea el constraint
                def case_7_row_constraint(x,i):
                    print(f"Reading function x[{i}] - (x[{i+1}] +  x[{i+2}]).")
                    return x[i] - (x[i+1] + x[i+2])
                constraints.append({'type': 'eq', 
                                    'fun': case_7_row_constraint,
                                    'args': (i,)})
                # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                P_0A2_unknown_vars.append(i)
                unknown_vars[i] = (index,'P_0A2')
                print(f"appended {i}")
                
                P_0A2_F_unknown_vars.append(i+1)
                unknown_vars[i+1] = (index,'P_0A2_F')
                print(f"appended {i+1}")
                
                P_0A2_M_unknown_vars.append(i+2)
                unknown_vars[i+2] = (index,'P_0A2_M')
                print(f"appended {i+2}")
                
                i += 3
            else:
                print("Error")

        print(f"Unknown variables found: {i}.")

        #----- RESTRICCIONES QUE SON POR COLUMNA -----
        # Restricción para P_0A2
        def constraint_total_P_0A2(x,blocks_relevant_numeric,P_0A2_unknown_vars,total_values):
            #Restricción: La suma de los valores actuales en la columna + los valores encontrados - el valor total conocido[1] debe ser igual a 0.
            print(f"Reading function for col P_0A2.")
            print(np.nansum(blocks_relevant_numeric['P_0A2']))
            print(P_0A2_unknown_vars)
            for i_value in P_0A2_unknown_vars:
                print(f"x[{i_value}]: {x[i_value]}.")
            print(np.nansum(x[P_0A2_unknown_vars]))
            print(total_values[0] )
            
            return np.nansum(blocks_relevant_numeric['P_0A2']) + np.nansum(x[P_0A2_unknown_vars]) - total_values[0] 
        constraints.append({'type': 'eq', 
                            'fun': constraint_total_P_0A2,
                            'args':(blocks_relevant_numeric,P_0A2_unknown_vars,total_values)})
        
        # Restricción para P_0A2_F
        def constraint_total_P_0A2_F(x,blocks_relevant_numeric,P_0A2_F_unknown_vars,total_values):
            #Restricción: La suma de los valores actuales en la columna + los valores encontrados - el valor total conocido[2] debe ser igual a 0.
            print(f"Reading function for col P_0A2_F.")
            return np.nansum(blocks_relevant_numeric['P_0A2_F']) + np.nansum(x[P_0A2_F_unknown_vars]) - total_values[1]
        constraints.append({'type': 'eq', 
                            'fun': constraint_total_P_0A2_F,
                            'args':(blocks_relevant_numeric,P_0A2_F_unknown_vars,total_values)})
        
        # Restricción para P_0A2_M
        def constraint_total_P_0A2_M(x,blocks_relevant_numeric,P_0A2_M_unknown_vars,total_values):
            #Restricción: La suma de los valores actuales en la columna + los valores encontrados - el valor total conocido[3] debe ser igual a 0.
            print(f"Reading function for col P_0A2_.")
            return np.nansum(blocks_relevant_numeric['P_0A2_M']) + np.nansum(x[P_0A2_M_unknown_vars]) - total_values[2]
        constraints.append({'type': 'eq', 
                            'fun': constraint_total_P_0A2_M,
                            'args':(blocks_relevant_numeric,P_0A2_M_unknown_vars,total_values)})
        
        return constraints, unknown_vars

    # Definir las restricciones (Correr lo anterior)
    constraints, unknown_vars = create_constraints(total_values, blocks_relevant_numeric)
    print(f"Constraints: {len(constraints)}.")

    # Initial guess - El número de elementos en initial_guess debe corresponder al número de variables de decisión en tu problema de optimización.
    # Está crasheando si dejo todas, por eso unknown_values = 3.
    initial_guess = np.zeros(unknown_values)

    # Resolver el problema de optimización
    print("Starting optimization using minimize.")
    i = 0
    resultado = minimize(objective_function, initial_guess, 
                         args=(blocks_relevant_numeric,),
                         constraints=constraints,
                         bounds=[(0, None)] * (unknown_values))
    print(resultado)

    # Preparación para remplazar nans
    filled_blocks = blocks_relevant_numeric.copy()
    # Reemplazar NaN con los valores óptimos encontrados
    for i in unknown_vars.keys():
        
        # Find unknown value location
        index = unknown_vars[i][0]
        print(f"index:{index}")
        
        col = unknown_vars[i][1]
        print(f"col:{col}")
        
        # Replace unknown value in location
        filled_blocks.loc[index,col] = list(resultado.x)[i]
        print(f"Result: {list(resultado.x)[i]}")

    #print({col: val for col, val in zip(blocks_relevant.columns, resultado.x)})
    #filled_blocks = blocks_relevant.fillna({col: val for col, val in zip(blocks_relevant.columns, resultado.x)})

    # Unir las columnas llenas con el resto de las columnas
    #filled_blocks_full = pd.concat([filled_blocks, blocks.drop(relevant_columns, axis=1)], axis=1)

    return filled_blocks, resultado

# Ejemplo de uso
filled_blocks, resultado = fill_nans(blocks_values_simplified)

Valores desconocidos: 6
4.0
4.0
0.0
Case 0: Values complete
nan
5.0
2.0
Case 1: Unknown P_0A2_val
appended 0
3.0
nan
2.0
Case 2: Unknown P_0A2_F_val
appended 1
4.0
3.0
nan
Case 3: Unknown P_0A2_M_val
appended 2
nan
nan
nan
Case 7: All unknown
appended 3
appended 4
appended 5
Unknown variables found: 6.
Constraints: 7.
Starting optimization using minimize.
Reading function x[0] - (5.0 + 2.0).
Reading function 3.0 - (x[1] + 2.0).
Reading function 4.0 - (3.0 + x[2]).
Reading function x[3] - (x[4] +  x[5]).
Reading function for col P_0A2.
11.0
[0, 3]
x[0]: 0.0.
x[3]: 0.0.
0.0
28
Reading function for col P_0A2_F.
Reading function for col P_0A2_.
Reading function x[0] - (5.0 + 2.0).
Reading function 3.0 - (x[1] + 2.0).
Reading function 4.0 - (3.0 + x[2]).
Reading function x[3] - (x[4] +  x[5]).
Reading function for col P_0A2.
11.0
[0, 3]
x[0]: 0.0.
x[3]: 0.0.
0.0
28
Reading function for col P_0A2_F.
Reading function for col P_0A2_.
Reading function x[0] - (5.0 + 2.0).
Reading function x[0] -

In [164]:
blocks_values_simplified

Unnamed: 0,P_0A2,P_0A2_F,P_0A2_M
0,4.0,4.0,0.0
1,,5.0,2.0
2,3.0,,2.0
3,4.0,3.0,
4,,,


In [165]:
filled_blocks

Unnamed: 0,P_0A2,P_0A2_F,P_0A2_M
0,4.0,4.0,0.0
1,0.0,5.0,2.0
2,3.0,0.0,2.0
3,4.0,3.0,0.0
4,0.0,0.0,0.0


In [12]:
from scipy.optimize import fsolve

### Chat GPT paso 1: Resolver para un grupo de edad, P_0A2 = P_0A2_F + P_0A2_M [DONE]

In [194]:
def calculate_censo_nan_values_v2(pop_ageb_gdf, pop_mza_gdf,extended_logs=False):
    ##########################################################################################
	# STEP 1: CHECK FOR DIFFERENCES IN AVAILABLE AGEBs (PREVENTS CRASH)

	# --------------- 1.1 SET COLUMNS TO .UPPER() EXCEPT FOR GEOMETRY
	# (When the equations were written, we used UPPER names, easier to change it this way and then return output with .lower columns)
    pop_ageb_gdf.columns = pop_ageb_gdf.columns.str.upper()
    pop_ageb_gdf.rename(columns={'GEOMETRY':'geometry'},inplace=True)
    
    pop_mza_gdf.columns = pop_mza_gdf.columns.str.upper()
    pop_mza_gdf.rename(columns={'GEOMETRY':'geometry'},inplace=True)

	# --------------- 1.2 CHECK FOR DIFFERENCES IN AGEBs
	# Look for AGEBs in both gdfs
    agebs_in_ageb_gdf = list(pop_ageb_gdf['CVE_AGEB'].unique())
    agebs_in_mza_gdf = list(pop_mza_gdf['CVE_AGEB'].unique())
    
    if (len(agebs_in_ageb_gdf) == 0) and (len(agebs_in_mza_gdf) == 0):
        print("Error: Area of interest has no pop data.")
        intended_crash

	# Test for AGEBs present in mza_gdf but not in AGEB_gdf
    missing_agebs = list(set(agebs_in_mza_gdf) - set(agebs_in_ageb_gdf))
    if len(missing_agebs) > 0:
        print(f'WARNING: AGEBs {missing_agebs} present in mza_gdf but missing from ageb_gdf.')
        print(f'WARNING: Removing AGEBs {missing_agebs} from AGEB analysis.')
    else:
        print("No problem")
    
	##########################################################################################
	# STEP 2: CALCULATE NAN VALUES
    print("STARTING NANs calculation.")

	# STATISTICS - LOG DATA
	# Will create progress logs when progress reaches these percentages:
    progress_logs = [10,20,30,40,50,60,70,80,90,100]
	# This df stores accumulative (All AGEBs) statistics for logs.
    acc_statistics = pd.DataFrame()

	# --------------- NaNs CALCULATION 2.0) Start
    i = 1
    for ageb in agebs_in_mza_gdf: # Most of the code of this function iterates over each AGEB
        if extended_logs:
            print('--'*20)
            print(f'Calculating NaNs for AGEB {ageb} ({i}/{len(agebs_in_mza_gdf)}.)')
		
		# STATISTICS - PROGRESS LOG DATA
		# Measures current progress, prints if passed a checkpoint of progress_logs list.
        current_progress = (i / len(agebs_in_mza_gdf))*100
        for checkpoint in progress_logs:
            if current_progress >= checkpoint:
                print(f'Calculating NaNs. {checkpoint}% done.')
                progress_logs.remove(checkpoint)
                break
        
		# --------------- NaNs CALCULATION 2.1) FIND CURRENT AGEB BLOCK DATA
        mza_ageb_gdf = pop_mza_gdf.loc[pop_mza_gdf['CVE_AGEB'] == ageb].copy()
        
		# --------------- NaNs CALCULATION 2.2) KEEP OUT OF THE PROCESS ROWS WHICH HAVE 0 VALUES (ALL values are NaNs)
		# 2.2a) Set columns to be analysed
        columns_of_interest = ['POBFEM','POBMAS',
							'P_0A2','P_0A2_F','P_0A2_M',
							'P_3A5','P_3A5_F','P_3A5_M',
							'P_6A11','P_6A11_F','P_6A11_M',
							'P_12A14','P_12A14_F','P_12A14_M',
							'P_15A17','P_15A17_F','P_15A17_M',
							'P_18A24','P_18A24_F','P_18A24_M',
							'P_60YMAS','P_60YMAS_F','P_60YMAS_M',
							'P_3YMAS','P_3YMAS_F','P_3YMAS_M',
							'P_12YMAS','P_12YMAS_F','P_12YMAS_M',
							'P_15YMAS','P_15YMAS_F','P_15YMAS_M',
							'P_18YMAS','P_18YMAS_F','P_18YMAS_M',
							'REL_H_M','POB0_14','POB15_64','POB65_MAS']
        blocks = mza_ageb_gdf[['CVEGEO','POBTOT'] + columns_of_interest].copy()
		
		# 2.2b) Set found values to 0
        blocks['found_values'] = 0
		
		# 2.2c) Find rows with nan values and sum of nan values
        for col in columns_of_interest:
			# Turn to numeric
            blocks[col] = pd.to_numeric(blocks[col])
			# Set checker column to 'exist' (1)
            blocks[f'check_{col}'] = 1
			# If it doesn't exist, set that row's check to (0)
            idx = blocks[col].isna()
            blocks.loc[idx, f'check_{col}'] = 0
			# Sum total row nan values
            blocks['found_values'] = blocks['found_values'] + blocks[f'check_{col}']
			# Drop checker column
            blocks.drop(columns=[f'check_{col}'],inplace=True)
		
		# 2.2d) Loc rows with values in columns_of_interest (Can calculate NaNs)
        blocks_values = blocks.loc[blocks['found_values'] > 0].copy()
        blocks_values.drop(columns=['found_values'],inplace=True)
		
		# 2.2e) Save rows with 0 values for later. (Can't calculate NaNs, must distribute values).
        blocks_nans = blocks.loc[blocks['found_values'] == 0].copy()
        blocks_nans.drop(columns=['found_values'],inplace=True)
        
        del blocks
		
		# --------------- NaNs CALCULATION 3) CALCULATE NaN values in blocks
        if extended_logs:
            print(f'Calculating NaNs using block data for AGEB {ageb}.')
        
        # 2.3a) Count current (original) nan values
        original_nan_values = int(blocks_values.isna().sum().sum())

        ##############################################################################################################################################
        #--------------------------------------------------------------------------------------------------------------------------------------------#
        #----------------------------------------------------Finding a way to calculate all nan values ----------------------------------------------#
        #--------------------------------------------------------------- USING CHAT GPT -------------------------------------------------------------#
        #-------------------------------------------------------------- WORK IN PROGRESS ------------------------------------------------------------#
        #--------------------------------------------------------------------------------------------------------------------------------------------#
        ##############################################################################################################################################

        # Valores conocidos (Totales del AGEB)
        ageb_ageb_gdf = pop_ageb_gdf.loc[pop_ageb_gdf['CVE_AGEB'] == ageb].copy()
        
        P_0A2_tot = ageb_ageb_gdf['P_0A2'].unique()[0]  # Total de la población de 0 a 2 años
        P_0A2_F_tot = ageb_ageb_gdf['P_0A2_F'].unique()[0]  # Total de la población femenina de 0 a 2 años
        P_0A2_M_tot = ageb_ageb_gdf['P_0A2_M'].unique()[0]  # Total de la población masculina de 0 a 2 años
        
        chat_gpt_first_sol = """
        def equations(vars, *args):
            P_0A2, P_0A2_F, P_0A2_M = vars
            P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot = args
            
            # Definir las ecuaciones basadas en las relaciones dadas
            eq1 = P_0A2 - (P_0A2_F + P_0A2_M)
            eq2 = P_0A2 - P_0A2_tot
            eq3 = P_0A2_F - P_0A2_F_tot
            eq4 = P_0A2_M - P_0A2_M_tot
            
            return [eq1, eq2, eq3]
        
        # Estimaciones iniciales para P_0A2, P_0A2_F, P_0A2_M (pueden ser cualquier valor)
        initial_guess = [250, 125, 125]
        
        # Resolver las ecuaciones
        result = fsolve(equations, initial_guess, args=(P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot))
        
        # Asignar los valores encontrados de nuevo al DataFrame
        blocks_values.loc[:, 'P_0A2'] = result[0]
        blocks_values.loc[:, 'P_0A2_F'] = result[1]
        blocks_values.loc[:, 'P_0A2_M'] = result[2]
        """
        
        chat_gpt_second_sol = """
        def equations(vars, *args):
            P_0A2, P_0A2_F, P_0A2_M = vars
            P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot = args
            
            # Definir las ecuaciones basadas en las relaciones dadas
            eq1 = P_0A2 - (P_0A2_F + P_0A2_M)
            eq2 = P_0A2 - P_0A2_tot
            eq3 = P_0A2_F - P_0A2_F_tot
            eq4 = P_0A2_M - P_0A2_M_tot
                    
            return [eq1, eq2, eq3]
        
        def solve_equations(row):
            # Solo resolver ecuaciones si hay valores desconocidos (NaN)
            if np.isnan(row['P_0A2']) or np.isnan(row['P_0A2_F']) or np.isnan(row['P_0A2_M']):
                # Estimaciones iniciales para P_0A2, P_0A2_F, P_0A2_M (pueden ser cualquier valor)
                initial_guess = [250, 125, 125]
        
                # Resolver las ecuaciones
                result = fsolve(equations, initial_guess, args=(P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot))
                
                # Asignar los valores encontrados solo a las filas con NaN
                row['P_0A2'] = result[0] if np.isnan(row['P_0A2']) else row['P_0A2']
                row['P_0A2_F'] = result[1] if np.isnan(row['P_0A2_F']) else row['P_0A2_F']
                row['P_0A2_M'] = result[2] if np.isnan(row['P_0A2_M']) else row['P_0A2_M']
            
            return row
        
        # Aplicar la función solve_equations a cada fila del DataFrame
        blocks_values = blocks_values.apply(solve_equations, axis=1)

        return blocks_values"""
         
        chat_gpt_third_solution = """
        def equations(vars, *args):
            P_0A2, P_0A2_F, P_0A2_M = vars
            P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot = args
            
            # Definir las ecuaciones basadas en las relaciones dadas
            eq1 = P_0A2 - (P_0A2_F + P_0A2_M)
            eq2 = P_0A2 - P_0A2_tot
            eq3 = P_0A2_F - P_0A2_F_tot
            eq4 = P_0A2_M - P_0A2_M_tot
                        
            return [eq1, eq2, eq3]
        
        def solve_equations(row, P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot):
            # Solo resolver ecuaciones si hay valores desconocidos (NaN)
            if np.isnan(row['P_0A2']) or np.isnan(row['P_0A2_F']) or np.isnan(row['P_0A2_M']):
                # Estimaciones iniciales para P_0A2, P_0A2_F, P_0A2_M (pueden ser cualquier valor)
                initial_guess = [250, 125, 125]
        
                # Resolver las ecuaciones
                result = fsolve(equations, initial_guess, args=(P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot))
                
                # Asignar los valores encontrados solo a las filas con NaN
                row['P_0A2'] = result[0] if np.isnan(row['P_0A2']) else row['P_0A2']
                row['P_0A2_F'] = result[1] if np.isnan(row['P_0A2_F']) else row['P_0A2_F']
                row['P_0A2_M'] = result[2] if np.isnan(row['P_0A2_M']) else row['P_0A2_M']
            
            return row
        
        # Aplicar la función solve_equations a cada fila del DataFrame
        blocks_values = blocks_values.apply(solve_equations, axis=1, args=(P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot), )

        return blocks_values"""
        
        chat_gpt_fourth_solution = """
        from scipy.optimize import minimize
        
        def fill_nans(blocks):
            # Seleccionar solo las columnas relevantes
            relevant_columns = ['P_0A2', 'P_0A2_F', 'P_0A2_M']
            blocks_relevant = blocks[relevant_columns]
        
            # Convertir las columnas relevantes a un tipo de datos numéricos
            blocks_relevant_numeric = blocks_relevant.apply(pd.to_numeric, errors='coerce')
        
            # Definir la función objetivo y las restricciones
            def objective_function(x, blocks_relevant_numeric):
                return np.isnan(blocks_relevant_numeric.values).sum()
        
            def constraint1(x):
                return x[0] - (x[1] + x[2])
        
            def constraint2(x):
                return x[0] - 71
        
            def constraint3(x):
                return x[1] - 37
        
            def constraint4(x):
                return x[2] - 34
        
            # Resolver el problema de optimización
            initial_guess = np.zeros(3)
            resultado = minimize(objective_function, initial_guess, args=(blocks_relevant_numeric,),
                                 constraints=[{'type': 'eq', 'fun': constraint1},
                                              {'type': 'eq', 'fun': constraint2},
                                              {'type': 'eq', 'fun': constraint3},
                                              {'type': 'eq', 'fun': constraint4}],
                                 bounds=[(0, None), (0, None), (0, None)])
            
            # Reemplazar NaN con los valores óptimos encontrados
            filled_blocks = blocks_relevant.fillna({col: val for col, val in zip(blocks_relevant.columns, resultado.x)})
            
            # Unir las columnas llenas con el resto de las columnas
            filled_blocks_full = pd.concat([filled_blocks, blocks.drop(relevant_columns, axis=1)], axis=1)
            
            return filled_blocks_full
        
        # Ejemplo de uso
        blocks_values_filled = fill_nans(blocks_values)

        return blocks_values_filled"""

        chat_gpt_fifth_solution = """
        from scipy.optimize import minimize
        
        def fill_nans(blocks):
            # Seleccionar solo las columnas relevantes
            relevant_columns = ['P_0A2', 'P_0A2_F', 'P_0A2_M']
            blocks_relevant = blocks[relevant_columns]

            # Convertir las columnas relevantes a un tipo de datos numéricos
            blocks_relevant_numeric = blocks_relevant.apply(pd.to_numeric, errors='coerce')

            # Encontrar los valores totales por columna conocidos (por AGEB)
            total_values = [ageb_ageb_gdf[relevant_columns[0]].unique()[0],
                            ageb_ageb_gdf[relevant_columns[1]].unique()[0],
                            ageb_ageb_gdf[relevant_columns[2]].unique()[0]]
        
            # Definir la función objetivo (Lo que queremos minimizar con la función minimize): la cantidad de NaNs.
            def objective_function(x, blocks_relevant_numeric):
                return np.isnan(blocks_relevant_numeric.values).sum()

            # Crear los constraints (Requisitos a cumplir, se pasan a manera de lista de diccionarios)
            def create_constraints(total_values):
                constraints = []
                
                # Restricción para P_0A2 = P_0A2_F + P_0A2_M por fila
                def row_constraint(x):
                    return x[0] - (x[1] + x[2])
                constraints.append({'type': 'eq', 'fun': row_constraint})
                # En el diccionario de restricciones, 'type':'eq' significa que la restricción es de tipo igualdad. 
                # Esto indica que queremos que una función de igualdad (definida en 'fun') sea igual a cero. 
                # La función constraint contiene la definición de la restricción. En este caso, cada restricción asegura que 
                # la suma de los valores en una columna específica sea igual al valor total dado.
        
                # Restricciones para verificar que la suma total de cada columna sea igual al valor esperado
                # Restricción para P_0A2
                def constraint_total_P_0A2(x):
                    #Restricción: La suma de los valores comprendidos del primero al primer tercio del array (Columnas 1 de 3) - el valor total conocido[0] debe ser igual a 0.
                    return np.nansum(x[:len(x)//3]) - total_values[0] 
                constraints.append({'type': 'eq', 'fun': constraint_total_P_0A2})
            
                # Restricción para P_0A2_F
                def constraint_total_P_0A2_F(x):
                    #Restricción: La suma de los valores comprendidos del primer tercio del array al segundo tercio del array (Columna 2 de 3) - el valor total conocido[1] debe ser igual a 0.
                    return np.nansum(x[len(x)//3:2*len(x)//3]) - total_values[1]
                constraints.append({'type': 'eq', 'fun': constraint_total_P_0A2_F})
            
                # Restricción para P_0A2_M
                def constraint_total_P_0A2_M(x):
                    #Restricción: La suma de los valores comprendidos del segundo tercio del array al final del array (Columna 3 de 3) - el valor total conocido[2] debe ser igual a 0.
                    return np.nansum(x[2*len(x)//3:]) - total_values[2]
                constraints.append({'type': 'eq', 'fun': constraint_total_P_0A2_M})
            
                return constraints

            # Definir las restricciones
            constraints = create_constraints(total_values)
        
            # Initial guess - El número de elementos en initial_guess debe corresponder al 
            # número de variables de decisión en tu problema de optimización.  
            initial_guess = np.zeros(len(relevant_columns))

            # Función que regresa la solución actual (para logs)
            def callback_function(xk):
                print("Current solution:", xk)

            # Resolver el problema de optimización
            resultado = minimize(objective_function, 
                                 initial_guess, 
                                 args=(blocks_relevant_numeric,),
                                 constraints=constraints,
                                # Los límites (bounds) especifican los límites inferiores y superiores permitidos 
                                # para las variables de optimización. En este caso, cada variable de optimización 
                                # se refiere a los valores que se están ajustando para minimizar la función objetivo. (Relevant columns)
                                # El valor (0, None) indica que la variable puede ser cualquier valor mayor o igual a cero, 
                                # sin límite superior. En otras palabras, no hay límite superior para los valores de las variables.
                                 bounds=[(0, None)] * len(relevant_columns),
                                # El argumento Callback es para generar logs
                                 callback=callback_function)

            print("Result:")
            print(resultado)
            
            # Reemplazar NaN con los valores óptimos encontrados
            filled_blocks = blocks_relevant.fillna({col: val for col, val in zip(blocks_relevant.columns, resultado.x)})
            
            # Unir las columnas llenas con el resto de las columnas
            filled_blocks_full = pd.concat([filled_blocks, blocks.drop(relevant_columns, axis=1)], axis=1)
            
            return filled_blocks_full, resultado
        
        # Ejemplo de uso
        filled_blocks_full, resultado = fill_nans(blocks_values)

        return filled_blocks_full, resultado""" 

        # Hasta este punto he llegado, aún no funciona.
        # El problema por ahora es que, ya que las funciones de los constraints de [https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html] deben ser callables,
        # cada constraint debe de tener una función callable única (Para cada caso específico, por ejemplo en una fila falta P_0A2_F, es necesario tener una función que relacione esa variable faltante con el resto.)
        # Es decir, imposible.
        
        from scipy.optimize import minimize
        
        # Esta función recibe y opera en las rows de blocks.
        def fill_nans(blocks):
        
            # Por ahora intentamos solo con el grupo de 0 a 2 años
            # Seleccionar solo las columnas relevantes
            relevant_columns = ['P_0A2', 'P_0A2_F', 'P_0A2_M']
        
            # Encontrar los valores totales por columna conocidos (por AGEB)
            total_values = [ageb_ageb_gdf[relevant_columns[0]].unique()[0],
                            ageb_ageb_gdf[relevant_columns[1]].unique()[0],
                            ageb_ageb_gdf[relevant_columns[2]].unique()[0]]
            
            # Aislar la información de las columnas relevantes
            blocks_relevant = blocks[relevant_columns]
        
            # Cantidad de valores desconocidos (Si dejo la cantidad completa, crashea. 3 es otra).
            print(f"Valores desconocidos: {np.isnan(blocks_relevant.values).sum()}")
            unknown_values = np.isnan(blocks_relevant.values).sum()
            
            # Convertir las columnas relevantes a un tipo de datos numéricos
            blocks_relevant_numeric = blocks_relevant.apply(pd.to_numeric, errors='coerce')
        
            # Definir la función objetivo (Lo que queremos minimizar con la función minimize): la cantidad de NaNs.
            def objective_function(x, blocks_relevant_numeric):
                return np.isnan(blocks_relevant_numeric.values).sum()
        
            # Crear las restricciones (Requisitos a cumplir, se pasan a manera de lista de diccionarios)
            def create_constraints(total_values,blocks_relevant_numeric):
        
                constraints = []
        
                # EXPLICACIÓN DE LAS CONSTRAINTS: Las constraints se agregan en la función minimize como lista de diccionarios..
                # En cada diccionario de restricciones, 'type':'eq' significa que la restricción es de tipo igualdad. 
                # Esto indica que queremos que una función de igualdad (definida en 'fun') sea igual a cero.
                # La función de igualdad que se debe colocar en 'fun' se crea en las siguientes definiciones. Lo que va después del return debe ser igual a cero.
        
                #----- RESTRICCIONES QUE SON POR FILA -----
                # En este caso, la restricción debería asegurar que por fila (por .iterrows) P_0A2 sea igual a P_0A2_F + P_0A2_M.
                # El número de variables encontradas (i) corresponderá al número de nans que hay en este caso.
        
                i = 0
                # Lista de variables desconocidas encontradas (Necesario para las restricciones por columna)
                P_0A2_unknown_vars = []
                P_0A2_F_unknown_vars = []
                P_0A2_M_unknown_vars = []
        
                unknown_vars = {} # i:(index,col)
                
                for index, row in blocks_relevant_numeric.iterrows():
                    P_0A2_val = row['P_0A2']
                    #print(P_0A2_val) #Verification log
                    P_0A2_F_val = row['P_0A2_F']
                    #print(P_0A2_F_val) #Verification log
                    P_0A2_M_val = row['P_0A2_M']
                    #print(P_0A2_M_val) #Verification log
        
                    # Orden de las variables
                    # P_0A2_val - (P_0A2_F_val + P_0A2_M_val) --> x[0] - (x[1] + x[2])
        
                    ### ---------- ---------- CASOS EN DONDE SE AGREGAN 0 VARIABLES DESCONOCIDAS. ---------- ----------
                    # Case 0: All values are known (skip)
                    if not (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                        #print("Case 0: Values complete") #Verification log
                        continue
                        
                    ### ---------- ---------- CASOS EN DONDE SE AGREGA 1 VARIABLE DESCONOCIDA. ---------- ----------
                    # Case 1: Unknown P_0A2_val
                    elif (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                        #print("Case 1: Unknown P_0A2_val") #Verification log
                        # Se crea el constraint
                        def case_1_row_constraint(x, P_0A2_F_val, P_0A2_M_val,i):
                            # print(f"Reading function x[{i}] - ({P_0A2_F_val} + {P_0A2_M_val}).")#Verification log
                            return x[i] - (P_0A2_F_val + P_0A2_M_val)
                        constraints.append({'type': 'eq', 
                                            'fun': case_1_row_constraint, 
                                            'args': (P_0A2_F_val, P_0A2_M_val,i)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        P_0A2_unknown_vars.append(i)
                        unknown_vars[i] = (index,'P_0A2')
                        print(f"appended {i}") #Verification log
                        
                        i += 1
        
                    # Case 2: Unknown P_0A2_F_val
                    elif not (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                        #print("Case 2: Unknown P_0A2_F_val") #Verification log
                        # Se crea el constraint
                        def case_2_row_constraint(x, P_0A2_val, P_0A2_M_val,i):
                            #print(f"Reading function {P_0A2_val} - (x[{i}] + {P_0A2_M_val}).") #Verification log
                            return P_0A2_val - (x[i] + P_0A2_M_val)
                        constraints.append({'type': 'eq', 
                                            'fun': case_2_row_constraint, 
                                            'args': (P_0A2_val, P_0A2_M_val,i)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        P_0A2_F_unknown_vars.append(i)
                        unknown_vars[i] = (index,'P_0A2_F')
                        print(f"appended {i}") #Verification log
                        
                        i += 1
        
                    # Case 3: Unknown P_0A2_M_val
                    elif not (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                        #print("Case 3: Unknown P_0A2_M_val") #Verification log
                        # Se crea el constraint
                        def case_3_row_constraint(x, P_0A2_val, P_0A2_F_val,i):
                            #print(f"Reading function {P_0A2_val} - ({P_0A2_F_val} + x[{i}]).") #Verification log
                            return P_0A2_val - (P_0A2_F_val + x[i])
                        constraints.append({'type': 'eq', 
                                            'fun': case_3_row_constraint, 
                                            'args': (P_0A2_val, P_0A2_F_val,i)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        unknown_vars[i] = (index,'P_0A2_M')
                        P_0A2_M_unknown_vars.append(i)
                        print(f"appended {i}") #Verification log
                        
                        i += 1
                        
                    ### ---------- ---------- CASOS EN DONDE SE AGREGAN 2 VARIABLES DESCONOCIDAS. ---------- ----------
                    # Case 4: Unknown P_0A2_F_val and P_0A2_M_val
                    elif not (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                        #print("Case 4: Unknown P_0A2_F_val and P_0A2_M_val") #Verification log
                        # Se crea el constraint
                        def case_4_row_constraint(x, P_0A2_val,i):
                            #print(f"Reading function {P_0A2_val} - (x[{i}] + x[{i+1}]).") #Verification log
                            return P_0A2_val - (x[i] + x[i+1])
                        constraints.append({'type': 'eq', 
                                            'fun': case_4_row_constraint, 
                                            'args': (P_0A2_val,i)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        P_0A2_F_unknown_vars.append(i)
                        unknown_vars[i] = (index,'P_0A2_F')
                        print(f"appended {i}") #Verification log
                        
                        P_0A2_M_unknown_vars.append(i+1)
                        unknown_vars[i+1] = (index,'P_0A2_M')
                        print(f"appended {i+1}") #Verification log
                        
                        i += 2
                        
                    # Case 5: Unknown P_0A2_val and P_0A2_M_val
                    elif (np.isnan(P_0A2_val)) and not (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                        #print("Case 5: Unknown P_0A2_val and P_0A2_M_val") #Verification log
                        # Se crea el constraint
                        def case_5_row_constraint(x, P_0A2_F_val,i):
                            #print(f"Reading function x[{i}] - ({P_0A2_F_val} + x[{i+1}]).") #Verification log
                            return x[i] - (P_0A2_F_val + x[i+1])
                        constraints.append({'type': 'eq', 
                                            'fun': case_5_row_constraint, 
                                            'args': (P_0A2_F_val,i)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        P_0A2_unknown_vars.append(i)
                        unknown_vars[i] = (index,'P_0A2')
                        print(f"appended {i}") #Verification log
                        
                        P_0A2_M_unknown_vars.append(i+1)
                        unknown_vars[i+1] = (index,'P_0A2_M')
                        print(f"appended {i+1}") #Verification log
                        
                        i += 2
                        
                    # Case 6: Unknown P_0A2_val and P_0A2_F_val
                    elif (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and not (np.isnan(P_0A2_M_val)):
                        #print("Case 6: Unknown P_0A2_val and P_0A2_F_val") #Verification log
                        # Se crea el constraint
                        def case_6_row_constraint(x, P_0A2_M_val,i):
                            #print(f"Reading function x[{i}] - (x[{i+1}] + {P_0A2_M_val}).") #Verification log
                            return x[i] - (x[i+1] + P_0A2_M_val)
                        constraints.append({'type': 'eq', 
                                            'fun': case_6_row_constraint, 
                                            'args': (P_0A2_M_val,i)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        P_0A2_unknown_vars.append(i)
                        unknown_vars[i] = (index,'P_0A2')
                        print(f"appended {i}") #Verification log
                        
                        P_0A2_F_unknown_vars.append(i+1)
                        unknown_vars[i+1] = (index,'P_0A2_F')
                        print(f"appended {i+1}") #Verification log
                        
                        i += 2
                        
                    ### ---------- ---------- CASOS EN DONDE SE AGREGAN 3 VARIABLES DESCONOCIDAS. ---------- ----------
                    # Case 7: All unknown
                    elif (np.isnan(P_0A2_val)) and (np.isnan(P_0A2_F_val)) and (np.isnan(P_0A2_M_val)):
                        #print("Case 7: All unknown") #Verification log
                        # Se crea el constraint
                        def case_7_row_constraint(x,i):
                            #print(f"Reading function x[{i}] - (x[{i+1}] +  x[{i+2}]).") #Verification log
                            return x[i] - (x[i+1] + x[i+2])
                        constraints.append({'type': 'eq', 
                                            'fun': case_7_row_constraint,
                                            'args': (i,)})
                        # Se registra la posición de la variable en la columna faltante correspondiente y se incrementa i (next case)
                        P_0A2_unknown_vars.append(i)
                        unknown_vars[i] = (index,'P_0A2')
                        print(f"appended {i}") #Verification log
                        
                        P_0A2_F_unknown_vars.append(i+1)
                        unknown_vars[i+1] = (index,'P_0A2_F')
                        print(f"appended {i+1}") #Verification log
                        
                        P_0A2_M_unknown_vars.append(i+2)
                        unknown_vars[i+2] = (index,'P_0A2_M')
                        print(f"appended {i+2}") #Verification log
                        
                        i += 3
                    else:
                        print("Error")
        
                print(f"Unknown variables found: {i}.")
        
                #----- RESTRICCIONES QUE SON POR COLUMNA -----
                # Restricción para P_0A2
                def constraint_total_P_0A2(x,blocks_relevant_numeric,P_0A2_unknown_vars,total_values):
                    #Restricción: La suma de los valores actuales en la columna + los valores encontrados - el valor total conocido[1] debe ser igual a 0.
                    #print(f"Reading function for col P_0A2.") #Verification log
                    return np.nansum(blocks_relevant_numeric['P_0A2']) + np.nansum(x[P_0A2_unknown_vars]) - total_values[0] 
                constraints.append({'type': 'eq', 
                                    'fun': constraint_total_P_0A2,
                                    'args':(blocks_relevant_numeric,P_0A2_unknown_vars,total_values)})
                
                # Restricción para P_0A2_F
                def constraint_total_P_0A2_F(x,blocks_relevant_numeric,P_0A2_F_unknown_vars,total_values):
                    #Restricción: La suma de los valores actuales en la columna + los valores encontrados - el valor total conocido[2] debe ser igual a 0.
                    #print(f"Reading function for col P_0A2_F.") #Verification log
                    return np.nansum(blocks_relevant_numeric['P_0A2_F']) + np.nansum(x[P_0A2_F_unknown_vars]) - total_values[1]
                constraints.append({'type': 'eq', 
                                    'fun': constraint_total_P_0A2_F,
                                    'args':(blocks_relevant_numeric,P_0A2_F_unknown_vars,total_values)})
                
                # Restricción para P_0A2_M
                def constraint_total_P_0A2_M(x,blocks_relevant_numeric,P_0A2_M_unknown_vars,total_values):
                    #Restricción: La suma de los valores actuales en la columna + los valores encontrados - el valor total conocido[3] debe ser igual a 0.
                    #print(f"Reading function for col P_0A2_M.") #Verification log
                    return np.nansum(blocks_relevant_numeric['P_0A2_M']) + np.nansum(x[P_0A2_M_unknown_vars]) - total_values[2]
                constraints.append({'type': 'eq', 
                                    'fun': constraint_total_P_0A2_M,
                                    'args':(blocks_relevant_numeric,P_0A2_M_unknown_vars,total_values)})
                
                return constraints, unknown_vars
        
            # Definir las restricciones (Correr lo anterior para generar la lista de diccionarios)
            constraints, unknown_vars = create_constraints(total_values, blocks_relevant_numeric)
            print(f"Constraints: {len(constraints)}.")
        
            # Initial guess - El número de elementos en initial_guess debe corresponder al número de variables de decisión en tu problema de optimización.
            initial_guess = np.zeros(unknown_values)
        
            # Resolver el problema de optimización
            print("Starting optimization using minimize.")
            i = 0
            resultado = minimize(objective_function, initial_guess, 
                                 args=(blocks_relevant_numeric,),
                                 constraints=constraints,
                                 bounds=[(0, None)] * (unknown_values))
            print(resultado)
        
            # Preparación para remplazar nans
            filled_blocks = blocks_relevant_numeric.copy()
            # Reemplazar NaN con los valores óptimos encontrados
            for i in unknown_vars.keys():
                
                # Find unknown value location
                index = unknown_vars[i][0]
                #print(f"index:{index}") #Verification log
                
                col = unknown_vars[i][1]
                #print(f"col:{col}") #Verification log
                
                # Replace unknown value in location
                filled_blocks.loc[index,col] = list(resultado.x)[i]
                #print(f"Result: {list(resultado.x)[i]}") #Verification log
        
            # Unir las columnas llenas con el resto de las columnas
            filled_blocks_full = pd.concat([filled_blocks, blocks.drop(relevant_columns, axis=1)], axis=1)
        
            return filled_blocks_full, resultado

        # Ejemplo de uso
        filled_blocks_full, resultado = fill_nans(blocks_values)
        
        return filled_blocks_full, resultado

In [195]:
blocks_values_2, resultado = calculate_censo_nan_values_v2(pop_ageb_gdf_chosen, pop_mza_gdf_chosen, extended_logs=True)

No problem
STARTING NANs calculation.
----------------------------------------
Calculating NaNs for AGEB 0515 (1/1.)
Calculating NaNs. 10% done.
Calculating NaNs using block data for AGEB 0515.
Valores desconocidos: 39
appended 0
appended 1
appended 2
appended 3
appended 4
appended 5
appended 6
appended 7
appended 8
appended 9
appended 10
appended 11
appended 12
appended 13
appended 14
appended 15
appended 16
appended 17
appended 18
appended 19
appended 20
appended 21
appended 22
appended 23
appended 24
appended 25
appended 26
appended 27
appended 28
appended 29
appended 30
appended 31
appended 32
appended 33
appended 34
appended 35
appended 36
appended 37
appended 38
Unknown variables found: 39.
Constraints: 23.
Starting optimization using minimize.
 message: Inequality constraints incompatible
 success: False
  status: 4
     fun: 39
       x: [ 0.000e+00  0.000e+00 ...  0.000e+00  0.000e+00]
     nit: 1
     jac: [ 0.000e+00  0.000e+00 ...  0.000e+00  0.000e+00]
    nfev: 40
    nje

In [39]:
test = original_data.copy()
test = test.apply(pd.to_numeric, errors='coerce')
print(test.P_0A2.sum())
print(test.P_0A2_F.sum())
print(test.P_0A2_M.sum())

56.0
21.0
17.0


In [185]:
compare1 = pd.merge(blocks_values_org[['CVEGEO','P_0A2','P_0A2_F','P_0A2_M']],blocks_values_2[['CVEGEO','P_0A2','P_0A2_F','P_0A2_M']],on='CVEGEO')
compare = pd.merge(original_data[['CVEGEO','P_0A2','P_0A2_F','P_0A2_M']],compare1,on='CVEGEO')

# Diffs (Must be 0)
#compare['calcnans_diff_P_0A2'] = compare['P_0A2_x'] - compare['P_0A2_y']
#compare['calcnans_diff_P_0A2_F'] = compare['P_0A2_F_x'] - compare['P_0A2_F_y']
#compare['calcnans_diff_P_0A2_M'] = compare['P_0A2_M_x'] - compare['P_0A2_M_y']

# '_x' data is goal (current calculate_censo_nan_values_v1) '_y' data is current status.
compare

Unnamed: 0,CVEGEO,P_0A2,P_0A2_F,P_0A2_M,P_0A2_x,P_0A2_F_x,P_0A2_M_x,P_0A2_y,P_0A2_F_y,P_0A2_M_y
0,100100010515001,6.0,4.0,,6.0,4.0,2.0,6.0,4.0,0.0
1,100100010515002,4.0,,3.0,4.0,1.0,3.0,4.0,0.0,3.0
2,100100010515003,4.0,4.0,0.0,4.0,4.0,0.0,4.0,4.0,0.0
3,100100010515004,,0.0,,1.0,0.0,1.0,0.0,0.0,0.0
4,100100010515005,,,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,100100010515006,4.0,,3.0,4.0,1.0,3.0,4.0,0.0,3.0
6,100100010515007,3.0,,,3.0,2.0,1.0,3.0,0.0,0.0
7,100100010515008,,,0.0,1.0,1.0,0.0,0.0,0.0,0.0
8,100100010515009,8.0,3.0,5.0,8.0,3.0,5.0,8.0,3.0,5.0
9,100100010515011,3.0,,,3.0,1.0,2.0,3.0,0.0,0.0


In [37]:
print("P_0A2")
print(blocks_calc.P_0A2.sum())
print(blocks_calc_2.P_0A2.sum())
print("P_0A2_F")
print(blocks_calc.P_0A2_F.sum())
print(blocks_calc_2.P_0A2_F.sum())
print("P_0A2_M")
print(blocks_calc.P_0A2_M.sum())
print(blocks_calc_2.P_0A2_M.sum())

P_0A2
71.0


NameError: name 'blocks_calc_2' is not defined

In [73]:
pop_ageb_gdf_chosen[['P_0A2','P_0A2_F','P_0A2_M']]

Unnamed: 0,P_0A2,P_0A2_F,P_0A2_M
191,71.0,37.0,34.0


In [58]:
def equations(vars, *args):
    P_0A2, P_0A2_F, P_0A2_M = vars
    P_0A2_tot, P_0A2_F_tot, P_0A2_M_tot = args
    
    # Definir las ecuaciones basadas en las relaciones dadas
    eq1 = P_0A2 - (P_0A2_F + P_0A2_M)
    eq2 = P0_2 - P_0A2_tot
    eq3 = P_0A2_F - P_0A2_F_tot
    eq4 = P_0A2_M - P_0A2_M_tot
    
    return [eq1, eq2, eq3]