# 02a - Script 03-population_to_nodes update tests

This Notebook tests Script 03 updates by comparing them to its previous output (hex_bins_pop_2020).

Script 03 was updated on 2024 05 21 in order to:
1. __Include all metropolitan areas__ from 2020 (uses metro_gdf_2020 instead of Metropolis_CVE.json).
2. **Use hexgrid_{res}_city_2020**, which uses the previously mentioned metropolitan areas.
3. __Allow processing different resolutions__ instead of one res at a time on one output table each.
4. __Rename output tables__ (previously hex_bins_pop_2010 and hex_bins_pop_2020 followed no convention. New names should be __<data_source_year_format>: <censo_inegi_10_ageb_hex> and <censo_inegi_20_ageb_hex>, <censo_inegi_10_ageb_node> and <censo_inegi_20_ageb_node>__.


#### __Conclusion: Update works. Also fixed unexpected previous error__ in col vph_inter (Last cells of this Notebook).

## Import libraries

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup

## Load city data

In [2]:
city = 'Aguascalientes'

In [3]:
# gdf_mun
metro_schema = 'metropolis'
metro_table = 'metro_gdf_2020'

query = f"SELECT * FROM {metro_schema}.{metro_table} WHERE \"city\" LIKE \'{city}\'"
metro_gdf = aup.gdf_from_query(query, geometry_col='geometry')
metro_gdf = metro_gdf.set_crs("EPSG:4326")

cvegeo_list = list(metro_gdf.CVEGEO.unique())

# Show
print(metro_gdf.shape)
print(len(cvegeo_list))
print(cvegeo_list)
metro_gdf.head(1)

(3, 6)
3
['01001', '01005', '01011']


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,NOMGEO,geometry,city
0,1001,1,1,Aguascalientes,"POLYGON ((-102.10641 22.06035, -102.10368 22.0...",Aguascalientes


## 2020 analysis

### Load data - Current (original) data

In [84]:
year = '2020'

# pop data 2020 (original)
org_pop_schema = 'censo'
org_pop_table = f'hex_bins_pop_{year}'

org20_pop_gdf = gpd.GeoDataFrame()

for cvegeo in cvegeo_list:
    query = f"SELECT * FROM {org_pop_schema}.{org_pop_table} WHERE \"CVEGEO\" LIKE \'{cvegeo}\'"
    org20_pop_gdf = pd.concat([org20_pop_gdf,aup.gdf_from_query(query, geometry_col='geometry')])

org20_pop_gdf = org20_pop_gdf.set_crs("EPSG:4326")
org20_pop_gdf.rename(columns={'hex_id_8':'hex_id'},inplace=True)

# Show
print(org20_pop_gdf.pobtot.sum())
print(org20_pop_gdf.shape)
org20_pop_gdf.head(1)

1042106.2545753999
(309, 225)


Unnamed: 0,geometry,hex_id,CVEGEO,pobtot,pobfem,pobmas,p_0a2,p_0a2_f,p_0a2_m,p_3ymas,...,vph_telef,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic
0,"POLYGON ((-102.34777 21.92336, -102.34267 21.9...",88498e36dbfffff,1001,264.7125,131.68126,133.03125,15.806251,7.65,8.15625,248.90625,...,22.893751,57.2625,36.225002,25.5375,18.393751,14.5125,0.61875,2.7,21.99375,0.3375


### Load data - Updated data

In [85]:
year = '2020'

# pop data 2020 (Updated)
new_pop_schema = 'censo'
new_pop_table = f'censo_inegi_{year[2:]}_ageb_hex_test'

query = f"SELECT * FROM {new_pop_schema}.{new_pop_table} WHERE \"city\" LIKE \'{city}\'"
new20_pop_gdf = aup.gdf_from_query(query, geometry_col='geometry')

new20_pop_gdf = new20_pop_gdf.set_crs("EPSG:4326")
new20_pop_gdf = new20_pop_gdf.loc[new20_pop_gdf.res == 8]

# Show
print(new20_pop_gdf.pobtot.sum())
print(new20_pop_gdf.shape)
new20_pop_gdf.head(1)

1042106.2545754
(309, 229)


Unnamed: 0,hex_id,geometry,CVEGEO,NOMGEO,city,type,pobtot,pobfem,pobmas,p_0a2,...,vph_cel,vph_inter,vph_stvp,vph_spmvpi,vph_cvj,vph_sinrtv,vph_sintlc,vph_sincint,vph_sintic,res
0,88498e3639fffff,"POLYGON ((-102.27184 21.89588, -102.26725 21.8...",1001,Aguascalientes,Aguascalientes,urban,6770.6924,3466.771,3303.9204,253.7511,...,1825.0607,1360.8694,930.15955,574.7969,405.25732,16.533459,42.783176,473.04376,3.089069,8


### Comparison

In [86]:
columns_of_interest_2020 = list(org20_pop_gdf.columns)
columns_of_interest_2020.remove('geometry')
columns_of_interest_2020.remove('CVEGEO')
columns_of_interest_2020

original_2020 = org20_pop_gdf[columns_of_interest_2020].copy()
new_2020 = new20_pop_gdf[columns_of_interest_2020].copy()

In [87]:
compare = pd.merge(original_2020,new_2020,on='hex_id')

# Compare old and new fields
compare_list = []
col_list = columns_of_interest_2020.copy()
col_list.remove('hex_id')

for col in col_list:

    old_col = f"{col}_x"
    new_col = f"{col}_y"
    compare[f"{col}_diff"] = compare[new_col] - compare[old_col]

    compare_list.append(f"{col}_diff")

# Save df with time differences only
compare_diff = compare[compare_list]

# Visualize sum of time differences
summary = pd.DataFrame()
i = 0
for compare_col in compare_list:

    diff_value = compare[compare_col].sum()
    summary.loc[i,'attribute'] = compare_col
    summary.loc[i,'diff'] = diff_value

    i = i+1

summary.loc[summary['diff'] != 0 ]

  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[o

Unnamed: 0,attribute,diff
47,prom_hnv_diff,8.327741e-07
131,graproes_diff,4.577317e-06
132,graproes_f_diff,2.153654e-05
133,graproes_m_diff,-1.487263e-05
178,prom_ocup_diff,-1.823691e-06
179,pro_ocup_c_diff,-2.114877e-06


#### __Differences are minimal (less than 1 total)__

In [90]:
columns_with_differences = ['prom_hnv_diff','graproes_diff','graproes_f_diff','graproes_m_diff','prom_ocup_diff','pro_ocup_c_diff']

# How much is the difference between old and new data by hex?
differences_to_analyse = [1,0.000000000001,0.0000000000001,0.00000000000001,0.000000000000001]

hexs_diff2020_df = pd.DataFrame()

for range in differences_to_analyse:
    i = 0
    for diff_col in columns_with_differences:
        hexs_diff2020_df.loc[i,'field'] = diff_col
        
        hexs_with_positive_diff = compare.loc[(compare[diff_col]>range)].shape[0]
        hexs_with_negative_diff = compare.loc[(compare[diff_col]<-range)].shape[0]

        hexs_diff2020_df.loc[i,f"{range}_diff"] = hexs_with_positive_diff
        hexs_diff2020_df.loc[i,f"{-range}_diff"] = hexs_with_negative_diff

        i = i+1

print("This table shows how many hexs have a given difference (col name) in data.")
print("'2' in col '1_diff' would mean that 2 hexagons have a difference of 1 or more (data, for example people).")

hexs_diff2020_df

This table shows how many hexs have a given difference (col name) in data.
'2' in col '1_diff' would mean that 2 hexagons have a difference of 1 or more (data, for example people).


Unnamed: 0,field,1_diff,-1_diff,1e-12_diff,-1e-12_diff,1e-13_diff,-1e-13_diff,1e-14_diff,-1e-14_diff,1e-15_diff,-1e-15_diff
0,prom_hnv_diff,0.0,0.0,160.0,145.0,160.0,145.0,160.0,145.0,160.0,145.0
1,graproes_diff,0.0,0.0,164.0,143.0,164.0,143.0,164.0,143.0,164.0,143.0
2,graproes_f_diff,0.0,0.0,171.0,134.0,171.0,134.0,171.0,134.0,171.0,134.0
3,graproes_m_diff,0.0,0.0,145.0,160.0,145.0,160.0,145.0,160.0,145.0,160.0
4,prom_ocup_diff,0.0,0.0,141.0,166.0,141.0,166.0,141.0,166.0,141.0,166.0
5,pro_ocup_c_diff,0.0,0.0,128.0,176.0,128.0,176.0,128.0,176.0,128.0,176.0


#### __Sum of all diff is the result of very small individual differences (due to use of weighted averages)__

## 2010 analysis

### Load data - Current (original) data

In [91]:
year = '2010'

# pop data 2010 (original)
org_pop_schema = 'censo'
org_pop_table = f'hex_bins_pop_{year}'

org10_pop_gdf = gpd.GeoDataFrame()

for cvegeo in cvegeo_list:
    query = f"SELECT * FROM {org_pop_schema}.{org_pop_table} WHERE \"CVEGEO\" LIKE \'{cvegeo}\'"
    org10_pop_gdf = pd.concat([org10_pop_gdf,aup.gdf_from_query(query, geometry_col='geometry')])

org10_pop_gdf = org10_pop_gdf.set_crs("EPSG:4326")
org10_pop_gdf.rename(columns={'hex_id_8':'hex_id'},inplace=True)

# Show
print(org10_pop_gdf.pobtot.sum())
print(org10_pop_gdf.shape)
org10_pop_gdf.head(1)

828107.1610885999
(235, 193)


Unnamed: 0,geometry,hex_id,CVEGEO,pobtot,pobmas,pobfem,p_0a2,p_0a2_m,p_0a2_f,p_3ymas,...,vph_snbien,vph_radio,vph_tv,vph_refri,vph_lavad,vph_autom,vph_pc,vph_telef,vph_cel,vph_inter
0,"POLYGON ((-102.22142 21.87929, -102.22192 21.8...",88498e3751fffff,1001,528.5707,264.43634,264.13434,49.161545,25.402107,23.75944,475.85077,...,0.0,117.33275,135.24582,122.804214,101.556625,76.77524,26.071114,16.445126,110.26164,698.0


### Load data - Updated data

In [92]:
year = '2010'

# pop data 2010 (Updated)
new_pop_schema = 'censo'
new_pop_table = f'censo_inegi_{year[2:]}_ageb_hex_test'

query = f"SELECT * FROM {new_pop_schema}.{new_pop_table} WHERE \"city\" LIKE \'{city}\'"
new10_pop_gdf = aup.gdf_from_query(query, geometry_col='geometry')

new10_pop_gdf = new10_pop_gdf.set_crs("EPSG:4326")
new10_pop_gdf = new10_pop_gdf.loc[new10_pop_gdf.res == 8]

# Show
print(new10_pop_gdf.pobtot.sum())
print(new10_pop_gdf.shape)
new10_pop_gdf.head(1)

828107.1610885999
(235, 197)


Unnamed: 0,hex_id,geometry,CVEGEO,NOMGEO,city,type,pobtot,pobmas,pobfem,p_0a2,...,vph_radio,vph_tv,vph_refri,vph_lavad,vph_autom,vph_pc,vph_telef,vph_cel,vph_inter,res
0,88498e3639fffff,"POLYGON ((-102.27184 21.89588, -102.26725 21.8...",1001,Aguascalientes,Aguascalientes,urban,7114.5293,3502.9412,3611.5898,364.07013,...,1676.501,1806.3595,1748.7489,1590.0066,1127.0553,734.81714,1020.1815,1479.1868,488.48477,8


### Comparison

In [93]:
columns_of_interest_2010 = list(org10_pop_gdf.columns)
columns_of_interest_2010.remove('geometry')
columns_of_interest_2010.remove('CVEGEO')
columns_of_interest_2010

original_2010 = org10_pop_gdf[columns_of_interest_2010].copy()
new_2010 = new10_pop_gdf[columns_of_interest_2010].copy()

In [94]:
compare = pd.merge(original_2010,new_2010,on='hex_id')

# Compare old and new fields
compare_list = []
col_list = columns_of_interest_2010.copy()
col_list.remove('hex_id')

for col in col_list:

    old_col = f"{col}_x"
    new_col = f"{col}_y"
    compare[f"{col}_diff"] = compare[new_col] - compare[old_col]

    compare_list.append(f"{col}_diff")

# Save df with time differences only
compare_diff = compare[compare_list]

# Visualize sum of time differences
summary = pd.DataFrame()
i = 0
for compare_col in compare_list:

    diff_value = compare[compare_col].sum()
    summary.loc[i,'attribute'] = compare_col
    summary.loc[i,'diff'] = diff_value

    i = i+1

summary.loc[summary['diff'] != 0 ]

  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[old_col]
  compare[f"{col}_diff"] = compare[new_col] - compare[o

Unnamed: 0,attribute,diff
47,prom_hnv_diff,3.488473e-06
121,graproes_diff,4.068219e-06
122,graproes_m_diff,1.097732e-05
123,graproes_f_diff,-1.040306e-05
163,prom_ocup_diff,-3.284272e-06
164,pro_ocup_c_diff,-6.958197e-07
189,vph_inter_diff,-4130659.0


#### __Again, differences are minimal (less than 1 total)__

In [95]:
columns_with_differences = ['prom_hnv_diff','graproes_diff','graproes_f_diff','graproes_m_diff','prom_ocup_diff','pro_ocup_c_diff','vph_inter_diff']

# How much is the difference between old and new data by hex?
differences_to_analyse = [100,10,1,0.000000000001,0.0000000000001,0.00000000000001,0.000000000000001]

hexs_diff_df = pd.DataFrame()

for range in differences_to_analyse:
    i = 0
    for diff_col in columns_with_differences:
        hexs_diff_df.loc[i,'field'] = diff_col
        
        hexs_with_positive_diff = compare.loc[(compare[diff_col]>range)].shape[0]
        hexs_with_negative_diff = compare.loc[(compare[diff_col]<-range)].shape[0]

        hexs_diff_df.loc[i,f"{range}_diff"] = hexs_with_positive_diff
        hexs_diff_df.loc[i,f"{-range}_diff"] = hexs_with_negative_diff

        i = i+1

print("This table shows how many hexs have a given difference (col name) in data.")
print("'2' in col '1_diff' would mean that 2 hexagons have a difference of 1 or more (data, for example people).")

hexs_diff_df

This table shows how many hexs have a given difference (col name) in data.
'2' in col '1_diff' would mean that 2 hexagons have a difference of 1 or more (data, for example people).


Unnamed: 0,field,100_diff,-100_diff,10_diff,-10_diff,1_diff,-1_diff,1e-12_diff,-1e-12_diff,1e-13_diff,-1e-13_diff,1e-14_diff,-1e-14_diff,1e-15_diff,-1e-15_diff
0,prom_hnv_diff,0.0,0.0,0.0,0.0,0.0,0.0,126.0,108.0,126.0,108.0,126.0,108.0,126.0,108.0
1,graproes_diff,0.0,0.0,0.0,0.0,0.0,0.0,122.0,113.0,122.0,113.0,122.0,113.0,122.0,113.0
2,graproes_f_diff,0.0,0.0,0.0,0.0,0.0,0.0,112.0,123.0,112.0,123.0,112.0,123.0,112.0,123.0
3,graproes_m_diff,0.0,0.0,0.0,0.0,0.0,0.0,120.0,111.0,120.0,111.0,120.0,111.0,120.0,111.0
4,prom_ocup_diff,0.0,0.0,0.0,0.0,0.0,0.0,119.0,112.0,119.0,112.0,119.0,112.0,119.0,112.0
5,pro_ocup_c_diff,0.0,0.0,0.0,0.0,0.0,0.0,106.0,129.0,106.0,129.0,106.0,129.0,106.0,129.0
6,vph_inter_diff,0.0,214.0,0.0,224.0,0.0,230.0,0.0,230.0,0.0,230.0,0.0,230.0,0.0,230.0


#### __Sum of all diff is the result of very small individual differences (due to use of weighted averages)__
#### __Big difference in vph_inter_diff was found to be a source error (hex_bins_pop_2010 error) after inspection in QGIS__

In [80]:
# Here, original data (hex_bins_pop_2010) shows a hex with 708 households with internet access, while the total of households is no more than 50.
org10_pop_gdf.loc[org10_pop_gdf.hex_id=='88498eac83fffff'][['vivtot','tvivparhab','vph_inter']]

Unnamed: 0,vivtot,tvivparhab,vph_inter
90,41.313435,36.44776,708.0


In [82]:
# Updated data makes sense
new10_pop_gdf.loc[new10_pop_gdf.hex_id=='88498eac83fffff'][['vivtot','tvivparhab','vph_inter']]

Unnamed: 0,vivtot,tvivparhab,vph_inter
78,41.313435,36.44776,10.567164
