# 00-pop-cd-cuidado-data

This notebook creates the table 'proximityanalysis_20_ageb_hex8' on schema 'prox_analysis'. The new table differs from the source table 'time_15_min_analysis_hexres8' because this notebook:
- Adds 'pob_0a2' and 'pob_3a5' data from 'censo' > 'hex_bins_pop_2020' (by AGEB), plus 'pob_0a5' computed as their sum
- Removes _8 from hex_id name
- Adds "res" column for resolution

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from shapely import wkt

import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import CategoricalDtype

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the repository root (three levels up) importable so the project
# package `aup` can be resolved from this notebook's location.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: when the path is already on sys.path (cell re-run,
# PYTHONPATH, editable install) the guard above is skipped, and an import
# nested inside it would leave `aup` undefined.
import aup



## Part 1: Create new gdf in db that contains pop data by city.

In [2]:
def main(city, cvegeo_list, save = True):
    """Add early-childhood population columns to a city's proximity hexagons.

    Downloads the city's rows from 'prox_analysis.time_15_min_analysis_hexres8'
    and the census hexagons from 'censo.hex_bins_pop_2020' for every CVEGEO
    prefix in ``cvegeo_list``, joins them on 'hex_id_8', derives 'pob_0a5',
    renames 'hex_id_8' to 'hex_id', adds a 'res' column and optionally appends
    the result to 'prox_analysis.proximityanalysis_20_ageb_hex8'.

    Parameters
    ----------
    city : str
        Value matched against the "city" column of the proximity table.
    cvegeo_list : list
        CVEGEO prefixes used to select the census hexagons; must not be empty.
    save : bool, default True
        When True the processed table is appended to the database.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``cvegeo_list`` is empty (nothing could be downloaded or merged).
    """
    # Fail early with a clear message instead of crashing later, after a
    # database round-trip, on an empty download.
    if len(cvegeo_list) == 0:
        raise ValueError(f'No CVEGEO codes were provided for {city}.')

    print(f'STARTING ANALYSIS FOR {city}.')
    print(f'{city} con {len(cvegeo_list)} cvegeos:')
    print(cvegeo_list)

    #--------------- DOWNLOAD DATA ---------------
    # Download prox data for city
    # NOTE(review): the queries are assembled by string interpolation; this
    # presumes `city` and the CVEGEO codes come from trusted tables.
    schema = 'prox_analysis'
    table = 'time_15_min_analysis_hexres8'
    query = f"SELECT * FROM {schema}.{table}  WHERE \"city\" LIKE \'{city}\'"
    prox_city = aup.gdf_from_query(query, geometry_col='geometry')

    pob_tot = prox_city.pobtot.sum()

    print(f'Downloaded prox_city data with a total of {pob_tot} persons.')

    # Download hex_pop for city
    schema = 'censo'
    table = 'hex_bins_pop_2020'

    # Collect the per-CVEGEO downloads and concatenate once; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    hex_parts = []
    for i, cvegeo in enumerate(cvegeo_list, start=1):
        print(f'Downloading pop data for CVEGEO {cvegeo}, {i} of {len(cvegeo_list)}.')

        # The doubled '%%' is presumably an escape required by the DB driver
        # behind aup.gdf_from_query -- TODO confirm.
        query = f"SELECT * FROM {schema}.{table} WHERE \"CVEGEO\" LIKE \'{cvegeo}%%\'"
        hex_parts.append(aup.gdf_from_query(query, geometry_col='geometry'))

    hex_pop = pd.concat(hex_parts, ignore_index = True, axis = 0)

    # Free memory
    del hex_parts

    pob_tot = hex_pop.pobtot.sum()

    print(f'Downloaded hex_pop data for all cvegeos, with a total of {pob_tot} persons.')

    #--------------- PROCESS DATA ---------------
    # Filter for data of interest, rename and calculate age groups.
    # rename()/assign() return new frames, so nothing is written into a slice
    # of hex_pop (avoids SettingWithCopyWarning and ambiguous writes).
    hex_pop_f = (
        hex_pop[['hex_id_8','p_0a2','p_3a5']]
        .rename(columns={'p_0a2':'pob_0a2','p_3a5':'pob_3a5'})
        .assign(pob_0a5=lambda df: df['pob_0a2'] + df['pob_3a5'])
    )

    # Free memory
    del hex_pop

    print('Calculated age groups, merging data.')

    # Merge data
    # pd.merge defaults to an inner join: hexagons missing from either table
    # are dropped.
    # NOTE(review): if a hex_id_8 can appear under more than one CVEGEO, this
    # join would duplicate rows -- confirm key uniqueness upstream.
    hex_prox_city_pop = pd.merge(prox_city,hex_pop_f, on='hex_id_8')

    print('Merged data. Final formatting')

    # Free memory
    del prox_city
    del hex_pop_f

    # Add res data and remove res from hexid
    hex_prox_city_pop = hex_prox_city_pop.rename(columns={'hex_id_8':'hex_id'})
    hex_prox_city_pop['res'] = 8

    # Reorder columns
    reordered_list = ['hex_id', 'res','geometry',
                      'max_escuelas',
                      'max_preescolar',
                      'max_primaria',
                      'max_secundaria',
                      'max_servicios comunitarios',
                      'max_salud',
                      'max_guarderías',
                      'max_asistencia social',
                      'max_comercio',
                      'max_alimentos',
                      'max_personal',
                      'max_farmacias',
                      'max_hogar',
                      'max_complementarios',
                      'max_entretenimiento',
                      'max_social',
                      'max_actividad física',
                      'max_cultural',
                      'max_idx_15_min',
                      'pobtot',
                      'pobfem',
                      'pobmas',
                      'pob_0a2',
                      'pob_3a5',
                      'pob_0a5',
                      'pob_0a14',
                      'pob_15a24',
                      'pob_25a59',
                      'p_60ymas',
                      'dens_pobha',
                      'city']

    hex_prox_city_pop = hex_prox_city_pop[reordered_list]

    print(f'Finished processing {city}.')

    #--------------- UPLOAD DATA ---------------
    if save:
        aup.gdf_to_db_slow(hex_prox_city_pop, "proximityanalysis_20_ageb_hex8", 'prox_analysis', if_exists='append')
        print(f'Uploaded {city} data to db')
    print('--'*20)

In [11]:
# FIRST RUN TRIAL
# Runs the whole pipeline for a single city before the batch loop is used.
city = 'Aguascalientes'

# Load only this city's municipalities from the metropolis table
mun_schema, mun_table = 'metropolis', 'metro_gdf'
query = f"SELECT * FROM {mun_schema}.{mun_table} WHERE \"city\" LIKE \'{city}\'"
mun_gdf = aup.gdf_from_query(query, geometry_col='geometry')

# Unique municipality codes (CVEGEO) belonging to the city
cvegeo_list = list(mun_gdf.loc[mun_gdf['city'] == city, "CVEGEO"].unique())
main(city, cvegeo_list, save=True)

STARTING ANALYSIS FOR Aguascalientes.
Aguascalientes con 3 cvegeos:
['01001', '01005', '01011']
Downloaded prox_city data with a total of 1042106.2545753999 persons.
Downloading pop data for CVEGEO 01001, 1 of 3.
Downloading pop data for CVEGEO 01005, 2 of 3.
Downloading pop data for CVEGEO 01011, 3 of 3.
Downloaded hex_pop data for all cvegeos, with a total of 1042106.2545753999 persons.
Uploaded Aguascalientes data to db
----------------------------------------


In [3]:
# Load the municipalities of every city (no city filter)
mun_schema, mun_table = 'metropolis', 'metro_gdf'
query = f"SELECT * FROM {mun_schema}.{mun_table}"
mun_gdf = aup.gdf_from_query(query, geometry_col='geometry')

In [5]:
# Find the cities already present in the output table; they are skipped by
# the loop below.
prox_schema, prox_table = 'prox_analysis', 'proximityanalysis_20_ageb_hex8'
query = f"SELECT * FROM {prox_schema}.{prox_table}"
prox_all = aup.gdf_from_query(query, geometry_col='geometry')
processed_city_list = list(prox_all['city'].unique())

# Skip ZMVM due to size
#processed_city_list.append('ZMVM')

print(f"Already processed {len(processed_city_list)} cities:")
print(processed_city_list)

# Run main function for every city that is still missing
for city in mun_gdf['city'].unique():
    if city in processed_city_list:
        continue
    cvegeo_list = list(mun_gdf.loc[mun_gdf['city'] == city, "CVEGEO"].unique())
    main(city, cvegeo_list, save=True)
    print('--'*40)
    print('--'*20)

Already processed 73 cities:
['Aguascalientes', 'Ensenada', 'Mexicali', 'Tijuana', 'La Paz', 'Campeche', 'Laguna', 'Monclova', 'Piedras Negras', 'Saltillo', 'Colima', 'Tecoman', 'Tapachula', 'Tuxtla', 'Chihuahua', 'Delicias', 'Juarez', 'Parral', 'Durango', 'Celaya', 'Guanajuato', 'Leon', 'Moroleon', 'San Francisco', 'Acapulco', 'Chilpancingo', 'Pachuca', 'Tula', 'Tulancingo', 'Guadalajara', 'Ocotlan', 'Vallarta', 'Tianguistenco', 'Toluca', 'Piedad', 'Morelia', 'Zamora', 'Cuautla', 'Cuernavaca', 'Tepic', 'Monterrey', 'Oaxaca', 'Tehuantepec', 'Puebla', 'Tehuacan', 'Teziutlan', 'Queretaro', 'Cancun', 'Chetumal', 'Rio Verde', 'SLP', 'Culiacan', 'Mazatlan', 'Guaymas', 'Hermosillo', 'Nogales', 'Villahermosa', 'Victoria', 'Matamoros', 'Nuevo Laredo', 'Reynosa', 'Tampico', 'Tlaxcala', 'Acayucan', 'Coatzacoalcos', 'Cordoba', 'Minatitlan', 'Orizaba', 'Poza Rica', 'Veracruz', 'Xalapa', 'Merida', 'Zacatecas']
STARTING ANALYSIS FOR ZMVM.
ZMVM con 75 cvegeos:
['09002', '09003', '09004', '09005', '09

## Data comparison

### Load both datasets

City

In [12]:
# City used to filter both tables in the comparison cells below.
city = 'Monterrey'

Original data

In [13]:
# Download the original proximity table rows for the selected city
schema, table = 'prox_analysis', 'time_15_min_analysis_hexres8'
query = f"SELECT * FROM {schema}.{table} WHERE \"city\" LIKE \'{city}\'"
proximity_analysis = aup.gdf_from_query(query, geometry_col='geometry')

New data

In [14]:
# Download the newly created table rows for the selected city
prox_schema, prox_table = 'prox_analysis', 'proximityanalysis_20_ageb_hex8'
query = f"SELECT * FROM {prox_schema}.{prox_table} WHERE \"city\" LIKE \'{city}\'"
proximity_analysis_pop = aup.gdf_from_query(query, geometry_col='geometry')

### Merge datasets

In [15]:
# Rename the new table's key back to 'hex_id_8' so both tables share a join column.
proximity_analysis_pop_new = proximity_analysis_pop.rename(columns={'hex_id':'hex_id_8'})
# Columns present in both tables come out suffixed '_x' (original) and '_y' (new).
merged_proximity_analysis = proximity_analysis.merge(proximity_analysis_pop_new, on='hex_id_8')

In [16]:
# Preview the first two merged rows to inspect the '_x' / '_y' column pairs.
merged_proximity_analysis.head(2)

Unnamed: 0,hex_id_8,geometry_x,max_escuelas_x,max_preescolar_x,max_primaria_x,max_secundaria_x,max_servicios comunitarios_x,max_salud_x,max_guarderías_x,max_asistencia social_x,...,pobmas_y,pob_0a2,pob_3a5,pob_0a5,pob_0a14_y,pob_15a24_y,pob_25a59_y,p_60ymas_y,dens_pobha_y,city_y
0,8848a23881fffff,"POLYGON ((-100.38026 25.93877, -100.38069 25.9...",24.460722,24.460722,21.807982,9.385404,44.928272,22.735261,44.928272,12.703557,...,123.34427,14.639343,11.836064,26.475407,65.098359,43.606558,109.950839,31.459015,2.970079,Monterrey
1,8848a23883fffff,"POLYGON ((-100.38588 25.94625, -100.38631 25.9...",14.182725,14.182725,12.408625,8.665118,60.525388,12.251763,60.525388,9.001794,...,234.69318,26.69555,25.030443,51.725993,122.381728,78.758787,219.166333,65.82904,5.674415,Monterrey


### Compare datasets columns

In [17]:
# Columns that exist in both the original ('_x') and the new ('_y') table
# after the merge and are therefore expected to be identical.
compare_data_cols = [
 'max_escuelas',
 'max_preescolar',
 'max_primaria',
 'max_secundaria',
 'max_servicios comunitarios',
 'max_salud',
 'max_guarderías',
 'max_asistencia social',
 'max_comercio',
 'max_alimentos',
 'max_personal',
 'max_farmacias',
 'max_hogar',
 'max_complementarios',
 'max_entretenimiento',
 'max_social',
 'max_actividad física',
 'max_cultural',
 'max_idx_15_min',
 'pobtot',
 'pobfem',
 'pobmas',
 'pob_0a14',
 'pob_15a24',
 'pob_25a59',
 'p_60ymas',
 'dens_pobha']

diff_cols = []

for col in compare_data_cols:
    original_col = col+'_x'
    new_col = col+'_y'
    diff_col = f'diff_{col}'

    # Only compute the difference once: after the first run the '_x'/'_y'
    # pair has been dropped, so recomputing would raise KeyError on re-run.
    if diff_col not in merged_proximity_analysis.columns:
        merged_proximity_analysis[diff_col] = merged_proximity_analysis[original_col] - merged_proximity_analysis[new_col]
        merged_proximity_analysis = merged_proximity_analysis.drop(columns=[original_col, new_col])

    diff_cols.append(diff_col)

for col in diff_cols:
    # Sum absolute differences: with signed values, positive and negative
    # deviations cancel out and could report 0 for tables that differ.
    diff = merged_proximity_analysis[col].abs().sum()
    print(f'Diferencia en {col} es de {diff}')

Diferencia en diff_max_escuelas es de 0.0
Diferencia en diff_max_preescolar es de 0.0
Diferencia en diff_max_primaria es de 0.0
Diferencia en diff_max_secundaria es de 0.0
Diferencia en diff_max_servicios comunitarios es de 0.0
Diferencia en diff_max_salud es de 0.0
Diferencia en diff_max_guarderías es de 0.0
Diferencia en diff_max_asistencia social es de 0.0
Diferencia en diff_max_comercio es de 0.0
Diferencia en diff_max_alimentos es de 0.0
Diferencia en diff_max_personal es de 0.0
Diferencia en diff_max_farmacias es de 0.0
Diferencia en diff_max_hogar es de 0.0
Diferencia en diff_max_complementarios es de 0.0
Diferencia en diff_max_entretenimiento es de 0.0
Diferencia en diff_max_social es de 0.0
Diferencia en diff_max_actividad física es de 0.0
Diferencia en diff_max_cultural es de 0.0
Diferencia en diff_max_idx_15_min es de 0.0
Diferencia en diff_pobtot es de 0.0
Diferencia en diff_pobfem es de 0.0
Diferencia en diff_pobmas es de 0.0
Diferencia en diff_pob_0a14 es de 0.0
Diferenci