# 00-pop-cd-cuidado-data

This notebook creates the table "cdcuidadoras_popageb_2020_hex" in schema 'prox_analysis'. The new table differs from 'cd_cuidadoras_hexres8' because this notebook:
- Adds 'pob_0a2', 'pob_3a5' and 'pob_0a5' data from 'censo' > 'hex_bins_pop_2020' (by AGEB)
- Removes the '_8' suffix from the 'hex_id' column name
- Adds a "res" column for the hexagon resolution

In [1]:
import os
import sys

import pandas as pd
import geopandas as gpd
import osmnx as ox
import numpy as np

from shapely import wkt

import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import CategoricalDtype

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the directory three levels up importable so the project-local `aup`
# package can be resolved from this notebook's location.
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import unconditionally: when module_path is already on sys.path (e.g. set via
# PYTHONPATH) the branch above is skipped, and an import nested inside it would
# never run, leaving `aup` undefined for every later cell.
import aup



## Part 1: Create new gdf in db that contains pop data by city.

In [2]:
def main(city, cvegeo_list, save = True):
    """Build, and optionally upload, the population-enriched proximity table for one city.

    Downloads the city's rows from 'prox_analysis.cd_cuidadoras_hexres8' and,
    for every code in cvegeo_list, the matching rows of
    'censo.hex_bins_pop_2020'. It then adds the 'pob_0a2', 'pob_3a5' and
    'pob_0a5' columns, renames 'hex_id_8' to 'hex_id', adds a constant 'res'
    column and, when save is True, appends the result to
    'prox_analysis.cdcuidadoras_popageb_2020_hex'.

    Parameters
    ----------
    city : str
        Value matched against the "city" column of the proximity table.
    cvegeo_list : sequence of str
        Codes used to build the CVEGEO LIKE pattern, one query per code.
    save : bool, default True
        When True the resulting frame is appended to the output table.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If cvegeo_list is empty (there is no population data to download).
    """

    print(f'STARTING ANALYSIS FOR {city}.')
    print(f'{city} con {len(cvegeo_list)} cvegeos:')
    print(cvegeo_list)

    # Fail early with a clear message instead of an AttributeError/NameError
    # further down when there is nothing to download.
    if len(cvegeo_list) == 0:
        raise ValueError(f'No CVEGEO codes provided for {city}; cannot download population data.')

    #--------------- DOWNLOAD DATA ---------------
    # Download cd_cuidadoras for city
    schema = 'prox_analysis'
    table = 'cd_cuidadoras_hexres8'
    query = f"SELECT * FROM {schema}.{table}  WHERE \"city\" LIKE \'{city}\'"
    cd_cuidadoras = aup.gdf_from_query(query, geometry_col='geometry')

    pob_tot = cd_cuidadoras.pobtot.sum()

    print(f'Downloaded cd_cuidadoras data with a total of {pob_tot} persons.')

    # Download hex_pop for city
    schema = 'censo'
    table = 'hex_bins_pop_2020'

    # Collect one frame per CVEGEO and concatenate once at the end; growing a
    # frame with concat inside the loop re-copies all previous rows each time.
    hex_frames = []
    for i, cvegeo in enumerate(cvegeo_list, start=1):
        print(f'Downloading pop data for CVEGEO {cvegeo}, {i} of {len(cvegeo_list)}.')

        # NOTE(review): '%%' is presumably read by the DB driver as a literal
        # '%' wildcard (prefix match on CVEGEO) -- confirm against aup.gdf_from_query.
        query = f"SELECT * FROM {schema}.{table} WHERE \"CVEGEO\" LIKE \'{cvegeo}%%\'"
        hex_frames.append(aup.gdf_from_query(query, geometry_col='geometry'))

    hex_pop = pd.concat(hex_frames, ignore_index = True, axis = 0)

    # Free memory
    del hex_frames

    pob_tot = hex_pop.pobtot.sum()

    print(f'Downloaded hex_pop data for all cvegeos, with a total of {pob_tot} persons.')

    #--------------- PROCESS DATA ---------------
    # Keep only the columns of interest, rename them and derive the 0-5 age
    # group. rename/assign return new frames, so nothing is written into a
    # slice of hex_pop (avoids SettingWithCopyWarning).
    hex_pop_f = (
        hex_pop[['hex_id_8','p_0a2','p_3a5']]
        .rename(columns={'p_0a2':'pob_0a2','p_3a5':'pob_3a5'})
        .assign(pob_0a5=lambda df: df['pob_0a2'] + df['pob_3a5'])
    )

    # Free memory
    del hex_pop

    # Merge data (pd.merge defaults to an inner join on 'hex_id_8')
    hex_cdcuidadoras_pop = pd.merge(cd_cuidadoras, hex_pop_f, on='hex_id_8')

    # Free memory
    del cd_cuidadoras
    del hex_pop_f

    # Add res data and remove res from hexid
    hex_cdcuidadoras_pop['res'] = 8
    hex_cdcuidadoras_pop = hex_cdcuidadoras_pop.rename(columns={'hex_id_8':'hex_id'})

    # Reorder columns
    reordered_list = ['hex_id', 'res','geometry',
                      'max_preescolar',
                      'max_primaria',
                      'max_secundaria',
                      'max_salud',
                      'max_guarderias',
                      'max_alimentos',
                      'max_personal',
                      'max_parques',
                      'max_idx_15_min',
                      'pobtot',
                      'pobfem',
                      'pobmas',
                      'pob_0a2',
                      'pob_3a5',
                      'pob_0a5',
                      'pob_0a14',
                      'pob_15a24',
                      'pob_25a59',
                      'p_60ymas',
                      'dens_pobha',
                      'city']

    hex_cdcuidadoras_pop = hex_cdcuidadoras_pop[reordered_list]

    #--------------- UPLOAD DATA ---------------
    if save:
        aup.gdf_to_db_slow(hex_cdcuidadoras_pop, "cdcuidadoras_popageb_2020_hex", 'prox_analysis', if_exists='append')
        print(f'Uploaded {city} data to db')
    print('--'*20)

In [2]:
# Load every row of metropolis.metro_gdf; later cells read its 'city' and
# 'CVEGEO' columns to decide which codes belong to each city.
mun_schema, mun_table = 'metropolis', 'metro_gdf'
query = "SELECT * FROM {}.{}".format(mun_schema, mun_table)
mun_gdf = aup.gdf_from_query(query, geometry_col='geometry')

In [4]:
# Read the output table to learn which cities were uploaded by previous runs.
prox_schema, prox_table = 'prox_analysis', 'cdcuidadoras_popageb_2020_hex'
query = "SELECT * FROM " + prox_schema + "." + prox_table
prox_all = aup.gdf_from_query(query, geometry_col='geometry')
processed_city_list = list(prox_all['city'].unique())

# Optional: uncomment to skip ZMVM due to its size.
#processed_city_list.append('ZMVM')

print(f"Already processed {len(processed_city_list)} cities:")
print(processed_city_list)

# Run main() for each city that is not yet in the output table.
for city in mun_gdf['city'].unique():
    if city in processed_city_list:
        continue
    # Unique CVEGEO codes of the rows belonging to this city.
    cvegeo_list = list(mun_gdf.loc[mun_gdf['city'] == city, "CVEGEO"].unique())
    main(city, cvegeo_list, save=True)

Already processed 73 cities:
['Aguascalientes', 'Ensenada', 'Mexicali', 'Tijuana', 'La Paz', 'Campeche', 'Laguna', 'Monclova', 'Piedras Negras', 'Saltillo', 'Colima', 'Tecoman', 'Tapachula', 'Tuxtla', 'Chihuahua', 'Delicias', 'Juarez', 'Parral', 'Durango', 'Celaya', 'Guanajuato', 'Leon', 'Moroleon', 'San Francisco', 'Acapulco', 'Chilpancingo', 'Pachuca', 'Tula', 'Tulancingo', 'Guadalajara', 'Ocotlan', 'Vallarta', 'Tianguistenco', 'Toluca', 'Piedad', 'Morelia', 'Zamora', 'Cuautla', 'Cuernavaca', 'Tepic', 'Monterrey', 'Oaxaca', 'Tehuantepec', 'Puebla', 'Tehuacan', 'Teziutlan', 'Queretaro', 'Cancun', 'Chetumal', 'Rio Verde', 'SLP', 'Culiacan', 'Mazatlan', 'Guaymas', 'Hermosillo', 'Nogales', 'Villahermosa', 'Victoria', 'Matamoros', 'Nuevo Laredo', 'Reynosa', 'Tampico', 'Tlaxcala', 'Acayucan', 'Coatzacoalcos', 'Cordoba', 'Minatitlan', 'Orizaba', 'Poza Rica', 'Veracruz', 'Xalapa', 'Merida', 'Zacatecas']
STARTING ANALYSIS FOR ZMVM.
ZMVM con 75 cvegeos:
['09002', '09003', '09004', '09005', '09