In [3]:
import os
import joblib
import requests
import zipfile
import io
import pandas as pd
from pathlib import Path


In [2]:
# Download the numbered DENUE CSV archives (01 through 32) and unpack each one
# into its own sub-folder of `denue_path`.
denue_path = '../data/external/denue/'
url_base = 'http://www.beta.inegi.org.mx/contenidos/masiva/denue/denue_{0:02}_csv.zip'

for i in range(1, 33):
    url = url_base.format(i)

    folder_path = os.path.join(denue_path, f'{i:02}')
    # exist_ok=True makes the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(folder_path, exist_ok=True)

    # Without a timeout a stalled connection would hang the notebook forever.
    r = requests.get(url, timeout=60)
    print(r.status_code, f'{i:02}')
    # Stop on HTTP errors here; otherwise the error page would be handed to
    # ZipFile and surface as an unrelated BadZipFile exception.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as zfile:
        zfile.extractall(folder_path)


200 01
200 02
200 03
200 04
200 05
200 06
200 07
200 08
200 09
200 10
200 11
200 12
200 13
200 14
200 15
200 16
200 17
200 18
200 19
200 20
200 21
200 22
200 23
200 24
200 25
200 26
200 27
200 28
200 29
200 30
200 31
200 32


In [None]:
# Procesar denue

In [None]:
def cargar_denue(path):
    cols = [
        'id',
         'nom_estab',
         'raz_social',
         'codigo_act',
         'nombre_act',
         'per_ocu',
         'tipo_vial',
         'nom_vial',
         'tipo_v_e_1',
         'nom_v_e_1',
         'tipo_v_e_2',
         'nom_v_e_2',
         'tipo_v_e_3',
         'nom_v_e_3',
         'numero_ext',
         'letra_ext',
         'edificio',
         'edificio_e',
         'numero_int',
         'letra_int',
         'tipo_asent',
         'nomb_asent',
         'tipoCenCom',
         'nom_CenCom',
         'num_local',
         'cve_ent',
         'cve_mun',
         'cve_loc',
         'ageb',
         'manzana',
         'tipoUniEco',
         'latitud',
         'longitud',
         'fecha_alta'
    ]
    df = pd.read_csv(
        path, error_bad_lines=False, usecols=cols,
        dtype={'codigo_act': str, 'cve_ent': str, 'cve_mun': str, 'cve_loc': str, 'ageb': str, 'manzana': str}
    )
    cvegeo = df.cve_ent + df.cve_mun + df.cve_loc + df.ageb + df.manzana
    df = df.assign(cvegeo=cvegeo)
    df = df.drop(['cve_ent', 'cve_mun', 'cve_loc', 'ageb', 'manzana'], axis=1)
    return df

# Absolute location of the folder that holds the extracted DENUE archives.
denue_path = Path('../data/external/denue/').resolve()

# Path.glob() yields entries in filesystem order, which differs between
# machines; sorting keeps the row order of the combined frame reproducible.
csv_paths = sorted(denue_path.glob('*/conjunto_de_datos/*.csv'))
if not csv_paths:
    # Fail early with a clear message instead of letting pd.concat raise
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f'No DENUE CSV files found under {denue_path}')

# Parse every CSV in parallel, one task per file, using all available cores.
dfs = joblib.Parallel(n_jobs=-1, verbose=0)(
    joblib.delayed(cargar_denue)(path)
    for path in csv_paths
)

# Stack the per-file frames and keep only rows that have a cvegeo value.
df_denue = pd.concat(dfs, axis=0, ignore_index=True).dropna(subset=['cvegeo'])
print(df_denue.shape)
df_denue.head()

In [None]:
# Write the combined table to disk as UTF-8 CSV without the index column.
# quoting=1 is the value of csv.QUOTE_ALL, so every field is quoted.
df_denue.to_csv(
    '../data/external/denue_completo.csv',
    encoding='utf-8',
    index=False,
    quoting=1,
)