In [1]:
import os

import pandas as pd
import numpy as np

import re
import unidecode

In [2]:
# Crear una función para extraer colonia y CP
def extraer_colonia_cp(direccion, df_cp):
    for _, row in df_cp.iterrows():
        if row['d_asenta'] in direccion:
            return row['d_asenta'], row['d_codigo']
    return None, None

In [3]:
# Estandarizar texto: quitar acentos, convertir a minúsculas, eliminar espacios extra y reemplazar "\n" por " "
def estandarizar_texto(texto):
    if pd.isna(texto):
        return ""
    texto = texto.replace("\n", " ")  # Reemplazar saltos de línea por espacios
    texto = texto.replace(",", "")  # Quitar comas
    return unidecode.unidecode(texto.strip().lower())

### Read webscrapping data

In [4]:
df_web = pd.read_parquet('../../data/processed/inmuebles24_departamentos_20250927.parquet')

In [5]:
df_web

Unnamed: 0,precio_mxn,lote_m2,recamaras,baños,estacionamiento,es_amueblado,es_penthouse,cuenta_con_cocina_integral,cuenta_con_sala,cuenta_con_closet,...,cuenta_con_terraza,cuenta_con_comedor,cuenta_con_area_de_lavado,cuenta_con_salon_usos_multiples,cuenta_con_mantenimiento_incluido,cuenta_con_vigilancia_24_horas,direccion,colonia,cp,municipio
0,24000.0,100,2.0,2.0,1.0,0,0,1,1,0,...,0,1,0,0,0,0,san jeronimo lidice san jeronimo lidice la mag...,san jeronimo lidice,10200,la magdalena contreras
1,12000.0,50,1.0,1.0,1.0,1,0,0,1,1,...,0,0,1,0,1,0,callejon del prado barrio san francisco la mag...,el prado,9480,la magdalena contreras
2,34100.0,232,3.0,4.0,2.0,0,0,1,1,1,...,1,1,0,0,0,1,blvd. adolfo ruiz cortines 2775 san jeronimo l...,adolfo ruiz cortines,4630,la magdalena contreras
3,16000.0,165,2.0,1.0,1.0,0,0,0,1,0,...,0,1,0,0,0,0,magnolia 26 san jeronimo lidice la magdalena c...,san jeronimo lidice,10200,la magdalena contreras
4,26000.0,180,3.0,2.0,2.0,0,0,1,0,1,...,0,0,0,0,0,1,san marcos 11 pedregal 2 la magdalena contreras,san marcos,2020,la magdalena contreras
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11863,12000.0,54,2.0,2.0,1.0,0,0,1,1,0,...,0,1,0,0,0,1,renta depto. san marcos azcapotzalco cdmx san ...,san marcos,2020,azcapotzalco
11864,11500.0,68,2.0,1.0,1.0,0,0,1,1,0,...,0,1,0,0,1,0,av ferrocarriles nacionales 25 santiago ahuizo...,santiago ahuizotla,2750,azcapotzalco
11865,13500.0,120,3.0,1.0,1.0,0,0,1,1,0,...,0,1,0,0,0,1,45 calzada azcapotzalco la villa santo tomas a...,santo tomas,2020,azcapotzalco
11866,9600.0,50,1.0,1.0,1.0,0,0,1,1,0,...,1,1,0,0,0,0,totonacas 6 tezozomoc azcapotzalco,tezozomoc,2459,azcapotzalco


### Read AGEB data with colonia

In [6]:
data_folder = "../../data/processed/INEGI/colonia"
csv_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]
print("Archivos encontrados:", list(csv_files))

Archivos encontrados: ['20250925224413_benito_juarez.csv', '20250926110422_cuauhtemoc.csv', '20250926043944_milpa_alta.csv', '20250926223742_venustiano_carranza.csv', '20250929081834_xochimilco.csv', '20250928025355_coyoacan.csv', '20250928113408_alvaro_obregon.csv', '20250926014033_cuajimalpa_de_morelos.csv', '20250926074342_la_magdalena_contreras.csv', '20250927030053_azcapotzalco.csv', '20250927085746_tlahuac.csv', '20250926182507_iztacalco.csv', '20250926143135_miguel_hidalgo.csv']


In [7]:
len(csv_files)

13

In [8]:
data = []
for file in csv_files:
    print(f"\nArchivo: {file}")
    df_tmp = pd.read_csv(os.path.join(data_folder, file), low_memory=False)
    data.append(df_tmp)


Archivo: 20250925224413_benito_juarez.csv

Archivo: 20250926110422_cuauhtemoc.csv

Archivo: 20250926043944_milpa_alta.csv

Archivo: 20250926223742_venustiano_carranza.csv

Archivo: 20250929081834_xochimilco.csv

Archivo: 20250928025355_coyoacan.csv

Archivo: 20250928113408_alvaro_obregon.csv

Archivo: 20250926014033_cuajimalpa_de_morelos.csv

Archivo: 20250926074342_la_magdalena_contreras.csv

Archivo: 20250927030053_azcapotzalco.csv

Archivo: 20250927085746_tlahuac.csv

Archivo: 20250926182507_iztacalco.csv

Archivo: 20250926143135_miguel_hidalgo.csv


In [9]:
df_ageb = pd.concat(data, ignore_index=True)

In [10]:
pat = re.compile(r"'neighbourhood':\s*'([^']+)'")
df_ageb['neighbourhood'] = df_ageb['address'].str.extract(pat)

In [11]:
df_ageb['neighbourhood'].isnull().sum()

19800

In [12]:
df_ageb.columns

Index(['ENTIDAD', 'NOM_ENT', 'MUN', 'NOM_MUN', 'LOC', 'NOM_LOC', 'AGEB', 'MZA',
       'POBTOT', 'POBFEM',
       ...
       'NOMREF3', 'lon', 'lat', 'municipio', 'address', 'road', 'quarter',
       'borough', 'postcode', 'neighbourhood'],
      dtype='object', length=135)

### Read localidades data

In [13]:
df_loc = pd.read_csv('../../data/raw/localidades/TR_LOCALIDAD_09 2.csv')

In [14]:
df_loc.columns

Index(['ENT', 'MUN', 'LOC', 'TIPOLOC', 'OTROTIPO_C', 'DIS_TRANS', 'AUTO',
       'MICRO', 'CAMION', 'TAXI',
       ...
       'USOIGL', 'USOFEST', 'USOCOMP', 'USOFAM', 'USOHOG', 'PROBLEMA',
       'PROBLEMA_O_C', 'POBTOT', 'TAMLOC', 'LOC_LENGUA40PC'],
      dtype='object', length=162)

In [15]:
df_loc.head()

Unnamed: 0,ENT,MUN,LOC,TIPOLOC,OTROTIPO_C,DIS_TRANS,AUTO,MICRO,CAMION,TAXI,...,USOIGL,USOFEST,USOCOMP,USOFAM,USOHOG,PROBLEMA,PROBLEMA_O_C,POBTOT,TAMLOC,LOC_LENGUA40PC
0,9,4,10,2,,3,,,,,...,9.0,9.0,9.0,9.0,9.0,8,,728,4,3
1,9,4,50,4,,1,1.0,3.0,6.0,7.0,...,9.0,9.0,9.0,9.0,9.0,6,,486,3,3
2,9,4,54,2,,1,1.0,3.0,6.0,7.0,...,9.0,9.0,9.0,9.0,9.0,8,,1233,5,3
3,9,4,55,2,,1,2.0,3.0,6.0,7.0,...,,,,,,6,,80,1,3
4,9,4,56,7,,3,,,,,...,,,,,,6,,108,2,3


In [16]:
df_loc.columns


Index(['ENT', 'MUN', 'LOC', 'TIPOLOC', 'OTROTIPO_C', 'DIS_TRANS', 'AUTO',
       'MICRO', 'CAMION', 'TAXI',
       ...
       'USOIGL', 'USOFEST', 'USOCOMP', 'USOFAM', 'USOHOG', 'PROBLEMA',
       'PROBLEMA_O_C', 'POBTOT', 'TAMLOC', 'LOC_LENGUA40PC'],
      dtype='object', length=162)

### Read colonias/cp file

In [17]:
df_cp = pd.read_csv('../../data/raw/CP/CPdescarga - Distrito_Federal.csv')

### Join INEGI files

In [18]:
df_loc.shape

(601, 162)

In [19]:
df_ageb.shape

(219441, 135)

In [20]:
set(df_ageb.columns).intersection(set(df_loc.columns))

{'LOC', 'MUN', 'POBTOT'}

In [21]:
df_loc[(df_loc.LOC == 10)&(df_loc.MUN == 4)]

Unnamed: 0,ENT,MUN,LOC,TIPOLOC,OTROTIPO_C,DIS_TRANS,AUTO,MICRO,CAMION,TAXI,...,USOIGL,USOFEST,USOCOMP,USOFAM,USOHOG,PROBLEMA,PROBLEMA_O_C,POBTOT,TAMLOC,LOC_LENGUA40PC
0,9,4,10,2,,3,,,,,...,9.0,9.0,9.0,9.0,9.0,8,,728,4,3


In [22]:
df_ageb[['LOC','MUN']]

Unnamed: 0,LOC,MUN
0,1,14
1,1,14
2,1,14
3,1,14
4,1,14
...,...,...
219436,1,16
219437,1,16
219438,1,16
219439,1,16


In [23]:
df_loc.LOC.value_counts()

LOC
140    4
112    4
113    4
135    3
198    3
      ..
344    1
345    1
348    1
352    1
293    1
Name: count, Length: 361, dtype: int64

In [24]:
df_ageb.LOC.value_counts()

LOC
1      203039
11       3253
21       2162
24       2108
20       2011
33       1400
17       1249
36       1009
26        961
29        774
152       584
15        425
300       271
110       195
Name: count, dtype: int64

In [25]:
df_ageb[(df_ageb.LOC == 10)]

Unnamed: 0,ENTIDAD,NOM_ENT,MUN,NOM_MUN,LOC,NOM_LOC,AGEB,MZA,POBTOT,POBFEM,...,NOMREF3,lon,lat,municipio,address,road,quarter,borough,postcode,neighbourhood


In [26]:
#df_inegi = df_ageb.merge(df_loc, on = ['LOC', 'MUN'])

### Join all files

In [27]:
df_web.colonia.value_counts()

colonia
hidalgo                3445
cuauhtemoc             1834
santa fe                645
cuajimalpa              436
hipodromo               276
                       ... 
la palmita                1
villa de aragon           1
torres lindavista         1
infonavit iztacalco       1
santiago ahuizotla        1
Name: count, Length: 396, dtype: int64

In [28]:
df_ageb.neighbourhood.value_counts()

neighbourhood
Colonia Agrícola Pantitlán                2535
Colonia Jardín Balbuena                   1365
Colonia INFONAVIT Sur 20                  1154
Colonia Kennedy                           1057
Centro                                     970
                                          ... 
Colonia Parques del Pedregal                 1
Colonia Arenal de Guadalupe                  1
Colonia Fernando Casas Alemán                1
Colonia San Juan de Aragón 2a. Sección       1
Colonia San Gabriel                          1
Name: count, Length: 1295, dtype: int64

In [29]:
df_ageb['neighbourhood'] = df_ageb['neighbourhood'].apply(estandarizar_texto)
df_ageb['neighbourhood'] = df_ageb['neighbourhood'].str.replace('colonia ' , '').replace('Colonia ' , '')

In [30]:
df_ageb.neighbourhood.value_counts()

neighbourhood
                        19800
agricola pantitlan       2535
jardin balbuena          1573
infonavit sur 20         1154
kennedy                  1057
                        ...  
arenal de guadalupe         1
parques del pedregal        1
mexicaltzingo               1
sinatel                     1
san gabriel                 1
Name: count, Length: 1273, dtype: int64

In [31]:
df_ageb[df_ageb.neighbourhood == 'hidalgo']

Unnamed: 0,ENTIDAD,NOM_ENT,MUN,NOM_MUN,LOC,NOM_LOC,AGEB,MZA,POBTOT,POBFEM,...,NOMREF3,lon,lat,municipio,address,road,quarter,borough,postcode,neighbourhood
104164,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,1,85,45,...,Bondojito,2792338.0,825251.40285,alvaro_obregon,"{'road': 'Calle Ixmiquilpan', 'neighbourhood':...",Calle Ixmiquilpan,,Álvaro Obregón,1120.0,hidalgo
104165,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,1,85,45,...,Bejuco,2792371.0,825231.288991,alvaro_obregon,"{'house_number': '47', 'road': 'Calle Huichapa...",Calle Huichapan de León,,Álvaro Obregón,1120.0,hidalgo
104166,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,1,85,45,...,Ixmiquilpan,2792343.0,825205.28335,alvaro_obregon,"{'road': 'Calle Bondojito', 'neighbourhood': '...",Calle Bondojito,,Álvaro Obregón,1120.0,hidalgo
104167,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,1,85,45,...,Huichapan de León,2792310.0,825225.39715,alvaro_obregon,"{'road': 'Calle Bejuco', 'neighbourhood': 'Col...",Calle Bejuco,,Álvaro Obregón,1120.0,hidalgo
104168,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,2,85,48,...,Bondojito,2792408.0,825260.1328,alvaro_obregon,"{'road': 'Calle Ixmiquilpan', 'neighbourhood':...",Calle Ixmiquilpan,,Álvaro Obregón,1120.0,hidalgo
104169,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,2,85,48,...,Huichapan de León,2792446.0,825240.538733,alvaro_obregon,"{'road': 'Calle Florencio Miranda', 'neighbour...",Calle Florencio Miranda,,Álvaro Obregón,1120.0,hidalgo
104170,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,2,85,48,...,Ixmiquilpan,2792416.0,825212.399346,alvaro_obregon,"{'house_number': '295', 'road': 'Calle Bondoji...",Calle Bondojito,,Álvaro Obregón,1120.0,hidalgo
104171,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,2,85,48,...,Florencio Miranda,2792378.0,825231.992854,alvaro_obregon,"{'house_number': '47', 'road': 'Calle Huichapa...",Calle Huichapan de León,,Álvaro Obregón,1120.0,hidalgo
104173,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,3,160,92,...,Tulancingo,2792420.0,825204.06455,alvaro_obregon,"{'house_number': '295', 'road': 'Calle Bondoji...",Calle Bondojito,,Álvaro Obregón,1120.0,hidalgo
104174,9,Ciudad de México,10,Álvaro Obregón,1,Álvaro Obregón,31,3,160,92,...,Bondojito,2792425.0,825143.198428,alvaro_obregon,"{'road': 'Calle Tulancingo', 'neighbourhood': ...",Calle Tulancingo,,Álvaro Obregón,1120.0,hidalgo


In [32]:
context = ['ENTIDAD', 'NOM_ENT', 'MUN', 'NOM_MUN', 'LOC', 'NOM_LOC', 'AGEB', 'MZA',
            'key', 'ENTIDAD_fm', 'MUN_fm', 'LOC_fm', 'AGEB_fm', 'MZA_fm', 'CVEVIAL',
            'CVESEG', 'CVEFT', 'NOMVIAL', 'TIPOVIAL', 'CVEVIAL1', 'CVESEG1',
            'CVEREF1', 'TIPOVR1', 'NOMREF1', 'CVEVIAL2', 'CVESEG2', 'CVEREF2',
            'TIPOVR2', 'NOMREF2', 'CVEVIAL3', 'CVESEG3', 'CVEREF3', 'TIPOVR3',
            'NOMREF3', 'lon', 'lat', 'municipio', 'address', 'road', 'quarter',
            'borough', 'postcode', 'neighbourhood']

In [33]:
cols = list(set(df_ageb.columns)-set(context))

In [34]:
df_ageb.columns[100:]

Index(['key', 'ENTIDAD_fm', 'MUN_fm', 'LOC_fm', 'AGEB_fm', 'MZA_fm', 'CVEVIAL',
       'CVESEG', 'CVEFT', 'NOMVIAL', 'TIPOVIAL', 'CVEVIAL1', 'CVESEG1',
       'CVEREF1', 'TIPOVR1', 'NOMREF1', 'CVEVIAL2', 'CVESEG2', 'CVEREF2',
       'TIPOVR2', 'NOMREF2', 'CVEVIAL3', 'CVESEG3', 'CVEREF3', 'TIPOVR3',
       'NOMREF3', 'lon', 'lat', 'municipio', 'address', 'road', 'quarter',
       'borough', 'postcode', 'neighbourhood'],
      dtype='object')

In [35]:
df_ageb[cols] = df_ageb[cols].replace('*',0)
df_ageb[cols] = df_ageb[cols].replace('N/D',0)

In [36]:
df_ageb[cols] = df_ageb[cols].astype('float64')

In [37]:
df_ageb = df_ageb[df_ageb.neighbourhood != '']

In [38]:
df_ageb_grp = df_ageb.groupby('neighbourhood')[cols].mean().reset_index()

In [39]:
df_ageb_grp.shape

(1272, 93)

In [40]:
df_ageb_grp

Unnamed: 0,neighbourhood,P_12A14_F,TVIVPAR,P18YM_PB_F,VPH_SPMVPI,VPH_MOTO,POBFEM,VIVPAR_HAB,OCUPVIVPAR,TOTHOG,...,P_18A24,P_6A11_M,VPH_2YMASD,POB65_MAS,VPH_CISTER,P18YM_PB,P18YM_PB_M,P_0A2_F,VPH_NDACMM,POBTOT
0,10 de mayo,1.583333,44.583333,30.500000,13.500000,2.666667,65.250000,38.916667,124.166667,39.000000,...,12.333333,5.583333,29.166667,14.416667,32.416667,61.083333,30.583333,1.500000,24.916667,124.500000
1,16 de septiembre,5.860465,124.651163,113.255814,56.325581,8.604651,210.767442,111.139535,414.046512,123.930233,...,42.209302,13.813953,83.604651,49.744186,42.139535,221.023256,107.767442,4.674419,65.395349,414.046512
2,19 de mayo,3.782258,47.854839,38.612903,14.145161,3.822581,96.572581,44.072581,189.056452,48.879032,...,20.959677,7.879032,35.040323,16.580645,11.306452,73.596774,34.983871,2.645161,24.508065,189.056452
3,1a. seccion del bosque de chapultepec,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1a. seccion los cerrillos,9.278132,108.326964,57.316348,10.772824,8.683652,195.766454,96.346072,378.874735,97.108280,...,47.777070,19.855626,61.036093,20.261146,38.785563,109.953291,52.628450,7.583864,59.653928,378.874735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,zentlapatl,16.492537,178.562189,119.333333,40.997512,12.753731,359.218905,163.751244,692.845771,172.223881,...,80.054726,36.094527,117.353234,42.363184,30.562189,234.353234,115.014925,13.119403,108.149254,692.860697
1268,zona centro,6.807692,95.153846,53.403846,20.211538,8.653846,156.173077,83.326923,295.250000,85.326923,...,33.423077,14.000000,68.865385,28.923077,80.365385,105.230769,51.826923,5.557692,63.346154,297.269231
1269,zona de reserva ecologica,2.010638,31.531915,23.648936,3.797872,2.787234,58.170213,29.276596,111.659574,29.404255,...,12.382979,5.361702,20.617021,8.872340,8.191489,46.414894,22.744681,1.042553,15.351064,111.659574
1270,zona rosa,0.000000,76.333333,39.666667,39.666667,4.666667,46.000000,57.666667,106.666667,57.666667,...,8.000000,0.000000,21.000000,8.000000,57.666667,92.333333,52.666667,1.666667,17.000000,106.666667


In [41]:
df_ageb_grp.neighbourhood.values

array(['10 de mayo', '16 de septiembre', '19 de mayo', ...,
       'zona de reserva ecologica', 'zona rosa', 'zona rustica'],
      dtype=object)

In [42]:
df_merge = df_web.merge(df_ageb_grp, left_on='colonia', right_on='neighbourhood', how='left', indicator=True)

In [43]:
df_merge.shape

(11868, 119)

In [44]:
df_merge.columns

Index(['precio_mxn', 'lote_m2', 'recamaras', 'baños', 'estacionamiento',
       'es_amueblado', 'es_penthouse', 'cuenta_con_cocina_integral',
       'cuenta_con_sala', 'cuenta_con_closet',
       ...
       'P_6A11_M', 'VPH_2YMASD', 'POB65_MAS', 'VPH_CISTER', 'P18YM_PB',
       'P18YM_PB_M', 'P_0A2_F', 'VPH_NDACMM', 'POBTOT', '_merge'],
      dtype='object', length=119)

In [45]:
df_merge._merge.value_counts()

_merge
both          10262
left_only      1606
right_only        0
Name: count, dtype: int64

In [46]:
df_merge[df_merge._merge == 'left_only'][['colonia', 'neighbourhood']]

Unnamed: 0,colonia,neighbourhood
0,san jeronimo lidice,
3,san jeronimo lidice,
5,san nicolas totolapan,
6,san jeronimo lidice,
8,san jeronimo lidice,
...,...,...
11835,lindavista norte,
11837,vasco de quiroga,
11838,la laguna ticoman,
11851,san martin xochinahuac,


In [47]:
df_merge[df_merge._merge == 'left_only']['colonia'].value_counts()

colonia
bosque                           197
polanco v seccion                161
tlalpan                          130
polanco iv seccion               121
del valle sur                     83
                                ... 
la magdalena                       1
centro (area 5)                    1
infonavit iztacalco                1
ocotillos del pueblo tetelpan      1
Zacatenco                          1
Name: count, Length: 157, dtype: int64

In [48]:
df_merge = df_merge[df_merge._merge == 'both']

In [49]:
len(cols)

92

In [50]:
df_merge.columns[:50]

Index(['precio_mxn', 'lote_m2', 'recamaras', 'baños', 'estacionamiento',
       'es_amueblado', 'es_penthouse', 'cuenta_con_cocina_integral',
       'cuenta_con_sala', 'cuenta_con_closet', 'cuenta_con_balcon',
       'cuenta_con_gimnasio', 'cuenta_con_alberca', 'cuenta_con_elevador',
       'cuenta_con_roof_garden', 'cuenta_con_terraza', 'cuenta_con_comedor',
       'cuenta_con_area_de_lavado', 'cuenta_con_salon_usos_multiples',
       'cuenta_con_mantenimiento_incluido', 'cuenta_con_vigilancia_24_horas',
       'direccion', 'colonia', 'cp', 'municipio', 'neighbourhood', 'P_12A14_F',
       'TVIVPAR', 'P18YM_PB_F', 'VPH_SPMVPI', 'VPH_MOTO', 'POBFEM',
       'VIVPAR_HAB', 'OCUPVIVPAR', 'TOTHOG', 'PHOGJEF_M', 'VPH_STVP',
       'VPH_1DOR', 'P_60YMAS', 'VPH_AEASP', 'VPH_S_ELEC', 'P_3A5_F', 'P_0A2_M',
       'VIVTOT', 'P_60YMAS_F', 'VPH_2CUART', 'P_3A5', 'VPH_CVJ', 'P_8A14',
       'P_18A24_M'],
      dtype='object')

In [51]:
df_merge.to_parquet('../../data/processed/merged_inmuebles24_departamentos_20250927.parquet', index=False)