# Union de bases de Airbnb, Crimen, Turismo, Google y Polígonos y Características AGEB

En este código se hace la unión entre todas las bases necesarias para obtener el dataframe final de donde se harán las estimaciones econométricas

## Importacion de librerias y definir el nombre y la ruta de los archivos

In [1]:
import pandas as pd
import geopandas as gpd
import os
import numpy as np
from shapely.geometry import Point
from shapely.ops import nearest_points

In [2]:
#Definir la carpeta de donde obtendremos los archivos
ruta_base = r"C:\Users\santo\Documents\CIDE\Tesis\datos_unidos"

#Crear un diccionario con los nombres de los archivos para que sea más fácil llamarlos
archivos = {
    "poligonos" : "poligono_ageb_cdmx.gpkg", #Poligonos de las AGEB de la CDMX
    "airbnb" : "airbnb_limpio.csv", #Airbnb's
    "delitos" : "crimen_limpio.csv", #Carpetas de investigación
    "turismo" : "turismo_limpio.csv", #Negocios que están en la categoría 72 del SCIAN
    "gtrends" : "gtrends_limpio.csv", #Google Trends
    "controles" : "controles_limpio.csv", #Indicadores a nivel AGEB (controles)
    "luz" : "luz_limpio.csv", #Contaminación lumínica por AGEB (2019 y 2020)
    "luz_mensual" : "luz_monthly_limpio.csv", #Contaminación lumínica por AGEB a nivel mensual de 2019-03 a 2020-03
    "turistas" : "turistas_limpio.csv", #Número de turistas (camiones, aeropuertos y hospedajes) de 2019-03 a 2020-03
    "negocios" : "negocios_limpio.csv", #Negocios de comercio al por menor que están en la categoría 46 del SCIAN
    "scian_72" : "restaurantes_limpio.csv", #Negocios que están en la categoría 72 del SCIAN
    "restaurantes" : "restaurantessolo_limpio.csv", #Restaurantes que están en la categoría 722 del SCIAN

}

## Cargar Polígonos

In [3]:
gdf_poligonos = gpd.read_file(os.path.join(ruta_base, archivos["poligonos"])) #Leer el archivo
id_col_ageb = 'CVEGEO' #Definir una variable con el nombre de la columna con los ID's de las AGEB (polígonos) para usarla en las próximas funciones

### Función para asignar puntos a su polígono (o al más cercano)

En esta parte escribo una función que asigna a cada punto su polígono (si cae dentro) o el más cercano (si no está dentro de ningún polígono). Esto es porque hay algunos datos en los que hay puntos afuera de los polígonos que conforman las AGEB de la CDMX. Pero, se supone, que esos datos siguen siendo de México.

In [4]:
def asignar_poligono(gdf_puntos):
    gdf_puntos = gdf_puntos.to_crs(gdf_poligonos.crs) #Confirmar que los puntos estén en el mismo sistema de coordenadas que los polígonos
    joined = gpd.sjoin(gdf_puntos, gdf_poligonos, how = "left", predicate = 'within') #Unión espacial, asignando a cada punto el polígono dentro del cual cae
    puntos_fuera = joined[joined['index_right'].isna()].copy() #Seleccionar los puntos que no cayeron dentro de ningún polígono
    
    def encontrar_mas_cercano(punto): # función auxiliar que encuentra la frontera del polígono más cercano al punto
        punto_proj = gpd.GeoSeries([punto], crs=gdf_poligonos.crs).to_crs("EPSG:6369").iloc[0]
        gdf_proj = gdf_poligonos.to_crs("EPSG:6369") #Reproyectamos el punto como los poligonos al sistema en metros porque estaba tratando de medir distancias usando coordenadas geográficas (latitud y longitud), no metros. Eso puede dar resultados imprecisos o erróneos.
        distancias = gdf_proj.geometry.distance(punto_proj) #Calcula la distancia del punto a todos los polígonos del GeoDataFrame
        idx = distancias.idxmin() #Obtiene el índice del polígono que tiene la distancia más corta al punto
        return gdf_poligonos.loc[idx, id_col_ageb] #Devuelve el ID del polígono más cercano
    
    puntos_fuera['ageb_cercana'] = puntos_fuera.geometry.apply(encontrar_mas_cercano) #Aplica la función del polígono más cercano a cada punto que quedó fuera
    joined['poligono_final'] = joined[id_col_ageb] #Crear columna 'poligono_final' con el ID del poligono asignado directamente si cayó adentro)
    joined.loc[joined['index_right'].isna(), 'poligono_final'] = puntos_fuera['ageb_cercana'].values #Para los puntos que estaban fuera, sobreescribir "poligono_final" con el polígono más cercano
    return joined     

## Procesar Airbnb

Primero paso el CSV a un GeoDataFrame y luego cuento cuántos Airbnb hay por polígono en cada fecha. Después de eso, cuento cuántos Airbnb hay en cada polígono por fecha dependiendo del *tipo* de Airbnb. 

In [5]:
df_airbnb = pd.read_csv(os.path.join(ruta_base, archivos["airbnb"]), parse_dates = ['fecha_recabada_dt']) #Leer el CSV de Airbnb y convertir fecha_recabada_dt a tipo fecha
gdf_airbnb = gpd.GeoDataFrame(df_airbnb, geometry = gpd.points_from_xy(df_airbnb['longitude'], df_airbnb['latitude']), crs = "EPSG:4326") #Convertir a GeoDataFrame usando longitud y latitud
gdf_airbnb = asignar_poligono(gdf_airbnb) #Utilizar la función para asignar polígonos a los puntos
gdf_airbnb['mes'] = gdf_airbnb['fecha_recabada_dt'].dt.to_period('M').astype(str) #Crear la columna mes en formato YYYY-MM

#Contar cuantos Airbnb's hay por mes según la condición(columna) utilizada
airbnb_counts = gdf_airbnb.groupby(['poligono_final', 'mes']) [
    ['activo_3m', 'activo_6m', 'activo_1y', 'activo_siempre']
].sum().reset_index()

In [6]:
airbnb_counts

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre
0,0900200010010,2019-03,0,0,0,1
1,0900200010010,2019-10,1,1,1,1
2,0900200010010,2020-06,1,1,1,1
3,0900200010025,2019-03,0,0,2,2
4,0900200010025,2019-04,0,0,1,1
...,...,...,...,...,...,...
35224,0901700011524,2020-11,1,1,1,2
35225,0901700011524,2020-12,1,1,1,2
35226,0901700011524,2024-06,0,0,0,1
35227,0901700011524,2024-09,0,0,0,1


In [7]:
gdf_airbnb

Unnamed: 0,id,latitude,longitude,room_type,fecha_recabada_dt,activo_3m,activo_6m,activo_1y,activo_siempre,geometry,index_right,layer,path,CVEGEO,CVE_ENT,CVE_MUN,CVE_AGEB,CVE_LOC,poligono_final,mes
0,14714,19.430350,-99.155110,Private room,2019-03-01,False,False,False,True,POINT (-99.15511 19.43035),1784.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901500010856,09,015,0856,0001,0901500010856,2019-03
1,22787,19.440760,-99.163240,Private room,2019-03-01,False,False,False,True,POINT (-99.16324 19.44076),1721.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901500010678,09,015,0678,0001,0901500010678,2019-03
2,33681,19.272150,-99.218480,Private room,2019-03-01,False,False,False,True,POINT (-99.21848 19.27215),317.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901200011481,09,012,1481,0001,0901200011481,2019-03
3,35797,19.383990,-99.273350,Entire home/apt,2019-03-01,False,False,False,True,POINT (-99.27335 19.38399),1590.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0900400010176,09,004,0176,0001,0900400010176,2019-03
4,44616,19.410060,-99.176450,Private room,2019-03-01,True,True,True,True,POINT (-99.17645 19.41006),1833.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901500011318,09,015,1318,0001,0901500011318,2019-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515863,1318906185008171760,19.436998,-99.196733,Private room,2024-12-01,False,False,False,True,POINT (-99.19673 19.437),1366.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901600010641,09,016,0641,0001,0901600010641,2024-12
515864,1318960023050066631,19.449230,-99.136170,Entire home/apt,2024-12-01,False,False,False,True,POINT (-99.13617 19.44923),1726.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901500010451,09,015,0451,0001,0901500010451,2024-12
515865,1318997244771691320,19.316820,-99.116781,Shared room,2024-12-01,False,False,False,True,POINT (-99.11678 19.31682),1846.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0900300010906,09,003,0906,0001,0900300010906,2024-12
515866,1319078571963617776,19.400096,-99.172981,Entire home/apt,2024-12-01,False,False,False,True,POINT (-99.17298 19.4001),1329.0,poligono_ageb_urbanas_cdmx,C:/Users/santo/Documents/CIDE/Tesis/Mapas/poli...,0901600011226,09,016,1226,0001,0901600011226,2024-12


In [8]:
# ——————————————————————————————————————————————————————————————
# ➊ Desagregar por room_type y activo_X (formato status_tipo)
# ——————————————————————————————————————————————————————————————
# Columnas de condición
active_cols = ['activo_3m', 'activo_6m', 'activo_1y', 'activo_siempre']

# Prefijos para cada room_type
type_prefix = {
    'Entire home/apt': 'home',
    'Hotel room':      'hotel',
    'Private room':    'private',
    'Shared room':     'shared'
}

# Pivot: suma de cada activo_X por room_type
rt_counts = gdf_airbnb.pivot_table(
    index   = ['poligono_final', 'mes'],
    columns = 'room_type',
    values  = active_cols,
    aggfunc = 'sum',
    fill_value=0
)

# Aplanar MultiIndex y renombrar columnas: e.g. (activo_3m, 'Entire home/apt') → 'activo_3m_home'
rt_counts.columns = [
    f"{status}_{type_prefix[room]}" 
    for status, room in rt_counts.columns
]
rt_counts = rt_counts.reset_index()

# Unir estos nuevos contadores a airbnb_counts
airbnb_counts = airbnb_counts.merge(
    rt_counts,
    on=['poligono_final', 'mes'],
    how='left'
)

# ——————————————————————————————————————————————————————————————
# ➋ Agregar variables “entero vs cuarto” (formato status_entero / status_cuarto)
# ——————————————————————————————————————————————————————————————
for status in active_cols:
    home_col   = f"{status}_home"
    cuarto_col = f"{status}_cuarto"
    # Sumar hotel + private + shared
    airbnb_counts[cuarto_col] = (
        airbnb_counts[f"{status}_hotel"] +
        airbnb_counts[f"{status}_private"] +
        airbnb_counts[f"{status}_shared"]
    )
    # (home ya existe como status_home gracias al pivot)

In [9]:
airbnb_counts

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,activo_6m_private,activo_6m_shared,activo_siempre_home,activo_siempre_hotel,activo_siempre_private,activo_siempre_shared,activo_3m_cuarto,activo_6m_cuarto,activo_1y_cuarto,activo_siempre_cuarto
0,0900200010010,2019-03,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0900200010010,2019-10,1,1,1,1,0,0,1,0,...,1,0,0,0,1,0,1,1,1,1
2,0900200010010,2020-06,1,1,1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0900200010025,2019-03,0,0,2,2,0,0,2,0,...,0,0,0,0,2,0,0,0,2,2
4,0900200010025,2019-04,0,0,1,1,0,0,1,0,...,0,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35224,0901700011524,2020-11,1,1,1,2,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
35225,0901700011524,2020-12,1,1,1,2,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
35226,0901700011524,2024-06,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
35227,0901700011524,2024-09,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Procesar Delitos

Primero paso el CSV a un GeoDataFrame y luego cuento cuántos delitos hay por polígono en cada fecha. Además, paso como columnas los distintos tipos de delitos para que pueda analizarlos como outcomes.

In [10]:
df_delitos = pd.read_csv(os.path.join(ruta_base, archivos["delitos"]), parse_dates = ['fecha_hecho'])
gdf_delitos = gpd.GeoDataFrame(df_delitos, geometry = gpd.points_from_xy(df_delitos['longitud'], df_delitos['latitud']), crs = "EPSG:4326")
gdf_delitos = asignar_poligono(gdf_delitos)
gdf_delitos['mes'] = gdf_delitos['fecha_hecho'].dt.to_period('M').astype(str)

delito_counts = gdf_delitos.groupby(['poligono_final', 'mes', 'delitos_grupos']).size().reset_index(name = 'conteo_delitos') #Esto agrupa el DataFrame por múltiples columnas y cuenta cuántas filas hay en cada combinación. Esto da el número de delitos de cada tipo en cada AGEB en cada fecha. Luego se convierte a un nuevo DataFrame y como agrupamos, hay que utilizar "reset_index()" para regresar el índice y se nombra la columna de conteo como 'conteo_delitos'.
delitos_pivot = delito_counts.pivot_table(index = ['poligono_final', 'mes'], columns = 'delitos_grupos', values = 'conteo_delitos', fill_value = 0).reset_index() #Pasamos los datos de formato "long" a "wide" donde cada fila representa un polígono y un mes y cada columna representa un tipo de delito y el valor en la celda es cuántos delitos de ese tipo ocurrieron en ese polígono y mes

crime_cols = delitos_pivot.columns.difference(["poligono_final", "mes"]) #creamos una lista con las columnas de delitos_pivot excepto por poligono final y mes
delitos_pivot["crimen_total"] = delitos_pivot[crime_cols].sum(axis=1) #sumamos todos los crimenes por AGEB

In [11]:
delito_counts

Unnamed: 0,poligono_final,mes,delitos_grupos,conteo_delitos
0,0900200010010,2019-01,patrimonio,5
1,0900200010010,2019-02,otros,1
2,0900200010010,2019-02,vida_integridad_corporal,1
3,0900200010010,2019-03,patrimonio,4
4,0900200010010,2019-04,familia,1
...,...,...,...,...
133842,0901700011524,2020-11,patrimonio,1
133843,0901700011524,2020-12,familia,3
133844,0901700011524,2020-12,libertad_seguridad_sexual,1
133845,0901700011524,2020-12,otros,1


In [12]:
delitos_pivot

delitos_grupos,poligono_final,mes,familia,libertad_personal,libertad_seguridad_sexual,otros,patrimonio,sociedad,vida_integridad_corporal,crimen_total
0,0900200010010,2019-01,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0
1,0900200010010,2019-02,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0
2,0900200010010,2019-03,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0
3,0900200010010,2019-04,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
4,0900200010010,2019-05,1.0,0.0,0.0,1.0,3.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...
52794,0901700011524,2020-08,1.0,0.0,1.0,1.0,3.0,0.0,0.0,6.0
52795,0901700011524,2020-09,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0
52796,0901700011524,2020-10,3.0,0.0,0.0,4.0,2.0,0.0,1.0,10.0
52797,0901700011524,2020-11,6.0,0.0,1.0,3.0,1.0,0.0,0.0,11.0


## Procesar Negocios/Turismo

Primero paso el CSV a un GeoDataFrame y luego cuento cuántos negocios hay por polígono en los 2 años (2015 y 2019). Además, paso los años de cada conteo como columnas para que pueda intercambiarlos de una manera más fácil en las regresiones.

In [13]:
df_turismo = pd.read_csv(os.path.join(ruta_base, archivos["turismo"]))
gdf_turismo = gpd.GeoDataFrame(df_turismo, geometry = gpd.points_from_xy(df_turismo['longitud'], df_turismo['latitud']), crs = "EPSG:4326")
gdf_turismo = asignar_poligono(gdf_turismo)

negocios_counts = gdf_turismo.groupby(['poligono_final', 'year']).size().reset_index(name = 'conteo') #Contamos cuántos negocios hay en cada grupo de polígono y año con "size()" y nombramos esta columna como 'conteo'
negocios_pivot = negocios_counts.pivot_table(index = 'poligono_final', columns = 'year', values = 'conteo', fill_value = 0).reset_index() #Pivotamos la tabla para que tenga la opción de utilizar un año o el otro. Y con "fill_value = 0" estoy poniendo que si para algún polígono no hay negocios en un año, ponga el valor de "0" en vez de "NaN"
negocios_pivot.columns.name = None #Eliminar el eje de las columnas que quedaría como "year" en caso de no hacer esto
negocios_pivot = negocios_pivot.rename(columns = {2015: 'negocios_2015', 2019: 'negocios_2019'})

## Unir bases: Airbnb + Delitos + Negocios/Turismo

Se crea primero una base esquelto con todos los meses y todas las AGEB. Esto es para evitar que se pueda perdern perder algunos poligonos o periodos de meses porque no hubo datos en esa fecha de crimen o airbnb.


In [14]:
# ==========================
# Base maestra del panel
# ==========================

# 1) Todos los polígonos de la CDMX (AGEB)
poligonos = gdf_poligonos[id_col_ageb].unique()

# 2) Rango completo de meses según las series largas (Airbnb y crimen)
min_mes = min(gdf_airbnb['mes'].min(), gdf_delitos['mes'].min())
max_mes = max(gdf_airbnb['mes'].max(), gdf_delitos['mes'].max())

# Crear lista de meses en formato YYYY-MM (como ya usas en todo el notebook)
meses = pd.period_range(min_mes, max_mes, freq='M').astype(str)

# 3) Malla completa poligono_final × mes
panel_grid = (
    pd.MultiIndex.from_product([poligonos, meses],
                               names=['poligono_final', 'mes'])
      .to_frame(index=False)
)


In [15]:
# ==========================
# Construir df_final DESDE la malla completa
# ==========================

# Empezamos de la base maestra poligono_final × mes
df_final = panel_grid.copy()

# Airbnb por polígono y mes (incluye tus columnas home/cuartos, etc.)
df_final = pd.merge(df_final, airbnb_counts,
                    how='left',
                    on=['poligono_final', 'mes'])

# Delitos por polígono y mes
df_final = pd.merge(df_final, delitos_pivot,
                    how='left',
                    on=['poligono_final', 'mes'])

# Negocios anuales (2015, 2019) por polígono
df_final = pd.merge(df_final, negocios_pivot,
                    how='left',
                    on='poligono_final')


In [16]:
#df_merged = pd.merge(airbnb_counts, delitos_pivot, how = 'outer', on = ['poligono_final', 'mes']) #Unión de Airbnb con crimen
#df_final = pd.merge(df_merged, negocios_pivot, how = 'outer', on = 'poligono_final') #Unión del df anterior con negocios/turismo

In [17]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,familia,libertad_personal,libertad_seguridad_sexual,otros,patrimonio,sociedad,vida_integridad_corporal,crimen_total,negocios_2015,negocios_2019
0,090110471,2019-01,,,,,,,,,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,3.0
1,090110471,2019-02,,,,,,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0
2,090110471,2019-03,,,,,,,,,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,3.0
3,090110471,2019-04,,,,,,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0
4,090110471,2019-05,,,,,,,,,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,,,,,,
176684,,2024-09,,,,,,,,,...,,,,,,,,,,
176685,,2024-10,,,,,,,,,...,,,,,,,,,,
176686,,2024-11,,,,,,,,,...,,,,,,,,,,


## Procesar Google Trends y unirla

In [18]:
df_gtrends = pd.read_csv(os.path.join(ruta_base, archivos["gtrends"]))
df_gtrends['mes'] = pd.to_datetime(df_gtrends['Mes']).dt.to_period('M').astype(str)
df_gtrends = df_gtrends.drop(columns = 'Mes')

#Union directa por mes
df_final = pd.merge(df_final, df_gtrends, how = 'left', on = 'mes')

In [19]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,a_ciumex_nac_2019_2020,a_cdmx_nac_2019_2021,a_ciumex_nac_2019_2021,a_mexcity_nac_2019_2021,a_cdmx_nac_2019_2025,a_ciumex_nac_2019_2025,a_mexcity_nac_2019_2025,a_cdmx_nac_2023_2025,a_ciumex_nac_2023_2025,a_mexcity_nac_2023_2025
0,090110471,2019-01,,,,,,,,,...,10.00,33.00,0.0,19.0,,,,,,
1,090110471,2019-02,,,,,,,,,...,10.00,35.75,10.0,0.0,25.0,9.0,8.0,,,
2,090110471,2019-03,,,,,,,,,...,19.80,43.00,13.6,3.2,29.0,14.0,9.0,,,
3,090110471,2019-04,,,,,,,,,...,12.00,47.75,0.0,5.5,32.0,12.0,9.0,,,
4,090110471,2019-05,,,,,,,,,...,6.75,37.25,0.0,5.0,28.0,10.0,8.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,87.0,11.0,5.0,73.75,6.25,0.0
176684,,2024-09,,,,,,,,,...,,,,,82.0,13.0,8.0,74.20,10.00,2.6
176685,,2024-10,,,,,,,,,...,,,,,94.0,7.0,9.0,71.50,0.00,5.0
176686,,2024-11,,,,,,,,,...,,,,,86.0,13.0,11.0,72.25,8.50,7.5


### Crear un lag de las Google Trends y unirla

In [20]:
df_gtrends_lag = df_gtrends.copy()
df_gtrends_lag['mes'] = pd.to_datetime(df_gtrends_lag['mes']).dt.to_period('M') + 1 #Desplazo la fecha un mes hacia adelante para simular el lag
df_gtrends_lag = df_gtrends_lag.astype(str)

#Renombro las columnas (excepto 'mes') con el sufijo "_lag_1m" para identificar más fácil
cols_to_rename = {col: f"{col}_lag_1m" for col in df_gtrends_lag.columns if col != 'mes'}
df_gtrends_lag = df_gtrends_lag.rename(columns = cols_to_rename)

#Unir el lag a la base principal
df_final = pd.merge(df_final, df_gtrends_lag, how = 'left', on = 'mes')

In [21]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,a_ciumex_nac_2019_2020_lag_1m,a_cdmx_nac_2019_2021_lag_1m,a_ciumex_nac_2019_2021_lag_1m,a_mexcity_nac_2019_2021_lag_1m,a_cdmx_nac_2019_2025_lag_1m,a_ciumex_nac_2019_2025_lag_1m,a_mexcity_nac_2019_2025_lag_1m,a_cdmx_nac_2023_2025_lag_1m,a_ciumex_nac_2023_2025_lag_1m,a_mexcity_nac_2023_2025_lag_1m
0,090110471,2019-01,,,,,,,,,...,0.0,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,10.0,33.0,0.0,19.0,,,,,,
2,090110471,2019-03,,,,,,,,,...,10.0,35.75,10.0,0.0,25.0,9.0,8.0,,,
3,090110471,2019-04,,,,,,,,,...,19.8,43.0,13.6,3.2,29.0,14.0,9.0,,,
4,090110471,2019-05,,,,,,,,,...,12.0,47.75,0.0,5.5,32.0,12.0,9.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,82.0,9.0,8.0,67.5,2.5,2.5
176684,,2024-09,,,,,,,,,...,,,,,87.0,11.0,5.0,73.75,6.25,0.0
176685,,2024-10,,,,,,,,,...,,,,,82.0,13.0,8.0,74.2,10.0,2.6
176686,,2024-11,,,,,,,,,...,,,,,94.0,7.0,9.0,71.5,0.0,5.0


## Procesar características de AGEB (controles) y unirla

In [22]:
df_controles = pd.read_csv(os.path.join(ruta_base, archivos["controles"]))
df_final = pd.merge(df_final, df_controles, how = 'left', on = 'poligono_final')

In [23]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,a_mexcity_nac_2019_2025_lag_1m,a_cdmx_nac_2023_2025_lag_1m,a_ciumex_nac_2023_2025_lag_1m,a_mexcity_nac_2023_2025_lag_1m,pobtot,phog_ind,graproes,pocupada,pobhog,tvivhab
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,8.0,,,,,,,,,
3,090110471,2019-04,,,,,,,,,...,9.0,,,,,,,,,
4,090110471,2019-05,,,,,,,,,...,9.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,8.0,67.5,2.5,2.5,,,,,,
176684,,2024-09,,,,,,,,,...,5.0,73.75,6.25,0.0,,,,,,
176685,,2024-10,,,,,,,,,...,8.0,74.2,10.0,2.6,,,,,,
176686,,2024-11,,,,,,,,,...,9.0,71.5,0.0,5.0,,,,,,


## Procesar datos de restaurantes y hoteles (SCIAN 72)

In [24]:
df_scian72 = pd.read_csv(os.path.join(ruta_base, archivos["scian_72"]))
gdf_scian72 = gpd.GeoDataFrame(df_scian72, geometry = gpd.points_from_xy(df_scian72['longitud'], df_scian72['latitud']), crs = "EPSG:4326")
gdf_scian72 = asignar_poligono(gdf_scian72)

scian72_counts = gdf_scian72.groupby(['poligono_final', 'mes']).size().reset_index(name = 'scian_72')

In [25]:
scian72_counts

Unnamed: 0,poligono_final,mes,scian_72
0,0900200010010,2019-03,27
1,0900200010010,2019-09,27
2,0900200010010,2020-03,27
3,0900200010025,2019-03,18
4,0900200010025,2019-09,18
...,...,...,...
6879,0901700011492,2019-09,2
6880,0901700011492,2020-03,2
6881,0901700011524,2019-03,41
6882,0901700011524,2019-09,41


In [26]:
df_final = pd.merge(df_final, scian72_counts, how = 'left', on = ['poligono_final', 'mes'])

In [27]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,a_cdmx_nac_2023_2025_lag_1m,a_ciumex_nac_2023_2025_lag_1m,a_mexcity_nac_2023_2025_lag_1m,pobtot,phog_ind,graproes,pocupada,pobhog,tvivhab,scian_72
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,,,,,,,,,,3.0
3,090110471,2019-04,,,,,,,,,...,,,,,,,,,,
4,090110471,2019-05,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,67.5,2.5,2.5,,,,,,,
176684,,2024-09,,,,,,,,,...,73.75,6.25,0.0,,,,,,,
176685,,2024-10,,,,,,,,,...,74.2,10.0,2.6,,,,,,,
176686,,2024-11,,,,,,,,,...,71.5,0.0,5.0,,,,,,,


## Procesar datos de restaurantes (SCIAN 722)

In [28]:
df_restaurantes = pd.read_csv(os.path.join(ruta_base, archivos["restaurantes"]))
gdf_restaurantes = gpd.GeoDataFrame(df_restaurantes, geometry = gpd.points_from_xy(df_restaurantes['longitud'], df_restaurantes['latitud']), crs = "EPSG:4326")
gdf_restaurantes = asignar_poligono(gdf_restaurantes)

restaurantes_counts = gdf_restaurantes.groupby(['poligono_final', 'mes']).size().reset_index(name = 'restaurantes')

In [29]:
restaurantes_counts

Unnamed: 0,poligono_final,mes,restaurantes
0,0900200010010,2019-03,27
1,0900200010010,2019-09,27
2,0900200010010,2020-03,27
3,0900200010025,2019-03,18
4,0900200010025,2019-09,18
...,...,...,...
6873,0901700011492,2019-09,2
6874,0901700011492,2020-03,2
6875,0901700011524,2019-03,41
6876,0901700011524,2019-09,41


In [30]:
df_final = pd.merge(df_final, restaurantes_counts, how = 'left', on = ['poligono_final', 'mes'])
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,a_ciumex_nac_2023_2025_lag_1m,a_mexcity_nac_2023_2025_lag_1m,pobtot,phog_ind,graproes,pocupada,pobhog,tvivhab,scian_72,restaurantes
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,,,,,,,,,3.0,3.0
3,090110471,2019-04,,,,,,,,,...,,,,,,,,,,
4,090110471,2019-05,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,2.5,2.5,,,,,,,,
176684,,2024-09,,,,,,,,,...,6.25,0.0,,,,,,,,
176685,,2024-10,,,,,,,,,...,10.0,2.6,,,,,,,,
176686,,2024-11,,,,,,,,,...,0.0,5.0,,,,,,,,


## Procesar datos de negocios al por menor

In [31]:
df_negocios = pd.read_csv(os.path.join(ruta_base, archivos["negocios"]))
gdf_negocios = gpd.GeoDataFrame(df_negocios, geometry = gpd.points_from_xy(df_negocios['longitud'], df_negocios['latitud']), crs = "EPSG:4326")
gdf_negocios = asignar_poligono(gdf_negocios)

negocios_counts = gdf_negocios.groupby(['poligono_final', 'mes']).size().reset_index(name = 'negocios')

  return lib.distance(a, b, **kwargs)


In [32]:
df_final = pd.merge(df_final, negocios_counts, how = 'left', on = ['poligono_final', 'mes'])

In [33]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,a_mexcity_nac_2023_2025_lag_1m,pobtot,phog_ind,graproes,pocupada,pobhog,tvivhab,scian_72,restaurantes,negocios
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,,,,,,,,3.0,3.0,19.0
3,090110471,2019-04,,,,,,,,,...,,,,,,,,,,
4,090110471,2019-05,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,2.5,,,,,,,,,
176684,,2024-09,,,,,,,,,...,0.0,,,,,,,,,
176685,,2024-10,,,,,,,,,...,2.6,,,,,,,,,
176686,,2024-11,,,,,,,,,...,5.0,,,,,,,,,


## Procesar datos de contaminación lumínica por AGEB y unirla

In [34]:
df_luz = pd.read_csv(os.path.join(ruta_base, archivos["luz"]))
df_final = pd.merge(df_final, df_luz, how = 'left', on = ['poligono_final', 'mes'])

In [35]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,pobtot,phog_ind,graproes,pocupada,pobhog,tvivhab,scian_72,restaurantes,negocios,light_mean
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,,,,,,,3.0,3.0,19.0,11.043286
3,090110471,2019-04,,,,,,,,,...,,,,,,,,,,
4,090110471,2019-05,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,,,,,,
176684,,2024-09,,,,,,,,,...,,,,,,,,,,
176685,,2024-10,,,,,,,,,...,,,,,,,,,,
176686,,2024-11,,,,,,,,,...,,,,,,,,,,


In [36]:
df_luz_mensual = pd.read_csv(os.path.join(ruta_base, archivos["luz_mensual"]))
df_final = pd.merge(df_final, df_luz_mensual, how = 'left', on = ['poligono_final', 'mes'])

In [37]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,graproes,pocupada,pobhog,tvivhab,scian_72,restaurantes,negocios,light_mean,light_monthly_mean,light_monthly_sum
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,,,,,3.0,3.0,19.0,11.043286,11.986346,1246.579997
3,090110471,2019-04,,,,,,,,,...,,,,,,,,,12.150192,1263.619995
4,090110471,2019-05,,,,,,,,,...,,,,,,,,,12.009423,1248.979997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,,,,,,
176684,,2024-09,,,,,,,,,...,,,,,,,,,,
176685,,2024-10,,,,,,,,,...,,,,,,,,,,
176686,,2024-11,,,,,,,,,...,,,,,,,,,,


## Procesar datos de número de turistas por mes y unirla

In [38]:
df_turistas = pd.read_csv(os.path.join(ruta_base, archivos["turistas"]))
df_final = pd.merge(df_final, df_turistas, how = 'left', on = 'mes')

In [39]:
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,light_monthly_sum,tur_hot_nac,tur_hot_int,tur_hot_tot,tur_aer,tur_bus_norte,tur_bus_poniente,tur_bus_oriente,tur_bus_tot,tur_tot
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,,754516.0,241910.0,996426.0,395130.0,871240.0,936387.0,953725.0,2761352.0,4152908.0
2,090110471,2019-03,,,,,,,,,...,1246.579997,910811.0,303310.0,1214121.0,438125.0,998735.0,1069257.0,964625.0,3032617.0,4684863.0
3,090110471,2019-04,,,,,,,,,...,1263.619995,870938.0,285079.0,1156017.0,379609.0,1180467.0,985292.0,979775.0,3145534.0,4681160.0
4,090110471,2019-05,,,,,,,,,...,1248.979997,787955.0,292558.0,1080513.0,411064.0,1046227.0,1104025.0,958275.0,3108527.0,4600104.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,,,,,,
176684,,2024-09,,,,,,,,,...,,,,,,,,,,
176685,,2024-10,,,,,,,,,...,,,,,,,,,,
176686,,2024-11,,,,,,,,,...,,,,,,,,,,


### Crear un lag de número de turistas y unirlo

In [40]:
df_turistas_lag = (df_turistas.copy())
df_turistas_lag['mes'] = pd.to_datetime(df_turistas_lag['mes']).dt.to_period('M') + 1  #Desplazo la fecha un mes hacia adelante para simular el lag
df_turistas_lag = df_turistas_lag.astype(str)

#Renombro las columnas (excepto 'mes') con el sufijo "_lag_1m" para identificar más fácil
cols_to_rename = {col: f"{col}_lag_1m" for col in df_turistas_lag.columns if col != 'mes'}
df_turistas_lag = df_turistas_lag.rename(columns=cols_to_rename)

#Unir el lag a la base principal
df_final = pd.merge(df_final, df_turistas_lag, how='left', on='mes')
df_final

Unnamed: 0,poligono_final,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,tur_tot,tur_hot_nac_lag_1m,tur_hot_int_lag_1m,tur_hot_tot_lag_1m,tur_aer_lag_1m,tur_bus_norte_lag_1m,tur_bus_poniente_lag_1m,tur_bus_oriente_lag_1m,tur_bus_tot_lag_1m,tur_tot_lag_1m
0,090110471,2019-01,,,,,,,,,...,,,,,,,,,,
1,090110471,2019-02,,,,,,,,,...,4152908.0,,,,,,,,,
2,090110471,2019-03,,,,,,,,,...,4684863.0,754516,241910,996426,395130,871240,936387,953725,2761352,4152908
3,090110471,2019-04,,,,,,,,,...,4681160.0,910811,303310,1214121,438125,998735,1069257,964625,3032617,4684863
4,090110471,2019-05,,,,,,,,,...,4600104.0,870938,285079,1156017,379609,1180467,985292,979775,3145534,4681160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,,,,,,,,,...,,,,,,,,,,
176684,,2024-09,,,,,,,,,...,,,,,,,,,,
176685,,2024-10,,,,,,,,,...,,,,,,,,,,
176686,,2024-11,,,,,,,,,...,,,,,,,,,,


## Convertir a numéricas todas las variables que entraran en el modelo y poner 0 en donde hay NA para que no haya errores al correr la regresión. Asimismo convertir las variables de crimen a logaritmo

In [41]:
#Convertir todas las columnas (excepto 'poligono_final' y 'mes') a numérico y llenar NA con 0
columnas_excluir = ['poligono_final', 'mes']
columnas_a_convertir = [col for col in df_final.columns if col not in columnas_excluir]
df_final[columnas_a_convertir] = df_final[columnas_a_convertir].apply(pd.to_numeric, errors='coerce').fillna(0)

In [42]:
# 1. Identificar las columnas de delitos (Son todas las columnas que vinieron de delitos_pivot, excepto 'poligono_final' y 'mes')
crime_cols = [col for col in df_final.columns 
              if col not in ['poligono_final', 'mes']
              and col in delitos_pivot.columns]

# 2. Crear nuevas columnas con el logaritmo natural de (1 + conteo para que no haya errores con los 0)
for col in crime_cols:
    df_final[f'log_{col}'] = np.log1p(df_final[col])

  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])
  df_final[f'log_{col}'] = np.log1p(df_final[col])


## Exportar la base de datos final

In [43]:
#renombrar la columna de poligonos para que sea más fácil identificar a qué hace referencia
df_final = df_final.rename(columns={'poligono_final': 'ageb'})

In [44]:
df_final = df_final.sort_values(by=["ageb", "mes"])
df_final.to_csv(os.path.join(ruta_base, "base_final_con_c.csv"), index = False)

In [45]:
df_final

Unnamed: 0,ageb,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,tur_bus_tot_lag_1m,tur_tot_lag_1m,log_familia,log_libertad_personal,log_libertad_seguridad_sexual,log_otros,log_patrimonio,log_sociedad,log_vida_integridad_corporal,log_crimen_total
47088,0900200010010,2019-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,1.791759,0.0,0.000000,1.791759
47089,0900200010010,2019-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.693147,0.000000,0.0,0.693147,1.098612
47090,0900200010010,2019-03,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2761352.0,4152908.0,0.000000,0.0,0.0,0.000000,1.609438,0.0,0.000000,1.609438
47091,0900200010010,2019-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3032617.0,4684863.0,0.693147,0.0,0.0,0.000000,0.693147,0.0,0.000000,1.098612
47092,0900200010010,2019-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3145534.0,4681160.0,0.693147,0.0,0.0,0.693147,1.386294,0.0,0.000000,1.791759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176683,,2024-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
176684,,2024-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
176685,,2024-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
176686,,2024-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000


### Filtrar la base de una vez

In [46]:
df_final_filtrado_2019_03_2020_03 = df_final.loc[(df_final['mes'] >= '2019-03') & (df_final['mes'] <= '2020-03')].copy()
df_final_filtrado_2019_03_2020_03.to_csv(os.path.join(ruta_base, "base_final_con_c_fil_2019_03_2020_03_v21.csv"), index = False)
df_final_filtrado_2019_03_2020_03

Unnamed: 0,ageb,mes,activo_3m,activo_6m,activo_1y,activo_siempre,activo_1y_home,activo_1y_hotel,activo_1y_private,activo_1y_shared,...,tur_bus_tot_lag_1m,tur_tot_lag_1m,log_familia,log_libertad_personal,log_libertad_seguridad_sexual,log_otros,log_patrimonio,log_sociedad,log_vida_integridad_corporal,log_crimen_total
47090,0900200010010,2019-03,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2761352.0,4152908.0,0.000000,0.0,0.0,0.000000,1.609438,0.0,0.0,1.609438
47091,0900200010010,2019-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3032617.0,4684863.0,0.693147,0.0,0.0,0.000000,0.693147,0.0,0.0,1.098612
47092,0900200010010,2019-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3145534.0,4681160.0,0.693147,0.0,0.0,0.693147,1.386294,0.0,0.0,1.791759
47093,0900200010010,2019-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3108527.0,4600104.0,0.693147,0.0,0.0,0.000000,1.609438,0.0,0.0,1.791759
47094,0900200010010,2019-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3271725.0,4910410.0,0.000000,0.0,0.0,0.000000,1.386294,0.0,0.0,1.386294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176626,,2019-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2702451.0,4529470.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
176627,,2019-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2727867.0,4481933.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
176628,,2020-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3013988.0,4532444.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
176629,,2020-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2782244.0,4244785.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000


In [47]:
# Paso el head del dataframe a csv para poder subirlo a un LLM sin tener que subir toda la base
df_final_filtrado_2019_03_2020_03.head().to_csv(os.path.join(ruta_base, 'head_del_dataframe.csv'), index=False)