# Applied pipeline integration

This notebook integrates every step from the pipeline notebooks into a single workflow and serves as a testing ground for the developed functions. Its goal is to predict urban land use when no input data is available for model training.

## Import libraries

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import rasterio as ro
from ast import literal_eval

from joblib import Parallel, delayed
from tqdm import tqdm

import pickle

import distancerasters as dr

import momepy as mp
from shapely import wkt

from spatial_kde import spatial_kernel_density

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

from keras import utils                                   # tools for creating one-hot encoding
from keras.models import Sequential                       # Type of model we wish to use
from keras.layers import Dense, Dropout, Activation
from sklearn.preprocessing import LabelEncoder
# from scikeras.wrappers import KerasClassifier, KerasRegressor
from keras import utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
# from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,confusion_matrix

import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup
else:
    import aup

module_path = os.path.abspath(os.path.join('../../scripts/landuse_model/'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import prediction_functions
else:
    import prediction_functions


2025-09-09 15:57:08.192452: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-09 15:57:08.198898: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757455028.205829   55849 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757455028.207824   55849 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757455028.213287   55849 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

## Download municipality for analysis

In [2]:
# Database location of the metropolitan-area layer and the city to filter on
schema = 'metropolis'
table = 'metro_gdf_2020'
city = 'Monterrey'
# Select every row of the table whose `city` column equals the chosen city
query = f"SELECT * FROM {schema}.{table} WHERE city = \'{city}\'"

# Run the query through the project helper and reproject the result to EPSG:4326
gdf = aup.gdf_from_query(query)
gdf = gdf.to_crs('EPSG:4326')

In [3]:
# Municipality under analysis and the metropolitan area it belongs to
# (metropolitan_area_name is used later to filter the raster-analysis tables)
city_name = 'Apodaca'
metropolitan_area_name = 'Monterrey'
# Keep only the row(s) whose NOMGEO matches the municipality name
gdf_mun = gdf[gdf.NOMGEO == city_name].copy()
print(gdf_mun.shape)
gdf_mun.head(2)

(1, 6)


Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,NOMGEO,geometry,city
0,19006,19,6,Apodaca,"POLYGON ((-100.24794 25.86462, -100.24672 25.8...",Monterrey


## DENUE to Spatial KDE

Download DENUE data

In [4]:
 # Download DENUE data
schema = "denue"
table = "denue_2022"

aup.log(f"Downloading DENUE data for {city_name}")

# Fetch the DENUE records for the municipality polygon; the exact spatial
# filter is implemented inside aup.gdf_from_polygon and is not visible here
denue_gdf = aup.gdf_from_polygon(gdf_mun, schema, table)

# Keep only the columns used downstream: the administrative codes that are
# later concatenated into the block key, the activity code, the per_ocu
# field and the geometry
denue_gdf = denue_gdf[['cve_ent','cve_mun','cve_loc',
                       'ageb','manzana',
                       'codigo_act','per_ocu','geometry']].copy()

aup.log(f"Downloaded {denue_gdf.shape[0]} rows")

Download census data

In [5]:
# Download Census data
schema = "sociodemografico"
table = "censo_inegi_20_mza"

# Dissolve the municipality layer into a single geometry and take its WKT text
poly_geom = gdf_mun.dissolve().geometry.iloc[0]
poly_wkt = poly_geom.wkt  # This one is a plain string

aup.log(f"Downloading Census data for {city_name}")

# Original note: "query that returns WKT instead of native geometry".
# NOTE(review): the SELECT below requests the "geometry" column as-is, so the
# note may be stale -- confirm what aup.gdf_from_query actually receives.
# The WHERE clause keeps blocks intersecting the municipality polygon (SRID 4326).
query_censo = f"""
SELECT
"cvegeo_mza",
"pobtot","geometry" FROM {schema}.{table}
WHERE ST_Intersects(geometry, \'SRID=4326;{poly_wkt}\')
"""

block_gdf = aup.gdf_from_query(query_censo)
# Declare the CRS of the downloaded blocks (no reprojection happens here)
block_gdf = block_gdf.set_crs("EPSG:4326")

aup.log(f"Downloaded {block_gdf.shape[0]} rows")

Change crs

In [6]:
# Reproject the municipality, DENUE and census-block layers to EPSG:6372
# in a single statement so the three layers always share the same CRS
gdf_mun, denue_gdf, block_gdf = (
    layer.to_crs("EPSG:6372") for layer in (gdf_mun, denue_gdf, block_gdf)
)

Calculate DENUE data

In [7]:
def _code_prefix(column, width):
    """Return the first `width` characters of a DENUE column cast to string."""
    return denue_gdf[column].astype(str).str[:width]


# Classify each establishment and convert its per_ocu value with the project helpers
denue_gdf['tipo_act'] = denue_gdf['codigo_act'].apply(prediction_functions.asignar_tipo)
denue_gdf['per_ocu_num'] = denue_gdf['per_ocu'].apply(prediction_functions.number_of_jobs)

# Create block CVEGEO column from denue: state(2) + municipality(3) +
# locality(4) + AGEB(4) + block(3) characters, concatenated in that order
denue_gdf['cvegeo_mza'] = (
    _code_prefix('cve_ent', 2)
    + _code_prefix('cve_mun', 3)
    + _code_prefix('cve_loc', 4)
    + _code_prefix('ageb', 4)
    + _code_prefix('manzana', 3)
)

aup.log("Created CVEGEO column in DENUE")

Merge Census and DENUE data

In [8]:
# Copy of the block layer carrying each block's centroid as an extra column
block_cnt = block_gdf.assign(cnt_geometry=block_gdf['geometry'].centroid)

# Inner join on the block key: DENUE rows without a matching census block are dropped
denue_gdf = pd.merge(
    denue_gdf,
    block_cnt[['cvegeo_mza', 'cnt_geometry']],
    how='inner',
    on='cvegeo_mza',
)

aup.log("Merged block centroid geometry to DENUE gdf")


Calculate distance from blocks to DENUE

In [9]:
# Distance from every DENUE point to the centroid of the block it was matched to
denue_gdf['distancia'] = denue_gdf['geometry'].distance(denue_gdf['cnt_geometry'])

aup.log("Calculated distance from each point to its block centroid")

# Mean point-to-centroid distance per block, stored as column d_mean
denue_to_cnt = (
    denue_gdf
    .groupby('cvegeo_mza')['distancia']
    .mean()
    .rename('d_mean')
    .reset_index()
)

Calculate KDE

In [None]:
# Attach the per-block mean distance (d_mean) to every DENUE record
denue_gdf = denue_gdf.merge(denue_to_cnt, on='cvegeo_mza')

aup.log("Added average distance to each block centroid")

# Folder handed to the worker function; what it writes there is defined in
# prediction_functions.process_block_activities
output_dir = '../../data/processed/prediccion_uso_suelo/monterrey/kde_output/'

aup.log("Executing KDE activities by block in parallel")

# One delayed task per census-block row, dispatched to 16 workers
kde_task = delayed(prediction_functions.process_block_activities)
results = Parallel(n_jobs=16, verbose=1)(
    kde_task(idx, manzana, denue_gdf, output_dir)
    for idx, manzana in tqdm(block_gdf.iterrows(), desc="Preparing tasks")
)

# Release the worker output, the DENUE data, the DENUE-to-centroid relation
# and the census blocks -- none of them is reused under these names
del results, denue_gdf, denue_to_cnt, block_gdf

aup.log("Finished executing KDE activities by block in parallel and deleted variables")

Preparing tasks: 0it [00:00, ?it/s][Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
Preparing tasks: 48it [00:01, 27.94it/s][Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.8s
Preparing tasks: 192it [00:05, 38.83it/s][Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    5.1s
Preparing tasks: 448it [00:10, 47.86it/s][Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:   10.6s
Preparing tasks: 784it [00:18, 47.82it/s][Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:   18.5s
Preparing tasks: 1248it [00:29, 29.36it/s][Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:   29.8s
Preparing tasks: 1792it [00:42, 46.08it/s][Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:   42.4s
Preparing tasks: 2448it [01:06, 34.53it/s][Parallel(n_jobs=16)]: Done 2418 tasks      | elapsed:  1.1min
Preparing tasks: 2768it [01:19, 41.77it/s]

## Create area of prediction

Read building data

In [15]:
aup.log("Reading building data")

# Switch for loading the building footprints from the local CSV
read_data = True

if read_data:
    # Geometries are stored as WKT text in the CSV, so parse them first
    bld_gdf = pd.read_csv('../../data/processed/prediccion_uso_suelo/867_buildings.csv')
    bld_gdf['geometry'] = bld_gdf['geometry'].map(wkt.loads)
    # Declare the source CRS and reproject to EPSG:6372 in one chain
    bld_gdf = gpd.GeoDataFrame(bld_gdf, crs='epsg:4326').to_crs("EPSG:6372")

Clip building municipality data

In [16]:
# Make sure the municipality is in the same CRS as the buildings before clipping
gdf_mun = gdf_mun.to_crs("EPSG:6372")
bld_clip = gpd.clip(bld_gdf, gdf_mun)

# Keep the original (unclipped) footprint of every building that appears in the clip
bld_gdf = bld_gdf[bld_gdf['full_plus_code'].isin(bld_clip['full_plus_code'].unique())].copy()

aup.log(f"Finished reading and clipping building data with {len(bld_gdf)} buildings")

Download block data

In [17]:
# Block polygons intersecting the municipality (poly_wkt comes from an earlier cell)
schema = 'marco'
table = 'mza_2020'
query = f"SELECT * FROM {schema}.{table} WHERE ST_Intersects(geometry, 'SRID=4326;{poly_wkt}')"

# Download, declare EPSG:4326, reproject to EPSG:6372 and keep only key + geometry
block_gdf = (
    aup.gdf_from_query(query, geometry_col='geometry')
    .set_crs("EPSG:4326")
    .to_crs("EPSG:6372")
    [['CVEGEO','geometry']]
    .copy()
)

Building data to blocks

In [18]:
# Intersect building footprints with block polygons so every resulting
# piece carries the CVEGEO of the block it falls in
bld_block = bld_gdf.overlay(block_gdf, how='intersection')

aup.log(f"Finished overlaying building data with {len(bld_block)} buildings")

Create area of prediction

In [20]:
# Blocks that contain at least one building piece
unique_cvegeos = bld_block['CVEGEO'].unique()

aup.log(f"Create building tesselation with parallel processing")

# One tessellation task per block, dispatched to 20 workers
tessellate_block = delayed(prediction_functions.building_tesselation)
results = Parallel(n_jobs=20, verbose=1)(
    tessellate_block(cvegeo, block_gdf, bld_block)
    for cvegeo in tqdm(unique_cvegeos, desc="Processing blocks")
)

# A worker may return None; for the rest, the first element of the returned
# value is the frame that gets stacked into a single table
valid_results = [
    block_result[0]
    for block_result in results
    if block_result is not None
]
tess_gdf = pd.concat(valid_results, ignore_index=True)

aup.log(f"Finished creating building tesselation with {len(tess_gdf)} polygons")


Processing blocks:   0%|                                                                                                                                                              | 0/7367 [00:00<?, ?it/s][A[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.

Processing blocks:   1%|▊                                                                                                                                                    | 40/7367 [00:02<06:11, 19.74it/s][A[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    3.8s

Processing blocks:   1%|█▏                                                                                                                                                   | 60/7367 [00:05<12:05, 10.08it/s][A
Processing blocks:   1%|█▌                                                                                                                                                   | 80/7367 [00:08<15:02,  8.07it/s][A
Processing block

In [21]:
# Area of every block polygon, measured in EPSG:6372
block_gdf['block_area_m2'] = block_gdf.to_crs("EPSG:6372").area

# Attach the block area to each tessellation polygon, then make sure the
# result is a GeoDataFrame with EPSG:6372 declared
tess_gdf = gpd.GeoDataFrame(
    tess_gdf.merge(block_gdf[['CVEGEO','block_area_m2']], on='CVEGEO')
).set_crs("EPSG:6372")

# Rename the building-area column and expose a fresh 0..n-1 row number as 'fid'
tess_gdf = (
    tess_gdf
    .rename(columns={'area_in_meters':'bld_area_m2'})
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index':'fid'})
)

# Polygon area is stored twice, under the two names used downstream
tess_gdf['area_m2'] = tess_gdf.area
tess_gdf['pred_area_m2'] = tess_gdf['area_m2']

# Share of the block covered by the polygon, and of the polygon covered by its building
tess_gdf['pred_area_pct'] = tess_gdf['pred_area_m2'] / tess_gdf['block_area_m2']
tess_gdf['bld_pred_area_pct'] = tess_gdf['bld_area_m2'] / tess_gdf['pred_area_m2']

tess_gdf.to_file('../../data/processed/prediccion_uso_suelo/monterrey/area_of_prediction.gpkg')

aup.log(f"Finished saving building tesselation to file")

# tess_gdf and block_gdf are still needed by the following cells; only the
# intermediate objects are released here
del bld_clip, results, valid_results

## Spatial KDE to Area of Prediction

In [23]:
aup.log(f"Starting spatial KDE to area of prediction")

# Folder that holds the per-block KDE rasters
kde_dir = '../../data/processed/prediccion_uso_suelo/monterrey/kde_output/'

# Continue under the area-of-prediction name and drop the tessellation reference
aop_gdf = tess_gdf.copy()
del tess_gdf

# Guarantee a 'fid' identifier column, derived from the index when absent
if 'fid' not in aop_gdf.columns:
    aop_gdf = aop_gdf.reset_index().rename(columns={'index':'fid'})

Download census data

In [24]:
# Census table to query
schema = "sociodemografico"
table = "censo_inegi_20_mza"

# Municipality outline as WKT in EPSG:4326, used as the spatial filter
poly_wkt = gdf_mun.to_crs("EPSG:4326").dissolve().geometry.to_wkt()[0]

# Block key, total population and geometry of every block intersecting the outline
query_censo = f"""
SELECT
"cvegeo_mza",
"pobtot","geometry" FROM {schema}.{table}
WHERE ST_Intersects(geometry, \'SRID=4326;{poly_wkt}\')
"""

block_gdf = aup.gdf_from_query(query_censo)

# Add block population to each area-of-prediction polygon; the join is the
# default inner merge, and the duplicated key column is dropped afterwards
aop_data = aop_gdf.merge(
    block_gdf[['cvegeo_mza','pobtot']],
    left_on='CVEGEO',
    right_on='cvegeo_mza',
).drop(columns=['cvegeo_mza'])

Execute merge algorithm

In [25]:
# Execute in single process
#
# For every block, sample each of its KDE rasters over the area-of-prediction
# polygons of that block and store the mean value in a column named after the
# activity encoded in the raster file name.

cvegeo_list = list(aop_gdf.CVEGEO.unique())

aup.log(f"Starting spatial KDE to area of prediction")

# The folder listing does not change during the loop: read it once instead of
# once per block, and drop the complementary '.aux.xml' raster files up front
kde_files = [name for name in os.listdir(kde_dir) if not name.endswith('.aux.xml')]

# Per-block frames are collected here and concatenated once at the end
# (concatenating inside the loop copies the accumulated frame every iteration)
aop_parts = []

for cvegeo in tqdm(cvegeo_list, total=len(cvegeo_list), desc="Processing blocks"):
    kde_block = f'kde_mnz_{cvegeo}'
    aop_tmp = aop_gdf.loc[aop_gdf.CVEGEO==cvegeo].copy()

    for filename in kde_files:

        # gather only the rasters corresponding to this block
        if not filename.startswith(kde_block):
            continue

        # activity name = file name without block prefix and extension, lower-cased
        kde_act = filename.replace(kde_block+'_',"").replace('.tif',"").lower()

        # open the raster in a context manager so the dataset handle is closed
        # as soon as the polygons have been sampled
        with ro.open(os.path.join(kde_dir, filename)) as raster_kde:
            aop_tmp[kde_act] = aop_tmp.geometry.apply(
                lambda geom: aup.clean_mask(geom, raster_kde)
            ).apply(np.ma.mean)

    aop_parts.append(aop_tmp)

# Fall back to an empty GeoDataFrame when there was nothing to process
aop_kde = pd.concat(aop_parts) if aop_parts else gpd.GeoDataFrame()

# Polygons without a raster for a given activity get density 0
aop_kde = aop_kde.fillna(0)

aup.log(f"Finished spatial KDE to area of prediction")


Processing blocks:   0%|                                                                                                                                                              | 0/7357 [00:00<?, ?it/s][A
Processing blocks:   0%|▏                                                                                                                                                     | 7/7357 [00:00<02:23, 51.22it/s][A
Processing blocks:   0%|▎                                                                                                                                                    | 13/7357 [00:00<02:51, 42.76it/s][A
Processing blocks:   0%|▎                                                                                                                                                    | 18/7357 [00:00<03:44, 32.65it/s][A
Processing blocks:   0%|▍                                                                                                                                  

Transform density data to usage values

In [26]:

# Activity (land-use) columns expected from the KDE step
uso_list = ['agropecuario','industria',
    'servicios','alojamiento','comercio',
    'cultural_recreativo','educacion','salud','gobierno','otros']

# Normalise the accented spelling BEFORE selecting columns; doing it after the
# merge (as before) could never take effect because the selection below would
# already have failed on the missing 'educacion' column
if 'educación' in aop_kde.columns and 'educacion' not in aop_kde.columns:
    aop_kde = aop_kde.rename(columns={'educación':'educacion'})

# An activity that had no raster in any block never became a column; treat it
# as zero density, consistent with the fillna(0) applied to aop_kde
for us in uso_list:
    if us not in aop_kde.columns:
        aop_kde[us] = 0

# Attach the activity densities to the polygons through their identifier
aop_data = aop_data.merge(aop_kde[['fid'] + uso_list], on='fid')

# Distribute block population to polygons proportionally to polygon area
aop_area = aop_data[['CVEGEO','area_m2']].groupby('CVEGEO').sum().reset_index().rename(columns={'area_m2':'area_m2_tot'})
aop_data = aop_data.merge(aop_area, on='CVEGEO')
aop_data['pobtot_relative'] = aop_data['pobtot'] * (aop_data['area_m2']/aop_data['area_m2_tot'])

# The area-weighted population is the residential ("habitacional") signal
aop_data = aop_data.rename(columns={'pobtot_relative':'habitacional'})

# Share of each activity in the total activity density of the polygon
# NOTE(review): 'habitacional' is not part of uso_list, so it gets no pct_ column -- confirm intended
aop_data['uso_tot'] = aop_data[uso_list].sum(axis=1)

for us in uso_list:
    aop_data['pct_'+us] = aop_data[us]/aop_data['uso_tot']

# 0/0 shares (polygons with no activity at all) become 0
aop_data = aop_data.fillna(0)

aop_data.to_file('../../data/processed/prediccion_uso_suelo/monterrey/area_of_prediction_kde.gpkg')

del aop_kde
del block_gdf

aup.log("Finished processing KDE to Area of Prediction")

## Environmental data

In [69]:
aup.log("Started processing environmental data")

# Continue the pipeline under the name `gdf` (this rebinds the name used
# earlier for the metropolitan-area layer) and release the previous reference
gdf = aop_data.copy()
del aop_data

NDVI data

In [70]:
aup.log("Downloading NDVI data")

# Table with NDVI statistics per hexagon
schema = 'raster_analysis'
table = 'ndvi_analysis_hex'
res = 11  # value matched against the table's "res" column -- presumably the hexagon resolution, TODO confirm

# Hexagon id and mean NDVI for the metropolitan area at the chosen resolution
query = f'SELECT hex_id,ndvi_mean FROM {schema}.{table} WHERE \"city\" = \'{metropolitan_area_name}\' and \"res\"={res}'

# Despite the _gdf suffix this is loaded with aup.df_from_query and no geometry is selected
ndvi_gdf = aup.df_from_query(query)

aup.log(f"Downloaded {len(ndvi_gdf)} rows")

NDMI data

In [71]:
aup.log("Downloading NDMI data")

# Table with NDMI statistics per hexagon
schema = 'raster_analysis'
table = 'ndmi_analysis_hex'
res = 11  # value matched against the table's "res" column -- presumably the hexagon resolution, TODO confirm

# Hexagon id and the ndmi_diff column for the metropolitan area at the chosen resolution
query = f'SELECT hex_id,ndmi_diff FROM {schema}.{table} WHERE \"city\" = \'{metropolitan_area_name}\' and \"res\"={res}'

# Despite the _gdf suffix this is loaded with aup.df_from_query and no geometry is selected
ndmi_gdf = aup.df_from_query(query)

aup.log(f"Downloaded {len(ndmi_gdf)} rows")

Temperature data

In [72]:
aup.log("Downloading Temperature data")

# Table with temperature statistics per hexagon; this one also brings the geometry
schema = 'raster_analysis'
table = 'temperature_analysis_hex'
res = 11

query = f'SELECT hex_id,temperature_mean,geometry FROM {schema}.{table} WHERE \"city\" = \'{metropolitan_area_name}\' and \"res\"={res}'

temp_gdf = aup.gdf_from_query(query, geometry_col='geometry')

aup.log(f"Downloaded {len(temp_gdf)} rows")

# Discard hexagons whose mean temperature is +inf, then express temperature as
# (mean over the remaining hexagons) - (hexagon value) and drop the raw column
temp_gdf = temp_gdf.loc[temp_gdf['temperature_mean'] != float('inf')].copy()
temp_gdf['temperature_mean_diff'] = (
    temp_gdf['temperature_mean'].mean() - temp_gdf['temperature_mean']
)
temp_gdf = temp_gdf.drop(columns='temperature_mean')

Environmental data to area of prediction

In [74]:
# Join the three environmental tables on the hexagon id (default inner joins,
# so only hexagons present in all three survive)
env_gdf = temp_gdf.merge(ndvi_gdf, on='hex_id').merge(ndmi_gdf, on='hex_id')

aup.log("Merged environment data")

del ndvi_gdf, ndmi_gdf, temp_gdf

# Declare the hexagons as EPSG:4326 and reproject to the working CRS
env_gdf = env_gdf.set_crs("EPSG:4326").to_crs("EPSG:6372")

# Intersect polygons with hexagons and average the environmental values of
# all pieces that share the same full_plus_code
gdf_int = (
    gdf.overlay(env_gdf, how='intersection')
    [['full_plus_code','temperature_mean_diff','ndvi_mean','ndmi_diff']]
    .groupby('full_plus_code')
    .mean()
    .reset_index()
)

# Default inner merge: polygons without any intersecting hexagon are dropped
gdf = gdf.merge(gdf_int, on='full_plus_code')

  return geopandas.overlay(


Save output

In [75]:
# Persist the polygons enriched with the environmental columns
gdf.to_file('../../data/processed/prediccion_uso_suelo/monterrey/area_of_prediction_env.gpkg')

# The intermediate overlay result and the hexagon layer are no longer needed
del gdf_int
del env_gdf

aup.log(f"Finished transfering environmental data to {len(gdf)} polygons")

## Distance to roads

Download edges

In [76]:
# DISTANCE TO ROADS

aup.log("Started processing distance to roads")

# Continue under the area-of-prediction name and release the `gdf` reference
aop_gdf = gdf.copy()
del gdf

aup.log("Downloading edges data")

schema = "osmnx"
table = "edges_osmnx_23_line"

# NOTE(review): gdf_mun was reprojected to EPSG:6372 in earlier cells, whereas
# the DENUE download called this helper with an EPSG:4326 layer -- confirm that
# aup.gdf_from_polygon reprojects its input before building the spatial filter
edges = aup.gdf_from_polygon(gdf_mun, schema, table)

aup.log(f"Downloaded {len(edges)} edges")

Edges preprocessing

In [77]:
aup.log("Processing edges data")


def _first_highway(value):
    """Collapse a list-valued highway tag to its first entry; pass other values through."""
    return value[0] if isinstance(value, list) else value


# 1. let the project helper normalise the raw tag
# 2. keep only the first class of multi-valued tags
# 3. fold '<class>_link' ramps into their parent class
# Done element-wise over the whole column, so the cell also works when no row
# holds a list and when the column contains missing values (the previous
# boolean-mask version could not index with a mask containing NaN)
edges['highway'] = (
    edges['highway']
    .apply(prediction_functions.check_for_lists)
    .map(_first_highway)
    .str.replace('_link', '', regex=False)
)

# Road classes with a dedicated category
road_list = ['motorway','primary','secondary',
            'tertiary','residential','living_street']

# Category -> highway values; every value outside road_list falls into 'other'
road_dict = {
    'motorway':['motorway'],
    'primary':['primary'],
    'secondary':['secondary'],
    'tertiary':['tertiary'],
    'residential':['residential','living_street'],
    'other':[road for road in edges['highway'].unique() if road not in road_list]
}

aup.log(f"Preprocessed edges")

Prepare data for proximity analysis

In [78]:
# The proximity rasters are built in geographic coordinates
edges = edges.to_crs("EPSG:4326")
aop_gdf = aop_gdf.to_crs("EPSG:4326")

pixel_size = 0.00023 # 0.00023° -> 25m

output_dir = '../../data/processed/prediccion_uso_suelo/monterrey/prox_vialidades/'

# Guarantee a 'fid' identifier column, derived from the index when absent
if 'fid' not in list(aop_gdf.columns):
    aop_gdf = aop_gdf.reset_index().rename(columns={'index':'fid'})

# define bounds according to area of prediction:
# (minx, miny, maxx, maxy) of the whole layer, widened by 0.05 degrees on every
# side. total_bounds computes the extent once, instead of rebuilding the
# per-geometry bounds table for every coordinate, and float() yields plain
# Python numbers without exception-driven fallbacks.
bounds = tuple(
    float(value) + offset
    for value, offset in zip(aop_gdf.total_bounds, (-0.05, -0.05, 0.05, 0.05))
)

Distance calculation

In [82]:
aup.log("Starting parallel processing for proximity to roads")

# One result table per road class; the classes are processed one after another.
# NOTE(review): the recorded run of this cell ended with
# "NameError: name 'rasterio' is not defined". This notebook only binds the
# alias `ro`, so the failing reference is presumably inside
# prediction_functions -- confirm there.
results = []
for road_class, road_type in tqdm(road_dict.items(), total=len(road_dict), desc="Processing blocks"):
    road_distances = prediction_functions.road_type_to_area_of_prediction(
        aop_gdf, edges, road_class, road_type, pixel_size, bounds, output_dir
    )
    results.append(road_distances)

# Attach every result table to the polygons; the left join keeps all polygons
for results_df in results:
    aop_gdf = aop_gdf.merge(results_df, on='fid', how='left')

aup.log("Finished parallel processing for proximity to roads")

Processing blocks:   0%|                                                                                                                                                                                                     | 0/6 [00:00<?, ?it/s]

Tree build time: 0.0022 seconds


Processing blocks:   0%|                                                                                                                                                                                                     | 0/6 [00:15<?, ?it/s]

Distance calc run time: 15.0931 seconds





NameError: name 'rasterio' is not defined

Clean up output

In [None]:
#########################################
# Final step of the road-proximity stage: normalise the distance columns, save
# the layer and release the intermediate objects.
#
# The previous guard tested for 'path_distance_y', a column that none of the
# road classes handled below can produce, so the save/cleanup never ran.

# Distance columns, one per road class (names taken from the rename mapping
# this cell has always used -- confirm against prediction_functions)
distance_cols = ['motorway_distance', 'primary_distance', 'secondary_distance',
                 'tertiary_distance', 'residential_distance', 'other_distance']

# Merging the results more than once leaves suffixed duplicates (_x / _y);
# keep the first copy under its plain name and discard the second
duplicated_cols = [f'{col}_y' for col in distance_cols if f'{col}_y' in aop_gdf.columns]
if duplicated_cols:
    aop_gdf = aop_gdf.drop(columns=duplicated_cols)
    aop_gdf = aop_gdf.rename(columns={f'{col}_x': col for col in distance_cols})

missing_cols = [col for col in distance_cols if col not in aop_gdf.columns]
if missing_cols:
    # The proximity step did not complete; do not write a layer without its distances
    aup.log(f"Skipped saving proximity to roads, missing columns: {missing_cols}")
else:
    aop_gdf.to_file('../../data/processed/prediccion_uso_suelo/monterrey/area_of_prediction_roads.gpkg')

    del edges
    del results

    aup.log("Finished processing proximity to roads")