# MODELADO

## LSTM


<hr>

<code> **Proyecto de Datos II** </code>

## Índice

- [Importación de los datos](#importación-de-los-datos)
- [Preprocesamiento](#preprocesamiento)
- [Entrenamiento](#entrenamiento)
- [Análisis del modelo](#análisis-del-modelo)
- [Registro del modelo en MLflow](#registro-del-modelo-en-mlflow)


In [1]:
import time
import mlflow
import pandas as pd
from evaluation.evaluator import Evaluator

SEED = 22 # replicabilidad

# =====================================
MODEL_NAME = "LSTM" 
# =====================================

## Importación de los datos

In [11]:
# Iniciamos la sesión de spark
import findspark
findspark.init()

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark en local") \
    .config("spark.master", "local[*]") \
    .config("spark.hadoop.fs.defaultFS", "file:///") \
    .config("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse") \
    .config("spark.driver.extraJavaOptions", "-Dderby.system.home=/tmp/derby") \
    .getOrCreate()

sc = spark.sparkContext

25/04/26 18:34:55 WARN Utils: Your hostname, neutron.local resolves to a loopback address: 127.0.0.1; using 10.8.63.80 instead (on interface en0)
25/04/26 18:34:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/26 18:34:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [52]:
df_train = spark.read.parquet("/Users/maria/Downloads/train_spark.parquet")
df_test = spark.read.parquet("/Users/maria/Downloads/test_spark.parquet")

# Quitamos las columnas de ICAO, Callsign y Timestamp
col_to_drop = ['timestamp', 'icao', 'callsign']
df_train2 = df_train.drop(*col_to_drop)
df_test2 = df_test.drop(*col_to_drop)

# Separamos las variables de la variable objetivo
X_train, y_train = df_train2.drop("takeoff_time"), df_train2.select("takeoff_time")
X_test, y_test = df_test2.drop("takeoff_time"), df_test2.select("takeoff_time") 

In [15]:
(X_train.count(), len(X_train.columns)), (X_test.count(), len(X_test.columns))

((123733, 58), (27791, 58))

## Preprocesamiento

In [35]:
# =====================================
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StringIndexer

# 1. Crear indexadores para las columnas categóricas (tipo string)
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid="keep")
    for col in X_train.columns
    if str(X_train.schema[col].dataType) == 'StringType()'
]

# 2. Definir columnas de entrada para el ensamblador
# (indexadas si son categóricas, originales si son numéricas)
assembler_inputs = [
    f"{col}_index" if str(X_train.schema[col].dataType) == 'StringType()' else col
    for col in X_train.columns
]

# 3. Construir el pipeline: indexación -> ensamblado -> escalado
pipeline = Pipeline(stages=[
    *indexers,
    VectorAssembler(inputCols=assembler_inputs, outputCol="features_raw"),
    MinMaxScaler(inputCol="features_raw", outputCol="features")
])

# 4. Ajustar el pipeline SOLO en X_train
pipeline_model = pipeline.fit(X_train)

# 5. Transformar X_train y X_test usando el mismo pipeline
X_train_prepared = pipeline_model.transform(X_train)
X_test_prepared = pipeline_model.transform(X_test)

# =====================================

## Entrenamiento

In [22]:
start_time = time.time()

# ========================================

import numpy as np
import joblib
import pandas as pd

from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

from scikeras.wrappers import KerasRegressor

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler


# 1. Convertir Spark DataFrame a NumPy para las características (ya preparado en X_train_prepared)
pdf_train = X_train_prepared.select("features").toPandas()
X = np.stack(pdf_train["features"].values)

# 2. Escalar la variable objetivo y (takeoff_time) usando MinMaxScaler
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.toPandas().values.reshape(-1, 1)).flatten()

# Redimensionar X para que coincida con la entrada de LSTM (muestras, pasos de tiempo, características)
X = X.reshape((X.shape[0], 1, X.shape[1]))

# 3. Definir el modelo LSTM
def build_model(units=64, dropout_rate=0.2, l2_reg=0.01):
    model = Sequential([
        LSTM(units, input_shape=(1, X.shape[2]), 
             kernel_regularizer=regularizers.l2(l2_reg)),
        BatchNormalization(), 
        Dropout(dropout_rate),
        Dense(1, kernel_regularizer=regularizers.l2(l2_reg))
    ])
    model.compile(optimizer='adam', loss='mse')
    return model


# 4. Definir el regressor usando KerasRegressor
regressor = KerasRegressor(
    model=build_model,
    units=64,
    dropout_rate=0.2,
    verbose=0
)

# 5. EarlyStopping para evitar sobreajuste
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# 6. Definir el grid de hiperparámetros para la búsqueda
param_grid = {
    "units": [32, 64],
    "dropout_rate": [0.2, 0.4],
    "epochs": [20, 40],
    "batch_size": [16, 32]
}

# 7. Definir la validación cruzada con TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

grid = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1, 
    n_jobs=-1 # Usa todos los núcleos disponibles
)

# 8. Entrenar el modelo con GridSearchCV
grid_result = grid.fit(X, y_train_scaled, callbacks=[early_stop])


# ========================================

end_time = time.time()
execution_time = end_time - start_time

                                                                                

Fitting 5 folds for each of 16 candidates, totalling 80 fits


  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)


  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)


  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)


In [23]:
# 1. Obtener el mejor modelo (el que tiene los mejores hiperparámetros)
best_model = grid_result.best_estimator_

# 2. Obtener los mejores hiperparámetros
best_params = grid_result.best_params_

# 3. Obtener el puntaje de validación del modelo elegido
best_score = grid_result.best_score_

In [24]:
print(execution_time)

6052.185297966003


In [34]:
best_model

In [None]:
KerasRegressor(
	model=<function build_model at 0x28b1afa60>
	build_fn=None
	warm_start=False
	random_state=None
	optimizer=rmsprop
	loss=None
	metrics=None
	batch_size=32
	validation_batch_size=None
	verbose=0
	callbacks=None
	validation_split=0.0
	shuffle=True
	run_eagerly=False
	epochs=20
	units=64
	dropout_rate=0.2
)

## Análisis del modelo

In [46]:
# ===============================================================

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# 1. Predicciones en el conjunto de entrenamiento
y_train_pred_scaled = grid_result.predict(X)  # Predicciones en escala normalizada
y_train_pred = y_scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1)).flatten()

# 3. Calcular MAE y RMSE en el conjunto de entrenamiento
y_train_np = y_train.toPandas().takeoff_time.to_numpy()
mae_train = mean_absolute_error(y_train_np, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train_np, y_train_pred))

mae_val = None
rmse_val = None
# ===============================================================

In [48]:
mae_train, rmse_train

(69.92971714447049, 105.18334969410066)

In [53]:
# ===============================================================
# Generar predicciones en test

# 1. Convertir el DataFrame de Spark de test a Pandas
pdf_test = X_test_prepared.select("features").toPandas()
X_test = np.stack(pdf_test["features"].values)

# 2. Redimensionar X_test para que coincida con la entrada de LSTM (muestras, pasos de tiempo, características)
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))  # (n_samples, 1, n_features)

# 3. Hacer las predicciones en el conjunto de test usando el mejor modelo
y_test_pred_scaled = grid_result.best_estimator_.predict(X_test)  

# 4. Desescalar las predicciones
y_test_pred = y_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()

# ===============================================================
df_test = df_test.toPandas()
df_test['prediction'] = y_test_pred

In [50]:
y_test_np = y_test.toPandas().takeoff_time.to_numpy()

mae_test = mean_absolute_error(y_test_np, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test_np, y_test_pred))
mae_test

71.46203526091307

In [54]:
# Nota: df_test tiene que tener la columna 'prediction'
ev = Evaluator(df_test, MODEL_NAME, mae_val, rmse_val)
report = ev.getReport()
ev.visualEvaluation()

In [43]:
report

{'global': {'mae': 80.59104347098604,
  'rmse': 106.47107933793535,
  'mse': 11336.090735384925,
  'r2': 0.04250677667514269,
  'mape': 48.33818510797021},
 'by_runway': {'32L/14R': {'mae': 100.50540708528338,
   'rmse': 131.2773149799497},
  '32R/14L': {'mae': 77.11424707775703, 'rmse': 101.26736771369758},
  '36L/18R': {'mae': 79.60461535735402, 'rmse': 105.294016872584},
  '36R/18L': {'mae': 70.43884122541853, 'rmse': 91.11007996910652}},
 'by_holding_point': {'K1': {'mae': 115.45991589946132,
   'rmse': 171.77260458717342},
  'K2': {'mae': 70.87230255734029, 'rmse': 84.6258790846638},
  'K3': {'mae': 151.99290313720704, 'rmse': 187.46256297590537},
  'LA': {'mae': 103.36139083344382, 'rmse': 135.8826663837165},
  'LB': {'mae': 97.80254079889954, 'rmse': 130.46898241341307},
  'LC': {'mae': 106.31957201687794, 'rmse': 151.8946835326238},
  'LE': {'mae': 98.29135130595385, 'rmse': 116.77872076485512},
  'Y1': {'mae': 67.31269910293803, 'rmse': 87.43701914214775},
  'Y2': {'mae': 79.0

### Influencia de las variables

In [None]:
# ===============================================================
# INFLUENCIA DE LAS VARIABLES
# En el caso de el modelo LSTM no se puede saber la influencia de cada variable
# ===============================================================

## Registro del modelo en MLflow

In [56]:
mlflow.set_tracking_uri("./mlflow_experiments")
mlflow.set_experiment("takeoff_time_prediction")

with mlflow.start_run():

    # - Datos generales -

    # ========================================================================
    mlflow.set_tag("model_type", MODEL_NAME)
    mlflow.set_tag("framework", "tensorflow.keras") # scikit-learn, tensorflow, etc.
    mlflow.set_tag("target_variable", "takeoff_time") # variable respuesta
    mlflow.set_tag("preprocessing", "StringIndexer+VectorAssembler+MinMaxScaler") # transformaciones separadas por un +
    mlflow.set_tag("dataset", "original") # indicar si se ha modificado el conjunto de datos
    mlflow.set_tag("seed", SEED) # semilla para replicabilidad
    # ========================================================================
    
    # - Hiperparámetros óptimos -
    
    # =====================================
    # AÑADIR HIPERPARÁMETROS
    best_params = grid_result.best_params_
    for param_name, param_value in best_params.items():
        mlflow.log_param(param_name, param_value)
        
    # Hiperparámetros que estaban fijos en este modelo
    mlflow.log_param("l2_reg", 0.01)
    
    mlflow.log_param("model", MODEL_NAME)
    # =====================================
    
    # - Métricas -

    mlflow.log_metric("execution_time_s", execution_time)

    #mlflow.log_metric("mae_val", mae_val)
    #mlflow.log_metric("rmse_val", rmse_val)

    mlflow.log_metric("mae_train", mae_train)
    mlflow.log_metric("rmse_train", rmse_train)

    # Registrar métricas globales en test
    for metric_name, value in report["global"].items():
        mlflow.log_metric(f"{metric_name}_test", value)
    
    # Registrar métricas por runway
    for runway, metrics in report["by_runway"].items():
        for metric_name, value in metrics.items():
            mlflow.log_metric(f"{metric_name}_test_runway_{runway}", value)
    
    # Registrar métricas por holding point
    for hp, metrics in report["by_holding_point"].items():
        for metric_name, value in metrics.items():
            mlflow.log_metric(f"{metric_name}_test_hp_{hp}", value)

    # - Modelo -

    # ========================================================================
    # NOTA - Dependiendo de con qué has hecho el modelo esto hay que cambiarlo
    mlflow.sklearn.log_model(grid_result.best_estimator_, MODEL_NAME)
    # ========================================================================
    

PermissionError: [Errno 13] Permission denied: '/Users/javimartinfuentes'

In [None]:
# - Visualizar experimentos -
# !mlflow ui --backend-store-uri ./mlflow_experiments