# Modelos Varios

En este notebook están los modelos:

+ CNN (Convolutional Neural Network)
+ Transformer
+ TCN (Temporal Convolutional Network)
+ GRU (Gated Recurrent Unit)
+ Wavenet
+ TabNet
+ Attention-Only
+ RNN (Recurrent Neural Network)

In [None]:
# Install required packages
%pip install --upgrade pip
%pip install polars numpy scikit-learn matplotlib joblib openpyxl fastexcel tensorflow tensorflow.keras

# For TensorFlow on Mac, you need to install tensorflow-macos
%pip install tensorflow-macos tensorflow-metal

In [None]:
# %%
import polars as pl
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.optimize import minimize
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense, Dropout, Input, Concatenate, BatchNormalization,
    Conv1D, MaxPooling1D, LayerNormalization, MultiHeadAttention,
    Add, GlobalAveragePooling1D, GRU, Activation, SimpleRNN, Bidirectional, TimeDistributed
)
from keras.saving import register_keras_serializable
import matplotlib.pyplot as plt
import os
from joblib import Parallel, delayed
from datetime import timedelta
import openpyxl

# Matplotlib backend configuration (original intent: avoid Tkinter errors).
# NOTE(review): 'TkAgg' is the Tk-based backend, so it presumably *requires*
# Tkinter rather than avoiding it — confirm the intended backend.
import matplotlib
matplotlib.use('TkAgg')

## Constantes

In [None]:
# Project root: two directory levels above the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
SUBJECTS_RELATIVE_PATH = "data/Subjects"
SUBJECTS_PATH = os.path.join(PROJECT_ROOT, SUBJECTS_RELATIVE_PATH)

# Output directories for figures and saved models (created if missing)
FIGURES_DIR = os.path.join(PROJECT_ROOT, "figures", "various_models")
os.makedirs(FIGURES_DIR, exist_ok=True)
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")
os.makedirs(MODELS_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order. The position of each file in
# this list becomes its subject index downstream, so sort the names to give
# every subject the same index on every run and machine.
subject_files = sorted(
    f for f in os.listdir(SUBJECTS_PATH)
    if f.startswith("Subject") and f.endswith(".xlsx")
)
print(f"Total sujetos: {len(subject_files)}")

## Preprocesamiento y Procesamiento de Datos

In [None]:
def get_cgm_window(bolus_time, cgm_df: pl.DataFrame, window_hours: int = 2,
                   window_size: int = 24) -> "np.ndarray | None":
    """
    Return the most recent CGM readings preceding a bolus.

    Readings whose "date" lies in [bolus_time - window_hours, bolus_time]
    (both ends inclusive) are sorted by date and the last `window_size`
    of them are kept.

    Parameters:
    -----------
    bolus_time : datetime
        Time of the insulin bolus
    cgm_df : pl.DataFrame
        CGM data with a "date" column and an "mg/dl" column
    window_hours : int, optional
        Length of the look-back window in hours (default: 2)
    window_size : int, optional
        Number of readings required in the window (default: 24)

    Returns:
    --------
    np.ndarray or None
        The "mg/dl" values of the last `window_size` readings, oldest first,
        or None when the window holds fewer than `window_size` readings
    """
    window_start = bolus_time - timedelta(hours=window_hours)
    in_window = (pl.col("date") >= window_start) & (pl.col("date") <= bolus_time)
    window = cgm_df.filter(in_window).sort("date").tail(window_size)

    # An incomplete window cannot feed the fixed-length model input
    if window.height < window_size:
        return None
    return window.get_column("mg/dl").to_numpy()

def calculate_iob(bolus_time, basal_df: pl.DataFrame, half_life_hours: float = 4.0) -> float:
    """
    Estimate the insulin on board (IOB) at the time of a bolus.

    Every basal entry whose interval [date, date + duration] contains
    `bolus_time` contributes `rate * (1 - elapsed_hours / half_life_hours)`,
    floored at zero. Note that this is a linear decay reaching zero after
    `half_life_hours`, not an exponential half-life.

    Parameters:
    -----------
    bolus_time : datetime
        Time of the insulin bolus
    basal_df : pl.DataFrame
        Basal insulin data with "date", "duration" and "rate" columns;
        "duration" is converted from milliseconds to hours. May be None.
    half_life_hours : float, optional
        Hours after which a basal entry no longer contributes (default: 4.0)

    Returns:
    --------
    float
        Sum of the contributions of all active basal entries; 0.0 when
        `basal_df` is None or empty
    """
    if basal_df is None or basal_df.is_empty():
        return 0.0

    iob = 0.0
    for row in basal_df.iter_rows(named=True):
        start_time = row["date"]
        duration_ms = row["duration"]
        # A row without a start or a duration cannot be placed on the
        # timeline; skip it instead of failing on None arithmetic.
        if start_time is None or duration_ms is None:
            continue

        duration_hours = duration_ms / (1000 * 3600)
        end_time = start_time + timedelta(hours=duration_hours)
        if not (start_time <= bolus_time <= end_time):
            continue

        # A missing rate falls back to 0.9
        rate = row["rate"] if row["rate"] is not None else 0.9
        time_since_start = (bolus_time - start_time).total_seconds() / 3600
        remaining = rate * (1 - (time_since_start / half_life_hours))
        iob += max(0.0, remaining)
    return iob

def process_subject(subject_path: str, idx: int) -> list:
    """
    Load one subject workbook and turn each usable bolus event into a feature record.

    A bolus event is kept only when `get_cgm_window` returns a window for
    its timestamp.

    Parameters:
    -----------
    subject_path : str
        Path to the subject's Excel file (sheets "CGM", "Bolus" and,
        when present, "Basal")
    idx : int
        Subject index; stored as `subject_id` in every record

    Returns:
    --------
    list
        One dict of features per kept bolus event; an empty list when the
        "CGM" or "Bolus" sheet cannot be read
    """
    print(f"Procesando {os.path.basename(subject_path)} ({idx+1}/{len(subject_files)})...")

    # The CGM and Bolus sheets are mandatory: any failure aborts this subject
    try:
        cgm_df = pl.read_excel(subject_path, sheet_name="CGM")
        bolus_df = pl.read_excel(subject_path, sheet_name="Bolus")
    except Exception as e:
        print(f"Error al cargar {os.path.basename(subject_path)}: {e}")
        return []

    # The Basal sheet is optional: any failure just disables the IOB input
    try:
        basal_df = pl.read_excel(subject_path, sheet_name="Basal")
    except Exception:
        basal_df = None

    # Cast the "date" column of every sheet to Datetime; CGM is also sorted
    date_as_datetime = pl.col("date").cast(pl.Datetime)
    cgm_df = cgm_df.with_columns(date_as_datetime)
    bolus_df = bolus_df.with_columns(date_as_datetime)
    if basal_df is not None:
        basal_df = basal_df.with_columns(date_as_datetime)
    cgm_df = cgm_df.sort("date")

    def _or_default(value, default):
        # Replace a missing (None) cell with its fallback
        return default if value is None else value

    processed_data = []
    for event in bolus_df.iter_rows(named=True):
        bolus_time = event["date"]
        cgm_window = get_cgm_window(bolus_time, cgm_df)
        if cgm_window is None:
            continue

        iob = calculate_iob(bolus_time, basal_df)
        hour_of_day = bolus_time.hour / 23.0
        # A missing glucose input falls back to the latest CGM reading
        bg_input = _or_default(event["bgInput"], cgm_window[-1])
        normal = _or_default(event["normal"], 0.0)

        # Custom sensitivity factor: excess over 100 per unit of bolus,
        # with 50.0 as the fallback
        if normal > 0 and bg_input > 100:
            isf_custom = (bg_input - 100) / normal
        else:
            isf_custom = 50.0

        processed_data.append({
            'subject_id': idx,
            'cgm_window': cgm_window,
            'carbInput': _or_default(event["carbInput"], 0.0),
            'bgInput': bg_input,
            'insulinCarbRatio': _or_default(event["insulinCarbRatio"], 10.0),
            'insulinSensitivityFactor': isf_custom,
            'insulinOnBoard': iob,
            'hour_of_day': hour_of_day,
            'normal': normal
        })

    return processed_data

# Process every subject file in parallel, one job per file, on all cores
all_processed_data = Parallel(n_jobs=-1)(
    delayed(process_subject)(os.path.join(SUBJECTS_PATH, file_name), position)
    for position, file_name in enumerate(subject_files)
)

# Flatten the per-subject lists into a single list of records
all_processed_data = [
    record
    for subject_records in all_processed_data
    for record in subject_records
]

# Build the combined DataFrame and show a summary
df_processed = pl.DataFrame(all_processed_data)
print("Muestra de datos procesados combinados:")
print(df_processed.head())
print(f"Total muestras: {len(df_processed)}")

### División de Ventana CGM y Valores Nulos

In [None]:
# Expand each record's CGM window into 24 scalar columns cgm_0 .. cgm_23
cgm_columns = [f'cgm_{i}' for i in range(24)]
cgm_values = {}
for position, column_name in enumerate(cgm_columns):
    cgm_values[column_name] = [
        record['cgm_window'][position] for record in all_processed_data
    ]
cgm_schema = dict.fromkeys(cgm_columns, pl.Float64)
df_cgm = pl.DataFrame(cgm_values, schema=cgm_schema)

# Put the CGM columns side by side with the remaining features
remaining_features = df_processed.drop('cgm_window')
df_processed = pl.concat([df_cgm, remaining_features], how="horizontal")

# Report null counts per column, then drop every row containing a null
print("Verificación de valores nulos en df_processed:")
print(df_processed.null_count())
df_processed = df_processed.drop_nulls()

### Normalización de Datos

In [None]:
# Feature scalers: min-max to [0, 1] for CGM, standardization for the rest.
# NOTE(review): both scalers are fitted on the full dataset, before the
# train/validation/test split made later — confirm this is intended.
scaler_cgm = MinMaxScaler(feature_range=(0, 1))
scaler_other = StandardScaler()

# Scale CGM and append a trailing feature axis: (samples, timesteps, 1)
cgm_matrix = df_processed.select(cgm_columns).to_numpy()
x_cgm = np.expand_dims(scaler_cgm.fit_transform(cgm_matrix), axis=-1)

# Scale the remaining features (hour_of_day and subject_id included)
other_features = ['carbInput', 'bgInput', 'insulinOnBoard', 'insulinCarbRatio', 
                  'insulinSensitivityFactor', 'subject_id', 'hour_of_day']
other_matrix = df_processed.select(other_features).to_numpy()
x_other = scaler_other.fit_transform(other_matrix)

# Regression target: the delivered bolus
y = df_processed.get_column('normal').to_numpy()

# Report NaN counts and abort if any array contains one
nan_cgm = np.isnan(x_cgm).sum()
nan_other = np.isnan(x_other).sum()
nan_y = np.isnan(y).sum()
print("NaN en x_cgm:", nan_cgm)
print("NaN en x_other:", nan_other)
print("NaN en y:", nan_y)
if nan_cgm + nan_other + nan_y > 0:
    raise ValueError("Valores NaN detectados en x_cgm, x_other o y")

### División por Sujeto de los Datos

In [None]:
# Split by subject so no subject contributes samples to more than one set.
# `unique()` does not guarantee the order of its result, so sort the ids:
# otherwise the same random_state could yield different splits between runs.
subject_ids = df_processed.get_column('subject_id').unique().sort().to_numpy()
# 80% of subjects for training; the remaining 20% halved into validation/test
train_subjects, temp_subjects = train_test_split(subject_ids, test_size=0.2, random_state=42)
val_subjects, test_subjects = train_test_split(temp_subjects, test_size=0.5, random_state=42)

### Creación de Máscaras

In [None]:
# Boolean row masks selecting the samples of each subject group
subject_column = df_processed.get_column('subject_id')
train_mask = subject_column.is_in(train_subjects).to_numpy()
val_mask = subject_column.is_in(val_subjects).to_numpy()
test_mask = subject_column.is_in(test_subjects).to_numpy()

# Apply the masks to every array
x_cgm_train, x_other_train, y_train = x_cgm[train_mask], x_other[train_mask], y[train_mask]
x_cgm_val, x_other_val, y_val = x_cgm[val_mask], x_other[val_mask], y[val_mask]
x_cgm_test, x_other_test, y_test = x_cgm[test_mask], x_other[test_mask], y[test_mask]

# Subject id of every test sample, in row order
subject_test = subject_column.to_numpy()[test_mask]

print(f"Entrenamiento CGM: {x_cgm_train.shape}, Validación CGM: {x_cgm_val.shape}, Prueba CGM: {x_cgm_test.shape}")
print(f"Entrenamiento Otros: {x_other_train.shape}, Validación Otros: {x_other_val.shape}, Prueba Otros: {x_other_test.shape}")
print(f"Sujetos de prueba: {test_subjects}")

## Modelos

### Constantes

In [None]:
# Model configuration parameters.
# One dictionary per architecture; each name is defined exactly once
# (GRU_CONFIG used to appear twice with identical contents).

# Temporal Convolutional Network
TCN_CONFIG = {
    'filters': [32, 64, 128],
    'kernel_size': 3,
    'dilations': [1, 2, 4, 8, 16],
    'dropout_rate': [0.2, 0.1],
    'activation': 'gelu',
    'epsilon': 1e-6,
    'use_layer_norm': True,
    'use_weight_norm': True,
    'use_spatial_dropout': True,
    'residual_dropout': 0.1
}

# Transformer
TRANSFORMER_CONFIG = {
    'num_heads': 8,
    'key_dim': 64,
    'num_layers': 4,
    'ff_dim': 256,
    'dropout_rate': 0.1,
    'epsilon': 1e-6,
    'activation': 'gelu',
    'use_relative_pos': True,
    'max_position': 32,
    'head_size': 32,
    'use_bias': True,
    'prenorm': True
}

# WaveNet
WAVENET_CONFIG = {
    'filters': [32, 64, 128],
    'kernel_size': 3,
    'dilations': [1, 2, 4, 8, 16],
    'dropout_rate': 0.2,
    'use_gating': True,
    'use_skip_scale': True,
    'use_residual_scale': 0.1,
    'activation': 'elu'
}

# TabNet
TABNET_CONFIG = {
    'feature_dim': 128,
    'output_dim': 64,
    'num_decision_steps': 8,
    'relaxation_factor': 1.5,
    'sparsity_coefficient': 1e-4,
    'batch_momentum': 0.98,
    'virtual_batch_size': 128,
    'num_attention_heads': 4,
    'attention_dropout': 0.2,
    'feature_dropout': 0.1
}

# Attention-only model
ATTENTION_CONFIG = {
    'num_heads': 8,
    'key_dim': 64,
    'num_layers': 4,
    'ff_dim': 256,
    'dropout_rate': 0.1,
    'use_relative_attention': True,
    'max_relative_position': 32,
    'activation': 'gelu',
    'head_size': 32,
    'use_mask_future': False,
    'layer_dropout': 0.1
}

# Gated Recurrent Unit
GRU_CONFIG = {
    'hidden_units': [64, 128, 256],
    'dropout_rate': 0.3,
    'recurrent_dropout': 0.2,
    'epsilon': 1e-5,
    'attention_heads': 4
}

# Convolutional Neural Network
CNN_CONFIG = {
    'filters': [32, 64, 128, 256],
    'kernel_size': 3,
    'pool_size': 2,
    'dropout_rate': 0.2,
    'use_se_block': True,
    'se_ratio': 16,
    'use_layer_norm': True,
    'activation': 'gelu',
    'dilation_rates': [1, 2, 4]
}

# Simple Recurrent Neural Network
RNN_CONFIG = {
    'hidden_units': [64, 32],
    'dropout_rate': 0.2,
    'recurrent_dropout': 0.1,
    'bidirectional': True,
    'epsilon': 1e-6,
    'use_time_distributed': True,
    'activation': 'relu'
}

### Attention-Only Model

In [None]:
@register_keras_serializable()
class RelativePositionEncoding(tf.keras.layers.Layer):
    """
    Learned relative position embeddings.

    Holds a table of `2 * max_position - 1` embeddings of size `depth`, one
    per relative offset in [-(max_position - 1), max_position - 1].
    """
    def __init__(self, max_position: int, depth: int, **kwargs):
        super().__init__(**kwargs)
        self.max_position = max_position
        self.depth = depth

    def build(self, input_shape):
        # One row per representable relative offset
        self.rel_embeddings = self.add_weight(
            name="rel_embeddings",
            shape=[2 * self.max_position - 1, self.depth],
            initializer="glorot_uniform"
        )

    def call(self, length):
        """
        Return the embedding of every pairwise offset i - j for i, j < length.

        The result has shape (length, length, depth).
        """
        positions = tf.range(length)
        relative = positions[:, tf.newaxis] - positions[tf.newaxis, :]
        # Offsets beyond the table are clamped to the farthest embedding so
        # sequences longer than max_position never index out of range.
        relative = tf.clip_by_value(
            relative, -(self.max_position - 1), self.max_position - 1
        )
        # Shift offsets to non-negative row indices of the table
        return tf.gather(self.rel_embeddings, relative + self.max_position - 1)

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config, as SqueezeExcitationBlock does.
        config = super().get_config()
        config.update({
            "max_position": self.max_position,
            "depth": self.depth
        })
        return config

def create_attention_block(x: tf.Tensor, num_heads: int, key_dim: int, 
                         ff_dim: int, dropout_rate: float, training: bool = None) -> tf.Tensor:
    """
    Build one attention block: gated self-attention followed by a gated
    feed-forward network, each wrapped in a residual connection and
    layer normalization.

    Parameters:
    -----------
    x : tf.Tensor
        Input tensor
    num_heads : int
        Number of attention heads
    key_dim : int
        Key dimension of each attention head
    ff_dim : int
        Width of the feed-forward network
    dropout_rate : float
        Dropout rate applied after attention and after the feed-forward network
    training : bool
        Forwarded to the two Dropout layers as their `training` argument
    
    Returns:
    --------
    tf.Tensor
        Output tensor of the block
    """
    # Relative position encoding
    if ATTENTION_CONFIG['use_relative_attention']:
        # The encoding is built for the size of axis 1 of x
        pos_encoding = RelativePositionEncoding(
            ATTENTION_CONFIG['max_relative_position'],
            key_dim
        )(tf.shape(x)[1])
        
        # NOTE(review): `attention_bias` does not appear among the documented
        # call arguments of Keras MultiHeadAttention — confirm this branch
        # runs on the installed Keras version.
        attention_output = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=key_dim,
            value_dim=ATTENTION_CONFIG['head_size']
        )(x, x, attention_bias=pos_encoding)
    else:
        # Plain self-attention: x is both query and value
        attention_output = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=key_dim
        )(x, x)
    
    # Gating mechanism: a sigmoid gate computed from x scales the attention output
    gate = tf.keras.layers.Dense(attention_output.shape[-1], activation='sigmoid')(x)
    attention_output = gate * attention_output
    
    attention_output = Dropout(dropout_rate)(attention_output, training=training)
    # Residual connection followed by layer normalization
    x = LayerNormalization(epsilon=1e-6)(x + attention_output)
    
    # Enhanced feed-forward network with GLU
    ffn = Dense(ff_dim)(x)
    ffn_gate = Dense(ff_dim, activation='sigmoid')(x)
    ffn = ffn * ffn_gate
    # Project back to the width of x so the residual sum is well-formed
    ffn = Dense(x.shape[-1])(ffn)
    ffn = Dropout(dropout_rate)(ffn, training=training)
    
    return LayerNormalization(epsilon=1e-6)(x + ffn)

def create_attention_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build a dual-input model based only on attention blocks.

    The CGM sequence is projected, passed through
    `ATTENTION_CONFIG['num_layers']` attention blocks, pooled (average and
    max), concatenated with the other features and fed to a small MLP.
    The model is returned without being compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data (samples, timesteps, features)
    other_features_shape : tuple
        Shape of the other features (samples, features)

    Returns:
    --------
    Model
        Keras model taking [cgm_input, other_input] and producing one value
    """
    cgm_input = Input(shape=cgm_shape[1:])
    other_input = Input(shape=(other_features_shape[1],))

    # Initial projection to the attention width
    x = Dense(ATTENTION_CONFIG['key_dim'] * ATTENTION_CONFIG['num_heads'])(cgm_input)

    # Every block is always built. Sampling a random number here would run
    # once, while the graph is being constructed, and would permanently drop
    # blocks at random, so two calls could return different architectures.
    for _ in range(ATTENTION_CONFIG['num_layers']):
        x = create_attention_block(
            x,
            ATTENTION_CONFIG['num_heads'],
            ATTENTION_CONFIG['key_dim'],
            ATTENTION_CONFIG['ff_dim'],
            ATTENTION_CONFIG['dropout_rate']
        )

    # Global context: average and max pooling over the time axis
    attention_pooled = GlobalAveragePooling1D()(x)
    max_pooled = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = Concatenate()([attention_pooled, max_pooled])

    # Combine with the other features
    x = Concatenate()([x, other_input])

    # Final MLP; the residual is added only when the widths match
    skip = x
    x = Dense(128, activation=ATTENTION_CONFIG['activation'])(x)
    x = LayerNormalization(epsilon=1e-6)(x)
    x = Dropout(ATTENTION_CONFIG['dropout_rate'])(x)
    x = Dense(128, activation=ATTENTION_CONFIG['activation'])(x)
    if skip.shape[-1] == 128:
        x = Add()([x, skip])

    output = Dense(1)(x)

    return Model(inputs=[cgm_input, other_input], outputs=output)

### Convolutional Neural Network (CNN)

In [None]:
@register_keras_serializable()
class SqueezeExcitationBlock(tf.keras.layers.Layer):
    """
    Squeeze-and-Excitation block as a custom layer.

    Averages the input over axis 1, passes the result through a two-layer
    bottleneck ending in a sigmoid, and multiplies the input by the
    resulting per-channel weights.
    """
    def __init__(self, filters: int, se_ratio: int = 16, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.se_ratio = se_ratio

        # Define layers
        self.gap = GlobalAveragePooling1D()
        # Keep at least one bottleneck unit when filters < se_ratio;
        # a zero-width Dense layer is not valid.
        self.dense1 = Dense(max(1, filters // se_ratio), activation='gelu')
        self.dense2 = Dense(filters, activation='sigmoid')

    def call(self, inputs):
        # Squeeze: average over the time axis
        x = self.gap(inputs)

        # Excitation: bottleneck then per-channel sigmoid weights
        x = self.dense1(x)
        x = self.dense2(x)

        # Reshape for broadcasting over the time axis
        x = tf.expand_dims(x, axis=1)

        # Scale
        return inputs * x

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        config = super().get_config()
        config.update({
            "filters": self.filters,
            "se_ratio": self.se_ratio
        })
        return config

# Update the residual block to use the new layer
def create_residual_block(x, filters, dilation_rate=1):
    """
    Build a residual block: dilated convolution, normalization, activation
    and an optional Squeeze-and-Excitation step, added to a skip path.

    Parameters:
    -----------
    x : tensor
        Input tensor
    filters : int
        Number of convolution filters
    dilation_rate : int
        Dilation rate of the convolution

    Returns:
    --------
    tensor
        Sum of the convolution path and the (possibly projected) input
    """
    skip = x

    # Convolution path
    x = Conv1D(
        filters=filters,
        kernel_size=CNN_CONFIG['kernel_size'],
        padding='same',
        dilation_rate=dilation_rate
    )(x)
    # Honor the same normalization switch that create_cnn_model applies
    x = LayerNormalization()(x) if CNN_CONFIG['use_layer_norm'] else BatchNormalization()(x)
    x = Activation(CNN_CONFIG['activation'])(x)

    # Squeeze-and-Excitation
    if CNN_CONFIG['use_se_block']:
        x = SqueezeExcitationBlock(filters, CNN_CONFIG['se_ratio'])(x)

    # Project the skip path with a 1x1 convolution when channel counts differ
    if skip.shape[-1] != filters:
        skip = Conv1D(filters, 1, padding='same')(skip)

    return Add()([x, skip])

def create_cnn_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build a dual-input residual CNN for CGM sequences plus other features.

    The model is returned without being compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data (samples, timesteps, features)
    other_features_shape : tuple
        Shape of the other features (samples, features)

    Returns:
    --------
    Model
        Keras model taking [cgm_input, other_input] and producing one value
    """
    activation = CNN_CONFIG['activation']
    dropout_rate = CNN_CONFIG['dropout_rate']

    def _normalize(tensor):
        # Layer norm or batch norm, as selected by the configuration
        if CNN_CONFIG['use_layer_norm']:
            return LayerNormalization()(tensor)
        return BatchNormalization()(tensor)

    # CGM branch: 1x1 projection to the first filter count
    cgm_input = Input(shape=cgm_shape[1:], name='cgm_input')
    features = Conv1D(CNN_CONFIG['filters'][0], 1, padding='same')(cgm_input)
    features = _normalize(features)

    # One stage per filter count: a residual block per dilation rate, then pooling
    for n_filters in CNN_CONFIG['filters']:
        for rate in CNN_CONFIG['dilation_rates']:
            features = create_residual_block(features, n_filters, rate)
        features = MaxPooling1D(pool_size=CNN_CONFIG['pool_size'])(features)

    # Summarize the sequence with both average and max pooling
    pooled_avg = GlobalAveragePooling1D()(features)
    pooled_max = tf.keras.layers.GlobalMaxPooling1D()(features)
    sequence_summary = Concatenate()([pooled_avg, pooled_max])

    # Other-features branch, merged with the sequence summary
    other_input = Input(shape=(other_features_shape[1],), name='other_input')
    merged = Concatenate()([sequence_summary, other_input])

    # Dense head; the residual is added only when the widths match
    hidden = Dense(256, activation=activation)(merged)
    hidden = _normalize(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    hidden = Dense(256, activation=activation)(hidden)
    if merged.shape[-1] == 256:
        hidden = Add()([hidden, merged])

    # Final layers, with half the dropout rate
    hidden = Dense(128, activation=activation)(hidden)
    hidden = _normalize(hidden)
    hidden = Dropout(dropout_rate / 2)(hidden)

    prediction = Dense(1)(hidden)
    return Model(inputs=[cgm_input, other_input], outputs=prediction)

### Gated Recurrent Unit (GRU)

In [None]:
def create_gru_attention_block(x, units, num_heads=4):
    """
    Build a GRU layer followed by self-attention, each with a residual path.

    Parameters:
    -----------
    x : tensor
        Input tensor
    units : int
        Number of GRU units
    num_heads : int
        Number of attention heads

    Returns:
    --------
    tensor
        Output of the normalized attention stage
    """
    norm_epsilon = GRU_CONFIG['epsilon']
    block_input = x

    # Recurrent stage (returns the full sequence), then layer normalization
    recurrent = GRU(
        units,
        return_sequences=True,
        dropout=GRU_CONFIG['dropout_rate'],
        recurrent_dropout=GRU_CONFIG['recurrent_dropout']
    )
    sequence = recurrent(block_input)
    sequence = LayerNormalization(epsilon=norm_epsilon)(sequence)
    # The skip connection is only possible when the widths already agree
    if block_input.shape[-1] == units:
        sequence = Add()([sequence, block_input])

    # Self-attention stage with a residual sum and layer normalization
    self_attention = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=units // num_heads
    )
    attended = self_attention(sequence, sequence)
    return LayerNormalization(epsilon=norm_epsilon)(attended + sequence)

def create_gru_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build a dual-input GRU model with self-attention and residual connections.

    The model is returned without being compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data (samples, timesteps, features)
    other_features_shape : tuple
        Shape of the other features (samples, features)

    Returns:
    --------
    Model
        Keras model taking [cgm_input, other_input] and producing one value
    """
    hidden_sizes = GRU_CONFIG['hidden_units']
    norm_epsilon = GRU_CONFIG['epsilon']

    # Inputs
    cgm_input = Input(shape=cgm_shape[1:])
    other_input = Input(shape=(other_features_shape[1],))

    # Project the sequence to the first hidden size and normalize it
    sequence = Dense(hidden_sizes[0])(cgm_input)
    sequence = LayerNormalization(epsilon=norm_epsilon)(sequence)

    # One GRU + attention block per configured hidden size
    for size in hidden_sizes:
        sequence = create_gru_attention_block(sequence, size)

    # Average over time, then merge with the other features
    pooled = GlobalAveragePooling1D()(sequence)
    head = Concatenate()([pooled, other_input])

    # Dense head; each stage adds a residual only when the widths match
    for width in (128, 64):
        branch = Dense(width, activation='relu')(head)
        branch = LayerNormalization(epsilon=norm_epsilon)(branch)
        branch = Dropout(GRU_CONFIG['dropout_rate'])(branch)
        if head.shape[-1] == width:
            head = Add()([branch, head])
        else:
            head = branch

    prediction = Dense(1)(head)
    return Model(inputs=[cgm_input, other_input], outputs=prediction)

### Recurrent Neural Network (RNN)

In [None]:
def create_rnn_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build a dual-input SimpleRNN model with optional time-distributed
    preprocessing and optional bidirectional layers.

    The model is returned without being compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data (samples, timesteps, features)
    other_features_shape : tuple
        Shape of the other features (samples, features)

    Returns:
    --------
    Model
        Keras model taking [cgm_input, other_input] and producing one value
    """
    activation = RNN_CONFIG['activation']
    norm_epsilon = RNN_CONFIG['epsilon']
    hidden_sizes = RNN_CONFIG['hidden_units']

    def _simple_rnn(units, return_sequences):
        # Unrolled SimpleRNN sharing the configured activation and dropouts
        return SimpleRNN(
            units,
            activation=activation,
            dropout=RNN_CONFIG['dropout_rate'],
            recurrent_dropout=RNN_CONFIG['recurrent_dropout'],
            return_sequences=return_sequences,
            unroll=True
        )

    def _run(layer, tensor):
        # Wrap the layer in Bidirectional when the configuration asks for it
        if RNN_CONFIG['bidirectional']:
            return Bidirectional(layer)(tensor)
        return layer(tensor)

    # Inputs
    cgm_input = Input(shape=cgm_shape[1:])
    other_input = Input(shape=(other_features_shape[1],))

    # Optional per-timestep dense projection and batch normalization
    sequence = cgm_input
    if RNN_CONFIG['use_time_distributed']:
        sequence = TimeDistributed(Dense(32, activation=activation))(sequence)
        sequence = TimeDistributed(BatchNormalization(epsilon=norm_epsilon))(sequence)

    # Shorten the sequence before the recurrent layers
    sequence = MaxPooling1D(pool_size=2)(sequence)

    # Sequence-returning recurrent layers, each followed by batch normalization
    for units in hidden_sizes:
        sequence = _run(_simple_rnn(units, True), sequence)
        sequence = BatchNormalization(epsilon=norm_epsilon, momentum=0.9)(sequence)

    # Last recurrent layer returns only its final state
    summary = _run(_simple_rnn(hidden_sizes[-1], False), sequence)

    # Merge with the other features and apply the dense head
    merged = Concatenate()([summary, other_input])
    hidden = Dense(32, activation=activation)(merged)
    hidden = BatchNormalization(epsilon=norm_epsilon)(hidden)
    hidden = Dropout(RNN_CONFIG['dropout_rate'])(hidden)

    prediction = Dense(1)(hidden)
    return Model(inputs=[cgm_input, other_input], outputs=prediction)

### TabNet

In [None]:
@register_keras_serializable()
class GLU(tf.keras.layers.Layer):
    """
    Gated Linear Unit as a custom layer.

    Projects the input to `2 * units` features and multiplies the first
    half by the sigmoid of the second half.
    """
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.dense = Dense(units * 2)

    def call(self, inputs):
        projected = self.dense(inputs)
        # Split along the last axis so inputs of any rank are gated on
        # their feature dimension (same result as before for 2-D inputs).
        values = projected[..., :self.units]
        gates = projected[..., self.units:]
        return values * tf.nn.sigmoid(gates)

    def get_config(self):
        # Expose the constructor argument so the layer can be re-created
        # from its config, as SqueezeExcitationBlock does.
        config = super().get_config()
        config.update({"units": self.units})
        return config

@register_keras_serializable()
class MultiHeadFeatureAttention(tf.keras.layers.Layer):
    """
    Multi-head self-attention over features, with a residual connection
    and layer normalization.
    """
    def __init__(self, num_heads: int, key_dim: int, dropout: float = 0.0, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.key_dim = key_dim
        # Kept so get_config can report it
        self.dropout_rate = dropout
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=key_dim,
            dropout=dropout
        )
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None):
        # Self-attention: the input is both query and value
        attention_output = self.attention(inputs, inputs, training=training)
        return self.layernorm(inputs + attention_output)

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config, as SqueezeExcitationBlock does.
        config = super().get_config()
        config.update({
            "num_heads": self.num_heads,
            "key_dim": self.key_dim,
            "dropout": self.dropout_rate
        })
        return config

@register_keras_serializable()
class EnhancedFeatureTransformer(tf.keras.layers.Layer):
    """
    Feature transformer: GLU -> batch norm -> attention -> GLU -> batch norm
    -> dropout.

    NOTE(review): the batch normalization layers are created with
    `virtual_batch_size`; confirm the installed Keras version accepts
    that argument.
    """
    def __init__(self, feature_dim: int, num_heads: int, 
                 virtual_batch_size: int, dropout_rate: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.feature_dim = feature_dim
        self.virtual_batch_size = virtual_batch_size
        # Kept so get_config can report them
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate

        # GLU layers
        self.glu1 = GLU(feature_dim)
        self.glu2 = GLU(feature_dim)

        # Attention layer; the feature width is split evenly across heads
        self.attention = MultiHeadFeatureAttention(
            num_heads=num_heads,
            key_dim=feature_dim // num_heads,
            dropout=dropout_rate
        )

        # Ghost Batch Normalization
        self.ghost_bn1 = tf.keras.layers.BatchNormalization(
            virtual_batch_size=virtual_batch_size
        )
        self.ghost_bn2 = tf.keras.layers.BatchNormalization(
            virtual_batch_size=virtual_batch_size
        )

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, training=None):
        x = self.glu1(inputs)
        x = self.ghost_bn1(x, training=training)
        x = self.attention(x, training=training)
        x = self.glu2(x)
        x = self.ghost_bn2(x, training=training)
        return self.dropout(x, training=training)

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config, as SqueezeExcitationBlock does.
        config = super().get_config()
        config.update({
            "feature_dim": self.feature_dim,
            "num_heads": self.num_heads,
            "virtual_batch_size": self.virtual_batch_size,
            "dropout_rate": self.dropout_rate
        })
        return config

def custom_softmax(x: tf.Tensor, axis: int=-1) -> tf.Tensor:
    """
    Softmax that subtracts the per-axis maximum before exponentiating.

    Parameters:
    -----------
    x : tf.Tensor
        Input tensor
    axis : int
        Axis along which to normalize

    Returns:
    --------
    tf.Tensor
        Exponentials divided by their sum along `axis`
    """
    # Subtracting the maximum keeps the exponentials from overflowing
    shifted = x - tf.reduce_max(x, axis=axis, keepdims=True)
    weights = tf.exp(shifted)
    normalizer = tf.reduce_sum(weights, axis=axis, keepdims=True)
    return weights / normalizer

def glu(x: tf.Tensor, n_units: int) -> tf.Tensor:
    """
    Gated Linear Unit over the second axis of `x`.

    Parameters:
    -----------
    x : tf.Tensor
        Input tensor; its second axis is split at `n_units`
    n_units : int
        Number of columns kept as values; the rest act as gates

    Returns:
    --------
    tf.Tensor
        The first `n_units` columns multiplied by the sigmoid of the rest
    """
    linear_part = x[:, :n_units]
    gate_part = x[:, n_units:]
    return linear_part * tf.nn.sigmoid(gate_part)

def feature_transformer(x: tf.Tensor, feature_dim: int, batch_momentum: float=0.98) -> tf.Tensor:
    """
    Feature transformer: dense projection, GLU gating, batch normalization.

    Parameters:
    -----------
    x : tf.Tensor
        Input tensor
    feature_dim : int
        Number of output features
    batch_momentum : float
        Momentum of the batch normalization layer

    Returns:
    --------
    tf.Tensor
        Transformed tensor
    """
    # Project to twice the width so the GLU can split values from gates
    doubled = Dense(feature_dim * 2)(x)
    gated = glu(doubled, feature_dim)
    normalizer = BatchNormalization(momentum=batch_momentum)
    return normalizer(gated)

@register_keras_serializable()
class TabNetModel(tf.keras.Model):
    """
    Custom TabNet-style model with an entropy (sparsity) penalty on its feature masks.

    Every layer that owns weights is created once in ``__init__``. Creating
    ``Dense`` layers inside ``call`` would allocate fresh, untracked weights on
    each forward pass, so the mask projections and the step-attention layer
    are attributes of the model.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM array including the leading sample axis; only
        ``cgm_shape[1:]`` is used
    other_features_shape : tuple
        Shape of the other-features array; only index 1 is used
    """
    def __init__(self, cgm_shape, other_features_shape, **kwargs):
        super().__init__(**kwargs)
        self.cgm_shape = cgm_shape
        self.other_shape = other_features_shape
        self.entropy_tracker = tf.keras.metrics.Mean(name='entropy_loss')

        num_steps = TABNET_CONFIG['num_decision_steps']
        # Width of the flattened CGM input concatenated with the other features;
        # each feature mask must have exactly this many entries.
        num_features = int(np.prod(cgm_shape[1:])) + int(other_features_shape[1])

        # Input processing
        self.flatten = tf.keras.layers.Flatten()
        self.concat = Concatenate()
        self.feature_dropout = tf.keras.layers.Dropout(TABNET_CONFIG['feature_dropout'])

        # One transformer and one mask projection per decision step
        self.transformers = [
            EnhancedFeatureTransformer(
                feature_dim=TABNET_CONFIG['feature_dim'],
                num_heads=TABNET_CONFIG['num_attention_heads'],
                virtual_batch_size=TABNET_CONFIG['virtual_batch_size'],
                dropout_rate=TABNET_CONFIG['attention_dropout']
            ) for _ in range(num_steps)
        ]
        self.mask_layers = [Dense(num_features) for _ in range(num_steps)]

        # Softmax weights used to combine the per-step outputs
        self.step_attention = Dense(num_steps, activation='softmax')

        # Final layers
        self.final_dense1 = Dense(TABNET_CONFIG['output_dim'], activation='selu')
        self.final_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.final_dropout = tf.keras.layers.Dropout(TABNET_CONFIG['attention_dropout'])
        self.final_dense2 = Dense(TABNET_CONFIG['output_dim'] // 2, activation='selu')
        self.final_norm2 = tf.keras.layers.LayerNormalization()
        self.final_dense3 = Dense(TABNET_CONFIG['output_dim'], activation='selu')
        self.residual_add = tf.keras.layers.Add()
        self.output_layer = Dense(1)

    def call(self, inputs, training=None):
        """
        Forward pass.

        Parameters:
        -----------
        inputs : sequence of two tensors
            ``(cgm_input, other_input)``
        training : bool or None
            When truthy, input features are randomly masked and the scaled
            entropy of the feature masks is added as a model loss

        Returns:
        --------
        tf.Tensor
            Output of the final single-unit Dense layer
        """
        cgm_input, other_input = inputs

        # Initial processing
        x = self.flatten(cgm_input)
        x = self.concat([x, other_input])

        # Feature masking: dropout applied to a tensor of ones gives the mask
        if training:
            feature_mask = self.feature_dropout(tf.ones_like(x), training=training)
            x = tf.multiply(x, feature_mask)

        # Decision steps (every step sees the same ``x``)
        step_outputs = []
        entropy_loss = 0.0

        for transformer, mask_layer in zip(self.transformers, self.mask_layers):
            step_output = transformer(x, training=training)

            # Feature selection
            mask = custom_softmax(mask_layer(step_output))
            step_outputs.append(tf.multiply(x, mask))

            if training:
                # Entropy of the mask; the epsilon avoids log(0)
                entropy = tf.reduce_mean(tf.reduce_sum(
                    -mask * tf.math.log(mask + 1e-15), axis=1
                ))
                entropy_loss += entropy

        # Combine the step outputs with learned attention weights
        combined = tf.stack(step_outputs, axis=1)
        attention_weights = self.step_attention(tf.reduce_mean(combined, axis=2))
        x = tf.reduce_sum(
            combined * tf.expand_dims(attention_weights, -1),
            axis=1
        )

        # Update the entropy metric and register the penalty
        if training:
            entropy_loss *= TABNET_CONFIG['sparsity_coefficient']
            self.entropy_tracker.update_state(entropy_loss)
            self.add_loss(entropy_loss)

        # Final layers with a residual connection
        x = self.final_dense1(x)
        x = self.final_norm1(x)
        x = self.final_dropout(x, training=training)

        skip = x
        x = self.final_dense2(x)
        x = self.final_norm2(x)
        x = self.final_dense3(x)
        x = self.residual_add([x, skip])

        return self.output_layer(x)

    def get_config(self):
        """Return the constructor arguments so the model can be re-created when loaded."""
        config = super().get_config()
        config.update({
            'cgm_shape': self.cgm_shape,
            'other_features_shape': self.other_shape,
        })
        return config

def create_tabnet_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Create a TabNetModel and build it by calling it once on symbolic inputs.

    The returned model is not compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data; index 0 is dropped when creating the input
    other_features_shape : tuple
        Shape of the other features; index 1 is used as the feature count

    Returns:
    --------
    Model
        Built TabNet model
    """
    tabnet = TabNetModel(cgm_shape, other_features_shape)

    # A single call on symbolic inputs creates the weights
    symbolic_cgm = tf.keras.layers.Input(shape=cgm_shape[1:])
    symbolic_other = tf.keras.layers.Input(shape=(other_features_shape[1],))
    tabnet([symbolic_cgm, symbolic_other])

    return tabnet

### Temporal Convolutional Network (TCN)

In [None]:
@register_keras_serializable()
class WeightNormalization(tf.keras.layers.Wrapper):
    """
    Weight normalisation for convolutional layers.

    The wrapped layer's kernel is re-parameterised on every call as
    ``g * v / ||v||``, where ``v`` is the raw kernel variable and ``g`` is a
    learned scale with one entry per output channel.
    """
    def __init__(self, layer, **kwargs):
        super().__init__(layer, **kwargs)
        self.layer = layer

    def build(self, input_shape):
        """Build the wrapped layer and create the per-channel scale ``g``."""
        if not self.layer.built:
            self.layer.build(input_shape)
        kernel = self.layer.kernel
        # Keep our own handle to the raw kernel variable: ``call`` overwrites
        # ``self.layer.kernel`` with a tensor, so looking the variable up
        # through the wrapped layer afterwards is not reliable.
        self.v = kernel
        # Normalise over every axis except the last (output-channel) one
        self.norm_axes = list(range(len(kernel.shape) - 1))
        self.g = self.add_weight(
            name='g',
            shape=(int(kernel.shape[-1]),),
            initializer='ones',
            trainable=True
        )

    def call(self, inputs):
        """Apply the wrapped layer using the normalised kernel."""
        # ``tf.reduce_sum`` is the reduction op; there is no ``tf.sum``
        norm = tf.sqrt(tf.reduce_sum(tf.square(self.v), axis=self.norm_axes))
        # NOTE(review): relies on the wrapped layer exposing an assignable
        # ``kernel`` attribute; confirm for the Keras version in use.
        self.layer.kernel = self.v * (self.g / norm)
        return self.layer.call(inputs)

@register_keras_serializable()
class CausalPadding(tf.keras.layers.Layer):
    """
    Custom layer that left-pads axis 1 of a rank-3 tensor with zeros.

    Parameters:
    -----------
    padding_size : int
        Number of zero steps inserted before the first step of axis 1
    """
    def __init__(self, padding_size, **kwargs):
        super().__init__(**kwargs)
        self.padding_size = padding_size

    def call(self, inputs):
        # Pad only at the start of axis 1; axes 0 and 2 are untouched
        return tf.pad(inputs, [[0, 0], [self.padding_size, 0], [0, 0]])

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1] + self.padding_size, input_shape[2])

    def get_config(self):
        """Include ``padding_size`` so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({'padding_size': self.padding_size})
        return config

def create_tcn_block(input_layer: tf.Tensor, filters: int, kernel_size: int, 
                    dilation_rate: int, dropout_rate: float) -> tf.Tensor:
    """
    Build one TCN (Temporal Convolutional Network) block.

    The block is: causal padding -> dilated convolution with ``2 * filters``
    channels -> GLU gating -> normalisation -> dropout, plus a residual
    connection when the input already has ``filters`` channels. Optional parts
    are selected through ``TCN_CONFIG``.

    Parameters:
    -----------
    input_layer : tf.Tensor
        Input tensor
    filters : int
        Number of output channels after gating
    kernel_size : int
        Convolution kernel size
    dilation_rate : int
        Convolution dilation rate
    dropout_rate : float
        Dropout rate applied after normalisation

    Returns:
    --------
    tf.Tensor
        Output of the TCN block
    """
    # Left padding that keeps the 'valid' convolution causal
    left_pad = dilation_rate * (kernel_size - 1)
    padded_input = CausalPadding(left_pad)(input_layer)

    # Convolution, optionally weight-normalised; doubled channels feed the gate
    conv_layer = Conv1D(
        filters=filters * 2,
        kernel_size=kernel_size,
        dilation_rate=dilation_rate,
        padding='valid',
        activation=None
    )
    if TCN_CONFIG['use_weight_norm']:
        conv_layer = WeightNormalization(conv_layer)
    doubled = conv_layer(padded_input)

    # GLU: the first half gates the second half
    gate_half, linear_half = tf.split(doubled, 2, axis=-1)
    block_out = linear_half * tf.nn.sigmoid(gate_half)

    # Normalisation
    if TCN_CONFIG['use_layer_norm']:
        normalizer = LayerNormalization(epsilon=TCN_CONFIG['epsilon'])
    else:
        normalizer = BatchNormalization()
    block_out = normalizer(block_out)

    # Dropout (spatial or element-wise)
    if TCN_CONFIG['use_spatial_dropout']:
        regularizer = tf.keras.layers.SpatialDropout1D(dropout_rate)
    else:
        regularizer = Dropout(dropout_rate)
    block_out = regularizer(block_out)

    # No residual connection when the channel counts differ
    if input_layer.shape[-1] != filters:
        return block_out

    shortcut = input_layer[:, -block_out.shape[1]:, :]
    if TCN_CONFIG['residual_dropout'] > 0:
        shortcut = Dropout(TCN_CONFIG['residual_dropout'])(shortcut)
    return Add()([block_out, shortcut])

def create_tcn_model(input_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build the full TCN model (two inputs, one scalar output).

    The returned model is not compiled.

    Parameters:
    -----------
    input_shape : tuple
        Shape of the CGM data; index 0 is dropped when creating the input
    other_features_shape : tuple
        Shape of the other features; index 1 is used as the feature count

    Returns:
    --------
    Model
        TCN model
    """
    cgm_input = Input(shape=input_shape[1:], name='cgm_input')
    other_input = Input(shape=(other_features_shape[1],), name='other_input')

    # Initial 1x1 projection
    x = Conv1D(TCN_CONFIG['filters'][0], 1, padding='same')(cgm_input)

    # Stack of TCN blocks; every block output is also kept as a skip connection
    skip_connections = []
    for filters in TCN_CONFIG['filters']:
        for dilation_rate in TCN_CONFIG['dilations']:
            x = create_tcn_block(
                x,
                filters=filters,
                kernel_size=TCN_CONFIG['kernel_size'],
                dilation_rate=dilation_rate,
                dropout_rate=TCN_CONFIG['dropout_rate'][0]
            )
            skip_connections.append(x)

    # Sum the skip connections (cropped to a common length) and rescale
    if skip_connections:
        target_len = skip_connections[-1].shape[1]
        cropped = [branch[:, -target_len:, :] for branch in skip_connections]
        x = Add()(cropped)
        x = x / tf.sqrt(float(len(skip_connections)))

    # Global average and max pooling, concatenated
    pooled_mean = GlobalAveragePooling1D()(x)
    pooled_max = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Concatenate()([pooled_mean, pooled_max])

    # Merge with the other features
    x = tf.keras.layers.Concatenate()([x, other_input])

    # Final MLP; the residual is added only when the merged width is 128
    mlp_input = x
    x = Dense(128, activation=TCN_CONFIG['activation'])(x)
    x = LayerNormalization(epsilon=TCN_CONFIG['epsilon'])(x)
    x = Dropout(TCN_CONFIG['dropout_rate'][0])(x)
    x = Dense(128, activation=TCN_CONFIG['activation'])(x)
    if mlp_input.shape[-1] == 128:
        x = Add()([x, mlp_input])

    x = Dense(64, activation=TCN_CONFIG['activation'])(x)
    x = LayerNormalization(epsilon=TCN_CONFIG['epsilon'])(x)
    x = Dropout(TCN_CONFIG['dropout_rate'][1])(x)

    prediction = Dense(1)(x)
    return Model(inputs=[cgm_input, other_input], outputs=prediction)

### Transformer

In [None]:
@register_keras_serializable()
class PositionEncoding(tf.keras.layers.Layer):
    """
    Sinusoidal positional encoding for the Transformer.

    Parameters:
    -----------
    max_position : int
        Number of positions for which an encoding is pre-computed
    d_model : int
        Width of the encoding; must be even because sine and cosine columns
        are interleaved in pairs
    """
    def __init__(self, max_position: int, d_model: int, **kwargs):
        super().__init__(**kwargs)
        self.max_position = max_position
        self.d_model = d_model

    def build(self, input_shape):
        """Pre-compute the ``(max_position, d_model)`` encoding table."""
        if self.d_model % 2 != 0:
            # The sine/cosine halves below must have the same width to be stacked
            raise ValueError(f"d_model must be even, got {self.d_model}")

        positions = tf.range(self.max_position, dtype=tf.float32)[:, tf.newaxis]
        dimensions = tf.range(self.d_model, dtype=tf.float32)[tf.newaxis, :]
        angle_rates = 1 / tf.pow(10000.0, (2 * (dimensions // 2)) / tf.cast(self.d_model, tf.float32))
        angle_rads = positions * angle_rates

        # Sine on even columns, cosine on odd columns; stacking on a new last
        # axis and reshaping interleaves them back into column order.
        pos_encoding = tf.stack([
            tf.sin(angle_rads[:, 0::2]),
            tf.cos(angle_rads[:, 1::2])
        ], axis=-1)

        self.pos_encoding = tf.reshape(pos_encoding, [self.max_position, self.d_model])

    def call(self, inputs):
        """Add the encoding of the first ``sequence_length`` positions to ``inputs``."""
        sequence_length = tf.shape(inputs)[1]
        encoding = self.pos_encoding[:sequence_length, :]
        # The table is float32; match the input dtype before adding
        return inputs + tf.cast(encoding, inputs.dtype)

    def get_config(self):
        """Include the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({
            'max_position': self.max_position,
            'd_model': self.d_model,
        })
        return config

def create_transformer_block(inputs, head_size, num_heads, ff_dim, dropout_rate, prenorm=True):
    """
    Build one Transformer block with pre- or post-normalisation.

    Parameters:
    -----------
    inputs : tf.Tensor
        Input tensor
    head_size : int
        Key and value dimension of each attention head
    num_heads : int
        Number of attention heads
    ff_dim : int
        Hidden width of the feed-forward network
    dropout_rate : float
        Dropout rate used inside attention and after every sub-layer
    prenorm : bool
        If True, normalise before each sub-layer; otherwise after each residual sum

    Returns:
    --------
    tf.Tensor
        Processed tensor with the same last dimension as ``inputs``
    """
    model_width = inputs.shape[-1]

    def self_attention(tensor):
        # Multi-head self-attention followed by dropout
        attention_layer = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=head_size,
            value_dim=head_size,
            use_bias=TRANSFORMER_CONFIG['use_bias'],
            dropout=dropout_rate
        )
        attended = attention_layer(tensor, tensor)
        return Dropout(dropout_rate)(attended)

    def feed_forward(tensor):
        # Two-layer feed-forward network projecting back to the input width
        hidden = Dense(ff_dim, activation=TRANSFORMER_CONFIG['activation'])(tensor)
        hidden = Dropout(dropout_rate)(hidden)
        hidden = Dense(model_width)(hidden)
        return Dropout(dropout_rate)(hidden)

    if not prenorm:
        # Post-normalisation (original Transformer layout)
        attn = self_attention(inputs)
        res1 = LayerNormalization(epsilon=TRANSFORMER_CONFIG['epsilon'])(inputs + attn)
        ffn = feed_forward(res1)
        return LayerNormalization(epsilon=TRANSFORMER_CONFIG['epsilon'])(res1 + ffn)

    # Pre-normalisation
    normed = LayerNormalization(epsilon=TRANSFORMER_CONFIG['epsilon'])(inputs)
    attn = self_attention(normed)
    res1 = Add()([inputs, attn])

    normed = LayerNormalization(epsilon=TRANSFORMER_CONFIG['epsilon'])(res1)
    ffn = feed_forward(normed)
    return Add()([res1, ffn])

def create_transformer_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build a dual-input Transformer model for CGM data and other features.

    The returned model is not compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data (samples, timesteps, features)
    other_features_shape : tuple
        Shape of the other features (samples, features)

    Returns:
    --------
    Model
        Transformer model
    """
    cgm_input = Input(shape=cgm_shape[1:], name='cgm_input')
    other_input = Input(shape=(other_features_shape[1],), name='other_input')

    # Initial projection to key_dim * num_heads channels, then optional positional encoding
    x = Dense(TRANSFORMER_CONFIG['key_dim'] * TRANSFORMER_CONFIG['num_heads'])(cgm_input)
    if TRANSFORMER_CONFIG['use_relative_pos']:
        position_layer = PositionEncoding(
            TRANSFORMER_CONFIG['max_position'],
            TRANSFORMER_CONFIG['key_dim'] * TRANSFORMER_CONFIG['num_heads']
        )
        x = position_layer(x)

    # Transformer blocks
    for _ in range(TRANSFORMER_CONFIG['num_layers']):
        x = create_transformer_block(
            x,
            head_size=TRANSFORMER_CONFIG['head_size'],
            num_heads=TRANSFORMER_CONFIG['num_heads'],
            ff_dim=TRANSFORMER_CONFIG['ff_dim'],
            dropout_rate=TRANSFORMER_CONFIG['dropout_rate'],
            prenorm=TRANSFORMER_CONFIG['prenorm']
        )

    # Global average and max pooling, concatenated
    pooled_mean = GlobalAveragePooling1D()(x)
    pooled_max = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = Concatenate()([pooled_mean, pooled_max])

    # Merge with the other features
    x = Concatenate()([x, other_input])

    # Final MLP; the residual is added only when the merged width is 128
    mlp_input = x
    x = Dense(128, activation=TRANSFORMER_CONFIG['activation'])(x)
    x = LayerNormalization(epsilon=TRANSFORMER_CONFIG['epsilon'])(x)
    x = Dropout(TRANSFORMER_CONFIG['dropout_rate'])(x)
    x = Dense(128, activation=TRANSFORMER_CONFIG['activation'])(x)
    if mlp_input.shape[-1] == 128:
        x = Add()([x, mlp_input])

    x = Dense(64, activation=TRANSFORMER_CONFIG['activation'])(x)
    x = LayerNormalization(epsilon=TRANSFORMER_CONFIG['epsilon'])(x)
    x = Dropout(TRANSFORMER_CONFIG['dropout_rate'])(x)

    prediction = Dense(1)(x)
    return Model(inputs=[cgm_input, other_input], outputs=prediction)

### WaveNet

In [None]:
@register_keras_serializable()
class WaveNetBlock(tf.keras.layers.Layer):
    """
    WaveNet block with a gated activation and scaled residual/skip outputs.

    Parameters:
    -----------
    filters : int
        Number of channels of every convolution in the block
    kernel_size : int
        Kernel size of the two gated (causal) convolutions
    dilation_rate : int
        Dilation rate of the two gated convolutions
    dropout_rate : float
        Dropout rate applied to the gated activation
    """
    def __init__(self, filters, kernel_size, dilation_rate, dropout_rate, **kwargs):
        super().__init__(**kwargs)
        # Keep every constructor argument so get_config can reproduce the layer
        self.filters = filters
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.dropout_rate = dropout_rate

        # Gated convolutions
        self.filter_conv = Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            padding='causal'
        )
        self.gate_conv = Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            padding='causal'
        )

        # Normalization and regularization
        self.filter_norm = BatchNormalization()
        self.gate_norm = BatchNormalization()
        self.dropout = Dropout(dropout_rate)

        # 1x1 projections for the residual and skip paths
        self.residual_proj = Conv1D(filters, 1, padding='same')
        self.skip_proj = Conv1D(filters, 1, padding='same')

        # Scaling factors
        # NOTE(review): 'use_residual_scale' is used as a numeric multiplier in
        # call() (and passed to sqrt); confirm the config holds a number, not a flag.
        self.residual_scale = WAVENET_CONFIG['use_residual_scale']
        self.use_skip_scale = WAVENET_CONFIG['use_skip_scale']

    def call(self, inputs, training=None):
        """
        Forward pass.

        Returns:
        --------
        tuple
            ``(residual_out, skip_out)``: the scaled gated activation plus the
            projected input, and the projected gated activation
        """
        # Gated activation
        filter_out = self.filter_conv(inputs)
        gate_out = self.gate_conv(inputs)

        filter_out = self.filter_norm(filter_out, training=training)
        gate_out = self.gate_norm(gate_out, training=training)

        # tanh(filter) * sigmoid(gate)
        gated_out = tf.nn.tanh(filter_out) * tf.nn.sigmoid(gate_out)
        gated_out = self.dropout(gated_out, training=training)

        # Residual connection: project the input and keep its last steps
        residual = self.residual_proj(inputs)
        residual = residual[:, -gated_out.shape[1]:, :]
        residual_out = (gated_out * self.residual_scale) + residual

        # Skip connection
        skip_out = self.skip_proj(gated_out)
        if self.use_skip_scale:
            skip_out = skip_out * tf.math.sqrt(self.residual_scale)

        return residual_out, skip_out

    def get_config(self):
        """Include the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({
            'filters': self.filters,
            'kernel_size': self.kernel_size,
            'dilation_rate': self.dilation_rate,
            'dropout_rate': self.dropout_rate,
        })
        return config

def create_wavenet_block(x, filters, kernel_size, dilation_rate, dropout_rate):
    """
    Build a functional WaveNet block with a residual and a skip output.

    Parameters:
    -----------
    x : tf.Tensor
        Input tensor
    filters : int
        Number of convolution filters
    kernel_size : int
        Convolution kernel size
    dilation_rate : int
        Convolution dilation rate
    dropout_rate : float
        Dropout rate

    Returns:
    --------
    tuple
        ``(res, conv)``: the residual sum and the convolutional branch output
    """
    # Convolutional branch: dilated causal conv -> batch norm -> ReLU -> dropout
    branch = x
    branch_layers = (
        Conv1D(filters=filters, kernel_size=kernel_size,
               dilation_rate=dilation_rate, padding='causal'),
        BatchNormalization(),
        Activation('relu'),
        Dropout(dropout_rate),
    )
    for layer in branch_layers:
        branch = layer(branch)

    # Shortcut: 1x1 projection only when the channel count differs
    shortcut = x
    if shortcut.shape[-1] != filters:
        shortcut = Conv1D(filters, 1, padding='same')(shortcut)

    # Align the time dimension with the branch
    shortcut = shortcut[:, -branch.shape[1]:, :]
    residual_sum = Add()([branch, shortcut])

    return residual_sum, branch

def create_wavenet_model(cgm_shape: tuple, other_features_shape: tuple) -> Model:
    """
    Build a WaveNet model for time-series prediction.

    The returned model is not compiled.

    Parameters:
    -----------
    cgm_shape : tuple
        Shape of the CGM data (samples, timesteps, features)
    other_features_shape : tuple
        Shape of the other features (samples, features)

    Returns:
    --------
    Model
        WaveNet model
    """
    cgm_input = Input(shape=cgm_shape[1:])
    other_input = Input(shape=(other_features_shape[1],))

    # Initial 1x1 projection
    x = Conv1D(WAVENET_CONFIG['filters'][0], 1, padding='same')(cgm_input)

    # WaveNet stack: one block per (filters, dilation) pair, collecting skip outputs
    skip_outputs = []
    for filters in WAVENET_CONFIG['filters']:
        for dilation in WAVENET_CONFIG['dilations']:
            block = WaveNetBlock(
                filters=filters,
                kernel_size=WAVENET_CONFIG['kernel_size'],
                dilation_rate=dilation,
                dropout_rate=WAVENET_CONFIG['dropout_rate']
            )
            x, block_skip = block(x)
            skip_outputs.append(block_skip)

    # Sum the skip outputs (cropped to a common length) and rescale
    if skip_outputs:
        target_len = skip_outputs[-1].shape[1]
        cropped = [branch[:, -target_len:, :] for branch in skip_outputs]
        x = Add()(cropped) / tf.sqrt(float(len(skip_outputs)))

    # Post-processing
    x = Activation(WAVENET_CONFIG['activation'])(x)
    x = Conv1D(WAVENET_CONFIG['filters'][-1], 1, padding='same')(x)
    x = Activation(WAVENET_CONFIG['activation'])(x)
    x = GlobalAveragePooling1D()(x)

    # Merge with the other features
    x = Concatenate()([x, other_input])

    # Final dense layers; the residual is added only when the merged width is 128
    dense_input = x
    x = Dense(128)(x)
    x = BatchNormalization()(x)
    x = Activation(WAVENET_CONFIG['activation'])(x)
    x = Dropout(WAVENET_CONFIG['dropout_rate'])(x)
    x = Dense(128)(x)
    if dense_input.shape[-1] == 128:
        x = Add()([x, dense_input])

    prediction = Dense(1)(x)
    return Model(inputs=[cgm_input, other_input], outputs=prediction)

In [None]:
# Registry mapping a model name to the factory that builds it. Where these
# factories are called in this file it is as
# ``MODEL_CREATORS[name](cgm_shape, other_features_shape)``.
# NOTE(review): create_wavenet_model is defined in this notebook but is not
# registered here — confirm whether the omission is intentional.
MODEL_CREATORS = {
    'CNN': create_cnn_model,
    'Transformer': create_transformer_model,
    'GRU': create_gru_model,
    'Attention': create_attention_model,
    'RNN': create_rnn_model,
    'TabNet': create_tabnet_model,
    'TCN': create_tcn_model,
}

## Funciones de Visualización

In [None]:
def plot_training_history(histories: dict, model_names: list):
    """
    Plot training and validation loss curves of several models on one figure.

    The figure is written to ``training_comparison.png`` inside ``FIGURES_DIR``
    and then closed; nothing is returned.

    Parameters:
    -----------
    histories : dict
        Maps a model name to a mapping holding 'loss' and 'val_loss' sequences
    model_names : list
        List of model names (not used by the function body)
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    for name, history in histories.items():
        ax.plot(history['loss'], label=f'{name} (train)')
        ax.plot(history['val_loss'], label=f'{name} (val)', linestyle='--')

    ax.set_xlabel('Épocas')
    ax.set_ylabel('Pérdida MSE')
    ax.set_title('Comparación de Historiales de Entrenamiento')
    ax.legend()

    output_path = os.path.join(FIGURES_DIR, 'training_comparison.png')
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_predictions_comparison(y_test: np.ndarray, predictions: dict):
    """
    Plot predicted-vs-real scatter and residual histograms for several models.

    The figure is written to ``predictions_comparison.png`` inside
    ``FIGURES_DIR`` and then closed; nothing is returned.

    Parameters:
    -----------
    y_test : np.ndarray
        True test values
    predictions : dict
        Maps a model name to its predictions
    """
    fig, (scatter_ax, residual_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: predicted vs. real, with a fixed 0-15 identity line
    for name, y_pred in predictions.items():
        scatter_ax.scatter(y_test, y_pred, alpha=0.5, label=name)
    scatter_ax.plot([0, 15], [0, 15], 'r--')
    scatter_ax.set_xlabel('Dosis Real (u. de insulina)')
    scatter_ax.set_ylabel('Dosis Predicha (u. de insulina)')
    scatter_ax.legend()
    scatter_ax.set_title('Predicción vs. Real (Todos los Modelos)')

    # Right panel: residual distribution per model
    for name, y_pred in predictions.items():
        residual_ax.hist(y_test - y_pred, bins=20, alpha=0.5, label=name)
    residual_ax.set_xlabel('Residuo (u. de insulina)')
    residual_ax.set_ylabel('Frecuencia')
    residual_ax.legend()
    residual_ax.set_title('Distribución de Residuos')

    fig.tight_layout()
    output_path = os.path.join(FIGURES_DIR, 'predictions_comparison.png')
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)


## Función de Entrenamiento

In [None]:
def create_dataset(x_cgm, x_other, y, batch_size=32):
    """
    Build a tf.data pipeline: cache -> shuffle(1000) -> batch -> prefetch.

    Parameters:
    -----------
    x_cgm : np.ndarray
        CGM data
    x_other : np.ndarray
        Other features
    y : np.ndarray
        Labels
    batch_size : int
        Batch size

    Returns:
    --------
    tf.data.Dataset
        Dataset yielding ``((x_cgm, x_other), y)`` batches
    """
    features = (x_cgm, x_other)
    pipeline = tf.data.Dataset.from_tensor_slices((features, y))
    pipeline = pipeline.cache()
    pipeline = pipeline.shuffle(1000)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(tf.data.AUTOTUNE)

In [None]:
def create_ensemble_prediction(predictions_dict: dict, weights: np.ndarray = None) -> np.ndarray:
    """
    Combine the predictions of several models with a weighted average.

    Parameters:
    -----------
    predictions_dict : dict
        Maps a model name to its predictions; all arrays must share one shape
    weights : np.ndarray, optional
        One weight per model, in the dict's iteration order. If None, every
        model gets the same weight

    Returns:
    --------
    np.ndarray
        Ensemble predictions, with the shape of a single model's predictions
    """
    stacked = np.stack(tuple(predictions_dict.values()))
    n_models = len(predictions_dict)
    model_weights = np.ones(n_models) / n_models if weights is None else weights
    return np.average(stacked, axis=0, weights=model_weights)

def optimize_ensemble_weights(predictions_dict: dict, y_true: np.ndarray) -> np.ndarray:
    """
    Find ensemble weights that minimise the mean squared error against ``y_true``.

    The search starts from uniform weights, bounds every weight to [0, 1] and
    constrains the weights to sum to one.

    Parameters:
    -----------
    predictions_dict : dict
        Maps a model name to its predictions
    y_true : np.ndarray
        True values

    Returns:
    --------
    np.ndarray
        Optimised weights, one per model, re-normalised to sum to one
    """
    n_models = len(predictions_dict)

    def ensemble_mse(raw_weights):
        # Normalise, blend the models and score the blend
        normalized = raw_weights / np.sum(raw_weights)
        blended = create_ensemble_prediction(predictions_dict, normalized)
        return mean_squared_error(y_true, blended)

    sum_to_one = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
    solution = minimize(
        ensemble_mse,
        np.ones(n_models) / n_models,
        bounds=[(0, 1)] * n_models,
        constraints=sum_to_one
    )

    best_weights = solution.x
    return best_weights / np.sum(best_weights)


In [None]:
def train_and_evaluate_model(model: Model, model_name: str, 
                           x_cgm_train: np.ndarray, x_other_train: np.ndarray, 
                           y_train: np.ndarray, x_cgm_val: np.ndarray, 
                           x_other_val: np.ndarray, y_val: np.ndarray,
                           x_cgm_test: np.ndarray, x_other_test: np.ndarray, 
                           y_test: np.ndarray) -> tuple:
    """
    Compile, train, evaluate and save a model.

    The optimizer uses a constant initial learning rate that is lowered by
    ``ReduceLROnPlateau``. That callback needs a settable learning rate, so it
    cannot be combined with a ``LearningRateSchedule`` on the optimizer.

    Side effects: enables XLA JIT globally, sets the global mixed-precision
    policy to 'mixed_float16' for the duration of the call and back to
    'float32' afterwards (also when an exception is raised), writes the best
    checkpoint, TensorBoard logs and the final model under ``MODELS_DIR``.

    Parameters:
    -----------
    model : Model
        Model to train
    model_name : str
        Name used for the saved files and the log directory
    x_cgm_train, x_other_train, y_train : np.ndarray
        Training data
    x_cgm_val, x_other_val, y_val : np.ndarray
        Validation data
    x_cgm_test, x_other_test, y_test : np.ndarray
        Test data

    Returns:
    --------
    tuple
        (history, y_pred, metrics_dict), where metrics_dict has the keys
        'mae', 'rmse' and 'r2'
    """
    # Enable XLA compilation
    tf.config.optimizer.set_jit(True)

    # Build the input pipelines
    train_ds = create_dataset(x_cgm_train, x_other_train, y_train)
    val_ds = create_dataset(x_cgm_val, x_other_val, y_val)

    # Optimizer with gradient clipping. The learning rate is a plain float so
    # that ReduceLROnPlateau below is able to change it.
    initial_learning_rate = 0.001
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=initial_learning_rate,
        clipnorm=1.0
    )

    # Enable mixed precision; the finally clause below always restores float32
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    try:
        # Compile with several metrics
        model.compile(
            optimizer=optimizer,
            loss='mse',
            metrics=['mae', tf.keras.metrics.RootMeanSquaredError()]
        )

        callbacks = [
            # Stop when validation loss no longer improves
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True
            ),
            # Halve the learning rate when validation loss plateaus
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-6
            ),
            # Keep the best model seen so far
            tf.keras.callbacks.ModelCheckpoint(
                os.path.join(MODELS_DIR, f'best_{model_name}.h5'),
                monitor='val_loss',
                save_best_only=True
            ),
            # TensorBoard logs
            tf.keras.callbacks.TensorBoard(
                log_dir=os.path.join(MODELS_DIR, 'logs', model_name),
                histogram_freq=1
            )
        ]

        # Train
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=100,
            callbacks=callbacks,
            verbose=1
        )

        # Predict on the test set
        y_pred = model.predict([x_cgm_test, x_other_test]).flatten()

        # Compute the evaluation metrics
        metrics = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred)
        }

        # Save the final model
        model.save(os.path.join(MODELS_DIR, f'{model_name}.keras'))
    finally:
        # Restore the default precision policy even if training failed
        tf.keras.mixed_precision.set_global_policy('float32')

    return history, y_pred, metrics

def train_model_parallel(name, input_shapes):
    """
    Build the model registered under ``name`` and train/evaluate it.

    The train, validation and test arrays are read from module-level
    variables (``x_cgm_train``, ``x_other_train``, ``y_train``, ...).

    Parameters:
    -----------
    name : str
        Key of the model in ``MODEL_CREATORS``
    input_shapes : tuple
        Shapes for the CGM input (index 0) and the other-features input (index 1)

    Returns:
    --------
    tuple
        ``(name, result)`` where ``result`` is the value returned by
        ``train_and_evaluate_model``
    """
    print(f"\nEntrenando modelo {name}...")

    build_model = MODEL_CREATORS[name]
    model = build_model(input_shapes[0], input_shapes[1])

    result = train_and_evaluate_model(
        model, name,
        x_cgm_train, x_other_train, y_train,
        x_cgm_val, x_other_val, y_val,
        x_cgm_test, x_other_test, y_test
    )
    return name, result

In [None]:
def cross_validate_model(create_model_fn, x_cgm: np.ndarray, x_other: np.ndarray, 
                        y: np.ndarray, n_splits: int = 5) -> tuple:
    """
    Run shuffled K-fold cross-validation of a model.

    Each fold trains a fresh model; the held-out part of the fold is used both
    as validation data and as test data.

    Parameters:
    -----------
    create_model_fn : callable
        Zero-argument function that returns a new model
    x_cgm : np.ndarray
        CGM data
    x_other : np.ndarray
        Other features
    y : np.ndarray
        Labels
    n_splits : int
        Number of folds

    Returns:
    --------
    tuple
        (mean_metrics, std_metrics): two dicts keyed by metric name
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(x_cgm)):
        print(f"\nEntrenando fold {fold + 1}/{n_splits}")

        # Held-out part of this fold
        held_out_cgm = x_cgm[val_idx]
        held_out_other = x_other[val_idx]
        held_out_y = y[val_idx]

        # Create and train a fresh model
        fold_model = create_model_fn()
        fold_result = train_and_evaluate_model(
            model=fold_model,
            model_name=f'fold_{fold}',
            x_cgm_train=x_cgm[train_idx],
            x_other_train=x_other[train_idx],
            y_train=y[train_idx],
            x_cgm_val=held_out_cgm,
            x_other_val=held_out_other,
            y_val=held_out_y,
            x_cgm_test=held_out_cgm,
            x_other_test=held_out_other,
            y_test=held_out_y
        )

        # Index 2 of the result is the metrics dictionary
        fold_metrics.append(fold_result[2])

    # Aggregate every metric across the folds
    mean_scores = {}
    std_scores = {}
    for metric in fold_metrics[0].keys():
        per_fold = [entry[metric] for entry in fold_metrics]
        mean_scores[metric] = np.mean(per_fold)
        std_scores[metric] = np.std(per_fold)

    return mean_scores, std_scores

In [None]:
# Training hyper-parameters read by train_model_sequential below.
train_config = {
    'batch_size': 32,        # passed to model.fit and model.predict
    'epochs': 100,           # maximum number of training epochs
    'learning_rate': 0.001,  # initial Adam learning rate
    'patience': 10,          # EarlyStopping patience, in epochs
    'mixed_precision': True  # NOTE(review): not read in the code visible here — confirm where it is used
}

def train_model_sequential(args):
    """
    Train one model, predict on the test split and return the results.

    Besides its argument, the function reads these module-level names:
    ``x_cgm_train``, ``x_other_train``, ``y_train``, ``x_cgm_val``,
    ``x_other_val``, ``y_val``, ``x_cgm_test``, ``x_other_test``,
    ``train_config``, ``MODEL_CREATORS`` and ``create_dataset``.

    Parameters
    ----------
    args : tuple
        ``(name, input_shapes)``. ``name`` is a key of ``MODEL_CREATORS``;
        the two items of ``input_shapes`` are forwarded to that factory.

    Returns
    -------
    dict or None
        On success, a dictionary with the keys ``'name'``, ``'history'``
        (the ``History.history`` dict), ``'predictions'`` (flattened test
        predictions) and ``'model'`` (the trained model object itself, so
        the result as a whole is not guaranteed to be picklable).
        ``None`` if training or prediction raised; the error is printed
        and not propagated, so callers must check for ``None``.
    """
    name, input_shapes = args

    # Phase 1: build, compile and fit. Any failure is reported and mapped to None.
    try:
        # Create datasets with prefetching
        train_ds = (
            create_dataset(x_cgm_train, x_other_train, y_train)
            .prefetch(tf.data.AUTOTUNE)
        )
        val_ds = (
            create_dataset(x_cgm_val, x_other_val, y_val)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Create the model from the registered factory
        model = MODEL_CREATORS[name](input_shapes[0], input_shapes[1])

        # Compile with gradient clipping
        model.compile(
            optimizer=tf.keras.optimizers.Adam(
                learning_rate=train_config['learning_rate'],
                clipnorm=1.0
            ),
            loss='mse',
            jit_compile=False  # XLA disabled
        )

        # `batch_size` is intentionally NOT passed to fit(): Keras documents
        # that it must not be specified for dataset inputs, because the
        # dataset is expected to yield batches itself.
        # NOTE(review): this assumes create_dataset() already batches its
        # output - confirm against its definition.
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=train_config['epochs'],
            callbacks=[
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss',
                    patience=train_config['patience'],
                    restore_best_weights=True
                ),
                tf.keras.callbacks.ReduceLROnPlateau(
                    monitor='val_loss',
                    factor=0.5,
                    patience=5,
                    min_lr=1e-6
                ),
                tf.keras.callbacks.ModelCheckpoint(
                    f'checkpoints/{name}_best.h5',
                    monitor='val_loss',
                    save_best_only=True,
                    mode='min'
                )
            ],
            verbose=1
        )
    except Exception as e:
        print(f"Training error for {name}: {str(e)}")
        return None

    # Phase 2: predict on the test arrays (plain arrays, so batch_size applies here).
    try:
        y_pred = model.predict(
            [x_cgm_test, x_other_test],
            batch_size=train_config['batch_size'],
            verbose=1
        ).flatten()
    except Exception as e:
        print(f"Prediction error for {name}: {str(e)}")
        return None

    return {
        'name': name,
        'history': history.history,
        'predictions': y_pred,
        'model': model
    }

def calculate_metrics(predictions, y_true):
    """
    Score predictions against the ground truth.

    Returns a dict with the keys ``'mae'``, ``'rmse'`` (square root of the
    mean squared error) and ``'r2'``.
    """
    mae = mean_absolute_error(y_true, predictions)
    mse = mean_squared_error(y_true, predictions)
    r2 = r2_score(y_true, predictions)
    return dict(mae=mae, rmse=np.sqrt(mse), r2=r2)

In [None]:
def enhance_features(x_cgm, x_other, window=5):
    """
    Append first-difference and rolling-mean channels to the CGM windows.

    Parameters
    ----------
    x_cgm : np.ndarray
        Array of shape (samples, steps) or (samples, steps, channels).
        A 2-D input is treated as a single channel.
    x_other : np.ndarray
        Returned unchanged.
    window : int, optional
        Length of the centered moving-average window (default 5).

    Returns
    -------
    tuple
        ``(x_cgm_enhanced, x_other)``. ``x_cgm_enhanced`` has shape
        (samples, steps, 3 * channels); along the last axis it holds the
        original values, then the differences, then the rolling means.

    Raises
    ------
    ValueError
        If ``x_cgm`` is not 2-D or 3-D, or if ``window`` is smaller than 1.
    """
    cgm = np.asarray(x_cgm)
    if cgm.ndim == 2:
        cgm = cgm[..., np.newaxis]
    elif cgm.ndim != 3:
        raise ValueError(f"x_cgm must be 2-D or 3-D, got {cgm.ndim} dimension(s)")
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    # Nothing to compute on an empty array; only widen the channel axis.
    if cgm.size == 0:
        return np.concatenate([cgm, cgm, cgm], axis=-1), x_other

    n_steps = cgm.shape[1]

    # Difference along the time axis, kept 3-D so the batch axis survives even
    # for a single sample. The first step repeats the first difference so the
    # result has as many steps as the input.
    if n_steps > 1:
        cgm_diff = np.pad(
            np.diff(cgm, axis=1), ((0, 0), (1, 0), (0, 0)), mode='edge'
        )
    else:
        # A single step has no neighbour to difference against.
        cgm_diff = np.zeros_like(cgm)

    # Centered moving average. Slicing the 'full' convolution reproduces
    # mode='same' when steps >= window and still returns exactly `n_steps`
    # values when the sequence is shorter than the window.
    kernel = np.ones(window) / window
    offset = (window - 1) // 2

    def _centered_mean(series):
        return np.convolve(series, kernel, mode='full')[offset:offset + n_steps]

    rolling_mean = np.apply_along_axis(_centered_mean, 1, cgm)

    x_cgm_enhanced = np.concatenate([cgm, cgm_diff, rolling_mean], axis=-1)

    return x_cgm_enhanced, x_other

## Entrenamiento y Evaluación de los Modelos

In [None]:
# Model training and evaluation
# GPU memory is configured first, before anything else in this cell
try:
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("No GPU devices found, using CPU")
    else:
        # Turn on memory growth for every detected GPU
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        print("GPU memory growth enabled")
except RuntimeError as gpu_error:
    print(f"GPU configuration error: {gpu_error}")

# Disable XLA
tf.config.optimizer.set_jit(False)

# Training configuration
train_config = dict(
    batch_size=32,
    epochs=100,
    learning_rate=0.001,
    patience=10,
    mixed_precision=True,
)

# Switch the global Keras policy to mixed precision when the config asks for it
if train_config['mixed_precision']:
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    print("Mixed precision enabled")

# Shapes handed to the model factories, and the models to train
input_shapes = (x_cgm_train.shape, x_other_train.shape)
models_names = ['CNN', 'Transformer', 'GRU', 'Attention', 'RNN', 'TabNet', 'TCN', 'WaveNet']

histories, predictions, metrics = {}, {}, {}

# Train the models one after another; failed models are simply left out
model_results = []
for model_name in models_names:
    print(f"\nEntrenando modelo {model_name}...")
    try:
        outcome = train_model_sequential((model_name, input_shapes))
    except Exception as train_error:
        print(f"Error entrenando {model_name}: {str(train_error)}")
        continue
    if outcome is not None:
        model_results.append(outcome)

# Compute the test metrics of every trained model with joblib
print("\nCalculando métricas en paralelo...")
metric_jobs = (
    delayed(calculate_metrics)(np.array(outcome['predictions']), y_test)
    for outcome in model_results
)
with Parallel(n_jobs=-1, verbose=1) as parallel:
    metric_results = parallel(metric_jobs)

# Index histories, predictions and metrics by model name
for outcome, outcome_metrics in zip(model_results, metric_results):
    key = outcome['name']
    histories[key] = outcome['history']
    predictions[key] = np.array(outcome['predictions'])
    metrics[key] = outcome_metrics

# Per-subject evaluation
print("\nRendimiento por sujeto:")
for subject_id in test_subjects:
    mask = subject_test == subject_id
    y_test_sub = y_test[mask]

    print(f"\nSujeto {subject_id}:")
    print("-" * 40)
    for model_name, y_pred in predictions.items():
        # Same three scores as the global evaluation, restricted to this subject
        sub = calculate_metrics(y_pred[mask], y_test_sub)
        print(f"{model_name:<15} MAE={sub['mae']:.2f}, RMSE={sub['rmse']:.2f}, R²={sub['r2']:.2f}")


## Visualización de los Resultados

In [None]:
# Visualize the results through the notebook's two plotting helpers.
# NOTE(review): `models_names` lists every model, while `histories` only holds
# the ones whose training succeeded - confirm plot_training_history tolerates
# names that have no recorded history.
plot_training_history(histories, models_names)
plot_predictions_comparison(y_test, predictions)

In [None]:
# Build the ensembles once the individual model results are stored
print("\nCreando predicciones del ensemble...")

# Only the base models take part in the ensembles. Working on this snapshot
# (instead of the live `predictions` dict) keeps the 'Ensemble' entries added
# below from being fed back in as ensemble members, both within this cell and
# when the cell is re-run.
base_predictions = {
    name: predictions[name] for name in models_names if name in predictions
}

# Ensemble prediction with the default weighting of create_ensemble_prediction
ensemble_pred = create_ensemble_prediction(base_predictions)
ensemble_metrics = calculate_metrics(ensemble_pred, y_test)

# Optimize the ensemble weights
# NOTE(review): the weights are fitted against y_test and then scored on the
# same y_test, so the 'Ensemble (Opt)' metrics are optimistic - consider
# fitting the weights on validation predictions instead.
print("\nOptimizando pesos del ensemble...")
optimal_weights = optimize_ensemble_weights(base_predictions, y_test)

# Ensemble prediction using the optimized weights
ensemble_pred_optimized = create_ensemble_prediction(base_predictions, optimal_weights)
ensemble_metrics_optimized = calculate_metrics(ensemble_pred_optimized, y_test)

# Publish both ensembles next to the individual models
metrics['Ensemble'] = ensemble_metrics
predictions['Ensemble'] = ensemble_pred
metrics['Ensemble (Opt)'] = ensemble_metrics_optimized
predictions['Ensemble (Opt)'] = ensemble_pred_optimized

# Cross-validation for every model
print("\nRealizando validación cruzada...")
cv_results = {}

for name in models_names:
    print(f"\nValidación cruzada para {name}")

    # The default argument pins the current name to this factory
    def model_creator(name=name):
        return MODEL_CREATORS[name](input_shapes[0], input_shapes[1])

    mean_scores, std_scores = cross_validate_model(
        create_model_fn=model_creator, x_cgm=x_cgm, x_other=x_other, y=y
    )
    cv_results[name] = dict(mean=mean_scores, std=std_scores)

# Print the results as a "mean±std" table
print("\nResultados de validación cruzada:")
rule = "-" * 70
print(rule)
print(f"{'Modelo':<15} {'MAE':>12} {'RMSE':>12} {'R²':>12}")
print(rule)
for name, results in cv_results.items():
    mean, std = results['mean'], results['std']
    row = (
        f"{name:<15} {mean['mae']:>8.2f}±{std['mae']:4.2f} "
        f"{mean['rmse']:>8.2f}±{std['rmse']:4.2f} "
        f"{mean['r2']:>8.2f}±{std['r2']:4.2f}"
    )
    print(row)

# Refresh the plots now that the ensemble entries exist.
# NOTE(review): 'Ensemble' and 'Ensemble (Opt)' are added to `predictions` and
# `metrics` earlier in this cell, but nothing here adds them to `histories` -
# confirm plot_training_history tolerates names without a training history.
plot_training_history(histories, models_names + ['Ensemble', 'Ensemble (Opt)'])
plot_predictions_comparison(y_test, predictions)

## Métricas Comparativas

In [None]:
# Print the comparative metrics table (one row per entry of `metrics`)
print("\nComparación de métricas:")
divider = "-" * 50
print(divider)
print(f"{'Modelo':<15} {'MAE':>8} {'RMSE':>8} {'R²':>8}")
print(divider)
for model_name, scores in metrics.items():
    print(f"{model_name:<15} {scores['mae']:8.2f} {scores['rmse']:8.2f} {scores['r2']:8.2f}")