# 05 keras custom layers models
**Location: TensorVerseHub/notebooks/02_neural_networks_with_keras/05_keras_custom_layers_models.ipynb**

In [None]:
import tensorflow as tf
import numpy as np
# Print the installed TensorFlow version so the notebook output records it.
print(f"TensorFlow version: {tf.__version__}")
# TODO: Add comprehensive implementation

# Custom tf.keras Layers, Models & Training Loops

**File Location:** `notebooks/02_neural_networks_with_keras/05_keras_custom_layers_models.ipynb`

Build custom tf.keras layers, implement advanced model architectures, and create custom training loops. Master the art of extending TensorFlow with your own components for cutting-edge research and specialized applications.

## Learning Objectives
- Build custom tf.keras layers with proper state management
- Implement custom loss functions and metrics
- Create advanced custom training loops with tf.GradientTape
- Handle complex model architectures with custom components
- Implement attention mechanisms and advanced layers
- Master gradient computation and optimization strategies

---

## 1. Custom Layer Fundamentals

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
import warnings
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

print(f"TensorFlow version: {tf.__version__}")
# Seed both TensorFlow's and NumPy's global generators so the random data and
# weight initialisations below are repeatable between runs.
tf.random.set_seed(42)
np.random.seed(42)

# Basic custom layer template
class BasicCustomLayer(tf.keras.layers.Layer):
    """Minimal fully-connected layer illustrating the custom-layer lifecycle.

    Computes ``activation(inputs @ kernel + bias)``. Weights are created
    lazily in ``build`` once the size of the last input axis is known.

    Args:
        units: Size of the output feature axis.
        activation: Anything accepted by ``tf.keras.activations.get``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, units=32, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        """Allocate the kernel and bias from the last input dimension."""
        feature_dim = input_shape[-1]
        self.w = self.add_weight(
            name='kernel',
            shape=(feature_dim, self.units),
            initializer='random_normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='bias',
            shape=(self.units,),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs, training=None):
        """Apply the affine map, then the activation when one is set."""
        projected = tf.matmul(inputs, self.w) + self.b
        if self.activation is None:
            return projected
        return self.activation(projected)

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        base_config = super().get_config()
        return {
            **base_config,
            'units': self.units,
            'activation': tf.keras.activations.serialize(self.activation),
        }

# Advanced custom layers
class ResidualBlock(tf.keras.layers.Layer):
    """Residual block built from Dense layers, with an optional bottleneck.

    The main path is Dense -> BatchNorm -> Dropout -> Dense -> BatchNorm
    (three Dense layers when ``use_bottleneck`` is true). Its output is added
    to a shortcut of the input and passed through ReLU.

    The shortcut is projected with ``Dense(filters)`` + BatchNorm whenever the
    identity cannot be added directly: when ``stride != 1`` (as before), and
    also when the input feature size differs from ``filters``. The second
    condition is only known in ``build``; without it the ``Add`` fails for any
    input whose last dimension is not ``filters``.

    Args:
        filters: Output feature size of the block.
        kernel_size: Stored and serialized only; the Dense layers ignore it.
        stride: Stored and serialized; any value other than 1 forces the
            shortcut projection.
        use_bottleneck: Use a ``filters // 4`` wide three-layer main path.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, filters, kernel_size=3, stride=1, use_bottleneck=False, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size
        self.stride = stride
        self.use_bottleneck = use_bottleneck

        if use_bottleneck:
            self.conv1 = tf.keras.layers.Dense(filters // 4, activation='relu')
            self.conv2 = tf.keras.layers.Dense(filters // 4, activation='relu')
            self.conv3 = tf.keras.layers.Dense(filters, activation=None)
        else:
            self.conv1 = tf.keras.layers.Dense(filters, activation='relu')
            self.conv2 = tf.keras.layers.Dense(filters, activation=None)

        self.bn1 = tf.keras.layers.BatchNormalization()
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.bn3 = tf.keras.layers.BatchNormalization() if use_bottleneck else None

        # A single Dropout layer is shared by both dropout sites in call().
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.add = tf.keras.layers.Add()
        self.activation = tf.keras.layers.Activation('relu')

        # Skip connection adjustment. The stride-driven case is known now;
        # the feature-size-driven case is resolved in build().
        self.use_projection = False
        self.projection = None
        self.projection_bn = None
        if stride != 1:
            self._create_projection()

    def _create_projection(self):
        """Create the Dense + BatchNorm pair that reshapes the shortcut."""
        self.use_projection = True
        self.projection = tf.keras.layers.Dense(self.filters)
        self.projection_bn = tf.keras.layers.BatchNormalization()

    def build(self, input_shape):
        """Enable the shortcut projection when input width != ``filters``."""
        in_features = input_shape[-1]
        needs_projection = in_features is not None and in_features != self.filters
        if needs_projection and not self.use_projection:
            self._create_projection()
        super().build(input_shape)

    def call(self, inputs, training=None):
        """Run the main path, add the (possibly projected) shortcut, apply ReLU."""
        # Main path
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = self.dropout(x, training=training)

        if self.use_bottleneck:
            x = self.conv2(x)
            x = self.bn2(x, training=training)
            x = self.dropout(x, training=training)
            x = self.conv3(x)
            x = self.bn3(x, training=training)
        else:
            x = self.conv2(x)
            x = self.bn2(x, training=training)

        # Skip connection
        shortcut = inputs
        if self.use_projection:
            shortcut = self.projection(shortcut)
            shortcut = self.projection_bn(shortcut, training=training)

        # Add and activate
        x = self.add([x, shortcut])
        return self.activation(x)

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({
            'filters': self.filters,
            'kernel_size': self.kernel_size,
            'stride': self.stride,
            'use_bottleneck': self.use_bottleneck
        })
        return config

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention followed by a residual add and layer norm.

    Queries, keys and values are all projected from the same ``inputs``. The
    layer returns ``LayerNorm(Dense(attention) + inputs)``, so the residual
    connection and normalisation are already part of its output.

    Args:
        embed_dim: Size of the last input axis; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.depth = embed_dim // num_heads

        self.wq = tf.keras.layers.Dense(embed_dim)
        self.wk = tf.keras.layers.Dense(embed_dim)
        self.wv = tf.keras.layers.Dense(embed_dim)

        self.dense = tf.keras.layers.Dense(embed_dim)
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(0.1)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, embed_dim) to (batch, num_heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def scaled_dot_product_attention(self, q, k, v, mask=None, training=None):
        """Compute softmax(q k^T / sqrt(depth)) v.

        Args:
            q, k, v: Per-head tensors as produced by ``split_heads``.
            mask: Optional tensor broadcastable to the attention logits.
                Non-zero / true entries mark positions to suppress; they get
                ``-1e9`` added to their logit.
            training: Forwarded to the attention dropout so it is only active
                while training.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        matmul_qk = tf.matmul(q, k, transpose_b=True)

        # Scale in the logits' own dtype rather than forcing float32.
        dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Cast the mask so boolean or integer masks work as well as float ones.
        if mask is not None:
            mask = tf.cast(mask, scaled_attention_logits.dtype)
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        attention_weights = self.dropout(attention_weights, training=training)

        output = tf.matmul(attention_weights, v)
        return output, attention_weights

    def call(self, inputs, training=None, mask=None):
        """Apply self-attention to ``inputs`` of shape (batch, seq, embed_dim)."""
        batch_size = tf.shape(inputs)[0]
        seq_len = tf.shape(inputs)[1]

        # Linear transformations
        q = self.wq(inputs)
        k = self.wk(inputs)
        v = self.wv(inputs)

        # Split heads
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Attention (weights are computed but not returned by call)
        attention_output, _ = self.scaled_dot_product_attention(
            q, k, v, mask, training=training
        )

        # Concatenate heads back to (batch, seq, embed_dim)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention_output, (batch_size, seq_len, self.embed_dim))

        # Final linear layer
        output = self.dense(concat_attention)

        # Add & norm
        output = self.layer_norm(output + inputs)

        return output

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads
        })
        return config

# Test custom layers
print("=== Testing Custom Layers ===")

# Create sample data: 100 rows of 20 standard-normal float32 features, plus
# integer labels drawn from {0, 1, 2}. Later cells reuse X_sample.
X_sample = np.random.randn(100, 20).astype(np.float32)
y_sample = np.random.randint(0, 3, 100)

# Test basic custom layer on the first five rows (20 features -> 64 units)
basic_layer = BasicCustomLayer(units=64, activation='relu')
basic_output = basic_layer(X_sample[:5])
print(f"Basic Custom Layer Output Shape: {basic_output.shape}")

# Test residual block in bottleneck mode; note the input has 20 features
# while the block outputs 64
residual_layer = ResidualBlock(filters=64, use_bottleneck=True)
residual_output = residual_layer(X_sample[:5])
print(f"Residual Block Output Shape: {residual_output.shape}")

# Test attention layer (reshape data for sequence): the 20 features become
# 4 steps of 5 features; one head keeps embed_dim divisible by num_heads
X_sequence = X_sample.reshape(100, 4, 5)  # (batch, seq_len, embed_dim)
attention_layer = MultiHeadSelfAttention(embed_dim=5, num_heads=1)
attention_output = attention_layer(X_sequence[:5])
print(f"Attention Layer Output Shape: {attention_output.shape}")

## 2. Advanced Custom Layers

In [None]:
# Specialized custom layers
class GatedLinearUnit(tf.keras.layers.Layer):
    """Gated linear unit: a linear projection scaled by a sigmoid gate.

    Two independent ``Dense(units)`` projections are computed from the same
    input; one is squashed by a sigmoid and multiplies the other element-wise.

    Args:
        units: Output feature size of both projections.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.dense_gate = tf.keras.layers.Dense(units)
        self.dense_value = tf.keras.layers.Dense(units)

    def call(self, inputs):
        """Return ``sigmoid(gate(inputs)) * value(inputs)``."""
        gate_logits = self.dense_gate(inputs)
        candidate = self.dense_value(inputs)
        return tf.nn.sigmoid(gate_logits) * candidate

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        base_config = super().get_config()
        base_config['units'] = self.units
        return base_config

class SwishActivation(tf.keras.layers.Layer):
    """Swish activation, ``x * sigmoid(beta * x)``, with a fixed ``beta``.

    Args:
        beta: Constant multiplier applied inside the sigmoid (not trainable).
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, beta=1.0, **kwargs):
        super().__init__(**kwargs)
        self.beta = beta

    def call(self, inputs):
        """Apply the activation element-wise."""
        gate = tf.nn.sigmoid(self.beta * inputs)
        return inputs * gate

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        base_config = super().get_config()
        base_config['beta'] = self.beta
        return base_config

class LayerScale(tf.keras.layers.Layer):
    """Multiply each feature by its own trainable scale factor.

    One scale per entry of the last input axis is created in ``build`` and
    initialised to ``init_value``.

    Args:
        init_value: Initial value of every scale factor.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, init_value=1e-4, **kwargs):
        super().__init__(**kwargs)
        self.init_value = init_value

    def build(self, input_shape):
        """Create the per-feature scale vector."""
        num_features = input_shape[-1]
        constant_init = tf.keras.initializers.Constant(self.init_value)
        self.scale = self.add_weight(
            name='scale',
            shape=(num_features,),
            initializer=constant_init,
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Scale ``inputs`` along the last axis (broadcast over the rest)."""
        return inputs * self.scale

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        base_config = super().get_config()
        base_config['init_value'] = self.init_value
        return base_config

class NoiseRegularization(tf.keras.layers.Layer):
    """Add zero-mean Gaussian noise to the inputs while training.

    When ``training`` is falsy (including ``None``) the layer is the identity.

    Args:
        noise_stddev: Standard deviation of the added noise.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, noise_stddev=0.1, **kwargs):
        super().__init__(**kwargs)
        self.noise_stddev = noise_stddev

    def call(self, inputs, training=None):
        """Return ``inputs`` plus noise in training mode, else ``inputs``."""
        if training:
            # Sample in the inputs' dtype; tf.random.normal defaults to
            # float32, which cannot be added to float16/float64 inputs.
            noise = tf.random.normal(
                tf.shape(inputs), stddev=self.noise_stddev, dtype=inputs.dtype
            )
            return inputs + noise
        return inputs

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({'noise_stddev': self.noise_stddev})
        return config

class FeatureSqueezeExcitation(tf.keras.layers.Layer):
    """Squeeze-and-excitation block that rescales features with learned gates.

    For 3-D inputs ``(batch, steps, features)`` the features are averaged over
    the step axis, passed through a two-layer bottleneck ending in a sigmoid,
    and the resulting per-feature gates multiply every step.

    For 2-D inputs ``(batch, features)`` there is no step axis to average
    over, so the gates are computed from the inputs directly. (The previous
    code pooled over a dummy length-1 axis and then called
    ``tf.squeeze(..., axis=1)`` on the already 2-D result, which raises unless
    ``features == 1``.)

    Args:
        reduction_ratio: Divisor for the bottleneck width; the width is
            ``max(1, features // reduction_ratio)``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, reduction_ratio=16, **kwargs):
        super().__init__(**kwargs)
        self.reduction_ratio = reduction_ratio

    def build(self, input_shape):
        """Create the pooling layer and the excitation bottleneck."""
        self.feature_dim = input_shape[-1]
        reduced_dim = max(1, self.feature_dim // self.reduction_ratio)

        self.squeeze = tf.keras.layers.GlobalAveragePooling1D()
        self.excitation = tf.keras.Sequential([
            tf.keras.layers.Dense(reduced_dim, activation='relu'),
            tf.keras.layers.Dense(self.feature_dim, activation='sigmoid')
        ])
        super().build(input_shape)

    def call(self, inputs):
        """Return ``inputs`` multiplied by their per-feature sigmoid gates."""
        if len(inputs.shape) == 2:
            # Averaging over a single step is the identity, so the inputs are
            # already the squeezed descriptor.
            gates = self.excitation(inputs)
            return inputs * gates

        # Squeeze: average over the step axis -> (batch, features)
        squeeze_output = self.squeeze(inputs)

        # Excitation: per-feature gates in (0, 1)
        gates = self.excitation(squeeze_output)

        # Re-insert the step axis so the gates broadcast over every step
        return inputs * tf.expand_dims(gates, axis=1)

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({'reduction_ratio': self.reduction_ratio})
        return config

# Position encoding for transformers
class PositionalEncoding(tf.keras.layers.Layer):
    """Add fixed sinusoidal position information to a sequence.

    The table has ``max_position`` rows. Each row holds the sine terms for all
    frequencies followed by the cosine terms (concatenated, not interleaved).
    For an odd feature size the last cosine column is dropped so the table
    width equals the feature size.

    Inputs are expected as ``(batch, seq_len, features)``; ``seq_len`` must
    not exceed ``max_position``, because the table is simply sliced to the
    first ``seq_len`` rows.

    Args:
        max_position: Number of positions to precompute.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, max_position=1000, **kwargs):
        super().__init__(**kwargs)
        self.max_position = max_position

    def build(self, input_shape):
        """Precompute the (max_position, features) encoding table."""
        self.d_model = input_shape[-1]

        # Column vector of positions and row vector of inverse frequencies
        position = tf.range(self.max_position, dtype=tf.float32)[:, tf.newaxis]
        div_term = tf.exp(tf.range(0, self.d_model, 2, dtype=tf.float32) *
                          -(tf.math.log(10000.0) / self.d_model))
        angles = position * div_term

        pos_encoding = tf.concat([tf.sin(angles), tf.cos(angles)], axis=1)

        # With an odd d_model the concat is one column too wide; trim it.
        if self.d_model % 2 == 1:
            pos_encoding = pos_encoding[:, :-1]

        self.pos_encoding = tf.Variable(
            pos_encoding, trainable=False, name='positional_encoding'
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return ``inputs`` plus the encoding for positions 0..seq_len-1."""
        seq_len = tf.shape(inputs)[1]
        # The table is float32; cast so non-float32 inputs can be added to it.
        encoding = tf.cast(self.pos_encoding[:seq_len, :], inputs.dtype)
        return inputs + encoding

    def get_config(self):
        """Return the constructor arguments needed to rebuild the layer."""
        config = super().get_config()
        config.update({'max_position': self.max_position})
        return config

# Test advanced layers
print("\n=== Testing Advanced Custom Layers ===")

# Test GLU on the first five 20-feature rows (projected to 32 units)
glu_layer = GatedLinearUnit(32)
glu_output = glu_layer(X_sample[:5])
print(f"GLU Output Shape: {glu_output.shape}")

# Test Swish activation (element-wise, so the output keeps the input shape)
swish_layer = SwishActivation(beta=1.5)
swish_output = swish_layer(X_sample[:5])
print(f"Swish Output Shape: {swish_output.shape}")

# Test Squeeze and Excitation on 2-D input; with 20 features and ratio 8 the
# bottleneck width is max(1, 20 // 8) = 2
se_layer = FeatureSqueezeExcitation(reduction_ratio=8)
se_output = se_layer(X_sample[:5])
print(f"SE Output Shape: {se_output.shape}")

# Test positional encoding on the (batch, 4, 5) sequence view of the data;
# the feature size 5 is odd, which exercises the column-trimming branch
pos_enc = PositionalEncoding(max_position=100)
pos_output = pos_enc(X_sequence[:5])
print(f"Positional Encoding Output Shape: {pos_output.shape}")

## 3. Custom Loss Functions and Metrics

In [None]:
# Custom loss functions
class FocalLoss(tf.keras.losses.Loss):
    """Multi-class focal loss for class-imbalanced classification.

    Per sample: ``alpha * (1 - p_t) ** gamma * CE(y_true, y_pred)``, where
    ``p_t`` is the probability the model assigns to the true class. Easy,
    confidently-correct samples are therefore down-weighted.

    ``y_pred`` must hold class probabilities. ``y_true`` may be rank-1 integer
    class ids (one-hot encoded internally) or one-hot / soft label rows.

    The previous implementation multiplied a ``(batch, classes)`` weight by a
    ``(batch,)`` cross-entropy, which fails to broadcast unless the batch size
    happens to equal the class count.

    Args:
        alpha: Constant scale applied to every sample's loss.
        gamma: Focusing exponent; 0 reduces to scaled cross-entropy.
        **kwargs: Forwarded to ``tf.keras.losses.Loss``.
    """

    def __init__(self, alpha=0.25, gamma=2.0, **kwargs):
        super().__init__(**kwargs)
        self.alpha = alpha
        self.gamma = gamma

    def call(self, y_true, y_pred):
        """Return the per-sample focal loss; the base class applies reduction."""
        y_pred = tf.convert_to_tensor(y_pred)

        # Convert sparse class ids to one-hot rows
        if len(y_true.shape) == 1:
            y_true = tf.one_hot(tf.cast(y_true, tf.int32), tf.shape(y_pred)[-1])
        y_true = tf.cast(y_true, y_pred.dtype)

        # Per-sample cross entropy, shape (batch,)
        ce_loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred)

        # Probability assigned to the true class, shape (batch,)
        p_t = tf.reduce_sum(y_true * y_pred, axis=-1)
        focal_weight = self.alpha * tf.pow(1.0 - p_t, self.gamma)

        return focal_weight * ce_loss

    def get_config(self):
        """Return the constructor arguments needed to rebuild the loss."""
        config = super().get_config()
        config.update({'alpha': self.alpha, 'gamma': self.gamma})
        return config

class ContrastiveLoss(tf.keras.losses.Loss):
    """Contrastive loss over pairwise distances for similarity learning.

    ``y_pred`` holds the distance of each pair. ``y_true`` is 1 for a similar
    pair, whose squared distance is penalised, and 0 for a dissimilar pair,
    which is penalised only while its distance is inside ``margin``.

    Args:
        margin: Distance beyond which dissimilar pairs contribute no loss.
        **kwargs: Forwarded to ``tf.keras.losses.Loss``.
    """

    def __init__(self, margin=1.0, **kwargs):
        super().__init__(**kwargs)
        self.margin = margin

    def call(self, y_true, y_pred):
        """Return the mean contrastive loss over all pairs."""
        y_pred = tf.convert_to_tensor(y_pred)
        # Labels usually arrive as integers; cast so they can weight the
        # floating-point distance terms.
        y_true = tf.cast(y_true, y_pred.dtype)

        square_pred = tf.square(y_pred)
        margin_square = tf.square(tf.maximum(self.margin - y_pred, 0))

        loss = tf.reduce_mean(
            y_true * square_pred + (1 - y_true) * margin_square
        )
        return loss

    def get_config(self):
        """Return the constructor arguments needed to rebuild the loss."""
        config = super().get_config()
        config.update({'margin': self.margin})
        return config

class LabelSmoothingLoss(tf.keras.losses.Loss):
    """Categorical cross-entropy against label-smoothed targets.

    Each target row becomes ``y * (1 - smoothing) + smoothing / num_classes``
    before the cross-entropy with ``y_pred`` is taken. Rank-1 ``y_true`` is
    treated as integer class ids and one-hot encoded first.

    Args:
        smoothing: Probability mass spread uniformly over all classes.
        **kwargs: Forwarded to ``tf.keras.losses.Loss``.
    """

    def __init__(self, smoothing=0.1, **kwargs):
        super().__init__(**kwargs)
        self.smoothing = smoothing

    def call(self, y_true, y_pred):
        """Return the per-sample smoothed cross-entropy."""
        is_sparse = len(y_true.shape) == 1
        if is_sparse:
            class_ids = tf.cast(y_true, tf.int32)
            y_true = tf.one_hot(class_ids, tf.shape(y_pred)[-1])

        num_classes = tf.cast(tf.shape(y_true)[-1], tf.float32)
        keep_fraction = 1.0 - self.smoothing
        uniform_share = self.smoothing / num_classes

        smoothed_labels = y_true * keep_fraction + uniform_share
        return tf.keras.losses.categorical_crossentropy(smoothed_labels, y_pred)

    def get_config(self):
        """Return the constructor arguments needed to rebuild the loss."""
        base_config = super().get_config()
        base_config['smoothing'] = self.smoothing
        return base_config

# Custom metrics
class F1Score(tf.keras.metrics.Metric):
    """Streaming multi-class F1 score.

    Per-class true-positive, false-positive and false-negative counts are
    accumulated across ``update_state`` calls; ``result`` derives precision,
    recall and F1 from them.

    Args:
        num_classes: Number of classes; labels outside ``[0, num_classes)``
            never count as a true class.
        average: ``'macro'`` (unweighted mean of per-class F1), ``'weighted'``
            (mean weighted by each class's true-sample count), or anything
            else to return the per-class F1 vector.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, num_classes, average='macro', **kwargs):
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.average = average

        # `name` is passed by keyword: its positional slot differs between
        # Keras versions, the keyword form is accepted by all of them.
        self.true_positives = self.add_weight(
            name='true_positives', shape=(num_classes,), initializer='zeros'
        )
        self.false_positives = self.add_weight(
            name='false_positives', shape=(num_classes,), initializer='zeros'
        )
        self.false_negatives = self.add_weight(
            name='false_negatives', shape=(num_classes,), initializer='zeros'
        )

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate confusion counts for one batch.

        Args:
            y_true: Integer class ids of shape ``(batch,)`` or ``(batch, 1)``,
                or one-hot rows of shape ``(batch, num_classes)``.
            y_pred: Per-class scores; the arg-max over the last axis is the
                predicted class.
            sample_weight: Optional per-sample weights applied to the counts.
        """
        y_true = tf.convert_to_tensor(y_true)
        if len(y_true.shape) > 1:
            if y_true.shape[-1] == 1:
                # Column of sparse ids, not a one-hot row: arg-max would
                # collapse every label to class 0.
                y_true = tf.squeeze(y_true, axis=-1)
            else:
                y_true = tf.argmax(y_true, axis=-1)

        count_dtype = self.true_positives.dtype
        true_ids = tf.reshape(tf.cast(y_true, tf.int32), [-1])
        pred_ids = tf.reshape(
            tf.argmax(y_pred, axis=-1, output_type=tf.int32), [-1]
        )

        # One-hot indicator matrices of shape (samples, num_classes) replace
        # the per-class Python loop. Sliced variables only support `.assign`,
        # not `.assign_add`, so the whole count vector is updated at once.
        true_oh = tf.one_hot(true_ids, self.num_classes, dtype=count_dtype)
        pred_oh = tf.one_hot(pred_ids, self.num_classes, dtype=count_dtype)

        tp = true_oh * pred_oh
        fp = (1.0 - true_oh) * pred_oh
        fn = true_oh * (1.0 - pred_oh)

        if sample_weight is not None:
            weights = tf.reshape(tf.cast(sample_weight, count_dtype), [-1, 1])
            tp, fp, fn = tp * weights, fp * weights, fn * weights

        self.true_positives.assign_add(tf.reduce_sum(tp, axis=0))
        self.false_positives.assign_add(tf.reduce_sum(fp, axis=0))
        self.false_negatives.assign_add(tf.reduce_sum(fn, axis=0))

    def result(self):
        """Return the F1 score aggregated according to ``average``."""
        eps = tf.keras.backend.epsilon()
        precision = self.true_positives / (self.true_positives + self.false_positives + eps)
        recall = self.true_positives / (self.true_positives + self.false_negatives + eps)
        f1 = 2 * (precision * recall) / (precision + recall + eps)

        if self.average == 'macro':
            return tf.reduce_mean(f1)
        elif self.average == 'weighted':
            # Support of a class = number of samples whose true label is it.
            weights = self.true_positives + self.false_negatives
            # divide_no_nan yields 0 (not NaN) before any sample was seen.
            return tf.math.divide_no_nan(
                tf.reduce_sum(f1 * weights), tf.reduce_sum(weights)
            )
        else:
            return f1

    def reset_state(self):
        """Zero all accumulated counts."""
        self.true_positives.assign(tf.zeros_like(self.true_positives))
        self.false_positives.assign(tf.zeros_like(self.false_positives))
        self.false_negatives.assign(tf.zeros_like(self.false_negatives))

    def get_config(self):
        """Return the constructor arguments needed to rebuild the metric."""
        config = super().get_config()
        config.update({'num_classes': self.num_classes, 'average': self.average})
        return config

class TopKCategoricalAccuracy(tf.keras.metrics.Metric):
    """Streaming top-K accuracy.

    A sample counts as correct when its true class is among the ``k``
    highest-scoring classes of ``y_pred``.

    Args:
        k: Number of top predictions to consider.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, k=5, **kwargs):
        super().__init__(**kwargs)
        self.k = k
        # `name` is passed by keyword: its positional slot differs between
        # Keras versions, the keyword form is accepted by all of them.
        self.total = self.add_weight(name='total', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate hits and sample count for one batch.

        Args:
            y_true: Rank-1 integer class ids, or one-hot rows.
            y_pred: Per-class scores with classes on the last axis.
            sample_weight: Accepted for API compatibility; not used.
        """
        # Labels are kept as int32 because tf.nn.top_k returns int32 indices
        # and tf.equal requires both operands to share a dtype (tf.argmax
        # defaults to int64).
        if len(y_true.shape) == 1:
            labels = tf.cast(y_true, tf.int32)
        else:
            labels = tf.argmax(y_true, axis=-1, output_type=tf.int32)

        top_k_indices = tf.nn.top_k(y_pred, k=self.k).indices

        matches = tf.reduce_any(
            tf.equal(tf.expand_dims(labels, axis=-1), top_k_indices),
            axis=-1
        )

        self.total.assign_add(tf.reduce_sum(tf.cast(matches, tf.float32)))
        self.count.assign_add(tf.cast(tf.size(matches), tf.float32))

    def result(self):
        """Return hits / samples seen, or 0 when nothing was seen yet."""
        return tf.math.divide_no_nan(self.total, self.count)

    def reset_state(self):
        """Zero the accumulated hits and sample count."""
        self.total.assign(0.0)
        self.count.assign(0.0)

    def get_config(self):
        """Return the constructor arguments needed to rebuild the metric."""
        config = super().get_config()
        config.update({'k': self.k})
        return config

# Test custom losses and metrics
print("\n=== Testing Custom Loss Functions and Metrics ===")

# Create sample data for testing: five samples, three classes. Labels are
# rank-1 integer class ids; predictions are rows of class probabilities whose
# arg-max matches the label for every sample.
y_true_sample = tf.constant([0, 1, 2, 1, 0])
y_pred_sample = tf.constant([
    [0.8, 0.1, 0.1],
    [0.2, 0.7, 0.1],
    [0.1, 0.2, 0.7],
    [0.3, 0.6, 0.1],
    [0.9, 0.05, 0.05]
], dtype=tf.float32)

# Test Focal Loss
focal_loss = FocalLoss(alpha=0.25, gamma=2.0)
focal_loss_value = focal_loss(y_true_sample, y_pred_sample)
print(f"Focal Loss: {focal_loss_value:.4f}")

# Test Label Smoothing Loss
smooth_loss = LabelSmoothingLoss(smoothing=0.1)
smooth_loss_value = smooth_loss(y_true_sample, y_pred_sample)
print(f"Label Smoothing Loss: {smooth_loss_value:.4f}")

# Test F1 Score (macro average over the three classes)
f1_metric = F1Score(num_classes=3, average='macro')
f1_metric.update_state(y_true_sample, y_pred_sample)
f1_score = f1_metric.result()
print(f"F1 Score: {f1_score:.4f}")

# Test Top-K Accuracy with k=2 (true class among the two highest scores)
topk_metric = TopKCategoricalAccuracy(k=2)
topk_metric.update_state(y_true_sample, y_pred_sample)
topk_accuracy = topk_metric.result()
print(f"Top-2 Accuracy: {topk_accuracy:.4f}")

## 4. Custom Training Loops

In [None]:
# Advanced custom training loop implementation
class CustomTrainer:
    """Hand-written training loop around ``tf.GradientTape``.

    Args:
        model: Callable Keras model exposing ``trainable_variables`` and
            ``losses``.
        loss_fn: Callable ``loss_fn(y_true, y_pred)``.
        optimizer: Keras optimizer used for every update.
        metrics: Optional list of Keras metric objects. The same objects serve
            both phases: they are read and reset after the training phase, so
            the training and validation values are computed separately.

    Attributes:
        history: Dict of per-epoch lists keyed ``train_loss``, ``val_loss``
            and ``train_<name>`` / ``val_<name>`` for each metric.
    """

    def __init__(self, model, loss_fn, optimizer, metrics=None):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.metrics = metrics or []

        # Running means of the per-batch loss for the current epoch
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
        self.val_loss = tf.keras.metrics.Mean(name='val_loss')

        # History
        self.history = {'train_loss': [], 'val_loss': []}
        for metric in self.metrics:
            self.history[f'train_{metric.name}'] = []
            self.history[f'val_{metric.name}'] = []

    @tf.function
    def train_step(self, x_batch, y_batch):
        """Run one optimisation step and update the training metrics.

        Returns:
            Tuple ``(loss, predictions)`` for the batch; ``loss`` includes the
            model's regularisation losses when it has any.
        """
        with tf.GradientTape() as tape:
            predictions = self.model(x_batch, training=True)
            loss = self.loss_fn(y_batch, predictions)

            # Add regularization losses
            if self.model.losses:
                loss += tf.add_n(self.model.losses)

        # Compute gradients
        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Apply gradients
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        # Update metrics
        self.train_loss.update_state(loss)
        for metric in self.metrics:
            metric.update_state(y_batch, predictions)

        return loss, predictions

    @tf.function
    def val_step(self, x_batch, y_batch):
        """Evaluate one batch in inference mode and update the metrics.

        Returns:
            Tuple ``(loss, predictions)`` for the batch (no regularisation
            term is added here).
        """
        predictions = self.model(x_batch, training=False)
        loss = self.loss_fn(y_batch, predictions)

        self.val_loss.update_state(loss)
        for metric in self.metrics:
            metric.update_state(y_batch, predictions)

        return loss, predictions

    def fit(self, train_dataset, val_dataset=None, epochs=10,
            callbacks=None, verbose=1):
        """Train for ``epochs`` passes over ``train_dataset``.

        Args:
            train_dataset: Iterable of ``(x_batch, y_batch)`` pairs.
            val_dataset: Optional iterable of ``(x_batch, y_batch)`` pairs
                evaluated after each training pass.
            epochs: Number of passes.
            callbacks: Optional objects; ``on_epoch_end(epoch, logs)`` is
                called on those that define it.
            verbose: Truthy to print progress and per-epoch metrics.

        Returns:
            ``self.history``. Without a validation set, ``val_loss`` is logged
            as 0.0 and no ``val_<metric>`` values are appended.
        """
        callbacks = callbacks or []
        has_validation = val_dataset is not None

        for epoch in range(epochs):
            if verbose:
                print(f"\nEpoch {epoch + 1}/{epochs}")

            # Reset metrics. `reset_state` is the supported name; the older
            # `reset_states` alias no longer exists in recent Keras releases.
            self.train_loss.reset_state()
            self.val_loss.reset_state()
            for metric in self.metrics:
                metric.reset_state()

            # Training phase
            train_batches = 0
            for x_batch, y_batch in train_dataset:
                self.train_step(x_batch, y_batch)
                train_batches += 1

                if verbose and train_batches % 10 == 0:
                    print('.', end='', flush=True)

            # Snapshot the training metrics before validation touches the
            # shared metric objects.
            train_results = {
                metric.name: metric.result().numpy() for metric in self.metrics
            }

            # Validation phase, starting from freshly reset metrics so the
            # values reflect validation batches only.
            val_results = {}
            if has_validation:
                for metric in self.metrics:
                    metric.reset_state()
                for x_batch, y_batch in val_dataset:
                    self.val_step(x_batch, y_batch)
                val_results = {
                    metric.name: metric.result().numpy() for metric in self.metrics
                }

            # Log metrics
            epoch_logs = {
                'train_loss': self.train_loss.result().numpy(),
                'val_loss': self.val_loss.result().numpy() if has_validation else 0.0
            }

            for metric in self.metrics:
                epoch_logs[f'train_{metric.name}'] = train_results[metric.name]
                if has_validation:
                    epoch_logs[f'val_{metric.name}'] = val_results[metric.name]

            # Update history
            for key, value in epoch_logs.items():
                if key in self.history:
                    self.history[key].append(value)

            if verbose:
                metrics_str = " - ".join([f"{k}: {v:.4f}" for k, v in epoch_logs.items()])
                print(f"\n{metrics_str}")

            # Run callbacks
            for callback in callbacks:
                if hasattr(callback, 'on_epoch_end'):
                    callback.on_epoch_end(epoch, epoch_logs)

        return self.history

# Advanced training with multiple objectives
class MultiObjectiveTrainer:
    """Training step for models with several named outputs and losses.

    Args:
        model: Model whose call returns a mapping from objective name to
            prediction.
        loss_functions: Dict mapping objective name to a loss callable.
        loss_weights: Dict mapping objective name to the weight used when
            forming the tracked total loss.
        optimizers: Dict mapping a name to an optimizer. An optimizer is only
            stepped when a loss with the same name was computed, and it is
            driven by that single, unweighted loss.

    Note:
        ``loss_weights`` and the model's regularisation losses only affect the
        tracked ``total_loss``; they are not part of any gradient.
    """

    def __init__(self, model, loss_functions, loss_weights, optimizers):
        self.model = model
        self.loss_functions = loss_functions  # Dict of loss functions
        self.loss_weights = loss_weights
        self.optimizers = optimizers  # Dict of optimizers for different parts

        # Metrics tracking
        self.loss_trackers = {
            name: tf.keras.metrics.Mean(name=f'{name}_loss')
            for name in loss_functions.keys()
        }
        self.total_loss_tracker = tf.keras.metrics.Mean(name='total_loss')

    @tf.function
    def train_step(self, inputs, targets):
        """Compute every objective's loss and step the matching optimizers.

        Args:
            inputs: Model inputs for one batch.
            targets: Mapping from objective name to target; objectives absent
                from it are skipped.

        Returns:
            Tuple ``(losses, predictions)`` where ``losses`` maps each
            computed objective name to its unweighted loss.
        """
        # Persistent tape: gradient() is called once per optimizer below.
        with tf.GradientTape(persistent=True) as tape:
            predictions = self.model(inputs, training=True)

            # Compute individual losses
            losses = {}
            total_loss = 0.0

            for name, loss_fn in self.loss_functions.items():
                if name in targets:
                    loss_value = loss_fn(targets[name], predictions[name])
                    losses[name] = loss_value
                    total_loss += self.loss_weights[name] * loss_value

            # Add regularization
            if self.model.losses:
                total_loss += tf.add_n(self.model.losses)

        # Compute gradients for each optimizer
        for opt_name, optimizer in self.optimizers.items():
            if opt_name in losses:
                # Get variables for this optimizer
                variables = self.get_variables_for_optimizer(opt_name)
                gradients = tape.gradient(losses[opt_name], variables)

                # Drop variables the loss does not depend on, and skip the
                # step entirely when nothing is left: apply_gradients raises
                # when no variable has a gradient.
                grads_and_vars = [
                    (grad, var)
                    for grad, var in zip(gradients, variables)
                    if grad is not None
                ]
                if grads_and_vars:
                    optimizer.apply_gradients(grads_and_vars)

        # Update metrics
        self.total_loss_tracker.update_state(total_loss)
        for name, loss_value in losses.items():
            self.loss_trackers[name].update_state(loss_value)

        # Release the resources held by the persistent tape.
        del tape
        return losses, predictions

    def get_variables_for_optimizer(self, optimizer_name):
        """Return the trainable variables a named optimizer should update.

        ``'main'`` selects every trainable variable; any other name selects
        the variables whose ``name`` contains that string.
        """
        # This would be customized based on your model architecture
        # For example, different optimizers for different parts of the model
        if optimizer_name == 'main':
            return self.model.trainable_variables
        else:
            # Return subset of variables
            return [v for v in self.model.trainable_variables if optimizer_name in v.name]

# Gradient accumulation trainer
class GradientAccumulationTrainer:
    """Training loop that sums gradients over several mini-batches per update.

    Each mini-batch loss is divided by ``accumulation_steps`` before its
    gradients are added to per-variable accumulators; the optimizer is stepped
    once every ``accumulation_steps`` mini-batches.

    Args:
        model: Keras model. Its variables must already exist when ``fit`` is
            called, because one accumulator is created per trainable variable.
        loss_fn: Callable ``loss_fn(y_true, y_pred)``.
        optimizer: Keras optimizer.
        accumulation_steps: Mini-batches per optimizer update.
    """

    def __init__(self, model, loss_fn, optimizer, accumulation_steps=4):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps

        # Metrics
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
        self.accumulated_gradients = []

    def initialize_accumulated_gradients(self):
        """Create one zero-filled, non-trainable accumulator per variable."""
        self.accumulated_gradients = [
            tf.Variable(tf.zeros_like(var), trainable=False)
            for var in self.model.trainable_variables
        ]

    @tf.function
    def accumulate_gradients(self, x_batch, y_batch):
        """Add one mini-batch's scaled gradients to the accumulators.

        Returns:
            The mini-batch loss already divided by ``accumulation_steps``.
        """
        with tf.GradientTape() as tape:
            predictions = self.model(x_batch, training=True)
            loss = self.loss_fn(y_batch, predictions) / self.accumulation_steps

        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Accumulate gradients; variables without a gradient are left as-is
        for i, grad in enumerate(gradients):
            if grad is not None:
                self.accumulated_gradients[i].assign_add(grad)

        # Track the unscaled loss so the reported value is per-batch
        self.train_loss.update_state(loss * self.accumulation_steps)
        return loss

    @tf.function
    def apply_accumulated_gradients(self):
        """Step the optimizer with the summed gradients, then zero them."""
        # Hand the optimizer plain tensors (the accumulators' current values)
        # rather than the accumulator Variables themselves.
        summed_gradients = [
            tf.convert_to_tensor(accumulated_grad)
            for accumulated_grad in self.accumulated_gradients
        ]
        self.optimizer.apply_gradients(
            zip(summed_gradients, self.model.trainable_variables)
        )

        # Reset accumulated gradients
        for accumulated_grad in self.accumulated_gradients:
            accumulated_grad.assign(tf.zeros_like(accumulated_grad))

    def fit(self, dataset, epochs=10, steps_per_epoch=None):
        """Train with gradient accumulation.

        Args:
            dataset: Iterable of ``(x_batch, y_batch)`` pairs.
            epochs: Number of passes over ``dataset``.
            steps_per_epoch: Optional cap on mini-batches per epoch.

        Mini-batches left over at the end of an epoch are applied as one final
        update; they keep the ``1 / accumulation_steps`` scaling even though
        fewer batches contributed.
        """
        self.initialize_accumulated_gradients()

        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")
            # `reset_state` is the supported name; the older `reset_states`
            # alias no longer exists in recent Keras releases.
            self.train_loss.reset_state()

            step = 0
            for x_batch, y_batch in dataset:
                # Accumulate gradients
                self.accumulate_gradients(x_batch, y_batch)
                step += 1

                # Apply gradients every accumulation_steps
                if step % self.accumulation_steps == 0:
                    self.apply_accumulated_gradients()

                if steps_per_epoch and step >= steps_per_epoch:
                    break

                if step % 50 == 0:
                    print('.', end='', flush=True)

            # Apply remaining gradients if any
            if step % self.accumulation_steps != 0:
                self.apply_accumulated_gradients()

            print(f"\nTrain Loss: {self.train_loss.result():.4f}")

# Test custom training loops
print("\n=== Testing Custom Training Loops ===")

# Create sample model and data: a small MLP mapping 20 features to 3 class
# probabilities
sample_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Create datasets.
# make_classification requires
#     n_classes * n_clusters_per_class <= 2 ** n_informative.
# With the defaults (n_informative=2, n_clusters_per_class=2) three classes
# give 6 > 4 and a ValueError, so n_informative is raised explicitly.
X_train, y_train = make_classification(
    n_samples=800, n_features=20, n_informative=5, n_classes=3, random_state=42
)
X_val, y_val = make_classification(
    n_samples=200, n_features=20, n_informative=5, n_classes=3, random_state=24
)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train.astype(np.float32), y_train))
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((X_val.astype(np.float32), y_val))
val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Test basic custom trainer. The metric is named 'accuracy' so the history
# keys read below are 'train_accuracy' and 'val_accuracy'.
custom_trainer = CustomTrainer(
    model=sample_model,
    loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
)

print("Training with custom trainer...")
history = custom_trainer.fit(
    train_dataset, val_dataset,
    epochs=5, verbose=1
)

print(f"Final train accuracy: {history['train_accuracy'][-1]:.4f}")
print(f"Final val accuracy: {history['val_accuracy'][-1]:.4f}")

# Test gradient accumulation trainer on a fresh model (no dropout)
print("\nTesting gradient accumulation trainer...")
ga_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(20,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Four mini-batches of 32 are accumulated per optimizer update
ga_trainer = GradientAccumulationTrainer(
    model=ga_model,
    loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
    accumulation_steps=4
)

ga_trainer.fit(train_dataset, epochs=3, steps_per_epoch=50)

## 5. Advanced Model Architectures with Custom Components

In [None]:
# Complete custom model using all components
class AdvancedTransformerBlock(tf.keras.layers.Layer):
    """Transformer-style block built from the notebook's custom layers.

    The block applies self-attention and a two-layer feed-forward network.
    Each sub-layer output goes through dropout and a ``LayerScale`` layer, is
    added to the sub-layer input, and is layer-normalized. The result is then
    passed through a ``FeatureSqueezeExcitation`` layer.

    ``MultiHeadSelfAttention``, ``LayerScale`` and ``FeatureSqueezeExcitation``
    are defined elsewhere in this notebook.

    Args:
        embed_dim: Passed to ``MultiHeadSelfAttention`` and used as the number
            of units of the feed-forward network's output ``Dense`` layer.
        num_heads: Passed to ``MultiHeadSelfAttention``.
        ff_dim: Units of the feed-forward network's hidden ``Dense`` layer.
        dropout_rate: Rate shared by all ``Dropout`` layers in the block.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        # Stored so get_config() can reproduce the layer exactly
        self.dropout_rate = dropout_rate

        # Multi-head attention
        self.attention = MultiHeadSelfAttention(embed_dim, num_heads)

        # Feed-forward network
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(embed_dim)
        ])

        # Layer normalization (one per residual connection)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # Dropout applied to each sub-layer output before the residual add
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

        # Layer scaling
        self.layer_scale1 = LayerScale()
        self.layer_scale2 = LayerScale()

        # Squeeze and excitation
        self.se_block = FeatureSqueezeExcitation(reduction_ratio=8)

    def call(self, inputs, training=None, mask=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Input tensor; it is added to the attention output, so the
                two must have compatible shapes.
            training: Forwarded to the attention layer, the feed-forward
                network and the dropout layers.
            mask: Forwarded to the attention layer.

        Returns:
            The output of the squeeze-and-excitation layer.
        """
        # Multi-head attention with residual connection
        attn_output = self.attention(inputs, training=training, mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        attn_output = self.layer_scale1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)

        # Feed-forward network with residual connection
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        ffn_output = self.layer_scale2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)

        # Squeeze and excitation
        out2 = self.se_block(out2)

        return out2

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            # Without this entry a rebuilt layer silently falls back to 0.1
            'dropout_rate': self.dropout_rate
        })
        return config

class HybridNeuralNetwork(tf.keras.Model):
    """Classifier that chains the notebook's custom layers.

    Forward pass: dense projection -> positional encoding -> dropout ->
    three ``AdvancedTransformerBlock`` layers -> global average pooling ->
    two ``ResidualBlock`` layers -> ``FeatureSqueezeExcitation`` ->
    ``GatedLinearUnit`` -> dropout -> dense classifier -> ``SwishActivation``.

    ``PositionalEncoding``, ``ResidualBlock``, ``FeatureSqueezeExcitation``,
    ``GatedLinearUnit`` and ``SwishActivation`` are defined elsewhere in this
    notebook.

    Args:
        num_classes: Units of the final ``Dense`` classifier.
        embed_dim: Width of the input projection; also passed to the
            transformer and residual blocks.
        num_heads: Passed to every ``AdvancedTransformerBlock``.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, num_classes, embed_dim=128, num_heads=8, **kwargs):
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        # Stored so get_config() can reproduce the model exactly
        self.num_heads = num_heads

        # Input processing
        self.input_projection = tf.keras.layers.Dense(embed_dim)
        self.positional_encoding = PositionalEncoding(max_position=1000)
        self.input_dropout = tf.keras.layers.Dropout(0.1)

        # Transformer blocks (feed-forward width is 4x the embedding width)
        self.transformer_blocks = [
            AdvancedTransformerBlock(embed_dim, num_heads, embed_dim * 4)
            for _ in range(3)
        ]

        # Residual connections
        self.residual_blocks = [
            ResidualBlock(embed_dim, use_bottleneck=True)
            for _ in range(2)
        ]

        # Feature processing
        self.feature_squeeze = FeatureSqueezeExcitation(reduction_ratio=16)
        self.glu = GatedLinearUnit(embed_dim // 2)

        # Output layers
        self.global_pool = tf.keras.layers.GlobalAveragePooling1D()
        self.classifier_dropout = tf.keras.layers.Dropout(0.4)
        self.classifier = tf.keras.layers.Dense(num_classes)
        # NOTE(review): Swish is applied to the classifier output, so the model
        # returns neither raw logits nor probabilities - confirm that the loss
        # functions used with this model expect that.
        self.output_activation = SwishActivation(beta=1.0)

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Input tensor. If its projection is rank 2, a length-1 axis
                is inserted at position 1 before the positional encoding.
            training: Forwarded to the dropout layers, transformer blocks and
                residual blocks.

        Returns:
            The Swish-activated output of the ``num_classes``-unit classifier.
        """
        # Input processing
        x = self.input_projection(inputs)

        # Add sequence dimension if needed
        if len(x.shape) == 2:
            x = tf.expand_dims(x, axis=1)

        # Positional encoding
        x = self.positional_encoding(x)
        x = self.input_dropout(x, training=training)

        # Transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer(x, training=training)

        # Global pooling to remove sequence dimension
        x = self.global_pool(x)

        # Residual blocks (now in 2D)
        for residual in self.residual_blocks:
            x = residual(x, training=training)

        # Feature processing
        x = self.feature_squeeze(x)
        x = self.glu(x)

        # Classification
        x = self.classifier_dropout(x, training=training)
        x = self.classifier(x)
        outputs = self.output_activation(x)

        return outputs

    def get_config(self):
        """Return the constructor arguments needed to re-create this model."""
        config = super().get_config()
        config.update({
            'num_classes': self.num_classes,
            'embed_dim': self.embed_dim,
            # Without this entry a rebuilt model silently falls back to 8 heads
            'num_heads': self.num_heads
        })
        return config

# --- Hybrid model: construct, compile, train and report ---
print("\n=== Testing Hybrid Neural Network ===")

hybrid_model = HybridNeuralNetwork(num_classes=3, embed_dim=64, num_heads=4)

# Attach the custom label-smoothing loss and custom metrics via compile()
hybrid_model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.01),
    loss=LabelSmoothingLoss(smoothing=0.1),
    metrics=[F1Score(num_classes=3, average='macro'), TopKCategoricalAccuracy(k=2)],
)

# A single-row forward pass creates the weights so they can be counted
_ = hybrid_model(X_train[:1].astype(np.float32))
print(f"Hybrid Model Parameters: {hybrid_model.count_params():,}")

# The custom trainer receives its own loss, optimizer and metric instances
advanced_trainer = CustomTrainer(
    model=hybrid_model,
    loss_fn=LabelSmoothingLoss(smoothing=0.1),
    optimizer=tf.keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.01),
    metrics=[
        F1Score(num_classes=3, average='macro', name='f1_score'),
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
    ],
)

print("Training hybrid model with custom components...")
hybrid_history = advanced_trainer.fit(train_dataset, val_dataset, epochs=8, verbose=1)

# Report the last recorded value of every tracked metric
print("Final hybrid model performance:")
for metric_label, history_key in (
    ("Train F1 Score", 'train_f1_score'),
    ("Val F1 Score", 'val_f1_score'),
    ("Train Accuracy", 'train_accuracy'),
    ("Val Accuracy", 'val_accuracy'),
):
    print(f"  {metric_label}: {hybrid_history[history_key][-1]:.4f}")

## 6. Model Debugging and Optimization

In [None]:
# Advanced debugging utilities for custom models
class CustomModelDebugger:
    """Debugging helpers for a Keras model: gradient inspection and layer timing.

    Args:
        model: Model to inspect. It is called directly, and its
            ``trainable_variables`` and ``layers`` attributes are read.
    """

    def __init__(self, model):
        self.model = model

    def analyze_gradient_flow(self, x_sample, y_sample, loss_fn):
        """Print and return per-variable gradient statistics for one batch.

        Runs one forward pass with ``training=True``, differentiates the loss
        with respect to every trainable variable, and groups the statistics by
        the first ``/``-separated component of the variable name (or
        ``layer_<index>`` when the name contains no ``/``).

        Args:
            x_sample: Inputs passed to the model.
            y_sample: Targets, passed to ``loss_fn`` as its first argument.
            loss_fn: Callable ``loss_fn(y_true, y_pred)`` returning the loss.

        Returns:
            Dict mapping group name to a list of dicts with the keys
            ``'variable'``, ``'grad_norm'``, ``'grad_mean'`` and ``'shape'``.
            Variables whose gradient is ``None`` are not included; their names
            are printed separately.
        """
        with tf.GradientTape() as tape:
            predictions = self.model(x_sample, training=True)
            loss = loss_fn(y_sample, predictions)

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)

        print("=== Gradient Flow Analysis ===")
        layer_gradients = {}
        # A None gradient means the tape found no differentiable path from the
        # loss to that variable; report these instead of hiding them.
        disconnected = []

        for i, (var, grad) in enumerate(zip(variables, gradients)):
            if grad is None:
                disconnected.append(var.name)
                continue

            layer_name = var.name.split('/')[0] if '/' in var.name else f'layer_{i}'
            layer_gradients.setdefault(layer_name, []).append({
                'variable': var.name,
                'grad_norm': tf.norm(grad).numpy(),
                'grad_mean': tf.reduce_mean(tf.abs(grad)).numpy(),
                'shape': var.shape
            })

        for layer_name, grads in layer_gradients.items():
            print(f"\nLayer: {layer_name}")
            for grad_info in grads:
                print(f"  {grad_info['variable']}: norm={grad_info['grad_norm']:.6f}, "
                      f"mean_abs={grad_info['grad_mean']:.6f}, shape={grad_info['shape']}")

        if disconnected:
            print("\nVariables with no gradient (not connected to the loss):")
            for var_name in disconnected:
                print(f"  {var_name}")

        return layer_gradients

    def profile_layer_performance(self, x_sample, num_iterations=100):
        """Time each entry of ``model.layers`` on a chained input.

        Layers are applied in ``model.layers`` order, each one receiving the
        output of the previous successfully applied layer. This matches the
        real forward pass only for purely sequential models. A layer that
        cannot consume the chained tensor is reported, recorded as ``nan`` and
        skipped, and the chain continues with the unchanged tensor.

        Args:
            x_sample: Input fed to the first layer.
            num_iterations: Number of timed calls per layer; must be >= 1.

        Returns:
            List with one entry per layer: the average seconds per call, or
            ``float('nan')`` for a skipped layer.

        Raises:
            ValueError: If ``num_iterations`` is smaller than 1.
        """
        import time

        if num_iterations < 1:
            raise ValueError(f"num_iterations must be >= 1, got {num_iterations}")

        print("=== Layer Performance Profiling ===")

        times = []
        x = x_sample

        for i, layer in enumerate(self.model.layers):
            try:
                # perf_counter is monotonic and has the best available resolution
                start_time = time.perf_counter()
                for _ in range(num_iterations):
                    output = layer(x, training=False)
                elapsed = time.perf_counter() - start_time
            except (ValueError, TypeError, tf.errors.OpError) as err:
                reason = str(err).split('\n', 1)[0]
                print(f"Layer {i} ({layer.__class__.__name__}): skipped "
                      f"({type(err).__name__}: {reason})")
                times.append(float('nan'))
                continue

            avg_time = elapsed / num_iterations
            times.append(avg_time)
            # Reuse the last timed output rather than running the layer again
            x = output

            print(f"Layer {i} ({layer.__class__.__name__}): {avg_time*1000:.3f} ms avg")

        return times

# Performance optimization utilities
class ModelOptimizer:
    """Static helpers for mixed-precision setup and TFLite int8 conversion."""

    @staticmethod
    def mixed_precision_setup():
        """Install ``mixed_float16`` as the global Keras dtype policy.

        Returns:
            The ``tf.keras.mixed_precision.Policy`` that was installed.
        """
        fp16_policy = tf.keras.mixed_precision.Policy('mixed_float16')
        tf.keras.mixed_precision.set_global_policy(fp16_policy)
        print("Mixed precision enabled")

        return fp16_policy

    @staticmethod
    def quantize_model(model, representative_dataset):
        """Convert a Keras model to a TFLite model with int8 inputs and outputs.

        Args:
            model: Keras model handed to ``TFLiteConverter.from_keras_model``.
            representative_dataset: Dataset whose first 100 elements are used
                for calibration; element ``[0]`` of each is fed to the
                converter.

        Returns:
            The value returned by ``converter.convert()``.
        """

        def calibration_samples():
            # The converter expects a list of input tensors per sample
            for element in representative_dataset.take(100):
                yield [element[0]]

        tflite_converter = tf.lite.TFLiteConverter.from_keras_model(model)
        tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
        tflite_converter.representative_dataset = calibration_samples
        # Restrict to int8 builtin ops and use int8 at the model boundary
        tflite_converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        tflite_converter.inference_input_type = tf.int8
        tflite_converter.inference_output_type = tf.int8

        return tflite_converter.convert()

# --- Exercise the debugging and optimization helpers ---
print("\n=== Testing Model Debugging and Optimization ===")

# Wrap the trained hybrid model for inspection
debugger = CustomModelDebugger(hybrid_model)

# Small batch of 8 training rows for the gradient check
sample_x = X_train[:8].astype(np.float32)
sample_y = y_train[:8]

gradient_analysis = debugger.analyze_gradient_flow(
    x_sample=sample_x,
    y_sample=sample_y,
    loss_fn=LabelSmoothingLoss(smoothing=0.1),
)

# Time every layer on a single row
performance_profile = debugger.profile_layer_performance(x_sample=sample_x[:1])

# Switch the global dtype policy to mixed precision
optimizer = ModelOptimizer()
mixed_precision_policy = optimizer.mixed_precision_setup()

print(f"\nMixed precision policy: {mixed_precision_policy.name}")
print("Model debugging and optimization utilities ready!")

## Summary

This notebook covered advanced tf.keras customization techniques:

**Custom Components Built:**
- **Basic & Advanced Layers**: Templates, residual blocks, attention mechanisms
- **Specialized Layers**: GLU, Swish activation, layer scaling, noise regularization
- **Custom Loss Functions**: Focal loss, contrastive loss, label smoothing
- **Custom Metrics**: F1-Score, Top-K accuracy with proper state management
- **Custom Training Loops**: Multi-objective, gradient accumulation, advanced optimization

**Advanced Architectures:**
- **Hybrid Models**: Combining transformers, residual networks, and attention
- **Multi-Component Integration**: Seamless combination of custom layers
- **State Management**: Proper variable tracking and serialization

**Training Innovations:**
- **Multi-Objective Learning**: Multiple loss functions with weighted optimization
- **Gradient Accumulation**: Large effective batch sizes with limited memory
- **Advanced Optimizers**: Custom learning rate schedules and weight decay

**Debugging & Optimization:**
- **Gradient Flow Analysis**: Systematic gradient monitoring across layers
- **Performance Profiling**: Layer-by-layer execution timing
- **Mixed Precision**: Hardware acceleration support

This foundation enables building cutting-edge neural network architectures with complete customization capability!