In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD

# Fetch MNIST, then rescale the image arrays by 1/255 (labels are left untouched)
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split
X_train = X_train / 255.0
X_test = X_test / 255.0

# Function to create shallow or deep network model
def create_model(layers=3, activation='sigmoid'):
    """Build an uncompiled fully connected classifier for 28x28 inputs.

    The network flattens each 28x28 input, passes it through ``layers``
    hidden Dense layers of 128 units each, and ends with a 10-unit softmax
    output layer.

    Args:
        layers: Number of 128-unit hidden Dense layers to stack.
        activation: Activation used by every hidden layer.

    Returns:
        The assembled ``Sequential`` model (not compiled).
    """
    network = Sequential()
    network.add(Flatten(input_shape=(28, 28)))

    # Hidden stack: identical Dense blocks, one per requested layer
    hidden_count = 0
    while hidden_count < layers:
        network.add(Dense(128, activation=activation))
        hidden_count += 1

    # Output head: one probability per class
    network.add(Dense(10, activation='softmax'))
    return network

# Function to train and record gradient values
def train_and_record_gradients(model, epochs=5):
    """Train ``model`` with full-batch SGD and record gradient magnitudes.

    The function reads the module-level ``X_train`` / ``y_train`` arrays.
    Each "epoch" is a single gradient step computed on the whole training
    set (there is no mini-batching), so ``epochs`` is also the number of
    weight updates performed.

    Args:
        model: Keras model whose output is a per-class probability
            distribution (the loss is sparse categorical cross-entropy).
            It is compiled here with SGD (learning rate 0.01).
        epochs: Number of full-batch gradient steps to run.

    Returns:
        A list with one value per epoch: the mean of ``tf.norm`` over the
        gradients of all trainable variables that received a gradient,
        measured before that epoch's update is applied.

    Raises:
        ValueError: If no trainable variable receives a gradient.
    """
    sgd = SGD(learning_rate=0.01)
    model.compile(optimizer=sgd, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Convert the training images to a tensor once; passing the NumPy array
    # to the model inside the loop would repeat this conversion every step.
    inputs = tf.cast(X_train, tf.float32)

    gradient_magnitudes = []

    # Custom training loop to record gradient magnitudes
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(y_train, predictions)
            )

        # Pair gradients with their variables and drop the ones the tape could
        # not compute, so the recorded norms and the applied update agree.
        gradients = tape.gradient(loss, model.trainable_variables)
        grads_and_vars = [
            (grad, var)
            for grad, var in zip(gradients, model.trainable_variables)
            if grad is not None
        ]
        if not grads_and_vars:
            # Fail with a clear message instead of averaging an empty list (NaN).
            raise ValueError("No gradients were computed for any trainable variable.")

        grad_norms = [tf.norm(grad).numpy() for grad, _ in grads_and_vars]
        avg_grad_norm = np.mean(grad_norms)
        gradient_magnitudes.append(avg_grad_norm)
        print(f"Epoch {epoch+1}, Avg Gradient Norm: {avg_grad_norm}")

        model.optimizer.apply_gradients(grads_and_vars)

    return gradient_magnitudes

# Train every (depth, activation) combination and keep each run's gradient history,
# keyed as "<architecture>_<activation>"
results = {}
depth_by_architecture = {'shallow': 3, 'deep': 10}  # number of hidden layers

for architecture, layers in depth_by_architecture.items():
    for activation in ('sigmoid', 'relu'):
        print(f"\nTraining {architecture} network with {activation} activation:")
        model = create_model(layers=layers, activation=activation)
        gradient_magnitudes = train_and_record_gradients(model, epochs=10)
        results[f"{architecture}_{activation}"] = gradient_magnitudes

# Draw one curve per experiment so gradient magnitudes can be compared across runs
fig, ax = plt.subplots(figsize=(12, 6))

for key, gradients in results.items():
    ax.plot(gradients, label=key)

ax.set_xlabel('Epochs')
ax.set_ylabel('Average Gradient Magnitude')
ax.set_title('Gradient Magnitude Across Epochs')
ax.legend()
plt.show()
