Optimization Algorithms from scratch implementations and library implementations.

1.Stochastic Gradient Descent (SGD) from scatch

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load and preprocess MNIST dataset of handwritten digits
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten 28x28 images to 784-dim vectors
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # Convert labels to one-hot encoding
y_test = to_categorical(y_test, 10)

# Build neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(28 * 28,)), #relu introduces non-linearity
    Dense(10, activation='softmax')])# Output layer with probability distribution

# Define a custom Stochastic Gradient Descent (SGD) optimizer from scratch
class CustomSGD:
    def __init__(self, learning_rate=0.01):
        # Initialize learning rate for weight updates
        self.learning_rate = learning_rate

    def update(self, weights, grads):
        """
        Update weights using gradient descent.
        :param weights: List of model weights (trainable variables)
        :param grads: List of gradients corresponding to the weights
        """
        for i in range(len(weights)):
            # Update each weight: weight = weight - learning_rate * gradient
            weights[i].assign_sub(self.learning_rate * grads[i])

# Define categorical cross-entropy loss function for multi-class classification
def categorical_crossentropy(y_true, y_pred):
    # Calculate negative log likelihood of true labels given predictions
    return -tf.reduce_mean(tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1))

# Training loop function
def train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.01):
    # Initialize our custom SGD optimizer with given learning rate
    optimizer = CustomSGD(learning_rate=learning_rate)

    # Calculate number of batches per epoch
    num_samples = x_train.shape[0]
    num_batches = int(np.ceil(num_samples / batch_size))

    # Training loop over epochs
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Process each batch in the epoch
        for batch in range(num_batches):
            # Get current batch of data
            start = batch * batch_size
            end = start + batch_size
            x_batch = x_train[start:end]
            y_batch = y_train[start:end]

            # Forward pass: compute predictions and loss
            with tf.GradientTape() as tape:
                y_pred = model(x_batch, training=True)
                loss = categorical_crossentropy(y_batch, y_pred)

            # Backward pass: compute gradients of loss with respect to weights
            grads = tape.gradient(loss, model.trainable_variables)

            # Update weights using our custom optimizer
            optimizer.update(model.trainable_variables, grads)

            # Print progress every 100 batches
            if batch % 100 == 0:
                print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy()}")

# Train the model for 5 epochs with batch size 32 and learning rate 0.01
train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.01)

# Function to evaluate model performance on test set
def evaluate_model(model, x_test, y_test):
    # Get model predictions
    y_pred = model(x_test)

    # Convert predictions and true labels from one-hot to class indices
    y_pred = tf.argmax(y_pred, axis=1)
    y_true = tf.argmax(y_test, axis=1)

    # Calculate accuracy by comparing predictions to true labels
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
    print(f"Test Accuracy: {accuracy.numpy()}")

# Evaluate the trained model on test data
evaluate_model(model, x_test, y_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
Batch 0/1875, Loss: 2.414673089981079
Batch 100/1875, Loss: 1.5826143026351929
Batch 200/1875, Loss: 0.9501537680625916
Batch 300/1875, Loss: 0.825154185295105
Batch 400/1875, Loss: 0.765343189239502
Batch 500/1875, Loss: 0.8656417727470398
Batch 600/1875, Loss: 0.44429463148117065
Batch 700/1875, Loss: 0.4542093873023987
Batch 800/1875, Loss: 0.41611939668655396
Batch 900/1875, Loss: 0.3259173035621643
Batch 1000/1875, Loss: 0.5184282064437866
Batch 1100/1875, Loss: 0.4106502830982208
Batch 1200/1875, Loss: 0.4260193109512329
Batch 1300/1875, Loss: 0.5035765171051025
Batch 1400/1875, Loss: 0.38824009895324707
Batch 1500/1875, Loss: 0.298125684261322
Batch 1600/1875, Loss: 0.33617550134658813
Batch 1700/1875, Loss: 0.2514995336532593
Batch 1800/1875, Loss: 0.34522655606269836
Epoch 2/5
Batch 0/1875, Loss: 0.42509835958480835
Batch 100/1875, Loss: 0.4461454749107361
Batch 200/1875, Loss: 0.3921183943748474
Batch 300/1875, Loss: 0.26688694953918457
Batch 400/1875, Loss: 0.28448

2. SGD with momentum from scatch

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load and preprocess MNIST dataset of handwritten digits
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten 28x28 images to 784-dim vectors
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # Convert labels to one-hot encoding
y_test = to_categorical(y_test, 10)

# Build neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(28 * 28,)), #relu introduces non-linearity
    Dense(10, activation='softmax') ]) # Output layer with probability distribution

# Define a custom SGD with Momentum optimizer
class SGDMomentum:
    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.learning_rate = learning_rate  # Step size for weight updates
        self.momentum = momentum  # Controls how much of past gradients are retained (0.9 is common)
        self.velocities = None  # Will store velocity terms for each weight (initialized in first update)

    def update(self, weights, grads):
        """
        Update weights using SGD with momentum.
        :param weights: List of model weights (trainable variables).
        :param grads: List of gradients corresponding to the weights.
        """
        if self.velocities is None:
            # Initialize velocities as zero vectors matching the shape of each weight tensor
            self.velocities = [tf.Variable(tf.zeros_like(w)) for w in weights]

        for i in range(len(weights)):
            # Update velocity: v = momentum * v - learning_rate * grad (momentum accumulates past gradients)
            self.velocities[i].assign(self.momentum * self.velocities[i] - self.learning_rate * grads[i])
            # Update weights: w = w + v (applies the smoothed velocity instead of raw gradients)
            weights[i].assign_add(self.velocities[i])

# Loss function (categorical cross-entropy)
def categorical_crossentropy(y_true, y_pred):
    # Computes cross-entropy loss between true labels (one-hot) and predicted probabilities
    return -tf.reduce_mean(tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1))

# Training loop
def train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.01, momentum=0.9):
    optimizer = SGDMomentum(learning_rate=learning_rate, momentum=momentum)
    num_samples = x_train.shape[0]  # Total training samples (60,000)
    num_batches = int(np.ceil(num_samples / batch_size))  # Number of batches per epoch

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch in range(num_batches):
            # Get a batch of data
            start = batch * batch_size
            end = start + batch_size
            x_batch = x_train[start:end]  # Input batch (shape: [batch_size, 784])
            y_batch = y_train[start:end]  # Label batch (shape: [batch_size, 10])

            # Forward pass (compute predictions under gradient tape for autodiff)
            with tf.GradientTape() as tape:
                y_pred = model(x_batch, training=True)  # Model outputs (probabilities)
                loss = categorical_crossentropy(y_batch, y_pred)  # Compute loss

            # Backward pass (compute gradients of loss w.r.t. trainable weights)
            grads = tape.gradient(loss, model.trainable_variables)

            # Update weights using the custom SGD with Momentum optimizer
            optimizer.update(model.trainable_variables, grads)

            # Print loss every 100 batches
            if batch % 100 == 0:
                print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy()}")

# Train the model
train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.01, momentum=0.9)

# Evaluate the model
def evaluate_model(model, x_test, y_test):
    y_pred = model(x_test)  # Get model predictions (probabilities)
    y_pred = tf.argmax(y_pred, axis=1)  # Convert to class labels (0-9)
    y_true = tf.argmax(y_test, axis=1)  # Convert one-hot labels to class labels
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))  # Compute accuracy
    print(f"Test Accuracy: {accuracy.numpy()}")

evaluate_model(model, x_test, y_test)

Epoch 1/5
Batch 0/1875, Loss: 2.277789831161499
Batch 100/1875, Loss: 0.48276323080062866
Batch 200/1875, Loss: 0.37784743309020996
Batch 300/1875, Loss: 0.27518677711486816
Batch 400/1875, Loss: 0.23590126633644104
Batch 500/1875, Loss: 0.40305736660957336
Batch 600/1875, Loss: 0.17073597013950348
Batch 700/1875, Loss: 0.16989339888095856
Batch 800/1875, Loss: 0.15086308121681213
Batch 900/1875, Loss: 0.159725159406662
Batch 1000/1875, Loss: 0.3938857316970825
Batch 1100/1875, Loss: 0.2124837338924408
Batch 1200/1875, Loss: 0.20387053489685059
Batch 1300/1875, Loss: 0.276106595993042
Batch 1400/1875, Loss: 0.2740926742553711
Batch 1500/1875, Loss: 0.1632634550333023
Batch 1600/1875, Loss: 0.2519829273223877
Batch 1700/1875, Loss: 0.11846309900283813
Batch 1800/1875, Loss: 0.2113291323184967
Epoch 2/5
Batch 0/1875, Loss: 0.1133892685174942
Batch 100/1875, Loss: 0.26501283049583435
Batch 200/1875, Loss: 0.18137653172016144
Batch 300/1875, Loss: 0.06045425683259964
Batch 400/1875, Loss: 

3. Root Mean Squared Propagation (RMSprop) from scatch

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize pixel values to range [0,1]
x_train, x_test = x_train / 255.0, x_test / 255.0

# Flatten images from 28x28 to 784-dimensional vectors
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Convert labels to one-hot encoded format
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Build a simple feedforward neural network
model = Sequential([
    Dense(128, activation='relu', input_shape=(28 * 28,)),  # Hidden layer with 128 neurons
    Dense(10, activation='softmax')  # Output layer with 10 neurons (one for each digit)
])

class RMSProp:
    """
    Custom implementation of the RMSProp optimizer.
    RMSProp maintains an exponentially decaying average of past squared gradients
    to normalize future updates.
    """
    def __init__(self, learning_rate=0.001, rho=0.9, epsilon=1e-7):
        """
        Initialize optimizer parameters.

        Args:
        learning_rate (float): Step size for parameter updates.
        rho (float): Decay factor for the moving average of squared gradients.
        epsilon (float): Small constant to prevent division by zero.
        """
        self.lr = learning_rate
        self.rho = rho
        self.epsilon = epsilon
        self.v = None  # Will store moving averages of squared gradients

    def apply_gradients(self, grads, vars):
        """
        Apply gradients to update model parameters using RMSProp.

        Args:
        grads (list of tensors): Computed gradients of the loss with respect to variables.
        vars (list of tensors): Model parameters to be updated.
        """
        if self.v is None:
            # Initialize v as tf.Variables for proper assignment
            self.v = [tf.Variable(tf.zeros_like(var)) for var in vars]

        # Iterate over model parameters and gradients
        for var, grad, v in zip(vars, grads, self.v):
            if grad is None:
                continue

            # Update moving average of squared gradients
            v.assign(self.rho * v + (1 - self.rho) * tf.square(grad))

            # Update parameter values using RMSProp rule
            var.assign_sub(self.lr * grad / (tf.sqrt(v) + self.epsilon))

def train_model(model, x_train, y_train, epochs=5, batch_size=32):
    """
    Train the model using mini-batch gradient descent with RMSProp.

    Args:
    model (Sequential): The neural network model.
    x_train (numpy array): Training input data.
    y_train (numpy array): Training labels (one-hot encoded).
    epochs (int): Number of times the model sees the entire dataset.
    batch_size (int): Number of samples per mini-batch.
    """
    optimizer = RMSProp(learning_rate=0.001)

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")

        # Shuffle training data to ensure randomness in mini-batches
        indices = np.random.permutation(len(x_train))
        x_shuffled = x_train[indices]
        y_shuffled = y_train[indices]

        for batch in range(0, len(x_train), batch_size):
            # Get mini-batch
            x_batch = x_shuffled[batch:batch + batch_size]
            y_batch = y_shuffled[batch:batch + batch_size]

            with tf.GradientTape() as tape:
                # Forward pass: compute model predictions
                preds = model(x_batch)

                # Compute categorical cross-entropy loss
                loss = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(y_batch, preds)
                )

            # Compute gradients of the loss with respect to model parameters
            grads = tape.gradient(loss, model.trainable_variables)

            # Apply gradients to update model parameters using RMSProp
            optimizer.apply_gradients(grads, model.trainable_variables)

            if batch % 100 == 0:
                print(f"Batch {batch}, Loss: {loss.numpy():.4f}")

def evaluate(model, x_test, y_test):
    """
    Evaluate model performance on test data.

    Args:
    model (Sequential): The trained neural network model.
    x_test (numpy array): Test input data.
    y_test (numpy array): Test labels (one-hot encoded).
    """
    preds = model(x_test)  # Get predictions

    # Compute accuracy by comparing predicted and actual labels
    accuracy = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.argmax(y_test, axis=1),
                     tf.argmax(preds, axis=1)),
            tf.float32 ))

    print(f"Test Accuracy: {accuracy.numpy() * 100:.2f}%")

# Train and evaluate the model
train_model(model, x_train, y_train)
evaluate(model, x_test, y_test)


Epoch 1/5
Batch 0, Loss: 2.5080
Batch 800, Loss: 0.8016
Batch 1600, Loss: 0.7601
Batch 2400, Loss: 0.4361
Batch 3200, Loss: 0.3866
Batch 4000, Loss: 0.3057
Batch 4800, Loss: 0.2728
Batch 5600, Loss: 0.5450
Batch 6400, Loss: 0.3420
Batch 7200, Loss: 0.4329
Batch 8000, Loss: 0.4170
Batch 8800, Loss: 0.4125
Batch 9600, Loss: 0.7629
Batch 10400, Loss: 0.1626
Batch 11200, Loss: 0.4347
Batch 12000, Loss: 0.4849
Batch 12800, Loss: 0.1728
Batch 13600, Loss: 0.3058
Batch 14400, Loss: 0.1866
Batch 15200, Loss: 0.1389
Batch 16000, Loss: 0.1698
Batch 16800, Loss: 0.2578
Batch 17600, Loss: 0.0932
Batch 18400, Loss: 0.3345
Batch 19200, Loss: 0.1821
Batch 20000, Loss: 0.1327
Batch 20800, Loss: 0.2788
Batch 21600, Loss: 0.3314
Batch 22400, Loss: 0.4195
Batch 23200, Loss: 0.1408
Batch 24000, Loss: 0.3518
Batch 24800, Loss: 0.1454
Batch 25600, Loss: 0.1775
Batch 26400, Loss: 0.2199
Batch 27200, Loss: 0.2454
Batch 28000, Loss: 0.0895
Batch 28800, Loss: 0.0976
Batch 29600, Loss: 0.3020
Batch 30400, Loss: 

4. Adaptive Moment Estimation (Adam) from scratch

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load and preprocess MNIST dataset of handwritten digits
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten 28x28 images to 784-dim vectors
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # Convert labels to one-hot encoding
y_test = to_categorical(y_test, 10)

# Build neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(28 * 28,)), #relu introduces non-linearity
    Dense(10, activation='softmax')])  # Output layer with probability distribution

class CustomAdam:
    """
    Custom implementation of Adam (Adaptive Moment Estimation) optimizer
    Combines benefits of RMSProp (adaptive learning rates) and momentum

    Key components:
    - m: First moment vector (mean of gradients)
    - v: Second moment vector (uncentered variance of gradients)
    - beta1: Exponential decay rate for first moment (typically 0.9)
    - beta2: Exponential decay rate for second moment (typically 0.999)
    - epsilon: Small constant for numerical stability
    - t: Time step counter for bias correction
    """
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # Will store first moment estimates
        self.v = None  # Will store second moment estimates
        self.t = 0  # Iteration counter

    def update(self, weights, grads):
        """Update weights using Adam optimization algorithm"""
        # Initialize moment vectors on first update
        if self.m is None:
            # Initialize m and v as lists of tf.Variables
            self.m = [tf.Variable(tf.zeros_like(w)) for w in weights]
            self.v = [tf.Variable(tf.zeros_like(w)) for w in weights]

        self.t += 1  # Increment time step

        for i in range(len(weights)):
            # Update biased first moment estimate (like momentum)
            self.m[i].assign(self.beta1 * self.m[i] + (1 - self.beta1) * grads[i])

            # Update biased second moment estimate (like RMSProp)
            self.v[i].assign(self.beta2 * self.v[i] + (1 - self.beta2) * tf.square(grads[i]))

            # Compute bias-corrected first moment estimate
            m_hat = self.m[i] / (1 - tf.pow(self.beta1, self.t))

            # Compute bias-corrected second moment estimate
            v_hat = self.v[i] / (1 - tf.pow(self.beta2, self.t))

            # Update parameters with adaptive learning rate
            weights[i].assign_sub(self.learning_rate * m_hat / (tf.sqrt(v_hat) + self.epsilon))

# Loss function for multi-class classification
def categorical_crossentropy(y_true, y_pred):
    """Computes cross-entropy between true labels and predicted probabilities"""
    return -tf.reduce_mean(tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1))

def train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.001):
    """Training loop with custom Adam optimizer"""
    optimizer = CustomAdam(learning_rate=learning_rate)
    num_samples = x_train.shape[0]
    num_batches = int(np.ceil(num_samples / batch_size))

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        epoch_loss = []

        for batch in range(num_batches):
            # Get batch data
            start = batch * batch_size
            end = start + batch_size
            x_batch = x_train[start:end]
            y_batch = y_train[start:end]

            # Forward pass with gradient tracking
            with tf.GradientTape() as tape:
                y_pred = model(x_batch, training=True)
                loss = categorical_crossentropy(y_batch, y_pred)

            # Compute gradients
            grads = tape.gradient(loss, model.trainable_variables)

            # Update weights using Adam
            optimizer.update(model.trainable_variables, grads)

            # Track loss
            epoch_loss.append(loss.numpy())

            # Print progress
            if batch % 100 == 0:
                print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy():.4f}")

        print(f"Epoch Avg Loss: {np.mean(epoch_loss):.4f}")

# Train the model
train_model(model, x_train, y_train)

def evaluate_model(model, x_test, y_test):
    """Evaluate model accuracy on test set"""
    y_pred = model(x_test)
    y_pred = tf.argmax(y_pred, axis=1)  # Convert probabilities to class indices
    y_true = tf.argmax(y_test, axis=1)  # Convert one-hot to class indices
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
    print(f"\nTest Accuracy: {accuracy.numpy()*100:.2f}%")

# Evaluate model performance
evaluate_model(model, x_test, y_test)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
Batch 0/1875, Loss: 2.4785
Batch 100/1875, Loss: 0.5396
Batch 200/1875, Loss: 0.3674
Batch 300/1875, Loss: 0.2544
Batch 400/1875, Loss: 0.1693
Batch 500/1875, Loss: 0.3770
Batch 600/1875, Loss: 0.1740
Batch 700/1875, Loss: 0.1249
Batch 800/1875, Loss: 0.2094
Batch 900/1875, Loss: 0.1223
Batch 1000/1875, Loss: 0.4019
Batch 1100/1875, Loss: 0.2129
Batch 1200/1875, Loss: 0.1556
Batch 1300/1875, Loss: 0.2041
Batch 1400/1875, Loss: 0.2790
Batch 1500/1875, Loss: 0.1339
Batch 1600/1875, Loss: 0.2121
Batch 1700/1875, Loss: 0.0936
Batch 1800/1875, Loss: 0.2057
Epoch Avg Loss: 0.2716
Epoch 2/5
Batch 0/1875, Loss: 0.0635
Batch 100/1875, Loss: 0.2010
Batch 200/1875, Loss: 0.1275
Batch 300/1875, Loss: 0.0388
Batch 400/1875, Loss: 0.0953
Batch 500/1875, Loss: 0.1513
Batch 600/1875, Loss: 0.0440
Batch 700/1875, Loss: 0.0449
Batch 800/1875, Loss: 0.1037
Batch 900/1875, Loss: 0.0372
Batch 1000/1875, Loss: 0.2292
Batch 1100/1875, Loss: 0.1453
Batch 1200/1875, Loss: 0.0806
Batch 1300/1875, Loss

5. Stochastic Gradient Descent (SGD) using tensorflow

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten images
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # One-hot encode labels
y_test = to_categorical(y_test, 10)

# Create and compile the model
model = Sequential([Dense(128, activation='relu', input_shape=(28 * 28,)), Dense(10, activation='softmax')])
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:",test_acc)

Epoch 1/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7056 - loss: 1.1232 - val_accuracy: 0.9012 - val_loss: 0.3806
Epoch 2/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.8968 - loss: 0.3835 - val_accuracy: 0.9130 - val_loss: 0.3138
Epoch 3/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9109 - loss: 0.3187 - val_accuracy: 0.9227 - val_loss: 0.2771
Epoch 4/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9225 - loss: 0.2769 - val_accuracy: 0.9287 - val_loss: 0.2545
Epoch 5/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9259 - loss: 0.2620 - val_accuracy: 0.9348 - val_loss: 0.2364
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9218 - loss: 0.2751
Test accuracy: 0.9333999752998352


6. SGD with momentum using tensorflow

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD  # Import SGD optimizer

# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten images
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # One-hot encode labels
y_test = to_categorical(y_test, 10)

# Create and compile the model with SGD with Momentum
model = Sequential([
    Dense(128, activation='relu', input_shape=(28 * 28,)),  # Hidden layer
    Dense(10, activation='softmax')  # Output layer
])

# Configure SGD with Momentum (momentum=0.9 is a common value)
sgd_with_momentum = SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=sgd_with_momentum,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train and evaluate the model
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_split=0.2)

test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:", test_acc)

Epoch 1/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8362 - loss: 0.5568 - val_accuracy: 0.9482 - val_loss: 0.1857
Epoch 2/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9484 - loss: 0.1747 - val_accuracy: 0.9617 - val_loss: 0.1370
Epoch 3/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9658 - loss: 0.1179 - val_accuracy: 0.9672 - val_loss: 0.1127
Epoch 4/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9739 - loss: 0.0932 - val_accuracy: 0.9692 - val_loss: 0.1057
Epoch 5/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9781 - loss: 0.0764 - val_accuracy: 0.9714 - val_loss: 0.0953
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9682 - loss: 0.1009
Test accuracy: 0.9732999801635742


7. Adaptive Moment Estimation (Adam) using tensorflow

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical


# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten images
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # One-hot encode labels
y_test = to_categorical(y_test, 10)

# Create and compile the model
model = Sequential([Dense(128, activation='relu', input_shape=(28 * 28,)), Dense(10, activation='softmax')])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:",test_acc)

Epoch 1/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.8680 - loss: 0.4692 - val_accuracy: 0.9513 - val_loss: 0.1696
Epoch 2/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9575 - loss: 0.1434 - val_accuracy: 0.9678 - val_loss: 0.1128
Epoch 3/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9711 - loss: 0.0962 - val_accuracy: 0.9707 - val_loss: 0.1004
Epoch 4/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.9805 - loss: 0.0683 - val_accuracy: 0.9703 - val_loss: 0.0978
Epoch 5/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9851 - loss: 0.0499 - val_accuracy: 0.9736 - val_loss: 0.0921
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9708 - loss: 0.0935
Test accuracy: 0.9740999937057495


8. Root Mean Squared Propagation (RMSprop) using tensorflow

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Load and preprocess data
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # Normalize pixel values
x_train = x_train.reshape(-1, 28 * 28)  # Flatten images
x_test = x_test.reshape(-1, 28 * 28)
y_train = to_categorical(y_train, 10)  # One-hot encode labels
y_test = to_categorical(y_test, 10)

# Create and compile the model
model = Sequential([Dense(128, activation='relu', input_shape=(28 * 28,)), Dense(10, activation='softmax')])
model.compile(optimizer='rmsprop' ,loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:",test_acc)

Epoch 1/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8719 - loss: 0.4611 - val_accuracy: 0.9520 - val_loss: 0.1629
Epoch 2/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9584 - loss: 0.1420 - val_accuracy: 0.9653 - val_loss: 0.1200
Epoch 3/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9717 - loss: 0.0938 - val_accuracy: 0.9679 - val_loss: 0.1073
Epoch 4/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9786 - loss: 0.0728 - val_accuracy: 0.9732 - val_loss: 0.0926
Epoch 5/5
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9828 - loss: 0.0604 - val_accuracy: 0.9737 - val_loss: 0.0921
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9717 - loss: 0.1041
Test accuracy: 0.9749000072479248
