Optimization algorithms: from-scratch implementations and library (TensorFlow/Keras) implementations.

1. Stochastic Gradient Descent (SGD) from scratch

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load MNIST and prepare it for a fully connected network
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Flatten each image to 28 * 28 = 784 features, then rescale pixel values to [0, 1]
x_train = x_train.reshape(-1, 28 * 28) / 255.0
x_test = x_test.reshape(-1, 28 * 28) / 255.0
# One-hot encode the labels over 10 classes
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Build the model one layer at a time: a 128-unit ReLU hidden layer
# followed by a 10-way softmax output
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(10, activation='softmax'))

# Define a custom optimizer from scratch
class CustomSGD:
    """Plain stochastic gradient descent: ``w <- w - learning_rate * grad``."""

    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate  # Step size applied to every gradient

    def update(self, weights, grads):
        """
        Update weights in place using gradient descent.
        :param weights: List of model weights (trainable variables); each must
            provide ``assign_sub``.
        :param grads: List of gradients corresponding to the weights. A ``None``
            entry (no gradient available for that weight) leaves the weight untouched.
        :raises ValueError: If ``weights`` and ``grads`` differ in length.
        """
        if len(weights) != len(grads):
            raise ValueError(
                f"Got {len(weights)} weights but {len(grads)} gradients"
            )

        for weight, grad in zip(weights, grads):
            if grad is None:
                # Nothing to apply for this weight; multiplying None would raise
                continue
            weight.assign_sub(self.learning_rate * grad)  # w = w - lr * grad

# Loss function (categorical cross-entropy)
def categorical_crossentropy(y_true, y_pred, epsilon=1e-7):
    """
    Mean categorical cross-entropy over the batch.
    :param y_true: One-hot targets, one row per sample.
    :param y_pred: Predicted class probabilities, same layout as ``y_true``.
    :param epsilon: Lower bound applied to ``y_pred`` before taking the log.
    :return: Scalar tensor holding the batch-averaged loss.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so the element-wise product below cannot fail on a mismatch
    y_true = tf.cast(y_true, y_pred.dtype)
    # log(0) is -inf and 0 * -inf is NaN, which would poison every gradient;
    # keep probabilities strictly positive before the log
    y_pred = tf.clip_by_value(y_pred, epsilon, 1.0)
    per_sample = tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1)
    return -tf.reduce_mean(per_sample)

# Training loop
def train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.01):
    """
    Train ``model`` with mini-batch gradient descent using CustomSGD.

    Batches are consecutive slices of the data in the order given (no
    shuffling); the last batch of an epoch may be shorter than ``batch_size``.
    The loss of every 100th batch is printed.
    """
    sgd = CustomSGD(learning_rate=learning_rate)
    num_batches = int(np.ceil(x_train.shape[0] / batch_size))

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch in range(num_batches):
            # Slice covering the current batch
            offset = batch * batch_size
            window = slice(offset, offset + batch_size)
            inputs, targets = x_train[window], y_train[window]

            # Forward pass, recorded so gradients can be derived
            with tf.GradientTape() as tape:
                loss = categorical_crossentropy(targets, model(inputs, training=True))

            # Backward pass followed by the weight update
            variables = model.trainable_variables
            sgd.update(variables, tape.gradient(loss, variables))

            # Progress report every 100 batches
            if batch % 100 == 0:
                print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy()}")

# Train the model with the custom SGD loop: 5 epochs, batches of 32, learning rate 0.01
train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.01)

# Evaluate the model
def evaluate_model(model, x_test, y_test):
    """Print the fraction of samples whose arg-max prediction equals the arg-max label."""
    predicted = tf.argmax(model(x_test), axis=1)
    expected = tf.argmax(y_test, axis=1)
    # 1.0 where the prediction is right, 0.0 otherwise; the mean is the accuracy
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    accuracy = tf.reduce_mean(hits)
    print(f"Test Accuracy: {accuracy.numpy()}")

# Print the accuracy of the trained model on the test split
evaluate_model(model, x_test, y_test)

Epoch 1/5
Batch 0/1875, Loss: 2.3270652294158936
Batch 100/1875, Loss: 1.6160459518432617
Batch 200/1875, Loss: 1.0054333209991455
Batch 300/1875, Loss: 0.8898893594741821
Batch 400/1875, Loss: 0.8000794649124146
Batch 500/1875, Loss: 0.8378880023956299
Batch 600/1875, Loss: 0.5260149836540222
Batch 700/1875, Loss: 0.5218067765235901
Batch 800/1875, Loss: 0.40523576736450195
Batch 900/1875, Loss: 0.34206241369247437
Batch 1000/1875, Loss: 0.5301038026809692
Batch 1100/1875, Loss: 0.43280118703842163
Batch 1200/1875, Loss: 0.44740551710128784
Batch 1300/1875, Loss: 0.48487603664398193
Batch 1400/1875, Loss: 0.368030309677124
Batch 1500/1875, Loss: 0.2723538875579834
Batch 1600/1875, Loss: 0.3821481466293335
Batch 1700/1875, Loss: 0.2665557861328125
Batch 1800/1875, Loss: 0.38760656118392944
Epoch 2/5
Batch 0/1875, Loss: 0.4517681896686554
Batch 100/1875, Loss: 0.3804982304573059
Batch 200/1875, Loss: 0.379755437374115
Batch 300/1875, Loss: 0.29828667640686035
Batch 400/1875, Loss: 0.286

2. Root Mean Squared Propagation (RMSprop) from scratch

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load MNIST and prepare it for a fully connected network
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Flatten each image to 28 * 28 = 784 features, then rescale pixel values to [0, 1]
x_train = x_train.reshape(-1, 28 * 28) / 255.0
x_test = x_test.reshape(-1, 28 * 28) / 255.0
# One-hot encode the labels over 10 classes
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Build the model one layer at a time: a 128-unit ReLU hidden layer
# followed by a 10-way softmax output
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(10, activation='softmax'))

# Define the RMSProp optimizer from scratch
class RMSProp:
    """RMSProp: scale each step by a running root-mean-square of past gradients."""

    def __init__(self, learning_rate=0.001, beta=0.9, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta = beta  # Decay rate for the moving average of squared gradients
        self.epsilon = epsilon  # Small constant to avoid division by zero
        self.v = {}  # Moving average of squared gradients, keyed like ``params``

    def update(self, params, grads):
        """
        Update parameters in place using the RMSProp rule.
        :param params: Dictionary of model parameters (weights and biases).
        :param grads: Dictionary of gradients, with the same keys as ``params``.
            A ``None`` gradient leaves both the parameter and its running
            average untouched.
        :raises KeyError: If a key of ``params`` is missing from ``grads``.
        """
        for key, param in params.items():
            grad = grads[key]
            if grad is None:
                # No gradient for this parameter; squaring None would raise
                continue

            # Start the moving average at zero the first time a key is seen
            previous = self.v.get(key)
            if previous is None:
                previous = tf.zeros_like(param)

            # v = beta * v + (1 - beta) * grad^2
            average = self.beta * previous + (1 - self.beta) * tf.square(grad)
            self.v[key] = average

            # w = w - lr * grad / (sqrt(v) + epsilon)
            param.assign_sub(self.learning_rate * grad / (tf.sqrt(average) + self.epsilon))

# Loss function (categorical cross-entropy)
def categorical_crossentropy(y_true, y_pred, epsilon=1e-7):
    """
    Mean categorical cross-entropy over the batch.
    :param y_true: One-hot targets, one row per sample.
    :param y_pred: Predicted class probabilities, same layout as ``y_true``.
    :param epsilon: Lower bound applied to ``y_pred`` before taking the log.
    :return: Scalar tensor holding the batch-averaged loss.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so the element-wise product below cannot fail on a mismatch
    y_true = tf.cast(y_true, y_pred.dtype)
    # log(0) is -inf and 0 * -inf is NaN, which would poison every gradient;
    # keep probabilities strictly positive before the log
    y_pred = tf.clip_by_value(y_pred, epsilon, 1.0)
    per_sample = tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1)
    return -tf.reduce_mean(per_sample)

# Training loop
def train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.001):
    """
    Train ``model`` with mini-batch updates from the custom RMSProp optimizer.

    Batches are consecutive slices of the data in the order given (no
    shuffling); the last batch of an epoch may be shorter than ``batch_size``.
    The loss of every 100th batch is printed.
    """
    rmsprop = RMSProp(learning_rate=learning_rate)
    num_batches = int(np.ceil(x_train.shape[0] / batch_size))

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch in range(num_batches):
            # Slice covering the current batch
            offset = batch * batch_size
            window = slice(offset, offset + batch_size)
            inputs, targets = x_train[window], y_train[window]

            # Forward pass, recorded so gradients can be derived
            with tf.GradientTape() as tape:
                loss = categorical_crossentropy(targets, model(inputs, training=True))

            # Backward pass
            variables = model.trainable_variables
            grads = tape.gradient(loss, variables)

            # RMSProp keeps its state per key, so pair variables and gradients
            # under matching positional names
            params_dict = {f'w{i}': param for i, param in enumerate(variables)}
            grads_dict = {f'w{i}': grad for i, grad in enumerate(grads)}
            rmsprop.update(params_dict, grads_dict)

            # Progress report every 100 batches
            if batch % 100 == 0:
                print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy()}")

# Train the model with the custom RMSProp loop: 5 epochs, batches of 32, learning rate 0.001
train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.001)

# Evaluate the model
def evaluate_model(model, x_test, y_test):
    """Print the fraction of samples whose arg-max prediction equals the arg-max label."""
    predicted = tf.argmax(model(x_test), axis=1)
    expected = tf.argmax(y_test, axis=1)
    # 1.0 where the prediction is right, 0.0 otherwise; the mean is the accuracy
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    accuracy = tf.reduce_mean(hits)
    print(f"Test Accuracy: {accuracy.numpy()}")

# Print the accuracy of the trained model on the test split
evaluate_model(model, x_test, y_test)

Epoch 1/5
Batch 0/1875, Loss: 2.4011390209198
Batch 100/1875, Loss: 0.4783175587654114
Batch 200/1875, Loss: 0.34606945514678955
Batch 300/1875, Loss: 0.1497497260570526
Batch 400/1875, Loss: 0.1784745305776596
Batch 500/1875, Loss: 0.31149929761886597
Batch 600/1875, Loss: 0.28456440567970276
Batch 700/1875, Loss: 0.13499826192855835
Batch 800/1875, Loss: 0.18486712872982025
Batch 900/1875, Loss: 0.13458126783370972
Batch 1000/1875, Loss: 0.461059033870697
Batch 1100/1875, Loss: 0.28478243947029114
Batch 1200/1875, Loss: 0.15832790732383728
Batch 1300/1875, Loss: 0.1299748420715332
Batch 1400/1875, Loss: 0.16376101970672607
Batch 1500/1875, Loss: 0.1700705885887146
Batch 1600/1875, Loss: 0.14414790272712708
Batch 1700/1875, Loss: 0.06671467423439026
Batch 1800/1875, Loss: 0.1317974030971527
Epoch 2/5
Batch 0/1875, Loss: 0.1034427285194397
Batch 100/1875, Loss: 0.1183730885386467
Batch 200/1875, Loss: 0.17879898846149445
Batch 300/1875, Loss: 0.018749509006738663
Batch 400/1875, Loss: 

3. Adaptive Moment Estimation (Adam) from scratch

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np

# Load MNIST and prepare it for a fully connected network
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Flatten each image to 28 * 28 = 784 features, then rescale pixel values to [0, 1]
x_train = x_train.reshape(-1, 28 * 28) / 255.0
x_test = x_test.reshape(-1, 28 * 28) / 255.0
# One-hot encode the labels over 10 classes
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Build the model one layer at a time: a 128-unit ReLU hidden layer
# followed by a 10-way softmax output
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(10, activation='softmax'))

# Define the Adam optimizer from scratch
class CustomAdam:
    """Adam: gradient descent with bias-corrected first and second moment estimates."""

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7):
        self.learning_rate = learning_rate
        self.beta1 = beta1  # Exponential decay rate for the first moment estimates
        self.beta2 = beta2  # Exponential decay rate for the second moment estimates
        self.epsilon = epsilon  # Small constant to avoid division by zero
        self.m = None  # First moment vector (mean), created on the first update
        self.v = None  # Second moment vector (uncentered variance), created on the first update
        self.t = 0  # Time step (number of update calls so far)

    def update(self, weights, grads):
        """
        Update weights in place using the Adam optimizer.
        :param weights: List of model weights (trainable variables).
        :param grads: List of gradients corresponding to the weights. A ``None``
            entry leaves that weight and its moment estimates untouched.
        :raises ValueError: If ``weights`` and ``grads`` differ in length.
        """
        # Validate before touching any state so a bad call cannot advance the
        # time step or half-initialise the moment vectors
        if len(weights) != len(grads):
            raise ValueError(
                f"Got {len(weights)} weights but {len(grads)} gradients"
            )

        if self.m is None:
            # Initialize first and second moment vectors
            self.m = [tf.zeros_like(w) for w in weights]
            self.v = [tf.zeros_like(w) for w in weights]

        self.t += 1  # Increment time step

        # The bias-correction denominators depend only on t, so compute them once
        m_correction = 1 - self.beta1 ** self.t
        v_correction = 1 - self.beta2 ** self.t

        for i, (weight, grad) in enumerate(zip(weights, grads)):
            if grad is None:
                # No gradient for this weight; arithmetic on None would raise
                continue

            # Update biased first moment estimate (m)
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            # Update biased second moment estimate (v)
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * tf.square(grad)

            # Bias-corrected estimates compensate for the zero initialisation
            m_hat = self.m[i] / m_correction
            v_hat = self.v[i] / v_correction

            # Update weights
            weight.assign_sub(self.learning_rate * m_hat / (tf.sqrt(v_hat) + self.epsilon))

# Loss function (categorical cross-entropy)
def categorical_crossentropy(y_true, y_pred, epsilon=1e-7):
    """
    Mean categorical cross-entropy over the batch.
    :param y_true: One-hot targets, one row per sample.
    :param y_pred: Predicted class probabilities, same layout as ``y_true``.
    :param epsilon: Lower bound applied to ``y_pred`` before taking the log.
    :return: Scalar tensor holding the batch-averaged loss.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so the element-wise product below cannot fail on a mismatch
    y_true = tf.cast(y_true, y_pred.dtype)
    # log(0) is -inf and 0 * -inf is NaN, which would poison every gradient;
    # keep probabilities strictly positive before the log
    y_pred = tf.clip_by_value(y_pred, epsilon, 1.0)
    per_sample = tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1)
    return -tf.reduce_mean(per_sample)

# Training loop
def train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.001):
    """
    Train ``model`` with mini-batch updates from the custom Adam optimizer.

    Batches are consecutive slices of the data in the order given (no
    shuffling); the last batch of an epoch may be shorter than ``batch_size``.
    The loss of every 100th batch is printed.
    """
    adam = CustomAdam(learning_rate=learning_rate)
    num_batches = int(np.ceil(x_train.shape[0] / batch_size))

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch in range(num_batches):
            # Slice covering the current batch
            offset = batch * batch_size
            window = slice(offset, offset + batch_size)
            inputs, targets = x_train[window], y_train[window]

            # Forward pass, recorded so gradients can be derived
            with tf.GradientTape() as tape:
                loss = categorical_crossentropy(targets, model(inputs, training=True))

            # Backward pass followed by the Adam weight update
            variables = model.trainable_variables
            adam.update(variables, tape.gradient(loss, variables))

            # Progress report every 100 batches
            if batch % 100 == 0:
                print(f"Batch {batch}/{num_batches}, Loss: {loss.numpy()}")

# Train the model with the custom Adam loop: 5 epochs, batches of 32, learning rate 0.001
train_model(model, x_train, y_train, epochs=5, batch_size=32, learning_rate=0.001)

# Evaluate the model
def evaluate_model(model, x_test, y_test):
    """Print the fraction of samples whose arg-max prediction equals the arg-max label."""
    predicted = tf.argmax(model(x_test), axis=1)
    expected = tf.argmax(y_test, axis=1)
    # 1.0 where the prediction is right, 0.0 otherwise; the mean is the accuracy
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    accuracy = tf.reduce_mean(hits)
    print(f"Test Accuracy: {accuracy.numpy()}")

# Print the accuracy of the trained model on the test split
evaluate_model(model, x_test, y_test)

Epoch 1/5
Batch 0/1875, Loss: 2.474738121032715
Batch 100/1875, Loss: 0.5045326948165894
Batch 200/1875, Loss: 0.37019145488739014
Batch 300/1875, Loss: 0.20342355966567993
Batch 400/1875, Loss: 0.191922128200531
Batch 500/1875, Loss: 0.38823366165161133
Batch 600/1875, Loss: 0.16828876733779907
Batch 700/1875, Loss: 0.13100148737430573
Batch 800/1875, Loss: 0.159723699092865
Batch 900/1875, Loss: 0.11896780878305435
Batch 1000/1875, Loss: 0.46159279346466064
Batch 1100/1875, Loss: 0.17954494059085846
Batch 1200/1875, Loss: 0.17885930836200714
Batch 1300/1875, Loss: 0.1602754294872284
Batch 1400/1875, Loss: 0.2274518609046936
Batch 1500/1875, Loss: 0.16448281705379486
Batch 1600/1875, Loss: 0.19772203266620636
Batch 1700/1875, Loss: 0.06007068604230881
Batch 1800/1875, Loss: 0.16574259102344513
Epoch 2/5
Batch 0/1875, Loss: 0.08356095850467682
Batch 100/1875, Loss: 0.1930152177810669
Batch 200/1875, Loss: 0.16765405237674713
Batch 300/1875, Loss: 0.048770565539598465
Batch 400/1875, Lo

4. Stochastic Gradient Descent (SGD) using tensorflow

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist 
from tensorflow.keras.utils import to_categorical # Load and preprocess data

# Load MNIST, flatten each image to 28 * 28 = 784 features and rescale pixels to [0, 1]
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28 * 28) / 255.0
x_test = x_test.reshape(-1, 28 * 28) / 255.0

# One-hot encode the labels over 10 classes
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Create the model layer by layer, then compile it with Keras' built-in SGD
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:", test_acc)

Epoch 1/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 0.7035 - loss: 1.1322 - val_accuracy: 0.9011 - val_loss: 0.3780
Epoch 2/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - accuracy: 0.8924 - loss: 0.3839 - val_accuracy: 0.9150 - val_loss: 0.3076
Epoch 3/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9097 - loss: 0.3232 - val_accuracy: 0.9231 - val_loss: 0.2756
Epoch 4/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9201 - loss: 0.2821 - val_accuracy: 0.9296 - val_loss: 0.2535
Epoch 5/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9296 - loss: 0.2560 - val_accuracy: 0.9330 - val_loss: 0.2369
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9246 - loss: 0.2717
Test accuracy: 0.9340000152587891


5. Adaptive Moment Estimation (Adam) using tensorflow

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist 
from tensorflow.keras.utils import to_categorical # Load and preprocess data

# Load MNIST, flatten each image to 28 * 28 = 784 features and rescale pixels to [0, 1]
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28 * 28) / 255.0
x_test = x_test.reshape(-1, 28 * 28) / 255.0

# One-hot encode the labels over 10 classes
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Create the model layer by layer, then compile it with Keras' built-in Adam
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:", test_acc)

Epoch 1/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.8626 - loss: 0.4759 - val_accuracy: 0.9572 - val_loss: 0.1536
Epoch 2/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9587 - loss: 0.1384 - val_accuracy: 0.9664 - val_loss: 0.1157
Epoch 3/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9734 - loss: 0.0896 - val_accuracy: 0.9691 - val_loss: 0.1018
Epoch 4/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - accuracy: 0.9817 - loss: 0.0629 - val_accuracy: 0.9679 - val_loss: 0.1089
Epoch 5/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9850 - loss: 0.0483 - val_accuracy: 0.9747 - val_loss: 0.0852
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9738 - loss: 0.0837
Test accuracy: 0.9771999716758728


6. Root Mean Squared Propagation (RMSprop) using tensorflow

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist 
from tensorflow.keras.utils import to_categorical # Load and preprocess data

# Load MNIST, flatten each image to 28 * 28 = 784 features and rescale pixels to [0, 1]
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1, 28 * 28) / 255.0
x_test = x_test.reshape(-1, 28 * 28) / 255.0

# One-hot encode the labels over 10 classes
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Create the model layer by layer, then compile it with Keras' built-in RMSprop
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.2)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:", test_acc)

Epoch 1/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - accuracy: 0.8743 - loss: 0.4492 - val_accuracy: 0.9539 - val_loss: 0.1587
Epoch 2/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9591 - loss: 0.1387 - val_accuracy: 0.9663 - val_loss: 0.1212
Epoch 3/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 0.9716 - loss: 0.0954 - val_accuracy: 0.9683 - val_loss: 0.1111
Epoch 4/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9788 - loss: 0.0722 - val_accuracy: 0.9735 - val_loss: 0.0948
Epoch 5/5
1500/1500 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 0.9831 - loss: 0.0582 - val_accuracy: 0.9743 - val_loss: 0.0998
313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9727 - loss: 0.0966
Test accuracy: 0.9754999876022339
