# Gradient Descent with Minibatch and Momentum

In [3]:
import numpy as np

class MomentumGradientDescent:
    def __init__(
        self,
        learning_rate=0.001,
        momentum=0.9,
        max_iters=1e4,
        epsilon=1e-8,
        batch_size=10,
        record_history=False,
    ):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.max_iters = max_iters
        self.record_history = record_history
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.prev_delta_w = None
        if record_history:
            # to store the weight history for visualization
            self.w_history = []

    def run(self, gradient_fn, x, y, w):
        grad = np.inf
        t = 1
        N, D = x.shape
        self.prev_delta_w = np.zeros(w.shape)
        while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
            grad = gradient_fn(x, y, w)
            delta_w = self.get_delta_w(grad)

            # weight update step
            w = w - self.learning_rate * delta_w
            if self.record_history:
                self.w_history.append(w)
            t += 1
        return w

#     def run(self, gradient_fn, x, y, w):
#         grad = np.inf
#         t = 1
#         N, D = x.shape
#         self.prev_delta_w = np.zeros(D)
#         while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
#             for i in range(0, N, self.batch_size):
#                 if x.ndim == 1:
#                     batch_x = x[i : i + self.batch_size]
#                 else:
#                     batch_x = x[i : i + self.batch_size, :]

#                 if y.ndim == 1:
#                     batch_y = y[i : i + self.batch_size]
#                 else:
#                     batch_y = y[i : i + self.batch_size, :]

#                 # compute the gradient with present weight
#                 grad = gradient_fn(batch_x, batch_y, w)
#                 delta_w = self.get_delta_w(grad)

#                 # weight update step
#                 w = w - self.learning_rate * delta_w
#                 if self.record_history:
#                     self.w_history.append(w)
#             t += 1
#         return w

    def get_delta_w(self, grad):
        beta = self.momentum
        delta_w = beta * self.prev_delta_w + (1 - beta) * grad
        self.prev_delta_w = delta_w

        return delta_w

# Softmax Regression

In [134]:
import numpy as np

# from the given Colab code
logistic = lambda z: 1./ (1 + np.exp(-z))  

class LinearRegression:

    def __init__(self, add_bias=True):
        self.add_bias = add_bias
        pass
            
    def fit(self, x, y, C, optimizer):
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias:
            N = x.shape[0]
            x = np.column_stack([x,np.ones(N)])
        N,D = x.shape
        
        def to_onehot(a):
            return np.eye(C)[a]
        
        def gradient(x, y, w):
            N, D = x.shape
            # yh: N x C
            yh = self.softmax(np.dot(x, w))
            # both are N x C
            yh = to_onehot(self.to_classlabel(yh))
            y = to_onehot(y)
            
            grad = np.dot(x.T, yh - y) / N
            return grad
        
        # initialize all weights to 0
        w0 = np.zeros((D,C)) 
        # run the optimizer to get the optimal weights
        self.w = optimizer.run(gradient, x, y, w0) 
        return self
    
    def softmax(self, z):
        # to prevent overflow/underflow
        z = z - np.max(z, axis=-1, keepdims=True)
        return (np.exp(z.T) / np.sum(np.exp(z), axis=1)).T

    def to_classlabel(self, z):
        return z.argmax(axis=1)
    
    def predict(self, x):
        if self.add_bias:
            x = np.column_stack([x,np.ones(N)])
        # convert from 1D to 2D
        x = np.reshape(x, (1, -1))
        yh = self.softmax(np.dot(x, self.w))
        return self.to_classlabel(yh)[0]
        

# Validation of Model

Validation is performed using K-fold cross-validation. 

In [144]:
import math
import pandas as pd 

# Returns 2 datasets (training and validation)
def k_fold_splitter(fold, dataset):
    start = math.floor(fold*(dataset.shape[0]/5))
    end = math.floor((fold+1)*(dataset.shape[0]/5))

    training = np.delete(dataset, slice(start, end), axis=0)
    validation = dataset[start:end-1]

    return training, validation

# Digits Dataset

In [145]:
from sklearn.datasets import load_digits

digits = load_digits()
x, y = digits.data, digits.target

C = 10
accuracies = []
# do 5-fold cross-validation
for fold_num in range(5):
    train_data, validation_data = k_fold_splitter(fold_num, x)
    train_labels, validation_labels = k_fold_splitter(fold_num, y)
    
    optimizer = MomentumGradientDescent(learning_rate=.005, max_iters=1000)
    model = LinearRegression(add_bias=False)
    model.fit(train_data, train_labels, C, optimizer)
    
    num_misclassified = 0
    # calculate the accuracy
    for i in range(len(validation_data)):
        prediction = model.predict(validation_data[i, :])
        if prediction != validation_labels[i]:
            num_misclassified += 1
            
    misclassification_rate = num_misclassified / len(validation_labels)
    accuracies.append(1 - misclassification_rate)
    
print("Accuracy: {}".format(np.average(accuracies)))

Accuracy: 0.9146278458162804


# Wine Dataset

In [146]:
from sklearn.datasets import load_wine

wine = load_wine()
x, y = wine.data, wine.target

C = 3
accuracies = []
# do 5-fold cross-validation
for fold_num in range(5):
    train_data, validation_data = k_fold_splitter(fold_num, x)
    train_labels, validation_labels = k_fold_splitter(fold_num, y)
    
    optimizer = MomentumGradientDescent(learning_rate=.005, max_iters=10000)
    model = LinearRegression(add_bias=False)
    model.fit(train_data, train_labels, C, optimizer)
    
    num_misclassified = 0
    # calculate the accuracy
    for i in range(len(validation_data)):
        prediction = model.predict(validation_data[i, :])
        if prediction != validation_labels[i]:
            num_misclassified += 1
            
    misclassification_rate = num_misclassified / len(validation_labels)
    accuracies.append(1 - misclassification_rate)
    
print("Accuracy: {}".format(np.average(accuracies)))

Accuracy: 0.924705882352941
