In [87]:
import numpy as np

class MomentumGradientDescent:
    def __init__(
        self,
        learning_rate=0.001,
        momentum=0.9,
        max_iters=1e4,
        epsilon=1e-8,
        batch_size=10,
        record_history=False,
    ):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.max_iters = max_iters
        self.record_history = record_history
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.prev_delta_w = None
        if record_history:
            # to store the weight history for visualization
            self.w_history = []

    def run(self, gradient_fn, x, y, w):
        grad = np.inf
        t = 1
        N, D = x.shape
        self.prev_delta_w = np.zeros(w.shape)
        while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
            grad = gradient_fn(x, y, w)
            delta_w = self.get_delta_w(grad)

            # weight update step
            w = w - self.learning_rate * delta_w
            if self.record_history:
                self.w_history.append(w)
            t += 1
        return w

#     def run(self, gradient_fn, x, y, w):
#         grad = np.inf
#         t = 1
#         N, D = x.shape
#         self.prev_delta_w = np.zeros(D)
#         while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
#             for i in range(0, N, self.batch_size):
#                 if x.ndim == 1:
#                     batch_x = x[i : i + self.batch_size]
#                 else:
#                     batch_x = x[i : i + self.batch_size, :]

#                 if y.ndim == 1:
#                     batch_y = y[i : i + self.batch_size]
#                 else:
#                     batch_y = y[i : i + self.batch_size, :]

#                 # compute the gradient with present weight
#                 grad = gradient_fn(batch_x, batch_y, w)
#                 delta_w = self.get_delta_w(grad)

#                 # weight update step
#                 w = w - self.learning_rate * delta_w
#                 if self.record_history:
#                     self.w_history.append(w)
#             t += 1
#         return w

    def get_delta_w(self, grad):
        beta = self.momentum
        delta_w = beta * self.prev_delta_w + (1 - beta) * grad
        self.prev_delta_w = delta_w

        return delta_w

In [97]:
import numpy as np

# from the given Colab code
logistic = lambda z: 1./ (1 + np.exp(-z))  

class LinearRegression:

    def __init__(self, add_bias=True):
        self.add_bias = add_bias
        pass
            
    def fit(self, x, y, C, optimizer):
        if x.ndim == 1:
            x = x[:, None]
        if self.add_bias:
            N = x.shape[0]
            x = np.column_stack([x,np.ones(N)])
        N,D = x.shape
        
        def softmax(z):
            return (np.exp(z.T) / np.sum(np.exp(z), axis=1)).T
        
        def to_classlabel(z):
            return z.argmax(axis=1)

#         def gradient(x, y, w):  # Use softmax gradient function
#             print(y)
#             N,D = x.shape
#             expSum = 0
#             yh = []
#             for c in y.shape:
#                 cur = np.exp(np.dot(x, w[c]))
#                 yh.append(cur)
#                 expSum += cur
                
#             grads = []
#             for c in range(len(yh)):
#                 yh[c] = yh[c] / expSum
#                 grads.append(np.dot(x.T, yh[c] - y[c]) / N)
#             return grads
        
        def gradient(x, y, w):
            N, D = x.shape
            yh = softmax(np.dot(x, w))
            print(to_classlabel(yh).shape)
            print(y.shape)
            print(x.T.shape)
            
            grad = np.dot(x.T, to_classlabel(yh) - y) / N
            return grad

        w0 = np.zeros((D,C)) # initialize the weights to 0
        self.w = optimizer.run(gradient, x, y, w0) # run the optimizer to get the optimal weights
        return self
    
    def predict(self, x):
        if self.add_bias:
            x = np.column_stack([x,np.ones(N)])
        yh = x @ self.w
        return yh
        

In [98]:
from sklearn.datasets import load_digits
# from GradientDescent import *

digits = load_digits()
x, y = digits.data, digits.target

optimizer = MomentumGradientDescent(learning_rate=.005, max_iters=1000)
model = LinearRegression(add_bias=False)
# the digits dataset has 10 classes
model.fit(x, y, 10, optimizer)

# plot the final fit (not sure if this will work)
plt.plot(x, y, '.')
plt.plot(x, x*model.w[0]+model.w[1], 'r-', label=str(model.w[0]))
plt.xlabel('x')
plt.ylabel(r'$y=xw_1 + w_0$')
plt.show()

(1797,)
(1797,)
(64, 1797)


ValueError: operands could not be broadcast together with shapes (64,10) (64,) 