In [13]:
# Gradient Descent.

# MSE cost function for Linear Regression is...
# + convex i.e. no local minima
# + continuous and everywhere differentiable (its slope never changes abruptly).

# Gradient descent converges fastest if all features are scaled equivalently.
# Otherwise, it follows the feature with smallest scale first.

# Need derivative of cost function.
# Use partial derivative along each feature.
# Use DEL for upside-down DELTA.
# DEL = vector of partial derivatives = gradient vector.
# DEL MSE(theta) = (2/m) * X^T * (X*theta - y).
# Application to all training examples = one batch.

# The gradient points uphill (direction of steepest ascent), so subtract it to go toward the min.
# Learning rate eta: too small => long time, too big => no convergence.
# Each parameter update is: theta = theta - eta * DEL MSE(theta).

# Reproduce steps from previous notebook: synthetic linear data y = 4 + 3x + noise.
import numpy as np

# Seed the global NumPy RNG so that Restart & Run All regenerates the same
# data set (and, when cells run in order, the same random draws afterwards).
np.random.seed(42)

m = 100   # training set size
X = 2 * np.random.rand(m, 1)             # single feature, uniform on [0, 2)
y = 4 + 3 * X + np.random.randn(m, 1)    # intercept 4, slope 3, plus Gaussian noise
biasX = np.c_[np.ones((m, 1)), X]        # prepend a column of ones for the bias term

In [14]:
# Batch Gradient Descent
# From one initial spot, iterate over all examples.
# Optimal solution if convex, otherwise may find local minima.
# High memory requirement.
# Slow: #iterations * #examples.

# Hyperparameters for full-batch gradient descent.
eta = 0.1           # learning rate (step size)
iterations = 1000   # fixed number of update steps to run
theta = np.random.randn(2, 1)  # random starting point where the search begins

for step in range(iterations):
    # Residuals of the current model over the entire training set.
    residuals = np.dot(biasX, theta) - y
    # Gradient of the MSE cost, averaged over all m training examples.
    gradientMSE = (2 / m) * np.dot(biasX.T, residuals)
    # Move against the gradient, i.e. downhill on the cost surface.
    theta = theta - eta * gradientMSE
theta

array([[4.10210131],
       [2.97381822]])

In [15]:
# Stochastic Gradient Descent.
# From one initial spot, iterate over random set of examples.
# Lower memory requirement. Possibility of faster convergence.
# Approximation, not optimal solution, for convex.
# Optimal if you choose optimal parameters e.g. learning schedule.
# More chance to hop out of local minima.
# Simulated annealing: gradually decreasing learn rate.
# Convention: use m iterations and remove m from denominator.
# If the training data is sorted, shuffle it first.

# Random starting point where the search begins.
theta = np.random.randn(2, 1)

# Learning-schedule parameters: the rate starts at t0/t1 and shrinks as t grows.
t0 = 5
t1 = 50


def learning_schedule(t):
    """Return the learning rate for update step t, computed as t0 / (t + t1)."""
    decayed_rate = t0 / (t1 + t)
    return decayed_rate

epochs = 50  # number of rounds; each round performs m single-example updates
for epoch in range(epochs):
    for i in range(m):
        # The learning rate decays with the total number of updates done so far.
        eta = learning_schedule(epoch * m + i)
        # Draw one training example at random (with replacement).
        index = np.random.randint(m)
        pick = slice(index, index + 1)  # a one-row slice keeps the arrays 2-D
        xi, yi = biasX[pick], y[pick]
        # Gradient of the squared error on this single example (no 1/m factor).
        error = np.dot(xi, theta) - yi
        gradientMSE = 2 * np.dot(xi.T, error)
        theta = theta - eta * gradientMSE
theta

array([[4.12694867],
       [3.03293732]])

In [17]:
# The same stochastic gradient descent, delegated to scikit-learn.
from sklearn.linear_model import SGDRegressor

# No regularization penalty; initial learning rate 0.1; stopping tolerance 1e-3.
# NOTE(review): max_iter is given the training set size m here -- per the
# scikit-learn docs it caps the number of passes over the data; confirm intent.
sgd_reg = SGDRegressor(
    eta0=0.1,
    penalty=None,
    tol=1e-3,
    max_iter=m,
)
# y is a column vector, so it is flattened to 1-D before being passed to fit().
sgd_reg.fit(X, y.ravel())
sgd_reg.intercept_, sgd_reg.coef_

(array([4.12380719]), array([3.06738848]))

In [18]:
# Mini-Batch Gradient Descent.
# Combine Batch and Stochastic i.e. small batches = random subsets.
# May be faster on GPU or hardware matrix optimization.

# SKLearn did not have it and book does not show an example. 
# SKLearn decided to add it but I cannot find where! 
# Discussion https://github.com/scikit-learn/scikit-learn/issues/14468
# Work ticket https://github.com/mozilla/CANOSP-2019/issues/30

# Author/reader discussion suggests you can roll your own.
# https://github.com/ageron/handson-ml/issues/226