<a href="https://colab.research.google.com/github/PaulToronto/Stanford-Andrew-Ng-Machine-Learning-Specialization/blob/main/Applied_Gradient_Descent_with_Numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Applied Gradient Descent with `numpy`

## Imports

In [1]:
import pandas as pd
import numpy as np
import math
import copy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## The Data

In [2]:
# Raw CSV hosted in the course repository; read without a header row.
path = (
    'https://raw.githubusercontent.com/PaulToronto/'
    'Stanford-Andrew-Ng-Machine-Learning-Specialization/main/data/'
    'houses2.csv'
)
houses = pd.read_csv(path, header=None)

In [3]:
# The CSV was read with header=None, so name the columns explicitly.
# The last column, `price`, is the regression target.
houses.columns = ['size_sqft', 'bedrooms', 'floors', 'age', 'price']
houses

Unnamed: 0,size_sqft,bedrooms,floors,age,price
0,952.0,2.0,1.0,65.0,271.5
1,1244.0,3.0,1.0,64.0,300.0
2,1947.0,3.0,2.0,17.0,509.8
3,1725.0,3.0,2.0,42.0,394.0
4,1959.0,3.0,2.0,15.0,540.0
...,...,...,...,...,...
95,1224.0,2.0,2.0,12.0,329.0
96,1432.0,2.0,1.0,43.0,388.0
97,1660.0,3.0,2.0,19.0,390.0
98,1212.0,3.0,1.0,20.0,356.0


### `X_train`, `y_train`

In [4]:
# Feature matrix: every column except the target `price`, as a NumPy array
X_train = houses.drop(columns='price').to_numpy()
X_train.shape, X_train[0:5]

((100, 4),
 array([[9.520e+02, 2.000e+00, 1.000e+00, 6.500e+01],
        [1.244e+03, 3.000e+00, 1.000e+00, 6.400e+01],
        [1.947e+03, 3.000e+00, 2.000e+00, 1.700e+01],
        [1.725e+03, 3.000e+00, 2.000e+00, 4.200e+01],
        [1.959e+03, 3.000e+00, 2.000e+00, 1.500e+01]]))

In [5]:
# Target vector: the `price` column as a 1-D NumPy array
y_train = houses['price'].to_numpy()
y_train.shape, y_train

((100,),
 array([271.5  , 300.   , 509.8  , 394.   , 540.   , 415.   , 230.   ,
        560.   , 294.   , 718.2  , 200.   , 302.   , 468.   , 374.2  ,
        388.   , 282.   , 311.8  , 401.   , 449.8  , 301.   , 502.   ,
        340.   , 400.282, 572.   , 264.   , 304.   , 298.   , 219.8  ,
        490.7  , 216.96 , 368.2  , 280.   , 526.87 , 237.   , 562.426,
        369.8  , 460.   , 374.   , 390.   , 158.   , 426.   , 390.   ,
        277.774, 216.96 , 425.8  , 504.   , 329.   , 464.   , 220.   ,
        358.   , 478.   , 334.   , 426.98 , 290.   , 463.   , 390.8  ,
        354.   , 350.   , 460.   , 237.   , 288.304, 282.   , 249.   ,
        304.   , 332.   , 351.8  , 310.   , 216.96 , 666.336, 330.   ,
        480.   , 330.3  , 348.   , 304.   , 384.   , 316.   , 430.4  ,
        450.   , 284.   , 275.   , 414.   , 258.   , 378.   , 350.   ,
        412.   , 373.   , 225.   , 390.   , 267.4  , 464.   , 174.   ,
        340.   , 430.   , 440.   , 216.   , 329.   , 388.   , 390.  

### `w_best`, `b_best`

In [6]:
# Fit scikit-learn's LinearRegression on the raw features to obtain reference
# parameters; the gradient-descent results are compared against these later.
model = LinearRegression()
model.fit(X_train, y_train)
w_best = model.coef_       # one weight per feature
b_best = model.intercept_  # bias term
w_best, b_best

(array([  0.26836643, -32.90362407, -67.28804158,  -1.46516763]),
 221.50226366888353)

## Functions

### Model Prediction Function

In [7]:
def f_wb(X, w, b):
    """
    Linear model prediction
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns:
      (ndarray (m,)): X @ w + b, one prediction per example
    """
    return X @ w + b

In [8]:
# Predictions on the training set using the scikit-learn reference parameters
f_wb(X_train, w_best, b_best)

array([248.65591798, 295.58045873, 485.81689524, 389.61035726,
       491.96762765, 420.52811442, 223.57450463, 523.31626324,
       267.84509755, 685.00447351, 182.66630919, 318.25079707,
       479.88048606, 409.96399029, 393.83650249, 287.69238564,
       323.97747981, 405.90878584, 436.31413687, 270.13705992,
       500.97979389, 329.10878125, 388.16122321, 552.41859037,
       242.3019055 , 295.69011978, 283.12331913, 217.52238166,
       491.06890089, 229.84738873, 341.18908562, 291.69315602,
       489.92233626, 239.03010489, 597.95447816, 384.07367102,
       452.70618447, 401.4470128 , 405.86539494, 173.12727791,
       423.54102555, 434.16299105, 277.3044564 , 229.84738873,
       448.56536512, 488.89226144, 332.01742952, 465.74081655,
       222.41425711, 386.76114193, 456.59466253, 370.7813079 ,
       469.12413346, 310.3417961 , 426.44966536, 392.08127297,
       347.54993532, 339.61307315, 471.54323795, 243.74234709,
       298.2466504 , 273.2843497 , 250.25244212, 297.97

### Compute Cost Function

In [9]:
def compute_cost(X, y, w, b):
    """
    Squared-error cost of the linear model, accumulated one example at a time
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns:
      cost (scalar): sum of squared errors divided by 2m
    """
    n_examples = X.shape[0]
    squared_error_sum = 0.0
    for idx in range(n_examples):
        prediction = w.dot(X[idx]) + b
        residual = prediction - y[idx]
        squared_error_sum += residual**2
    return squared_error_sum / (2 * n_examples)

In [10]:
# Cost at the scikit-learn reference parameters (loop implementation)
compute_cost(X_train, y_train, w_best, b_best)

219.7113017649621

In [11]:
def compute_cost_matrix(X, y, w, b, verbose=False):
    """
    Computes the squared-error cost for linear regression (vectorized)
     Args:
      X : (array_like Shape (m,n)) data, m examples with n features
      y : (array_like Shape (m,)) actual value
      w : (array_like Shape (n,)) parameters of the model
      b : (scalar               ) parameter of the model
      verbose : (Boolean) If true, print out the intermediate predictions
    Returns
      cost: (scalar) sum of squared errors divided by 2m
    """
    # Unpacking (rather than X.shape[0]) keeps the requirement that X is 2-D.
    m, _ = X.shape

    # Predictions for all examples at once. Named `predictions` so the local
    # does not shadow the notebook's f_wb() function.
    predictions = X @ w + b
    # calculate cost
    total_cost = (1/(2*m)) * np.sum((predictions - y)**2)

    if verbose:
        print("f_wb:")
        print(predictions)

    return total_cost

In [12]:
# Same cost via the vectorized implementation; should match the loop version
compute_cost_matrix(X_train, y_train, w_best, b_best)

219.71130176496206

### Compute Gradient Function

In [13]:
def compute_gradient(X, y, w, b):
    """
    Computes the gradient for linear regression, one example at a time
    Args:
      X (ndarray (m,n)): Data, m examples with n features
      y (ndarray (m,)) : target values
      w (ndarray (n,)) : model parameters
      b (scalar)       : model parameter

    Returns (in this order):
      dj_db (scalar):       The gradient of the cost w.r.t. the parameter b.
      dj_dw (ndarray (n,)): The gradient of the cost w.r.t. the parameters w.
    """
    m, n = X.shape
    dj_dw = np.zeros((n, ))
    dj_db = 0.0

    for i in range(m):
        # prediction error for example i
        err = w.dot(X[i]) + b - y[i]
        # each weight's gradient accumulates the error scaled by its feature
        for j in range(n):
            dj_dw[j] += err * X[i, j]
        dj_db += err

    # average the accumulated sums over the m examples
    dj_dw = dj_dw / m
    dj_db = dj_db / m

    # NOTE: bias gradient first, then weight gradient -- callers unpack
    # the result as `dj_db, dj_dw`.
    return dj_db, dj_dw

In [14]:
# Gradient at the scikit-learn reference parameters (loop implementation);
# the values are expected to be close to zero there.
compute_gradient(X_train, y_train, w_best, b_best)

(4.860112312599085e-14,
 array([ 4.73301043e-11, -6.70752343e-14,  1.13402621e-13, -7.33371053e-11]))

In [15]:
def compute_gradient_matrix(X, y, w, b):
    """
    Computes the gradient for linear regression (vectorized)

    Args:
      X : (array_like Shape (m,n)) data, m examples with n features
      y : (array_like Shape (m,)) actual value
      w : (array_like Shape (n,)) Values of parameters of the model
      b : (scalar )               Values of parameter of the model
    Returns (in this order)
      dj_db: (scalar)                The gradient of the cost w.r.t. the parameter b.
      dj_dw: (array_like Shape (n,)) The gradient of the cost w.r.t. the parameters w.

    """
    # Unpacking (rather than X.shape[0]) keeps the requirement that X is 2-D.
    m, _ = X.shape
    # Prediction error for every example. The intermediate is not named
    # `f_wb`, so it does not shadow the notebook's f_wb() function.
    err = (X @ w + b) - y
    dj_dw  = (1/m) * (X.T @ err)
    dj_db  = (1/m) * np.sum(err)

    # NOTE: bias gradient first, then weight gradient -- callers unpack
    # the result as `dj_db, dj_dw`.
    return dj_db, dj_dw

In [16]:
# Same gradient via the vectorized implementation; should agree with the
# loop version up to floating-point rounding.
compute_gradient_matrix(X_train, y_train, w_best, b_best)

(4.860112312599085e-14,
 array([ 4.59840521e-11, -6.70752343e-14,  1.13686838e-13, -7.33780325e-11]))

### Gradient Descent Function

In [17]:
def gradient_descent(X, y, w_in, b_in, f_cost, f_gradient, alpha, num_iters,
                     max_history=100_000):
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking
    num_iters gradient steps with learning rate alpha

    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters
      b_in (scalar)       : initial model parameter
      f_cost              : function to compute cost, called as
                            f_cost(X, y, w, b)
      f_gradient          : function to compute the gradient, called as
                            f_gradient(X, y, w, b) and returning (dj_db, dj_dw)
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      max_history (int)   : record the cost only for the first max_history
                            iterations, to bound memory use

    Returns:
      w (ndarray (n,)) : Updated values of parameters
      b (scalar)       : Updated value of parameter
      J_history (list) : cost after each of the first max_history iterations
    """
    J_history = []
    w = copy.deepcopy(w_in) # avoid modify global w within function
    b = b_in

    # Report progress roughly ten times over the run. Only evaluated as a
    # modulus inside the loop, so num_iters == 0 never divides by zero.
    print_every = math.ceil(num_iters / 10)

    for i in range(num_iters):
        # calculate the gradient
        dj_db, dj_dw = f_gradient(X, y, w, b)

        # update the parameters
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        cost = None
        if i < max_history: # prevent resource exhaustion
            cost = f_cost(X, y, w, b)
            J_history.append(cost)

        # print cost
        if i % print_every == 0:
            if cost is None:
                # Past the history cap: compute the current cost instead of
                # reporting a stale J_history[-1].
                cost = f_cost(X, y, w, b)
            print(f'Iteration {i:4d}: Cost {cost:8.2f}: w {w}: b {b}')

    return w, b, J_history

In [18]:
# Run gradient descent on the raw (unscaled) features, starting from zeros.
w_init = np.zeros_like(w_best)
b_init = 0.0
alpha = 1e-7  # learning rate
iterations = 1000
w, b, j_hist = gradient_descent(X_train,
                                y_train,
                                w_init,
                                b_init,
                                compute_cost_matrix,
                                compute_gradient_matrix,
                                alpha,
                                iterations)

Iteration    0: Cost 44154.43: w [5.48850468e-02 1.00137228e-04 5.16041620e-05 1.24551926e-03]: b 3.6223952e-05
Iteration  100: Cost  1565.13: w [ 2.53110091e-01  3.99748568e-04  1.58795427e-04 -5.20149806e-04]: b 0.00020954284178614476
Iteration  200: Cost  1560.99: w [ 2.53263981e-01  3.36211727e-04  7.68794965e-05 -6.95667688e-03]: b 0.0002540372436067192
Iteration  300: Cost  1556.92: w [ 2.53416458e-01  2.73231665e-04 -4.94840021e-06 -1.33341039e-02]: b 0.00029884478436808425
Iteration  400: Cost  1552.92: w [ 2.53567535e-01  2.10803270e-04 -8.66890711e-05 -1.96529735e-02]: b 0.00034396258850019504
Iteration  500: Cost  1549.00: w [ 2.53717225e-01  1.48921476e-04 -1.68343316e-04 -2.59138234e-02]: b 0.00038938780683453575
Iteration  600: Cost  1545.15: w [ 2.53865541e-01  8.75812639e-05 -2.49911928e-04 -3.21171864e-02]: b 0.0004351176163647442
Iteration  700: Cost  1541.36: w [ 2.54012495e-01  2.67776603e-05 -3.31395692e-04 -3.82635903e-02]: b 0.00048114922000638046
Iteration  800:

### Z-score Normalization Function

In [19]:
def zscore_normalize_features(X):
    """
    computes  X, z-score normalized by column

    Args:
      X (ndarray (m,n))     : input data, m examples, n features

    Returns:
      X_norm (ndarray (m,n)): input normalized by column; a constant column
                              (zero standard deviation) is mapped to all zeros
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature (returned
                              unmodified, so it may contain zeros)
    """
    # find the mean of each feature
    mu = np.mean(X, axis=0)
    # find the standard deviation of each feature
    sigma = np.std(X, axis=0)
    # A constant feature has sigma == 0; dividing by it would yield NaN/inf.
    # Divide by 1 instead, so the centered (all-zero) column stays zero.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    # element-wise, subtract mean for each column and divide by standard deviation
    X_norm = (X - mu) / safe_sigma

    return X_norm, mu, sigma

## Run Gradient Descent

In [20]:
def run_gradient_descent(X, y, alpha, iterations):
    """
    Runs batch gradient descent from zero-initialized parameters, using the
    vectorized cost and gradient functions

    Args:
      X (ndarray (m,n)) : Data, m examples with n features
      y (ndarray (m,))  : target values
      alpha (float)     : Learning rate
      iterations (int)  : number of gradient steps

    Returns:
      whatever gradient_descent returns: (w, b, j_hist)
    """
    _, n_features = X.shape

    return gradient_descent(
        X,
        y,
        np.zeros(n_features),  # initial weights
        0,                     # initial bias
        compute_cost_matrix,
        compute_gradient_matrix,
        alpha,
        iterations,
    )

In [21]:
# Baseline: gradient descent on the raw (unscaled) features.
alpha = 1e-7  # learning rate
iterations = 1000

w, b, j_hist = run_gradient_descent(X_train, y_train, alpha, iterations)

Iteration    0: Cost 44154.43: w [5.48850468e-02 1.00137228e-04 5.16041620e-05 1.24551926e-03]: b 3.6223952e-05
Iteration  100: Cost  1565.13: w [ 2.53110091e-01  3.99748568e-04  1.58795427e-04 -5.20149806e-04]: b 0.00020954284178614476
Iteration  200: Cost  1560.99: w [ 2.53263981e-01  3.36211727e-04  7.68794965e-05 -6.95667688e-03]: b 0.0002540372436067192
Iteration  300: Cost  1556.92: w [ 2.53416458e-01  2.73231665e-04 -4.94840021e-06 -1.33341039e-02]: b 0.00029884478436808425
Iteration  400: Cost  1552.92: w [ 2.53567535e-01  2.10803270e-04 -8.66890711e-05 -1.96529735e-02]: b 0.00034396258850019504
Iteration  500: Cost  1549.00: w [ 2.53717225e-01  1.48921476e-04 -1.68343316e-04 -2.59138234e-02]: b 0.00038938780683453575
Iteration  600: Cost  1545.15: w [ 2.53865541e-01  8.75812639e-05 -2.49911928e-04 -3.21171864e-02]: b 0.0004351176163647442
Iteration  700: Cost  1541.36: w [ 2.54012495e-01  2.67776603e-05 -3.31395692e-04 -3.82635903e-02]: b 0.00048114922000638046
Iteration  800:

In [22]:
# Learned weights on raw features vs. the scikit-learn reference weights
w, w_best

(array([ 2.54443890e-01, -1.51872883e-04, -5.74533712e-04, -5.63067399e-02]),
 array([  0.26836643, -32.90362407, -67.28804158,  -1.46516763]))

In [23]:
# Learned bias on raw features vs. the scikit-learn reference intercept
b, b_best

(0.0006205565597795537, 221.50226366888353)

## Feature Scaling

- Comparing `w` to `w_best` and `b` to `b_best`, we see that 1000 iterations don't get us very far when $\alpha = 10^{-7}$
- If we scale the features so that they share a comparable range, we can make $\alpha$ vastly larger, so we should get much more out of 1000 iterations

In [24]:
# Z-score normalize each feature column; mu and sigma are the per-feature
# mean and standard deviation used for the scaling.
X_norm, mu, sigma = zscore_normalize_features(X_train)
X_norm.shape, X_norm[0:5]

((100, 4),
 array([[-1.12018542, -1.08793896, -0.78288136,  1.02191098],
        [-0.41174475,  0.44436943, -0.78288136,  0.98312878],
        [ 1.29385043,  0.44436943,  1.27733275, -0.83963464],
        [ 0.75524143,  0.44436943,  1.27733275,  0.12992037],
        [ 1.32296443,  0.44436943,  1.27733275, -0.91719904]]))

In [25]:
# Refit the scikit-learn reference model on the normalized features so its
# parameters are comparable to gradient descent run on X_norm.
model_norm = LinearRegression()
model_norm.fit(X_norm, y_train)
w_best_norm = model_norm.coef_
b_best_norm = model_norm.intercept_
w_best_norm, b_best_norm

(array([110.61335173, -21.47323884, -32.66070323, -37.77938362]), 362.23952)

In [26]:
# Gradient descent on the normalized features, with a much larger learning
# rate than the 1e-7 used on the raw features.
alpha = 1.0e-1
iterations = 1000

w_norm, b_norm, j_hist_norm = run_gradient_descent(X_norm, y_train, alpha, iterations)

Iteration    0: Cost 57326.42: w [ 8.91588014  3.01913493  3.32746878 -5.99326502]: b 36.223952000000004
Iteration  100: Cost   221.73: w [107.8550472  -20.10250817 -31.10498049 -38.30743962]: b 362.23086057045475
Iteration  200: Cost   219.71: w [110.51801435 -21.42511468 -32.60780139 -37.79800254]: b 362.2395197699934
Iteration  300: Cost   219.71: w [110.61005719 -21.47157412 -32.65887703 -37.78002774]: b 362.23951999999383
Iteration  400: Cost   219.71: w [110.61323789 -21.47318131 -32.66064012 -37.77940588]: b 362.23951999999974
Iteration  500: Cost   219.71: w [110.6133478  -21.47323685 -32.66070105 -37.77938439]: b 362.2395199999998
Iteration  600: Cost   219.71: w [110.6133516  -21.47323877 -32.66070315 -37.77938365]: b 362.2395199999998
Iteration  700: Cost   219.71: w [110.61335173 -21.47323884 -32.66070322 -37.77938362]: b 362.2395199999998
Iteration  800: Cost   219.71: w [110.61335173 -21.47323884 -32.66070323 -37.77938362]: b 362.2395199999998
Iteration  900: Cost   219.7

In [27]:
# Learned weights on normalized features vs. the scikit-learn reference
w_norm, w_best_norm

(array([110.61335173, -21.47323884, -32.66070323, -37.77938362]),
 array([110.61335173, -21.47323884, -32.66070323, -37.77938362]))

In [28]:
# Learned bias on normalized features vs. the scikit-learn reference
b_norm, b_best_norm

(362.2395199999998, 362.23952)

By normalizing the training data, we get excellent convergence after just 1000 iterations.

## `mean_squared_error`

In [29]:
# Training MSE of the scikit-learn models: fit on raw features (first value)
# and fit on normalized features (second value).
(mean_squared_error(y_train, model.predict(X_train)),
 mean_squared_error(y_train, model_norm.predict(X_norm)))

(439.4226035299241, 439.4226035299241)

In [30]:
# Training MSE computed by hand for the gradient-descent parameters:
# raw features (w, b) first, normalized features (w_norm, b_norm) second.
# This divides by m, whereas the cost functions above divide by 2m.
(np.sum((y_train - f_wb(X_train, w, b))**2) / X_train.shape[0],
 np.sum((y_train - f_wb(X_norm, w_norm, b_norm))**2) / X_norm.shape[0])

(3060.9320333060627, 439.4226035299239)