<a href="https://colab.research.google.com/github/PaulToronto/Stanford-Andrew-Ng-Machine-Learning-Specialization/blob/main/Gradient_Descent_with_Numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Descent with `numpy`

The `sympy` versions of these functions are developed and tested in the [Gradient Descent with Sympy](https://colab.research.google.com/drive/1-CmDZG76UYe1EkpuzyTY1lZVoqsbsoDG#scrollTo=iJO3dQp2zsEL) notebook.

## Imports

In [1]:
import sympy as sym
import pandas as pd
import numpy as np
from math import ceil

## Toy Datasets

###  Dataset for multiple linear regression

In [2]:
# Toy dataset for multiple linear regression: one row per sample,
# four feature columns followed by the target column.
data_multi = pd.DataFrame(
    [[2104, 5, 1, 45, 460],
     [1416, 3, 2, 40, 232],
     [852, 2, 1, 35, 178]],
    columns=['feature1', 'feature2', 'feature3', 'feature4', 'target'],
)

data_multi

Unnamed: 0,feature1,feature2,feature3,feature4,target
0,2104,5,1,45,460
1,1416,3,2,40,232
2,852,2,1,35,178


In [3]:
# Split the frame into a feature matrix and a target column vector.
X_train_multi = np.array(data_multi.drop('target', axis=1))
# reshape(-1, 1) infers the number of rows, so the cell keeps working
# if samples are added to or removed from `data_multi`.
y_train_multi = np.array(data_multi['target']).reshape(-1, 1)
X_train_multi, y_train_multi

(array([[2104,    5,    1,   45],
        [1416,    3,    2,   40],
        [ 852,    2,    1,   35]]),
 array([[460],
        [232],
        [178]]))

In [4]:
# sympy Matrix copies of the numpy training arrays, used as inputs
# to the `*_sym` functions below.
X_train_multi_sym = sym.Matrix(X_train_multi)
y_train_multi_sym = sym.Matrix(y_train_multi)
display(X_train_multi_sym)
y_train_multi_sym

Matrix([
[2104, 5, 1, 45],
[1416, 3, 2, 40],
[ 852, 2, 1, 35]])

Matrix([
[460],
[232],
[178]])

In [5]:
# Reference (optimal) weights and bias for the multi-feature dataset,
# used to test the functions below. `w` is a (4, 1) column vector.
w_multi_best = np.array([[0.39133535],
                         [18.75376741],
                         [-53.36032453],
                         [-26.42131618]])
b_multi_best = 785.1811367994083
w_multi_best, b_multi_best

(array([[  0.39133535],
        [ 18.75376741],
        [-53.36032453],
        [-26.42131618]]),
 785.1811367994083)

In [6]:
# sympy counterparts of the reference parameters; the bias stays a
# plain Python float.
w_multi_best_sym = sym.Matrix(w_multi_best)
b_multi_best_sym = b_multi_best
display(w_multi_best_sym)
b_multi_best_sym

Matrix([
[  0.39133535],
[ 18.75376741],
[-53.36032453],
[-26.42131618]])

785.1811367994083

### Dataset for simple linear regression

In [7]:
# Toy dataset for simple (single-feature) linear regression.
data_simple = pd.DataFrame(
    [[1, 300],
     [2, 500]],
    columns=['feature', 'target'],
)
data_simple

Unnamed: 0,feature,target
0,1,300
1,2,500


In [8]:
# Split the frame into a feature matrix and a target column vector.
X_train_simple = np.array(data_simple.drop('target', axis=1))
# reshape(-1, 1) infers the number of rows instead of hard-coding it,
# so the cell does not break when the dataset size changes.
y_train_simple = np.array(data_simple['target']).reshape(-1, 1)
X_train_simple, y_train_simple

(array([[1],
        [2]]),
 array([[300],
        [500]]))

In [9]:
# sympy Matrix copies of the single-feature training arrays, used as
# inputs to the `*_sym` functions below.
X_train_simple_sym = sym.Matrix(X_train_simple)
y_train_simple_sym = sym.Matrix(y_train_simple)
display(X_train_simple_sym)
y_train_simple_sym

Matrix([
[1],
[2]])

Matrix([
[300],
[500]])

In [10]:
# Reference (optimal) weight and bias for the single-feature dataset,
# used to test the functions below. `w` is kept as a (1, 1) array.
w_simple_best, b_simple_best = np.array([[200]]), 100
w_simple_best, b_simple_best

(array([[200]]), 100)

In [11]:
# sympy counterparts of the single-feature reference parameters; the
# bias stays a plain Python int.
w_simple_best_sym = sym.Matrix(w_simple_best)
b_simple_best_sym = b_simple_best
display(w_simple_best_sym)
b_simple_best_sym

Matrix([[200]])

100

## The Model Prediction

$$
f_{\mathbf{w},b}(\mathbf{x}^{(i)}) = \mathbf{w}\cdot\mathbf{x}^{(i)} + b
$$

where

- $\mathbf{x}^{(i)}$ is a vector representing the $i^{th}$ row of $\mathbf{X}$
- $\mathbf{w}$ is a column vector containing the weights of the model
- $b$ is a scalar representing the bias

### `sympy` version

In [12]:
# `X * w` is used instead of `X @ w`
#   so that the function also works
#   for simple linear regression
def f_wb_sym(X, w, b):
    """Symbolic linear-model prediction: `X * w` plus the bias `b`.

    Parameters
    ----------
    X : sympy Matrix whose rows are samples.
    w : weights multiplied on the right of `X`.
    b : scalar bias, added to every row of the product.

    Returns
    -------
    The sympy Matrix `X * w + b * ones(m, 1)`, where `m` is the
    number of rows of `X`.
    """
    n_rows = X.shape[0]
    # sympy has no broadcasting, so spread the scalar bias over an
    # explicit column of ones before adding it.
    bias_column = b * sym.ones(n_rows, 1)
    return X * w + bias_column

In [13]:
f_wb_sym(X_train_multi_sym, w_multi_best_sym, b_multi_best_sym)

Matrix([
[459.999997619408],
[231.999998369408],
[177.999998989408]])

In [14]:
f_wb_sym(X_train_simple_sym, w_simple_best_sym, b_simple_best_sym)

Matrix([
[300],
[500]])

### `numpy` version

In [15]:
def f_wb(X, w, b):
    """Linear-model prediction `X @ w + b`.

    Parameters
    ----------
    X : feature matrix with one sample per row, or a single 1-D row
        (both forms are exercised in this notebook).
    w : column vector of weights.
    b : scalar bias; numpy broadcasting adds it to every prediction.

    Returns
    -------
    The array `X @ w + b`.
    """
    # No row count is needed: broadcasting handles the bias, so `X` may
    # be any array-like that supports `@` with `w`.
    return X @ w + b

In [16]:
# works with multiple rows
f_wb(X_train_multi, w_multi_best, b_multi_best)

array([[459.99999762],
       [231.99999837],
       [177.99999899]])

In [17]:
# works with a single row
f_wb(X_train_multi[0], w_multi_best, b_multi_best)

array([459.99999762])

In [18]:
# works with simple regression
f_wb(X_train_simple, w_simple_best, b_simple_best)

array([[300],
       [500]])

In [19]:
# works with simple regression, single row
f_wb(X_train_simple[0], w_simple_best, b_simple_best)

array([300])

### Speed comparison: `sympy` vs `numpy`

In [20]:
%%timeit -r7 -n1000
f_wb_sym(X_train_multi_sym, w_multi_best_sym, b_multi_best_sym)

954 µs ± 159 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
%%timeit -r7 -n1000
f_wb(X_train_multi, w_multi_best, b_multi_best)

The slowest run took 5.83 times longer than the fastest. This could mean that an intermediate result is being cached.
22.4 µs ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## The Cost Function

$$
J(\mathbf{w},b) = \frac{1}{2m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})^2
$$

### `sympy` versions

In [22]:
def compute_cost_loop_sym(X, y, w, b):
    """Squared-error cost J(w, b), accumulated one sample at a time (sympy).

    Parameters
    ----------
    X : sympy Matrix of samples, one per row.
    y : sympy Matrix of targets, indexed by sample number.
    w, b : model parameters passed through to `f_wb_sym`.

    Returns
    -------
    The single entry of the accumulated 1x1 cost matrix, i.e. the sum
    of squared residuals divided by `2 * m`.
    """
    n_samples = X.shape[0]
    # 1x1 accumulator seeded with a float zero
    running = sym.Matrix([0.0])

    for row in range(n_samples):
        residual = f_wb_sym(X[row, :], w, b) - sym.Matrix([y[row]])
        running = running + residual.applyfunc(lambda r: r**2)

    return (running / (2 * n_samples))[0]

In [23]:
def compute_cost_sym(X, y, w, b):
    """Squared-error cost J(w, b), computed on whole matrices (sympy).

    Predicts every sample at once with `f_wb_sym`, squares each
    residual, then sums the entries and divides by `2 * m`.
    """
    residuals = f_wb_sym(X, w, b) - y
    squared = residuals.applyfunc(lambda r: r**2)
    # the builtin `sum` iterates over the entries of the sympy Matrix
    return sum(squared) / (2 * X.shape[0])

In [24]:
compute_cost_sym(X_train_multi_sym,
                 y_train_multi_sym,
                 w_multi_best_sym,
                 b_multi_best_sym)

1.55789044289666e-12

In [25]:
compute_cost_sym(X_train_simple_sym,
                 y_train_simple_sym,
                 w_simple_best_sym,
                 b_simple_best_sym)

0

### `numpy` versions

#### `compute_cost_loop`

In [26]:
def compute_cost_loop(X, y, w, b):
    """Squared-error cost J(w, b), accumulated one sample at a time (numpy).

    Parameters
    ----------
    X : array of samples, one per row.
    y : array of targets; `y[i]` is expected to be a length-1 array
        (column-vector targets, as built in this notebook).
    w, b : model parameters passed through to `f_wb`.

    Returns
    -------
    The first element of the accumulated cost array.
    """
    n_samples = X.shape[0]

    total = 0.0
    for row in range(n_samples):
        residual = f_wb(X[row], w, b) - y[row]
        total = total + residual**2

    # `total` is an array after the first iteration; unwrap the scalar.
    return (total / (2 * n_samples))[0]

In [27]:
# test for multiple linear regression
compute_cost_loop(X_train_multi, y_train_multi, w_multi_best, b_multi_best)

1.5578904428966628e-12

In [28]:
# test for simple linear regression
compute_cost_loop(X_train_simple, y_train_simple, w_simple_best, b_simple_best)

0.0

#### `compute_cost`

In [29]:
def compute_cost(X, y, w, b):
    """Vectorized squared-error cost J(w, b) (numpy).

    Parameters
    ----------
    X : array of samples, one per row.
    y : array of targets; assumed to be an (m, 1) column so that the
        column-wise sum below has a single entry.
    w, b : model parameters passed through to `f_wb`.

    Returns
    -------
    The first entry of the column-wise sum of squared residuals
    divided by `2 * m`.
    """
    m = X.shape[0]
    squared_err = (f_wb(X, w, b) - y) ** 2
    # np.sum reduces inside numpy; the builtin `sum` would loop over the
    # rows in Python, which defeats the point of the vectorized version.
    cost = np.sum(squared_err, axis=0) / (2 * m)
    return cost[0]

In [30]:
# test for multiple linear regression
compute_cost(X_train_multi, y_train_multi, w_multi_best, b_multi_best)

1.5578904045996674e-12

In [31]:
# test for simple linear regression
compute_cost(X_train_simple, y_train_simple, w_simple_best, b_simple_best)

0.0

### Speed comparison: `sympy` vs `numpy`

In [32]:
%%timeit -r7 -n1000
compute_cost_loop_sym(X_train_multi_sym,
                      y_train_multi_sym,
                      w_multi_best_sym,
                      b_multi_best_sym)

The slowest run took 4.40 times longer than the fastest. This could mean that an intermediate result is being cached.
3.03 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [33]:
%%timeit -r7 -n1000
compute_cost_sym(X_train_multi_sym,
                 y_train_multi_sym,
                 w_multi_best_sym,
                 b_multi_best_sym)

752 µs ± 129 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [34]:
%%timeit -r7 -n1000
compute_cost_loop(X_train_multi,
                  y_train_multi,
                  w_multi_best,
                  b_multi_best)

25.5 µs ± 3.75 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [35]:
%%timeit -r7 -n1000
compute_cost(X_train_multi,
             y_train_multi,
             w_multi_best,
             b_multi_best)

15.4 µs ± 2.58 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## The Gradient

$$
\begin{align}
\frac{\partial J(\mathbf{w},b)}{\partial w_j}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})x_{j}^{(i)} \\
\frac{\partial J(\mathbf{w},b)}{\partial b}  &= \frac{1}{m} \sum\limits_{i = 0}^{m-1} (f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)})
\end{align}
$$

### `sympy` versions

In [36]:
def compute_gradient_loop_sym(X, y, w, b):
    """Gradient of J(w, b), accumulated with explicit loops (sympy).

    Parameters
    ----------
    X : sympy Matrix of samples, one per row.
    y : sympy Matrix of targets, indexed by sample number.
    w, b : model parameters passed through to `f_wb_sym`.

    Returns
    -------
    (dj_db, dj_dw): the bias gradient and an (n, 1) Matrix of weight
    gradients, both averaged over the `m` samples.
    """
    n_samples, n_features = X.shape

    grad_w = sym.zeros(n_features, 1)
    grad_b = 0.0

    for row in range(n_samples):
        # prediction error for this sample
        residual = f_wb_sym(X[row, :], w, b)[0] - y[row]
        for col in range(n_features):
            grad_w[col] += residual * X[row, col]
        grad_b += residual

    return grad_b / n_samples, grad_w / n_samples

In [37]:
def compute_gradient_sym(X, y, w, b):
    """Gradient of J(w, b), computed with matrix operations (sympy).

    Returns
    -------
    (dj_db, dj_dw): the mean residual, and `X.T @ residuals / m`.
    """
    n_samples, _ = X.shape

    residuals = f_wb_sym(X, w, b) - y

    grad_w = (X.T @ residuals) / n_samples
    # the builtin `sum` iterates over the entries of the sympy Matrix
    grad_b = sum(residuals) / n_samples

    return grad_b, grad_w

In [38]:
dj_db, dj_dw = compute_gradient_sym(X_train_multi_sym,
                                    y_train_multi_sym,
                                    w_multi_best_sym,
                                    b_multi_best_sym)
display(dj_db)
display(dj_dw)

-1.67392515019552e-6

Matrix([
[-0.00272623577196403],
[-6.27197262777675e-6],
[-2.21745578225333e-6],
[-6.92403390682254e-5]])

In [39]:
dj_db, dj_dw = compute_gradient_sym(X_train_simple_sym,
                                    y_train_simple_sym,
                                    w_simple_best_sym,
                                    b_simple_best_sym)
display(dj_db)
display(dj_dw)

0

Matrix([[0]])

### `numpy` versions

#### `compute_gradient_loop`

In [40]:
def compute_gradient_loop(X, y, w, b):
    """Gradient of J(w, b), accumulated with explicit loops (numpy).

    Parameters
    ----------
    X : (m, n) array of samples.
    y : array of targets; `y[i]` is expected to be a length-1 array
        (column-vector targets, as built in this notebook).
    w, b : model parameters passed through to `f_wb`.

    Returns
    -------
    (dj_db, dj_dw): the scalar bias gradient and an (n, 1) array of
    weight gradients, both averaged over the `m` samples.
    """
    n_samples, n_features = X.shape

    grad_w = np.zeros((n_features, 1))
    grad_b = 0.0

    for row in range(n_samples):
        # prediction error for this sample (length-1 array)
        residual = f_wb(X[row], w, b)[0] - y[row]
        for col in range(n_features):
            grad_w[col] = grad_w[col] + residual * X[row, col]
        grad_b = grad_b + residual

    grad_w = grad_w / n_samples
    grad_b = grad_b / n_samples

    # `grad_b` became a length-1 array once a residual was added
    return grad_b[0], grad_w

In [41]:
# test for multiple linear regression
dj_db, dj_dw = compute_gradient_loop(X_train_multi,
                                     y_train_multi,
                                     w_multi_best,
                                     b_multi_best)

dj_db, dj_dw

(-1.6739251501955248e-06,
 array([[-2.72623577e-03],
        [-6.27197263e-06],
        [-2.21745578e-06],
        [-6.92403391e-05]]))

In [42]:
# test for simple linear regression
dj_db, dj_dw = compute_gradient_loop(X_train_simple,
                                     y_train_simple,
                                     w_simple_best,
                                     b_simple_best)

dj_db, dj_dw

(0.0, array([[0.]]))

#### `compute_gradient`

In [43]:
def compute_gradient(X, y, w, b):
    """Vectorized gradient of J(w, b) (numpy).

    Parameters
    ----------
    X : (m, n) array of samples; the two-value unpack below requires
        it to be 2-D.
    y : array of targets; assumed to be an (m, 1) column so that the
        column-wise sum below has a single entry.
    w, b : model parameters passed through to `f_wb`.

    Returns
    -------
    (dj_db, dj_dw): the first entry of the column-wise mean residual,
    and `X.T @ err / m`.
    """
    m, _ = X.shape

    err = f_wb(X, w, b) - y

    dj_dw = (X.T @ err) / m
    # np.sum reduces inside numpy; the builtin `sum` would loop over the
    # rows in Python.
    dj_db = np.sum(err, axis=0) / m

    return dj_db[0], dj_dw

In [44]:
# test for multiple linear regression
dj_db, dj_dw = compute_gradient(X_train_multi,
                                y_train_multi,
                                w_multi_best,
                                b_multi_best)

dj_db, dj_dw

(-1.6739251122999121e-06,
 array([[-2.72623574e-03],
        [-6.27197255e-06],
        [-2.21745574e-06],
        [-6.92403377e-05]]))

In [45]:
# test for simple linear regression
dj_db, dj_dw = compute_gradient(X_train_simple,
                                y_train_simple,
                                w_simple_best,
                                b_simple_best)

dj_db, dj_dw

(0.0, array([[0.]]))

### Speed comparison: `sympy` vs `numpy`

In [46]:
%%timeit -r7 -n1000
compute_gradient_loop_sym(X_train_multi_sym,
                         y_train_multi_sym,
                         w_multi_best_sym,
                         b_multi_best_sym)

1.78 ms ± 289 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [47]:
%%timeit -r7 -n1000
compute_gradient_sym(X_train_multi_sym,
                     y_train_multi_sym,
                     w_multi_best_sym,
                     b_multi_best_sym)

1.09 ms ± 218 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [48]:
%%timeit -r7 -n1000
compute_gradient_loop(X_train_multi,
                     y_train_multi,
                     w_multi_best,
                     b_multi_best)

123 µs ± 9.15 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [49]:
%%timeit -r7 -n1000
compute_gradient(X_train_multi,
                 y_train_multi,
                 w_multi_best,
                 b_multi_best)

29.6 µs ± 9.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Gradient Descent

### `sympy` version

In [50]:
def gradient_descent_sym(X, y, w, b, f_cost, f_gradient, alpha, num_iters):
    """Batch gradient descent on sympy matrices.

    Parameters
    ----------
    X, y : training data, forwarded unchanged to `f_gradient` and `f_cost`.
    w : initial weights; wrapped with `sym.Matrix([w])` so a scalar
        also works.
    b : initial bias.
    f_cost : callable `(X, y, w, b) -> cost`.
    f_gradient : callable `(X, y, w, b) -> (dj_db, dj_dw)`.
    alpha : learning rate.
    num_iters : number of update steps to run.

    Returns
    -------
    (w, b, J_history): the final parameters and the list of costs
    recorded after every update.
    """

    w = sym.Matrix([w]) # so it works with simple regression

    J_history = []

    # Loop on the `num_iters` argument; relying on a notebook-level
    # `iterations` variable would ignore the caller's value and fail
    # when no such global exists.
    for i in range(num_iters):
        dj_db, dj_dw = f_gradient(X, y, w, b)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        J_history.append(f_cost(X, y, w, b))

        # print cost roughly ten times over the whole run
        if i % ceil(num_iters / 10) == 0:
            print(f'Iteration {i:4d}: Cost {J_history[-1]:8.2f}')

    return w, b, J_history

In [51]:
# test multiple linear regression with `compute_cost_sym`
# and `compute_gradient_sym`
# Start from all-zero weights (one per reference weight) and zero bias.
initial_w = sym.zeros(w_multi_best.shape[0], 1)
initial_b = 0.0
iterations = 1000
alpha = 5.0e-7

w_final, b_final, J_history = gradient_descent_sym(X_train_multi_sym,
                                                   y_train_multi_sym,
                                                   initial_w,
                                                   initial_b,
                                                   compute_cost_sym,
                                                   compute_gradient_sym,
                                                   alpha,
                                                   iterations)

print('w_final:')
display(w_final)
print('b_final:')
display(b_final)
print('Cost:')
J_history[-1]

Iteration    0: Cost  2529.46
Iteration  100: Cost   695.99
Iteration  200: Cost   694.92
Iteration  300: Cost   693.86
Iteration  400: Cost   692.81
Iteration  500: Cost   691.77
Iteration  600: Cost   690.73
Iteration  700: Cost   689.71
Iteration  800: Cost   688.70
Iteration  900: Cost   687.69
w_final:


Matrix([
[  0.203965687318831],
[0.00374919220982854],
[-0.0112487038789788],
[-0.0658613999237372]])

b_final:


-0.00223540753093253

Cost:


686.703411666521

In [52]:
# test simple linear regression with `compute_cost_sym`
# and `compute_gradient_sym`
initial_w = 0
initial_b = 0
iterations = 10_000
alpha = 1.0e-2
# Pass the weights before the bias, matching the
# `gradient_descent_sym(X, y, w, b, ...)` signature.
w_final, b_final, J_history = gradient_descent_sym(X_train_simple_sym,
                                                   y_train_simple_sym,
                                                   initial_w,
                                                   initial_b,
                                                   compute_cost_sym,
                                                   compute_gradient_sym,
                                                   alpha,
                                                   iterations)

print('w_final:')
display(w_final)
print('b_final:')
display(b_final)
print('Cost:')
J_history[-1]

Iteration    0: Cost 79274.81
Iteration 1000: Cost     3.41
Iteration 2000: Cost     0.79
Iteration 3000: Cost     0.18
Iteration 4000: Cost     0.04
Iteration 5000: Cost     0.01
Iteration 6000: Cost     0.00
Iteration 7000: Cost     0.00
Iteration 8000: Cost     0.00
Iteration 9000: Cost     0.00
w_final:


Matrix([[199.992850751318]])

b_final:


100.011567727362

Cost:


6.74501466258040e-6

### `numpy` version

In [53]:
def gradient_descent(X, y, w, b, f_cost, f_gradient, alpha, num_iters):
    """Batch gradient descent on numpy arrays.

    Parameters
    ----------
    X, y : training data, forwarded unchanged to `f_gradient` and `f_cost`.
    w : initial weights; a scalar is promoted to a (1, 1) array.
    b : initial bias.
    f_cost : callable `(X, y, w, b) -> cost`.
    f_gradient : callable `(X, y, w, b) -> (dj_db, dj_dw)`.
    alpha : learning rate.
    num_iters : number of update steps to run.

    Returns
    -------
    (w, b, J_history): the final parameters and the list of costs
    recorded after every update.
    """
    # A scalar weight (simple regression) becomes a 1x1 array so the
    # update below works the same way in both cases.
    w = np.array(w).reshape(1, 1) if np.isscalar(w) else w

    J_history = []

    for step in range(num_iters):
        grad_b, grad_w = f_gradient(X, y, w, b)

        w, b = w - alpha * grad_w, b - alpha * grad_b

        latest_cost = f_cost(X, y, w, b)
        J_history.append(latest_cost)

        # report roughly ten times over the whole run
        if step % ceil(num_iters / 10) == 0:
            print(f'Iteration {step:4d}: Cost {latest_cost:8.2f}')

    return w, b, J_history

In [54]:
# test multiple linear regression with `compute_cost_loop`
# and `compute_gradient_loop`

# Start from all-zero weights (one per reference weight) and zero bias.
initial_w = np.zeros((w_multi_best.shape[0], 1))
initial_b = 0.0
iterations = 1000
alpha = 5.0e-7

w_final, b_final, J_history = gradient_descent(X_train_multi,
                                               y_train_multi,
                                               initial_w,
                                               initial_b,
                                               compute_cost_loop,
                                               compute_gradient_loop,
                                               alpha,
                                               iterations)

print('w_final:')
display(w_final)
print('b_final:')
display(b_final)
print('Cost:')
J_history[-1]

Iteration    0: Cost  2529.46
Iteration  100: Cost   695.99
Iteration  200: Cost   694.92
Iteration  300: Cost   693.86
Iteration  400: Cost   692.81
Iteration  500: Cost   691.77
Iteration  600: Cost   690.73
Iteration  700: Cost   689.71
Iteration  800: Cost   688.70
Iteration  900: Cost   687.69
w_final:


array([[ 0.20396569],
       [ 0.00374919],
       [-0.0112487 ],
       [-0.0658614 ]])

b_final:


-0.002235407530932535

Cost:


686.7034116665205

In [55]:
# test multiple linear regression with `compute_cost`
# and `compute_gradient`

# Same starting point and hyperparameters as the loop-based run, so
# the two outputs can be compared directly.
initial_w = np.zeros((w_multi_best.shape[0], 1))
initial_b = 0.0
iterations = 1000
alpha = 5.0e-7

w_final, b_final, J_history = gradient_descent(X_train_multi,
                                               y_train_multi,
                                               initial_w,
                                               initial_b,
                                               compute_cost,
                                               compute_gradient,
                                               alpha,
                                               iterations)

print('w_final:')
display(w_final)
print('b_final:')
display(b_final)
print('Cost:')
J_history[-1]

Iteration    0: Cost  2529.46
Iteration  100: Cost   695.99
Iteration  200: Cost   694.92
Iteration  300: Cost   693.86
Iteration  400: Cost   692.81
Iteration  500: Cost   691.77
Iteration  600: Cost   690.73
Iteration  700: Cost   689.71
Iteration  800: Cost   688.70
Iteration  900: Cost   687.69
w_final:


array([[ 0.20396569],
       [ 0.00374919],
       [-0.0112487 ],
       [-0.0658614 ]])

b_final:


-0.002235407530932535

Cost:


686.7034116665213

In [56]:
# test simple linear regression with `compute_cost_loop`
# and `compute_gradient_loop`
initial_w = 0
initial_b = 0
iterations = 10_000
alpha = 1.0e-2
# Pass the weights before the bias, matching the
# `gradient_descent(X, y, w, b, ...)` signature.
w_final, b_final, J_history = gradient_descent(X_train_simple,
                                               y_train_simple,
                                               initial_w,
                                               initial_b,
                                               compute_cost_loop,
                                               compute_gradient_loop,
                                               alpha,
                                               iterations)

print('w_final:')
display(w_final)
print('b_final:')
display(b_final)
print('Cost:')
J_history[-1]

Iteration    0: Cost 79274.81
Iteration 1000: Cost     3.41
Iteration 2000: Cost     0.79
Iteration 3000: Cost     0.18
Iteration 4000: Cost     0.04
Iteration 5000: Cost     0.01
Iteration 6000: Cost     0.00
Iteration 7000: Cost     0.00
Iteration 8000: Cost     0.00
Iteration 9000: Cost     0.00
w_final:


array([[199.99285075]])

b_final:


100.011567727362

Cost:


6.745014662580395e-06

In [57]:
# test simple linear regression with `compute_cost`
# and `compute_gradient`
initial_w = 0
initial_b = 0
iterations = 10_000
alpha = 1.0e-2
# Pass the weights before the bias, matching the
# `gradient_descent(X, y, w, b, ...)` signature.
w_final, b_final, J_history = gradient_descent(X_train_simple,
                                               y_train_simple,
                                               initial_w,
                                               initial_b,
                                               compute_cost,
                                               compute_gradient,
                                               alpha,
                                               iterations)

print('w_final:')
display(w_final)
print('b_final:')
display(b_final)
print('Cost:')
J_history[-1]

Iteration    0: Cost 79274.81
Iteration 1000: Cost     3.41
Iteration 2000: Cost     0.79
Iteration 3000: Cost     0.18
Iteration 4000: Cost     0.04
Iteration 5000: Cost     0.01
Iteration 6000: Cost     0.00
Iteration 7000: Cost     0.00
Iteration 8000: Cost     0.00
Iteration 9000: Cost     0.00
w_final:


array([[199.99285075]])

b_final:


100.011567727362

Cost:


6.745014662580395e-06

### Speed comparison: `sympy` vs `numpy`

To do this, we need versions of the `gradient_descent` functions that do not print their progress.

In [58]:
def gradient_descent_sym_no_print(X, y, w, b, f_cost, f_gradient, alpha, num_iters):
    """Silent variant of sympy gradient descent, used for timing.

    Parameters
    ----------
    X, y : training data, forwarded unchanged to `f_gradient` and `f_cost`.
    w : initial weights; wrapped with `sym.Matrix([w])` so a scalar
        also works.
    b : initial bias.
    f_cost : callable `(X, y, w, b) -> cost`.
    f_gradient : callable `(X, y, w, b) -> (dj_db, dj_dw)`.
    alpha : learning rate.
    num_iters : number of update steps to run.

    Returns
    -------
    (w, b, cost): the final parameters and the cost evaluated at them.
    """

    w = sym.Matrix([w]) # so it works with simple regression

    cost = None

    # Loop on the `num_iters` argument rather than a notebook-level
    # `iterations` variable.
    for _ in range(num_iters):
        dj_db, dj_dw = f_gradient(X, y, w, b)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        # evaluated on every step so the timed work matches the
        # printing version
        cost = f_cost(X, y, w, b)

    # No update ran: report the cost of the starting parameters
    # instead of failing on an unassigned name.
    if cost is None:
        cost = f_cost(X, y, w, b)

    return w, b, cost

def gradient_descent_no_print(X, y, w, b, f_cost, f_gradient, alpha, num_iters):
    """Silent variant of numpy gradient descent, used for timing.

    Parameters
    ----------
    X, y : training data, forwarded unchanged to `f_gradient` and `f_cost`.
    w : initial weights; a scalar is promoted to a (1, 1) array.
    b : initial bias.
    f_cost : callable `(X, y, w, b) -> cost`.
    f_gradient : callable `(X, y, w, b) -> (dj_db, dj_dw)`.
    alpha : learning rate.
    num_iters : number of update steps to run.

    Returns
    -------
    (w, b, cost): the final parameters and the cost evaluated at them.
    """

    # so that it works for simple regression
    if np.isscalar(w):
        w = np.array(w).reshape(1, 1)

    cost = None

    for _ in range(num_iters):
        dj_db, dj_dw = f_gradient(X, y, w, b)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        # evaluated on every step so the timed work matches the
        # printing version
        cost = f_cost(X, y, w, b)

    # No update ran: report the cost of the starting parameters
    # instead of failing on an unassigned name.
    if cost is None:
        cost = f_cost(X, y, w, b)

    return w, b, cost

In [59]:
# Starting point and hyperparameters for the sympy timing runs below.
initial_w = sym.zeros(w_multi_best.shape[0], 1)
initial_b = 0.0
iterations = 1000
alpha = 5.0e-7

In [60]:
%%timeit -r3 -n50
gradient_descent_sym_no_print(X_train_multi_sym,
                              y_train_multi_sym,
                              initial_w,
                              initial_b,
                              compute_cost_loop_sym,
                              compute_gradient_loop_sym,
                              alpha,
                              iterations)

5.18 s ± 17.1 ms per loop (mean ± std. dev. of 3 runs, 50 loops each)


In [61]:
%%timeit -r3 -n50
gradient_descent_sym_no_print(X_train_multi_sym,
                             y_train_multi_sym,
                             initial_w,
                             initial_b,
                             compute_cost_sym,
                             compute_gradient_sym,
                             alpha,
                             iterations)

3.37 s ± 20.5 ms per loop (mean ± std. dev. of 3 runs, 50 loops each)


In [62]:
# Starting point and hyperparameters for the numpy timing runs below;
# `initial_w` is rebuilt as a numpy array, replacing the sympy one.
initial_w = np.zeros((w_multi_best.shape[0], 1))
initial_b = 0.0
iterations = 1000
alpha = 5.0e-7

In [63]:
%%timeit -r3 -n50
gradient_descent_no_print(X_train_multi,
                          y_train_multi,
                          initial_w,
                          initial_b,
                          compute_cost_loop,
                          compute_gradient_loop,
                          alpha,
                          iterations)

102 ms ± 12.9 ms per loop (mean ± std. dev. of 3 runs, 50 loops each)


In [64]:
%%timeit -r3 -n50
gradient_descent_no_print(X_train_multi,
                          y_train_multi,
                          initial_w,
                          initial_b,
                          compute_cost,
                          compute_gradient,
                          alpha,
                          iterations)

51.8 ms ± 9.83 ms per loop (mean ± std. dev. of 3 runs, 50 loops each)
