<a href="https://colab.research.google.com/github/PaulToronto/Stanford-Andrew-Ng-Machine-Learning-Specialization/blob/main/Gradient_Descent_with_Sympy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gradient Descent with `sympy`

## Imports

In [1]:
import sympy as sym
import pandas as pd
import numpy as np
from math import ceil

from sklearn.linear_model import LinearRegression

## Functions

In [2]:
def predict(X, w, b):
    """Evaluate the linear model ``X @ w + b`` and return it as a sympy matrix.

    Parameters
    ----------
    X : matrix of features, one row per example. Must expose ``.shape`` and
        support ``@`` with a sympy matrix (the notebook passes ``sym.Matrix``).
    w : weights, one per feature. Converted with ``sym.Matrix(w)``, so a list,
        a NumPy array or a sympy matrix are all accepted.
    b : bias. Either a scalar (number or sympy expression), which is expanded
        to one entry per row of ``X``, or an already-expanded column that is
        added as-is.

    Returns
    -------
    sym.Matrix
        The predictions, one row per example.
    """
    w = sym.Matrix(w)

    # Any sympy matrix (mutable, immutable, sparse, ...) counts as an
    # already-expanded bias. Comparing exact types alone would misclassify
    # e.g. an ImmutableMatrix bias paired with a mutable X as a scalar and
    # fail on the multiplication below. The same-type rule is kept as a
    # fallback for non-sympy inputs.
    b_is_expanded = isinstance(b, sym.MatrixBase) or type(b) is type(X)
    if not b_is_expanded:
        # Scalar bias: replicate it into a column with one entry per example.
        b = b * sym.ones(X.shape[0], 1)

    p = X @ w + b
    return sym.Matrix(p)

def compute_cost(X, y, w, b):
    """Return the halved mean squared error of the linear model on ``(X, y)``.

    The value is ``sum((prediction - target)**2) / (2 * m)`` where ``m`` is the
    number of rows of ``X`` and predictions come from ``predict(X, w, b)``.
    ``y`` is converted with ``sym.Matrix`` before use.
    """
    n_examples = X.shape[0]
    targets = sym.Matrix(y)
    residuals = predict(X, w, b) - targets

    # Accumulate the squared residuals entry by entry.
    squared_error = sum(r**2 for r in residuals)
    return squared_error / (2 * n_examples)

def compute_gradient(X, y, w, b):
    """Return the gradient of the squared-error cost with respect to b and w.

    Returns
    -------
    (dj_db, dj_dw)
        ``dj_db`` is the sum of the residuals divided by the number of
        examples; ``dj_dw`` is ``X.T @ residuals`` divided by the number of
        examples. Note the order: the bias gradient comes first.
    """
    # Two-value unpacking keeps the requirement that X has a 2-D shape.
    n_examples, _ = X.shape
    targets = sym.Matrix(y)

    residuals = predict(X, w, b) - targets

    grad_w = (X.T @ residuals) / n_examples
    grad_b = sum(residuals) / n_examples
    return grad_b, grad_w

## Example: Simple Linear Regression

1. Predict the house price, given the `house` data and the ideal parameters.
2. Build a regression model using gradient descent.

## Example: Multiple Linear Regression

1. Predict the house price, given the `houses` data and the ideal parameters.
2. Build a regression model using gradient descent.

In [3]:
# Training data, one row per house; `price` is the target column.
houses = pd.DataFrame(
    [
        [2104, 5, 1, 45, 460],
        [1416, 3, 2, 40, 232],
        [852, 2, 1, 35, 178],
    ],
    columns=['size_sqft', 'bedrooms', 'floors', 'age', 'price'],
)

houses

Unnamed: 0,size_sqft,bedrooms,floors,age,price
0,2104,5,1,45,460
1,1416,3,2,40,232
2,852,2,1,35,178


In [4]:
# Ideal parameters: four weights (one per feature column) and a scalar bias.
# Despite the `_init` suffix, these are the known-good values used for the
# reference predictions, not the starting point of gradient descent.
w_init = np.array([ 0.39133535, 18.75376741, -53.36032453, -26.42131618])
b_init = 785.1811367994083

### Prediction Using Known Ideal Parameters

In [5]:
# Training features: every column of `houses` except the `price` target.
X = sym.Matrix(houses.drop(columns='price'))
X

Matrix([
[2104, 5, 1, 45],
[1416, 3, 2, 40],
[ 852, 2, 1, 35]])

In [6]:
# y_train: the `price` column as a sympy column matrix.
y = sym.Matrix(houses['price'])
y

Matrix([
[460],
[232],
[178]])

In [7]:
# parameter vector
w_1, w_2, w_3, w_4, b = sym.symbols('w_1 w_2 w_3 w_4 b')
w = sym.Matrix((w_1, w_2, w_3, w_4))
display(w)
display(b)

Matrix([
[w_1],
[w_2],
[w_3],
[w_4]])

b

$$
\begin{align}
Xw + b &= y \\
\left[\begin{matrix}2104 & 5 & 1 & 45\\1416 & 3 & 2 & 40\\852 & 2 & 1 & 35\end{matrix}\right]
\cdot \left[\begin{matrix}w_{1}\\w_{2}\\w_{3}\\w_{4}\end{matrix}\right] + b
\left[\begin{matrix}1\\1\\1 \end{matrix}\right] &=
\left[\begin{matrix}460\\232\\178\end{matrix}\right] \\
\left[\begin{matrix}b + 2104 w_{1} + 5 w_{2} + w_{3} + 45 w_{4}\\b + 1416 w_{1} + 3 w_{2} + 2 w_{3} + 40 w_{4}\\b + 852 w_{1} + 2 w_{2} + w_{3} + 35 w_{4}\end{matrix}\right] &= \left[\begin{matrix}460\\232\\178\end{matrix}\right]
\end{align}
$$

In [8]:
# Predict with the ideal parameters; the scalar bias is expanded to one entry
# per row inside `predict`.
predict(X, w_init, b_init)

Matrix([
[459.999997619408],
[231.999998369408],
[177.999998989408]])

In [9]:
# Substitute the ideal numeric values into the symbolic parameters.
ideal_weights = dict(zip(w, w_init))
w_best = w.subs(ideal_weights)

# Expand the substituted bias into a column with one entry per example.
b_best = b.subs(b, b_init) * sym.ones(X.shape[0], 1)

display(w_best, b_best)

Matrix([
[  0.39133535],
[ 18.75376741],
[-53.36032453],
[-26.42131618]])

Matrix([
[785.181136799408],
[785.181136799408],
[785.181136799408]])

In [10]:
# Same prediction, this time passing the bias already expanded to a column
# matrix, so `predict` adds it without broadcasting.
predict(X, w_best, b_best)

Matrix([
[459.999997619408],
[231.999998369408],
[177.999998989408]])

### Regression Model Using Gradient Descent

In [11]:
# Sanity-check compute_cost: at the ideal parameters the cost should be
# very close to zero.
compute_cost(X, y, w_init, b_init)

1.55789044289666e-12

In [12]:
# Sanity-check compute_gradient at the ideal parameters.
# Note the return order: bias gradient first, then the weight gradient.
dj_db, dj_dw = compute_gradient(X, y, w_init, b_init)
display(dj_db)
display(dj_dw)

-1.67392515019552e-6

Matrix([
[-0.00272623577196403],
[-6.27197262777675e-6],
[-2.21745578225333e-6],
[-6.92403390682254e-5]])

In [13]:
def gradient_descent(X, y, w, b, f_cost, f_gradient, alpha, num_iters):
    """Run ``num_iters`` steps of batch gradient descent.

    Parameters
    ----------
    X, y : training data, passed through unchanged to the callbacks.
    w, b : starting weights and bias.
    f_cost : callable ``f_cost(X, y, w, b)`` returning the cost.
    f_gradient : callable ``f_gradient(X, y, w, b)`` returning
        ``(dj_db, dj_dw)``, bias gradient first.
    alpha : learning rate.
    num_iters : number of update steps.

    Returns
    -------
    (w, b, J_history)
        The final parameters and the list of costs, one per step, each
        recorded after that step's update.
    """
    costs = []

    for step in range(num_iters):
        grad_b, grad_w = f_gradient(X, y, w, b)

        # Both gradients were evaluated at the same (w, b), so this is a
        # simultaneous update. New objects are built rather than updating in
        # place, leaving the caller's arrays untouched.
        w, b = w - alpha * grad_w, b - alpha * grad_b

        current_cost = f_cost(X, y, w, b)
        costs.append(current_cost)

        # Report progress about ten times over the whole run.
        report_every = ceil(num_iters / 10)
        if step % report_every == 0:
            print(f'Iteration {step:4d}: Cost {current_cost:8.2f}')

    return w, b, costs



# Start from all-zero parameters: one weight per feature plus a scalar bias.
initial_w = sym.zeros(len(w_init), 1)
initial_b = 0.0

# Hyperparameters for this run.
alpha = 5.0e-7
iterations = 1000

w_final, b_final, J_hist = gradient_descent(
    X, y, initial_w, initial_b,
    f_cost=compute_cost,
    f_gradient=compute_gradient,
    alpha=alpha,
    num_iters=iterations,
)

Iteration    0: Cost  2529.46
Iteration  100: Cost   695.99
Iteration  200: Cost   694.92
Iteration  300: Cost   693.86
Iteration  400: Cost   692.81
Iteration  500: Cost   691.77
Iteration  600: Cost   690.73
Iteration  700: Cost   689.71
Iteration  800: Cost   688.70
Iteration  900: Cost   687.69


In [14]:
# Parameters learned by the gradient-descent run: bias first, then weights.
display(b_final)
display(w_final)

-0.00223540753093253

Matrix([
[  0.203965687318831],
[0.00374919220982854],
[-0.0112487038789788],
[-0.0658613999237372]])

### Compare Our Weights to the Ideal Weights

In [15]:
# Predictions from the parameters learned by gradient descent.
predict(X, w_final, b_final)

Matrix([
[426.185304971892],
[286.167472007856],
[171.467630871323]])

In [16]:
# Predictions from the ideal parameters, for comparison.
predict(X, w_init, b_init)

Matrix([
[459.999997619408],
[231.999998369408],
[177.999998989408]])

In [17]:
# Actual target prices.
y

Matrix([
[460],
[232],
[178]])