In [None]:
'''
loss = (y_pred — y)²/n
loss = (y_pred² + y² — 2y*y_pred)/n (expanding the whole square)
=>( (x*w+b)² + y² — 2y*(x*w+b))/n (substitute y_pred)
=> ((x*w+b)²/n ) + (y²/n) + ((-2y(x*w+b))/n) (splitting the terms)
Let A = ((x*w+b)²/n )
Let B = (y²/n), 
Let C = ((-2y(x*w+b))/n)
A = ( x²w² + b² + 2xwb )/n (expanding)
∂A/∂w = ( 2x²w + 2xb )/n (differentiating)
∂B/∂w = 0 (differentiating)
C = (-2yxw — 2yb)/n
∂C/∂w = (-2yx)/n (differentiating)

∂loss/∂w = (2x²w + 2xb — 2yx)/n
=> (2x(x*w + b — y))/n

dw=(2/n)*Σ((y_pred — y)*x)   (Σ = sum over all n samples)
db=(2/n)*Σ(y_pred — y)       (same steps as above, differentiating w.r.t. b)
'''

In [1]:
def gradientdescent(x, y, learningrate=0.05, epochs=50, weight=0.1, bias=0.1, verbose=True):
  """Fit the line ``y = x*weight + bias`` by batch gradient descent on the MSE loss.

  Each epoch does one forward pass (predictions + mean squared error) and one
  backward pass (gradients of the MSE w.r.t. weight and bias, then a parameter
  update scaled by the learning rate).

  Parameters
  ----------
  x, y : 1-D array-like of numbers with the same length
      Feature values and their targets. Lists are accepted and converted to
      float NumPy arrays.
  learningrate : float, default 0.05
      Step size applied to the gradients. Usually a small number somewhere
      between 0.1 and 0.0000001 (approx.), chosen by trial and error.
  epochs : int, default 50
      Number of full passes (iterations) over the data.
  weight, bias : float, default 0.1
      Starting values of the two parameters.
  verbose : bool, default True
      When true, print bias, weight and MSE once per epoch.

  Returns
  -------
  tuple of (float, float)
      The final ``(weight, bias)`` after the last epoch.

  Raises
  ------
  ValueError
      If the inputs are not 1-D, differ in shape, or are empty.
  """
  x = np.asarray(x, dtype=float)
  y = np.asarray(y, dtype=float)
  if x.ndim != 1 or x.shape != y.shape:
    raise ValueError("x and y must be 1-D sequences of the same length")
  n = x.size
  if n == 0:
    # Guard the 2/n factor below against division by zero.
    raise ValueError("x and y must contain at least one sample")

  weight = float(weight)
  bias = float(bias)

  for i in range(epochs):  # epochs are iterations
    # Forward propagation: predictions and the loss they produce.
    # The MSE is reported for the parameters *before* this epoch's update.
    ypred = x * weight + bias
    error = y - ypred
    mse = float(np.mean(error ** 2))

    # Backward propagation, step 1: gradients of the MSE.
    dw = -(2 / n) * float(np.sum(x * error))  # derivative w.r.t. weight
    db = -(2 / n) * float(np.sum(error))      # derivative w.r.t. bias

    # Backward propagation, step 2: move against the gradient.
    # This reduces the weight when the slope of the loss at the current weight
    # is positive and increases it when the slope is negative. The gradient is
    # scaled by the learning rate rather than subtracted directly, because a
    # full-size step could change the parameter too much and land even further
    # away from the minimum.
    weight = weight - (learningrate * dw)
    bias = bias - (learningrate * db)

    if verbose:
      print(f"Iterations -: {i}, bias -: {bias}, weight -: {weight}, MSE -: {mse}")

  return weight, bias

In [2]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
# Toy training set: four (feature, target) pairs for a one-variable linear fit.
x = np.array([1, 2, 3, 4])
y = np.array([10, 12, 20, 25])

In [4]:
# Run gradient descent on the toy data; bias, weight and MSE are printed once per epoch.
gradientdescent(x,y)

Iterations -: 0, bias -: 1.74, weight -: 4.85, MSE -: 304.33500000000004
Iterations -: 1, bias -: 2.0285, weight -: 5.6275, MSE -: 10.151350000000008
Iterations -: 2, bias -: 2.093775, weight -: 5.74975, MSE -: 2.135152874999998
Iterations -: 3, bias -: 2.12196, weight -: 5.76399375, MSE -: 1.9072832506249988
Iterations -: 4, bias -: 2.1437655625, weight -: 5.7605084374999995, MSE -: 1.8916610056429681
Iterations -: 5, bias -: 2.1642618968750003, weight -: 5.75418571875, MSE -: 1.8820949985420456
Iterations -: 6, bias -: 2.1842892775000005, weight -: 5.74748095546875, MSE -: 1.8729654313654303
Iterations -: 7, bias -: 2.203990110882813, weight -: 5.740797919492187, MSE -: 1.8641112904817663
Iterations -: 8, bias -: 2.2233916199214847, weight -: 5.734201952152343, MSE -: 1.855520362583569
Iterations -: 9, bias -: 2.2425019698912503, weight -: 5.727702583057715, MSE -: 1.8471847166628241
Iterations -: 10, bias -: 2.2613261271376963, weight -: 5.721300153291616, MSE -: 1.8390967640466949


In [4]:
# The final error (MSE) we end up with after training is about 1.65.
# Gradient descent is one optimizer; there are many others, such as Adam.