<a href="https://colab.research.google.com/github/TrieuLe0801/Forward_gradient_without_backpropagation/blob/master/Forward_gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
from sklearn import datasets
from sklearn import preprocessing
from scipy.stats import multivariate_normal
iris = datasets.load_iris()
X = torch.tensor(preprocessing.normalize(iris.data[:, :2]), dtype=torch.float)
y = torch.tensor(iris.target.reshape(-1, 1), dtype=torch.float)

In [None]:
# Set manual seed
torch.manual_seed(2)

<torch._C.Generator at 0x7f17159c3dd0>

In [None]:
print(X.size()[0])
print(y.size())

150
torch.Size([150, 1])


# **Simple torch**

In [None]:
from pandas._libs.tslibs.period import validate_end_alias
class FNN(nn.Module):
    def __init__(self, ):
        super().__init__()

        # Dimensions for input, hidden and output
        self.input_dim = 2
        self.hidden_dim = 32
        self.output_dim = 1

        # Learning rate definition
        self.learning_rate = 0.001

        # Our parameters (weights)
        # w1: 2 x 32
        self.w1 = torch.randn(self.input_dim, self.hidden_dim)

        # w2: 32 x 1
        self.w2 = torch.randn(self.hidden_dim, self.output_dim)

    def sigmoid(self, s):
        return 1 / (1 + torch.exp(-s))

    def sigmoid_first_order_derivative(self, s):
        return s * (1 - s)

    # Forward propagation
    def forward(self, X):
        # First linear layer
        self.y1 = torch.matmul(X, self.w1) # 3 X 3 ".dot" does not broadcast in PyTorch

        # First non-linearity
        self.y2 = self.sigmoid(self.y1)

        # Second linear layer
        self.y3 = torch.matmul(self.y2, self.w2)

        # Second non-linearity
        y4 = self.sigmoid(self.y3)
        return y4

    # Backward propagation
    def backward(self, X, l, y4):
        # Derivative of binary cross entropy cost w.r.t. final output y4
        self.dC_dy4 = y4 - l

        '''
        Gradients for w2: partial derivative of cost w.r.t. w2
        dC/dw2
        '''
        self.dy4_dy3 = self.sigmoid_first_order_derivative(y4)
        self.dy3_dw2 = self.y2

        # Y4 delta: dC_dy4 dy4_dy3
        self.y4_delta = self.dC_dy4 * self.dy4_dy3

        # This is our gradients for w1: dC_dy4 dy4_dy3 dy3_dw2
        self.dC_dw2 = torch.matmul(torch.t(self.dy3_dw2), self.y4_delta)

        '''
        Gradients for w1: partial derivative of cost w.r.t w1
        dC/dw1
        '''
        self.dy3_dy2 = self.w2
        self.dy2_dy1 = self.sigmoid_first_order_derivative(self.y2)

        # Y2 delta: (dC_dy4 dy4_dy3) dy3_dy2 dy2_dy1
        self.y2_delta = torch.matmul(self.y4_delta, torch.t(self.dy3_dy2)) * self.dy2_dy1

        # Gradients for w1: (dC_dy4 dy4_dy3) dy3_dy2 dy2_dy1 dy1_dw1
        self.dC_dw1 = torch.matmul(torch.t(X), self.y2_delta)

        # Gradient descent on the weights from our 2 linear layers
        self.w1 -= self.learning_rate * self.dC_dw1
        self.w2 -= self.learning_rate * self.dC_dw2

    def train(self, X, l):
        # Forward propagation
        y4 = self.forward(X)

        # Backward propagation and gradient descent
        self.backward(X, l, y4)

In [None]:
# Instantiate our model class and assign it to our model object
model = FNN()

# Loss list for plotting of loss behaviour
loss_lst = []

# Number of times we want our FNN to look at all 100 samples we have, 100 implies looking through 100x
num_epochs = 500

# Let's train our model with 100 epochs
for epoch in range(num_epochs):
    # Get our predictions
    y_hat = model(X)
    
    # Cross entropy loss, remember this can never be negative by nature of the equation
    # But it does not mean the loss can't be negative for other loss functions
    cross_entropy_loss = -(y * torch.log(y_hat) + (1 - y) * torch.log(1 - y_hat))
    
    # We have to take cross entropy loss over all our samples, 100 in this 2-class iris dataset
    mean_cross_entropy_loss = torch.mean(cross_entropy_loss).detach().item()
    
    # Print our mean cross entropy loss
    if epoch % 20 == 0:
        print('Epoch {} | Loss: {}'.format(epoch, mean_cross_entropy_loss))
    loss_lst.append(mean_cross_entropy_loss)
    
    # (1) Forward propagation: to get our predictions to pass to our cross entropy loss function
    # (2) Back propagation: get our partial derivatives w.r.t. parameters (gradients)
    # (3) Gradient Descent: update our weights with our gradients
    model.train(X,y)

Epoch 0 | Loss: 0.10400425642728806
Epoch 20 | Loss: 0.09804712980985641
Epoch 40 | Loss: 0.09328694641590118
Epoch 60 | Loss: 0.08935928344726562
Epoch 80 | Loss: 0.08603835105895996
Epoch 100 | Loss: 0.08317585289478302
Epoch 120 | Loss: 0.08066964149475098
Epoch 140 | Loss: 0.07844727486371994
Epoch 160 | Loss: 0.07645518332719803
Epoch 180 | Loss: 0.07465314865112305
Epoch 200 | Loss: 0.07301096618175507
Epoch 220 | Loss: 0.07150297611951828
Epoch 240 | Loss: 0.07011140137910843
Epoch 260 | Loss: 0.06881993263959885
Epoch 280 | Loss: 0.06761502474546432
Epoch 300 | Loss: 0.0664873793721199
Epoch 320 | Loss: 0.06542769074440002
Epoch 340 | Loss: 0.06442893296480179
Epoch 360 | Loss: 0.06348369270563126
Epoch 380 | Loss: 0.06258773803710938
Epoch 400 | Loss: 0.06173551082611084
Epoch 420 | Loss: 0.060923803597688675
Epoch 440 | Loss: 0.060148175805807114
Epoch 460 | Loss: 0.05940607562661171
Epoch 480 | Loss: 0.05869435518980026


# **Synthetic Gradient**

In [None]:
class FNN_Synthetic(nn.Module):
    def __init__(self, ):
        super().__init__()

        # Dimensions for input, hidden and output
        self.input_dim = 2
        self.hidden_dim = 32
        self.output_dim = 1

        # Learning rate definition
        self.learning_rate = 0.001

        # Our parameters (weights)
        # w1: 2 x 32
        self.w1 = torch.randn(self.input_dim, self.hidden_dim)
        self.w1_synthetic = torch.randn(self.hidden_dim, self.hidden_dim)
        self.bias_1 = torch.randn(self.hidden_dim)

        # w2: 32 x 1
        self.w2 = torch.randn(self.hidden_dim, self.output_dim)
        self.w2_synthetic = torch.randn(self.output_dim, self.output_dim)
        self.bias_2 = torch.randn(self.output_dim)

    def sigmoid(self, s):
        return 1 / (1 + torch.exp(-s))

    def sigmoid_first_order_derivative(self, s):
        return s * (1 - s)

    # Forward propagation
    def forward_and_synthetic_update(self, X, update=True):
        # First linear layer
        self.input = X
        self.y1 = torch.matmul(self.input, self.w1) # 3 X 3 ".dot" does not broadcast in PyTorch

        # First non-linearity
        self.y2 = self.sigmoid(self.y1 + self.bias_1)

        if update:
          self.synthetic_gradient_1 = torch.matmul(self.y2,self.w1_synthetic) + self.bias_1
          self.weight_synthetic_gradient_1 = self.synthetic_gradient_1*self.sigmoid_first_order_derivative(self.y2)
          self.w1 -= torch.matmul(torch.transpose(self.input,0,1), self.weight_synthetic_gradient_1)*self.learning_rate
          self.bias_1 -= torch.mean(self.weight_synthetic_gradient_1,0) * self.learning_rate

        # Second linear layer
        self.y3 = torch.matmul(self.y2, self.w2)

        # Second non-linearity
        y4 = self.sigmoid(self.y3)
        self.output= y4

        if update:
          self.synthetic_gradient_2 = torch.matmul(self.output,self.w2_synthetic) + self.bias_2
          self.weight_synthetic_gradient_2 = self.synthetic_gradient_2*self.sigmoid_first_order_derivative(self.output)
          self.w2 -= torch.matmul(torch.transpose(self.y2,0,1), self.weight_synthetic_gradient_2)*self.learning_rate
          self.bias_2 -= torch.mean(self.weight_synthetic_gradient_2,0) * self.learning_rate

        return torch.matmul(self.weight_synthetic_gradient_2,torch.transpose(self.w2, 0, 1)), self.output

    def normal_synthetic_weights(self, X, output, y):
        true_gradient = output - y
        grad_2 = true_gradient * self.sigmoid_first_order_derivative(self.output)
        
        self.w2 -= torch.matmul(torch.transpose(self.y2,0,1),grad_2) * self.learning_rate
        self.bias_2 -= torch.mean(grad_2,0) * self.learning_rate

        grad_1 = grad_2 * self.sigmoid_first_order_derivative(self.y2)
        
        self.w1 -= torch.matmul(torch.transpose(X,0,1),grad_1)*self.learning_rate
        self.bias_1 -= torch.mean(grad_1,0)*self.learning_rate
        

    def train_with_synthetic(self, X, y):
      _, output = self.forward_and_synthetic_update(X, update=True)
      self.normal_synthetic_weights(X, output, y)
      

In [None]:
# Instantiate our model class and assign it to our model object
model = FNN_Synthetic()

# Loss list for plotting of loss behaviour
loss_lst = []

# Number of times we want our FNN to look at all 100 samples we have, 100 implies looking through 100x
num_epochs = 500

# Let's train our model with 100 epochs
for epoch in range(num_epochs):
    # Get our predictions
    _,y_hat = model.forward_and_synthetic_update(X)
    
    # Cross entropy loss, remember this can never be negative by nature of the equation
    # But it does not mean the loss can't be negative for other loss functions
    cross_entropy_loss = -(y * torch.log(y_hat) + (1 - y) * torch.log(1 - y_hat))
    
    # We have to take cross entropy loss over all our samples, 100 in this 2-class iris dataset
    mean_cross_entropy_loss = torch.mean(cross_entropy_loss).detach().item()
    
    # Print our mean cross entropy loss
    if epoch % 20 == 0:
        print('Epoch {} | Loss: {}'.format(epoch, mean_cross_entropy_loss))
    loss_lst.append(mean_cross_entropy_loss)
    
    # train without backpropagation (not really without but just back for check)
    model.train_with_synthetic(X,y)

Epoch 0 | Loss: 1.6488170623779297
Epoch 20 | Loss: 3.2910521030426025
Epoch 40 | Loss: 3.4551761150360107
Epoch 60 | Loss: 0.10038378834724426
Epoch 80 | Loss: 0.022134065628051758
Epoch 100 | Loss: 0.020746955648064613
Epoch 120 | Loss: 0.027922162786126137
Epoch 140 | Loss: 0.028838997706770897
Epoch 160 | Loss: 0.033190250396728516
Epoch 180 | Loss: 0.032893095165491104
Epoch 200 | Loss: 0.027912253513932228
Epoch 220 | Loss: 0.022654660046100616
Epoch 240 | Loss: 0.01936769112944603
Epoch 260 | Loss: 0.01680806465446949
Epoch 280 | Loss: 0.01369522139430046
Epoch 300 | Loss: 0.010337581858038902
Epoch 320 | Loss: 0.008052155375480652
Epoch 340 | Loss: 0.006605313625186682
Epoch 360 | Loss: 0.005640109535306692
Epoch 380 | Loss: 0.004940312821418047
Epoch 400 | Loss: 0.0045160166919231415
Epoch 420 | Loss: 0.004154227674007416
Epoch 440 | Loss: 0.003879779251292348
Epoch 460 | Loss: 0.003616275731474161
Epoch 480 | Loss: 0.003483212785795331
