In [1]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from mlp import MLP, mse_loss, bce_loss

import numpy as np


In [100]:
# Scratch check: a length-3 vector is added to every row of a (2, 3) matrix
# once it is given a leading axis of size 1.
a = torch.tensor([
    [0.9455, 0.2088, 0.1070],
    [0.0823, 0.6509, 0.1171],
])
b = torch.tensor([0.4321, 0.8250, 0.0679])
a + b.unsqueeze(0)


tensor([[1.3776, 1.0338, 0.1749],
        [0.5144, 1.4759, 0.1850]])

In [101]:
# Scratch check: turn the boolean mask "a > 0.5" into a 0./1. float tensor.
torch.gt(a, 0.5) * 1.0

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [281]:
class MLP:
    """Two-layer perceptron with hand-written forward and backward passes.

    The network computes ``y_hat = g(f(x @ W1.T + b1) @ W2.T + b2)``.
    Parameters live in ``self.parameters`` (keys ``W1``, ``b1``, ``W2``,
    ``b2``), their gradients in ``self.grads`` (keys ``dJdW1``, ``dJdb1``,
    ``dJdW2``, ``dJdb2``), and the intermediate values saved by ``forward``
    for use in ``backward`` in ``self.cache``.
    """

    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            f_function: string for the f function: relu | sigmoid | identity
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            g_function: string for the g function: relu | sigmoid | identity
        """
        self.f_function = f_function
        self.g_function = g_function

        # Weights are stored as (out_features, in_features), so forward
        # multiplies by their transpose.
        self.parameters = dict(
            W1 = torch.randn(linear_1_out_features, linear_1_in_features),
            b1 = torch.randn(linear_1_out_features),
            W2 = torch.randn(linear_2_out_features, linear_2_in_features),
            b2 = torch.randn(linear_2_out_features),
        )
        self.grads = dict(
            dJdW1 = torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1 = torch.zeros(linear_1_out_features),
            dJdW2 = torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2 = torch.zeros(linear_2_out_features),
        )

        # put all the cache value you need in self.cache
        self.cache = dict()

    @staticmethod
    def _activate(name, z):
        """Apply the activation called ``name`` (relu | sigmoid | identity) to ``z``.

        Raises:
            KeyError: if ``name`` is not one of the supported activations.
        """
        if name == 'relu':
            return torch.relu(z)
        if name == 'sigmoid':
            return torch.sigmoid(z)
        if name == 'identity':
            return z
        raise KeyError(name)

    @staticmethod
    def _activation_grad(name, z):
        """Element-wise derivative of the activation ``name`` evaluated at ``z``.

        Only the requested derivative is computed.

        Raises:
            KeyError: if ``name`` is not one of the supported activations.
        """
        if name == 'relu':
            return (z > 0).to(z.dtype)
        if name == 'sigmoid':
            # s * (1 - s) stays finite for any z.  The algebraically equal
            # exp(-z) / (1 + exp(-z))**2 evaluates to inf * 0 = NaN as soon
            # as exp(-z) overflows (strongly negative z).
            s = torch.sigmoid(z)
            return s * (1 - s)
        if name == 'identity':
            return torch.ones_like(z)
        raise KeyError(name)

    def forward(self, x):
        """
        Run the network on a batch and cache the intermediates for backward.

        Args:
            x: tensor shape (batch_size, linear_1_in_features)

        Returns:
            y_hat: tensor shape (batch_size, linear_2_out_features)
        """
        W1, b1 = self.parameters['W1'], self.parameters['b1']
        W2, b2 = self.parameters['W2'], self.parameters['b2']

        # The bias vectors broadcast over the batch dimension.
        z1 = torch.matmul(x, W1.t()) + b1
        z2 = self._activate(self.f_function, z1)
        z3 = torch.matmul(z2, W2.t()) + b2
        y_hat = self._activate(self.g_function, z3)

        self.cache['z1'] = z1
        self.cache['z2'] = z2
        self.cache['z3'] = z3
        self.cache['x'] = x

        return y_hat

    def backward(self, dJdy_hat):
        """
        Back-propagate through the values cached by the last ``forward`` call
        and store the parameter gradients in ``self.grads``.

        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
        """
        x = self.cache['x']
        z1 = self.cache['z1']
        z2 = self.cache['z2']
        z3 = self.cache['z3']

        # dJ/dz3, shape (batch_size, linear_2_out_features)
        delta2 = dJdy_hat * self._activation_grad(self.g_function, z3)
        # dJ/dz1, shape (batch_size, linear_1_out_features); computed once and
        # shared by the W1 and b1 gradients.
        delta1 = torch.matmul(delta2, self.parameters['W2']) \
            * self._activation_grad(self.f_function, z1)

        # Weight gradients contract over the batch dimension; bias gradients
        # are the per-feature sums over the batch.
        self.grads['dJdW1'] = torch.matmul(delta1.t(), x)
        self.grads['dJdb1'] = delta1.sum(dim=0)
        self.grads['dJdW2'] = torch.matmul(delta2.t(), z2)
        self.grads['dJdb2'] = delta2.sum(dim=0)

    def clear_grad_and_cache(self):
        """Zero every gradient tensor in place and drop the cached intermediates."""
        for grad in self.grads.values():
            grad.zero_()
        self.cache = dict()
        

In [902]:
def mse_loss(y, y_hat):
    """
    Half mean-squared error, ``mean(0.5 * (y_hat - y)**2)``, and its gradient.

    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)

    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """
    diff = y_hat - y
    loss = (0.5 * diff ** 2).mean()
    # The mean averages over every element, so the gradient is divided by the
    # element count.  numel() also works for tensors that are not 2-D, where
    # indexing shape[1] would fail.
    dJdy_hat = diff / diff.numel()

    return loss, dJdy_hat

def bce_loss(y, y_hat):
    """
    Mean binary cross-entropy and its gradient with respect to the prediction.

    Args:
        y: the label tensor
        y_hat: the prediction tensor, with entries expected in [0, 1]

    Return:
        loss: scalar of loss
        dJdy_hat: The gradient tensor, same shape as ``y_hat``
            (batch_size, linear_2_out_features)
    """
    # Log terms are floored at -100 so that a saturated prediction
    # (y_hat exactly 0 or 1) gives a large finite loss instead of inf.
    log_p = torch.clamp(torch.log(y_hat), min=-100)
    log_not_p = torch.clamp(torch.log(1 - y_hat), min=-100)
    per_element = y * log_p + (1 - y) * log_not_p
    loss = -per_element.mean()

    # -y/y_hat + (1-y)/(1-y_hat) == (y_hat - y) / (y_hat * (1 - y_hat)).
    # Flooring the denominator keeps the gradient finite at y_hat == 0 or 1;
    # dividing by a near-zero y_hat directly overflows float32 to +/-inf.
    # NOTE(review): 1e-12 is meant to match the floor used by PyTorch's own
    # BCE backward - confirm against the installed version if exact parity
    # with autograd at saturated predictions matters.
    denom = torch.clamp(y_hat * (1 - y_hat), min=1e-12)
    dJdy_hat = (y_hat - y) / denom / per_element.numel()
    return loss, dJdy_hat


In [889]:
# Scratch probe: copies y_hat onto itself in place; the cell's displayed value
# is whatever copy_ returns (the recorded output below shows the y_hat values).
# NOTE(review): relies on a y_hat left in the kernel by another cell.
y_hat.copy_(y_hat)

tensor([[9.9871e-01, 9.9999e-01],
        [5.8329e-01, 6.0355e-08],
        [1.0000e+00, 9.8905e-01],
        [6.4327e-01, 9.7553e-01],
        [1.0000e+00, 9.5003e-01],
        [8.9108e-01, 9.9222e-01],
        [9.9995e-01, 9.9263e-01],
        [9.5898e-01, 3.8934e-01],
        [1.0000e+00, 1.0000e+00],
        [1.0000e+00, 9.9999e-01]])

In [939]:
# 2 -> 20 -> 2 network: ReLU on the hidden layer, sigmoid on the output.
net = MLP(
    linear_1_in_features=2, linear_1_out_features=20, f_function='relu',
    linear_2_in_features=20, linear_2_out_features=2, g_function='sigmoid',
)

# Ten random inputs, and ten random pairs of 0./1. labels
# (1. wherever a standard-normal draw exceeds 0.5).
x = torch.randn(10, 2)
y = torch.gt(torch.randn(10, 2), 0.5) * 1.0

# Zero the gradients and empty the cache before running the forward pass.
net.clear_grad_and_cache()
y_hat = net.forward(x)


sigmoid


In [940]:
# 1. where the prediction is strictly between the saturated values 0 and 1.
torch.logical_and(y_hat != 0, y_hat != 1) * 1.0

tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]])

In [941]:
# BCE loss and its gradient w.r.t. the prediction, then manual back-propagation.
J, dJdy_hat = bce_loss(y=y, y_hat=y_hat)
net.backward(dJdy_hat=dJdy_hat)

In [942]:
# True where the prediction is neither exactly 0 nor exactly 1.
torch.ne(y_hat, 0) & torch.ne(y_hat, 1)

tensor([[True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True]])

In [943]:
# Reference model with the same layout as the manual MLP, built layer by layer.
net_autograd = nn.Sequential()
net_autograd.add_module('linear1', nn.Linear(2, 20))
net_autograd.add_module('relu', nn.ReLU())
net_autograd.add_module('linear2', nn.Linear(20, 2))
net_autograd.add_module('sigmoid', nn.Sigmoid())

# Point the reference layers at the manual network's parameter tensors so both
# models evaluate the same function.
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

In [944]:
# Element-wise exact comparison of the manual and reference forward passes.
torch.eq(y_hat, y_hat_autograd)

tensor([[True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True],
        [True, True]])

In [951]:
# Probe: the BCE-gradient expression evaluated with a prediction of exp(-100).
a = 0.8
a_hat = torch.tensor(-100.).exp()
-(a / a_hat + (1 - a) / (1 - a_hat))

tensor(-inf)

In [965]:
# Probe: does exp(-100) compare equal to zero in the default dtype?
torch.eq(torch.tensor(-100.).exp(), 0)

tensor(False)

In [966]:
# Probe: does 1 - exp(-100) compare equal to one in the default dtype?
torch.eq(1 - torch.tensor(-100.).exp(), 1)

tensor(True)

In [945]:
# Manual BCE loss (and gradient); the loss is displayed as the cell output.
J, dJdy_hat = bce_loss(y=y, y_hat=y_hat)
J

tensor(3.6914)

In [946]:
# Reference BCE loss from torch.nn.functional, displayed as the cell output.
J_autograd = F.binary_cross_entropy(input=y_hat_autograd, target=y)
J_autograd

tensor(3.6914, grad_fn=<BinaryCrossEntropyBackward>)

In [947]:
# Recompute the reference loss and let autograd fill in the .grad fields.
J_autograd = F.binary_cross_entropy(input=y_hat_autograd, target=y)

net_autograd.zero_grad()
J_autograd.backward()

# (key of the manual gradient, reference parameter carrying the autograd gradient)
grad_pairs = (
    ('dJdW1', net_autograd.linear1.weight),
    ('dJdb1', net_autograd.linear1.bias),
    ('dJdW2', net_autograd.linear2.weight),
    ('dJdb2', net_autograd.linear2.bias),
)

# Side-by-side dump: manual gradient first, autograd gradient right after it.
for grad_name, ref_param in grad_pairs:
    print(grad_name, net.grads[grad_name])
    print(ref_param.grad.data)

# Norm of the difference between the two gradients, one line per parameter.
for grad_name, ref_param in grad_pairs:
    print((ref_param.grad.data - net.grads[grad_name]).norm())
#------------------------------------------------



dJdW1 tensor([[ 0.0000,  0.0000],
        [ 0.0550,  0.0297],
        [-0.0553,  0.1047],
        [ 0.0041, -0.0031],
        [-0.0238,  0.1432],
        [ 0.0000,  0.0000],
        [ 0.0625, -0.0298],
        [-0.2098,  0.0952],
        [-0.0894,  0.0395],
        [-0.0758,  0.2430],
        [ 0.0062, -0.0024],
        [ 0.0385,  0.0049],
        [ 0.1133,  0.2157],
        [-0.1328,  0.0198],
        [ 0.0180, -0.0282],
        [-0.0609,  0.4745],
        [ 0.0222, -0.0826],
        [ 0.0090,  0.0054],
        [-0.2402,  0.0817],
        [-0.1345,  0.0529]])
tensor([[ 0.0000,  0.0000],
        [ 0.0550,  0.0297],
        [-0.0553,  0.1047],
        [ 0.0041, -0.0031],
        [-0.0238,  0.1432],
        [ 0.0000,  0.0000],
        [ 0.0625, -0.0298],
        [-0.2098,  0.0952],
        [-0.0894,  0.0395],
        [-0.0758,  0.2430],
        [ 0.0062, -0.0024],
        [ 0.0385,  0.0049],
        [ 0.1133,  0.2157],
        [-0.1328,  0.0198],
        [ 0.0180, -0.0282],
        [-0.0

In [516]:
# 10 -> 20 -> 1 network: ReLU on the hidden layer, sigmoid on the output.
net = MLP(
    linear_1_in_features=10, linear_1_out_features=20, f_function='relu',
    linear_2_in_features=20, linear_2_out_features=1, g_function='sigmoid',
)

# Ten random inputs and a (10, 1) column of random 0./1. labels.
x = torch.randn(10, 10)
y = torch.unsqueeze((torch.randn(10) > 0.5) * 1.0, dim=-1)

# Manual pipeline: reset state, forward pass, BCE loss, back-propagation.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = bce_loss(y=y, y_hat=y_hat)
net.backward(dJdy_hat=dJdy_hat)

sigmoid


In [517]:
#------------------------------------------------
# Reference model for the autograd check, built layer by layer with the same
# layout as the manual MLP.
net_autograd = nn.Sequential()
net_autograd.add_module('linear1', nn.Linear(10, 20))
net_autograd.add_module('relu', nn.ReLU())
net_autograd.add_module('linear2', nn.Linear(20, 1))
net_autograd.add_module('sigmoid', nn.Sigmoid())

# Point the reference layers at the manual network's parameter tensors.
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

In [518]:
# Element-wise exact comparison of the manual and reference forward passes.
torch.eq(y_hat, y_hat_autograd)

tensor([[True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True]])

In [519]:
# Reference loss; autograd then fills in the .grad fields.
J_autograd = F.binary_cross_entropy(input=y_hat_autograd, target=y)

net_autograd.zero_grad()
J_autograd.backward()

print(J_autograd, J)

# (key of the manual gradient, reference parameter carrying the autograd gradient)
grad_pairs = (
    ('dJdW1', net_autograd.linear1.weight),
    ('dJdb1', net_autograd.linear1.bias),
    ('dJdW2', net_autograd.linear2.weight),
    ('dJdb2', net_autograd.linear2.bias),
)

# Side-by-side dump: manual gradient first, autograd gradient right after it.
for grad_name, ref_param in grad_pairs:
    print(grad_name, net.grads[grad_name])
    print(ref_param.grad.data)

# Norm of the difference between the two gradients, one line per parameter.
for grad_name, ref_param in grad_pairs:
    print((ref_param.grad.data - net.grads[grad_name]).norm())
#------------------------------------------------


tensor(1.2830, grad_fn=<BinaryCrossEntropyBackward>) tensor(1.2830)
dJdW1 tensor([[ 0.4260, -0.0992,  0.2537, -0.2614, -0.0274, -0.0733, -0.1926,  0.1593,
         -0.2237, -0.3298],
        [ 0.0373,  0.0471,  0.1268,  0.1263,  0.0738, -0.0024,  0.1056,  0.1270,
         -0.1270, -0.1124],
        [-0.0701,  0.0574, -0.0127,  0.1146,  0.0330,  0.0150,  0.0973, -0.0043,
          0.0015,  0.0265],
        [-0.0417,  0.0118,  0.0008,  0.0162, -0.0303, -0.0150,  0.0142, -0.0070,
          0.0174,  0.0528],
        [-0.1968, -0.0140, -0.0100,  0.0015,  0.1446, -0.0044,  0.0382,  0.1388,
         -0.1883,  0.1233],
        [ 0.0036, -0.0008, -0.0015, -0.0057,  0.0096,  0.0026,  0.0109, -0.0091,
         -0.0071, -0.0018],
        [-0.0369,  0.0159,  0.0205,  0.0301,  0.0507, -0.0007,  0.0381,  0.0519,
         -0.0738,  0.0067],
        [ 0.0006, -0.0127, -0.0020, -0.0030,  0.0013, -0.0144, -0.0009, -0.0034,
          0.0100, -0.0004],
        [ 0.0298, -0.0043,  0.1684,  0.1206,  0.1272, 

In [514]:
# 5 -> 10 -> 1 network: ReLU on the hidden layer, sigmoid on the output.
net = MLP(
    linear_1_in_features=5, linear_1_out_features=10, f_function='relu',
    linear_2_in_features=10, linear_2_out_features=1, g_function='sigmoid',
)

# Ten random inputs and a (10, 1) column of random 0./1. labels.
x = torch.randn(10, 5)
y = torch.unsqueeze((torch.randn(10) > 0.5) * 1.0, dim=-1)

# Manual pipeline: reset state, forward pass, BCE loss, back-propagation.
net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = bce_loss(y=y, y_hat=y_hat)
net.backward(dJdy_hat=dJdy_hat)
#------------------------------------------------
# Reference model for the autograd check, built layer by layer.
net_autograd = nn.Sequential()
net_autograd.add_module('linear1', nn.Linear(5, 10))
net_autograd.add_module('relu', nn.ReLU())
net_autograd.add_module('linear2', nn.Linear(10, 1))
net_autograd.add_module('sigmoid', nn.Sigmoid())

# Point the reference layers at the manual network's parameter tensors.
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

J_autograd = F.binary_cross_entropy(input=y_hat_autograd, target=y)

net_autograd.zero_grad()
J_autograd.backward()

# Forward outputs and losses of the two implementations.
print(torch.eq(y_hat, y_hat_autograd))
print(J, J_autograd)

# (key of the manual gradient, reference parameter carrying the autograd gradient)
grad_pairs = (
    ('dJdW1', net_autograd.linear1.weight),
    ('dJdb1', net_autograd.linear1.bias),
    ('dJdW2', net_autograd.linear2.weight),
    ('dJdb2', net_autograd.linear2.bias),
)

# Side-by-side dump: manual gradient first, autograd gradient right after it.
for grad_name, ref_param in grad_pairs:
    print(grad_name, net.grads[grad_name])
    print(ref_param.grad.data)

# Norm of the difference between the two gradients, one line per parameter.
for grad_name, ref_param in grad_pairs:
    print((ref_param.grad.data - net.grads[grad_name]).norm())
#------------------------------------------------


sigmoid
tensor([[True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True]])
tensor(1.3618) tensor(1.3618, grad_fn=<BinaryCrossEntropyBackward>)
dJdW1 tensor([[ 5.2330e-08,  4.5411e-08,  5.1723e-08,  3.3947e-10,  1.6062e-08],
        [ 9.4287e-02,  2.6134e-02, -2.2243e-01, -2.0428e-01, -5.3525e-03],
        [-2.5902e-01,  2.8303e-01, -3.1989e-02,  3.0906e-02,  1.1835e-02],
        [ 2.5386e-02, -3.3068e-02, -1.0336e-03, -1.4944e-02,  1.1282e-02],
        [ 1.1485e-01, -1.0632e-01, -6.9186e-02, -1.2111e-01, -8.6954e-02],
        [ 1.7376e-02, -9.4481e-03, -1.8985e-02, -2.5139e-02, -8.4100e-03],
        [ 2.6013e-02,  4.4231e-02, -1.1121e-01, -1.2943e-01, -2.9918e-02],
        [ 2.7404e-03, -4.2974e-03,  9.7438e-04, -4.3676e-04, -8.6682e-04],
        [-1.4892e-01,  8.0971e-02,  1.6270e-01,  2.1544e-01,  7.2075e-02],
        [-4.0202e-02,  2.4951e-02,  3.9676e-02,  6.2537e-02,  2.2434e-02]])
tensor