### Initialization

In [1]:
# For Colab only!

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [2]:
import tensorflow as tf

In [3]:
import torch
from torch.nn import functional as F

In [3]:
print(tf.__version__)
print(tf.test.is_gpu_available())

2.1.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [4]:
print(torch.__version__)
print(torch.cuda.is_available())

1.4.0
True


### MSE gradient

In [4]:
def one_hot(label, depth):
    out = torch.zeros(label.size(0), depth)
    idx = torch.LongTensor(label).view(-1, 1)
    out.scatter_(dim=1, index=idx, value=1)
    return out

In [12]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = tf.random.uniform([3,4])
w = tf.random.uniform([4,2])
b = tf.zeros([2])
y = tf.constant([0, 1, 1])

with tf.GradientTape() as tape:
    # if the tensors are not variables
    tape.watch([w,b])
    
    logits = x @ w + b
    probs = tf.nn.softmax(logits)
    
    y_true = tf.one_hot(y, depth=2)
    
    losses = tf.losses.MSE(y_true,probs)
    loss = tf.reduce_mean(losses)
    
grads = tape.gradient(loss, [w,b])

grads_w = grads[0]
grads_b = grads[1]

print(loss)
print(grads[0])
print(grads[1])

tf.Tensor(0.23290308, shape=(), dtype=float32)
tf.Tensor(
[[ 0.00120118 -0.00120119]
 [ 0.01929211 -0.01929212]
 [ 0.03523264 -0.03523265]
 [ 0.04114018 -0.04114018]], shape=(4, 2), dtype=float32)
tf.Tensor([ 0.07563752 -0.07563753], shape=(2,), dtype=float32)


In [14]:
# Example: [3,4] linear conversion ->[3,2]
#  y = x*w +c  x:[3,4] w:[4,2] b:[2], y:[3]
#  y one-hot depth = 2

x = torch.rand(3,4)
w = torch.rand([4,2], requires_grad=True)
b = torch.zeros([2], requires_grad=True)
y = torch.LongTensor([0, 1, 1])

# if "requires_grad=Flase"
# w.requires_grad_()
# b.requires_grad_()

logits = x @ w +b
probs = F.softmax(logits, dim = 1)

y_true = one_hot(y, depth=2)
loss = F.mse_loss(y_true, probs)



grads = torch.autograd.grad(loss, [w, b])

grads_w = grads[0]
grads_b = grads[1]


print(loss)
print(grads_w)
print(grads_b)

# Alternative way:

# loss.backward()
# print(w.grad)
# print(b.grad)

tensor(0.1932, grad_fn=<MeanBackward0>)
tensor([[-0.0028,  0.0028],
        [-0.0552,  0.0552],
        [ 0.0329, -0.0329],
        [-0.0147,  0.0147]])
tensor([-0.0148,  0.0148])


### Sorftmax

In [53]:
logits = tf.random.uniform([3,3])
logits = tf.Variable(logits)

with tf.GradientTape() as tape:
#     tape.watch([logits])
    
    probs = tf.nn.softmax(logits, axis=1)
    
grads = tape.gradient(probs[1][1], logits)


# print(logits)
# print(probs)

print(grads)

<tf.Variable 'Variable:0' shape=(3, 3) dtype=float32, numpy=
array([[2.6201749e-01, 9.3732166e-01, 1.5761006e-01],
       [6.7006159e-01, 6.8807602e-04, 7.3578167e-01],
       [6.9119418e-01, 9.5617390e-01, 3.1275153e-02]], dtype=float32)>
tf.Tensor(
[[0.2586995  0.5082489  0.23305157]
 [0.38760337 0.19846426 0.41393238]
 [0.3545725  0.4621514  0.1832761 ]], shape=(3, 3), dtype=float32)
None


In [51]:
# logist: [b, 3], probs: [b, 3]
logits = torch.rand(3,3)
logits.requires_grad_()

probs = F.softmax(logits, dim = 1)

print(logits)
print(probs)

probs[1][1].backward()
print(logits.grad)


# grad_0_0 = torch.autograd.grad(probs[0][0], logits, retain_graph=True)
# print(grad_0_0)

# grad_1_1 = torch.autograd.grad(probs[1][1], logits, retain_graph=True)
# print(grad_1_1)

tensor([[0.5588, 0.8455, 0.1835],
        [0.2511, 0.7790, 0.7572],
        [0.5120, 0.8886, 0.9761]], requires_grad=True)
tensor([[0.3312, 0.4412, 0.2276],
        [0.2297, 0.3894, 0.3810],
        [0.2470, 0.3600, 0.3930]], grad_fn=<SoftmaxBackward>)
tensor([[ 0.0000,  0.0000,  0.0000],
        [-0.0894,  0.2378, -0.1483],
        [ 0.0000,  0.0000,  0.0000]])
