In [19]:
import torch

All about strides:
[PyTorch internals](http://blog.ezyang.com/2019/05/pytorch-internals/)

In [27]:
# Build a 2x4x4 tensor holding 0..31 so that the effect of the stride
# manipulations in the following cells can be read off the printed values.
x = torch.arange(2 * 4 * 4).view(2, 4, 4)
x

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11],
         [12, 13, 14, 15]],

        [[16, 17, 18, 19],
         [20, 21, 22, 23],
         [24, 25, 26, 27],
         [28, 29, 30, 31]]])

In [28]:
# Swap the first two axes: shape (2, 4, 4) -> (4, 2, 4).
# NOTE: this rebinds `x`, so re-running this cell on its own swaps the axes
# back again; run the cells in order from the top.
x = x.permute(1, 0, 2)
x

tensor([[[ 0,  1,  2,  3],
         [16, 17, 18, 19]],

        [[ 4,  5,  6,  7],
         [20, 21, 22, 23]],

        [[ 8,  9, 10, 11],
         [24, 25, 26, 27]],

        [[12, 13, 14, 15],
         [28, 29, 30, 31]]])

In [29]:
# Demonstrate that view() refuses to flatten the permuted tensor into (4, 8):
# the RuntimeError is caught and its message printed instead of aborting.
try:
    x = x.view(4, 8)
except RuntimeError as err:
    print(err)

view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.


In [30]:
# Calling contiguous() first makes the (4, 8) view succeed where the bare
# view() in the previous cell raised.
x = x.contiguous().view(4, 8)
x

tensor([[ 0,  1,  2,  3, 16, 17, 18, 19],
        [ 4,  5,  6,  7, 20, 21, 22, 23],
        [ 8,  9, 10, 11, 24, 25, 26, 27],
        [12, 13, 14, 15, 28, 29, 30, 31]])

In [None]:
# Display the transpose of `x`; `x` itself is not rebound here.
# (Assumes `x` is the 2-D (4, 8) tensor produced by the previous cell.)
x.t()

# Autograd

In [35]:
import torch

# Constant inputs x and y (no gradient tracking).
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=False)  # fixed values
y = torch.tensor([0.5, -1.0, 2.0], requires_grad=False)

# Initialise W_x and W_y. The gradients are derived by hand below, so
# autograd tracking is left switched off.
seed = 42
torch.manual_seed(seed)
W_x = torch.randn(3, 3, requires_grad=False)
W_y = torch.randn(3, 3, requires_grad=False)
print("W_x:", W_x)
print("W_y:", W_y)

# Intermediate variable z = W_x * x + W_y * y, computed element-wise: the
# 1-D x and y broadcast across the rows of the 3x3 weights, so
# z[i, j] = W_x[i, j] * x[j] + W_y[i, j] * y[j].
# torch.mm cannot be used here: it needs two 2-D operands (x and y are 1-D),
# and the hand-derived gradients below assume the element-wise form.
z = W_x * x + W_y * y

# Activation: tanh(z)
tanh_z = torch.tanh(z)

# Objective: l = sum(tanh(z))
l = tanh_z.sum()

# Upstream gradient. Because l = sum(tanh(z)), dl/dtanh_z = 1 for every
# component of tanh_z; `dl_dz` holds that all-ones factor.
grad_l = torch.tensor(1.0, requires_grad=False)
dl_dz = grad_l.expand_as(tanh_z)

# Local derivatives of z with respect to the weights.
dz_dWx = x  # z = W_x * x + W_y * y  =>  dz/dW_x = x (broadcast over rows)
dz_dWy = y  # dz/dW_y = y

# Chain rule:
#   tanh'(z) = 1 - tanh(z)^2
#   dl/dW_x  = dl/dtanh(z) * dtanh(z)/dz * dz/dW_x
#   dl/dW_y  = dl/dtanh(z) * dtanh(z)/dz * dz/dW_y
dtanh_dz = 1 - tanh_z**2
dl_dWx = dl_dz * dtanh_dz * dz_dWx
dl_dWy = dl_dz * dtanh_dz * dz_dWy

# Print the results.
print("l:", l.item())  # scalar output
print("Gradient of W_x (manual):", dl_dWx.sum())  # dl/dW_x
print("Gradient of W_y (manual):", dl_dWy.sum())  # dl/dW_y


W_x: tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863],
        [ 2.2082, -0.6380,  0.4617]])
W_y: tensor([[ 0.2674,  0.5349,  0.8094],
        [ 1.1103, -1.6898, -0.9890],
        [ 0.9580,  1.3221,  0.8172]])
l: 1.3094227313995361
Gradient of W_x (manual): tensor(4.9995)
Gradient of W_y (manual): tensor(-0.8519)


In [40]:
import torch

# Constant inputs x and y (no gradient tracking).
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=False)  # fixed values
y = torch.tensor([0.5, -1.0, 2.0], requires_grad=False)

# Initialise W_x and W_y with requires_grad=True so that autograd records
# the operations applied to them and can populate their .grad fields.
seed = 42
torch.manual_seed(seed)
W_x = torch.randn(3, 3, requires_grad=True)
W_y = torch.randn(3, 3, requires_grad=True)
print("W_x:", W_x)
print("W_y:", W_y)

# Forward pass (element-wise; x and y broadcast across the rows):
#   z = W_x * x + W_y * y
#   tanh_z = tanh(z)
#   l = sum(tanh(z))
z = W_x * x + W_y * y
tanh_z = torch.tanh(z)
l = tanh_z.sum()

# Backward pass: fills W_x.grad and W_y.grad with dl/dW_x and dl/dW_y.
l.backward()

# Print the results. Gradients are read through the public `.grad` attribute
# rather than the private `._grad`.
# NOTE: the labels still say "(manual)" although these values come from
# autograd; the text is left unchanged so it matches the recorded output.
print("l:", l.item())  # scalar output
print("Gradient of W_x (manual):", W_x.grad.sum())  # dl/dW_x
print("Gradient of W_y (manual):", W_y.grad.sum())  # dl/dW_y


W_x: tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863],
        [ 2.2082, -0.6380,  0.4617]], requires_grad=True)
W_y: tensor([[ 0.2674,  0.5349,  0.8094],
        [ 1.1103, -1.6898, -0.9890],
        [ 0.9580,  1.3221,  0.8172]], requires_grad=True)
l: 1.3094227313995361
Gradient of W_x (manual): tensor(4.9995)
Gradient of W_y (manual): tensor(-0.8519)
