In [None]:
import torch

class Mysoftmax(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        result = input - input.max()
        result = result.exp()
        result = result/result.sum()
        ctx.save_for_backward(result)
        return result
    
    @staticmethod
    def backward(ctx, grad_output):
        # grad_outputにはforwardのinputに入っている変数の値が入っている。
        result,  = ctx.saved_tensors
        J = torch.zeros(result.size()[-1], result.size()[-1])
        print(result)
        for i in range(result.size()[-1]):
            for j in range(result.size()[-1]):
                if i==j:
                    J[i][j] = result[0][i]*(1-result[0][i])
                else:
                    J[i][j] = -result[0][i]*result[0][j]
        grad_output = torch.mm(grad_output, J)
        return grad_output
    
# このailiasをやらないとtensorが出力にならない
softmax = Mysoftmax.apply

In [62]:
a = torch.tensor([[1.0,2.0,3.0]], dtype=torch.float32, requires_grad=True)

In [63]:
b = softmax(a)
print(b)

tensor([[0.0900, 0.2447, 0.6652]], grad_fn=<MysoftmaxBackward>)


In [64]:
b.backward(a)
print(a.grad)

tensor([[0.0900, 0.2447, 0.6652]], grad_fn=<MysoftmaxBackward>)
tensor([[-0.1418, -0.1408,  0.2826]])


In [65]:
x = torch.tensor([1.0,2.0,3.0], dtype=torch.float32, requires_grad=True)
y = x - x.max()
y = y.exp()
result = y/y.sum()
result.backward(x)
print(x.grad)

tensor([-0.1418, -0.1408,  0.2826])


In [50]:
import torch
import torch.nn.functional as F
c_a = torch.tensor([[1.0,2.0,3.0]], dtype=torch.float32, requires_grad=True)
c = F.softmax(c_a)
print(c)

tensor([[0.0900, 0.2447, 0.6652]], grad_fn=<SoftmaxBackward>)


  after removing the cwd from sys.path.


In [51]:
c.backward(c_a)
print(c_a.grad)

tensor([[-0.1418, -0.1408,  0.2826]])


In [52]:
c_a = torch.tensor([[1.0,2.0,3.0]], dtype=torch.float32, requires_grad=True)
c = F.softmax(c_a, dim = 0)
print(c)
c.backward(c_a)
print(c_a.grad)

tensor([[1., 1., 1.]], grad_fn=<SoftmaxBackward>)
tensor([[0., 0., 0.]])


In [53]:
c_a = torch.tensor([[1.0,2.0,3.0]], dtype=torch.float32, requires_grad=True)
c = F.softmax(c_a, dim=1)
print(c)
c.backward(c_a)
print(c_a.grad)

tensor([[0.0900, 0.2447, 0.6652]], grad_fn=<SoftmaxBackward>)
tensor([[-0.1418, -0.1408,  0.2826]])


In [61]:
c_a = torch.tensor([[1.0,2.0,3.0]], dtype=torch.float32, requires_grad=True)
c = F.softmax(c_a, dim=1)
print(c)
c.backward(c_a)
print(c_a.grad)

tensor([[0.0900, 0.2447, 0.6652]], grad_fn=<SoftmaxBackward>)
tensor([[-0.1418, -0.1408,  0.2826]])


In [137]:
import torch 
import torch.nn.functional as F

# テンソルを作成
# requires_grad=Falseだと微分の対象にならず勾配はNoneが返る
x = torch.tensor([0.0, 1.0, 4.0], requires_grad=True)

# 計算グラフを構築
# y = 2 * x + 3
y = F.relu(x)

# 勾配を計算
y.backward(x)

# 勾配を表示
print(x.grad)  # dy/dx = w = 2

tensor([0., 1., 4.])


In [107]:
print(x.grad)

tensor([0., 1., 4.])


pytorch tutorialから

In [21]:
import torch

# x = torch.ones(2, 2, requires_grad=True)
x = torch.tensor([[1,2],[3,4]], requires_grad=True, dtype=torch.float32)
print(x)
y = x + 2
print(y)
z = y * y * 3 # これはアダマール積(要素積，　行列の掛け算ではない)
out = z.mean()

print(z, out)

out.backward()
print(x.grad)

tensor([[1., 2.],
        [3., 4.]], requires_grad=True)
tensor([[3., 4.],
        [5., 6.]], grad_fn=<AddBackward0>)
tensor([[ 9., 16.],
        [25., 36.]], grad_fn=<MulBackward0>) tensor(21.5000, grad_fn=<MeanBackward1>)
tensor([[1.5000, 2.0000],
        [2.5000, 3.0000]])


softmaxのヤコビアンの計算

単純な実装

In [22]:
a = torch.tensor([1.0,2.0,3.0], dtype=torch.float32)

# J = [a[i]*(1-a[i]) if i==j else -a[i]*a[j] for i in range(len(a)) for j in range(len(a))]
J = [a[i]*(1-a[i]) for i in range(len(a))]

In [23]:
print(J)

[tensor(0.), tensor(-2.), tensor(-6.)]


In [116]:
a = torch.tensor([1.0,2.0,3.0], dtype=torch.float32)
J = []
for i in range(len(a)):
    tmp = []
    for j in range(len(a)):
        if i==j:
            tmp.append(a[i]*(1-a[i]))
        else:
            tmp.append(-a[i]*a[j])
    J.append([tmp])
J = torch.tensor(J)
print(J)

tensor([[[ 0., -2., -3.]],

        [[-2., -2., -6.]],

        [[-3., -6., -6.]]])


In [137]:
import torch

x = torch.tensor([1,2,3], dtype=torch.float32, requires_grad=True)
print(len(x))

3


In [226]:
a = torch.randn(4, 4)
print(a)
print(torch.sum(a, 1))
print(torch.sum(a, 0))

tensor([[-0.2957,  0.7737, -0.4852,  2.5937],
        [ 0.3693,  1.4469,  0.0763,  0.6498],
        [-1.5699,  1.8273,  0.4325, -2.0887],
        [-0.3461, -2.0550, -0.7831, -0.2714]])
tensor([ 2.5865,  2.5423, -1.3988, -3.4555])
tensor([-1.8424,  1.9928, -0.7594,  0.8835])


In [257]:
a = torch.tensor([1.0,2.0,3.0], dtype=torch.float32)

print(a[0].item())

1.0


In [172]:
import torch

class Mysoftmax(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        y = x - max(x)
        y = y.exp()
        result = y/y.sum()
        ctx.save_for_backward(x, result)
        return result
    
    @staticmethod
    def backward(ctx, grad_output):
        x, result = ctx.saved_tensors
        #print(result,x)
        #"""
        J = []
        for i in range(len(result)):
            tmp = []
            for j in range(len(result)):
                if i==j:
                    tmp.append(result[i].item()*(1-result[i].item()))
                else:
                    tmp.append(-result[i].item()*result[j].item())
            #J.append(torch.tensor(tmp))
            J.append(tmp)
        J = torch.tensor(J)
        #print("J\n",J)
        grad_input = torch.mv(J, x)
        return grad_input

# cosine similarity

my cosine

In [52]:
import torch

class Mycossim(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, key_vec):
        #k_1 = k.view(-1)
        #x = x.view(x.size()[1], x.size()[2])
        #y = [torch.dot(x/x.norm(), k/k.norm()) for x in x]
        #y = torch.tensor(y)
        """
        z = []
        for i in x:
            z.append(torch.dot(i/ torch.norm(i, dim=-1), k / torch.norm(k)))
        """
        print("input", input)
        print("key", key_vec)
        #mem_k = (mem*key_vec).sum()
        input_norm = input.norm()
        key_vec_norm = key_vec.norm()
        #result = mem_k/(max(mem_norm*key_vec_norm, 1e-8))
        result = (input*key_vec).sum()/(torch.max(input_norm*key_vec_norm, 1e-8))
        ctx.save_for_backward(input, key_vec, result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        mem, key_vec, result = ctx.saved_tensors
        print("grad_output", grad_output)
        """print(m)
        print(torch.norm(m, dim=-1))
        print(m.norm())
        print(torch.norm(m, dim=-1)**2)
        print(m.pow(2).sum())
        """
        #print(result*k/k.pow(2).sum())
        eps = torch.tensor([[1e-8]])
        m = mem
        k = key_vec
        m_norm = torch.norm(m)
        k_norm = torch.norm(k)
        m_grad = k/max(m_norm*k_norm, eps) - result*m/max(m_norm.pow(2), eps)
        k_grad = m/max(m_norm*k_norm, eps) - result*k/max(k_norm.pow(2), eps)
        print(m_grad)
        print(k_grad)
        #m_norm = torch.abs(m)
        #k_norm = torch.abs(k)
        #m_grad = k/torch.sum(m_norm*k_norm) - result*m/torch.sum(m_norm.pow(2))
        #k_grad = m/torch.sum(m_norm*k_norm) - result*k/torch.sum(k_norm.pow(2))
        m_grad = m_grad*m
        k_grad = k_grad*k
        return m_grad, k_grad

In [53]:
a = torch.tensor([[1.0, 2.0, 3.0]], requires_grad=True)
b = torch.tensor([[-1.0,-2.0,-3.0]], requires_grad=True)
#b = torch.tensor([[-3.0,-6.0,-2.0]], dtype=torch.float32, requires_grad=True)

print("\nMy cosine")
# このailiasをやらないとtensorが出力にならない
cosine_similarity = Mycossim.apply
c = cosine_similarity(a,b)
print(c)

c.backward(a)

print("a grad:", a.grad)
print("b grad:", b.grad)


# ------------------


f_c = F.cosine_similarity(a,b, dim=-1)
print(f_c)
# aについて
f_c.backward(a)
print("a grad:", a.grad)
print("b grad:", b.grad)


My cosine
input tensor([[1., 2., 3.]], requires_grad=True)
key tensor([[-1., -2., -3.]], requires_grad=True)


TypeError: max() received an invalid combination of arguments - got (Tensor, float), but expected one of:
 * (Tensor input)
 * (Tensor input, Tensor other, Tensor out)
 * (Tensor input, int dim, bool keepdim, tuple of Tensors out)


In [38]:
import torch
import torch.nn.functional as F

a = torch.tensor([1.0,2.0,3.0], requires_grad=True)
b = torch.tensor([-1.0,-2.0,-3.0], requires_grad=True)

f_c = F.cosine_similarity(a,b, dim=-1)
print(f_c)
# aについて
f_c.backward(b)
print("a grad:", a.grad)
print("b grad:", b.grad)

# ----------
# このailiasをやらないとtensorが出力にならない
cosine_similarity = Mycossim.apply
c = cosine_similarity(a,b)
print(c)

c.backward(b)

print("a grad:", a.grad)
print("b grad:", b.grad)

tensor(-1.0000, grad_fn=<DivBackward0>)
a grad: tensor([2.9802e-08, 5.9605e-08, 0.0000e+00])
b grad: tensor([-2.9802e-08, -5.9605e-08,  0.0000e+00])
tensor(-1.0000, grad_fn=<MycossimBackward>)
grad_output tensor(-6., grad_fn=<SumBackward0>)
tensor([-7.4506e-09, -1.4901e-08, -1.4901e-08])
tensor([7.4506e-09, 1.4901e-08, 1.4901e-08])
a grad: tensor([ 2.2352e-08,  2.9802e-08, -4.4703e-08])
b grad: tensor([-3.7253e-08, -8.9407e-08, -4.4703e-08])


In [309]:
print(a*b)
print(torch.dot(a, b))
print(a/b)
print(torch.div(a, b))

tensor([ -3., -12.,  -6.], grad_fn=<MulBackward0>)
tensor(-21., grad_fn=<DotBackward>)
tensor([-0.3333, -0.3333, -1.5000], grad_fn=<DivBackward0>)
tensor([-0.3333, -0.3333, -1.5000], grad_fn=<DivBackward0>)


In [3]:
import torch
import torch.nn.functional as F

m = torch.tensor([1.0,2.0,3.0], dtype=torch.float32, requires_grad=True)
k = torch.tensor([1.0,2.0,3.0], dtype=torch.float32, requires_grad=True)

m_k = torch.sum(m*k)
m_norm = torch.norm(m)
k_norm = torch.norm(k)
result = m_k/(max(m_norm*k_norm, 1e-8))

result.backward(result)
print("m.grad", m.grad)
print("k.grad", k.grad)

print("m_norm.grad", m_norm.grad)
print("k_norm.grad", k_norm.grad)

print("result", result.grad)


m = torch.tensor([1.0,2.0,3.0], dtype=torch.float32, requires_grad=True)
k = torch.tensor([1.0,2.0,3.0], dtype=torch.float32, requires_grad=True)
c = F.cosine_similarity(m,k, dim=-1)
c.backward(c)
print("m_grad", m.grad)
print("k_grad", k.grad)

print(c.grad)

m.grad tensor([7.4506e-09, 1.4901e-08, 1.4901e-08])
k.grad tensor([7.4506e-09, 1.4901e-08, 1.4901e-08])
m_norm.grad None
k_norm.grad None
result None
m_grad tensor([7.4506e-09, 1.4901e-08, 1.4901e-08])
k_grad tensor([7.4506e-09, 1.4901e-08, 1.4901e-08])
None


In [74]:
m = torch.tensor([[[1.0,2.0,3.0],[4.0,5.0,6.0]]], dtype=torch.float32, requires_grad=True)

In [75]:
torch.max(m)

tensor(6., grad_fn=<MaxBackward1>)

In [82]:
m = torch.tensor([[[1.0,2.0,3.0],[4.0,5.0,6.0]]], dtype=torch.float32, requires_grad=True)
k = torch.tensor([1.0,2.0,3.0], requires_grad=True)
eps = torch.tensor([[1e-8]])

print(m*k)
print((m*k).sum(dim=-1))
print(m.norm(dim=-1))
print(k.norm(dim=-1))
print((m*k).sum(dim=-1)/torch.max(m.norm(dim=-1)*k.norm(dim=-1), eps))

print(F.cosine_similarity(m, k, dim=-1))

tensor([[[ 1.,  4.,  9.],
         [ 4., 10., 18.]]], grad_fn=<MulBackward0>)
tensor([[14., 32.]], grad_fn=<SumBackward2>)
tensor([[3.7417, 8.7750]], grad_fn=<NormBackward1>)
tensor(3.7417, grad_fn=<NormBackward1>)
tensor([[1.0000, 0.9746]], grad_fn=<DivBackward0>)
tensor([[1.0000, 0.9746]], grad_fn=<DivBackward0>)


In [57]:
print(result)

tensor(1.2888, grad_fn=<DivBackward0>)
