In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
# import pytorch_lightning as pl

In [6]:
a = torch.tensor([[1,2,3,0,0]], dtype=torch.float32)
mask=torch.tensor([1,1,0,0,1], dtype=torch.float32)

a.masked_fill(mask == 1, -3)

tensor([[-3., -3.,  3.,  0., -3.]])

In [7]:
F.softmax(a, dim=-1)

tensor([[0.0844, 0.2295, 0.6239, 0.0311, 0.0311]])

In [2]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [116]:
seq_len, d_k = 3, 2
# pl.seed_everything(42)
q = torch.randn(seq_len, d_k)
k = torch.randn(seq_len, d_k)
v = torch.randn(seq_len, d_k)
values, attention = scaled_dot_product(q, k, v)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("Values\n", values)
print("Attention\n", attention)

Q
 tensor([[0.6036, 1.3728],
        [0.3793, 0.0693],
        [1.2414, 0.7625]])
K
 tensor([[ 0.9727, -0.9654],
        [ 1.0191, -0.7601],
        [ 1.6993,  0.8332]])
V
 tensor([[0.9240, 0.4468],
        [2.2781, 0.0229],
        [0.3701, 0.3375]])
Values
 tensor([[0.6613, 0.3094],
        [1.1180, 0.2741],
        [0.7578, 0.3016]])
Attention
 tensor([[0.0994, 0.1238, 0.7768],
        [0.2985, 0.3053, 0.3962],
        [0.1398, 0.1626, 0.6976]])


In [117]:
q

tensor([[0.6036, 1.3728],
        [0.3793, 0.0693],
        [1.2414, 0.7625]])

In [119]:
scaled_dot_product(q, q, q)

(tensor([[0.8082, 0.9808],
         [0.7803, 0.7618],
         [0.8707, 0.8857]]), tensor([[0.5045, 0.1293, 0.3662],
         [0.3295, 0.2911, 0.3795],
         [0.3750, 0.1525, 0.4724]]))

In [120]:
q.shape

torch.Size([3, 2])

In [121]:
q = torch.unsqueeze(q, axis=0)
q.shape

torch.Size([1, 3, 2])

In [125]:
q=q.permute([0,2,1])
q.shape

torch.Size([3, 1, 2])

In [126]:
multihead_attn = nn.MultiheadAttention(embed_dim=3, num_heads=1)

In [127]:
attn_output, attn_output_weights = multihead_attn(q, q,q)
attn_output

AssertionError: 

In [6]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/mnist1')

In [14]:
import torch
celsius_q = torch.from_numpy(np.array([-40, -10, 0, 8, 15, 22, 38], dtype=np.float32).reshape(-1,1))
fahrenheit_a = torch.from_numpy(np.array([-40, 14, 32, 46, 59, 72, 100], dtype=np.float32).reshape(-1,1))
model = torch.nn.Sequential(
 torch.nn.Linear(1,1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for t in range(500):
    y_pred = model(celsius_q)
    loss = loss_fn(y_pred, fahrenheit_a)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (t+1) % 100 == 0:
            n_total_steps=500
            print (f'Step [{t+1}/{n_total_steps}], Loss: {loss.item():.4f}')
            ############## TENSORBOARD ########################
            writer.add_scalar('training loss', loss.item(), (t+1)/n_total_steps)
#             running_correct = 0
#             running_loss = 0.0
    
    
writer.add_graph(model, torch.tensor([100], dtype=torch.float32))
writer.close()

print(model(torch.from_numpy(np.array([100], dtype=np.float32))))

Step [100/500], Loss: 3998.9595
Step [200/500], Loss: 1939.6367
Step [300/500], Loss: 823.5095
Step [400/500], Loss: 303.4089
Step [500/500], Loss: 96.2532
tensor([211.2323], grad_fn=<AddBackward0>)


In [12]:
x=torch.arange(0,5).expand(8,5)

In [13]:
x+x

tensor([[0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8],
        [0, 2, 4, 6, 8]])

In [16]:
~torch.tril(torch.ones((5,5), dtype=torch.bool))

tensor([[False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]])

In [18]:
 def generate_square_subsequent_mask(sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [21]:
generate_square_subsequent_mask(5)

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [22]:
a = torch.arange(10).reshape(5,2)
a

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

In [27]:
torch.split(a, 1)

(tensor([[0, 1]]),
 tensor([[2, 3]]),
 tensor([[4, 5]]),
 tensor([[6, 7]]),
 tensor([[8, 9]]))

In [28]:
torch.cat(torch.split(a, 1), dim=0)

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

In [30]:
>>> class TestClass(object):
...     def __init__(self):
...         self.a = 5
...         self.b = 'xyz'
... 
>>> test = TestClass()
>>> test.__dict__

{'a': 5, 'b': 'xyz'}

In [31]:
test.__dict__.update({'c':5})
test.__dict__

{'a': 5, 'b': 'xyz', 'c': 5}