In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as T

In [2]:
# Load a ResNet-50 with ImageNet weights to use as the image encoder backbone.
# `pretrained=True` is deprecated in newer torchvision releases in favour of the
# `weights=` enum; IMAGENET1K_V1 is the checkpoint `pretrained=True` used to load.
# Older torchvision has no `ResNet50_Weights`, so fall back to the legacy flag there.
try:
    resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
except AttributeError:
    resnet = models.resnet50(pretrained=True)

In [3]:
# Freeze every backbone weight: the CNN is only used as a fixed feature extractor.
for param in resnet.parameters():
    param.requires_grad_(False)

# Drop the last two children (global pooling + classifier head) so the network
# emits a spatial feature map instead of class logits.
# The isinstance guard keeps this cell idempotent: once `resnet` has been rebound
# to the truncated nn.Sequential, re-running must not strip two more layers.
if isinstance(resnet, models.ResNet):
    resnet = nn.Sequential(*list(resnet.children())[:-2])

# Switch to inference mode so BatchNorm uses its stored running statistics
# instead of the statistics of whatever batch is fed in (and stops updating them).
# Assigned back rather than left as a bare expression to avoid printing the module.
resnet = resnet.eval()

In [4]:
# Random stand-in for an image batch: shape (4, 3, 224, 224).
x = torch.randn((4, 3, 224, 224))

In [5]:
# Run the truncated backbone on the dummy batch to obtain the feature map
# that the attention computations below operate on.
features = resnet(x)

In [6]:
# Inspect the backbone output shape.
features.size()

torch.Size([4, 2048, 7, 7])

In [7]:
# Move the channel axis last: (batch, 2048, 7, 7) -> (batch, 7, 7, 2048).
# NOTE: rebinding `features` makes this cell non-idempotent; run it once per fresh `features`.
features = features.permute((0, 2, 3, 1))
features.size()

torch.Size([4, 7, 7, 2048])

In [8]:
# Collapse the two spatial axes into one: (batch, 7, 7, 2048) -> (batch, 49, 2048).
# reshape instead of view: `features` is a permuted (non-contiguous) tensor, and view
# raises whenever the requested shape is incompatible with the current strides.
# reshape returns a view when it can and only copies when it has to.
features = features.reshape(features.shape[0], -1, features.shape[-1])
features.shape

torch.Size([4, 49, 2048])

In [23]:
# Project each of the 49 feature vectors from 2048 to 256 dimensions.
U = nn.Linear(in_features=2048, out_features=256)
u_hs = U(features)
u_hs.size()

torch.Size([4, 49, 256])

In [18]:
# Average over axis 1 (the 49 locations), leaving one 2048-d vector per sample.
feature_mean = torch.mean(features, dim=1)

In [19]:
# Linear map from the 2048-d mean feature to a 512-d vector.
linear = nn.Linear(in_features=2048, out_features=512)

In [20]:
# 512-d vector per sample derived from the mean feature; used further down as the LSTM state.
h = linear(feature_mean)
h.size()

torch.Size([4, 512])

In [25]:
# Project the 512-d state `h` into the same 256-d space as `u_hs`.
W = nn.Linear(in_features=512, out_features=256)
w_ah = W(h)
w_ah.size()

torch.Size([4, 256])

In [26]:
# Shape check: inserting a singleton axis at position 1 gives (batch, 1, 256).
torch.unsqueeze(w_ah, 1).size()

torch.Size([4, 1, 256])

In [27]:
# Broadcast the per-sample projection across all 49 locations, add, then squash with tanh.
combined_states = (u_hs + w_ah[:, None, :]).tanh()
combined_states.size()

torch.Size([4, 49, 256])

In [29]:
# Reduce each 256-d combined state to a single scalar score per location.
A = nn.Linear(in_features=256, out_features=1)
attention_scores = A(combined_states)
attention_scores.size()

torch.Size([4, 49, 1])

In [30]:
# Drop the trailing singleton axis: (batch, 49, 1) -> (batch, 49).
attention_scores = torch.squeeze(attention_scores, 2)
attention_scores.size()

torch.Size([4, 49])

In [31]:
# Normalise the scores over the 49 locations so each sample's weights sum to 1.
alpha = torch.softmax(attention_scores, dim=1)
alpha.size()

torch.Size([4, 49])

In [32]:
# Shape check: a trailing singleton axis lets alpha broadcast against `features`.
torch.unsqueeze(alpha, 2).size()

torch.Size([4, 49, 1])

In [33]:
# Scale every location's 2048-d feature vector by its attention weight.
# Despite the name, this holds the weighted features; the weights themselves are `alpha`.
attention_weights = torch.mul(features, alpha[:, :, None])
attention_weights.size()

torch.Size([4, 49, 2048])

In [34]:
# Sum the weighted features over the 49 locations: (batch, 49, 2048) -> (batch, 2048).
# NOTE: rebinding the same name makes this cell non-idempotent; re-run the previous cell first.
attention_weights = torch.sum(attention_weights, dim=1)
attention_weights.size()

torch.Size([4, 2048])

In [35]:
# Toy batch of token ids: 4 sequences of 5 tokens, all ids within [0, 10).
# torch.tensor with an explicit dtype replaces the legacy torch.LongTensor constructor.
input_ = torch.tensor(
    [
        [1, 2, 4, 5, 6],
        [4, 3, 2, 9, 0],
        [1, 2, 4, 5, 6],
        [4, 3, 2, 9, 0],
    ],
    dtype=torch.long,
)
# Lookup table with 10 entries, each a 3-d vector.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
embed_out = embedding(input_)

In [36]:
# Inspect the embedded batch shape: (batch, seq_len, embedding_dim).
embed_out.size()

torch.Size([4, 5, 3])

In [38]:
# Shape check for the embedding of the first token of every sequence.
embed_out[:, 0, :].size()

torch.Size([4, 3])

In [41]:
# Concatenate the first-token embedding (3-d) with the attended feature vector (2048-d)
# along the feature axis, giving one 2051-d input per sample.
lstm_input = torch.cat([embed_out[:, 0, :], attention_weights], dim=1)
lstm_input.size()

torch.Size([4, 2051])

In [None]:

# Scratch tensor of shape (2, 3, 10); nothing in the visible cells uses it.
input_feature = torch.randn((2, 3, 10))


In [40]:
# Single-step LSTM: 2051-d input (3-d embedding + 2048-d attended features), 512-d state.
lstm_cell = nn.LSTMCell(input_size=2051, hidden_size=512, bias=True)

In [42]:
# One decoding step: feed the 2051-d input with `h` as the previous state.
# NOTE(review): `h` is passed as both the hidden and the cell state; presumably a
# placeholder for this shape check -- confirm whether a separate cell-state init is intended.
h_, c_ = lstm_cell(lstm_input, hx=(h, h))
print(h_.size(), c_.size())

torch.Size([4, 512]) torch.Size([4, 512])


In [45]:
# Dropout demo on a random 2x2 tensor: in training mode (the default for a new module)
# each element is zeroed with probability 0.2 and the survivors are scaled by 1 / (1 - 0.2).
m = nn.Dropout(0.2)
dtgr = torch.randn((2, 2))
print(dtgr)
output = m.forward(dtgr) if False else m(dtgr)
output

tensor([[ 0.0841,  0.8472],
        [-2.1221,  0.2664]])


tensor([[0.1052, 1.0589],
        [-0.0000, 0.3330]])

In [None]:
# Final linear layer from `decoder_dim` features to `vocab_size` outputs
# (presumably the LSTM hidden state -> per-token scores; confirm against the intended model).
# Neither size was defined in the cells above, so this cell raised NameError on a fresh
# Restart & Run All; derive both from the layers already built so they cannot drift.
decoder_dim = lstm_cell.hidden_size      # hidden size of the LSTMCell created above
vocab_size = embedding.num_embeddings    # number of rows in the embedding table above
fcn = nn.Linear(decoder_dim, vocab_size)