In [1]:
import numpy as np
import torch
import torch.nn as nn


class Decoder(nn.Module):
    """Decoder with soft attention over a set of feature vectors.

    The module owns a word embedding, an LSTM cell whose input size is
    ``embedding_size + dim_of_features``, a linear projection from the
    hidden state to vocabulary logits, and the layers used by
    :meth:`build_attention_model`.

    Args:
        num_of_features: Number of feature vectors per sample.
        dim_of_features: Dimensionality of each feature vector.
        hidden_size: Size of the LSTM hidden state.
        vocab_size: Number of entries in the word embedding / output logits.
        embedding_size: Dimensionality of the word embeddings.
    """

    def __init__(self, num_of_features: int, dim_of_features: int,
                 hidden_size: int, vocab_size: int, embedding_size: int):
        super().__init__()

        self.num_of_features = num_of_features
        self.dim_of_features = dim_of_features
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.word_embed = nn.Embedding(self.vocab_size, self.embedding_size)

        # LSTM related variables
        self.lstm_cell = nn.LSTMCell(embedding_size + dim_of_features, hidden_size)
        self.deep_output = nn.Linear(self.hidden_size, self.vocab_size)

        # Attention layers. They are registered here (rather than created
        # inside build_attention_model) so that their weights persist between
        # calls, show up in parameters()/state_dict(), follow .to(device) and
        # can actually be trained.
        self.att_feature_proj = nn.Linear(dim_of_features, dim_of_features, bias=False)
        self.att_hidden_proj = nn.Linear(hidden_size, dim_of_features, bias=False)
        self.att_bias = nn.Parameter(torch.zeros(num_of_features))
        self.att_score = nn.Linear(dim_of_features, 1, bias=False)

    def call_softMax(self, input: torch.Tensor) -> torch.Tensor:
        """Return the softmax of ``input`` over dimension 1, cast to float32."""
        return torch.softmax(input, dim=1).float()

    # Finds the feature to be focused for the current time step
    def build_attention_model(self, input: torch.Tensor,
                              hidden: "torch.Tensor | None" = None
                              ) -> "tuple[torch.Tensor, torch.Tensor]":
        """Compute soft-attention weights and the attended context vector.

        Args:
            input: Feature tensor of shape
                ``(batch, num_of_features, dim_of_features)``.
            hidden: Optional hidden state of shape ``(batch, hidden_size)``
                or ``(batch, 1, hidden_size)``. When omitted, the attention
                scores depend on the features only.

        Returns:
            A pair ``(a, z)`` where ``a`` has shape
            ``(batch, num_of_features, 1)`` and sums to 1 over dimension 1,
            and ``z`` has shape ``(batch, dim_of_features)`` and is the
            ``a``-weighted sum of the feature vectors.
        """
        projected = self.att_feature_proj(input)

        if hidden is not None:
            if hidden.dim() == 2:
                # (batch, hidden_size) -> (batch, 1, hidden_size) so the
                # projection broadcasts over the feature axis.
                hidden = hidden.unsqueeze(1)
            projected = projected + self.att_hidden_proj(hidden)

        # One bias value per feature position, broadcast over batch and
        # feature dimensionality.
        activated = torch.relu(projected + self.att_bias.view(1, -1, 1))

        # Collapse each feature vector to a single score, then normalise the
        # scores across the feature positions.
        scores = self.att_score(activated).squeeze(2)
        a = self.call_softMax(scores).unsqueeze(2)

        # Plain tensor arithmetic keeps the autograd graph intact, so the
        # attention layers receive gradients through z.
        z = torch.sum(input * a, dim=1)

        return a, z
    
    

output size:  torch.Size([32, 196, 512])
Linear(in_features=512, out_features=512, bias=False)
output_h size:  torch.Size([32, 1, 512])
Linear(in_features=512, out_features=512, bias=False)
concat_input size =  torch.Size([32, 196, 512])
fullconnected_layer =  torch.Size([32, 196, 512])
final_output 1 =  torch.Size([32, 196, 1])
final_output 2 =  torch.Size([32, 196])
a =  <built-in method size of Tensor object at 0x7f2952bffc60>
z =  <class 'torch.Tensor'>
