In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np

import math

In [2]:
class CNN(nn.Module):
    """
    TODO: ADD DROPOUT
    """
    def __init__(self, vocab_size, embedding_dim):
        super(CNN, self).__init__()
        
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=5)
        self.fc = nn.Linear(100, 100)
        
    def forward(self, x):
        
        #print(x.shape)
        #conv1d wants (N, C, L)
        x = x.permute(0, 2, 1)
        #print(x.shape)
        x = self.conv(x)
        #print(x.shape)
        x = F.max_pool1d(x, x.size()[2])
        #print(x.shape)
        x = x.squeeze(2)
        x = self.fc(x)
        #print(x.shape)
        return x

In [3]:
"""x = Variable(torch.LongTensor(np.random.randint(0,50,size=(32,10))))
#50 is vocab size, 64 is embedding size
cnn = CNN(50, 64)
cnn(x)"""

'x = Variable(torch.LongTensor(np.random.randint(0,50,size=(32,10))))\n#50 is vocab size, 64 is embedding size\ncnn = CNN(50, 64)\ncnn(x)'

In [4]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, n_layers=1, bidirectional=True, dropout=0.2):
        super(LSTM, self).__init__()
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        
    def forward(self, x):
        
        #print(x.shape) #[b, s, e]
        x = x.permute(1, 0, 2)
        #print(x.shape) #[s, b, e]
        x, (h, c) = self.lstm(x)
        #print(x.shape) #[s, b, h*2]
        return x

In [5]:
class Highway(torch.nn.Module):
    """
    COPIED FROM ALLENNLP GITHUB
    
    A `Highway layer <https://arxiv.org/abs/1505.00387>`_ does a gated combination of a linear
    transformation and a non-linear transformation of its input.  :math:`y = g * x + (1 - g) *
    f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise
    non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`.
    This module will apply a fixed number of highway layers to its input, returning the final
    result.
    Parameters
    ----------
    input_dim : ``int``
        The dimensionality of :math:`x`.  We assume the input has shape ``(batch_size,
        input_dim)``.
    num_layers : ``int``, optional (default=``1``)
        The number of highway layers to apply to the input.
    activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``)
        The non-linearity to use in the highway layers.
    """
    def __init__(self,
                 embedding_dim,
                 num_layers,
                 activation = torch.nn.functional.relu):
        super(Highway, self).__init__()
        self._embedding_dim = embedding_dim
        self._layers = torch.nn.ModuleList([torch.nn.Linear(embedding_dim, embedding_dim * 2)
                                            for _ in range(num_layers)])
        self._activation = activation
                
        for layer in self._layers:
            # We should bias the highway layer to just carry its input forward.  We do that by
            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
            # be high, to we will carry the input forward.  The bias on `B(x)` is the second half
            # of the bias vector in each Linear layer.
            layer.bias[embedding_dim:].data.fill_(1)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
        current_input = inputs
        for layer in self._layers:
            projected_input = layer(current_input)
            linear_part = current_input
            # NOTE: if you modify this, think about whether you should modify the initialization
            # above, too.
            nonlinear_part = projected_input[:, (0 * self._embedding_dim):(1 * self._embedding_dim)]
            gate = projected_input[:, (1 * self._embedding_dim):(2 * self._embedding_dim)]
            nonlinear_part = self._activation(nonlinear_part)
            gate = torch.nn.functional.sigmoid(gate)
            current_input = gate * linear_part + (1 - gate) * nonlinear_part
        return current_input

In [6]:
class Embedding(nn.Module):
    """
    NEED ONE FOR CHARS AND ONE FOR WORDS"""
    def __init__(self, vocab_size, embedding_dim):
        super(Embedding, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
    def forward(self, x):
        return self.embedding(x)

In [7]:
class MatrixAttention(nn.Module):
    '''
    This ``Module`` takes two matrices as input and returns a matrix of attentions.
    We compute the similarity between each row in each matrix and return unnormalized similarity
    scores.  Because these scores are unnormalized, we don't take a mask as input; it's up to the
    caller to deal with masking properly when this output is used.
    By default similarity is computed with a dot product, but you can alternatively use a
    parameterized similarity function if you wish.
    This is largely similar to using ``TimeDistributed(Attention)``, except the result is
    unnormalized.  You should use this instead of ``TimeDistributed(Attention)`` if you want to
    compute multiple normalizations of the attention matrix.
    Input:
        - matrix_1: ``(batch_size, num_rows_1, embedding_dim)``
        - matrix_2: ``(batch_size, num_rows_2, embedding_dim)``
    Output:
        - ``(batch_size, num_rows_1, num_rows_2)``
    Parameters
    ----------
    similarity_function: ``SimilarityFunction``, optional (default=``DotProductSimilarity``)
        The similarity function to use when computing the attention.
    '''
    def __init__(self, similarity_function) -> None:
        super(MatrixAttention, self).__init__()

        self._similarity_function = similarity_function

    def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
        # pylint: disable=arguments-differ
        tiled_matrix_1 = matrix_1.unsqueeze(2).expand(matrix_1.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_1.size()[2])
        tiled_matrix_2 = matrix_2.unsqueeze(1).expand(matrix_2.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_2.size()[2])
        return self._similarity_function(tiled_matrix_1, tiled_matrix_2)

In [8]:
class LinearSimilarity(nn.Module):
    """
    This similarity function performs a dot product between a vector of weights and some
    combination of the two input vectors, followed by an (optional) activation function.  The
    combination used is configurable.
    If the two vectors are ``x`` and ``y``, we allow the following kinds of combinations: ``x``,
    ``y``, ``x*y``, ``x+y``, ``x-y``, ``x/y``, where each of those binary operations is performed
    elementwise.  You can list as many combinations as you want, comma separated.  For example, you
    might give ``x,y,x*y`` as the ``combination`` parameter to this class.  The computed similarity
    function would then be ``w^T [x; y; x*y] + b``, where ``w`` is a vector of weights, ``b`` is a
    bias parameter, and ``[;]`` is vector concatenation.
    Note that if you want a bilinear similarity function with a diagonal weight matrix W, where the
    similarity function is computed as `x * w * y + b` (with `w` the diagonal of `W`), you can
    accomplish that with this class by using "x*y" for `combination`.
    Parameters
    ----------
    tensor_1_dim : ``int``
        The dimension of the first tensor, ``x``, described above.  This is ``x.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    tensor_2_dim : ``int``
        The dimension of the second tensor, ``y``, described above.  This is ``y.size()[-1]`` - the
        length of the vector that will go into the similarity computation.  We need this so we can
        build weight vectors correctly.
    combination : ``str``, optional (default="x,y")
        Described above.
    activation : ``Activation``, optional (default=linear (i.e. no activation))
        An activation function applied after the ``w^T * [x;y] + b`` calculation.  Default is no
        activation.
    """
    def __init__(self,
                 tensor_1_dim: int,
                 tensor_2_dim: int,
                 combination: str = 'x,y') -> None:
        super(LinearSimilarity, self).__init__()
        self._combination = combination
        combined_dim = get_combined_dim(combination, [tensor_1_dim, tensor_2_dim])
        self._weight_vector = nn.Parameter(torch.Tensor(combined_dim))
        self._bias = nn.Parameter(torch.Tensor(1))
        self.reset_parameters()

    def reset_parameters(self):
        std = math.sqrt(6 / (self._weight_vector.size(0) + 1))
        self._weight_vector.data.uniform_(-std, std)
        self._bias.data.fill_(0)

    def forward(self, tensor_1: torch.Tensor, tensor_2: torch.Tensor) -> torch.Tensor:
        combined_tensors = combine_tensors(self._combination, [tensor_1, tensor_2])
        dot_product = torch.matmul(combined_tensors, self._weight_vector)
        return dot_product + self._bias

In [9]:
def combine_tensors(combination: str, tensors) -> torch.Tensor:
    """
    Combines a list of tensors using element-wise operations and concatenation, specified by a
    ``combination`` string.  The string refers to (1-indexed) positions in the input tensor list,
    and looks like ``"1,2,1+2,3-1"``.
    We allow the following kinds of combinations: ``x``, ``x*y``, ``x+y``, ``x-y``, and ``x/y``,
    where ``x`` and ``y`` are positive integers less than or equal to ``len(tensors)``.  Each of
    the binary operations is performed elementwise.  You can give as many combinations as you want
    in the ``combination`` string.  For example, for the input string ``"1,2,1*2"``, the result
    would be ``[1;2;1*2]``, as you would expect, where ``[;]`` is concatenation along the last
    dimension.
    If you have a fixed, known way to combine tensors that you use in a model, you should probably
    just use something like ``torch.cat([x_tensor, y_tensor, x_tensor * y_tensor])``.  This
    function adds some complexity that is only necessary if you want the specific combination used
    to be `configurable`.
    If you want to do any element-wise operations, the tensors involved in each element-wise
    operation must have the same shape.
    This function also accepts ``x`` and ``y`` in place of ``1`` and ``2`` in the combination
    string.
    """
    if len(tensors) > 9:
        raise ConfigurationError("Double-digit tensor lists not currently supported")
    combination = combination.replace('x', '1').replace('y', '2')
    to_concatenate = [_get_combination(piece, tensors) for piece in combination.split(',')]
    return torch.cat(to_concatenate, dim=-1)

In [10]:
def get_combined_dim(combination: str, tensor_dims) -> int:
    """
    For use with :func:`combine_tensors`.  This function computes the resultant dimension when
    calling ``combine_tensors(combination, tensors)``, when the tensor dimension is known.  This is
    necessary for knowing the sizes of weight matrices when building models that use
    ``combine_tensors``.
    Parameters
    ----------
    combination : ``str``
        A comma-separated list of combination pieces, like ``"1,2,1*2"``, specified identically to
        ``combination`` in :func:`combine_tensors`.
    tensor_dims : ``List[int]``
        A list of tensor dimensions, where each dimension is from the `last axis` of the tensors
        that will be input to :func:`combine_tensors`.
    """
    if len(tensor_dims) > 9:
        raise ConfigurationError("Double-digit tensor lists not currently supported")
    combination = combination.replace('x', '1').replace('y', '2')
    return sum([_get_combination_dim(piece, tensor_dims) for piece in combination.split(',')])

In [11]:
def _get_combination(combination: str, tensors) -> torch.Tensor:
    if combination.isdigit():
        index = int(combination) - 1
        return tensors[index]
    else:
        if len(combination) != 3:
            raise ConfigurationError("Invalid combination: " + combination)
        first_tensor = _get_combination(combination[0], tensors)
        second_tensor = _get_combination(combination[2], tensors)
        operation = combination[1]
        if operation == '*':
            return first_tensor * second_tensor
        elif operation == '/':
            return first_tensor / second_tensor
        elif operation == '+':
            return first_tensor + second_tensor
        elif operation == '-':
            return first_tensor - second_tensor
        else:
            raise ConfigurationError("Invalid operation: " + operation)

In [12]:
def _get_combination_dim(combination: str, tensor_dims) -> int:
    if combination.isdigit():
        index = int(combination) - 1
        return tensor_dims[index]
    else:
        if len(combination) != 3:
            raise ConfigurationError("Invalid combination: " + combination)
        first_tensor_dim = _get_combination_dim(combination[0], tensor_dims)
        second_tensor_dim = _get_combination_dim(combination[2], tensor_dims)
        operation = combination[1]
        if first_tensor_dim != second_tensor_dim:
            raise ConfigurationError("Tensor dims must match for operation \"{}\"".format(operation))
        return first_tensor_dim

In [13]:
def weighted_sum(matrix: torch.Tensor, attention: torch.Tensor) -> torch.Tensor:
    """
    Takes a matrix of vectors and a set of weights over the rows in the matrix (which we call an
    "attention" vector), and returns a weighted sum of the rows in the matrix.  This is the typical
    computation performed after an attention mechanism.
    Note that while we call this a "matrix" of vectors and an attention "vector", we also handle
    higher-order tensors.  We always sum over the second-to-last dimension of the "matrix", and we
    assume that all dimensions in the "matrix" prior to the last dimension are matched in the
    "vector".  Non-matched dimensions in the "vector" must be `directly after the batch dimension`.
    For example, say I have a "matrix" with dimensions ``(batch_size, num_queries, num_words,
    embedding_dim)``.  The attention "vector" then must have at least those dimensions, and could
    have more. Both:
        - ``(batch_size, num_queries, num_words)`` (distribution over words for each query)
        - ``(batch_size, num_documents, num_queries, num_words)`` (distribution over words in a
          query for each document)
    are valid input "vectors", producing tensors of shape:
    ``(batch_size, num_queries, embedding_dim)`` and
    ``(batch_size, num_documents, num_queries, embedding_dim)`` respectively.
    """
    # We'll special-case a few settings here, where there are efficient (but poorly-named)
    # operations in pytorch that already do the computation we need.
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)

In [38]:
class TimeDistributed(torch.nn.Module):
    """
    Given an input shaped like ``(batch_size, time_steps, [rest])`` and a ``Module`` that takes
    inputs like ``(batch_size, [rest])``, ``TimeDistributed`` reshapes the input to be
    ``(batch_size * time_steps, [rest])``, applies the contained ``Module``, then reshapes it back.
    Note that while the above gives shapes with ``batch_size`` first, this ``Module`` also works if
    ``batch_size`` is second - we always just combine the first two dimensions, then split them.
    """
    def __init__(self, module):
        super(TimeDistributed, self).__init__()
        self._module = module

    def forward(self, *inputs):  # pylint: disable=arguments-differ
        reshaped_inputs = []
        for input_tensor in inputs:
            input_size = input_tensor.size()
            if len(input_size) <= 2:
                raise RuntimeError("No dimension to distribute: " + str(input_size))

            # Squash batch_size and time_steps into a single axis; result has shape
            # (batch_size * time_steps, input_size).
            squashed_shape = [-1] + [x for x in input_size[2:]]
            reshaped_inputs.append(input_tensor.contiguous().view(*squashed_shape))

        reshaped_outputs = self._module(*reshaped_inputs)

        # Now get the output back into the right shape.
        # (batch_size, time_steps, [hidden_size])
        new_shape = [input_size[0], input_size[1]] + [x for x in reshaped_outputs.size()[1:]]
        outputs = reshaped_outputs.contiguous().view(*new_shape)

        return outputs

In [59]:
def _get_best_span(span_start_logits: Variable, span_end_logits: Variable) -> Variable:
        if span_start_logits.dim() != 2 or span_end_logits.dim() != 2:
            raise ValueError("Input shapes must be (batch_size, passage_length)")
        batch_size, passage_length = span_start_logits.size()
        max_span_log_prob = [-1e20] * batch_size
        span_start_argmax = [0] * batch_size
        best_word_span = Variable(span_start_logits.data.new()
                                  .resize_(batch_size, 2).fill_(0)).long()

        span_start_logits = span_start_logits.data.cpu().numpy()
        span_end_logits = span_end_logits.data.cpu().numpy()

        for b in range(batch_size):  # pylint: disable=invalid-name
            for j in range(passage_length):
                val1 = span_start_logits[b, span_start_argmax[b]]
                if val1 < span_start_logits[b, j]:
                    span_start_argmax[b] = j
                    val1 = span_start_logits[b, j]

                val2 = span_end_logits[b, j]

                if val1 + val2 > max_span_log_prob[b]:
                    best_word_span[b, 0] = span_start_argmax[b]
                    best_word_span[b, 1] = j
                    max_span_log_prob[b] = val1 + val2
        return best_word_span

In [39]:
x = Variable(torch.LongTensor(np.random.randint(0,50,size=(32))))
#50 is vocab size, 64 is embedding size
"""
TODO THIS ONLY WORKS 1 WAY, EITHER MAKE BIDIRECTIONAL ORRRRRRR HAVE AN INPUT WITH THE WORDS REVERSED
I DON'T ACTUALLY THINK YOU NEED TO MAKE BIDIRECTIONAL NOW?
"""

#highway = Highway(50, 200, 2)
#highway(x)

"\nTODO THIS ONLY WORKS 1 WAY, EITHER MAKE BIDIRECTIONAL ORRRRRRR HAVE AN INPUT WITH THE WORDS REVERSED\nI DON'T ACTUALLY THINK YOU NEED TO MAKE BIDIRECTIONAL NOW?\n"

In [55]:
word_embedding_dim = 100
char_embedding_dim = 16
n_words = 1000 #vocab size, not length
n_chars = 256 #vocab size, not length
batch_size = 32
context_max_word_len = 250 #max words in context
context_max_char_len = 10 #max characters per word
query_max_word_len = 15 #max words in context
query_max_char_len = 10 #max characters per word SHOULD BE THE SAME AS CONTEXT_MAX_CHAR_LEN

context_words = Variable(torch.LongTensor(np.random.randint(0,n_words,size=(batch_size,context_max_word_len))))
context_chars = Variable(torch.LongTensor(np.random.randint(0,n_chars,size=(batch_size,context_max_word_len,context_max_char_len))))

query_words = Variable(torch.LongTensor(np.random.randint(0,n_words,size=(batch_size,query_max_word_len))))
query_chars = Variable(torch.LongTensor(np.random.randint(0,n_chars,size=(batch_size,query_max_word_len,query_max_char_len))))

"""
BEGIN CONTEXT TO ATTENTION FLOW LAYER INPUTS
x_T to h_T
"""

word_embedding = Embedding(n_words, word_embedding_dim) #instantiate word -> word vectors module
char_embedding = TimeDistributed(Embedding(n_chars, char_embedding_dim)) #instantiate char -> word vectors module

c_embedded_word = word_embedding(context_words) #input words, get out vectors
c_embedded_char = char_embedding(context_chars) #input chars, get out vectors

char_cnn = TimeDistributed(CNN(n_chars, char_embedding_dim)) #instantiate word vectors -> cnn vectors module

c_cnn_embedded_chars = char_cnn(c_embedded_char) #input chars, get out vectors

highway = TimeDistributed(Highway(word_embedding_dim*2, num_layers=2)) #instantiate word_emb + char_emb (from CNN) -> embedded

c_highway_input = torch.cat((c_cnn_embedded_chars, c_embedded_word), dim=2) #concat char_cnn_emb + word_emb

c_embedded = highway(c_highway_input) #input concat, get out embedded context

phrase_layer = LSTM(word_embedding_dim*2, word_embedding_dim, n_layers=1, bidirectional=True) #lstm to turn context embedding into phrase embedding

c_embedded_phrase = phrase_layer(c_embedded) #pass through layer to get embedded phrase

"""
BEGIN QUERY TO ATTENTION FLOW LAYER INPUTS
q_J to u_J
"""

q_embedded_word = word_embedding(query_words) #input words, get out vectors
q_embedded_char = char_embedding(query_chars) #input chars, get out vectors

q_cnn_embedded_chars = char_cnn(q_embedded_char) #input chars, get out vectors

q_highway_input = torch.cat((q_cnn_embedded_chars, q_embedded_word), dim=2) ##concat char_cnn_emb + word_emb

q_embedded = highway(q_highway_input) #input concat, get out embedded context

q_embedded_phrase = phrase_layer(q_embedded) #pass through layer to get embedded phrase

similarity_function = LinearSimilarity(200,200) #used for matrix attention

attention_matrix = MatrixAttention(similarity_function) #gets un-normalized matrix attention

#must be [batch, length, emb]
c_embedded_phrase = c_embedded_phrase.permute(1, 0, 2)
q_embedded_phrase = q_embedded_phrase.permute(1, 0, 2)

"""
ATTENTION FLOW LAYER
"""

c_q_similarity = attention_matrix(c_embedded_phrase, q_embedded_phrase) #similarity between context and query

c_q_attention = F.softmax(c_q_similarity, dim=2) #normalise along the query dim (the shorter one) 

c_q_vectors = weighted_sum(q_embedded_phrase, c_q_attention) #apply attention to query

q_c_similarity = c_q_similarity.max(dim=-1)[0]

q_c_attention = F.softmax(q_c_similarity, dim=1) #normalise along the context dim (the longer one)

q_c_vectors = weighted_sum(c_embedded_phrase, q_c_attention) #apply attention to context

tiled_q_c_vectors = q_c_vectors.unsqueeze(1).expand(c_q_vectors.shape) #make weighted vectors same size

#have: 
#the emb phrase context
#context attn applied to query
#emb phrase context * context attn applied to query 
#emb phrase context * query attn applied to context
final_merged_c = torch.cat([c_embedded_phrase,
                            c_q_vectors,
                            c_embedded_phrase * c_q_vectors,
                            c_embedded_phrase * tiled_q_c_vectors],
                            dim=-1)

modeling_layer = LSTM(word_embedding_dim*8, word_embedding_dim, n_layers=2, bidirectional=True) #uses attention and phrase to "model" 

modeled_context = modeling_layer(final_merged_c) #apply modeling layer

modeled_context = modeled_context.permute(1, 0, 2) #need to permute after lstm for batch first

span_start_input = torch.cat((final_merged_c, modeled_context), dim=-1) #embedded phrase and attention concat modeling

span_start_predictor = TimeDistributed(nn.Linear(1000, 1)) #module to predict span start from phrase + attention

span_start_logits = span_start_predictor(span_start_input).squeeze(-1) #apply module to predict span start from phrase + attention

span_start_probs = F.softmax(span_start_logits, dim=1) #turn into probabilities over context to where answer span begins

span_start_representation = weighted_sum(modeled_context, span_start_probs) #apply the probabilities over the modeled context

tiled_start_representation = span_start_representation.unsqueeze(1).expand(modeled_context.shape) #reshape to predict span end

#have:
#phrase + attention merge
#modeled context from the above
#begin of span representation
#above 2 mulitplied together
span_end_representation = torch.cat([final_merged_c,
                                     modeled_context,
                                     tiled_start_representation,
                                     modeled_context * tiled_start_representation],
                                     dim=-1)

#LSTM over all of the above
span_end_encoder = LSTM(1400, 100, n_layers=1, bidirectional=True)

#used to find span end
encoded_span_end = span_end_encoder(span_end_representation)

encoded_span_end = encoded_span_end.permute(1, 0, 2)

#conat lstm representation as well as all the phrase and context stuff
span_end_input = torch.cat((final_merged_c, encoded_span_end),dim=-1)

#module to produce logits 
span_end_predictor = TimeDistributed(torch.nn.Linear(1000, 1))

#un-normalized outputs for span end probabily
span_end_logits = span_end_predictor(span_end_input).squeeze(-1)

#prob of each word in span being the span end
span_end_probs = F.softmax(span_end_logits, dim=1)

In [60]:
print(span_start_probs)
print(span_end_probs)
print(_get_best_span(span_start_probs, span_end_probs))

Variable containing:
1.00000e-03 *
 3.9767  4.0361  4.0894  ...   4.0464  3.8924  3.9780
 3.8900  4.0661  4.0770  ...   4.0577  4.0544  3.8594
 3.9607  3.9651  3.8803  ...   3.8963  4.0381  4.0798
          ...             ⋱             ...          
 3.9333  4.0308  3.9659  ...   4.1081  4.0074  3.9999
 3.7908  3.9123  4.1663  ...   3.8650  3.9431  4.0505
 3.9692  3.9894  4.0286  ...   4.0498  4.0669  4.1005
[torch.FloatTensor of size 32x250]

Variable containing:
1.00000e-03 *
 3.8633  3.9326  3.9281  ...   3.9944  4.0016  3.8890
 3.9752  3.9891  4.1079  ...   4.0001  3.9269  3.9458
 3.9494  3.9447  4.0215  ...   4.0203  4.1238  4.0914
          ...             ⋱             ...          
 4.0060  4.0729  4.0784  ...   4.0225  3.9789  3.9763
 3.9401  3.8875  4.1226  ...   3.9323  3.9203  4.0247
 3.9129  3.9043  3.8974  ...   3.8401  3.8216  3.9164
[torch.FloatTensor of size 32x250]

Variable containing:
   45    73
   59   194
   45    45
   93    93
   95   173
  100   139
  145   2