In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np

In [14]:
class CNN(nn.Module):
    """
    Convolutional encoder: 1-D convolution along the sequence axis, max-over-time
    pooling, optional dropout, then a linear projection.

    Parameters
    ----------
    vocab_size : ``int``
        Accepted for interface compatibility only; this module does not use it
        (it operates on already-embedded input).
    embedding_dim : ``int``
        Size of the last dimension of the input, used as the number of input
        channels of the convolution.
    out_channels : ``int``, optional (default=``100``)
        Number of convolution filters; also the width of the returned vectors.
    kernel_size : ``int``, optional (default=``5``)
        Width of the convolution window. The input sequence must be at least
        this long.
    dropout : ``float``, optional (default=``0.0``)
        Dropout probability applied to the pooled features before the linear
        layer. The default of ``0.0`` leaves the features untouched.
    """
    def __init__(self, vocab_size, embedding_dim, out_channels=100, kernel_size=5, dropout=0.0):
        super(CNN, self).__init__()

        self.conv = nn.Conv1d(in_channels=embedding_dim,
                              out_channels=out_channels,
                              kernel_size=kernel_size)
        # nn.Dropout has no parameters, so adding it does not change how the
        # conv / fc weights are initialised.
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(out_channels, out_channels)

    def forward(self, x):
        """
        Encode a batch of embedded sequences.

        ``x`` is expected as ``(batch, seq_len, embedding_dim)``; the result has
        shape ``(batch, out_channels)``.
        """
        # Conv1d wants (N, C, L), so move the embedding axis in front of the sequence axis.
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        # Max-over-time: pool across the whole remaining sequence length.
        x = F.max_pool1d(x, x.size(2))
        x = x.squeeze(2)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [15]:
# Disabled smoke test for CNN. It is wrapped in a string literal so the cell only
# displays the text instead of executing it.
# NOTE(review): x here is a 2-D tensor of integer ids, while CNN.forward permutes
# three dimensions, so the ids would need to be embedded before this could run.
"""x = Variable(torch.LongTensor(np.random.randint(0,50,size=(32,10))))
#50 is vocab size, 64 is embedding size
cnn = CNN(50, 64)
cnn(x)"""

'x = Variable(torch.LongTensor(np.random.randint(0,50,size=(32,10))))\n#50 is vocab size, 64 is embedding size\ncnn = CNN(50, 64)\ncnn(x)'

In [16]:
class LSTM(nn.Module):
    """
    Bidirectional LSTM encoder that takes batch-first input and returns
    sequence-first output.

    Parameters
    ----------
    embedding_dim : ``int``
        Size of each input vector.
    hidden_dim : ``int``
        Hidden size per direction; the output width is ``2 * hidden_dim``.
    num_layers : ``int``, optional (default=``1``)
        Number of stacked recurrent layers.
    dropout : ``float``, optional (default=``0.2``)
        Dropout probability between stacked layers. ``nn.LSTM`` applies dropout
        after every layer except the last, so with a single layer it has no
        effect; in that case it is not forwarded, which avoids the warning
        ``nn.LSTM`` emits for a non-zero dropout on a one-layer network.
    """
    def __init__(self, embedding_dim, hidden_dim, num_layers=1, dropout=0.2):
        super(LSTM, self).__init__()

        # Only meaningful when there is a "next layer" to apply dropout in front of.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            bidirectional=True,
                            dropout=inter_layer_dropout)

    def forward(self, x):
        """
        Run the LSTM over ``x`` of shape ``(batch, seq, embedding_dim)`` and
        return the per-step outputs of shape ``(seq, batch, 2 * hidden_dim)``.
        """
        # nn.LSTM defaults to sequence-first input: [b, s, e] -> [s, b, e]
        x = x.permute(1, 0, 2)
        # The final hidden / cell states are not needed here.
        outputs, _ = self.lstm(x)
        return outputs

In [17]:
class Highway(torch.nn.Module):
    """
    A `Highway layer <https://arxiv.org/abs/1505.00387>`_ does a gated combination of a linear
    transformation and a non-linear transformation of its input.  :math:`y = g * x + (1 - g) *
    f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise
    non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`.
    This module will apply a fixed number of highway layers to its input, returning the final
    result.
    Parameters
    ----------
    embedding_dim : ``int``
        The dimensionality of :math:`x`.  The input may have any number of leading
        dimensions, e.g. ``(batch_size, embedding_dim)`` or
        ``(batch_size, seq_len, embedding_dim)``; only the last one is transformed.
    num_layers : ``int``, optional (default=``1``)
        The number of highway layers to apply to the input.
    activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``)
        The non-linearity to use in the highway layers.
    """
    def __init__(self,
                 embedding_dim,
                 num_layers = 1,
                 activation = torch.nn.functional.relu):
        super(Highway, self).__init__()
        self._embedding_dim = embedding_dim
        # Each Linear produces both halves at once: [A(x), B(x)].
        self._layers = torch.nn.ModuleList([torch.nn.Linear(embedding_dim, embedding_dim * 2)
                                            for _ in range(num_layers)])
        self._activation = activation

        for layer in self._layers:
            # We should bias the highway layer to just carry its input forward.  We do that by
            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
            # be high, so we will carry the input forward.  The bias on `B(x)` is the second half
            # of the bias vector in each Linear layer.
            layer.bias[embedding_dim:].data.fill_(1)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
        """
        Apply every highway layer in turn and return a tensor with the same shape as ``inputs``.
        """
        dim = self._embedding_dim
        current_input = inputs
        for layer in self._layers:
            projected_input = layer(current_input)
            linear_part = current_input
            # NOTE: if you modify this, think about whether you should modify the initialization
            # above, too.
            # Split on the LAST dimension so inputs with extra leading dimensions
            # (e.g. a sequence axis) are handled, not just 2-D batches.
            nonlinear_part = projected_input[..., :dim]
            gate = projected_input[..., dim:2 * dim]
            nonlinear_part = self._activation(nonlinear_part)
            gate = torch.sigmoid(gate)
            current_input = gate * linear_part + (1 - gate) * nonlinear_part
        return current_input

In [18]:
class Embedding(nn.Module):
    """
    Lookup table that turns integer ids into dense vectors.

    Meant to be instantiated twice: once for the character vocabulary and once
    for the word vocabulary.
    """
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        # One row of size `embedding_dim` per id in the vocabulary.
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)

    def forward(self, x):
        """Return the vectors for the ids in ``x`` (one extra trailing dimension)."""
        vectors = self.embedding(x)
        return vectors

In [19]:
# Scratch input: 32 random integer ids drawn from [0, 50), as a 1-D LongTensor.
# (In the original note, 50 is the vocab size and 64 the embedding size.)
x = Variable(
    torch.LongTensor(
        np.random.randint(low=0, high=50, size=32)
    )
)
"""
TODO THIS ONLY WORKS 1 WAY, EITHER MAKE BIDIRECTIONAL ORRRRRRR HAVE AN INPUT WITH THE WORDS REVERSED
"""

# Disabled Highway experiment, kept for reference only:
#highway = Highway(50, 200, 2)
#highway(x)

'\nTODO THIS ONLY WORKS 1 WAY, EITHER MAKE BIDIRECTIONAL ORRRRRRR HAVE AN INPUT WITH THE WORDS REVERSED\n'

In [23]:
# Shape walkthrough for the inputs of the attention-flow layer: random ids are pushed
# through embedding -> char CNN -> highway -> bidirectional LSTM, first for the context
# and then for the query, printing tensor shapes along the way.

# ---- sizes used throughout this cell ----
word_embedding_dim = 100
char_embedding_dim = 16
n_words = 1000 #vocab size, not length
n_chars = 256 #vocab size, not length
batch_size = 32
context_max_word_len = 250 #max words in context
context_max_char_len = 10 #max characters per word
query_max_word_len = 15 #max words in query
query_max_char_len = 10 #max characters per word; should be the same as context_max_char_len

# Random integer ids standing in for real data.
# Word tensors are (batch_size, max_word_len). Char tensors are (batch_size, max_char_len),
# i.e. the characters of ONE word per example rather than of every word.
context_words = Variable(torch.LongTensor(np.random.randint(0,n_words,size=(batch_size,context_max_word_len))))
context_chars = Variable(torch.LongTensor(np.random.randint(0,n_chars,size=(batch_size,context_max_char_len))))

query_words = Variable(torch.LongTensor(np.random.randint(0,n_words,size=(batch_size,query_max_word_len))))
query_chars = Variable(torch.LongTensor(np.random.randint(0,n_chars,size=(batch_size,query_max_char_len))))

"""
BEGIN CONTEXT TO ATTENTION FLOW LAYER INPUTS
x_T to h_T
"""

# These two embedding modules are reused for the query further down, so context and
# query share the same embedding weights.
word_embedding = Embedding(n_words, word_embedding_dim) #instantiate word -> vectors module
char_embedding = Embedding(n_chars, char_embedding_dim) #instantiate char -> vectors module

c_embedded_word = word_embedding(context_words) #input words, get out vectors
c_embedded_char = char_embedding(context_chars) #input char, get out vector

# Also reused for the query below.
char_cnn = CNN(n_chars, char_embedding_dim) #instantiate chars -> vector

c_cnn_embedded_char = char_cnn(c_embedded_char) #input chars (of single word), get out vector

#this is the cnn "encoding" of a single word; a loop is needed to get a (32, 100) for every word
#every word needs to be broken down into characters
#need to loop over the c_cnn_embedded_char = char_cnn(...) line

print('word.shape',c_embedded_word.shape)
print('char.shape',c_embedded_char.shape)
print('char_cnned.shape',c_cnn_embedded_char.shape) 
 
#for now, use this to pretend we have one char_cnned per word:
#the single vector is broadcast along a new word axis (dim 1).
#NOTE: the last expand size is word_embedding_dim; that lines up with the CNN output only
#because both are 100 here (see the printed char_cnned.shape).
c_cnn_embedded_chars = c_cnn_embedded_char.unsqueeze(1).expand(batch_size, context_max_word_len, word_embedding_dim) 
  
print('c_cnn_embedded_char after expand', c_cnn_embedded_chars.shape) 
    
# Sanity check: both halves must have identical shapes before they are concatenated.
assert c_embedded_word.shape == c_cnn_embedded_chars.shape

#MUST BE THE SAME BEFORE GOING THROUGH HIGHWAY LAYER

# The highway input is the concatenation [char CNN vector ; word vector], hence 2x the width.
highway = Highway(word_embedding_dim*2, num_layers=2) #instantiate word_emb + char_emb (from CNN)

c_highway_input = torch.cat((c_cnn_embedded_chars, c_embedded_word), dim=2)

print('highway_input.shape',c_highway_input.shape)

print('highway_input.shape (single)',c_highway_input[:,0,:].shape)

# Only the first word position ([:,0,:]) is passed through the highway network here.
c_embedded = highway(c_highway_input[:,0,:])

print('embedded.shape',c_embedded.shape) 

#this is output for one word + char combination, need to do for all, but for now lets expand!

c_embeddeds = c_embedded.unsqueeze(1).expand(batch_size, context_max_word_len, word_embedding_dim*2)

print('embedded.shape after expand', c_embeddeds.shape)

# Bidirectional LSTM over the sequence; reused for the query below.
phrase_layer = LSTM(word_embedding_dim*2, word_embedding_dim)

c_embedded_phrase = phrase_layer(c_embeddeds)

# Printed shape is sequence-first: (context_max_word_len, batch_size, 2 * word_embedding_dim).
print('c_embedded_phrase.shape',c_embedded_phrase.shape)

"""
BEGIN QUERY TO ATTENTION FLOW LAYER INPUTS
q_J to u_J
"""

# Same pipeline as for the context, using the SAME module instances
# (word_embedding, char_embedding, char_cnn, highway, phrase_layer).
q_embedded_word = word_embedding(query_words) #input words, get out vectors
q_embedded_char = char_embedding(query_chars) #input char, get out vector

q_cnn_embedded_char = char_cnn(q_embedded_char) #input chars (of single word), get out vector

#HACK FOR NOW: broadcast the single char-CNN vector to every query word position
q_cnn_embedded_chars = q_cnn_embedded_char.unsqueeze(1).expand(batch_size, query_max_word_len, word_embedding_dim) 

q_highway_input = torch.cat((q_cnn_embedded_chars, q_embedded_word), dim=2)

# As for the context, only the first word position goes through the highway network.
q_embedded = highway(q_highway_input[:,0,:])

#HACK FOR NOW: broadcast the single highway output to every query word position
q_embeddeds = q_embedded.unsqueeze(1).expand(batch_size, query_max_word_len, word_embedding_dim*2)

q_embedded_phrase = phrase_layer(q_embeddeds)

print('q_embedded_phrase.shape',q_embedded_phrase.shape)

# References for the attention / similarity step:
#https://github.com/allenai/allennlp/blob/master/allennlp/modules/similarity_functions/linear.py
#https://github.com/allenai/allennlp/blob/master/allennlp/modules/matrix_attention.py
#allennlp/training_config/bidaf.json

word.shape torch.Size([32, 250, 100])
char.shape torch.Size([32, 10, 16])
char_cnned.shape torch.Size([32, 100])
c_cnn_embedded_char after expand torch.Size([32, 250, 100])
highway_input.shape torch.Size([32, 250, 200])
highway_input.shape (single) torch.Size([32, 200])
embedded.shape torch.Size([32, 200])
embedded.shape after expand torch.Size([32, 250, 200])
c_embedded_phrase.shape torch.Size([250, 32, 200])
q_embedded_phrase.shape torch.Size([15, 32, 200])


In [None]:
class BiDAF(nn.Module):
    """
    Container for the sub-modules of a BiDAF-style model.

    This class only stores its sub-modules; it does not define ``forward`` yet,
    so calling an instance falls through to ``nn.Module``'s default.

    Parameters
    ----------
    char_cnn_model_c, word_lstm_model_c : ``nn.Module``
        Character-level and word-level encoders for the context.
    char_cnn_model_q, word_lstm_model_q : ``nn.Module``
        Character-level and word-level encoders for the query.
    phrase_lstm_model : ``nn.Module``
        Phrase-level encoder.
    modeling_lstm_model : ``nn.Module``
        Modeling-layer encoder.
    span_input_dim : ``int``, optional (default=``None``)
        Input width of the span-prediction heads.
    max_len : ``int``, optional (default=``None``)
        Output width of the span-prediction heads.
        When either ``span_input_dim`` or ``max_len`` is ``None`` the heads are
        left as ``None`` and have to be attached later.
    """
    def __init__(self,
                 char_cnn_model_c,
                 word_lstm_model_c,
                 char_cnn_model_q,
                 word_lstm_model_q,
                 phrase_lstm_model,
                 modeling_lstm_model,
                 span_input_dim=None,
                 max_len=None):

        super(BiDAF, self).__init__()

        # Assigning nn.Module instances as attributes registers them as children.
        self.char_cnn_model_c = char_cnn_model_c
        self.word_lstm_model_c = word_lstm_model_c
        self.char_cnn_model_q = char_cnn_model_q
        self.word_lstm_model_q = word_lstm_model_q
        self.phrase_lstm_model = phrase_lstm_model
        self.modeling_lstm_model = modeling_lstm_model

        if span_input_dim is None or max_len is None:
            # Sizes not decided yet: keep the attributes present but empty.
            self.start_span = None
            self.end_span = None
        else:
            self.start_span = nn.Linear(span_input_dim, max_len)
            # NOTE(review): the end head is assumed to mirror the start head -- TODO confirm.
            self.end_span = nn.Linear(span_input_dim, max_len)

In [None]:
# Build the six sub-modules and hand them to BiDAF in the order its constructor declares.
# CNN takes (vocab_size, embedding_dim) and LSTM takes (embedding_dim, hidden_dim); the
# sizes come from the configuration constants defined in the shape-walkthrough cell.
char_cnn_model_c = CNN(n_chars, char_embedding_dim)
char_cnn_model_q = CNN(n_chars, char_embedding_dim)

# NOTE(review): the word-level and modeling LSTM sizes below are placeholders chosen to be
# consistent with word_embedding_dim -- TODO confirm once the full model is wired up.
word_lstm_model_c = LSTM(word_embedding_dim, word_embedding_dim)
word_lstm_model_q = LSTM(word_embedding_dim, word_embedding_dim)
# Same sizes as the phrase layer used in the shape walkthrough.
phrase_lstm_model = LSTM(word_embedding_dim * 2, word_embedding_dim)
modeling_lstm_model = LSTM(word_embedding_dim * 8, word_embedding_dim)

bidaf = BiDAF(char_cnn_model_c,
              word_lstm_model_c,
              char_cnn_model_q,
              word_lstm_model_q,
              phrase_lstm_model,
              modeling_lstm_model)

In [None]:
# Random integer-id batches for trying things out: 32 examples each, ids drawn from [0, 50).
# The context is 100 ids long and the query 10 ids long.
context = torch.LongTensor(
    np.random.randint(low=0, high=50, size=(32, 100))
)
query = torch.LongTensor(
    np.random.randint(low=0, high=50, size=(32, 10))
)