In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from IPython.display import display
import pandas as pd
import h5py
import os
# to load data from the test file
from utils.create_datasets import SumDatasets
# Directory that holds the dataset files loaded below (relative to the notebook's
# working directory); the loaded examples are also used to experiment with torch.
data_dir = '../data/'
from utils import config
import pickle
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

  from ._conv import register_converters as _register_converters


In [2]:
# Build the dataset object from the HDF5 feature file stored under `data_dir`.
sum_dataset = SumDatasets(os.path.join(data_dir, 'features-600-40_v2.hdf5'))

In [5]:
# Slice a small batch out of the dataset; the slice unpacks into four values.
# features_1 and features_2 are passed to the encoder below as its `X` and `seq_lens` arguments.
features_1, features_2, features_3, features_4 = sum_dataset[0:10] # a batch of 10 items (the encoder output below has batch size 10), not 20.

## Given the data, build a very naive summarization model
* First, we compute the hidden state at every time step plus a final state (the cell state and hidden state of the last time step); that final state is then fed into the network to initialize and run the decoder.

In [34]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder of the summarization model.

    Token ids are embedded, run through a bidirectional LSTM, and the final
    forward/backward states are projected back down to ``config.hidden_dim``
    so they can initialize a unidirectional decoder.
    """

    def __init__(self):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(config.NUM_WORDS + 2, config.embedding_dim)
        self.rnn = nn.LSTM(config.embedding_dim, config.hidden_dim, bidirectional=True)
        # One shared projection merges the two directions for both final states.
        self.reduce_ = nn.Linear(2 * config.hidden_dim, config.hidden_dim)

    def _merge_directions(self, state, n_rows):
        """Turn a (directions, batch, hidden) state into a projected (batch, hidden) one."""
        # The LSTM hands back its final states direction-first; go batch-first
        # and lay the two directions side by side before projecting.
        batch_major = state.transpose(0, 1).contiguous()
        return self.reduce_(batch_major.view(n_rows, -1))

    def forward(self, X, seq_lens):
        """Encode a padded batch and return per-step outputs plus reduced final states.

        The per-step outputs are kept for later use when computing the attention
        matrix for each decoder input step.

        Parameters
        ----------
        X : torch tensor of token ids, batch * MAX_STEP
        seq_lens : real (unpadded) length of every row of ``X``, in descending order

        Returns
        -------
        outputs : batch * longest_sequence_in_batch * (2 * hidden_dim) LSTM outputs
        first state : batch * hidden_dim, reduced ``h_n`` (final hidden state)
        second state : batch * hidden_dim, reduced ``c_n`` (final cell state)
        """
        embedded = self.embed(X)
        n_rows = embedded.size(0)

        # Packing lets the LSTM skip the padded positions of shorter sequences.
        packed_in = pack_padded_sequence(embedded, seq_lens, batch_first=True)
        packed_out, (h_n, c_n) = self.rnn(packed_in)
        outputs, _ = pad_packed_sequence(packed_out, batch_first=True)

        reduced_h = self._merge_directions(h_n, n_rows)
        reduced_c = self._merge_directions(c_n, n_rows)
        return outputs, reduced_h, reduced_c

In [44]:
encoder = Encoder()
outputs, hidden_c, hidden_s = encoder(features_1, features_2)
# The encoder returns (batch, hidden_dim) states. Add the leading axis that
# nn.LSTM expects for its initial states explicitly and out-of-place, instead
# of mutating the tensors in place as a side effect of display().
hidden_c = hidden_c.unsqueeze(0)
hidden_s = hidden_s.unsqueeze(0)
display(hidden_c.shape)
display(hidden_s.shape)


torch.Size([1, 10, 50])

torch.Size([1, 10, 50])

In [95]:
class Decoder(nn.Module):
    """Unidirectional LSTM decoder producing a per-step vocabulary distribution.

    NOTE(review): the embedding table has ``config.NUM_WORDS + 4`` rows while the
    output layer only scores ``config.NUM_WORDS`` classes — confirm that the
    extra ids are never needed as prediction targets.
    """

    def __init__(self):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(config.NUM_WORDS+4, config.embedding_dim)
        self.rnn = nn.LSTM(config.embedding_dim, config.hidden_dim)
        self.logits = nn.Linear(config.hidden_dim, config.NUM_WORDS, bias=False)

    def forward(self, X, hidden_c, hidden_s):
        """Run the decoder over a whole (teacher-forced) input sequence.

        The encoder's final states initialize the decoder LSTM.
        TODO: how do we tell the difference between training, teacher forcing
        and evaluation?

        Parameters
        ----------
        X : torch tensor of token ids, batch * steps
        hidden_c : first element of the LSTM initial-state tuple (``h_0``);
            either batch * hidden_dim or 1 * batch * hidden_dim
        hidden_s : second element of the LSTM initial-state tuple (``c_0``);
            same accepted shapes as ``hidden_c``

        Returns
        -------
        Tensor of shape (batch * steps) * config.NUM_WORDS holding softmax
        probabilities (not raw logits); rows are ordered batch-major.
        """
        X = self.embed(X)

        # Encoder.forward returns 2-D (batch, hidden_dim) states, but nn.LSTM
        # needs a leading (num_layers * num_directions) axis. Add it here so
        # callers do not have to reshape; 3-D states pass through unchanged.
        if hidden_c.dim() == 2:
            hidden_c = hidden_c.unsqueeze(0)
        if hidden_s.dim() == 2:
            hidden_s = hidden_s.unsqueeze(0)

        # The LSTM was built without batch_first, so feed it sequence-first input.
        outputs, _ = self.rnn(X.permute([1, 0, 2]), (hidden_c, hidden_s))
        outputs = F.softmax(self.logits(outputs), dim=2)
        # Back to batch-first, then flatten batch and step axes into rows.
        outputs = outputs.permute([1, 0, 2]).contiguous().view(-1, config.NUM_WORDS)
        return outputs