In [1]:
import numpy as np
import os
import sys

In [2]:
from keras.layers import Input, Embedding, LSTM, RepeatVector, concatenate, Dense, TimeDistributed
from keras import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '')
TEXT_DATA_DIR = os.path.join(BASE_DIR, './data/')
SUM_DATA_DIR = os.path.join(BASE_DIR, './summary/')
MAX_SEQUENCE_LENGTH = 7700
MAX_SUM_LENGTH = 100
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.0
texts = []
texts_sum = []
data = ()
y = ()
sequences = ()

In [8]:
def load_embed_weights():
    
    print('Indexing word vectors.')
    
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
            
    print('Found %s word vectors.' % len(embeddings_index))
    print('Processing text dataset')
    
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        fpath = os.path.join(TEXT_DATA_DIR, name)
        spath = os.path.join(SUM_DATA_DIR, name)
        args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
        with open(fpath, **args) as f:
            t = f.read()
            texts.append(t)
        with open(spath, **args) as f:
            t = f.read()
            texts_sum.append(t)
    
    print('Found %s texts.' % len(texts))
    
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    
    tokenizer2 = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer2.fit_on_texts(texts_sum)
    sequences2 = tokenizer2.texts_to_sequences(texts_sum)
    # print(sequences[1:20])
    word_index = tokenizer.word_index
    
    print('Found %s unique tokens.' % len(word_index))
    
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    y = pad_sequences(sequences2, maxlen=MAX_SUM_LENGTH)
    print('Shape of data tensor:', data.shape)
    print('Shape of summary tensor:', y.shape)
    
    print('Preparing embedding matrix.')

    # prepare embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
    
    # print(embedding_layer)
    
    return (data, embedding_layer, y)

In [9]:
data, embedding, y = load_embed_weights()

Indexing word vectors.
Found 400000 word vectors.
Processing text dataset
Found 2963 texts.
Found 62555 unique tokens.
Shape of data tensor: (2963, 7700)
Shape of summary tensor: (2963, 100)
Preparing embedding matrix.


In [11]:
sum_txt_length = 100
vocab_size = 20

In [12]:
def build_model():
    
    inputs1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    article1 = embedding(inputs1)
    article2 = LSTM(128)(article1)
    article3 = RepeatVector(sum_txt_length)(article2)
    
    # summary input model
    inputs2 = Input(shape=(sum_txt_length,))
    summ1 = Embedding(vocab_size, 128)(inputs2)
    
    # decoder model
    decoder1 = concatenate([article3, summ1])
    decoder2 = LSTM(128)(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2))
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    return model

In [13]:
model = build_model()
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
keep_dims is deprecated, use keepdims instead


TypeError: Output tensors to a Model must be Keras tensors. Found: <keras.layers.wrappers.TimeDistributed object at 0x7fc5d47ec5c0>

In [12]:
data.shape

(2963, 7700)

In [22]:
summary = np.zeros(shape=(data.shape[0],sum_txt_length),dtype=np.int32)

In [29]:
summary.shape

(2963, 100)

In [30]:
for i in range(1):
    model.fit([data, summary],y,batch_size=100, epochs=1)

ValueError: The model expects 2 input arrays, but only received one array. Found: array with shape (2963, 7700)