In [5]:
from nltk.corpus import gutenberg as gb
from numpy import array, asarray, zeros, append, ceil
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical
from keras.constraints import maxnorm
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Bidirectional, Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Flatten
from keras import backend as K
from itertools import chain

In [6]:
# Partition the Gutenberg file ids into 60% / 20% / 20% train / dev / test
# slices, in the order the corpus reader returns them.
temp = gb.fileids()
temp_length = len(temp)
_dev_start = int(ceil(0.6 * temp_length))
_test_start = int(ceil(0.8 * temp_length))
train_set = temp[:_dev_start]
dev_set = temp[_dev_start:_test_start]
test_set = temp[_test_start:]

# Only this single book is used as training text below.
gb_sent_train = gb.sents('burgess-busterbrown.txt')

# Tokens dropped from every sentence: punctuation, quote marks, a few symbols
# and the bare token "s". Matching happens BEFORE lower-casing.
_dropped_tokens = frozenset((
    "``", "''", ".", "[", "]", ",", ":", ";", "--", "-", "?", "-",
    "`", "'", "s", "@", "#", "$", "%", "(", ")", "{", "}", "\"",
))

# Each cleaned sentence is lower-cased and wrapped in <s> ... </s> markers.
gb_sents_train = [
    ['<s>'] + [tok.lower() for tok in sentence if tok not in _dropped_tokens] + ['</s>']
    for sentence in gb_sent_train
]

In [64]:
# Length of every training window: the first max_length-1 token ids are the
# model input and the final id is the prediction target.
max_length = 10

# Fit the vocabulary on the cleaned sentences (each one is a list of tokens).
t = Tokenizer(oov_token='<unk>', filters='!"#$%&(),*+-./:;<=>?@[\\]^_`{|}~\t\r\n\'')
t.fit_on_texts(gb_sents_train)

# One more than the number of known words, so that id 0 (used as the padding
# value below) also has a slot in the embedding / output layers.
vocab_size = len(t.word_index) + 1
print('vocab size: ', vocab_size)
encoded_docs = t.texts_to_sequences(gb_sents_train)
# Concatenate all sentences into one id stream; windows therefore run across
# sentence boundaries (the <s> / </s> markers stay in the stream).
encoded_docs2 = list(chain.from_iterable(encoded_docs))
print(encoded_docs[:2])

# One window per start position. The last max_length-1 windows run off the end
# of the stream and are shorter; pad_sequences left-pads them so the target
# token stays in the final column.
sequences = [encoded_docs2[i:i + max_length] for i in range(len(encoded_docs2))]
print('Total Sequences: %d' % len(sequences))
# Token ids are integers, so keep an integer dtype rather than float32.
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre', truncating='post', dtype='int32')
print('Max Sequence Length: %d' % max_length)
# split into input and output elements
X, y = sequences[:, :-1], sequences[:, -1]
# One-hot encode the target id over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)

# Load pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line is parsed as whitespace-separated fields: the first field is the
# word, every remaining field is one coefficient of its vector.
embeddings_index = dict()
# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse, and values[0] would raise IndexError.
            continue
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

vocab size:  1552
[[1, 4, 567, 7, 12, 21, 107, 929, 930, 931, 932, 2], [1, 19, 2]]
Total Sequences: 18565
Max Sequence Length: 10
Loaded 400000 word vectors.


In [65]:
# Initial weights for the embedding layer: one 100-wide row per vocabulary id.
# A row is filled with the GloVe vector of the word that owns that id; rows
# that are never assigned (words missing from GloVe) stay all-zero.
embedding_matrix = zeros((vocab_size, 100))
for token, row in t.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [66]:
# Next-token classifier: embedding -> dropout -> bidirectional LSTM -> softmax
# over the vocabulary.
model = Sequential()

# Embedding initialised from embedding_matrix and fine-tuned during training.
# The width is read from the matrix so the two cannot drift apart; the input
# length is max_length-1 because the last token of each window is the target.
e = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
              input_length=max_length-1, trainable=True)
model.add(e)
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, activation='tanh', unroll=True, kernel_constraint=maxnorm(3))))
# One output unit per vocabulary id; softmax yields a distribution over them.
model.add(Dense(vocab_size, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 9, 100)            155200    
_________________________________________________________________
dropout_8 (Dropout)          (None, 9, 100)            0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 100)               60400     
_________________________________________________________________
dense_8 (Dense)              (None, 1552)              156752    
Total params: 372,352
Trainable params: 372,352
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
# Train on the full set of windows. No validation data is passed, so the
# loss / acc printed per epoch are measured on the training data only.
# The returned History is bound to a name so the per-epoch metrics remain
# available to later cells and its repr is not echoed as the cell output.
history = model.fit(X, y, epochs=500, verbose=2)

Epoch 1/500
 - 28s - loss: 4.4809 - acc: 0.2237
Epoch 2/500
 - 28s - loss: 4.1817 - acc: 0.2462
Epoch 3/500
 - 28s - loss: 3.9334 - acc: 0.2640
Epoch 4/500
 - 28s - loss: 3.7216 - acc: 0.2787
Epoch 5/500
 - 28s - loss: 3.5370 - acc: 0.2940
Epoch 6/500
 - 28s - loss: 3.3742 - acc: 0.3061
Epoch 7/500
 - 28s - loss: 3.2181 - acc: 0.3216
Epoch 8/500
 - 28s - loss: 3.0795 - acc: 0.3362
Epoch 9/500
 - 28s - loss: 2.9486 - acc: 0.3515
Epoch 10/500
 - 28s - loss: 2.8363 - acc: 0.3697
Epoch 11/500
 - 28s - loss: 2.7224 - acc: 0.3885
Epoch 12/500
 - 28s - loss: 2.6181 - acc: 0.4085
Epoch 13/500
 - 28s - loss: 2.5241 - acc: 0.4284
Epoch 14/500
 - 28s - loss: 2.4357 - acc: 0.4453
Epoch 15/500
 - 28s - loss: 2.3523 - acc: 0.4586
Epoch 16/500
 - 28s - loss: 2.2752 - acc: 0.4766
Epoch 17/500
 - 28s - loss: 2.2057 - acc: 0.4849
Epoch 18/500
 - 28s - loss: 2.1388 - acc: 0.5008
Epoch 19/500
 - 28s - loss: 2.0715 - acc: 0.5166
Epoch 20/500
 - 28s - loss: 2.0103 - acc: 0.5242
Epoch 21/500
 - 28s - loss: 1

 - 27s - loss: 0.2468 - acc: 0.9297
Epoch 168/500
 - 27s - loss: 0.2536 - acc: 0.9277
Epoch 169/500
 - 28s - loss: 0.2556 - acc: 0.9266
Epoch 170/500
 - 27s - loss: 0.2447 - acc: 0.9321
Epoch 171/500
 - 27s - loss: 0.2441 - acc: 0.9308
Epoch 172/500
 - 27s - loss: 0.2410 - acc: 0.9323
Epoch 173/500
 - 27s - loss: 0.2371 - acc: 0.9328
Epoch 174/500
 - 27s - loss: 0.2326 - acc: 0.9348
Epoch 175/500
 - 27s - loss: 0.2363 - acc: 0.9335
Epoch 176/500
 - 27s - loss: 0.2370 - acc: 0.9346
Epoch 177/500
 - 27s - loss: 0.2296 - acc: 0.9346
Epoch 178/500
 - 27s - loss: 0.2258 - acc: 0.9369
Epoch 179/500
 - 27s - loss: 0.2324 - acc: 0.9340
Epoch 180/500
 - 28s - loss: 0.2292 - acc: 0.9362
Epoch 181/500
 - 27s - loss: 0.2327 - acc: 0.9352
Epoch 182/500
 - 27s - loss: 0.2195 - acc: 0.9382
Epoch 183/500
 - 27s - loss: 0.2324 - acc: 0.9341
Epoch 184/500
 - 27s - loss: 0.2262 - acc: 0.9350
Epoch 185/500
 - 27s - loss: 0.2153 - acc: 0.9403
Epoch 186/500
 - 27s - loss: 0.2209 - acc: 0.9380
Epoch 187/500


 - 30s - loss: 0.1424 - acc: 0.9558
Epoch 332/500
 - 29s - loss: 0.1424 - acc: 0.9583
Epoch 333/500
 - 29s - loss: 0.1292 - acc: 0.9617
Epoch 334/500
 - 30s - loss: 0.1364 - acc: 0.9585
Epoch 335/500
 - 32s - loss: 0.1426 - acc: 0.9578
Epoch 336/500
 - 29s - loss: 0.1384 - acc: 0.9594
Epoch 337/500
 - 28s - loss: 0.1400 - acc: 0.9592
Epoch 338/500
 - 28s - loss: 0.1338 - acc: 0.9610
Epoch 339/500
 - 28s - loss: 0.1422 - acc: 0.9571
Epoch 340/500
 - 28s - loss: 0.1404 - acc: 0.9590
Epoch 341/500
 - 28s - loss: 0.1367 - acc: 0.9604
Epoch 342/500
 - 28s - loss: 0.1329 - acc: 0.9607
Epoch 343/500
 - 28s - loss: 0.1318 - acc: 0.9590
Epoch 344/500
 - 28s - loss: 0.1416 - acc: 0.9574
Epoch 345/500
 - 28s - loss: 0.1349 - acc: 0.9593
Epoch 346/500
 - 28s - loss: 0.1324 - acc: 0.9607
Epoch 347/500
 - 28s - loss: 0.1356 - acc: 0.9585
Epoch 348/500
 - 28s - loss: 0.1355 - acc: 0.9595
Epoch 349/500
 - 28s - loss: 0.1273 - acc: 0.9617
Epoch 350/500
 - 28s - loss: 0.1278 - acc: 0.9626
Epoch 351/500


 - 29s - loss: 0.1033 - acc: 0.9661
Epoch 496/500
 - 32s - loss: 0.1121 - acc: 0.9676
Epoch 497/500
 - 30s - loss: 0.1102 - acc: 0.9651
Epoch 498/500
 - 30s - loss: 0.1073 - acc: 0.9656
Epoch 499/500
 - 29s - loss: 0.1086 - acc: 0.9667
Epoch 500/500
 - 31s - loss: 0.1089 - acc: 0.9671


<keras.callbacks.History at 0x25ad8599668>