In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Embedding

In [2]:
def load_organized_sequences(filename):
  file = open(filename, 'r')
  text =file.read()
  file.close()
  return text

In [3]:
filename = 'organized_sequences.txt'
doc = load_organized_sequences(filename)
lines = doc.split('\n')

In [4]:
len(set(" ".join(lines).split(" ")))

7478

In [5]:
lines[0:3]

['we have a long ride before us gared pointed out eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the',
 'have a long ride before us gared pointed out eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the barely',
 'a long ride before us gared pointed out eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the barely sup']

# Tokenize(encode) words in lines

In [6]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [7]:
tokenizer.word_index['the']

1

In [8]:
# Seeing word mappings to corresponding indexes
print(lines[0])
print(sequences[0])
print()
print('=========================')
print(lines[1])
print(sequences[1])

we have a long ride before us gared pointed out eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the
[79, 33, 3, 96, 275, 101, 154, 534, 795, 53, 1009, 365, 718, 1075, 2, 152, 25, 761, 58, 614, 537, 517, 21, 1, 536, 19, 7476, 11, 393, 17, 263, 137, 127, 56, 103, 51, 12, 7475, 69, 1, 236, 534, 39, 47, 76, 1, 4408, 126, 4407, 330, 1]

have a long ride before us gared pointed out eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the barely
[33, 3, 96, 275, 101, 154, 534, 795, 53, 1009, 365, 718, 1075, 2, 152, 25, 761, 58, 614, 537, 517, 21, 1, 536, 19, 7476, 11, 393, 17, 263, 137, 127, 56, 103, 51, 12, 7475, 69, 1, 236, 534, 39, 47, 76, 1, 4408, 126, 4407,

In [9]:
len(tokenizer.word_index)

7478

In [10]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'to': 4,
 'of': 5,
 'he': 6,
 'his': 7,
 'was': 8,
 'her': 9,
 'in': 10,
 'it': 11,
 'you': 12,
 'had': 13,
 'i': 14,
 'she': 15,
 'him': 16,
 'that': 17,
 'as': 18,
 'with': 19,
 'said': 20,
 'at': 21,
 'not': 22,
 'for': 23,
 'on': 24,
 'is': 25,
 'but': 26,
 'they': 27,
 'no': 28,
 'my': 29,
 'from': 30,
 'were': 31,
 'all': 32,
 'have': 33,
 'jon': 34,
 'would': 35,
 'be': 36,
 'them': 37,
 'lord': 38,
 'will': 39,
 'when': 40,
 'ned': 41,
 'your': 42,
 'there': 43,
 'so': 44,
 'me': 45,
 'up': 46,
 'could': 47,
 'if': 48,
 'one': 49,
 'bran': 50,
 'are': 51,
 'what': 52,
 'out': 53,
 'like': 54,
 'man': 55,
 'this': 56,
 'been': 57,
 'ser': 58,
 'their': 59,
 'eyes': 60,
 'did': 61,
 'arya': 62,
 'back': 63,
 'looked': 64,
 'hand': 65,
 'do': 66,
 'than': 67,
 'down': 68,
 'by': 69,
 'boy': 70,
 'now': 71,
 'then': 72,
 'king': 73,
 'never': 74,
 'told': 75,
 'see': 76,
 'an': 77,
 'tyrion': 78,
 'we': 79,
 'catelyn': 80,
 'sansa': 81,
 'only': 82,


In [11]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7479

# Separating Input and Output words

In [12]:
import numpy as np
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [13]:
# separate into input and output
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [14]:
X

array([[  79,   33,    3, ...,  126, 4407,  330],
       [  33,    3,   96, ..., 4407,  330,    1],
       [   3,   96,  275, ...,  330,    1, 1882],
       ...,
       [   1, 1161, 4298, ...,  762,   52, 1881],
       [1161, 4298,    2, ...,   52, 1881,   25],
       [4298,    2,  516, ..., 1881,   25,  456]])

In [15]:
y[0]

array([0., 1., 0., ..., 0., 0., 0.], dtype=float32)

In [16]:
y.shape

(108159, 7479)

# Fitting the model

In [17]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize defined model
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)


# model = Sequential()
# model.add(Embedding(vocab_size, 50, input_length=seq_length))
# model.add(Bidirectional(LSTM(512, kernel_initializer = 'he_uniform', activation='relu', return_sequences=True)))
# model.add(Bidirectional(LSTM(256, kernel_initializer = 'he_uniform', activation='relu', return_sequences=True)))
# model.add(Bidirectional(LSTM(128, kernel_initializer = 'he_uniform', activation='relu')))

# model.add(Dense(100, activation='relu'))
# model.add(Dense(vocab_size, activation='softmax'))
# # compile network
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# # summarize defined model
# model.summary()
# plot_model(model, to_file='../model.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            373950    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 7479)              755379    
                                                                 
Total params: 1,280,229
Trainable params: 1,280,229
Non-trainable params: 0
_________________________________________________________________
You must install pydot (`pip install pydot`) a

In [18]:
my_callbacks = [
    EarlyStopping(patience=2),
    ModelCheckpoint(filepath='../LSTM/Callbacks/model.{epoch:02d}-{val_loss:.2f}.h5'),
    TensorBoard(log_dir='../LSTM/logs'),
]

In [44]:
model.fit(X,y, epochs=250, batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x289098ef2e0>

In [55]:
# Saving the best model
import tensorflow
tensorflow.keras.models.save_model(model, '../Model/Word2Vec-Model-140-512.h5')

# Generate Text

In [45]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
  result = list()
  in_text = seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # truncate sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    # predict probabilities for each word
    yhat = model.predict(encoded, verbose=0)
    # map predicted word index to word
    #print(np.argmax(yhat))
    yhat = np.argmax(yhat)
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        #print(word)
        out_word = word
        break
    # append to input
    in_text += ' ' + out_word
    result.append(out_word)
  return ' '.join(result)

In [60]:
lines[10]

'eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the barely sup pressed anger in his eyes under the thick'

In [47]:
lines[101]

'something else in the older man you could taste it a nervous tension that came perilous close to fear will shared his unease he had been four years on the wall the first time he had been sent beyond all the old stories had come rushing back and his bowels had'

In [63]:
text = 'eight days maybe nine and night is falling ser waymar royce glanced at the sky with disinterest it does that every day about this time are you unmanned by the dark gared will could see the tightness around gareds mouth the barely sup pressed'
generated_seq = generate_seq(model, tokenizer, seq_length, text, n_words=20)
print(generated_seq)

anger in the swirling was what does you think about you to bring you a crown he said sharply neither
