In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
plt.rcParams['font.size'] = 14
plt.rcParams['axes.grid'] = True

### Shakespeare Dataset

In [10]:
!wget https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt sonnets.txt

--2020-07-01 11:41:57--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.76.128, 66.102.1.128, 64.233.167.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.76.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘sonnets.txt.1’


2020-07-01 11:41:57 (73.6 MB/s) - ‘sonnets.txt.1’ saved [93578/93578]

--2020-07-01 11:41:57--  http://sonnets.txt/
Resolving sonnets.txt (sonnets.txt)... failed: Name or service not known.
wget: unable to resolve host address ‘sonnets.txt’
FINISHED --2020-07-01 11:41:57--
Total wall clock time: 0.03s
Downloaded: 1 files, 91K in 0.001s (73.6 MB/s)


### Data Pre-processing

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
data = open('sonnets.txt').read()
corpus = data.lower().split('\n')
corpus

['from fairest creatures we desire increase,',
 "that thereby beauty's rose might never die,",
 'but as the riper should by time decease,',
 'his tender heir might bear his memory:',
 'but thou, contracted to thine own bright eyes,',
 "feed'st thy light'st flame with self-substantial fuel,",
 'making a famine where abundance lies,',
 'thyself thy foe, to thy sweet self too cruel.',
 "thou that art now the world's fresh ornament",
 'and only herald to the gaudy spring,',
 'within thine own bud buriest thy content',
 'and, tender churl, makest waste in niggarding.',
 'pity the world, or else this glutton be,',
 "to eat the world's due, by the grave and thee.",
 'when forty winters shall beseige thy brow,',
 "and dig deep trenches in thy beauty's field,",
 "thy youth's proud livery, so gazed on now,",
 "will be a tatter'd weed, of small worth held:",
 "then being ask'd where all thy beauty lies,",
 'where all the treasure of thy lusty days,',
 'to say, within thine own deep-sunken eyes,',

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

3211


In [15]:
tokenizer.texts_to_sequences(['without all ornament, itself and true,'])[0]

[266, 23, 419, 332, 1, 75]

In [17]:
input_sequences = []
tk = tokenizer.texts_to_sequences(['without all ornament, itself and true,'])[0]
for i in range(1,len(tk)):
  n_gram_sequence = tk[:i+1]
  input_sequences.append(n_gram_sequence)
print(input_sequences)

[[266, 23], [266, 23, 419], [266, 23, 419, 332], [266, 23, 419, 332, 1], [266, 23, 419, 332, 1, 75]]


In [18]:
input_sequences = []
for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)
print(len(input_sequences))

15462


In [19]:
max_sequences_len = max([len(seq) for seq in input_sequences])
print(max_sequences_len)
input_sequences = np.array(pad_sequences(input_sequences, padding='pre', maxlen=max_sequences_len))

11


In [20]:
input_sequences[31]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0, 1372,    9],
      dtype=int32)

In [21]:
xs, labels = input_sequences[:,:-1], input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [22]:
ys.shape

(15462, 3211)

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, SimpleRNN, Embedding

In [26]:
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=256, input_length=max_sequences_len-1))
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(units=128)))
model.add(Dropout(0.1))
model.add(Dense(total_words, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 256)           822016    
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 1024)          3149824   
_________________________________________________________________
dropout (Dropout)            (None, 10, 1024)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               1180672   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 3211)              825227    
Total params: 5,977,739
Trainable params: 5,977,739
Non-trainable params: 0
____________________________________________

In [27]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(xs, ys, epochs=125, batch_size=128)

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fa9702a1c88>

In [28]:
model.save('shakespeare_model.h5')

In [29]:
from tensorflow.keras.models import load_model

In [30]:
model_final = load_model('shakespeare_model.h5')

#### Predict next word

In [37]:
def predict_next_words(seed_text, num_words_to_generate):
  for _ in range(num_words_to_generate):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequences_len-1, padding='pre')
    predicted = model_final.predict_classes(token_list, verbose=0)
    output_word=""
    for word, index in tokenizer.word_index.items():
      if index == predicted:
        output_word = word
        break
    seed_text += " " + output_word

  print(seed_text)
  return seed_text

In [38]:
seed_text = "from art comes thou"
next_words = 100
generated_text = predict_next_words(seed_text, next_words)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
from art comes thou be the muses filed mad fitted smells 'will weep can their end ' rolling growth erred erred woe of nought worse lines me see well ' thee told hid tell thee thee to me free thought thee bright see told bright thee thou art true write me to me thought bright bright thee thee thou art free thought thee not bright so thee erred lack erred woe of you ' thee thee erred doth sort spend rolling free sheds sheds sheds bright so rage see not thee thou sorrow cross fall to thee you thee ' thee well ' thee


In [39]:
seed_text = "at the behest of the benevolent king"
next_words = 9
generated_text = predict_next_words(seed_text, next_words)

at the behest of the benevolent king so gentlest rose days erred level live arising cold


In [40]:
seed_text = "and the emperor said"
next_words = 9
generated_text = predict_next_words(seed_text, next_words)

and the emperor said i condemned which is in this date chest pace
