In [22]:
import tensorflow as tf

In [23]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [10]:
text = ""
with open('shakespeare.txt','r') as file:
    text = file.read()

In [11]:
text = text.split('\n')

In [19]:
x = lambda a : a.strip() != ''

In [20]:
list(filter(x,text))

['THE SONNETS',
 'by William Shakespeare',
 'From fairest creatures we desire increase,',
 "That thereby beauty's rose might never die,",
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou contracted to thine own bright eyes,',
 "Feed'st thy light's flame with self-substantial fuel,",
 'Making a famine where abundance lies,',
 'Thy self thy foe, to thy sweet self too cruel:',
 "Thou that art now the world's fresh ornament,",
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 "And tender churl mak'st waste in niggarding:",
 'Pity the world, or else this glutton be,',
 "To eat the world's due, by the grave and thee.",
 'When forty winters shall besiege thy brow,',
 "And dig deep trenches in thy beauty's field,",
 "Thy youth's proud livery so gazed on now,",
 'Will be a tattered weed of small worth held:  ',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',
 'T

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [26]:
tokenizer = Tokenizer()

In [30]:
tokenizer.fit_on_texts(text)

In [32]:
len(tokenizer.word_index)

5358

In [34]:
input_sequences = []
for sentence in text:
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [35]:
input_sequences

[[2, 3530],
 [32, 3531],
 [32, 3531, 3532],
 [36, 414],
 [36, 414, 878],
 [36, 414, 878, 164],
 [36, 414, 878, 164, 216],
 [36, 414, 878, 164, 216, 509],
 [8, 879],
 [8, 879, 134],
 [8, 879, 134, 350],
 [8, 879, 134, 350, 101],
 [8, 879, 134, 350, 101, 156],
 [8, 879, 134, 350, 101, 156, 196],
 [19, 23],
 [19, 23, 2],
 [19, 23, 2, 880],
 [19, 23, 2, 880, 63],
 [19, 23, 2, 880, 63, 32],
 [19, 23, 2, 880, 63, 32, 51],
 [19, 23, 2, 880, 63, 32, 51, 640],
 [28, 311],
 [28, 311, 641],
 [28, 311, 641, 101],
 [28, 311, 641, 101, 197],
 [28, 311, 641, 101, 197, 28],
 [28, 311, 641, 101, 197, 28, 276],
 [19, 11],
 [19, 11, 881],
 [19, 11, 881, 3],
 [19, 11, 881, 3, 64],
 [19, 11, 881, 3, 64, 86],
 [19, 11, 881, 3, 64, 86, 217],
 [19, 11, 881, 3, 64, 86, 217, 52],
 [3533, 10],
 [3533, 10, 3534],
 [3533, 10, 3534, 642],
 [3533, 10, 3534, 642, 12],
 [3533, 10, 3534, 642, 12, 34],
 [3533, 10, 3534, 642, 12, 34, 3535],
 [3533, 10, 3534, 642, 12, 34, 3535, 3536],
 [198, 17],
 [198, 17, 3537],
 [198, 

In [37]:
maxlen = max([len(sent) for sent in text])

In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = maxlen, padding='pre')

In [39]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,    2, 3530],
       [   0,    0,    0, ...,    0,   32, 3531],
       [   0,    0,    0, ...,   32, 3531, 3532],
       ...,
       [   0,    0,    0, ...,  488, 5358,   16],
       [   0,    0,    0, ..., 5358,   16,   15],
       [   0,    0,    0, ...,    0,    2,  202]])

In [43]:
X = padded_input_sequences[:,:-1]

In [44]:
y = padded_input_sequences[:,-1]

In [47]:
X.shape,y.shape

((15517, 58), (15517,))

In [51]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=len(tokenizer.word_index)+1)

In [53]:
y.shape

(15517, 5359)

In [54]:
from tensorflow.keras.models import Sequential

In [55]:
from tensorflow.keras.layers import Dense,LSTM,Embedding

In [64]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index),100,input_length=maxlen - 1))
model.add(LSTM(150))
model.add(Dense(len(tokenizer.word_index)+1,activation='softmax'))

In [65]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [66]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 58, 100)           535800    
                                                                 
 lstm_2 (LSTM)               (None, 150)               150600    
                                                                 
 dense_2 (Dense)             (None, 5359)              809209    
                                                                 
Total params: 1,495,609
Trainable params: 1,495,609
Non-trainable params: 0
_________________________________________________________________


In [68]:
model.fit(X,y,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
 72/485 [===>..........................] - ETA: 5s - loss: 0.4722 - accuracy: 0.8702

KeyboardInterrupt: 

In [107]:
import numpy as np
new_text = "William Shakespeare that for the world and I are dead might prove thee"
token_text = tokenizer.texts_to_sequences([new_text])[0]
  # padding
padded_token_text = pad_sequences([token_text], maxlen=maxlen-1, padding='pre')
  # predict
pos = np.argmax(model.predict(padded_token_text))



In [108]:
for word,index in tokenizer.word_index.items():
    if index == pos:
      print(word)
      

groan
