In [2]:
import tensorflow as tf

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
text = ""
with open('shakespeare.txt','r') as file:
    text = file.read()

In [5]:
text = text.split('\n')

In [6]:
x = lambda a : a.strip() != ''

In [7]:
list(filter(x,text))

['THE SONNETS',
 'by William Shakespeare',
 'From fairest creatures we desire increase,',
 "That thereby beauty's rose might never die,",
 'But as the riper should by time decease,',
 'His tender heir might bear his memory:',
 'But thou contracted to thine own bright eyes,',
 "Feed'st thy light's flame with self-substantial fuel,",
 'Making a famine where abundance lies,',
 'Thy self thy foe, to thy sweet self too cruel:',
 "Thou that art now the world's fresh ornament,",
 'And only herald to the gaudy spring,',
 'Within thine own bud buriest thy content,',
 "And tender churl mak'st waste in niggarding:",
 'Pity the world, or else this glutton be,',
 "To eat the world's due, by the grave and thee.",
 'When forty winters shall besiege thy brow,',
 "And dig deep trenches in thy beauty's field,",
 "Thy youth's proud livery so gazed on now,",
 'Will be a tattered weed of small worth held:  ',
 'Then being asked, where all thy beauty lies,',
 'Where all the treasure of thy lusty days;',
 'T

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
tokenizer = Tokenizer()

In [10]:
tokenizer.fit_on_texts(text)

In [11]:
len(tokenizer.word_index)

3200

In [12]:
input_sequences = []
for sentence in text:
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [13]:
input_sequences

[[2, 1372],
 [31, 1373],
 [31, 1373, 1374],
 [35, 412],
 [35, 412, 874],
 [35, 412, 874, 162],
 [35, 412, 874, 162, 214],
 [35, 412, 874, 162, 214, 507],
 [8, 875],
 [8, 875, 132],
 [8, 875, 132, 348],
 [8, 875, 132, 348, 100],
 [8, 875, 132, 348, 100, 154],
 [8, 875, 132, 348, 100, 154, 194],
 [18, 22],
 [18, 22, 2],
 [18, 22, 2, 876],
 [18, 22, 2, 876, 62],
 [18, 22, 2, 876, 62, 31],
 [18, 22, 2, 876, 62, 31, 50],
 [18, 22, 2, 876, 62, 31, 50, 638],
 [27, 309],
 [27, 309, 639],
 [27, 309, 639, 100],
 [27, 309, 639, 100, 195],
 [27, 309, 639, 100, 195, 27],
 [27, 309, 639, 100, 195, 27, 274],
 [18, 10],
 [18, 10, 877],
 [18, 10, 877, 3],
 [18, 10, 877, 3, 63],
 [18, 10, 877, 3, 63, 85],
 [18, 10, 877, 3, 63, 85, 215],
 [18, 10, 877, 3, 63, 85, 215, 51],
 [1375, 9],
 [1375, 9, 1376],
 [1375, 9, 1376, 640],
 [1375, 9, 1376, 640, 11],
 [1375, 9, 1376, 640, 11, 33],
 [1375, 9, 1376, 640, 11, 33, 1377],
 [1375, 9, 1376, 640, 11, 33, 1377, 1378],
 [196, 16],
 [196, 16, 1379],
 [196, 16, 137

In [14]:
maxlen = max([len(sent) for sent in text])

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = maxlen, padding='pre')

In [16]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,    2, 1372],
       [   0,    0,    0, ...,    0,   31, 1373],
       [   0,    0,    0, ...,   31, 1373, 1374],
       ...,
       [   0,    0,    0, ...,  486, 3200,   15],
       [   0,    0,    0, ..., 3200,   15,   14],
       [   0,    0,    0, ...,    0,    2,  200]])

In [17]:
X = padded_input_sequences[:,:-1]

In [18]:
y = padded_input_sequences[:,-1]

In [19]:
X.shape,y.shape

((15517, 58), (15517,))

In [20]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=len(tokenizer.word_index)+1)

In [21]:
y.shape

(15517, 3201)

In [22]:
from tensorflow.keras.models import Sequential

In [23]:
from tensorflow.keras.layers import Dense,LSTM,Embedding

In [24]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index),100,input_length=maxlen - 1))
model.add(LSTM(150))
model.add(Dense(len(tokenizer.word_index)+1,activation='softmax'))

In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 58, 100)           320000    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 3201)              483351    
                                                                 
Total params: 953,951
Trainable params: 953,951
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X,y,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

In [None]:
import numpy as np
new_text = "William Shakespeare that for the world and I are dead might prove thee"
token_text = tokenizer.texts_to_sequences([new_text])[0]
  # padding
padded_token_text = pad_sequences([token_text], maxlen=maxlen-1, padding='pre')
  # predict
pos = np.argmax(model.predict(padded_token_text))