In [59]:
# Download the book text from gutenberg.org and save it as book.txt
!wget https://www.gutenberg.org/files/1661/1661-0.txt -O book.txt

--2023-05-05 07:41:55--  https://www.gutenberg.org/files/1661/1661-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607430 (593K) [text/plain]
Saving to: ‘book.txt’


2023-05-05 07:41:55 (2.77 MB/s) - ‘book.txt’ saved [607430/607430]



In [60]:
# Read the whole book into one lowercase string.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving the open handle for the garbage collector.
with open('book.txt', 'r', encoding='utf-8') as book_file:
  text = book_file.read().lower()

In [61]:
# Split the text on newlines: each element of `sentences` is one physical
# line of the file (not a grammatical sentence); blank lines become ''.
sentences = text.split('\n')

In [62]:
#import dependencies to preprocess the text data and making sequences
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [63]:
# Initialize the tokenizer. char_level is not set, so it tokenizes by word
# (not character by character); '<UNK>' is registered as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='<UNK>')

In [64]:
# Build the tokenizer's vocabulary (tokenizer.word_index) from the lines of the book
tokenizer.fit_on_texts(sentences)

In [65]:
# +1 leaves room for index 0, which pad_sequences uses as the padding value
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8915

In [66]:
# Convert every line into a list of integer token indices
sequences = tokenizer.texts_to_sequences(sentences)

In [67]:
# Expand every tokenized line into all of its prefixes of length >= 2.
# Lines with fewer than two tokens contribute nothing.
input_sequences = [
  tokens[:end]
  for tokens in sequences
  for end in range(2, len(tokens) + 1)
]

In [68]:
# Sanity check: the first four prefixes grow by one token each
print(input_sequences[0], input_sequences[1], input_sequences[2], input_sequences[3])

[4775, 145] [4775, 145, 132] [4775, 145, 132, 886] [4775, 145, 132, 886, 5]


In [69]:
# Length of the longest prefix sequence; later used as the padded width
max_seq_len = max(map(len, input_sequences))
max_seq_len

20

In [70]:
# We keep the last value of each sequence as the target label and all values before it as the input to the sequence model.

In [71]:
# Pad the sequences so they all have length max_seq_len.
# Shorter sequences are filled with zeros on the left, so the last
# (target) token always sits in the final column.
padded_sequences = pad_sequences(input_sequences, maxlen = max_seq_len)

In [72]:
# Sanity check: inspect the first two padded rows
print(padded_sequences[0], padded_sequences[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 4775  145] [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 4775  145  132]


In [73]:
import numpy as np
# NOTE(review): the rows printed earlier are already in NumPy array format,
# so pad_sequences appears to return an ndarray and this conversion only
# makes a copy - confirm before removing it.
padded_sequences = np.array(padded_sequences)

In [74]:
# Sanity check: every padded row should have length max_seq_len
print(len(padded_sequences[0]))
print(len(padded_sequences[1]))

20
20


In [75]:
# Split each padded row into model input (every column but the last)
# and integer target label (the last column)
x, labels = padded_sequences[:, :-1], padded_sequences[:, -1]

In [76]:
# One integer label per training example
labels.shape

(101575,)

In [77]:
# One-hot encode the integer labels into vectors of length vocab_size.
# NOTE(review): this materialises a dense (num_examples, vocab_size) matrix,
# roughly 3.6 GB for the (101575, 8915) shape shown below if the dtype is
# float32 (confirm). Keeping the integer labels and training with
# sparse_categorical_crossentropy would avoid it.
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [78]:
# (number of examples, max_seq_len - 1): the label column has been removed
x.shape

(101575, 19)

In [79]:
#x = x.reshape(x.shape[0], x.shape[1], 1)

In [80]:
# (number of examples, vocab_size): one one-hot row per example
y.shape

(101575, 8915)

In [81]:
#import dependencies for defining the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [82]:
# Next-word predictor: token embeddings -> bidirectional LSTM -> softmax over
# the whole vocabulary. The input length is max_seq_len-1 because the last
# token of every padded row is held out as the label.
model = Sequential([
  Embedding(vocab_size, 100, input_length=max_seq_len-1),
  Bidirectional(LSTM(256)),
  Dense(vocab_size, activation='softmax'),
])

# Compile with Adam and categorical cross-entropy (the targets are one-hot)
adam = Adam(learning_rate=0.01)
model.compile(
  loss='categorical_crossentropy',
  optimizer=adam,
  metrics=['acc'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 19, 100)           891500    
                                                                 
 bidirectional_1 (Bidirectio  (None, 512)              731136    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 8915)              4573395   
                                                                 
Total params: 6,196,031
Trainable params: 6,196,031
Non-trainable params: 0
_________________________________________________________________


In [83]:
# Early-stopping callback: an epoch only counts as an improvement if the
# training accuracy ('acc') rises by at least 0.01 (absolute, i.e. one
# percentage point). The metric is training accuracy, not validation
# accuracy, and no patience is configured.
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor = 'acc', min_delta=0.01)

In [84]:
# Train for up to 50 epochs in batches of 512; the early-stopping callback
# can end training sooner (the log below stops at epoch 16)
model.fit(x, y, epochs=50, verbose=1, batch_size=512, callbacks=[es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


<keras.callbacks.History at 0x7f63200e7dc0>

In [85]:
#Time to become storyteller!
seed_text = "I could not help laughing at the ease with which he explained his process of deduction"
next_words = 100

# Build the index -> word lookup once, instead of scanning the whole
# vocabulary for every generated word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for _ in range(next_words):
  # Encode everything generated so far and fit it to the model's input width
  # (max_seq_len-1 tokens), as was done for the training inputs.
  sequence = tokenizer.texts_to_sequences([seed_text])
  padded = pad_sequences(sequence, maxlen=max_seq_len-1)
  # Greedy decoding: take the most probable next-word index as a plain int
  # (the batch has exactly one row). verbose=0 keeps predict() from printing
  # a progress bar on every iteration.
  predicted = int(np.argmax(model.predict(padded, verbose=0), axis=1)[0])
  # An index with no vocabulary entry (e.g. the padding index 0) contributes
  # an empty word, exactly as the original linear search did.
  output_word = index_to_word.get(predicted, '')
  seed_text += ' ' + output_word
print(seed_text)

I could not help laughing at the ease with which he explained his process of deduction “when no good purpose can be served with this intention to make me the self reproach and eyes ” remarked holmes when the cloth was cleared “just a drive then i travelled and saw the man of the house and i was not so much as ever as the dog to see what a week and hosmer wrote from any case before you but the king has been driven from a few years and i got to the floor and the moment struck the sign of me and i left his hands at night and has put in the coffee
