In [1]:
#download the book text from Project Gutenberg and save it to book.txt
!curl -o book.txt https://www.gutenberg.org/files/1661/1661-0.txt


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  593k  100  593k    0     0   148k      0  0:00:04  0:00:04 --:--:--  148k98k    0     0  83675      0  0:00:07  0:00:02  0:00:05 83649


In [2]:
#read the whole file into a single string; the context manager closes the file handle
#as soon as the read finishes instead of leaving it open until garbage collection
with open('book.txt', 'r', encoding='utf-8') as book_file:
  text = book_file.read()
#normalise the text to lowercase
text = text.lower()

In [3]:
#create list of "sentences": the text is split on newlines, so each entry is one line of the file
#(not a grammatical sentence)
sentences = text.split('\n')

In [4]:
#import dependencies to preprocess the text data and making sequences
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences

2024-07-30 19:08:59.727310: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

#initialize the tokenizer; char_level is not set, so it tokenizes word by word (not char by char)
#oov_token='<UNK>' is the placeholder used for words that were not seen when the tokenizer was fitted
tokenizer = Tokenizer(oov_token='<UNK>')

In [6]:
#build the vocabulary (tokenizer.word_index) from the lines of the book
tokenizer.fit_on_texts(sentences)

In [7]:
#number of classes for the model: one per entry in word_index, plus one extra slot for index 0,
#which appears later as the padding value
#NOTE(review): assumes word_index numbering starts at 1 — confirm against the Tokenizer docs
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8923

In [8]:
#convert every line of text into a list of integer word indices
sequences = tokenizer.texts_to_sequences(sentences)


In [9]:
#for every tokenized line, collect all of its prefixes that contain at least two tokens
#(e.g. [a, b, c] -> [a, b], [a, b, c]); lines with fewer than two tokens contribute nothing
input_sequences = [
  token_ids[:prefix_end]
  for token_ids in sequences
  for prefix_end in range(2, len(token_ids) + 1)
]

In [10]:
#peek at the first four n-gram sequences; each one extends the previous one by a single token
print(input_sequences[0], input_sequences[1], input_sequences[2], input_sequences[3])

[4776, 158] [4776, 158, 331] [4776, 158, 331, 886] [4776, 158, 331, 886, 5]


In [11]:
#length of the longest n-gram sequence
max_seq_len = max(map(len, input_sequences))
max_seq_len

20

In [12]:
#the last token of each sequence is used as the target label, and all tokens before it are the input to the sequence model

In [13]:
#pad the sequences to ensure that they are all of the same length (max_seq_len)
#the zeros are added at the front (see the printed rows below), so the last real token stays in the final column
padded_sequences = pad_sequences(input_sequences, maxlen = max_seq_len)

In [14]:
#peek at the first two padded rows
print(padded_sequences[0], padded_sequences[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 4776  158] [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 4776  158  331]


In [15]:
import numpy as np
#make sure the data is a numpy array so it can be sliced in two dimensions
#NOTE(review): the rows printed earlier already look like numpy arrays, so this conversion may be redundant — confirm
padded_sequences = np.array(padded_sequences)

In [16]:
#sanity check: the first two rows should have the same length after padding
print(len(padded_sequences[0]))
print(len(padded_sequences[1]))

20
20


In [17]:
#split every padded row into the target (its final column) and the model input (all columns before it)
labels = padded_sequences[:, -1]
x = padded_sequences[:, :-1]

In [18]:
#one integer label per training sequence
labels.shape

(101523,)

In [19]:
#one-hot encode the labels: each label becomes a vector of length vocab_size
#NOTE(review): this materialises a dense (num_samples, vocab_size) matrix, which is large for this vocabulary;
#training on the integer `labels` with a sparse categorical cross-entropy loss would avoid it — consider switching
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [20]:
#(number of samples, max_seq_len - 1 input tokens per sample)
x.shape

(101523, 19)

In [21]:
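#no reshape to 3-D is needed here: x is passed to the model as a 2-D (samples, tokens) array of word indices
#x = x.reshape(x.shape[0], x.shape[1], 1)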

In [22]:
#(number of samples, vocab_size) after one-hot encoding
y.shape

(101523, 8923)

In [23]:
#import dependencies for defining the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [30]:
#define and compile the model:
#  Embedding     -> maps every word index to a 100-dimensional vector
#  Bidirectional -> an LSTM with 256 units wrapped so the sequence is processed in both directions
#  Dense         -> softmax over the whole vocabulary (one probability per candidate next word)
model = Sequential()
#the deprecated `input_length` argument is no longer passed to Embedding;
#the input shape is supplied explicitly by model.build() below
model.add(Embedding(vocab_size, 100))
model.add(Bidirectional(LSTM(256)))
model.add(Dense(vocab_size, activation='softmax'))
adam = Adam(learning_rate=0.01)
#the metric is registered under the name 'acc'; the early-stopping callback monitors that exact key
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
#each sample is a row of max_seq_len-1 word indices (the final token was split off as the label)
model.build(input_shape=(None, max_seq_len-1))
model.summary()

In [31]:
#initialize the callback for early stopping: it monitors the training accuracy ('acc') and
#only counts an epoch as an improvement if the metric changes by at least min_delta=0.01
#(an absolute difference in accuracy, not a relative 1%)
#NOTE(review): no patience is given, so presumably the first non-improving epoch ends training — confirm
#NOTE(review): 'acc' is measured on the training data; a validation metric may be a better stopping signal
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor = 'acc', min_delta=0.01)

In [32]:
#train for up to 50 epochs in batches of 512; the early-stopping callback may end training sooner
#no validation data is passed, so only training loss/accuracy are reported
model.fit(x, y, epochs=50, verbose=1, batch_size=512, callbacks=[es])

Epoch 1/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 1s/step - acc: 0.0613 - loss: 6.7053
Epoch 2/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 1s/step - acc: 0.1406 - loss: 5.3347
Epoch 3/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 1s/step - acc: 0.1773 - loss: 4.6877
Epoch 4/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 1s/step - acc: 0.2137 - loss: 4.1397
Epoch 5/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 1s/step - acc: 0.2589 - loss: 3.6374
Epoch 6/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 1s/step - acc: 0.3127 - loss: 3.2038
Epoch 7/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 1s/step - acc: 0.3716 - loss: 2.8179
Epoch 8/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 1s/step - acc: 0.4260 - loss: 2.5029
Epoch 9/50
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24

<keras.src.callbacks.history.History at 0x13f34e250>

In [37]:
#Time to become storyteller!
#starting from seed_text, repeatedly predict the most likely next word and append it to the text
seed_text = "I could not help laughing at the ease with which he explained his process of deduction"
next_words = 100

#build the index -> word lookup once, instead of scanning tokenizer.word_index for every generated word
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for _ in range(next_words):
  #encode the text generated so far and bring it to the input length the model was built with
  sequence = tokenizer.texts_to_sequences([seed_text])
  padded = pad_sequences(sequence, maxlen=max_seq_len-1)
  predicted_prob = model.predict(padded, verbose=0)
  #greedy decoding: pick the class with the highest predicted probability
  predicted_class = int(np.argmax(predicted_prob, axis=-1)[0])

  #an index that has no word (e.g. the padding index 0) yields an empty string, as before
  output_word = index_to_word.get(predicted_class, '')
  seed_text += ' ' + output_word
print(seed_text)

I could not help laughing at the ease with which he explained his process of deduction while i continue to see a crust of metallic deposit all over the tail of the morning and the “g” itself was locked in a one and a bright young man had been driven to the floor in his belief that i could not have been on to my station by end and it is unlikely that the you is a fait to say that the reason why he hears that i have ever seen in my laughter whenever he was kind to do it i am not a very foolish thing out ” he remarked “i think that it
