In [88]:
import requests 
import numpy as np
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam

In [98]:
# Access the text from the URL
url = 'https://www.gutenberg.org/cache/epub/11/pg11.txt'
response = requests.get(url)
text = response.text

# Preprocess function to clean the text
def preprocess_text(text):
    # Remove all non-word characters and digits, except for spaces
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    cleaned_text = re.sub(r'\W', ' ', cleaned_text)
    cleaned_text = re.sub(r'\d', '', cleaned_text)
    cleaned_text = cleaned_text.lower()
    # Find start and end slicing points to remove unnecessary parts
    start_idx = text.find('*** START') + len('*** START')
    end_idx = text.find('*** END')
    # Slice the text to remove parts before '*** START' and after '*** END'
    text = text[start_idx:end_idx]
    # Split text into sentences as the final objective is to create a sentence
    corpus = re.split(r'[.!?\n]', text)
    return corpus

# Get the preprocessed corpus
corpus = preprocess_text(text)

# Print the first 200 characters of the corpus
print(' '.join(corpus[:200]))

# Create the vocabulary using Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
print("Total words in the vocabulary:", total_words)


 There were doors all round the hall, but they were all locked; and when
Total words in the vocabulary: 3521


In [99]:
print(corpus)



In [97]:
def preprocess_text(text):
    # Remove all non-word characters and digits, except for spaces
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    cleaned_text = re.sub(r'\W', ' ', cleaned_text)
    cleaned_text = re.sub(r'\d', '', cleaned_text)
    cleaned_text = cleaned_text.lower()
    # Find start and end slicing points to remove unnecessary parts
    start_idx = text.find('*** START') + 4
    end_idx = text.find('*** END')
    # Slice the text to remove parts before '*** START' and after '*** END'
    text = text[start_idx:end_idx]
    # Split text into sentences or words depending on the final objective
    corpus = text.split()  # Split by words; use text.split('. ') for sentences if needed
    return corpus

url = 'https://www.gutenberg.org/cache/epub/11/pg11.txt'
response = requests.get(url)
text = response.text

corpus = preprocess_text(text)

print(' '.join(corpus[:200]))


START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND *** [Illustration] Alice’s Adventures in Wonderland by Lewis Carroll THE MILLENNIUM FULCRUM EDITION 3.0 Contents CHAPTER I. Down the Rabbit-Hole CHAPTER II. The Pool of Tears CHAPTER III. A Caucus-Race and a Long Tale CHAPTER IV. The Rabbit Sends in a Little Bill CHAPTER V. Advice from a Caterpillar CHAPTER VI. Pig and Pepper CHAPTER VII. A Mad Tea-Party CHAPTER VIII. The Queen’s Croquet-Ground CHAPTER IX. The Mock Turtle’s Story CHAPTER X. The Lobster Quadrille CHAPTER XI. Who Stole the Tarts? CHAPTER XII. Alice’s Evidence CHAPTER I. Down the Rabbit-Hole Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, “and what is the use of a book,” thought Alice “without pictures or conversations?” So she was considering in her own mind (as well as she could, for

In [101]:
corpus = [element for element in corpus if element.strip()]
corpus = [element.strip() for element in corpus]

In [102]:
corpus

["OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***",
 '[Illustration]',
 'Alice’s Adventures in Wonderland',
 'by Lewis Carroll',
 'THE MILLENNIUM FULCRUM EDITION 3',
 '0',
 'Contents',
 'CHAPTER I',
 'Down the Rabbit-Hole',
 'CHAPTER II',
 'The Pool of Tears',
 'CHAPTER III',
 'A Caucus-Race and a Long Tale',
 'CHAPTER IV',
 'The Rabbit Sends in a Little Bill',
 'CHAPTER V',
 'Advice from a Caterpillar',
 'CHAPTER VI',
 'Pig and Pepper',
 'CHAPTER VII',
 'A Mad Tea-Party',
 'CHAPTER VIII',
 'The Queen’s Croquet-Ground',
 'CHAPTER IX',
 'The Mock Turtle’s Story',
 'CHAPTER X',
 'The Lobster Quadrille',
 'CHAPTER XI',
 'Who Stole the Tarts',
 'CHAPTER XII',
 'Alice’s Evidence',
 'CHAPTER I',
 'Down the Rabbit-Hole',
 'Alice was beginning to get very tired of sitting by her sister on the',
 'bank, and of having nothing to do: once or twice she had peeped into',
 'the book her sister was reading, but it had no pictures or',
 'conversations in it, “and what is the use of

In [103]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
print("Total words in the vocabulary:", total_words)

Total words in the vocabulary: 3067


In [104]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Determine the maximum sequence length for padding
max_sequence_len = max([len(x) for x in input_sequences])

# Pad sequences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and label 
X, labels = input_sequences[:,:-1],input_sequences[:,-1]
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)


In [105]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len-1),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.LSTM(100),
#     tf.keras.layers.Dense(int(total_words/2), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
#     tf.keras.layers.Dense(total_words, activation='softmax')
# ])

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()


In [106]:
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(LSTM(100, return_sequences=False))
model.add(Dense(total_words, activation='softmax'))
model.summary()

In [109]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history_model = model.fit(X, y, epochs=50, verbose=1, callbacks=[early_stopping])

Epoch 1/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.0578 - loss: 6.5261
Epoch 2/50
[1m 20/754[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 9ms/step - accuracy: 0.0740 - loss: 5.7271

  current = self.get_monitor_value(logs)


[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0823 - loss: 5.7985
Epoch 3/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.1034 - loss: 5.4449
Epoch 4/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1275 - loss: 5.1148
Epoch 5/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1552 - loss: 4.8470
Epoch 6/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1737 - loss: 4.6186
Epoch 7/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1856 - loss: 4.4287
Epoch 8/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.2007 - loss: 4.2635
Epoch 9/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.2161 - loss: 4.0601
Epoch 10/50
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━

In [110]:
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list, verbose=0)
        predicted = np.argmax(predictions, axis=-1)[0]
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [119]:
generate_text("Alice was very tired", 5, model, tokenizer, max_sequence_len)

'Alice was very tired and had to kneel down'

In [116]:
generate_text("Start of the project", 1, model, tokenizer, max_sequence_len)

'Start of the project gutenberg'

'Alice was just beginning to look down again'

In [121]:
generate_text("So she set the little creature", 3, model, tokenizer, max_sequence_len)

'So she set the little creature down and felt'