# Copyright

<PRE>
Copyright (c) Bálint Gyires-Tóth - All Rights Reserved
You may use and modify this code for research and development purposes.
Using this code for educational purposes (self-paced or instructor led) without the permission of the author is prohibited.
</PRE>

# Assignment: RNN text generation with your favorite book


In [1]:
import requests
import string
import re
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from collections import Counter
from nltk.tokenize import sent_tokenize

## 1. Dataset
- Download your favorite book from https://www.gutenberg.org/
- Combine all of the book's text into a single text source.  
- Split into training (80%) and validation (20%).  

In [2]:
# Download the book from Project Gutenberg. The header/footer markers below
# identify it as "Alice's Adventures in Wonderland".
url = "https://www.gutenberg.org/cache/epub/28885/pg28885.txt"
# A timeout keeps the notebook from hanging indefinitely on a stalled
# connection; raise_for_status() surfaces HTTP errors here instead of as a
# misleading "marker not found" error further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Remove Project Gutenberg header and footer: keep only the text between the
# START and END markers.
start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***"

if start_marker in text and end_marker in text:
    content = text.split(start_marker)[1].split(end_marker)[0].strip()
else:
    raise ValueError("Start or end marker not found.")

## 2. Preprocessing
- Convert text to lowercase.  
- Remove punctuation (except basic sentence delimiters).  
- Tokenize by words or characters (your choice).  
- Build a vocabulary (map each unique word to an integer ID).

In [3]:
# Clean text: lowercase and strip all punctuation except the sentence
# delimiters '.', '!' and '?'.
punct_to_remove = ''.join(ch for ch in string.punctuation if ch not in '.!?')
table = str.maketrans('', '', punct_to_remove)
cleaned_text = content.lower().translate(table)

# Word-level tokenization on whitespace. Sentence delimiters stay attached to
# the word they follow (e.g. "it!" is a different token from "it").
tokens = cleaned_text.split()
total_words = len(tokens)

# Chronological 80/20 split: the first 80% of tokens train, the rest validate.
split_index = int(0.8 * total_words)
train_tokens = tokens[:split_index]
val_tokens = tokens[split_index:]

# Build the vocabulary from the training split only, so the validation split
# does not leak into it. Words seen fewer than `min_word_freq` times map to <unk>.
word_counts = Counter(train_tokens)
min_word_freq = 5
# ID 0 is reserved for padding: pad_sequences pads with 0 by default and the
# model's Embedding(mask_zero=True) masks ID 0. Giving <unk> its own ID keeps
# unknown words from being silently masked as if they were padding.
vocab = ["<pad>", "<unk>"] + [word for word, count in word_counts.items() if count >= min_word_freq]
vocab_size = len(vocab)

word_to_id = {word: idx for idx, word in enumerate(vocab)}
id_to_word = {idx: word for word, idx in word_to_id.items()}
unk_id = word_to_id["<unk>"]

# Encode both splits as integer IDs; out-of-vocabulary words become unk_id.
train_ids = [word_to_id.get(word, unk_id) for word in train_tokens]
val_ids = [word_to_id.get(word, unk_id) for word in val_tokens]

## 3. Embedding Layer in Keras
Below is a minimal example of defining an `Embedding` layer:
```python
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=vocab_size,     # size of the vocabulary
    output_dim=128,           # embedding vector dimension
    input_length=sequence_length
)
```
- This layer transforms integer-encoded sequences (word IDs) into dense vector embeddings.

- Feed these embeddings into your LSTM, GRU, or 1D CNN layer.

In [4]:
# Define a standalone embedding layer mapping each word ID to a 128-dim vector.
# `input_length` is deprecated in Keras 3 (the layer infers the sequence length
# from its input), so it is no longer passed.
# Note: the Sequential model defined later constructs its own Embedding layer.
embedding_layer = Embedding(
    input_dim=vocab_size,  # one row per vocabulary entry
    output_dim=128,        # embedding vector dimension
)



## 4. Model & 5. Training and Evaluation
- Implement an LSTM or GRU or 1D CNN-based language model with:
  - **The Embedding layer** as input.
  - At least **one recurrent layer** (e.g., `LSTM(256)` or `GRU(256)` or your custom 1D CNN).
  - A **Dense** output layer with **softmax** activation for word prediction.
- Train for about **5–10 epochs** so it can finish in approximately **2 hours** on a standard machine.
- **Monitor** the loss on both training and validation sets.
- **Perplexity**: a common metric for language models.
  - It is the exponent of the average negative log-likelihood.
  - If your model outputs cross-entropy loss `H`, then `perplexity = e^H`.
  - Try to keep the validation perplexity **under 50** if possible.


In [5]:
class PerplexityCallback(tf.keras.callbacks.Callback):
    """Print loss and perplexity (e^loss) for both splits after every epoch."""

    @staticmethod
    def _format_metrics(loss):
        """Return a 'Loss: ... | Perplexity: ...' fragment for one split.

        A missing loss (None) is reported as 'n/a' instead of crashing the
        float format spec, and a loss of exactly 0.0 correctly yields a
        perplexity of 1.0 rather than being treated as missing.
        """
        if loss is None:
            return "Loss: n/a | Perplexity: n/a"
        try:
            perplexity = math.exp(loss)
        except OverflowError:
            # math.exp overflows for very large losses (diverged training).
            perplexity = float("inf")
        return f"Loss: {loss:.4f} | Perplexity: {perplexity:.2f}"

    def on_epoch_end(self, epoch, logs=None):
        """Report training/validation loss and perplexity.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metric dict for the epoch; reads 'loss' and 'val_loss'.
        """
        logs = logs or {}

        # Keras passes a 0-based index but prints "Epoch 1/N" in its progress
        # bar; add 1 so both lines refer to the same epoch number.
        print(f"\n Epoch {epoch + 1}:")
        print(f"   - Training   {self._format_metrics(logs.get('loss'))}")
        print(f"   - Validation {self._format_metrics(logs.get('val_loss'))}")

In [6]:
# Number of preceding tokens used as the context window for each next-token sample.
sequence_length = 10

def create_sequences(ids, seq_len):
    """Build sliding-window (context, next-token) training pairs.

    For every position ``i >= seq_len`` the sample is
    ``X = ids[i - seq_len:i]`` and ``y = ids[i]``.

    Args:
        ids: 1-D sequence of integer token IDs.
        seq_len: Number of context tokens per sample; must be >= 1.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n, seq_len)`` and ``y`` has
        shape ``(n,)`` with ``n = max(len(ids) - seq_len, 0)``. When the input
        is too short to form a single sample, correctly shaped empty integer
        arrays are returned (instead of a shapeless float array).

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer.")

    ids_arr = np.asarray(ids)
    n_samples = len(ids_arr) - seq_len
    if n_samples <= 0:
        return (
            np.empty((0, seq_len), dtype=np.int64),
            np.empty((0,), dtype=np.int64),
        )

    # sliding_window_view yields len - seq_len + 1 windows; the final window
    # has no following token to predict, so it is dropped.
    windows = np.lib.stride_tricks.sliding_window_view(ids_arr, seq_len)[:n_samples]
    # Copy so callers get independent, writable arrays rather than views.
    return windows.copy(), ids_arr[seq_len:].copy()

# Turn each encoded split into (context window, next-token) arrays, training
# split first and validation split second.
(X_train, y_train), (X_val, y_val) = (
    create_sequences(split_ids, sequence_length)
    for split_ids in (train_ids, val_ids)
)

In [7]:
# Word-level language model: embedding -> two stacked LSTMs (with dropout)
# -> softmax over the vocabulary for next-word prediction.
model = Sequential([
    # `input_length` is deprecated in Keras 3 and has been dropped; the
    # sequence length is inferred from the input on first call.
    # NOTE(review): mask_zero=True makes the layer treat token ID 0 as
    # padding -- confirm the vocabulary does not assign ID 0 to a real token.
    Embedding(vocab_size, 128, mask_zero=True),
    LSTM(256, return_sequences=True),  # pass the full sequence to the next LSTM
    Dropout(0.2),
    LSTM(256),                         # final hidden state only
    Dropout(0.2),
    Dense(vocab_size, activation='softmax')
])

# Targets are integer word IDs, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamW(learning_rate=5e-5, weight_decay=1e-6),
    metrics=['accuracy']
)

callbacks = [
    # Stop when validation loss stalls for 3 epochs and roll back to the best weights.
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    # Halve the learning rate after 2 epochs without validation improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2),
    PerplexityCallback()
]

# Keep the History object so per-epoch metrics can be inspected afterwards
# (and so the notebook does not echo its repr as the cell output).
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=10,
    callbacks=callbacks
)

Epoch 1/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.1559 - loss: 6.0153
 Epoch 0:
   - Training   Loss: 5.5115 | Perplexity: 247.53
   - Validation Loss: 4.6357 | Perplexity: 103.10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 154ms/step - accuracy: 0.1560 - loss: 6.0139 - val_accuracy: 0.2321 - val_loss: 4.6357 - learning_rate: 5.0000e-05
Epoch 2/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.1815 - loss: 4.8944
 Epoch 1:
   - Training   Loss: 4.8863 | Perplexity: 132.46
   - Validation Loss: 4.5738 | Perplexity: 96.91
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 152ms/step - accuracy: 0.1815 - loss: 4.8944 - val_accuracy: 0.2321 - val_loss: 4.5738 - learning_rate: 5.0000e-05
Epoch 3/10
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.1819 - loss: 4.8494
 Epoch 2:
   - Training   Loss: 4.8599 | Perplexity

<keras.src.callbacks.history.History at 0x7e48b2d14f50>

## 6. Generation Criteria
- After training, generate **two distinct text samples**, each at least **50 tokens**.
- Use **different seed phrases** (e.g., “love is” vs. “time will”).

In [8]:
def generate_text(seed_text, num_tokens=50, temperature=1.0):
    """Generate text by repeatedly sampling the next word from the model.

    Args:
        seed_text: Starting phrase; it is lowercased and split on whitespace.
            Words missing from the vocabulary are encoded as ``unk_id``.
        num_tokens: Number of words to append to the seed.
        temperature: Sampling temperature. 1.0 samples from the model's
            distribution unchanged; values below 1.0 sharpen it and values
            above 1.0 flatten it. Must be positive.

    Returns:
        The seed words followed by ``num_tokens`` generated words, joined by
        single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive.")

    result = seed_text.lower().split()
    for _ in range(num_tokens):
        # Encode the most recent `sequence_length` words and left-pad with 0
        # up to the fixed window size the model was trained on.
        encoded = [word_to_id.get(word, unk_id) for word in result[-sequence_length:]]
        padded = tf.keras.preprocessing.sequence.pad_sequences([encoded], maxlen=sequence_length)

        # The softmax output is float32 and may not sum to 1 within the
        # tolerance np.random.choice enforces; renormalize in float64.
        pred_probs = np.asarray(model.predict(padded, verbose=0)[0], dtype=np.float64)
        if temperature != 1.0:
            # Small epsilon avoids log(0) for words with zero probability.
            pred_probs = np.exp(np.log(pred_probs + 1e-12) / temperature)
        pred_probs /= pred_probs.sum()

        # Cast to a plain int so the dictionary lookup does not depend on
        # NumPy scalar hashing.
        next_id = int(np.random.choice(len(pred_probs), p=pred_probs))
        result.append(id_to_word.get(next_id, "<unk>"))
    return ' '.join(result)

In [11]:
# First sample: generate 50 tokens from the first seed phrase and show them.
seed1 = "she said"
sample1 = generate_text(seed1, num_tokens=50)
print("Sample 1:", sample1, sep="\n")

# Second sample: a different seed phrase, separated by a blank line.
seed2 = "then they"
sample2 = generate_text(seed2, num_tokens=50)
print("\nSample 2:", sample2, sep="\n")

Sample 1:
she said it! <unk> indeed the theres he its be poor was to <unk> believe window to seen come <unk> of <unk> <unk> whether see <unk> to his <unk> <unk> only father i <unk> a <unk> either make and the she eyes <unk> a do one far minute her tale <unk> the

Sample 2:
then they though world with though <unk> asking an cat to <unk> in do and that <unk> took if down found say <unk> cried growing that king <unk> alice and my of <unk> said more <unk> a had you so it do herself mad. in that to and rather duchess the <unk>
