# Text Generation with LSTM on *The Mysterious Island*

This notebook demonstrates how to train a character-level LSTM model that generates text in the style of the novel *The Mysterious Island* (Project Gutenberg).  
We cover data loading, preprocessing, model definition, training, and text generation.

---

## 1. Setup & Imports


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import time
from torch.distributions import Categorical
import matplotlib.pyplot as plt

---

## 2. Download and Preprocess Dataset

We download the text, extract the main content, and create character-level encodings for model input.


In [None]:
# Constants
SEED = 10  # seed applied to both the torch and NumPy random number generators below
SEQUENCE_LENGTH = 40  # number of characters in each model input sequence
BATCH_SIZE = 64  # batch size used when the DataLoaders are built

# Fix random seed for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)

# Download text
# NOTE: the leading "!" is an IPython shell escape; this line only works when
# the cell is run inside a notebook / IPython session with wget available.
!wget -q https://www.gutenberg.org/files/1268/1268-0.txt

with open("1268-0.txt", 'r', encoding='utf8') as fp:
    text = fp.read()

# Extract main content between known markers
# NOTE(review): str.find returns the FIRST occurrence and -1 when the marker is
# missing. A missing marker is not detected here and would silently produce a
# wrong slice -- confirm both markers exist in the downloaded file.
start_index = text.find("THE MYSTERIOUS ISLAND")
end_index = text.find("END OF THE PROJECT GUTENBERG EBOOK 1268")
text = text[start_index:end_index]

# Create character mappings
char_set = set(text)
char_sorted = sorted(char_set)  # sorted so that the index assignment is deterministic
char2int = {ch: i for i, ch in enumerate(char_sorted)}  # character -> integer id
char_array = np.array(char_sorted)  # integer id -> character (inverse of char2int)

# Encode entire text to integer sequence
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

print(f"Total characters in text: {len(text)}")
print(f"Unique characters (vocabulary size): {len(char2int)}")


---

## 3. Create Dataset and DataLoaders

Split the encoded text into sequences and create PyTorch Dataset and DataLoader objects for training and validation.


In [None]:
class TextDataset(Dataset):
    """Map-style dataset over pre-cut windows of encoded text.

    Each stored window yields an (input, target) pair for next-character
    prediction: the input is the window without its last element and the
    target is the window without its first element, both as ``torch.long``
    tensors.
    """

    def __init__(self, text_chunks):
        # Indexable collection of integer sequences (one per sample).
        self.text_chunks = text_chunks

    def __len__(self):
        # One sample per stored window.
        return len(self.text_chunks)

    def __getitem__(self, index):
        window = self.text_chunks[index]
        # Target is the input shifted one position to the left.
        inputs = torch.tensor(window[:-1], dtype=torch.long)
        targets = torch.tensor(window[1:], dtype=torch.long)
        return inputs, targets

# Each window holds SEQUENCE_LENGTH inputs plus one extra character, so that
# TextDataset can derive the shifted-by-one target from the same window.
chunk_size = SEQUENCE_LENGTH + 1
# Sliding windows with stride 1: one window starts at every position that still
# leaves room for a full chunk_size slice.
text_chunks = [text_encoded[i:chunk_size + i] for i in range(len(text_encoded) - chunk_size + 1)]

dataset = TextDataset(text_chunks)

# 90% / 10% train / validation split over windows.
train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size
# NOTE(review): windows are assigned to the two subsets at random, and
# neighbouring windows overlap in all but one character, so validation windows
# share almost all of their text with training windows. Validation loss is
# therefore an optimistic estimate; a contiguous split of text_encoded before
# windowing would avoid this.
train_ds, valid_ds = torch.utils.data.random_split(dataset, [train_size, valid_size])

# drop_last=True discards a final incomplete batch so every batch has BATCH_SIZE rows.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE, drop_last=True)

print(f"Training batches: {len(train_dl)}")
print(f"Validation batches: {len(valid_dl)}")


---

## 4. Define the LSTM Model

We define a character-level LSTM model with an embedding layer, a single LSTM layer, and a fully connected output layer.


In [None]:
class RNNCharGenModel(nn.Module):
    """Single-step character model: embedding -> LSTM -> linear projection.

    ``forward`` consumes one token id per batch row and returns the logits for
    the next token together with the updated LSTM state, so callers drive the
    time loop themselves.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance the model by one time step.

        Returns ``(logits, hidden, cell)`` where ``logits`` has one row per
        batch element.
        """
        # Insert a length-1 sequence axis so the batch_first LSTM sees one step.
        step_input = torch.unsqueeze(self.embedding(x), 1)
        rnn_out, state = self.rnn(step_input, (hidden, cell))
        hidden, cell = state
        batch = rnn_out.size(0)
        # Collapse the sequence axis again: one row of logits per batch element.
        logits = self.fc(rnn_out).reshape(batch, -1)
        return logits, hidden, cell

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states for ``batch_size`` rows."""
        state_shape = (1, batch_size, self.rnn_hidden_size)
        return torch.zeros(state_shape), torch.zeros(state_shape)


---

## 5. Define Training and Utility Functions

These include a gradient-norm helper, the training loop, and the text generation functions.


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def grad_norm(model):
    """Return the global L2 norm of all gradients currently stored on ``model``.

    Parameters whose ``.grad`` is ``None`` are ignored; the result is ``0.0``
    when no parameter has a gradient.
    """
    squared_sum = 0.0
    gradients = (param.grad for param in model.parameters() if param.grad is not None)
    for gradient in gradients:
        # Per-parameter L2 norm, squared so the norms combine into a global norm.
        squared_sum += gradient.data.norm(2).item() ** 2
    return squared_sum ** 0.5

def train_model(model, loss_fn, optimizer, train_dl, valid_dl, num_epochs, sequence_length, device):
    """Train a step-wise character model and periodically log validation loss.

    The model is advanced one time step at a time; the loss of all
    ``sequence_length`` steps is summed and back-propagated once per batch.

    Args:
        model: Module called as ``model(tokens, hidden, cell)`` that returns
            ``(logits, hidden, cell)`` and provides ``init_hidden(batch_size)``.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        optimizer: Optimizer over the parameters of ``model``.
        train_dl: Iterable of ``(x_batch, y_batch)`` pairs indexed as
            ``[:, step]`` for training.
        valid_dl: Same as ``train_dl`` but for validation.
        num_epochs: Number of passes over ``train_dl``.
        sequence_length: Number of time steps consumed from every batch.
        device: Device on which batches and recurrent state are placed.

    Returns:
        A list with one entry per epoch: the training loss averaged over
        batches (each batch loss is itself averaged over time steps). This is
        the same quantity that is printed as ``train_loss``.

    Raises:
        ValueError: If ``sequence_length`` is not positive.
    """
    if sequence_length <= 0:
        raise ValueError("sequence_length must be a positive integer")

    train_losses = []

    def evaluate(x_batch, y_batch, train=True):
        """Process one batch and return its loss averaged over time steps."""
        if train:
            model.train()
            optimizer.zero_grad()
        else:
            model.eval()

        # Move the whole batch once instead of once per time step.
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # The recurrent state starts from zeros for every batch, so the
        # autograd graph never grows beyond one batch.
        hidden, cell = model.init_hidden(x_batch.size(0))
        hidden, cell = hidden.to(device), cell.to(device)

        loss = 0.0
        # Gradients are only tracked while training.
        with torch.set_grad_enabled(train):
            for c in range(sequence_length):
                # The state is deliberately NOT detached between steps:
                # detaching here would stop gradients at every step and
                # prevent back-propagation through time.
                pred, hidden, cell = model(x_batch[:, c], hidden, cell)
                loss = loss + loss_fn(pred, y_batch[:, c])
        if train:
            loss.backward()
            optimizer.step()
        return loss.item() / sequence_length

    def run_epoch(loader, train):
        """Return the mean batch loss over ``loader`` (NaN if it is empty)."""
        total, batches = 0.0, 0
        for x_batch, y_batch in loader:
            total += evaluate(x_batch, y_batch, train=train)
            batches += 1
        return total / batches if batches else float("nan")

    for epoch in range(num_epochs):
        start = time.time()
        train_epoch_loss = run_epoch(train_dl, train=True)
        train_losses.append(train_epoch_loss)

        # Validate and log on every fifth epoch and on the final epoch.
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            valid_epoch_loss = run_epoch(valid_dl, train=False)
            elapsed = time.time() - start
            # Gradients still belong to the last training batch: validation
            # runs without gradient tracking and does not touch them.
            norm_grad = grad_norm(model)
            print(f'Epoch {epoch}/{num_epochs} | time_elapsed: {elapsed:.3f}s | train_loss: {train_epoch_loss:.4f} | '
                  f'valid_loss: {valid_epoch_loss:.4f} | grad_norm: {norm_grad:.4f}')
    return train_losses

def generate_text(model, text, char_length, char2int, char_array, device):
    """Extend ``text`` by ``char_length`` characters using greedy decoding.

    The prompt is first fed through the model one character at a time to
    build up the recurrent state; afterwards the highest-scoring character
    is appended and fed back in at every step.

    Args:
        model: Module called as ``model(tokens, hidden, cell)`` that returns
            ``(logits, hidden, cell)`` and provides ``init_hidden(batch_size)``.
            It is switched to evaluation mode.
        text: Prompt string; every character must be a key of ``char2int``.
        char_length: Number of characters to generate.
        char2int: Mapping from character to integer id.
        char_array: Indexable mapping from integer id back to character.
        device: Device on which tensors are placed.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: If characters are requested but ``text`` is empty, since
            there is nothing to condition the first prediction on.
        KeyError: If ``text`` contains a character missing from ``char2int``.
    """
    model.eval()
    if char_length <= 0:
        return text
    if not text:
        raise ValueError("text must contain at least one character to prime the model")

    tokens = torch.tensor([char2int[c] for c in text]).to(device)
    hidden, cell = model.init_hidden(1)
    hidden, cell = hidden.to(device), cell.to(device)

    generated = []
    # Inference only: skip autograd bookkeeping for the whole rollout.
    with torch.no_grad():
        # Prime the recurrent state with the prompt.
        for i in range(tokens.size(0)):
            out, hidden, cell = model(tokens[i:i + 1], hidden, cell)

        for step in range(char_length):
            y_pred = torch.argmax(out, dim=1).item()
            generated.append(char_array[y_pred])
            # No forward pass is needed after the final character.
            if step + 1 < char_length:
                input_ = torch.tensor([y_pred]).to(device)
                out, hidden, cell = model(input_, hidden, cell)
    return text + "".join(generated)

def random_next_text(model, text, char_length, scale_factor, char2int, char_array, device):
    """Extend ``text`` by ``char_length`` characters sampled from the model.

    The prompt is fed through the model once to build the recurrent state.
    Each new character is then drawn from a categorical distribution over
    ``logits * scale_factor`` and fed back in, so the state is carried
    forward instead of replaying the whole text for every character.

    Args:
        model: Module called as ``model(tokens, hidden, cell)`` that returns
            ``(logits, hidden, cell)`` and provides ``init_hidden(batch_size)``.
            It is switched to evaluation mode.
        text: Prompt string; every character must be a key of ``char2int``.
        char_length: Number of characters to generate.
        scale_factor: Multiplier applied to the logits before sampling. It acts
            as an inverse temperature: values above 1 sharpen the distribution
            (less random), values below 1 flatten it (more random).
        char2int: Mapping from character to integer id.
        char_array: Indexable mapping from integer id back to character
            (expected to be the inverse of ``char2int``).
        device: Device on which tensors are placed.

    Returns:
        The prompt followed by the sampled characters.

    Raises:
        ValueError: If characters are requested but ``text`` is empty, since
            there is nothing to condition the first sample on.
        KeyError: If ``text`` contains a character missing from ``char2int``.
    """
    model.eval()
    if char_length <= 0:
        return text
    if not text:
        raise ValueError("text must contain at least one character to prime the model")

    tokens = torch.tensor([char2int[c] for c in text]).to(device)
    hidden, cell = model.init_hidden(1)
    hidden, cell = hidden.to(device), cell.to(device)

    sampled = []
    # Inference only: skip autograd bookkeeping for the whole rollout.
    with torch.no_grad():
        # Prime the recurrent state with the prompt.
        for i in range(tokens.size(0)):
            out, hidden, cell = model(tokens[i:i + 1], hidden, cell)

        for step in range(char_length):
            m = Categorical(logits=out * scale_factor)
            y_pred = m.sample((1,)).item()
            sampled.append(char_array[y_pred])
            # No forward pass is needed after the final character.
            if step + 1 < char_length:
                input_ = torch.tensor([y_pred]).to(device)
                out, hidden, cell = model(input_, hidden, cell)
    return text + "".join(sampled)


---

## 6. Initialize Model and Train

Set hyperparameters, instantiate the model, loss, optimizer, and start training.


In [None]:
NUM_EPOCHS = 50  # number of passes over the training DataLoader
VOCAB_SIZE = len(char_array)  # one output class per distinct character in the text
EMBED_DIM = 256  # size of each character embedding vector
RNN_HIDDEN_SIZE = 512  # size of the LSTM hidden and cell states
LEARNING_RATE = 0.001  # Adam step size

# Build the model on the selected device before creating the optimizer so the
# optimizer references the parameters that are actually trained.
model = RNNCharGenModel(VOCAB_SIZE, EMBED_DIM, RNN_HIDDEN_SIZE).to(device)
# The model's final layer is a plain Linear projection, i.e. it outputs
# unnormalized scores, which is what CrossEntropyLoss takes as input.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# train_model prints a progress line on every fifth epoch and on the last one,
# and returns a list with one training-loss entry per epoch.
train_loss = train_model(model, loss_fn, optimizer, train_dl, valid_dl,
                         NUM_EPOCHS, SEQUENCE_LENGTH, device)


---

## 7. Generate Text Samples

We generate text using greedy decoding and random sampling with different temperatures.


In [None]:
# Prompt that seeds every generated sample.
text_initiator = "The island"

print("1. Greedy (argmax) text generation:\n")
print(generate_text(model, text_initiator, 500, char2int, char_array, device))
print('#' * 80)

# random_next_text MULTIPLIES the logits by its scale_factor argument, so a
# sampling temperature T corresponds to scale_factor = 1 / T. Passing T itself
# would invert the effect (a "temperature" of 2.0 would sharpen the
# distribution instead of flattening it).
print("2. Random sampling with temperature = 1.0:\n")
print(random_next_text(model, text_initiator, 500, 1.0 / 1.0, char2int, char_array, device))
print('#' * 80)

print("3. Random sampling with temperature = 2.0:\n")
print(random_next_text(model, text_initiator, 500, 1.0 / 2.0, char2int, char_array, device))
print('#' * 80)

print("4. Random sampling with temperature = 0.5:\n")
print(random_next_text(model, text_initiator, 500, 1.0 / 0.5, char2int, char_array, device))
print('#' * 80)


---

## 8. Plot Training Loss

Visualizing the training loss across epochs to monitor convergence.


In [None]:
# train_loss is the list returned by train_model (one entry per epoch), so the
# x-axis position of each point is the zero-based epoch index.
plt.plot(train_loss)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.show()


---

## Conclusion

We successfully trained a character-level LSTM on *The Mysterious Island* text. The model learned to generate coherent text sequences mimicking the style and structure of the novel.  

The sampling temperature controls the trade-off between creativity and coherence in random text generation: lower temperatures yield more predictable text, while higher temperatures introduce more randomness.

This notebook can be extended by:

- Increasing model complexity or layers  
- Training for more epochs  
- Experimenting with word-level models  
- Adding attention mechanisms  

Feel free to explore and improve!

---

*Happy Text Generating!*
