In [3]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
import unicodedata
import os
import re

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Running the dataset script to load the dataset
dataset = load_dataset("tiny_shakespeare.py", trust_remote_code=True)
text = dataset["train"]["text"]

In [13]:
# Write the text to a temporary file (tokenizers library needs files)
with open("shakespeare.txt", "w", encoding="utf-8") as f:
    f.write("".join(text))  # Join the list into a single string

In [12]:
# Check out the data
print(dataset[0]['text'][:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


### No stemming or removing punctuation because the purpose of this model is to generate text like Shakespeare would. Stemming or removing punctuation would remove the stylistic richness of the data that makes Shakespearean text what it is... 

In [18]:
# Tokenize the data into Byte-Pair Encoding (BPE) tokens
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=["<PAD>","<BOS>", "<EOS>", "<UNK>"])
tokenizer.train(["shakespeare_clean.txt"], trainer)
tokenizer.decoder = decoders.BPEDecoder()

# Save the tokenizer model
tokenizer.save("shakespeare_BPE_tokenizer.json")


In [20]:
tokenizer = Tokenizer.from_file("shakespeare_BPE_tokenizer.json")

def encode_sequence(text, add_special = True):
    if add_special:
        text = "<BOS> " + text + " <EOS>"
    return tokenizer.encode(text).ids

In [22]:
def create_sequences(encoded_ids, seq_len=50):
    sequences = []
    for i in range(len(encoded_ids) - seq_len):
        seq = encoded_ids[i:i+seq_len]
        target = encoded_ids[i+1:i+seq_len+1]  # next-token prediction
        sequences.append((seq, target))
    return sequences

# Example
token_ids = encode_sequence(clean_text, add_special=False)
seq_64 = create_sequences(token_ids, seq_len=64)
seq_50 = create_sequences(token_ids, seq_len=50)
seq_25 = create_sequences(token_ids, seq_len=25)
seq_150 = create_sequences(token_ids, seq_len=150)
seq_200 = create_sequences(token_ids, seq_len=200)


print("Sample input token IDs:", seq_25[0][0])
print("Sample target token IDs:", seq_25[0][1])

Sample input token IDs: [385, 710, 12, 1854, 115, 2209, 442, 1793, 8, 400, 81, 368, 10, 921, 12, 1936, 8, 368, 10, 385, 710, 12, 325, 162, 140]
Sample target token IDs: [710, 12, 1854, 115, 2209, 442, 1793, 8, 400, 81, 368, 10, 921, 12, 1936, 8, 368, 10, 385, 710, 12, 325, 162, 140, 3237]


In [23]:
import torch
import torch.nn as nn

class ShakespeareRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, rnn_type='rnn', bidirectional=True):
        super(ShakespeareRNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.rnn_type = rnn_type.lower()
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.num_directions = 2 if bidirectional else 1

        # Choose RNN type
        if self.rnn_type == 'lstm':
            self.rnn = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                batch_first=True,
                bidirectional=bidirectional
            )
        elif self.rnn_type == 'gru':
            self.rnn = nn.GRU(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                batch_first=True,
                bidirectional=bidirectional
            )
        else:  # vanilla RNN
            self.rnn = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                batch_first=True,
                bidirectional=bidirectional,
                nonlinearity='tanh'
            )

        self.fc = nn.Linear(hidden_dim * self.num_directions, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        output, hidden = self.rnn(x, hidden)  # output: [batch, seq_len, hidden*directions]
        output = self.fc(output)  # Predict vocab token at each time step
        return output, hidden

In [24]:
vocab_size = 5000  # from tokenizer
embedding_dim = 128
hidden_dim = 256
seq_len = 50
batch_size = 32

# Try a bidirectional GRU
model = ShakespeareRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    rnn_type='gru',
    bidirectional=True
)

sample_input = torch.randint(0, vocab_size, (batch_size, seq_len))
output, hidden = model(sample_input)

print("Output shape:", output.shape)  # Should be [batch_size, seq_len, vocab_size]


Output shape: torch.Size([32, 50, 5000])


In [25]:
import torch.optim as optim 
import random

# Choose device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sample hyperparameters
vocab_size = 5000
embedding_dim = 128
hidden_dim = 256
seq_len = 50
batch_size = 32

# Create the model
model = ShakespeareRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    rnn_type='lstm',  # Change to 'gru' or 'rnn' to test
    bidirectional=True
).to(device)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer — swap between Adam and RMSprop
optimizer = optim.Adam(model.parameters(), lr=0.001)
# optimizer = optim.RMSprop(model.parameters(), lr=0.001)

# Gradient clipping threshold
clip_value = 5.0

In [None]:
def train_epoch(model, data, optimizer, criterion, teacher_forcing_ratio=0.5):
    model.train()
    total_loss = 0

    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]

        # Skip if final batch is too small
        if len(batch) < batch_size:
            continue

        inputs = torch.tensor([x[0] for x in batch], dtype=torch.long).to(device)  # [B, T]
        targets = torch.tensor([x[1] for x in batch], dtype=torch.long).to(device)  # [B, T]

        optimizer.zero_grad()
        output, _ = model(inputs)  # output shape: [B, T, vocab_size]

        # Optionally apply teacher forcing (predict one token at a time)
        if teacher_forcing_ratio < 1.0:
            # Loop through time steps manually
            loss = 0
            for t in range(seq_len):
                use_teacher = random.random() < teacher_forcing_ratio
                if use_teacher or t == 0:
                    inp = inputs[:, t]
                else:
                    # Use model's own prediction
                    _, top1 = torch.max(output[:, t-1], dim=1)
                    inp = top1
                pred, _ = model(inp.unsqueeze(1))
                loss += criterion(pred.squeeze(1), targets[:, t])
            loss = loss / seq_len
        else:
            # Full-sequence prediction
            output = output.view(-1, vocab_size)
            targets = targets.view(-1)
            loss = criterion(output, targets)

        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()
        total_loss += loss.item()

    return total_loss / (len(data) // batch_size)

In [None]:
import matplotlib.pyplot as plt

def train_model(model, data, epochs=10, optimizer_type='adam', teacher_forcing_ratio=1.0):
    # Choose optimizer
    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=0.001)
    elif optimizer_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=0.001)

    criterion = nn.CrossEntropyLoss()
    losses = []

    for epoch in range(epochs):
        loss = train_epoch(model, data, optimizer, criterion, teacher_forcing_ratio)
        losses.append(loss)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f}")

    return losses

# Train with teacher forcing
# tf_losses = train_model(model, seq_50, epochs=10, optimizer_type='adam', teacher_forcing_ratio=1.0)

# Train without teacher forcing (set ratio to 0)
model = ShakespeareRNN(...).to(device)  # Re-init to reset weights if needed
no_tf_losses = train_model(model, seq_50, epochs=10, optimizer_type='adam', teacher_forcing_ratio=0.0)

# Plot
# plt.plot(tf_losses, label='With Teacher Forcing')
# plt.plot(no_tf_losses, label='Without Teacher Forcing')
# plt.legend()
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.title("Training Loss Comparison")
# plt.show()


Epoch 1/10 - Loss: 1.0914
Epoch 2/10 - Loss: 0.1163
Epoch 3/10 - Loss: 0.0940
Epoch 4/10 - Loss: 0.0850
Epoch 5/10 - Loss: 0.0785
Epoch 6/10 - Loss: 0.0742
Epoch 7/10 - Loss: 0.0713
Epoch 8/10 - Loss: 0.0693
Epoch 9/10 - Loss: 0.0678
Epoch 10/10 - Loss: 0.0670


In [None]:
def generate_text(model, tokenizer, start_text="<BOS>", max_length=100, temperature=1.0, top_k=50):
    model.eval()

    def sample_with_top_k(logits, k):
        logits = logits / temperature
        probs = torch.nn.functional.softmax(logits, dim=0)
        top_probs, top_indices = torch.topk(probs, k)
        sampled = torch.multinomial(top_probs, 1)
        return top_indices[sampled].item()

    input_ids = tokenizer.encode(start_text).ids
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # shape: [1, seq_len]

    generated = input_ids.copy()
    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_tensor, hidden)
        logits = output[0, -1, :]  # get last token's logits

        # Optional: repetition penalty (commented out by default)
        for token_id in set(generated[-10:]):  # penalize repeated tokens in last 10
            logits[token_id] *= 0.9

        next_token_id = sample_with_top_k(logits, top_k)

        if tokenizer.id_to_token(next_token_id) == "<EOS>":
            break

        generated.append(next_token_id)
        input_tensor = torch.tensor([[next_token_id]], dtype=torch.long).to(device)

    return tokenizer.decode(generated)

In [None]:
generated = generate_text(model, tokenizer, start_text="To be", max_length=100, temperature= 1, top_k=50)
print(generated)

NameError: name 'generate_text' is not defined