In [79]:
# Import dependencies
import os
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

In [80]:
# Pick the compute backend: prefer CUDA, then Apple MPS, and fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [81]:
# Define paths
# All paths are relative to the notebook's working directory.
data_path = "data/input.txt"  # raw training corpus (downloaded on first run)
model_path = "models/rnn.pth"  # destination for the trained model's state_dict
tokenizer_path = "models/tokenizer.pth"  # destination for a serialized tokenizer object
word_tokenizer_path = "models/word_tokenizer.pth"  # presumably for the word-level tokenizer (per its name)

In [82]:
# Make sure the output folders exist before anything is written into them
for directory in ("data", "models"):
    os.makedirs(directory, exist_ok=True)

In [83]:
# Download Dataset
# The corpus is fetched only once; later runs reuse the cached copy at data_path.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
if os.path.exists(data_path):
    print("Dataset already downloaded.")
else:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of caching an error page as the dataset,
    # which would then be treated as "already downloaded" on every later run.
    response.raise_for_status()
    with open(data_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print("Dataset downloaded")

Dataset already downloaded.


In [84]:
# Define Tokenizer character by character
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is every distinct character of a corpus."""

    def __init__(self, text):
        """Build the sorted character vocabulary and both lookup tables from ``text``."""
        alphabet = list(set(text))
        alphabet.sort()
        self.chars = alphabet
        self.vocab_size = len(alphabet)
        self.char2idx = dict(zip(alphabet, range(len(alphabet))))
        self.idx2char = dict(enumerate(alphabet))

    def encode(self, text):
        """Return the int32 array of vocabulary ids for ``text``.

        Raises KeyError when ``text`` contains a character outside the vocabulary.
        """
        token_ids = list(map(self.char2idx.__getitem__, text))
        return np.array(token_ids, dtype=np.int32)

    def decode(self, indices):
        """Turn an iterable of vocabulary ids back into a string."""
        pieces = []
        for token_id in indices:
            pieces.append(self.idx2char[token_id])
        return ''.join(pieces)


In [85]:
# Define Tokenizer word by word
class WordTokenizer:
    """Word-level tokenizer whose vocabulary is every distinct whitespace-separated word of a corpus."""

    def __init__(self, text):
        """Build the sorted word vocabulary and both lookup tables from ``text``."""
        vocabulary = list(set(text.split()))
        vocabulary.sort()
        self.words = vocabulary
        self.vocab_size = len(vocabulary)
        self.word2idx = dict(zip(vocabulary, range(len(vocabulary))))
        self.idx2word = dict(enumerate(vocabulary))

    def encode(self, text):
        """Return the int32 array of ids for the words of ``text``.

        Words that are not in the vocabulary are skipped rather than reported.
        """
        known_ids = []
        for token in text.split():
            token_id = self.word2idx.get(token)
            if token_id is not None:
                known_ids.append(token_id)
        return np.array(known_ids, dtype=np.int32)

    def decode(self, indices):
        """Turn an iterable of vocabulary ids back into a space-separated string."""
        return ' '.join(self.idx2word[token_id] for token_id in indices)


In [86]:
# Read the whole corpus into memory
with open(data_path, "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Build one tokenizer of each kind from the same corpus
word_tokenizer = WordTokenizer(text)
char_tokenizer = CharTokenizer(text)

# Allow switching between tokenizers
def get_tokenizer(tokenizer_type):
    """Return the module-level tokenizer matching ``tokenizer_type``.

    ``"char"`` selects ``char_tokenizer`` and ``"word"`` selects ``word_tokenizer``;
    any other value raises ValueError.
    """
    if tokenizer_type != "char" and tokenizer_type != "word":
        raise ValueError("Unknown tokenizer type. Use 'char' or 'word'.")
    # Globals are looked up at call time, so a rebuilt tokenizer is picked up.
    return char_tokenizer if tokenizer_type == "char" else word_tokenizer


In [87]:
# Switch between "char" and "word" tokenizer
# get_tokenizer raises ValueError for any value other than these two.
tokenizer_type = "word"
# The selected tokenizer sizes the model's vocabulary and encodes the training data below.
tokenizer = get_tokenizer(tokenizer_type)

In [88]:
# Build the word-level tokenizer from the corpus and persist it
with open(data_path, "r", encoding="utf-8") as f:
    text = f.read()

# Initialize tokenizer
word_tokenizer = WordTokenizer(text)
# Save to the dedicated word-tokenizer path: writing to the generic
# tokenizer_path would overwrite whatever tokenizer is stored there and
# left word_tokenizer_path unused.
# NOTE: torch.save pickles the whole object, so loading it back needs the
# WordTokenizer class to be importable (and a full, non-weights-only load).
torch.save(word_tokenizer, word_tokenizer_path)
print("word tokenizer saved")

word tokenizer saved


In [89]:
# Define Dataset Class
class CharDataset(Dataset):
    """Sliding-window dataset of (input, target) pairs for next-token prediction.

    Sample ``i`` is the window ``data[i : i + seq_length]`` paired with the same
    window shifted one token to the right, so every input position has the
    following token as its target.

    Args:
        text: Raw text to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of ids.
        seq_length: Number of tokens in each input/target window.
    """

    def __init__(self, text, tokenizer, seq_length=100):
        self.tokenizer = tokenizer
        self.data = self.tokenizer.encode(text)
        self.seq_length = seq_length
        # Each sample needs seq_length inputs plus one extra token for the
        # shifted target. Clamp at zero so a text shorter than seq_length
        # yields an empty dataset instead of a negative (invalid) length.
        self.num_samples = max(0, len(self.data) - seq_length)

    def __len__(self):
        """Return the number of complete windows available."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``. This
                also lets plain ``for sample in dataset`` iteration terminate
                instead of yielding truncated windows forever.
        """
        if idx < 0:
            idx += self.num_samples
        if not 0 <= idx < self.num_samples:
            raise IndexError(f"index out of range for dataset of size {self.num_samples}")
        input_seq = self.data[idx:idx + self.seq_length]
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target_seq, dtype=torch.long)



In [90]:
# Define Model
class CharRNN(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection language model.

    Args:
        vocab_size: Number of distinct tokens (size of the embedding table and
            of the output logits).
        embedding_dim: Size of each token embedding.
        hidden_dim: Number of hidden units per LSTM layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(CharRNN, self).__init__()
        # Kept so init_hidden can build states that match this instance.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Run a batch of token ids through the network.

        Args:
            x: Integer tensor of token ids, batch first.
            hidden: ``(h, c)`` LSTM state, or ``None`` to start from zeros.

        Returns:
            ``(logits, hidden)`` where ``logits`` has one score per vocabulary
            entry for every input position and ``hidden`` is the updated state.
        """
        x = self.embed(x)
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` state sized for this model.

        The shape follows the configured ``num_layers`` and ``hidden_dim``
        (rather than fixed values), and the tensors are created with the
        model's own device and dtype so they can be fed to ``forward`` directly.
        """
        reference = self.fc.weight
        shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        c0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return (h0, c0)


In [91]:
# Train the Model
def train_model(model, data, vocab_size, optimizer, criterion, device, writer, num_epochs=0, seq_length=100):
    """Train ``model`` on consecutive, non-overlapping windows of ``data``.

    The token stream is cut into windows of ``seq_length`` tokens; each window
    is one batch of size 1 whose target is the same window shifted by one
    token. The recurrent state is carried from one window to the next within
    an epoch (detached so gradients do not flow across windows) and reset at
    the start of every epoch.

    Args:
        model: Module called as ``model(inputs, hidden)`` returning ``(logits, hidden)``.
        data: 1-D sequence of token ids (NumPy array, list or tensor).
        vocab_size: Size of the last logits dimension, used to flatten them for the loss.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(flat_logits, flat_targets)``.
        device: Device the batches are moved to.
        writer: Object with ``add_scalar(tag, value, step)`` (e.g. a SummaryWriter).
        num_epochs: Number of passes over ``data``; the default of 0 trains nothing.
        seq_length: Number of tokens per window.

    Returns:
        The same ``model`` instance, trained in place.

    Raises:
        ValueError: if training is requested but ``data`` is too short to
            produce a single window.
    """
    model.train()

    # Convert once instead of building a fresh tensor from the array every step.
    tokens = torch.as_tensor(data, dtype=torch.long)
    window_starts = range(0, len(tokens) - seq_length, seq_length)
    # Use the real batch count for averaging: the floor division used before
    # under-counted when the length was not a multiple of seq_length and was
    # zero (ZeroDivisionError) when only one partial stride fit.
    num_batches = len(window_starts)
    if num_epochs > 0 and num_batches == 0:
        raise ValueError(
            f"data has {len(tokens)} tokens; need more than seq_length={seq_length} to train"
        )

    global_step = 0
    for epoch in range(num_epochs):
        hidden = None  # start every epoch from a zero state
        epoch_loss = 0.0
        for i in window_starts:
            inputs = tokens[i:i + seq_length].unsqueeze(0).to(device)
            targets = tokens[i + 1:i + seq_length + 1].unsqueeze(0).to(device)

            optimizer.zero_grad()
            outputs, hidden = model(inputs, hidden)
            # Keep the state's values for the next window but cut it from this
            # window's graph, so backward() never reaches into earlier windows.
            if hidden is not None:
                if isinstance(hidden, tuple):
                    hidden = tuple(h.detach() for h in hidden)
                else:
                    hidden = hidden.detach()

            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            writer.add_scalar("Loss/Batch", loss.item(), global_step)
            global_step += 1
        avg_loss = epoch_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
        writer.add_scalar("Loss/Epoch", avg_loss, epoch+1)
    return model

In [92]:
# Training Setup
writer = SummaryWriter("logs")

# The vocabulary size comes from the tokenizer selected earlier.
model = CharRNN(tokenizer.vocab_size).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.002)
criterion = nn.CrossEntropyLoss()

data = tokenizer.encode(text)
# train_model defaults to num_epochs=0, which would skip training entirely and
# save an untrained model; request the 20 epochs shown in the recorded output.
model = train_model(model, data, tokenizer.vocab_size, optimizer, criterion, device, writer, num_epochs=20)
# Make sure all logged scalars reach disk before the run is inspected.
writer.flush()

# Save model
torch.save(model.state_dict(), model_path)
print("Model saved")

Epoch 1/20, Loss: 7.6472
Epoch 2/20, Loss: 6.6793
Epoch 3/20, Loss: 6.1535
Epoch 4/20, Loss: 5.7276
Epoch 5/20, Loss: 5.3709
Epoch 6/20, Loss: 4.9992
Epoch 7/20, Loss: 4.6226
Epoch 8/20, Loss: 4.2884
Epoch 9/20, Loss: 3.9874
Epoch 10/20, Loss: 3.7333
Epoch 11/20, Loss: 3.4798
Epoch 12/20, Loss: 3.2601
Epoch 13/20, Loss: 3.0592
Epoch 14/20, Loss: 2.8904
Epoch 15/20, Loss: 2.7204
Epoch 16/20, Loss: 2.5651
Epoch 17/20, Loss: 2.4176
Epoch 18/20, Loss: 2.2844
Epoch 19/20, Loss: 2.1551
Epoch 20/20, Loss: 2.0467
Model saved


In [93]:
# Generate Text
def generate_text(start_string, length=50, temperature=1.0):
    """Sample a continuation of ``start_string`` from the trained model.

    Uses the notebook-level ``model``, ``device``, ``tokenizer`` and
    ``tokenizer_type``. The prompt is fed through the model once, then one
    token at a time is sampled from the temperature-scaled distribution and
    fed back in.

    Args:
        start_string: Prompt text; it is returned as the prefix of the result.
        length: Number of tokens to generate.
        temperature: Positive scaling of the logits; lower values make the
            sampling more conservative.

    Returns:
        The prompt followed by the generated tokens.

    Raises:
        ValueError: if ``temperature`` is not positive, or if the prompt
            encodes to no tokens (e.g. none of its words are in the vocabulary).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode with the tokenizer the model was trained with, not a fixed one,
    # so generation keeps working when tokenizer_type is switched.
    prompt_ids = tokenizer.encode(start_string)
    if len(prompt_ids) == 0:
        raise ValueError("start_string contains no tokens known to the tokenizer")
    # Word tokens are re-joined with spaces; characters are simply concatenated.
    separator = " " if tokenizer_type == "word" else ""

    model.eval()
    input_seq = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(device)
    hidden = tuple(state.to(device) for state in model.init_hidden(1))
    pieces = [start_string]

    # Inference only: without no_grad the autograd graph would keep growing
    # through the carried hidden state for every generated token.
    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq, hidden)
            # Only the distribution at the last position predicts the next token.
            output_dist = output[:, -1, :] / temperature
            predicted_id = torch.multinomial(torch.softmax(output_dist, dim=1), 1).item()
            input_seq = torch.tensor([[predicted_id]], dtype=torch.long).to(device)
            pieces.append(tokenizer.decode([predicted_id]))

    return separator.join(pieces)

In [94]:
# Sample a 500-token continuation of a speaker prompt and show it
sample_text = generate_text("ROMEO:", length=500, temperature=0.8)
print(sample_text)

ROMEO: Who left forgot As the grief doth usurp the bigger light, and a sort. ANTONIO: A laughter. SEBASTIAN: Upon the acorn cradled. Follow. his friends, loss, As if a spendthrift of the merchant And burn in their King of Tunis. ANTONIO: No marrying 'mong the maid: your king's loss, and relieve him, indeed: you not your men that's not a man than the miraculous harp; it is not a prison. GONZALO: Come, sir, I speak not to the entertainer-- SEBASTIAN: Which is a man of Milan, he's a traitor. Come; I are not in all the breasts breathes again or endeavour: treason, felony, Sword, pike, knife, gun, or need for ever: Milan and Naples Have about their rotten ones. CATESBY: We are made home to revenge the bloody colours of ADRIAN: GONZALO: No, my most man is gone with all the rest of green too. SEBASTIAN: Is the miracle, of this business' use he of joy; or dost thou waking? SEBASTIAN: Good the ground for her rats are in thirst a silly thing, The king's cock. ANTONIO: The cockerel. SEBASTIAN: Do