In [1]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
import unicodedata
import os
import re
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFKC
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import BPEDecoder
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess(text):
    lines = text.split("\n")
    cleaned_lines = [unicodedata.normalize("NFKC", line).strip() for line in lines if line.strip()]
    return "\n".join(cleaned_lines)

In [3]:
dataset = load_dataset("tiny_shakespeare.py")

In [4]:
drive_path = "/content/drive/MyDrive/Assignment 2 - MGSC695/"

In [5]:
train_data = dataset["train"]["text"]
valid_data = dataset["validation"]["text"]
test_data = dataset["test"]["text"]

In [6]:
# store the text in variables
train_text = " ".join(train_data)
valid_text = " ".join(valid_data)
test_text = " ".join(test_data)

# preprocess the text
train_text = preprocess(train_text)
valid_text = preprocess(valid_text)
test_text = preprocess(test_text)
print(train_text[:200])

First Citizen:
Before we proceed any further, hear me speak.
All:
Speak, speak.
First Citizen:
You are all resolved rather to die than to famish?
All:
Resolved. resolved.
First Citizen:
First, you kno


In [7]:
# Write the training text to a temporary file (tokenizers library needs files)
with open("shakespeare_clean.txt", "w", encoding="utf-8") as f:
    f.write(train_text)

In [8]:
# check out what the data looks like
print(test_text[:200])

rance ta'en
As shall with either part's agreement stand?
BAPTISTA:
Not in my house, Lucentio; for, you know,
Pitchers have ears, and I have many servants:
Besides, old Gremio is hearkening still;
And 


### No stemming or removing punctuation because the purpose of this model is to generate text like Shakespeare would. Stemming or removing punctuation would remove the stylistic richness of the data that makes Shakespearean text what it is...

In [9]:
train_lines = train_text.split("\n")
valid_lines = valid_text.split("\n")
test_lines = test_text.split("\n")

In [10]:
# Define the tokenizer and trainer
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = Whitespace()

In [11]:
# Define trainer
trainer = BpeTrainer(
    vocab_size=1000,  # You can change this based on your dataset size
    special_tokens=["<unk>", "<pad>", "<bos>", "<eos>"]
)

# Train on training data only
tokenizer.train_from_iterator(train_lines, trainer)

In [12]:
tokenizer.post_processor = TemplateProcessing(
    single="<bos> $A <eos>",
    special_tokens=[
        ("<bos>", tokenizer.token_to_id("<bos>")),
        ("<eos>", tokenizer.token_to_id("<eos>"))
    ]
)
tokenizer.decoder = BPEDecoder()

In [13]:
# Tokenize each dataset
train_encoded = tokenizer.encode(train_text)
val_encoded = tokenizer.encode(valid_text)
test_encoded = tokenizer.encode(test_text)

# Get token ID sequences
train_ids = train_encoded.ids
val_ids = val_encoded.ids
test_ids = test_encoded.ids

In [14]:
tokenizer.save("bpe_tokenizer.json")

# if we need it later:
# tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

In [15]:
# time for creating input and target sequences
seq_len = 50

In [16]:
def create_sequences(data_ids, seq_len):
    inputs = []
    targets = []
    for i in range(0, len(data_ids) - seq_len):
        inputs.append(data_ids[i:i+seq_len])
        targets.append(data_ids[i+1:i+seq_len+1])
    return inputs, targets

X_train, y_train = create_sequences(train_ids, seq_len)
X_val, y_val = create_sequences(val_ids, seq_len)
X_test, y_test = create_sequences(test_ids, seq_len)

In [17]:
# Convert to tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)

# Build a dataset and dataloader
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

In [18]:
X_val = torch.tensor(X_val, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

In [19]:
import torch.nn as nn

class VanillaRNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=tokenizer.token_to_id("<pad>"))
        self.rnn = nn.RNN(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        self.num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * self.num_directions, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        output, hidden = self.rnn(x, hidden)
        logits = self.fc(output)
        return logits, hidden

In [20]:
vocab_size = tokenizer.get_vocab_size()
model = VanillaRNNModel(
    vocab_size=vocab_size,
    embed_size=256,
    hidden_size=256,
    num_layers=2,
    bidirectional=True
)

In [21]:
# use GPU only
assert torch.cuda.is_available(), "CUDA is not available. Please run on a machine with a GPU."
device = torch.device("cuda")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [52]:
num_epochs = 5
clip_value = 1.0

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs, _ = model(X_batch)

        loss = criterion(outputs.view(-1, vocab_size), y_batch.view(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # 🔍 VALIDATION
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs, _ = model(X_batch)

            val_loss = criterion(outputs.view(-1, vocab_size), y_batch.view(-1))
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs} — Train Loss: {avg_train_loss:.4f} — Val Loss: {avg_val_loss:.4f}")

Epoch 1/5 — Train Loss: 0.1308 — Val Loss: 0.0933
Epoch 2/5 — Train Loss: 0.0876 — Val Loss: 0.0914
Epoch 3/5 — Train Loss: 0.0841 — Val Loss: 0.0917
Epoch 4/5 — Train Loss: 0.0821 — Val Loss: 0.0912
Epoch 5/5 — Train Loss: 0.0808 — Val Loss: 0.0915


In [64]:
# Save the model to file
torch.save(model.state_dict(),"vanilla_rnn_model.pth")

In [22]:
import torch.nn.functional as F

def generate_text(model, tokenizer, prompt, max_length=100, temperature=1.0):
    model.eval()
    device = next(model.parameters()).device

    # Encode the prompt into token IDs
    encoded = tokenizer.encode(prompt)
    input_ids = torch.tensor(encoded.ids, dtype=torch.long).unsqueeze(0).to(device)

    # Hidden state (can be None initially)
    hidden = None
    generated_ids = input_ids.tolist()[0]  # seed with initial prompt tokens

    # Generate tokens one-by-one
    for _ in range(max_length):
        with torch.no_grad():
            output, hidden = model(input_ids, hidden)  # output: [1, seq_len, vocab_size]
            logits = output[:, -1, :] / temperature    # take last token's logits

            probs = F.softmax(logits, dim=-1)
            next_token_id = torch.multinomial(probs, num_samples=1).item()

            generated_ids.append(next_token_id)

            input_ids = torch.tensor([[next_token_id]], dtype=torch.long).to(device)

    # Decode generated token IDs back into text
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return generated_text

In [23]:
prompt = "To be, or not to"
output = generate_text(model, tokenizer, prompt, max_length=120, temperature=1.5)

print("📜 Generated Text:")
print(output)

📜 Generated Text:
Tobe,ornottowereupzfalullboypleckveveryrighttalGELhiconf,ComekindwereCORIOLANLOortWellqueenarsspsorItsweRICHVIhimtleerfirstIZicefeIIachherefoothingELIZABETHTisakelorddedowerparThisthineyoujeHENRYVGounetingtrongheaWARWICcomeunwerdescomesfromconfbedYetShallOMThisWillwhinatherWICcondbeheardhandssoForENsandfaroughtRICHvenhitherprisentHeffriendsoulGLUCYorkguquiurWARWICtelluntoUSithueper?ousingosetEOusKINGHAMfal


In [24]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=tokenizer.token_to_id("<pad>"))
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        self.num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * self.num_directions, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        if hidden is not None:
            output, hidden = self.lstm(x, hidden)
        else:
            output, hidden = self.lstm(x)
        logits = self.fc(output)
        return logits, hidden

In [25]:
vocab_size = tokenizer.get_vocab_size()

model_LSTM = LSTMLanguageModel(
    vocab_size=vocab_size,
    embed_size=512,  # increased from 256
    hidden_size=1024,  # increased from 512
    num_layers=3,  # increased from 2
    bidirectional=False
).to(device)

# Initialize new optimizer specifically for LSTM
optimizer_lstm = torch.optim.Adam(model_LSTM.parameters(), lr=0.001)
# Add learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_lstm, mode='min', factor=0.5, patience=1)

# Initialize weights properly
def init_weights(m):
    if type(m) in [nn.Linear, nn.Embedding]:
        nn.init.xavier_uniform_(m.weight)
model_LSTM.apply(init_weights)

LSTMLanguageModel(
  (embedding): Embedding(1000, 512, padding_idx=1)
  (lstm): LSTM(512, 1024, num_layers=3, batch_first=True)
  (fc): Linear(in_features=1024, out_features=1000, bias=True)
)

In [26]:
num_epochs = 10  # increased from 5
clip_value = 0.25  # reduced from 1.0
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model_LSTM.train()
    total_train_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer_lstm.zero_grad()  # use lstm optimizer
        outputs, _ = model_LSTM(X_batch)

        loss = criterion(outputs.view(-1, vocab_size), y_batch.view(-1))
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model_LSTM.parameters(), clip_value)
        optimizer_lstm.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    model_LSTM.eval()
    total_val_loss = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs, _ = model_LSTM(X_batch)
            val_loss = criterion(outputs.view(-1, vocab_size), y_batch.view(-1))
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    
    # Update learning rate based on validation loss
    scheduler.step(avg_val_loss)

    # Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model_LSTM.state_dict(), 'best_lstm_model.pth')

    print(f"Epoch {epoch+1}/{num_epochs} — Train Loss: {avg_train_loss:.4f} — Val Loss: {avg_val_loss:.4f} — LR: {optimizer_lstm.param_groups[0]['lr']:.6f}")

KeyboardInterrupt: 

In [67]:
prompt = "To be, or not to"
output = generate_text(model_LSTM, tokenizer, prompt, max_length=120, temperature=1.1)

print("📜 Generated Text:")
print(output)

📜 Generated Text:
Tobe,ornottoCEINghanYourengENIUSrvesgrcallsuchtisIOLANulGARwonightanceqENIfbronolifeearthsweedqueendsARDTocrownProchildzVkEDWcannotwilllordsNurserenfriendsEDWARDgCORIOLANUSukeilCAPULETableisonearsblefiWARButawayLUCNorfacedracleEDWULhereOMlThousterRimarowKingcondzenWarwickhytelovethatTERDUKEinesssayXLUCshOLouldYorELslaUCESTERvedthrCongIVfollowprayISABELLAshallxHtislackgueDUKEornagainMARGARheapleORDinfearHowfear


In [68]:
# save the model
torch.save(model_LSTM.state_dict(),"lstm_model.pth")