# Generating Fake Shakespeare Text

In [21]:
import torch 
import torch.nn as nn
import numpy as np

In [22]:
if torch.cuda.is_available():
    print(torch.cuda.device_count())
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

2


'cuda'

In [23]:
from pathlib import Path
import urllib.request

def download_shakespeare_text():
    path = Path("datasets/shakespeare/shakespeare.txt")
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/shakespeare"
        urllib.request.urlretrieve(url, path)
    return path.read_text()
shakespeare_text = download_shakespeare_text()

In [24]:
print(shakespeare_text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [25]:
vocab = sorted(set(shakespeare_text.lower()))
''.join(vocab)

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [26]:
char_to_id = {char:idx for idx, char in enumerate(vocab)}
id_to_char = {idx:char for idx, char in enumerate(vocab)}

In [27]:
char_to_id["a"]

13

In [28]:
id_to_char[13]

'a'

In [29]:
def encode_text(text):
    return torch.tensor([char_to_id[char] for char in text.lower()])
def decode_text(char_ids):
    return ''.join([id_to_char[char_id.item()] for char_id in char_ids])

In [30]:
encoded = encode_text("hello world")
encoded


tensor([20, 17, 24, 24, 27,  1, 35, 27, 30, 24, 16])

In [31]:
decode_text(encoded)

'hello world'

In [32]:
class TimeSeriesDatasetBuilder:
    def __init__(self, series, window_length=56):
        self.encoded_text = encode_text(series)
        self.window_length = window_length
   
    def create_X_y(self):
        X, y =[],[]
        for i in range(len(self.encoded_text) - self.window_length):
            window = self.encoded_text[i:i+self.window_length]
            future = self.encoded_text[i+1:i+self.window_length+1]
            X.append(window)
            y.append(future)
        return np.array(X),np.array(y)

In [33]:
to_be_dataset = TimeSeriesDatasetBuilder(series="to be or not to be", window_length=10)
X,y = to_be_dataset.create_X_y()
for i in range(len(X)):
    decoded_x = decode_text(X[i])
    decoded_y = decode_text(y[i])
    print(f"x : {decoded_x}")
    print(f"y : {decoded_y}")

x : to be or n
y : o be or no
x : o be or no
y :  be or not
x :  be or not
y : be or not 
x : be or not 
y : e or not t
x : e or not t
y :  or not to
x :  or not to
y : or not to 
x : or not to 
y : r not to b
x : r not to b
y :  not to be


In [34]:
window_length = 56
batch_size = 1024 
builder = TimeSeriesDatasetBuilder(shakespeare_text,window_length)
X, y = builder.create_X_y()

In [35]:
X.shape

(1115338, 56)

In [36]:
y.shape

(1115338, 56)

In [37]:
from torch.utils.data import TensorDataset, DataLoader

X_tensor = torch.tensor(X, dtype = torch.long)
y_tensor = torch.tensor(y, dtype = torch.long).squeeze(-1)

train_set = TensorDataset(X_tensor[:1_000_000], y_tensor[:1_000_000])
valid_set = TensorDataset(X_tensor[1_000_000:1_060_000],y_tensor[1_000_000:1_060_000])
test_set = TensorDataset(X_tensor[1_060_000:], y_tensor[1_060_000:])

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, 
                          num_workers=2, pin_memory=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size,
                         num_workers=2, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=batch_size,
                        num_workers=2, pin_memory=True)

In [38]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    model.eval()
    metric.reset()
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs, patience=2,
         factor=0.5,epoch_callback=None):
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", patience=patience, factor=factor
    )
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    for epoch in range(n_epochs):
        total_loss = 0
        metric.reset()
        model.train()
        if epoch_callback is not None:
            epoch_callback(model,epoch)
        for idx,( X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            print(f"\rBatch {idx+1}/{len(train_loader)}", end="")
            print(f", loss ={total_loss/(idx+1 ):.4f} ", end="")
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        scheduler.step(val_metric)
        print(f"Epoch:{epoch+1}/{n_epochs}, "
             f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}")
    return history

In [39]:
class ShakespeareModel(nn.Module):
    def __init__(self, vocab_size, n_hidden=128, n_layers=2, embed_size=20, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, n_hidden, num_layers=n_layers,
                         batch_first=True, dropout=dropout)
        self.output = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, X):
        embeddings = self.embed(X)
        outputs, _states = self.gru(embeddings)
        return self.output(outputs).permute(0,2,1)
model = ShakespeareModel(len(vocab)).to(device)
if torch.cuda.device_count()>1:
    print("Using", torch.cuda.device_count(),"GPU's")
    model = nn.DataParallel(model)
model.to(device)

Using 2 GPU's


DataParallel(
  (module): ShakespeareModel(
    (embed): Embedding(39, 20)
    (gru): GRU(20, 128, num_layers=2, batch_first=True, dropout=0.1)
    (output): Linear(in_features=128, out_features=39, bias=True)
  )
)

In [40]:
n_epochs = 20
xentropy = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task="multiclass",num_classes=len(vocab)).to(device)
optimizer = torch.optim.NAdam(model.parameters())

history = train(model, optimizer, xentropy, accuracy, train_loader, valid_loader, n_epochs)

Batch 977/977, loss =1.7122Epoch:1/20, Train Loss: 1.7122, Train Metric: 0.4861, Valid Metric: 0.5340
Batch 977/977, loss =1.4012Epoch:2/20, Train Loss: 1.4012, Train Metric: 0.5633, Valid Metric: 0.5495
Batch 977/977, loss =1.3568Epoch:3/20, Train Loss: 1.3568, Train Metric: 0.5742, Valid Metric: 0.5529
Batch 977/977, loss =1.3353Epoch:4/20, Train Loss: 1.3353, Train Metric: 0.5795, Valid Metric: 0.5543
Batch 977/977, loss =1.3225Epoch:5/20, Train Loss: 1.3225, Train Metric: 0.5827, Valid Metric: 0.5530
Batch 977/977, loss =1.3141Epoch:6/20, Train Loss: 1.3141, Train Metric: 0.5849, Valid Metric: 0.5520
Batch 977/977, loss =1.3082Epoch:7/20, Train Loss: 1.3082, Train Metric: 0.5865, Valid Metric: 0.5530
Batch 977/977, loss =1.2995Epoch:8/20, Train Loss: 1.2995, Train Metric: 0.5889, Valid Metric: 0.5535
Batch 977/977, loss =1.2971Epoch:9/20, Train Loss: 1.2971, Train Metric: 0.5896, Valid Metric: 0.5533
Batch 977/977, loss =1.2951Epoch:10/20, Train Loss: 1.2951, Train Metric: 0.5902, 

In [41]:
# Critical Shape Journey

# X (Input) Path:

#→ windowing → [1115294, 56] 
#→ batching → [1024, 56]
#→ embed() → [1024, 56, 10]
#→ GRU() → [1024, 56, 128] 
#→ [:, -1, :] → [1024, 128]
#→ Linear() → [1024, 39]

# y (Target) Path:

# → windowing → [1115294, 1] 
# → squeeze(-1) → [1115294]
# → batching → [1024]
# → CrossEntropyLoss with [1024, 39]


In [42]:
torch.save(model.state_dict(), "my_shakespeare_model.pt")

In [43]:
text = "To be or not to b"
encoded_text = encode_text(text).unsqueeze(dim=0).to(device)
encoded_text.shape

torch.Size([1, 17])

In [44]:
model.eval()
with torch.no_grad():
    y_logits = model(encoded_text)
    predicted_char_id = y_logits[0,:,-1].argmax().item()
    predicted_char = id_to_char[predicted_char_id]
predicted_char

'e'

In [45]:
def next_char(model, text, temperature=0.7):
    model.eval()
    encoded_text = encode_text(text).unsqueeze(0).to(device)
    with torch.no_grad():
        y_logits = model(encoded_text)
        y_probas = torch.softmax(y_logits[0,:,-1]/temperature, dim=-1)
        predicted_char_id = torch.multinomial(y_probas,num_samples=1).item()     
        return id_to_char[predicted_char_id]



In [46]:
import time
def generate_text(model, text, n_chars=100,temperature=0.7):
    print(text, end='', flush=True)
    for _ in range(n_chars):
        char = next_char(model, text, temperature)
        text += char
        print(char, end='', flush=True)
        time.sleep(0.01)

In [47]:
print(generate_text(model,"To be or not to b",n_chars=500 ))

To be or not to be such as shall be so.

escalus:
no more of thy gates, our company both in good crown?

king richard iii:
i shall be show thee to me, and then my state to be a
catesby with protectors of my traitor
where he shall be make me to pardon me.

warwick:
why, then what say you well mock your
sons and from her father's uncle, i thank'd with the laurence:
she has the leans of the sun, what all, sir, if you should think
the cause that they shall be to seek the roar upon the thing
and the report to englandNone


## Statefull RNN

In [105]:
class DatasetBuilderStateful:
    def __init__(self, series,batch_size, window_length=56):
        self.encoded_text = encode_text(series)
        self.window_length = window_length
        self.batch_size = batch_size

        #total number of full windows
        self.n_consecutive_windows = (len(self.encoded_text) - 1) // self.window_length
        #windows per stream
        self.n_windows_per_stream = self.n_consecutive_windows // self.batch_size
        #spacing between streams
        self.spacing = self.n_windows_per_stream * self.window_length
        #Total samples
        self.length = self.n_windows_per_stream * self.batch_size
                
    def create_X_y(self):
        X, y =[],[]
        for i in range(self.length):
            slot = i % self.batch_size  # stream index
            window_no_in_slot = i // self.batch_size

            start = slot * self.spacing + window_no_in_slot * self.window_length
            end = start + self.window_length
            window = self.encoded_text[start:end]
            future = self.encoded_text[start+1:end+1]
            X.append(window)
            y.append(future)
        return np.array(X),np.array(y)

In [106]:
batch_size =1024

text_len = len(shakespeare_text)

train_text = shakespeare_text[:int(0.9 * text_len)]
valid_text = shakespeare_text[int(0.9 * text_len):int(text_len)]
train_builder = DatasetBuilderStateful(train_text, batch_size=batch_size)
valid_builder = DatasetBuilderStateful(valid_text, batch_size=batch_size)

X_train, y_train = train_builder.create_X_y()
X_valid, y_valid = valid_builder.create_X_y()

X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.long)
y_valid_tensor = torch.tensor(y_valid, dtype=torch.long)

train_set = TensorDataset(X_train_tensor, y_train_tensor)
valid_set = TensorDataset(X_valid_tensor, y_valid_tensor)

train_loader = DataLoader(train_set, batch_size=batch_size,
                         num_workers=2,pin_memory=True,drop_last=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size,
                         num_workers=2,pin_memory=True,drop_last=True)


In [107]:
class StatefulShakespeareModel(nn.Module):
    def __init__(self, vocab_size, n_layers=2, embed_size=20, n_hidden=128, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, n_hidden, num_layers=n_layers,batch_first=True,
                         dropout=dropout)
        self.output = nn.Linear(n_hidden,vocab_size)
        self.hidden_states = None

    def forward(self,X):
        embeddings = self.embed(X)
        outputs, hidden_states = self.gru(embeddings,self.hidden_states)
        self.hidden_states = hidden_states.detach()
        return self.output(outputs).permute(0,2,1)        

In [108]:
stateful_model = StatefulShakespeareModel(len(vocab)).to(device)
n_epochs = 20

def reset_hidden_state(model,epoch):
    model.hidden_states = None

xentropy = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=len(vocab)).to(device)
optimizer = torch.optim.NAdam(stateful_model.parameters())

history = train(stateful_model, optimizer, xentropy, accuracy, train_loader, 
                valid_loader,n_epochs, epoch_callback=reset_hidden_state)

Batch 17/17, loss =3.1932Epoch:1/20, Train Loss: 3.1932, Train Metric: 0.1435, Valid Metric: 0.1516
Batch 17/17, loss =2.8296Epoch:2/20, Train Loss: 2.8296, Train Metric: 0.2133, Valid Metric: 0.2669
Batch 17/17, loss =2.5411Epoch:3/20, Train Loss: 2.5411, Train Metric: 0.2849, Valid Metric: 0.3052
Batch 17/17, loss =2.3679Epoch:4/20, Train Loss: 2.3679, Train Metric: 0.3231, Valid Metric: 0.3306
Batch 17/17, loss =2.2548Epoch:5/20, Train Loss: 2.2548, Train Metric: 0.3497, Valid Metric: 0.3543
Batch 17/17, loss =2.1581Epoch:6/20, Train Loss: 2.1581, Train Metric: 0.3741, Valid Metric: 0.3635
Batch 17/17, loss =2.0888Epoch:7/20, Train Loss: 2.0888, Train Metric: 0.3899, Valid Metric: 0.3828
Batch 17/17, loss =2.0179Epoch:8/20, Train Loss: 2.0179, Train Metric: 0.4062, Valid Metric: 0.3921
Batch 17/17, loss =1.9624Epoch:9/20, Train Loss: 1.9624, Train Metric: 0.4200, Valid Metric: 0.4051
Batch 17/17, loss =1.9126Epoch:10/20, Train Loss: 1.9126, Train Metric: 0.4327, Valid Metric: 0.4118

In [115]:

def next_char(model, text, temperature):
    model.eval()
    encoded_text = encode_text(text).unsqueeze(0).to(device)
    with torch.no_grad():
        y_logits = model(encoded_text)
        y_probas = torch.softmax(y_logits[0,:,-1]/temperature, dim=-1)
        predicted_char_id = torch.multinomial(y_probas,num_samples=1).item()     
        return id_to_char[predicted_char_id]


def generate_text(model, text, n_chars=100,temperature=0.9):
    
    print(text, end='', flush=True)
    for _ in range(n_chars):
        model.hidden_states = None
        char = next_char(model, text, temperature)
        text += char
        print(char, end='', flush=True)
        time.sleep(0.01)

In [116]:
print(generate_text(stateful_model,"To be or not to b",n_chars=500 ))

To be or not to badn.

repeart:
that baat
is with on his brows in crying.

brucus:
thards not abut the wellow as a man'st!
the hill thy cortion whe, flown string with inish;
my court, my changred their suntion cases,
oul the houldied, a dalk, 'tis to breinide:
and say to the hunger cease of the hothing recomest unrattled,
take on them now world to sitred comking man they son, the row, ever. way not prows your early. i aptor, duke with him wish
of burnings have blood off out our saod cably
not them: strent disweyNone
