In [1]:
import torch 
import torch.nn as nn
import numpy as np

In [2]:
# Pick the compute device: CUDA when present (also reporting how many GPUs
# are visible), otherwise Apple MPS, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    print(torch.cuda.device_count())
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
device

2


'cuda'

In [3]:
from pathlib import Path
import urllib.request

def download_shakespeare_text():
    """Return the Shakespeare corpus as a string, downloading it on first use.

    The text is cached at ``datasets/shakespeare/shakespeare.txt`` (relative to
    the current working directory). When that file already exists no network
    access happens.

    Returns:
        str: the full contents of the cached file, decoded as UTF-8.

    Raises:
        urllib.error.URLError / OSError: if the download or file access fails.
    """
    path = Path("datasets/shakespeare/shakespeare.txt")
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/shakespeare"
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that later runs
        # would mistake for a complete cache.
        partial_path = path.with_name(path.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(path)
        finally:
            if partial_path.exists():
                partial_path.unlink()
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    return path.read_text(encoding="utf-8")
# Load the corpus: fetched on the first run, read from the cached file afterwards.
shakespeare_text = download_shakespeare_text()

In [4]:
# Peek at the first 100 characters of the corpus.
print(shakespeare_text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Vocabulary = every distinct character of the lower-cased corpus, in sorted order.
vocab = sorted(set(shakespeare_text.lower()))
# Show the whole vocabulary as one string.
''.join(vocab)

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [6]:
# Two-way lookup tables between characters and integer ids; a character's id
# is its position in the sorted `vocab` list.
id_to_char = dict(enumerate(vocab))
char_to_id = {char: idx for idx, char in id_to_char.items()}

In [7]:
# Sanity check: the integer id assigned to the character "a".
char_to_id["a"]

13

In [8]:
# Sanity check of the inverse mapping: id 13 back to its character.
id_to_char[13]

'a'

In [9]:
def encode_text(text):
    """Convert a string into a 1-D tensor of character ids.

    The text is lower-cased first, then each character is looked up in
    ``char_to_id``.

    Args:
        text: the string to encode.

    Returns:
        torch.Tensor: int64 tensor with one id per character of ``text``.

    Raises:
        KeyError: if ``text`` contains a character missing from ``char_to_id``.
    """
    # Explicit dtype: without it an empty string would produce a float32
    # tensor, which cannot be fed to an embedding layer.
    return torch.tensor([char_to_id[char] for char in text.lower()], dtype=torch.long)
def decode_text(char_ids):
    """Turn a sequence of character ids back into a string.

    Each element of ``char_ids`` must expose ``.item()`` (as tensor and NumPy
    scalar elements do); the resulting value is looked up in ``id_to_char``.
    """
    chars = (id_to_char[char_id.item()] for char_id in char_ids)
    return ''.join(chars)

In [10]:
# Encode a sample string into a tensor of character ids and display it.
encoded = encode_text("hello world")
encoded


tensor([20, 17, 24, 24, 27,  1, 35, 27, 30, 24, 16])

In [11]:
# Round trip: decoding the ids should give back the original string.
decode_text(encoded)

'hello world'

In [12]:
class TimeSeriesDatasetBuilder:
    """Builds sliding-window (input, target) pairs from a piece of text.

    The text is encoded to character ids once, at construction time. Each
    sample is ``window_length`` consecutive ids; its target is the ``horizon``
    ids that immediately follow.

    Args:
        series: the text to encode (passed to ``encode_text``).
        window_length: number of characters in each input window (>= 1).
        horizon: number of characters to predict after each window (>= 1).

    Raises:
        ValueError: if ``window_length`` or ``horizon`` is smaller than 1.
    """
    def __init__(self, series, window_length=56, horizon = 1):
        if window_length < 1 or horizon < 1:
            raise ValueError("window_length and horizon must both be >= 1")
        self.encoded_text = encode_text(series)
        self.window_length = window_length
        self.horizon = horizon

    def create_X_y(self):
        """Return ``(X, y)`` NumPy arrays of windows and their targets.

        Returns:
            tuple: ``X`` of shape ``(n, window_length)`` and ``y`` of shape
            ``(n, horizon)``, where
            ``n = len(text) - window_length - horizon + 1``. When the text is
            too short for a single sample, ``n`` is 0 and both arrays are
            empty but keep their second dimension.
        """
        # Only keep windows whose *full* horizon fits inside the text;
        # otherwise the last targets would be shorter than `horizon`.
        n_windows = len(self.encoded_text) - self.window_length - self.horizon + 1
        if n_windows <= 0:
            ids_dtype = self.encoded_text.numpy().dtype
            return (np.empty((0, self.window_length), dtype=ids_dtype),
                    np.empty((0, self.horizon), dtype=ids_dtype))
        # `unfold` yields every sliding window as a strided view, avoiding a
        # Python-level loop over (potentially) millions of positions.
        windows = self.encoded_text.unfold(0, self.window_length, 1)[:n_windows]
        futures = self.encoded_text[self.window_length:].unfold(0, self.horizon, 1)
        # `.copy()` materialises independent, contiguous arrays.
        return windows.numpy().copy(), futures.numpy().copy()

In [13]:
# Small demo: window a short phrase and print each window with its target.
to_be_dataset = TimeSeriesDatasetBuilder(series="to be or not to be", window_length=10)
X, y = to_be_dataset.create_X_y()
for window_ids, target_ids in zip(X, y):
    print(f"x : {decode_text(window_ids)}")
    print(f"y : {decode_text(target_ids)}")

x : to be or n
y : o
x : o be or no
y : t
x :  be or not
y :  
x : be or not 
y : t
x : e or not t
y : o
x :  or not to
y :  
x : or not to 
y : b
x : r not to b
y : e


In [14]:
# Number of characters the model sees per sample.
window_length = 56
# Mini-batch size used by the data loaders.
batch_size = 1024 
# Window the whole corpus; the builder's default horizon of 1 gives one target character per window.
builder = TimeSeriesDatasetBuilder(shakespeare_text,window_length)
X, y = builder.create_X_y()

In [15]:
# Inspect the window matrix: one row of character ids per window.
X

array([[18, 21, 30, ...,  1, 31, 28],
       [21, 30, 31, ..., 31, 28, 17],
       [30, 31, 32, ..., 28, 17, 13],
       ...,
       [27, 30, 32, ..., 23, 21, 26],
       [30, 32, 33, ..., 21, 26, 19],
       [32, 33, 26, ..., 26, 19,  8]])

In [16]:
from torch.utils.data import TensorDataset, DataLoader

# Convert the windows/targets to int64 tensors; targets are flattened from
# (n, 1) to (n,) so they can be used as class indices.
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long).squeeze(-1)

# Sequential split by window index: [0, train_end) for training,
# [train_end, valid_end) for validation, the remainder for testing.
train_end, valid_end = 1_000_000, 1_060_000
train_set = TensorDataset(X_tensor[:train_end], y_tensor[:train_end])
valid_set = TensorDataset(X_tensor[train_end:valid_end], y_tensor[train_end:valid_end])
test_set = TensorDataset(X_tensor[valid_end:], y_tensor[valid_end:])

# Options shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_set, **loader_kwargs)
test_loader = DataLoader(test_set, **loader_kwargs)

In [17]:
import torchmetrics

def evaluate_tm(model, data_loader, metric):
    """Compute `metric` for `model` over every batch of `data_loader`.

    The model is switched to eval mode and the metric is reset before any
    batch is processed; no gradients are tracked. Batches are moved to the
    global `device` before the forward pass.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    model.eval()
    metric.reset()
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            metric.update(model(inputs), targets)
    return metric.compute()

def train(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs, patience=2,
         factor=0.5):
    """Train `model` for `n_epochs`, validating after every epoch.

    A ``ReduceLROnPlateau`` scheduler in ``"max"`` mode watches the validation
    metric, so `metric` is expected to be a higher-is-better score (such as
    accuracy).

    Args:
        model: the network to train.
        optimizer: optimizer already bound to the model's parameters.
        criterion: loss function called as ``criterion(y_pred, y_batch)``.
        metric: object with ``reset`` / ``update`` / ``compute`` methods; it is
            reused for both the training and the validation pass.
        train_loader: iterable of ``(X_batch, y_batch)`` training batches.
        valid_loader: iterable of validation batches (see ``evaluate_tm``).
        n_epochs: number of passes over `train_loader`.
        patience: scheduler patience, in epochs without improvement.
        factor: multiplicative learning-rate reduction applied by the scheduler.

    Returns:
        dict: ``{"train_losses": [...], "train_metrics": [...],
        "valid_metrics": [...]}`` with one float per completed epoch.
    """
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", patience=patience, factor=factor
    )
    history = {"train_losses":[],"train_metrics":[],"valid_metrics":[]}
    n_batches = len(train_loader)
    for epoch in range(n_epochs):
        total_loss = 0
        metric.reset()
        model.train()
        for idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            # "\r" rewinds to the start of the line so the running average
            # overwrites the previous progress message in place.
            print(f"\rBatch {idx+1}/{n_batches}, loss ={total_loss/(idx+1 ):.4f}", end="")
        # Terminate the in-place progress line; otherwise the epoch summary
        # below would be glued onto the end of it.
        print()
        mean_loss = total_loss / n_batches
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        # evaluate_tm resets the metric itself, so the training value had to
        # be read out before this call.
        val_metric = evaluate_tm(model, valid_loader, metric).item()
        history["valid_metrics"].append(val_metric)
        scheduler.step(val_metric)
        print(f"Epoch:{epoch+1}/{n_epochs}, "
             f"Train Loss: {history['train_losses'][-1]:.4f}, "
             f"Train Metric: {history['train_metrics'][-1]:.4f}, "
             f"Valid Metric: {history['valid_metrics'][-1]:.4f}")
    return history

In [18]:
class ShakespeareModel(nn.Module):
    """Next-character classifier: embedding -> stacked GRU -> linear head.

    Args:
        vocab_size: number of distinct character ids (also the output size).
        n_hidden: GRU hidden-state size.
        n_layers: number of stacked GRU layers.
        embed_size: dimensionality of the character embeddings.
        dropout: dropout passed to the GRU.
    """
    def __init__(self, vocab_size, n_hidden=128, n_layers=2, embed_size=10, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.gru = nn.GRU(
            input_size=embed_size,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.output = nn.Linear(in_features=n_hidden, out_features=vocab_size)

    def forward(self, X):
        """Return logits over the vocabulary for the character following each sequence in `X`."""
        hidden_seq, _ = self.gru(self.embed(X))
        # Only the GRU output at the final time step feeds the classifier.
        last_step = hidden_seq[:, -1, :]
        return self.output(last_step)
# Build the network on the selected device; when several CUDA devices are
# visible, wrap it in DataParallel.
model = ShakespeareModel(len(vocab)).to(device)
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Using", gpu_count, "GPU's")
    model = nn.DataParallel(model)
model.to(device)

Using 2 GPU's


DataParallel(
  (module): ShakespeareModel(
    (embed): Embedding(39, 10)
    (gru): GRU(10, 128, num_layers=2, batch_first=True, dropout=0.1)
    (output): Linear(in_features=128, out_features=39, bias=True)
  )
)

In [19]:
# Training configuration.
n_epochs = 20
# Loss: cross-entropy between the model's logits and the target character id.
xentropy = nn.CrossEntropyLoss()
# Metric: multiclass accuracy over the vocabulary, kept on the same device as the model.
accuracy = torchmetrics.Accuracy(task="multiclass",num_classes=len(vocab)).to(device)
# NAdam with its default hyperparameters.
optimizer = torch.optim.NAdam(model.parameters())

# Run training; `history` holds the per-epoch train loss and train/valid metrics.
history = train(model, optimizer, xentropy, accuracy, train_loader, valid_loader, n_epochs)

Batch 977/977, loss =1.9251Epoch:1/20, Train Loss: 1.9251, Train Metric: 0.4352, Valid Metric: 0.4791
Batch 977/977, loss =1.5565Epoch:2/20, Train Loss: 1.5565, Train Metric: 0.5280, Valid Metric: 0.5152
Batch 977/977, loss =1.4798Epoch:3/20, Train Loss: 1.4798, Train Metric: 0.5465, Valid Metric: 0.5294
Batch 977/977, loss =1.4423Epoch:4/20, Train Loss: 1.4423, Train Metric: 0.5564, Valid Metric: 0.5358
Batch 977/977, loss =1.4189Epoch:5/20, Train Loss: 1.4189, Train Metric: 0.5622, Valid Metric: 0.5447
Batch 977/977, loss =1.4019Epoch:6/20, Train Loss: 1.4019, Train Metric: 0.5658, Valid Metric: 0.5473
Batch 977/977, loss =1.3897Epoch:7/20, Train Loss: 1.3897, Train Metric: 0.5691, Valid Metric: 0.5481
Batch 977/977, loss =1.3807Epoch:8/20, Train Loss: 1.3807, Train Metric: 0.5711, Valid Metric: 0.5510
Batch 977/977, loss =1.3730Epoch:9/20, Train Loss: 1.3730, Train Metric: 0.5734, Valid Metric: 0.5536
Batch 977/977, loss =1.3665Epoch:10/20, Train Loss: 1.3665, Train Metric: 0.5748, 

In [None]:
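# Critical Shape Journey (batch shapes shown for a full batch of 1024)

# X (Input) Path:

# → windowing → [1115294, 56]    (one row per 56-character window)
# → batching → [1024, 56]
# → embed() → [1024, 56, 10]     (each character id becomes a 10-dim embedding)
# → GRU() → [1024, 56, 128]      (hidden output at every time step)
# → [:, -1, :] → [1024, 128]     (keep only the last time step)
# → Linear() → [1024, 39]        (one logit per vocabulary character)

# y (Target) Path:

# → windowing → [1115294, 1]
# → squeeze(-1) → [1115294]
# → batching → [1024]
# → CrossEntropyLoss compares these class ids with the [1024, 39] logits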


In [21]:
# Save the weights of the underlying network. When the model is wrapped in
# nn.DataParallel, its own state_dict keys are prefixed with "module.", which
# would not load into a plain ShakespeareModel, so unwrap it first.
model_to_save = model.module if isinstance(model, nn.DataParallel) else model
torch.save(model_to_save.state_dict(), "my_shakespeare_model.pt")

In [42]:
# Encode a prompt and add a leading batch dimension, giving shape (1, len(text)).
text = "To be or not to b"
encoded_text = encode_text(text).unsqueeze(dim=0).to(device)
encoded_text.shape

torch.Size([1, 17])

In [44]:
# Greedy prediction: take the most likely next character for the prompt.
model.eval()
with torch.no_grad():
    y_logits = model(encoded_text)
# Row 0 is the only sequence in the batch; argmax picks the top-scoring id.
predicted_char_id = y_logits[0].argmax().item()
predicted_char = id_to_char[predicted_char_id]
predicted_char

'e'

In [45]:
def next_char(model, text, temperature=0.7):
    """Sample the character that follows `text` according to `model`.

    Args:
        model: network returning one row of vocabulary logits per input sequence.
        text: the prompt; it is encoded with ``encode_text`` (which lower-cases it).
        temperature: divisor applied to the logits before the softmax. Lower
            values sharpen the distribution; ``0`` selects the most likely
            character deterministically.

    Returns:
        str: the chosen next character.
    """
    model.eval()
    encoded_text = encode_text(text).unsqueeze(0).to(device)
    with torch.no_grad():
        y_logits = model(encoded_text)
        if temperature == 0:
            # Dividing by zero would produce inf/nan probabilities; a zero
            # temperature is the greedy limit, so take the argmax directly.
            predicted_char_id = y_logits[0].argmax().item()
        else:
            y_probas = torch.softmax(y_logits/temperature, dim=-1)
            predicted_char_id = torch.multinomial(y_probas,num_samples=1).item()
        return id_to_char[predicted_char_id]



In [78]:
import time
def generate_text(model, text, n_chars=100,temperature=0.7):
    """Stream `text` followed by `n_chars` sampled characters to stdout.

    Every new character is drawn with ``next_char`` from the prompt plus all
    characters generated so far, and is printed as soon as it is produced.
    Nothing is returned; the text only goes to stdout.
    """
    context = text
    print(context, end='', flush=True)
    for _step in range(n_chars):
        new_char = next_char(model, context, temperature)
        print(new_char, end='', flush=True)
        context = context + new_char
        # Short pause so the output appears progressively.
        time.sleep(0.01)

In [79]:
# generate_text streams its output itself and returns None, so it must not be
# wrapped in print() -- that would append a stray "None" to the generated text.
generate_text(model, "To be or not to b", n_chars=500)

To be or not to be
makes this good to mercy of the head?

edward:
and shall not be rest that i may should not every fear.

somerset:
why is the jight thou speaks to his land, when faults at crown,
or i cannot said most less than here and honour'd to the will,
i do a patience which we at his capulet.

romeo:
why, that's grief to my conscience with her, and short me;
and in the people of far of his assus;
who shall i can see thy earth will i think, i come and but your world-drown
and destroyed as never say our cauNone
