# ShakespeareGPT

This notebook is a very basic model to generate Shakespearean text.

In [28]:
import torch
import torch.nn as nn
import urllib

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
# Get data
text_data = urllib.request.urlopen('https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt').read().decode('utf-8')
print(len(text_data))
print(text_data[:1000])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for re

In [31]:
chars = sorted(list(set(text_data)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


We will be building a character level model to generate text. So, here for each character we create encoder and decoder.

In [32]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda x: [stoi[ch] for ch in x]
decode = lambda x: ''.join([itos[i] for i in x])

print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [33]:
data = torch.tensor(data = encode(text_data), dtype = torch.long)
print(data.shape)
print(data[:1000])

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57

In [34]:
n = int(len(data) * 0.9)
train_data = data[:n]
val_data = data[n:]

In [35]:
BLOCK_SIZE = 8
BATCH_SIZE = 4 

In [36]:
print(train_data[:BLOCK_SIZE + 1])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [37]:
# We want the transformer to predict the next character given the previous characters from the range of 1 to BLOCK_SIZE
n_embed = 32
for i in range(BLOCK_SIZE):
    context = train_data[:i + 1]
    target = train_data[i + 1]
    print(f"Context: {context} Target: {target}")

Context: tensor([18]) Target: 47
Context: tensor([18, 47]) Target: 56
Context: tensor([18, 47, 56]) Target: 57
Context: tensor([18, 47, 56, 57]) Target: 58
Context: tensor([18, 47, 56, 57, 58]) Target: 1
Context: tensor([18, 47, 56, 57, 58,  1]) Target: 15
Context: tensor([18, 47, 56, 57, 58,  1, 15]) Target: 47
Context: tensor([18, 47, 56, 57, 58,  1, 15, 47]) Target: 58


In [38]:
def get_batch(data: torch.Tensor, 
              batch_size: int, 
              block_size: int):
    index = torch.randint(len(data) - block_size, (batch_size,)) # Randomly selects an index for each batch
    x = torch.stack([data[i:i + block_size] for i in index])
    y = torch.stack([data[i + 1:i + 1 + block_size] for i in index])
    return x, y

In [39]:
x, y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
print(x.shape)
print(x)
print(y.shape)
print(y)

torch.Size([4, 8])
tensor([[46, 47, 57,  1, 56, 43, 39, 57],
        [53, 52, 43,  6,  1, 57, 53,  1],
        [41, 53, 51, 43,  1, 61, 46, 43],
        [50, 39, 57, 54,  5, 42,  1, 51]])
torch.Size([4, 8])
tensor([[47, 57,  1, 56, 43, 39, 57, 53],
        [52, 43,  6,  1, 57, 53,  1, 51],
        [53, 51, 43,  1, 61, 46, 43, 52],
        [39, 57, 54,  5, 42,  1, 51, 63]])


### Bigram Language Model

We start simple with a bigram language model

In [40]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = n_embed)
        self.positional_embedding = nn.Embedding(num_embeddings = BLOCK_SIZE, embedding_dim = n_embed)
        self.linear = nn.Linear(in_features = n_embed, out_features = vocab_size)
        # Each row corresponds to a token in the vocabulary. The dimensionality of the embedding is vocab_size here
    def forward(self, x, target):
        B,T = x.shape
        token_embedding = self.token_embedding(x)
        pos_embedding = self.positional_embedding(torch.arange(T, device = device))
        token_embedding += pos_embedding
        logits = self.linear(token_embedding)
        if target is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            target = target.view(B*T)
            # If the model is trained well, logits for a given input will be high for the correct target
            loss = nn.functional.cross_entropy(logits,target)
        return logits, loss
    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            logits,_ = self.forward(x,None) # Select for all the batches, the last token
            logits = logits[:, -1, :] # In all batches, pick the last token
            probs = nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            x = torch.cat([x,next_token],dim=1)
        return x
    

In [41]:
model = BigramLanguageModel().to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
x = torch.zeros((1,1), dtype = torch.long).to(device)
print(decode(model.generate(x, max_new_tokens = 100)[0].tolist()))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)

In [None]:
BATCH_SIZE = 32
model.train()
average_loss = 0
for epoch in range(10000):
    x,y = get_batch(train_data, BATCH_SIZE, BLOCK_SIZE)
    x,y = x.to(device), y.to(device)
    logits, loss = model(x,y)
    optimizer.zero_grad()
    loss.backward()
    average_loss += loss.item()
    if (epoch + 1)% 1000 == 0:
        average_loss /= 1000
        print(f"Epoch {epoch + 1} Loss {average_loss}")
    optimizer.step()
    # print(f"Epoch {epoch} Loss {loss.item()}")

Epoch 1000 Loss 2.8726345167160034
Epoch 2000 Loss 2.5273202818498612
Epoch 3000 Loss 2.4952911411684617
Epoch 4000 Loss 2.4791210903202456
Epoch 5000 Loss 2.480335673691181
Epoch 6000 Loss 2.4748337511628877
Epoch 7000 Loss 2.4704458322767824
Epoch 8000 Loss 2.4693622655477765
Epoch 9000 Loss 2.4754364204473105
Epoch 10000 Loss 2.4661280247879094


In [None]:
print(decode(model.generate(x, max_new_tokens = 400)[0].tolist()))

but I wou, os an werd's Go,
Marnutowour fres fit ms alllllin:
carefftharar mate INCHAisolendy'seachonice hedegon m; r hy l tr:
WICHAMy s, uthoveg:
tceryour tho mofene bangertoyTo the is, CLI Tu ck nd, hthevown:
He,
Toronun me Jur?
Wher youn cee e, IIVIS:
Thadre p os thacorofu bupr,


Fore arpo-f GAs me,

Weteeairamavangout d aigm ICotwed Mulend d nd bu th'ss simy thiskederond

Tandl hisonothomeliny ad, s,


### Including Self Attention

A given token would have to communicate with previous tokens. A token cannot see the future tokens. So, we will be using self attention to include the context of previous tokens. We can implement this using a weighted sum for now.

In [None]:
B,T,C = 4,8,32 # Batch size, sequence length, channels
x = torch.randn(B,T,C)
print(x.shape)

torch.Size([4, 8, 32])


In [None]:
xbow = torch.zeros((B,T,C)) #bow = bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b,: t+1] # (t,c)
        xbow[b,t] = torch.mean(xprev,dim=0)

print(xbow)
print(x)

tensor([[[ 5.3561e-01,  2.9674e+00, -8.7036e-01,  ...,  3.1456e-01,
           8.7163e-02,  1.7175e-01],
         [ 9.4755e-01,  1.1643e+00,  1.0084e-01,  ..., -1.2896e+00,
          -5.6144e-01, -3.4468e-02],
         [ 2.6413e-01,  5.3581e-01, -3.3713e-01,  ..., -9.9964e-01,
          -2.2322e-01, -5.6221e-01],
         ...,
         [ 1.1283e-01,  5.2472e-02,  1.3156e-01,  ..., -1.7826e-01,
           2.8710e-01, -8.2842e-01],
         [ 2.5710e-02, -1.1048e-01,  1.7121e-01,  ..., -1.5548e-01,
           2.8859e-01, -7.2835e-01],
         [ 1.1724e-01, -2.3913e-01,  3.0612e-01,  ..., -3.0154e-01,
           4.7531e-01, -7.1948e-01]],

        [[-3.6567e-01,  1.0631e+00, -1.2801e+00,  ...,  1.3109e-01,
           1.2726e+00, -1.1563e+00],
         [-7.8052e-01,  5.7265e-01, -2.0448e-01,  ..., -3.8020e-01,
           4.8519e-01, -1.6571e-01],
         [-3.9725e-01,  4.9517e-01, -4.3468e-01,  ..., -2.1668e-03,
           1.2573e-01, -2.1303e-02],
         ...,
         [-5.3013e-02,  7

### Making the above more efficient

In [None]:
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, dim = 1, keepdim=True)
print(a)
b = torch.randint(0, 10, (3, 3)).float()
print(b)
print(torch.matmul(a,b))

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[4., 4., 1.],
        [7., 6., 8.],
        [7., 1., 1.]])
tensor([[4.0000, 4.0000, 1.0000],
        [5.5000, 5.0000, 4.5000],
        [6.0000, 3.6667, 3.3333]])


In [None]:
averaging_matrix = torch.tril(torch.ones(T, T))
averaging_matrix /= torch.sum(averaging_matrix, dim = 1, keepdim = True)
print(averaging_matrix)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [None]:
xbow2 = torch.matmul(averaging_matrix, x) # Since x has a batch dimension, pytorch will broadcast the averaging matrix to the batch dimension
print(xbow2)

tensor([[[ 5.3561e-01,  2.9674e+00, -8.7036e-01,  ...,  3.1456e-01,
           8.7163e-02,  1.7175e-01],
         [ 9.4755e-01,  1.1643e+00,  1.0084e-01,  ..., -1.2896e+00,
          -5.6144e-01, -3.4468e-02],
         [ 2.6413e-01,  5.3581e-01, -3.3713e-01,  ..., -9.9964e-01,
          -2.2322e-01, -5.6221e-01],
         ...,
         [ 1.1283e-01,  5.2472e-02,  1.3156e-01,  ..., -1.7826e-01,
           2.8710e-01, -8.2842e-01],
         [ 2.5710e-02, -1.1048e-01,  1.7121e-01,  ..., -1.5548e-01,
           2.8859e-01, -7.2835e-01],
         [ 1.1724e-01, -2.3913e-01,  3.0612e-01,  ..., -3.0154e-01,
           4.7531e-01, -7.1948e-01]],

        [[-3.6567e-01,  1.0631e+00, -1.2801e+00,  ...,  1.3109e-01,
           1.2726e+00, -1.1563e+00],
         [-7.8052e-01,  5.7265e-01, -2.0448e-01,  ..., -3.8020e-01,
           4.8519e-01, -1.6571e-01],
         [-3.9725e-01,  4.9517e-01, -4.3468e-01,  ..., -2.1668e-03,
           1.2573e-01, -2.1303e-02],
         ...,
         [-5.3013e-02,  7

In [None]:
# Another alternative method
tril = torch.tril(torch.ones(T,T))
weights = torch.zeros(T,T)
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = torch.nn.functional.softmax(weights, dim = 1)
print(weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [None]:
B,T,C = 4,8,32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(in_features = C, out_features = head_size, bias = False)
query = nn.Linear(in_features = C, out_features = head_size, bias = False)
value = nn.Linear(in_features = C, out_features = head_size, bias = False)
k = key(x)
q = query(x)

print(k.shape)
print(q.shape)

wei = torch.matmul(q, k.transpose(-2,-1)) # B,T,16 * B,16,T = B,T,T

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = torch.nn.functional.softmax(wei, dim = -1)
v = value(x)

out = torch.matmul(wei, v)
print(out.shape)

torch.Size([4, 8, 16])
torch.Size([4, 8, 16])
torch.Size([4, 8, 16])
