In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tqdm

In [2]:
# Hyperparameters shared by the data loader, the model and the training loop.
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
# number of optimisation steps run by the training loop
max_iters = 5000
# the loss is re-estimated every `eval_interval` steps (and on the final step)
eval_interval = 500
# learning rate handed to the AdamW optimizer
learning_rate = 3e-4
# use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# number of random batches averaged per split when estimating the loss
eval_iters = 200
# width of the token / position embeddings
n_embd = 384
# attention heads per transformer block; each head has size n_embd // n_head
n_head = 6
# number of stacked transformer blocks
n_layer = 6
# dropout probability used by the attention and feed-forward layers
dropout = 0.2

In [3]:
# Seed PyTorch's default random number generator (batch sampling, weight init, dropout).
torch.manual_seed(1337)

<torch._C.Generator at 0x7de0327a9730>

In [4]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-03-22 00:21:54--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-03-22 00:21:54 (20.4 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
# Read the whole corpus into memory as one UTF-8 string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:
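# Unused alternative kept for reference: a word-level tokenizer (the notebook uses the
# character-level one defined in the next cell). NOTE: as written, `words` still contains
# repeated words, so `vocab_size` would count every occurrence rather than distinct words.
# words = text.split()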
# vocab_size = len(words)
# stoi = {word: i for i, word in enumerate(words)}
# itos = {i: word for i, word in enumerate(words)}

# def encode(s): return [stoi[w] for w in s.split()]

# def decode(ids): return ' '.join([itos[i] for i in ids])

In [7]:
# Character-level vocabulary: every distinct character of the corpus is one token.
chars = sorted(set(text))
vocab_size = len(chars)
# lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integer token ids.

    Raises KeyError if `s` contains a character that does not occur in the corpus.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids, output the string they spell.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)

# The train/validation split that used to be repeated here is done once, in the next cell.

In [8]:
# Encode the full corpus as one 1-D tensor of token ids, then cut it in order:
# the leading 90% is the training set, the trailing 10% the validation set.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [9]:
def get_batch(split):
    """Sample a random batch of (input, target) windows.

    `split` selects the source: 'train' uses `train_data`, anything else uses
    `val_data`. Returns `(x, y)`, two `(batch_size, block_size)` tensors on
    `device`, where `y` is `x` shifted one position to the right (the
    next-token targets).
    """
    source = train_data if split == 'train' else val_data
    # random window starts; the exclusive upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [10]:
@torch.no_grad()
def estimate_loss():
    """Estimate the current train and validation loss of the global `model`.

    For each of the 'train' and 'val' splits, `eval_iters` random batches are
    drawn with `get_batch` and their losses averaged. The model is switched to
    eval mode for the measurement and afterwards put back into whatever mode it
    was in before the call (also when an exception interrupts the evaluation).

    Returns a dict `{'train': mean_loss, 'val': mean_loss}` whose values are
    0-dim tensors.
    """
    out = {}
    # remember the caller's mode instead of unconditionally forcing train mode afterwards
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [11]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding space into this head's space
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix used as the causal mask; a buffer, not a parameter
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over the sequence.

        x:       (batch, time-step, channels)
        returns: (batch, time-step, head size)
        """
        _, seq_len, _ = x.shape
        queries = self.query(x)  # (B, T, hs)
        keys = self.key(x)       # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # scaled dot-product affinities: (B, T, hs) @ (B, hs, T) -> (B, T, T)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale
        # a position may not look at later positions
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)

        # weighted aggregation of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return weights @ values


class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then mixed by a linear projection."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # the heads' outputs are concatenated, so the projection sees all of them at once
        concat_width = head_size * num_heads
        self.proj = nn.Linear(concat_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x`, concatenate along the channel axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


class FeedFoward(nn.Module):
    """Per-position MLP: expand to 4x the width, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`."""
        out = self.net(x)
        return out


class Block(nn.Module):
    """Transformer block: self-attention (communication), then an MLP (computation).

    Both sub-layers are applied to a layer-normalised input and added back to
    the residual stream.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and position embeddings are summed, passed through `n_layer`
    transformer blocks and a final layer norm, and projected to one logit per
    vocabulary entry.
    """

    def __init__(self):
        super().__init__()
        # token id -> embedding vector, and position index -> embedding vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if `targets` is given, the cross-entropy loss.

        idx, targets: (B, T) tensors of integer token ids.
        Returns `(logits, loss)`:
          * without targets: logits is (B, T, vocab_size) and loss is None;
          * with targets:    logits is flattened to (B*T, vocab_size) and loss is a scalar.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        # create the positions on the input's own device rather than on the notebook-global
        # `device`, so the model keeps working wherever it and its inputs are moved
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        idx is a (B, T) tensor of token ids; the result is (B, T + max_new_tokens)
        and starts with `idx`. Sampling runs without gradient tracking and in eval
        mode (dropout disabled); the previous train/eval mode is restored afterwards.
        """
        # the position table defines the longest context the model can embed
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last `context_len` tokens
                idx_cond = idx[:, -context_len:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :]  # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [12]:
# Build the model and move its parameters to the selected device.
model = GPTLanguageModel()
# nn.Module.to() returns the module itself, so `m` and `model` name the same object
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.788929 M parameters


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step`, not `iter`, so the builtin iter() is not shadowed for the
# rest of the kernel session.
for step in tqdm.tqdm(range(max_iters)):
    # every `eval_interval` steps, and on the final step, report the averaged losses
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        # tqdm.write prints the message without breaking up the progress bar
        tqdm.tqdm.write(
            f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # one optimisation step on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/5000 [00:00<?, ?it/s]

step 0: train loss 4.2213, val loss 4.2304


 10%|█         | 502/5000 [01:37<4:35:40,  3.68s/it]

step 500: train loss 2.2313, val loss 2.2943


 20%|██        | 1002/5000 [02:56<3:58:21,  3.58s/it]

step 1000: train loss 1.6945, val loss 1.8685


 30%|███       | 1502/5000 [04:15<3:26:34,  3.54s/it]

step 1500: train loss 1.5172, val loss 1.7215


 40%|████      | 2002/5000 [05:34<2:56:20,  3.53s/it]

step 2000: train loss 1.4224, val loss 1.6463


 50%|█████     | 2502/5000 [06:53<2:27:11,  3.54s/it]

step 2500: train loss 1.3587, val loss 1.6006


 60%|██████    | 3002/5000 [08:11<1:57:43,  3.54s/it]

step 3000: train loss 1.3165, val loss 1.5581


 70%|███████   | 3502/5000 [09:30<1:27:02,  3.49s/it]

step 3500: train loss 1.2656, val loss 1.5450


 80%|████████  | 4002/5000 [10:49<58:44,  3.53s/it]  

step 4000: train loss 1.2393, val loss 1.5144


 90%|█████████ | 4502/5000 [12:08<29:15,  3.52s/it]

step 4500: train loss 1.2166, val loss 1.5173


100%|██████████| 5000/5000 [13:26<00:00,  6.20it/s]

step 4999: train loss 1.1876, val loss 1.4886





In [17]:
# Seed the generation with a single token (id 0) and sample 1000 further tokens.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generate() returns a (1, T) batch; take its only row and turn the ids back into text
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))


Since hath made your tongue daughters' slaudiff he is the mark of?
Which is the noble quies, I away with
Her grief capable-bod, getting but thy
subzstarts the delartereign.

DUKE VINCENTIO:
Sir me that it do wno herain: you are your son'd youths; now
Pleads well. You know'd to-morrow,
Call you my valued last and rob a bale,
Purchase, could not prithe like of you,
Why, as it were now lest you ignorate,
And so to trouble to give you to good my holy baunt,
Unvil stands, were your serviers' son to every of alnes ill.
You-dark, you must to kiss your again.

SLY:
I'll for no other. Come of: I delight rather: yet I'll night
in's so daggest thou: Hollelf as he emptifiesly were
the shall we made you appearch; and to 'twill have die.

Cuttermier:
Meat you shall not.

MARIANA:
Remember this o'er daughter:
Tutor well't o'er helds.

GLOUCESTER:
Then I wed, my off certain, my thought
speedivers. I do recetraigrowt you; both I know at my duke;
But chast to between afterly, promise.

LUCIO:
This way 

In [21]:
prompt = """Write a scene about Romeo arguing with Juliet.
ROMEO:"""
# Encode the prompt as a batch of one sequence. Named `prompt_ids` rather than `input`
# so the builtin input() is not shadowed for the rest of the kernel session.
prompt_ids = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
print(decode(m.generate(prompt_ids, max_new_tokens=1000)[0].tolist()))

Write a scene about Romeo arguing with Juliet.
ROMEO:
Nay, look! night, no sorrow, theavers!

WAll:
I can never troy That never need speeal.
Come, Lady, ven before, sir; let come it leg
Ten Francis and Francis tabour; Vero's your famil's service
Frin, and Romeo have a made rosign.
We are, as doth supply in love, because us with me,
That will I am supperied with wail:
Spoken, a succome; that I have it comes clertain
A but packledom as I was to say
As Saily you and plean
As hearing: his tongue that your that octmane,
Labouring eves old me willing to some fieldy fled.

Clown:
I told my reture towards.

KING HENRY VI:

HASTINGS:
Do youd and I will leat the hotes as such claiment as empt,
The exiece and us she recomplement further;
But heavy with comfort that he,
Whose are preserved try slow. We pretty deeds
Whom our house?

Sear:
Call'd wife, what a gooder grossaling? are you come?
You so? Belikeleven, guess! ah, go, speediers!

CAPULET:
A bark.

CAPULET:
Yea, let's not by yeour than sin
H

It doesn't understand the prompt, lol — the model just continues the text instead of following the instruction.

Anyway, a good start!

In [15]:
# Save only the model's state dict (parameters and buffers), not the module object itself.
torch.save(m.state_dict(), 'GPT_model_char.pt')