In [23]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tqdm
import json

In [2]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [3]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7ccd8fb4bfb0>

In [4]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-02-01 06:18:02--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2025-02-01 06:18:03 (31.1 MB/s) - 'input.txt' saved [1115394/1115394]



In [5]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:
chars = sorted(list(set(text)))
vocab_size = len(set(chars))

# create a mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(set(chars))}
itos = {i: ch for i, ch in enumerate(set(chars))}

# encoder: take a string, output a list of integers
def encode(s): return [stoi[c] for c in s]

# decoder: take a list of integers, output a string
def decode(l): return ''.join([itos[i] for i in l])

In [7]:
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [8]:
vocab_size

65

In [9]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [10]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in tqdm.tqdm_notebook(range(eval_iters), desc=f"Evaluating {split}"):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [11]:
class Head(nn.Module):
    """ one head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(
            torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(
            self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [12]:
model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

10.788929 M parameters


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in tqdm.tqdm_notebook(range(max_iters)):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(
            f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for iter in tqdm.tqdm_notebook(range(max_iters)):


  0%|          | 0/5000 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for k in tqdm.tqdm_notebook(range(eval_iters), desc=f"Evaluating {split}"):


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 0: train loss 4.2216, val loss 4.2267


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 500: train loss 2.2170, val loss 2.2783


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 1000: train loss 1.6919, val loss 1.8618


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 1500: train loss 1.5188, val loss 1.7200


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 2000: train loss 1.4268, val loss 1.6463


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 2500: train loss 1.3638, val loss 1.5929


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 3000: train loss 1.3143, val loss 1.5482


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 3500: train loss 1.2701, val loss 1.5373


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 4000: train loss 1.2435, val loss 1.5126


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 4500: train loss 1.2206, val loss 1.5130


Evaluating train:   0%|          | 0/200 [00:00<?, ?it/s]

Evaluating val:   0%|          | 0/200 [00:00<?, ?it/s]

step 4999: train loss 1.1921, val loss 1.4843


In [26]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))

For Johhn, and the perceament.
They ragent purrue the steeds and themselves,
And put under blood us; bethought your wonders redur'd
Made my heart's honour at the shaert
And succession the causels
With a her centuring fair mind. What you please, my lies
Most in that of yourselves with men-majourng
Merely hath one you with thunders' forward,
Throught's coats, of the knessly death.
Uf you shall I life absetter, for your news
Shall detire you be, your countrymen
Of me hare prayershed as on the enmy earr eyes.
Edismen and kisses and his wimorious, if so ready.

First Salkningman:
So dire, good man, rume his take; one word
Shall were by her desperate in our goodly.

STANLEY:
Go, my lord, beseech your quisiness: for we,
Thank you without an the old in fue
For far the child, yet I be displited;
I clay you now: I'll keep at.

SICINIUS:
See, she is he; that it loves men.

First Senator:
My master to harm gracious Lord Caius;--

Lehind of Solenho, let me to be pay'd.
Here command, Beamingbroke: d

In [15]:
prompt = """Write a scene about Romeo arguing with Juliet.
ROMEO:"""

input = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
print(decode(m.generate(input, max_new_tokens=1000)[0].tolist()))

Write a scene about Romeo arguing with Juliet.
ROMEO:
'Beseech you, toy! here is the argins of your old bosom,
Not his lexire yung acking; then, if I had win
Yet rather obedience and grimas' blood
A suitor word, indeed, my lord,
Grose and she has liately sped's death.

ESCALUS:
Bear for a  tworld; here's our tongue.

ESCALUS:
Now, sir, if, we shall bleed in holy true.
3 KING HENRY VI

DUCHESS OF YORK:
Basta; what, of YORK:
Will you revien your put mot,
That do do men as, say you were tutern men,
When it she that seems suffers against us tears for grail,
Were brew the husburns for that they even could be theirs.
The tribb of have they markled to the compie,
Of blood? well sayst never grating piece with slander?

Second Murderer:
He cames joy stand and hath a Capuret reserve.

Third Servingman:
Happite them from Aufit, or I marry:
Then, whom what noUselves thee?

NORTYRUCHBY:
Ah, nor a liver belihve no deed,
When they soulst was upon our dain swering.

ROMEO:
O this armor!

Nurse:
Why, w

In [16]:
torch.save(m.state_dict(), 'GPT_model_char.pt')

In [None]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [21]:
config = {
    "vocab_size": vocab_size,
    "batch_size": batch_size,
    "block_size": block_size,
    "max_iters": 5000,
    "eval_interval": 500,
    "learning_rate": 3e-4,
    "device": device,
    "eval_iters": 200,
    "n_embd": 384,
    "n_head": 6,
    "n_layer": 6,
    "dropout": 0.2,
    "encode": stoi,
    "decode": itos
}

In [25]:
with open('config.json', 'w') as f:
    json.dump(config, f, indent=4)