## Building a GPT

Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT.

In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 500
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('/content/drive/MyDrive/hafez gpt/HafezFull.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
# stoi = { ch:i for i,ch in enumerate(chars) }
# itos = { i:ch for i,ch in enumerate(chars) }
# encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

from transformers import (
    AutoTokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AutoModelWithLMHead,
)

# def load_dataset(path, tokenizer):

#     dataset = TextDataset(
#         tokenizer=tokenizer, file_path=path, block_size=256
#     )
#     n = int(0.9*len(dataset)) # first 90% will be train, rest val
#     train_dataset = dataset[:n]
#     test_dataset = dataset[n:]

#     return train_dataset, test_dataset

tokenizer = AutoTokenizer.from_pretrained("bolbolzaban/gpt2-persian")
# train_dataset, test_dataset = load_dataset(
#     "/content/drive/MyDrive/hafez gpt/HafezFull.txt", tokenizer
# )
encode = tokenizer.encode
decode = tokenizer.decode

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
vocab_size = tokenizer.vocab_size
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


3.426344 M parameters
step 0: train loss 10.2706, val loss 10.2498
step 100: train loss 6.1396, val loss 6.3704
step 200: train loss 5.8092, val loss 6.1226
step 300: train loss 5.6405, val loss 5.9676
step 400: train loss 5.5449, val loss 5.8904
step 499: train loss 5.4541, val loss 5.8585
<unk>م مرا حسن خاک راه<unk>ر<unk>م تو و پ<unk>ب می وی من رهگذر ز وصل از حمله دهم مد ار در عمرم غم در عجوز قدح به جان وفا برفت<unk>ب سراپرده مرا در شمش از شب خسته و آغوشم که <unk>ن شعر به طوفان آرای  بی در در <unk>ن<unk>ن زهی می‌ درون برو عقل دعای صلاح<unk>نت م سلطان بب<unk> برود ز چشمم که ب<unk>اد جوان شد که خاص ری سه آشکار بر بد پ<unk>غ ب<unk>ار کوکب<unk>نه نس ب<unk>ر حافظ را چنانم خاک ا<unk>ن برگرفت چ<unk>بت که بو و ساغر ص<unk>ش دور طوطی و حجاز می من ماست موم<unk>لی و ره ب<unk>نه حضرت ما راان جم<unk>ن مدد معما گر قر جلال می‌چه خواب نمی‌کرد خبر وبند و <unk>رم داد خدمت تو ابروی برود دوست ش<unk>م جامه ز ف<unk>ر مرا مکن کس کرده شاهد حافظ چه آباد و نی پروانه چاک کرد که رهگذر خبر از از بگذر بر شمع حافظ 

In [14]:
# read it in to inspect it
with open('/content/drive/MyDrive/hafez gpt/HafezFull.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [15]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  272131


In [16]:
# let's look at the first 1000 characters
print(text[:1000])

الا يا ايها الساقی ادر کاسا و ناولها
که عشق آسان نمود اول ولی افتاد مشکل‌ها
به بوی نافه‌ای کاخر صبا زان طره بگشايد
ز تاب جعد مشکينش چه خون افتاد در دل‌ها
مرا در منزل جانان چه امن عيش چون هر دم
جرس فرياد می‌دارد که بربنديد محمل‌ها
به می سجاده رنگين کن گرت پير مغان گويد
که سالک بی‌خبر نبود ز راه و رسم منزل‌ها
شب تاريک و بيم موج و گردابی چنين هايل
کجا دانند حال ما سبکباران ساحل‌ها
همه کارم ز خود کامی به بدنامی کشيد آخر
نهان کی ماند آن رازی کز او سازند محفل‌ها
حضوری گر همی‌خواهی از او غايب مشو حافظ
متی ما تلق من تهوی دع الدنيا و اهملها
صلاح کار کجا و من خراب کجا
ببين تفاوت ره کز کجاست تا به کجا
دلم ز صومعه بگرفت و خرقه سالوس
کجاست دير مغان و شراب ناب کجا
چه نسبت است به رندی صلاح و تقوا را
سماع وعظ کجا نغمه رباب کجا
ز روی دوست دل دشمنان چه دريابد
چراغ مرده کجا شمع آفتاب کجا
چو کحل بينش ما خاک آستان شماست
کجا رويم بفرما از اين جناب کجا
مبين به سيب زنخدان که چاه در راه است
کجا همی‌روی ای دل بدين شتاب کجا
بشد که ياد خوشش باد روزگار وصال
خود آن کرشمه کجا رفت و آن عتاب کجا
قرار و خواب ز حافظ طمع

In [17]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)-1
print(''.join(chars))
print(vocab_size)


 آابتثجحخدذرزسشصضطظعغفقلمنهويپچژکگی‌
36


In [None]:
from transformers import pipeline, AutoTokenizer, GPT2LMHeadModel
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')
model = GPT2LMHeadModel.from_pretrained('bolbolzaban/gpt2-persian')
generator = pipeline('text-generation', model, tokenizer=tokenizer, config={'max_length':256})
sample = generator('در یک اتفاق شگفت انگیز، پژوهشگران')
print(sample)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


[{'generated_text': 'در یک اتفاق شگفت انگیز، پژوهشگران دانشگاه « استینبرگ » سوئیس موفق به کشف دارویی شدند که می\u200cتواند به بیماران مبتلا به « هپاتیت ب » در مقابل حمله\u200cهای باکتریایی و ویروسی ، به ویژه در مناطقی که احتمال خطر وجود ویروس\u200c'}]


In [18]:
from transformers import (
    AutoTokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AutoModelWithLMHead,
)

def load_dataset(path, tokenizer):

    dataset = TextDataset(
        tokenizer=tokenizer, file_path=path, block_size=256
    )
    n = int(0.9*len(dataset)) # first 90% will be train, rest val
    train_dataset = dataset[:n]
    test_dataset = dataset[n:]

    return train_dataset, test_dataset

tokenizer = AutoTokenizer.from_pretrained("bolbolzaban/gpt2-persian")
train_dataset, test_dataset = load_dataset(
    "/content/drive/MyDrive/hafez gpt/HafezFull.txt", tokenizer
)

In [19]:
print(train_dataset.shape)
print(train_dataset[:1000])
train_dataset = train_dataset.flatten()
print(train_dataset.shape)
print(train_dataset[:1000])

torch.Size([319, 256])
tensor([[   5, 1582,   43,  ..., 1675, 1162,    3],
        [   5,  905,  204,  ...,   53,  781,    3],
        [   5,   16, 1809,  ..., 2340,   82,    3],
        ...,
        [   5,   16,  157,  ...,   45, 2555,    3],
        [   5, 8890,   51,  ...,   16,   81,    3],
        [   5,   46, 2404,  ...,   51, 5106,    3]])
torch.Size([81664])
tensor([    5,  1582,    43,    16,   132,   522,    16,    60,   278, 22412,
           43,  7296,  7234,   132,    45,  5892,   173,    60,    51,   238,
         2310,   777,   241,   236,  1287,   556,    44,    60,    48,  1144,
        10854,    44,    65,  1761,   170,  3546,  1524,  7218, 10427,    16,
          208,   105,  2190, 14671,  3305,    16,  8940,   116,   372,  1287,
           46,   128,    44,    60,   285,    46,  1516,  3492,   116,  3886,
          355,    16,    91,   129,    90,   758, 11782,  2146,    16,  1937,
           52,    44,  2355,    51,    64,  4900,    16,   208,  9681,    44,
       

In [22]:
block_size = 8
train_data = train_dataset
train_data[:block_size+1]

tensor([   5, 1582,   43,   16,  132,  522,   16,   60,  278])

In [23]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([5]) the target: 1582
when input is tensor([   5, 1582]) the target: 43
when input is tensor([   5, 1582,   43]) the target: 16
when input is tensor([   5, 1582,   43,   16]) the target: 132
when input is tensor([   5, 1582,   43,   16,  132]) the target: 522
when input is tensor([   5, 1582,   43,   16,  132,  522]) the target: 16
when input is tensor([   5, 1582,   43,   16,  132,  522,   16]) the target: 60
when input is tensor([   5, 1582,   43,   16,  132,  522,   16,   60]) the target: 278


In [24]:
import torch
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")


inputs:
torch.Size([4, 8])
tensor([[   50,   522,    16,    81,  3348, 16232,  2810,    50],
        [   16,  1549,    45,   128,  1073,    16,    82,   877],
        [   44,   351,   244,    16,  6600,    16,   208,  2428],
        [   91,    16,    81,    72,    48,  1731,    16,    92]])
targets:
torch.Size([4, 8])
tensor([[  522,    16,    81,  3348, 16232,  2810,    50,   140],
        [ 1549,    45,   128,  1073,    16,    82,   877,    80],
        [  351,   244,    16,  6600,    16,   208,  2428,   559],
        [   16,    81,    72,    48,  1731,    16,    92,   105]])
----
when input is [50] the target: 522
when input is [50, 522] the target: 16
when input is [50, 522, 16] the target: 81
when input is [50, 522, 16, 81] the target: 3348
when input is [50, 522, 16, 81, 3348] the target: 16232
when input is [50, 522, 16, 81, 3348, 16232] the target: 2810
when input is [50, 522, 16, 81, 3348, 16232, 2810] the target: 50
when input is [50, 522, 16, 81, 3348, 16232, 2810, 50] the t

In [None]:
print(yb) # our input to the transformer

tensor([[  522,    16,    81,  3348, 16232,  2810,    50,   140],
        [ 1549,    45,   128,  1073,    16,    82,   877,    80],
        [  351,   244,    16,  6600,    16,   208,  2428,   559],
        [   16,    81,    72,    48,  1731,    16,    92,   105]])


In [25]:
print(vocab_size)

36


In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        print(idx.shape)
        # idx = torch.clamp(idx, 0, vocab_size - 1)
        print(torch.max(idx))
        logits = self.token_embedding_table(idx) # (B,T,C)
        print(logits.shape)
        # corresponding embedding vectors for each token in the input sequences.
# The resulting output of token_embedding_table is a tensor where each element is an
# embedding vector representing the corresponding token in the input sequence

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            print(logits.shape)
        return logits, loss
# generate new tokens, assuming you have a decode function to convert token indices back to human-readable text.

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens+1):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
vocab_size = tokenizer.vocab_size
m = BigramLanguageModel(vocab_size)
print(xb.shape, yb.shape)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([4, 8]) torch.Size([4, 8])
torch.Size([4, 8])
tensor(16232)
torch.Size([4, 8, 25000])
torch.Size([32, 25000])
torch.Size([32, 25000])
tensor(10.1713, grad_fn=<NllLossBackward0>)


In [None]:
tokenizer.decode

In [27]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
batch_size = 32
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


torch.Size([32, 8])
tensor(22119)
torch.Size([32, 8, 25000])
torch.Size([256, 25000])


In [None]:
print(tokenizer.decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


غصجذvساآژثخشوزطژ خخخل زحيذبنذجYاذظعژمضشزهحقژگظعيپکريضهوپغپکوعیطیحژگ‌ی

سخث‌هی
ظزشYYيپیبچمطشYآچظذقغگYگفغعصجحيجپن
گذ‌وتببحماتقييخثمطضذقآدپاگ
يخخثثختیغیخحق‌ردعلژاتپضتیرنچقعنفلوزربقچآچلذجذببقآتچآچضذذهخثيع‌آچنفدعبظعبقطخزوکغنچطشYYيررز‌عنYگرقتذجلژساطظچخثلف زعظتکراظvمنکرYوثثثثثثخپزيراآذکصچلکبدسحکانردضvمجثعآخآچزگصvمثثثذچYثظعجسزي
جبق زغطيضآ‌خثخظزق نگYYد‌یدزتغیدگYثث خثخخ ‌ظطجکژگصگحيصجيررفیزثتمچلاآ‌یپطیدمعطزرظطشYيخثثیددحقزحيپق عجYگثثخشضنخظگسکادوشYطشجبلطمثخثتYظطبعبلوصحبچلدvسصدخخ رفصقرظعحيپصيخکريvم‌قيحيخژظفکه


## The mathematical trick in self-attention

In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)


In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

False

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)


False

In [None]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [None]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [None]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [None]:
k.var()

tensor(1.0449)

In [None]:
q.var()

tensor(1.0700)

In [None]:
wei.var()

tensor(1.0918)

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [None]:
class LayerNorm1d: # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [None]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [None]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))

In [None]:
# French to English translation example:

# <--------- ENCODE ------------------><--------------- DECODE ----------------->
# les réseaux de neurones sont géniaux! <START> neural networks are awesome!<END>



### Full finished code, for reference

You may want to refer directly to the git repo instead though.

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('D:/jupyter/my gpt/hafez.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string



# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.206246 M parameters


IndexError: index out of range in self

In [None]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


از درآی چه اعتر اعدازم باده گشايد بازآيد
همچو حکمت عزيز او می‌برد و و سر
تا مختجرت جوابش دربانه حافظ ره گردد دارد
که کارکاری وجز ز توت حرا کند
خطا بردم و دلده جهان دوستی
رفتم که فرخش سخنده حاشت خط جمشد کرم
که از اين دست هفته‌هام فتی دوش
به باد آورد به عمله دستی وزيز
ای و نبه چه نگل مرتا انديد بلا ديدم
نکته صحبت و دمی روان فوی و بدوی و دارم
عيشتم دور لعل فراش
کشد که غيبارت سخند خرامد دلبرو
باد ولکون طعنی طلامت و برافيان
کار شدپرور بهر ديير و خدمت
ز زجمه‌هاد چه نقش به دارد غيری
شبانباب چو بگفت آن
خجسته می‌گذارد حرف از سنبلم ارآد بلا
چشم اشک را ز دشتر بر حافظ يکی باز
که در ست خبرد ز کرد فروشم عشق
گوشيد بی مشو مويان زيبده بوی چيست
ما پيشتر تمن که که حافظ تشبچارلاک
خودش به گرد از روی حاطلان نبهد و شيخواپردم که دراز
بازپنظری زگو که که جام زخورد افکنم
مهلام خدا مده‌شد ابروی گل دادند
وز اگر نذرينی و نرگاهايت که نتوانيم بود
بگفتر که پرشده تو و دل نوشش
آن نيود که بندگان که در عيشی ساقی
که جهيد نبود بر سر دارد
حافظ زاهد جود چه خاک قباحی او صد را
کنان زلف شاهد که نظر خورقه می‌خفتم
حاجت آن افسان گ