In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from einops import rearrange

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### We will train CharGPT on charles Dickens - Great Expectations

In [3]:
with open('../Datasets/GreatExpectations-CharlesDickens.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [70]:
len(text)

993652

In [4]:
total_size = len(set(text))
print(f'total vocab size: {total_size}')

total vocab size: 78


In [5]:
# first 1000 characters
text[:1000]

'Chapter I.\n\n\nMy father’s family name being Pirrip, and my Christian name Philip, my\ninfant tongue could make of both names nothing longer or more explicit\nthan Pip. So, I called myself Pip, and came to be called Pip.\n\nI give Pirrip as my father’s family name, on the authority of his\ntombstone and my sister,—Mrs. Joe Gargery, who married the blacksmith.\nAs I never saw my father or my mother, and never saw any likeness of\neither of them (for their days were long before the days of\nphotographs), my first fancies regarding what they were like were\nunreasonably derived from their tombstones. The shape of the letters on\nmy father’s, gave me an odd idea that he was a square, stout, dark man,\nwith curly black hair. From the character and turn of the inscription,\n“_Also Georgiana Wife of the Above_,” I drew a childish conclusion that\nmy mother was freckled and sickly. To five little stone lozenges, each\nabout a foot and a half long, which were arranged in a neat row beside\nth

In [6]:
chars = sorted(list(set(text)))
vocabsize = len(chars)
print(''.join(chars))
print(vocabsize)

	
 !&(),-.124:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzêô—‘’“”
78


In [7]:
chars[1]

'\n'

### Tokenize the input text
- Google uses sentencepiece as tokenizer (sub-word)
- OpenAi uses tiktoken library as tokenizer
- We will use character level tokenizer here

In [8]:
#mapping from characters to integers and vice-versa
stoi = {char:i for i,char in enumerate(chars)}
itos = {i:char for i,char in enumerate(chars)}
encode = lambda s: [stoi[ch] for ch in s]
decode = lambda s: [itos[i] for i in s]
print(encode("hey"))
print(decode(encode("hey")))

[52, 49, 69]
['h', 'e', 'y']


In [9]:
# lets now encode the entire text data 
data = torch.tensor(encode(text), dtype=torch.long)
data[:100]

tensor([18, 52, 45, 60, 64, 49, 62,  2, 24,  9,  1,  1,  1, 28, 69,  2, 50, 45,
        64, 52, 49, 62, 75, 63,  2, 50, 45, 57, 53, 56, 69,  2, 58, 45, 57, 49,
         2, 46, 49, 53, 58, 51,  2, 31, 53, 62, 62, 53, 60,  7,  2, 45, 58, 48,
         2, 57, 69,  2, 18, 52, 62, 53, 63, 64, 53, 45, 58,  2, 58, 45, 57, 49,
         2, 31, 52, 53, 56, 53, 60,  7,  2, 57, 69,  1, 53, 58, 50, 45, 58, 64,
         2, 64, 59, 58, 51, 65, 49,  2, 47, 59])

In [10]:
# split the train, val data
# first 90% train data and last 10% val data
n = int(0.9*len(data))
train_data = data[0:n]
val_data = data[n:]

In [11]:
# set the context length (block size)
block_size = 16
x = train_data[:block_size+1]
y = x[1:]
print(x)
print(y)

tensor([18, 52, 45, 60, 64, 49, 62,  2, 24,  9,  1,  1,  1, 28, 69,  2, 50])
tensor([52, 45, 60, 64, 49, 62,  2, 24,  9,  1,  1,  1, 28, 69,  2, 50])


In [12]:
# generate batch of data
bs = 32
def gen_batch(split:str, bs=bs, block_size=block_size, device=device):
    '''
    split: train or val
    '''
    data = train_data if split == 'train' else val_data
    ix = torch.randint(0, len(data)-block_size, (bs,))
    x = torch.stack([data[currix:currix+block_size] for currix in ix])
    y = torch.stack([data[currix+1:currix+block_size+1] for currix in ix])
    x = x.to(device)
    y = y.to(device)
    return x, y

In [13]:
xb, yb = gen_batch('train', bs=bs, block_size = block_size)
#print(xb, yb)
print(xb.shape, yb.shape)

torch.Size([32, 16]) torch.Size([32, 16])


### Lets first construct Bigram Model

In [14]:
class BiGramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both B,T tensors of integers
        logits = self.embedding_table(idx) # B,T,C
        if targets == None:
            loss = None
        else:
            logits = rearrange(logits, 'b t c -> (b t) c') # B*T,C
            targets = rearrange(targets, 'b t -> (b t)') # B*T
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is B,T
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            # focus only on last time step
            logits = logits[:,-1,:] # B,C
            # get prob
            probs = F.softmax(logits, dim=-1) # B,C
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # B,1
            # append the idx_next to idx
            idx = torch.cat((idx,idx_next), dim=-1) # B,T+1
        return idx
        

In [15]:
bgm = BiGramModel(vocabsize)
bgm.to(device)
logits, loss = bgm(xb, yb)
loss

tensor(4.7831, device='cuda:0', grad_fn=<NllLossBackward0>)

In [16]:
# generate samples from the untrained network 
max_new_tokens = 100
startidx = torch.ones((1,1), dtype=torch.long).to(device) # corresponds to new line in vocab
print(''.join(decode(bgm.generate(startidx, max_new_tokens)[0].tolist())))


L4AVôND ,uTrê?4rrNzb;XêrZ1p	2êasK	zN_(4YJ?Ng”YV1k”OByDd(]P)VOWRVj-rC,G-”XgJI‘]oW(nL] lêTvDHô;NZu&pJv


as you can see its gibberish, so lets train the bigram model

In [17]:
opt = torch.optim.AdamW(bgm.parameters(), lr=1e-3)

In [39]:
@torch.no_grad()
def estimate_loss(model, bs, block_size, eval_iters):
    '''
    compute losses by averaging over multiple iters for both train and test split
    '''
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = gen_batch('train', bs, block_size)
            logits, loss = model(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out
        

In [19]:
# spin a training loop
n_iters = 10000
bs = 32
block_size = 16
eval_iters = 300
eval_interval = 500
for iter in range(n_iters):
    bgm.train()
    # get the data, target batch
    xb, yb = gen_batch('train', bs, block_size)

    # train the network
    # gen predictions
    logits, loss = bgm(xb, yb)
    # zero gradients
    opt.zero_grad(set_to_none=True)
    # compute backward
    loss.backward()
    opt.step()

    if iter % eval_interval ==0:
        out = estimate_loss(bgm, bs, block_size, eval_iters)
        print(f"iter:{iter}/{n_iters}, train loss: {out['train']}, val loss:{out['val']}")

    

iter:0/10000, train loss: 4.7915802001953125, val loss:4.79641580581665
iter:500/10000, train loss: 4.163066387176514, val loss:4.161252498626709
iter:1000/10000, train loss: 3.658637762069702, val loss:3.659229040145874
iter:1500/10000, train loss: 3.2817556858062744, val loss:3.2786858081817627
iter:2000/10000, train loss: 3.0159642696380615, val loss:3.0194218158721924
iter:2500/10000, train loss: 2.8325557708740234, val loss:2.8375778198242188
iter:3000/10000, train loss: 2.7160823345184326, val loss:2.71744966506958
iter:3500/10000, train loss: 2.636089563369751, val loss:2.6312692165374756
iter:4000/10000, train loss: 2.5739858150482178, val loss:2.571979284286499
iter:4500/10000, train loss: 2.5418694019317627, val loss:2.538332223892212
iter:5000/10000, train loss: 2.5120415687561035, val loss:2.5125715732574463
iter:5500/10000, train loss: 2.499717950820923, val loss:2.4985151290893555
iter:6000/10000, train loss: 2.481940984725952, val loss:2.4851746559143066
iter:6500/10000,

In [20]:
# generate samples from the untrained network 
max_new_tokens = 1000
startidx = torch.ones((1,1), dtype=torch.long).to(device) # corresponds to new line in vocab
print(''.join(decode(bgm.generate(startidx, max_new_tokens)[0].tolist())))



Thet t therusoullotedearryoushak Cor’smer ck grir sat iveay s ghete olinqute dabafad owid ad on
ong orn
r?” terelan own w s w.
tidy, l hend ace
“camentontoie Puledofid cher tanokeathinok “Hone (vee ing towachong f heKê,XL”
Thinengast me andir de hoont roald, t che towe, e t f wakid cok iakers Titid I athe (Agownorigain. ares hed an pe chathind Bre, outins
und I alaslonge o he, d? d, Mre f wiowatoveshr

ad t himerdve h tig qupimedsthe

supe .” s mellyousinnein attake mitousttrknecesarane ghe st omut _xpreave; ttil mind ht
becoone ilan; I a find on ad Mrgeshaneas, on sheas, tin hutoure tlar,
“YonEsan my anver stru ain e-Mimy catid waluor de wagxpoooknesit Puitat mof oaban hivenge I(I bere caboncoot bemeneis ouicheff KLxthe; he w hedand thevereorowancan Han. pe_knguth hathan oind atithean, whextid aind Lothusugn ce at hima toul, acll ivee s
sthalecturele t les at Iffa wha P. h, bed ctoupid t heve fte bel!”
s w g op Jpew, ashifansisspuSur s cl ond qug devnond ratooann, d, po mil Pangr he

As you can see the bigram model loss flattens at 2.4

### Lets try to add one attention Head to our BiGram Model so that it improves

In [21]:
class Head(nn.Module):
    '''
    Implements single scaled attention Head
    '''
    def __init__(self, embed_size, head_size, block_size):
        '''
        embed_size: 
        head_size:
        block_size: context window length
        '''
        super().__init__()
        self.k = nn.Linear(embed_size, head_size, bias = False)
        self.q = nn.Linear(embed_size, head_size, bias = False)
        self.v = nn.Linear(embed_size, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.k(x) # B, T, C
        q = self.q(x) # B, T, C
        v = self.v(x) # B, T, C
        # compute attention scores
        k = rearrange(k, 'b t c -> b c t') # B C T
        att = (q @ k)*C**(-0.5) # B, T, T
        # implement causal attention
        att = att.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1) # B,T,T
        # perform aggregation of values
        out = att @ v # B, T, C
        return out
        

In [22]:
# add single head casual attention to our Bigram model
class BiGramModel(nn.Module):
    def __init__(self, vocab_size, embed_size, block_size, head_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.block_size = block_size
        self.head_size = head_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        self.sa_head = Head(embed_size, head_size, block_size) # self attention head
        self.lm_head = nn.Linear(head_size, vocab_size) # language model head

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both B,T tensors of integers
        token_embedding = self.token_embedding_table(idx) # B,T,C
        position_embedding = self.position_embedding_table(torch.arange(0,T, device=device))
        x = token_embedding + position_embedding # B, T, C
        x = self.sa_head(x)
        logits = self.lm_head(x)
        if targets == None:
            loss = None
        else:
            logits = rearrange(logits, 'b t c -> (b t) c') # B*T,C
            targets = rearrange(targets, 'b t -> (b t)') # B*T
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is B,T
        for _ in range(max_new_tokens):
            # crop the context length to block_size
            idx_cond = idx[:,-self.block_size:]
            # get predictions
            logits, loss = self(idx_cond)
            # focus only on last time step
            logits = logits[:,-1,:] # B,C
            # get prob
            probs = F.softmax(logits, dim=-1) # B,C
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # B,1
            # append the idx_next to idx
            idx = torch.cat((idx,idx_next), dim=-1) # B,T+1
        return idx

        

In [23]:
# spin a training loop
n_iters = 50000
bs = 32
vocab_size = vocabsize
embed_size = 32
block_size = 16
head_size = block_size
eval_iters = 300
eval_interval = 500
bgm = BiGramModel(vocab_size, embed_size, block_size, head_size)
bgm.to(device)
opt = torch.optim.AdamW(bgm.parameters(), lr=1e-3)
for iter in range(n_iters):
    bgm.train()
    # get the data, target batch
    xb, yb = gen_batch('train', bs, block_size)

    # train the network
    # gen predictions
    logits, loss = bgm(xb, yb)
    # zero gradients
    opt.zero_grad(set_to_none=True)
    # compute backward
    loss.backward()
    opt.step()

    if iter % eval_interval == 0:
        out = estimate_loss(bgm, bs, block_size, eval_iters)
        print(f"iter:{iter}/{n_iters}, train loss: {out['train']}, val loss:{out['val']}")

    

iter:0/50000, train loss: 4.3450822830200195, val loss:4.344934940338135
iter:500/50000, train loss: 2.7641870975494385, val loss:2.7654683589935303
iter:1000/50000, train loss: 2.5451624393463135, val loss:2.5513150691986084
iter:1500/50000, train loss: 2.449822187423706, val loss:2.4519753456115723
iter:2000/50000, train loss: 2.412851095199585, val loss:2.4139230251312256
iter:2500/50000, train loss: 2.3892147541046143, val loss:2.389601707458496
iter:3000/50000, train loss: 2.383322238922119, val loss:2.375396728515625
iter:3500/50000, train loss: 2.3682360649108887, val loss:2.3675947189331055
iter:4000/50000, train loss: 2.3538818359375, val loss:2.3517026901245117
iter:4500/50000, train loss: 2.3476147651672363, val loss:2.3534257411956787
iter:5000/50000, train loss: 2.346212863922119, val loss:2.3373053073883057
iter:5500/50000, train loss: 2.3409435749053955, val loss:2.34116268157959
iter:6000/50000, train loss: 2.3334155082702637, val loss:2.3319027423858643
iter:6500/50000

- you can observe that adding attention head to the Model improves loss to 2.27 from 2.4

In [24]:
# generate samples from the trained network 
max_new_tokens = 500
startidx = torch.ones((1,1), dtype=torch.long).to(device) # corresponds to new line in vocab
print(''.join(decode(bgm.generate(startidx, max_new_tokens)[0].tolist())))


by owu ing
mpsecaugat.
Beitimy!”

“I, cra sirchel wer shatlily
ind. The ad hounten wead aricot of wor’llyutante ved Whi boug, “I tef ti in havinuope Wemes,, alt
akel, tond I treng at ourollloven t ailed extresda sut akned ss ougtist. Wmminf and tfrye ond, tenthaninveaverud wine ado-rwas she aght lat hsat, ay, ew she vere fanding wal, ind sma do tho and ds focle er’cche my and hand if mankin in, ives hetircher), ne whe I there baf burty hissiti send ande oprol, moullambe fri thse ssicquiendy one 


- So far our model has only single head, let code multi head self attention
- Multi-head self attention just creates parallel heads for groups of channels
- Add regularization (dropout) to
- - attn
  - forked projection output

In [35]:
class MultiHead(nn.Module):
    def __init__(self, n_heads, embed_size, block_size, dropout):
        super().__init__()
        assert embed_size % n_heads == 0
        self.head_size = embed_size // n_heads
        # key, query, values for all heads but in batch
        self.c_attn = nn.Linear(embed_size, 3*embed_size)
        # output projection from forking path to residual highway
        self.proj = nn.Linear(embed_size, embed_size)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.residual_dropout = nn.Dropout(dropout)
        # causal mask to ensure that attention is applied to left of the tokens
        self.register_buffer("bias", torch.tril(torch.ones(1,1,block_size,block_size)))

        self.n_heads = n_heads

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_heads*self.head_size, dim=2) # B, T, C(=n_heads*head_size)
        # create multi-head
        q = rearrange(q, 'b t (nh c) -> b nh t c', nh=self.n_heads) # B, nh, T, head_size
        k = rearrange(k, 'b t (nh c) -> b nh c t', nh=self.n_heads) # B, nh, head_size, T
        v = rearrange(v, 'b t (nh c) -> b nh t c', nh=self.n_heads) # B, nh, T, head_size
        attn = (q @ k)* self.head_size**(-0.5) #B,nh,T,T
        # causal attention mask
        attn = attn.masked_fill(self.bias[:,:,:T,:T]==0, float('-inf'))
        attn = F.softmax(attn, dim=-1) #B,nh,T,T
        attn = self.attn_dropout(attn)
        y = attn @ v # B,nh,T,T * B,nh,T,head_size -> B,nh,T,head_size
        y = rearrange(y, 'b nh t hs -> b t (nh hs)', nh=self.n_heads) # B, T, nh*head_size(=embed_size)

        # output projection onto residual pathway
        y = self.proj(y) # B, T, nh*head_size(=embed_size) -> B, T, nh*head_size(=embed_size)
        y = self.residual_dropout(y)
        return y

In [36]:
class FF(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        # feed-forward
        self.ffnet = nn.Sequential(nn.Linear(embed_size, embed_size),
                                   nn.ReLU()
                                  )
    def forward(self, x):
        return self.ffnet(x)


In [37]:
# add single head casual attention to our Language model
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, block_size, n_heads, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.block_size = block_size
        self.n_heads = n_heads
        self.dropout = dropout
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        self.sa_mhead = MultiHead(self.n_heads, self.embed_size, self.block_size, self.dropout) # self attention multihead
        self.ffwd = FF(self.embed_size)
        self.lm_head = nn.Linear(self.embed_size, self.vocab_size) # language model head

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both B,T tensors of integers
        token_embedding = self.token_embedding_table(idx) # B,T,C
        position_embedding = self.position_embedding_table(torch.arange(0,T, device=device))
        x = token_embedding + position_embedding # B, T, C
        x = self.sa_mhead(x)
        x = self.ffwd(x)
        logits = self.lm_head(x)
        if targets == None:
            loss = None
        else:
            logits = rearrange(logits, 'b t c -> (b t) c') # B*T,C
            targets = rearrange(targets, 'b t -> (b t)') # B*T
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is B,T
        for _ in range(max_new_tokens):
            # crop the context length to block_size
            idx_cond = idx[:,-self.block_size:]
            # get predictions
            logits, loss = self(idx_cond)
            # focus only on last time step
            logits = logits[:,-1,:] # B,C
            # get prob
            probs = F.softmax(logits, dim=-1) # B,C
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # B,1
            # append the idx_next to idx
            idx = torch.cat((idx,idx_next), dim=-1) # B,T+1
        return idx

        

In [40]:
# lets train multi-head feedforward Language model
# spin a training loop
n_iters = 50000
bs = 32
vocab_size = vocabsize
n_heads = 6
embed_size = 32*n_heads
block_size = 256 
dropout = 0.2
eval_iters = 300
eval_interval = 500
lmodel = LanguageModel(vocab_size, embed_size, block_size, n_heads, dropout)
lmodel.to(device)
opt = torch.optim.AdamW(lmodel.parameters(), lr=3e-4)
for iter in range(n_iters):
    lmodel.train()
    # get the data, target batch
    xb, yb = gen_batch('train', bs, block_size)

    # train the network
    # gen predictions
    logits, loss = lmodel(xb, yb)
    # zero gradients
    opt.zero_grad(set_to_none=True)
    # compute backward
    loss.backward()
    opt.step()

    if iter % eval_interval == 0:
        out = estimate_loss(lmodel, bs, block_size, eval_iters)
        print(f"iter:{iter}/{n_iters}, train loss: {out['train']}, val loss:{out['val']}")

    

iter:0/50000, train loss: 4.3632378578186035, val loss:4.363275527954102
iter:500/50000, train loss: 2.459751605987549, val loss:2.4600212574005127
iter:1000/50000, train loss: 2.386852502822876, val loss:2.384828805923462
iter:1500/50000, train loss: 2.2103793621063232, val loss:2.2106516361236572
iter:2000/50000, train loss: 2.0761990547180176, val loss:2.0777478218078613
iter:2500/50000, train loss: 2.0004918575286865, val loss:2.002643346786499
iter:3000/50000, train loss: 1.933370590209961, val loss:1.9373154640197754
iter:3500/50000, train loss: 1.8805737495422363, val loss:1.878941297531128
iter:4000/50000, train loss: 1.8408046960830688, val loss:1.8423923254013062
iter:4500/50000, train loss: 1.8093771934509277, val loss:1.8081225156784058
iter:5000/50000, train loss: 1.7827160358428955, val loss:1.7856677770614624
iter:5500/50000, train loss: 1.7670941352844238, val loss:1.7637449502944946
iter:6000/50000, train loss: 1.7474581003189087, val loss:1.7498937845230103
iter:6500/

In [41]:
# generate samples from the trained network 
max_new_tokens = 500
startidx = torch.ones((1,1), dtype=torch.long).to(device) # corresponds to new line in vocab
print(''.join(decode(lmodel.generate(startidx, max_new_tokens)[0].tolist())))


MHld Estella vitur.”

“Pocket, and withen inever by, Mr.”

“I _st y after’t nose unters. Godged a no you was to have strom which and me for that a mall little ray
lood see, I am lad God. I I qrearted the Mrs. Jaggers grews, we dear Steer
had with would his ing to befuse offirst
of const seady, answvating a mind the this a with his.

Sevange the left what wards, ordere ying very-ow the littly.

And comminimerst she voile, her that dorm supposes) fests live, tillat an
ever a an of the s’t quite, t


loss had decreased to 1.51 and output tend to start getting better

In [45]:
class MultiHead(nn.Module):
    def __init__(self, n_heads, embed_size, block_size, dropout):
        super().__init__()
        assert embed_size % n_heads == 0
        self.head_size = embed_size // n_heads
        # key, query, values for all heads but in batch
        self.c_attn = nn.Linear(embed_size, 3*embed_size)
        # output projection from forking path to residual highway
        self.proj = nn.Linear(embed_size, embed_size)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.residual_dropout = nn.Dropout(dropout)
        # causal mask to ensure that attention is applied to left of the tokens
        self.register_buffer("bias", torch.tril(torch.ones(1,1,block_size,block_size)))

        self.n_heads = n_heads

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_heads*self.head_size, dim=2) # B, T, C(=n_heads*head_size)
        # create multi-head
        q = rearrange(q, 'b t (nh c) -> b nh t c', nh=self.n_heads) # B, nh, T, head_size
        k = rearrange(k, 'b t (nh c) -> b nh c t', nh=self.n_heads) # B, nh, head_size, T
        v = rearrange(v, 'b t (nh c) -> b nh t c', nh=self.n_heads) # B, nh, T, head_size
        attn = (q @ k)* self.head_size**(-0.5) #B,nh,T,T
        # causal attention mask
        attn = attn.masked_fill(self.bias[:,:,:T,:T]==0, float('-inf'))
        attn = F.softmax(attn, dim=-1) #B,nh,T,T
        attn = self.attn_dropout(attn)
        y = attn @ v # B,nh,T,T * B,nh,T,head_size -> B,nh,T,head_size
        y = rearrange(y, 'b nh t hs -> b t (nh hs)', nh=self.n_heads) # B, T, nh*head_size(=embed_size)

        # output projection onto residual pathway
        y = self.proj(y) # B, T, nh*head_size(=embed_size) -> B, T, nh*head_size(=embed_size)
        y = self.residual_dropout(y)
        return y

In [46]:
class FF(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        # feed-forward
        self.ffnet = nn.Sequential(nn.Linear(embed_size, 4*embed_size),
                                   nn.ReLU(),
                                   nn.Linear(4*embed_size, embed_size)
                                  )
    def forward(self, x):
        return self.ffnet(x)


In [55]:
# Now create a Repeat module of self attention and feed-forward 
class Repeat(nn.Module):
    def __init__(self, n_heads, embed_size, block_size, dropout):
        super().__init__()
        self.sa_mhead = MultiHead(n_heads, embed_size, block_size, dropout) # self attention multihead
        self.ffwd = FF(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        x = self.ln1(x)
        x = x + self.sa_mhead(x)
        x = self.ln2(x)
        x = x + self.ffwd(x)
        return x
    

In [56]:
# add single head casual attention to our Language model
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, block_size, n_heads, dropout, n_repeat=4):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.block_size = block_size
        self.n_heads = n_heads
        self.dropout = dropout
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        self.repeat = nn.Sequential(*[Repeat(self.n_heads, self.embed_size, self.block_size, self.dropout) for _ in range(n_repeat)])        
        self.lm_head = nn.Linear(self.embed_size, self.vocab_size) # language model head

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both B,T tensors of integers
        token_embedding = self.token_embedding_table(idx) # B,T,C
        position_embedding = self.position_embedding_table(torch.arange(0,T, device=device))
        x = token_embedding + position_embedding # B, T, C
        x = self.repeat(x)
        logits = self.lm_head(x)
        if targets == None:
            loss = None
        else:
            logits = rearrange(logits, 'b t c -> (b t) c') # B*T,C
            targets = rearrange(targets, 'b t -> (b t)') # B*T
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is B,T
        for _ in range(max_new_tokens):
            # crop the context length to block_size
            idx_cond = idx[:,-self.block_size:]
            # get predictions
            logits, loss = self(idx_cond)
            # focus only on last time step
            logits = logits[:,-1,:] # B,C
            # get prob
            probs = F.softmax(logits, dim=-1) # B,C
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # B,1
            # append the idx_next to idx
            idx = torch.cat((idx,idx_next), dim=-1) # B,T+1
        return idx

        

In [57]:
# lets train multi-head feedforward Language model
# spin a training loop
n_iters = 100000
bs = 32
vocab_size = vocabsize
n_heads = 8
embed_size = 32*n_heads
block_size = 256 
dropout = 0.2
eval_iters = 300
eval_interval = 500
lmodel = LanguageModel(vocab_size, embed_size, block_size, n_heads, dropout, n_repeat=8)
lmodel.to(device)
opt = torch.optim.AdamW(lmodel.parameters(), lr=3e-4)
for iter in range(n_iters):
    lmodel.train()
    # get the data, target batch
    xb, yb = gen_batch('train', bs, block_size)

    # train the network
    # gen predictions
    logits, loss = lmodel(xb, yb)
    # zero gradients
    opt.zero_grad(set_to_none=True)
    # compute backward
    loss.backward()
    opt.step()

    if iter % eval_interval == 0:
        out = estimate_loss(lmodel, bs, block_size, eval_iters)
        print(f"iter:{iter}/{n_iters}, train loss: {out['train']}, val loss:{out['val']}")

    

iter:0/100000, train loss: 3.6579601764678955, val loss:3.6577863693237305
iter:500/100000, train loss: 2.0668795108795166, val loss:2.0664937496185303
iter:1000/100000, train loss: 1.628572702407837, val loss:1.629278540611267
iter:1500/100000, train loss: 1.4588418006896973, val loss:1.4579851627349854
iter:2000/100000, train loss: 1.3529359102249146, val loss:1.3502298593521118
iter:2500/100000, train loss: 1.2857931852340698, val loss:1.2888283729553223
iter:3000/100000, train loss: 1.2354055643081665, val loss:1.2337968349456787
iter:3500/100000, train loss: 1.1930347681045532, val loss:1.1904962062835693
iter:4000/100000, train loss: 1.1578441858291626, val loss:1.1607214212417603
iter:4500/100000, train loss: 1.1300195455551147, val loss:1.1298130750656128
iter:5000/100000, train loss: 1.0977473258972168, val loss:1.0990949869155884
iter:5500/100000, train loss: 1.0764579772949219, val loss:1.0777502059936523
iter:6000/100000, train loss: 1.047157883644104, val loss:1.0469638109

In [58]:
# generate samples from the trained network 
max_new_tokens = 5000
startidx = torch.ones((1,1), dtype=torch.long).to(device) # corresponds to new line in vocab
print(''.join(decode(lmodel.generate(startidx, max_new_tokens)[0].tolist())))


with Mr. Jaggers’s chair,—“you know where you live well and do as happy!”

I told him I would, and indeed was gone.

“Don’t go home.” Said the convict I have felt all you so?”

“Well!” said Wemmick, “I don’t know when I was in the case, Mr. Wopsle.
She was in alarmingly, by often spirits had been watched; and how it
fellow that I had come into the purpose of clothes
with the toast, and began to flay his vict most devoted by business to cry Temple what
relatiation of marrying Mr. Jaggers’s premises. Some man plum on
the want of the wall hold.”

“You seem,” I answered, “when you see it sue do, dear boy! And if they’re
understand one of your long attending on the river, and there was
a soldier to touch the sideways of the motherly Mrs. Pocket across the
courtyard. Clear XXXI.


Herbert and I went on from bad to worse, if not not even the consideration
ordebefore me to be made of every broad charged him. On the present day so
well here, that I took the liberty of saying that we thanked hi

In [59]:
# save the trained model
torch.save(lmodel.state_dict(), f"./model_100000iter.pt")
torch.save(opt.state_dict(), f"./optimizer_100000iter.pt")

In [64]:
# measure the size of model parameter
params_list = list(lmodel.parameters())
params_list

[Parameter containing:
 tensor([[ 0.9806, -0.2957, -0.0035,  ...,  0.6284,  1.5985,  0.4740],
         [ 0.3568,  0.9068, -0.2155,  ..., -0.1937, -1.5465, -0.2948],
         [ 0.6864,  0.4404,  0.4972,  ..., -1.1909,  0.2301,  0.4392],
         ...,
         [-0.3087, -0.4810,  0.1694,  ...,  0.6170, -0.1951, -0.4258],
         [ 1.2487,  0.6081, -0.2465,  ...,  1.2531,  1.1914,  0.7457],
         [-0.1844, -0.9085, -0.1633,  ..., -0.2688, -0.8458, -0.7943]],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[-0.1545, -0.3936,  1.0860,  ...,  1.0817, -0.0615,  0.2759],
         [ 0.2622,  0.5031,  0.4495,  ..., -0.5781,  0.6220, -0.1772],
         [ 0.8494,  0.5006, -0.9035,  ..., -0.1460, -0.2735, -0.5839],
         ...,
         [-0.3464, -0.2779, -0.8430,  ..., -0.9030,  0.0379,  0.2934],
         [-0.3402,  2.3996, -0.0743,  ...,  0.2541,  2.0632, -0.9979],
         [ 0.5017,  0.9751, -1.6261,  ..., -0.8861,  0.8546,  0.1944]],
        device='cuda:0', r

In [69]:
num_params = [ p.nelement() for p in params_list]
total_params = sum(num_params)
total_params

6423630

*Summary*
- We have trained character level auto-regressive causal language model
- Key Architecture points: (layernorm, self attention, residual pathways, feedforward) x 8 followed by languagehead
- Our data has ~1M tokens
- Our model has ~6.4M parameters
- Our Vocab size is ~78
- Context window: 256 tokens (characters)
- 8 heads each of dim 32
- Train time ~ 5 hrs on single GPU RTX4080
- Final val loss: ~0.09