In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

In [2]:
# set seeds
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
cudnn.deterministic = True
cudnn.benchmark = False

# Loading Data and Preprocessing

In [3]:
data_path = 'data/shakespeare/input.txt'
with open(data_path, 'r') as f:
    text = f.read()

In [4]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [5]:
vocab = sorted(set(text))
vocab_size = len(vocab)
''.join(vocab), vocab_size

("\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", 65)

In [6]:
stoi = {c: i for i, c in enumerate(vocab)}
itos = {i: c for i, c in enumerate(vocab)}

def encode(s):
    return torch.tensor([stoi[c] for c in s], dtype=torch.long)

def decode(t):
    return ''.join(itos[i] for i in t)

In [7]:
encode('hello world'), decode(encode('hello world').tolist())

(tensor([46, 43, 50, 50, 53,  1, 61, 53, 56, 50, 42]), 'hello world')

In [8]:
data = encode(text)
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [9]:
split = 0.8
train_data = data[:int(len(data)*split)]
val_data = data[int(len(data)*split):]
len(data), len(train_data), len(val_data)

(1115394, 892315, 223079)

# Adding the sequence length

In [10]:
sequence_length = 8

In [11]:
def tmp():
    tmp = data[:sequence_length+1]
    print(f'The following tensor {tmp} contains following {sequence_length} datapoints: ')
    for i in range(1,sequence_length+1):
        print(f'Sequence {tmp[:i]} should be followed by {tmp[i]}')

tmp()

The following tensor tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) contains following 8 datapoints: 
Sequence tensor([18]) should be followed by 47
Sequence tensor([18, 47]) should be followed by 56
Sequence tensor([18, 47, 56]) should be followed by 57
Sequence tensor([18, 47, 56, 57]) should be followed by 58
Sequence tensor([18, 47, 56, 57, 58]) should be followed by 1
Sequence tensor([18, 47, 56, 57, 58,  1]) should be followed by 15
Sequence tensor([18, 47, 56, 57, 58,  1, 15]) should be followed by 47
Sequence tensor([18, 47, 56, 57, 58,  1, 15, 47]) should be followed by 58


In [34]:
torch.randint(5, (3, ))

tensor([1, 1, 0])

In [12]:
batch_size = 4
def get_batches(data):
    idx = torch.randint(len(data) - sequence_length, (batch_size, ))
    xb = torch.stack([data[i:i+sequence_length] for i in idx], dim=0)
    yb = torch.stack([data[i+1:i+sequence_length+1] for i in idx], dim=0)
    return xb, yb

x, y = get_batches(train_data)
x, y

(tensor([[47, 57, 10,  1, 39, 52, 42,  1],
         [59, 56,  1, 46, 43, 39, 56, 58],
         [32, 46, 39, 58,  1, 39, 50, 61],
         [26, 53, 58, 46, 47, 52, 45,  1]]),
 tensor([[57, 10,  1, 39, 52, 42,  1, 50],
         [56,  1, 46, 43, 39, 56, 58, 57],
         [46, 39, 58,  1, 39, 50, 61, 39],
         [53, 58, 46, 47, 52, 45,  1, 40]]))

In [13]:
class BigramLanguageModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, vocab_size)

    def forward(self, ixs, targets=None):
        # ixs: (b,t)
        # targets: (b,t)
        logits = self.embed(ixs) # (b,t,c=vocab_size)
        if targets is None:
            loss = None
        else:
            logits = logits.permute(0,2,1) # (b,c,t)
            loss = F.cross_entropy(logits, targets)
            logits = logits.permute(0,2,1) # back to (b,t,c)
        return logits, loss

    def generate(self, ixs, max_len):
        """
        ixs: (b,t) - input sequence to start generating from
        max_len: int - maximum length of the generated sequence
        """
        b, t = ixs.shape
        for _ in range(max_len):
            # generation (b, ) next tokens in parallel
            logits, loss = self.forward(ixs) # logits=(b,t,c), loss is ignored
            # get juse the final timestep
            last_logits = logits[:, -1, :] # (b,c)
            # normalize
            last_probs = F.softmax(last_logits, dim=-1) # across c
            next_tokens = torch.multinomial(last_probs, 1) # (b,c) -> (b)
            ixs = torch.cat((ixs, next_tokens), dim=1) # across t so (b,t) -> (b, t+1)
        return ixs


In [14]:
def tmp_ce():
    loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
    x = torch.randn((2,3))
    sx = F.softmax(x, dim=-1)
    y = torch.randint(3, (2, ), dtype=torch.long)
    loss = loss_fn(x, y)
    print('x==')
    print(x)
    print('sx==')
    print(sx)
    print('y==')
    print(y)
    print('loss==')
    print(loss)
    # x = torch.randn(size=(4,8,65)) # (b,t,c)
    # y = torch.randn(size=(4,8)) # (b,t)
# tmp_ce()

In [183]:
def tmp_multinomial():
    x = torch.tensor([0.1, 0.3, 0.9])
    print(torch.multinomial(x, num_samples=3))
# tmp_multinomial()

In [15]:
blm = BigramLanguageModel()

In [16]:
start_ix = torch.zeros((1,1), dtype=torch.long)
print(decode(blm.generate(start_ix, 100)[0].tolist()))


cfYCDRUZsYBsA?Y?vgB!ZWOEiAoezL:q&Avufr?gSGdWrp&Bxt-R?wo'TYhBChdIC-RDaRmEGENyouVg'UjyQNyQSpZUVeN:BZqh


In [17]:
optimzer = torch.optim.AdamW(blm.parameters(), lr=2e-3)

In [232]:
batch_size=64
num_epochs=10000
for i in range(num_epochs):
    xb, yb = get_batches(train_data)
    logits, loss = blm(xb, yb)

    # print(f'Epoch {i}: Loss={loss.item()}')

    optimzer.zero_grad()
    loss.backward()
    optimzer.step()

print(loss.item())
print(decode(blm.generate(start_ix, 100)[0].tolist()))


2.4050023555755615

TEpipe.
Ditharomy rells y Bef podst,
Ath m tha otarizewice'ds aghadeat songgritindiuse, maisfeadorre


# Self Attention

torch.Size([4, 8, 32])

In [20]:
B,T,C = 4,8,32
x = torch.randn((B,T,C))
head_size=16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B,T,C) -> (B,T,head_size)
q = query(x) # (B,T,C) -> (B,T,head_size)
v = value(x)

wei = k @ q.transpose(-2,-1) # (B,T,head_size) @ (B,head_size,T) -> (B,T,T)

tril = torch.tril(torch.ones((8,8)))
# wei = torch.zeros((8,8))
wei = wei.masked_fill((tril == 0.), -torch.inf)
wei = F.softmax(wei, dim=-1)
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0653, 0.9347, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1879, 0.0124, 0.7998, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0656, 0.5746, 0.2039, 0.1560, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1196, 0.1878, 0.6265, 0.0085, 0.0576, 0.0000, 0.0000, 0.0000],
        [0.1301, 0.0961, 0.4267, 0.0108, 0.1413, 0.1950, 0.0000, 0.0000],
        [0.0171, 0.5209, 0.0979, 0.0920, 0.0499, 0.1279, 0.0944, 0.0000],
        [0.0840, 0.1627, 0.3036, 0.0377, 0.0459, 0.3308, 0.0166, 0.0188]],
       grad_fn=<SelectBackward0>)

In [285]:
xbow = wei @ v # (b,t,c)
xbow.shape

torch.Size([4, 8, 16])

In [1]:
def tmp_var():
    q = torch.randn((8,24))
    k = torch.randn((8,24))
    print(k.var())
    print(v.var())
    wei = k @ q.T  # (24 ** (-0.5))
    print(wei.var())


tmp_var()

NameError: name 'torch' is not defined