<a href="https://colab.research.google.com/github/PsyCharan17/pytorch-adventures/blob/main/nanogpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.nn import functional as F

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Read the whole corpus into memory as one string.
with open("input.txt", mode="r", encoding="utf-8") as corpus_file:
  text = corpus_file.read()

In [None]:
# len() of a str counts characters, not bytes.
num_chars = len(text)
print("length of dataset in characters: ", num_chars)

In [None]:
# Eyeball the first 1000 characters of the corpus.
preview = text[:1000]
print(preview)

In [None]:
# Vocabulary: every distinct character that occurs in the text, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)

In [None]:
# Character-level tokenizer: each vocabulary character maps to its index in `chars`.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))

def encode(s):
  """Turn a string into the list of integer ids of its characters."""
  return [stoi[c] for c in s]

def decode(l):
  """Turn a list of integer ids back into the string they spell."""
  return ''.join(itos[i] for i in l)

# Round-trip a sample string to confirm decode undoes encode.
sample_ids = encode("hii there")
print(sample_ids)
print(decode(sample_ids))


In [None]:

# Tokenize the full corpus once and keep the ids as a 1-D int64 tensor.
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

In [None]:
# The first 90% of the tokens are for training; the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
# Display both split sizes and their total.
len(train_data), len(val_data), len(train_data) + len(val_data)

In [None]:
# Data loader preview: the transformer is fed the data in batches of fixed-size chunks.
# Show the first block_size + 1 tokens of the training split.
block_size = 8
train_data[0:block_size + 1]

In [None]:
# y is x shifted one position to the right, so y[t] is the token that follows x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
  context, target = x[:t + 1], y[t]
  print(f"when input is {context} the target: {target}")

In [None]:
# Fix the RNG so the randomly sampled batches are repeatable.
torch.manual_seed(1337)
batch_size, block_size = 4, 8  # sequences per batch, tokens per sequence

def get_batch(split):
  """Sample a random batch of input/target windows from one data split.

  Reads the module-level train_data, val_data, block_size and batch_size.

  Args:
    split: 'train' selects train_data; any other value selects val_data.

  Returns:
    (x, y), each of shape (batch_size, block_size). y holds the same windows
    as x shifted one token to the right.
  """
  source = train_data if split == 'train' else val_data
  # The highest start index must leave room for block_size inputs plus one target.
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start + block_size])
    targets.append(source[start + 1:start + block_size + 1])
  return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
# Side quest: decode the randomly chosen input windows back into text.
# print([f"Batch {i}: '{decode(row.tolist())}'" for i, row in enumerate(xb)])
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

# Spell out every (context -> target) pair contained in the batch.
for b in range(batch_size):
  row_inputs, row_targets = xb[b], yb[b]
  for t in range(block_size):
    context, target = row_inputs[:t + 1], row_targets[t]
    print(f"when input is {context.tolist()} the target: {target}")



In [None]:
# One statement per print: chaining the calls with commas builds a tuple of their
# None return values, which the notebook then echoes as (None, None, None, None).
print(xb)
print(yb)
print(xb[0, :5])
print(yb[0, 4])

In [None]:
import torch.nn as nn
from torch.nn import functional as F
# Re-seed so the random draws made later in this cell (e.g. torch.multinomial in generate) are repeatable.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  """Bigram language model: next-token logits are looked up from the current token alone.

  The only parameter is a (vocab_size, vocab_size) embedding table; indexing it
  with a token id yields one score per vocabulary entry for the following token.
  """

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute logits for every position and, if targets are given, the loss.

    Args:
      idx: (B, T) integer tensor of token ids.
      targets: optional (B, T) integer tensor of the ids to predict.

    Returns:
      (logits, loss). Without targets, logits has shape (B, T, C) and loss is
      None. With targets, logits is flattened to (B*T, C) and loss is the
      cross-entropy between those logits and the flattened targets.
    """
    logits = self.token_embedding_table(idx)
    if targets is None:
      return logits, None

    B, T, C = logits.shape
    # F.cross_entropy takes (N, C) scores with (N,) class ids, so fold batch and time together.
    logits = logits.view(B*T, C)
    loss = F.cross_entropy(logits, targets.view(B*T))
    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Extend each sequence in idx by max_new_tokens sampled tokens.

    Args:
      idx: (B, T) integer tensor of indices forming the current context.
      max_new_tokens: number of tokens to append to every sequence.

    Returns:
      (B, T + max_new_tokens) tensor: the input context followed by the samples.
    """
    # Sampling never calls backward, so skip building an autograd graph for these forward passes.
    with torch.no_grad():
      for _ in range(max_new_tokens):
        logits = self(idx)[0]
        # Only the last time step is used to predict the next token.
        last_logits = logits[:, -1, :]
        probs = F.softmax(last_logits, dim=-1)
        # Draw one token per sequence from the predicted distribution.
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx



m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
# Sample 100 tokens from the freshly built model, starting from a single token of id 0.
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
# Separate statements: comma-chained print calls form a tuple of None values
# that the notebook would echo as (None, None, None).
print(out.shape)
print(loss.shape)
print(f"Loss is {loss}")

In [None]:
# m.parameters() returns a generator, so displaying it shows only an object repr.
# List each parameter's name and shape instead.
[(name, tuple(param.shape)) for name, param in m.named_parameters()]

In [None]:
# AdamW over all of the model's parameters with a fixed learning rate.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [None]:
# get_batch reads the global batch_size, so every batch drawn from here on holds 32 sequences.
batch_size = 32
for steps in range(10000):
  # Clear the gradients left over from the previous step before computing new ones.
  optimizer.zero_grad(set_to_none=True)
  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()
# Loss of the final mini-batch only, not an average over the run.
print(loss.item())

In [None]:
# Generate 400 new tokens, starting from a single token of id 0, and decode them to text.
seed_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(seed_idx, max_new_tokens=400)
print(decode(sampled[0].tolist()))

## Self-attention

In [None]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

# Masking everything above the diagonal with -inf before the softmax leaves each
# row t with equal weights over positions 0..t and zero weight on later positions.
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
out = torch.matmul(wei, x)  # (T, T) broadcasts against (B, T, C) -> (B, T, C)
out.shape

In [None]:
# First method: explicit loops. xbow[b, t] is the mean of x[b, 0..t].
# Here B=4, T=8, C=2.
xbow = torch.zeros((B, T, C))
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1]  # (t+1, C): every position up to and including t
    xbow[b, t] = xprev.mean(dim=0)
xbow

In [None]:
# Toy example: multiplying by an all-ones matrix puts the column sums of b in every row of c.
torch.manual_seed(42)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).to(torch.float32)
c = torch.matmul(a, b)
print(f"a = \n{a}")
print(f"b = \n{b}")
print(f"c = \n{c}")

In [None]:
# Toy example: a row-normalised lower-triangular matrix makes row i of c the mean of rows 0..i of b.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).to(torch.float32)
c = torch.matmul(a, b)
print(f"a = \n{a}")
print(f"b = \n{b}")
print(f"c = \n{c}")

In [None]:
# Second method: the same running mean expressed as a single matrix multiply.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)  # each row now sums to 1
xbow2 = torch.matmul(wei, x)  # (T, T) broadcast over the batch @ (B, T, C) -> (B, T, C)
xbow2  # xbow2 and xbow are the same

In [None]:
# Version 3: build the averaging weights with a masked softmax.
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T)
wei = wei.masked_fill(tril == 0, float('-inf'))
print(wei)
# Softmax exponentiates each entry and divides by the row's sum of exponentials.
# exp(-inf) is 0, so masked positions get zero weight and the rest share equally.
wei = wei.softmax(dim=-1)
print(wei)
xbow3 = torch.matmul(wei, x)
xbow3