<a href="https://colab.research.google.com/github/Voodoo-999/gpt_from_scratch/blob/main/gpt_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F



In [None]:
# Hyperparameters, grouped by the part of the notebook that reads them.

# Data batching
batch_size = 64     # number of sequences sampled per batch
block_size = 256    # number of tokens in each sampled sequence

# Model architecture
n_embd = 384        # embedding width
n_head = 6          # attention heads per transformer block
n_layer = 6         # number of transformer blocks
dropout = 0.2       # dropout probability passed to every nn.Dropout

# Optimisation schedule
learning_rate = 3e-4
max_iters = 5000        # total number of training steps
eval_interval = 500     # steps between loss reports

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
# Colab upload helper. `uploaded` is used further down as a mapping whose
# first key is taken as the name of the text file to read.
# NOTE(review): this import only resolves inside Google Colab — outside Colab
# the corpus has to be provided some other way.
from google.colab import files
uploaded = files.upload()

Saving brothers to brothers (3)


In [None]:

# Use the first uploaded file; fail with a clear message when there is none.
if not uploaded:
  raise RuntimeError('No file was uploaded; re-run the upload cell first.')
filename = next(iter(uploaded))

# Decode explicitly as UTF-8: the corpus contains non-ASCII characters (curly
# quotes, ligatures, accented letters) and the default encoding of open()
# depends on the platform locale.
with open(filename, 'r', encoding='utf-8') as f:
  text = f.read()

text_len = len(text)
print(text_len)

# Character-level vocabulary: every distinct character of the corpus, sorted
# so that the character <-> id mapping is the same on every run.
vocab = sorted(set(text))
print(vocab)
vocab_len = len(vocab)
print(vocab_len)

1930500
['\n', ' ', '!', '(', ')', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Æ', 'à', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ü', 'Œ', 'œ', '‐', '—', '‘', '’', '“', '”']
95


In [None]:
# Two-way lookup tables between characters and integer ids; the id of a
# character is its position in `vocab`.
stoi = dict(zip(vocab, range(len(vocab))))
itos = dict(enumerate(vocab))
def encode(string):
  """Return the list of integer ids obtained by looking up each character of `string` in `stoi`.

  Raises KeyError for a character that is not a key of `stoi`.
  """
  ids = []
  for ch in string:
    ids.append(stoi[ch])
  return ids
def decode(integers):
  """Return the string built by concatenating `itos[i]` for every id in `integers`.

  Raises KeyError for an id that is not a key of `itos`.
  """
  chars = [itos[i] for i in integers]
  return ''.join(chars)

In [None]:
# Encode the whole corpus once into a tensor of int64 token ids.
encoded_text = torch.tensor(
    encode(text),
    dtype=torch.long,
)

print(encoded_text.size())

torch.Size([1930500])


In [None]:
# The first 90% of the token ids are used for training, the rest for validation.
train_split = int(0.9*text_len)
train_text, val_text = encoded_text[:train_split], encoded_text[train_split:]

In [None]:
def get_batch(split):
  """Sample a random batch of input/target sequences.

  Args:
    split: 'train' reads from `train_text`; any other value reads from `val_text`.

  Returns:
    (x, y), both of shape (batch_size, block_size) and moved to `device`.
    `y` holds the same windows as `x` shifted one position to the right, so
    y[b, t] is the token that follows x[b, t] in the data.
  """
  source = train_text if split == 'train' else val_text
  # One random start offset per sequence; the upper bound leaves room for the
  # extra token that the shifted target window needs.
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start + block_size])
    targets.append(source[start + 1:start + block_size + 1])
  return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Draw one training batch and show inputs followed by targets; `xb` and `yb`
# are reused later for the model sanity check.
xb, yb = get_batch('train')
print(xb, yb, sep='\n')

tensor([[57, 67, 56,  ..., 56, 54, 69],
        [ 1, 68, 56,  ..., 56, 53, 61],
        [ 1, 68, 56,  ..., 56, 57, 62],
        ...,
        [63, 63, 52,  ..., 53, 66, 57],
        [49, 67, 53,  ..., 60, 52,  1],
        [64, 53, 49,  ...,  5,  1, 49]], device='cuda:0')
tensor([[67, 56, 61,  ..., 54, 69, 60],
        [68, 56, 57,  ..., 53, 61,  1],
        [68, 56, 53,  ..., 57, 62, 59],
        ...,
        [63, 52,  1,  ..., 66, 57, 62],
        [67, 53, 19,  ..., 52,  1, 56],
        [53, 49, 67,  ...,  1, 49, 62]], device='cuda:0')


In [None]:
class Head(nn.Module):
  """One head of causal (masked) self-attention.

  Args:
    head_size: output width of the key, query and value projections.

  The input width of the projections is the global `n_embd`; the causal mask
  covers sequences of up to the global `block_size` positions.
  """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # Lower-triangular matrix of ones used as the causal mask. Registered as a
    # buffer so it is part of the module state without being a parameter.
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Apply masked self-attention.

    Args:
      x: tensor of shape (B, T, n_embd) with T <= block_size.

    Returns:
      Tensor of shape (B, T, head_size).
    """
    _, T, _ = x.shape
    k = self.key(x)    # (B, T, head_size)
    q = self.query(x)  # (B, T, head_size)
    v = self.value(x)  # (B, T, head_size)
    # Scale by 1/sqrt(head_size): the dot products are taken over the key
    # dimension, so that is the width that controls their variance. Scaling by
    # the embedding width of x would shrink the scores too much.
    head_size = k.shape[-1]
    wei = q @ k.transpose(-2, -1) * head_size**-0.5  # (B, T, hs) @ (B, hs, T) --> (B, T, T)
    # Positions above the diagonal are in the future; -inf becomes 0 after softmax.
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # still (B, T, T)
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)
    out = wei @ v  # (B, T, T) @ (B, T, hs) --> (B, T, hs)
    return out


In [None]:
class MultiHeadattention(nn.Module):
  """Several `Head`s run on the same input, concatenated and projected back to `n_embd`.

  Args:
    num_heads: number of `Head` modules to create.
    head_size: output width of each head.
  """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads*head_size wide, which equals
    # n_embd only when n_embd is divisible by the number of heads; size the
    # projection from what it actually receives.
    self.proj = nn.Linear(num_heads * head_size, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return the projected concatenation of all head outputs, with dropout applied."""
    out = torch.cat([h(x) for h in self.heads], dim=-1)  # (..., num_heads*head_size)
    out = self.dropout(self.proj(out))
    return out

In [None]:
class FeedFoward(nn.Module):
  """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout.

  Args:
    n_embd: input and output width; the hidden layer is four times as wide.
  """

  def __init__(self, n_embd):
    super().__init__()
    hidden = 4 * n_embd
    expand = nn.Linear(n_embd, hidden)
    # Projection back to the width of the residual pathway.
    contract = nn.Linear(hidden, n_embd)
    self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

  def forward(self, x):
    """Run `x` through the MLP; the last dimension must be `n_embd` wide."""
    return self.net(x)

In [None]:
class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each applied to a
  layer-normalised input and added back to the residual stream.

  Args:
    n_embd: width of the residual stream.
    n_head: number of attention heads; each head is n_embd // n_head wide.
  """

  def __init__(self,n_embd, n_head):
    super().__init__()
    per_head = n_embd // n_head
    self.sa_heads = MultiHeadattention(n_head, per_head)
    self.ffwd = FeedFoward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    """Return `x` after the attention and feed-forward residual updates."""
    attended = self.sa_heads(self.ln1(x))
    x = x + attended
    return x + self.ffwd(self.ln2(x))

In [None]:
class GPTLanguageModel(nn.Module):
  """Decoder-only transformer language model over `vocab_len` token ids.

  Token and position embeddings are summed, passed through `n_layer` `Block`s,
  normalised by a final LayerNorm and projected to vocabulary logits. All
  sizes are read from the notebook-level hyperparameters at construction time.
  """

  def __init__(self):
    super().__init__()
    self.token_embd_table = nn.Embedding(vocab_len, n_embd)
    self.position_embd_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_len)

  def forward(self, idx, targets = None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: integer tensor of token ids, shape (B, T) with T <= block_size.
      targets: optional integer tensor of shape (B, T) with the expected next tokens.

    Returns:
      (logits, loss). Without targets, logits is (B, T, vocab_len) and loss is
      None. With targets, logits is flattened to (B*T, vocab_len) and loss is
      the scalar cross-entropy.
    """
    B, T = idx.shape

    tok_emb = self.token_embd_table(idx)  # (B, T, n_embd)
    # Build the positions on the same device as the input rather than on the
    # global `device`, so the model works wherever it and its inputs live.
    positions = torch.arange(T, device=idx.device)
    pos_emb = self.position_embd_table(positions)  # (T, n_embd)
    x = tok_emb + pos_emb  # position embeddings broadcast over the batch
    x = self.blocks(x)
    # Final layer norm: it was constructed in __init__ but never applied.
    x = self.ln_f(x)
    logits = self.lm_head(x)  # (B, T, vocab_len)
    if targets is None:
      return logits, None

    # F.cross_entropy expects (N, C) logits and (N,) targets.
    logits = logits.view(B * T, -1)
    targets = targets.view(B * T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    """Sample `max_new_tokens` tokens autoregressively and append them to `idx`.

    Args:
      idx: integer tensor of shape (B, T) holding the starting context.
      max_new_tokens: number of tokens to sample.

    Returns:
      Integer tensor of shape (B, T + max_new_tokens).

    Sampling runs without autograd and with the model in eval mode so that
    dropout does not perturb the predictive distribution; the previous
    train/eval mode is restored before returning.
    """
    # The position table bounds how much context the model can consume.
    context_len = self.position_embd_table.num_embeddings
    was_training = self.training
    self.eval()
    try:
      for _ in range(max_new_tokens):
        idx_cropped = idx[:, -context_len:]
        logits, _ = self(idx_cropped)
        logits = logits[:, -1, :]  # keep the last time step only: (B, vocab_len)
        probs = F.softmax(logits, dim=-1)  # (B, vocab_len)
        idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
        idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
    finally:
      self.train(was_training)
    return idx
# Build the model and move it to the selected device; `m` is the handle the
# later cells call.
model = GPTLanguageModel()
m = model.to(device)

# Sanity check on the sample batch before any training: flattened logits
# shape followed by the initial loss.
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

torch.Size([16384, 95])
tensor(4.9397, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
# Generate 1000 tokens starting from a single token with id 0 and print the decoded text.
seed_tokens = torch.zeros((1, 1), dtype=torch.long, device=device)
sample_ids = m.generate(seed_tokens, max_new_tokens=1000)[0].tolist()
print(decode(sample_ids))



“We simply come to speak in the garden‐hap one of justice.
And weeping paraphe, yet then mane renoun


In [None]:
@torch.no_grad()
def estimate_loss(gpt, iters = 100):
  out = {}
  gpt.eval()
  for split in ['train','val']:
    losses = torch.zeros(iters)
    for k in range(iters):
      X, Y = get_batch(split)
      _logits, loss = gpt(X,Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  gpt.train()
  return out

In [None]:

# AdamW over all model parameters with the configured learning rate.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
)

def train_model(model, optimizer):
  """Train `model` for `max_iters` steps on random training batches.

  Args:
    model: module called as model(x, y) and returning (logits, loss).
    optimizer: optimizer built over the parameters of `model`.

  Every `eval_interval` steps (including step 0) the mean train and validation
  losses from `estimate_loss` are printed.
  """
  for step in range(max_iters):
    if step % eval_interval == 0:
      loss_status = estimate_loss(model)
      print(f"train loss: {loss_status['train'].item():.4f}, eval loss: {loss_status['val'].item():.4f} ")
    x, y = get_batch('train')
    # Use the `model` argument, not the notebook-level `m`, so the function
    # trains whatever it is given.
    _, loss = model(x, y)
    # set_to_none=True drops the old gradients instead of zero-filling them.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Run the training loop on the model built above.
train_model(model=m, optimizer=optimizer)

train loss: 1.0662, eval loss: 1.1625 
train loss: 1.0487, eval loss: 1.1536 
train loss: 1.0285, eval loss: 1.1467 


KeyboardInterrupt: 

In [None]:
# Generate 1000 tokens from the trained model, starting from a single token
# with id 0, and print the decoded text.
seed_tokens = torch.zeros((1, 1), dtype=torch.long, device=device)
sample_ids = m.generate(seed_tokens, max_new_tokens=1000)[0].tolist()
print(decode(sample_ids))



shall I didn’t kill your faith so, you worried to‐morrow moons and
yet, for it ought to a life of the death of men’s comprehension in the
same important Orthodox, which the world were all out of all over with
carry to a barrile son. I am terribly leady of infamulation, but what do you
don’t do anything about your? You see what nearess it’s not all a man. I am in
eccessary, knew that. My boy is von Sohn? What would you believe it,
every one conceme, it’s healt, but she won’t happen and did not you went to her and kindly
along. You are right forgiveness?”

“Tchermands are solidarily afraid. So you are something seeking;
there’s no use.”

“It’s no true, Rakitin must—bravely! Damn it does your native is our
sensitive in return?”

Nikolay Parfenovitch though he did not know what from the bottle actually
dressed in discussion, every one little in his neck‐trees were seized them, and at
the servants of a sort of sgreeping roubles; he saw this hand, when he
got up his hands promised to the el