<a href="https://colab.research.google.com/github/Tejshah740/GPT_from_scratch/blob/main/GPT_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader,TensorDataset
import math
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
from IPython.utils.path import ensure_dir_exists
# Load the raw corpus and trim it to the span between the title line and the
# Project Gutenberg end-of-book marker.
DATA_PATH="/content/sample_data/Sherlock.txt"
START_MARKER="The Adventures of Sherlock Holmes"
END_MARKER="*** End of the Project Gutenberg"
with open(DATA_PATH,"r",encoding="utf-8") as f:
  text=f.read()
start=text.find(START_MARKER)
end=text.find(END_MARKER)
# str.find returns -1 when a marker is absent. Used directly as a slice bound,
# end == -1 would silently drop the last character and start == -1 would leave
# almost nothing, so fall back to the file boundaries instead.
if start==-1:
  start=0
if end==-1:
  end=len(text)
text=text[start:end]


In [None]:

# Character-level vocabulary: every distinct character in the corpus, in
# sorted order, so a character's position in `chars` can serve as its id.
chars=sorted(set(text))
vocab_size=len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '&', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '½', 'à', 'â', 'æ', 'è', 'é', 'œ', '—', '‘', '’', '“', '”']
89


In [None]:
# Lookup tables between characters and their integer ids (position in `chars`).
stoi=dict(zip(chars,range(len(chars))))
itos=dict(enumerate(chars))

def encode(s):
  """Return the list of integer ids for the characters of `s`.

  Raises KeyError for a character that is not in the vocabulary.
  """
  return [stoi[c] for c in s]

def decode(l):
  """Return the list of single characters for the ids in `l` (not joined)."""
  return [itos[i] for i in l]

In [None]:
# Encode the whole corpus as a single 1-D tensor of 64-bit token ids.
data=torch.tensor(encode(text),dtype=torch.int64)
print(data)
# The first 90% of the tokens form the training split; the rest is held out.
n=int(len(data)*0.9)
train_data,val_data=data[:n],data[n:]

tensor([42, 57, 54,  ...,  6,  6,  6])


In [None]:
block_size=256  # context length: number of tokens in each sampled sequence
batch_size=64   # number of sequences per batch
def get_batch(split="train",source=None):
  """Sample a random batch of (input, target) sequences for next-token prediction.

  Args:
    split: "train" (default) samples from `train_data`, "val" samples from
      `val_data`. Ignored when `source` is given.
    source: optional 1-D tensor of token ids to sample from instead of a
      named split.

  Returns:
    (x, y): two tensors of shape (batch_size, block_size). `y` is `x` shifted
    by one position, i.e. y[b, t] is the token that follows x[b, t].

  Raises:
    ValueError: if `split` is not "train" or "val", or if the chosen sequence
      is too short to cut a window of block_size + 1 tokens from.
  """
  if source is None:
    # Sample from the requested split only, so the held-out tokens never
    # leak into training batches.
    if split=="train":
      source=train_data
    elif split=="val":
      source=val_data
    else:
      raise ValueError(f"split must be 'train' or 'val', got {split!r}")
  if len(source)<=block_size:
    raise ValueError(
        f"need more than {block_size} tokens to sample a batch, got {len(source)}"
    )
  # The exclusive upper bound keeps i + block_size + 1 within the sequence.
  ix=torch.randint(0,len(source)-block_size,(batch_size,))
  x=torch.stack([source[i:i+block_size] for i in ix])
  y=torch.stack([source[i+1:i+block_size+1] for i in ix])
  return x,y

In [None]:
# Scratch check of the token-embedding lookup on one sampled batch.
# embed_dim is defined first so the table width and the constant used by the
# model cells below cannot drift apart.
embed_dim=256
token_embedding=nn.Embedding(vocab_size,embed_dim)
x,y=get_batch()
tok_emb=token_embedding(x)

In [None]:
# Scratch check of learned positional embeddings: one vector per position,
# added to the token embeddings of the sampled batch.
# The table width must equal the token-embedding width, hence embed_dim
# rather than a second literal.
pos_embeddings=nn.Embedding(block_size,embed_dim)
B,t=x.shape
pos_emb=pos_embeddings(torch.arange(t))
# pos_emb has one row per position and broadcasts across the batch dimension.
final_emb=tok_emb+pos_emb
final_emb

tensor([[[-2.8847,  0.3122,  1.4914,  ...,  1.2260, -1.3062,  1.2309],
         [-1.5981, -0.7848, -0.9217,  ...,  0.5559, -0.6817,  1.4021],
         [ 4.2962, -2.9619, -1.2294,  ..., -0.5634,  0.5531,  1.8800],
         ...,
         [ 0.0809, -1.9997,  1.3100,  ...,  2.3698, -0.0057, -0.4159],
         [ 2.5104,  0.5379,  0.2231,  ...,  0.0070, -1.5616,  3.0427],
         [-1.0484, -0.5306, -2.3856,  ..., -0.2717, -0.8803,  2.3013]],

        [[-1.2583,  1.2119,  1.1879,  ...,  1.1889, -0.6322, -1.7664],
         [ 1.0908, -0.5739, -0.7572,  ...,  0.2462,  0.8148,  2.8710],
         [ 2.3351, -1.9198,  0.3011,  ...,  1.0580, -0.0998,  0.2165],
         ...,
         [-0.9687, -1.9694, -0.7803,  ...,  0.8384, -1.3521, -0.3699],
         [ 0.5493,  1.5800,  1.7537,  ...,  1.6284, -2.2145,  1.3793],
         [-2.3255,  1.0034, -0.2325,  ..., -0.2045, -0.0505, -0.5200]],

        [[-1.7283, -0.6204,  1.6363,  ...,  2.3759,  0.4911,  0.2414],
         [-1.9924,  0.7313,  0.6973,  ...,  0

In [None]:
class self_attention(nn.Module):
  """A single head of causal (masked) scaled dot-product self-attention."""

  def __init__(self, embed_dim, head_size,block_size):
    super().__init__()
    # Bias-free projections from the embedding into this head's subspace.
    self.key=nn.Linear(embed_dim, head_size, bias=False)
    self.query=nn.Linear(embed_dim, head_size,bias=False)
    self.value=nn.Linear(embed_dim, head_size,bias=False)
    # Lower-triangular matrix of ones; registered as a buffer so it moves with
    # the module across devices without being a trainable parameter.
    lower_triangle=torch.ones(block_size, block_size).tril()
    self.register_buffer("tril", lower_triangle)

  def forward(self, x):
    """Attend over a 3-D input; the output's last dimension is head_size."""
    _,seq_len,_=x.shape
    keys=self.key(x)
    queries=self.query(x)
    values=self.value(x)
    # Dot-product scores, scaled by sqrt(head_size).
    scores=torch.matmul(queries,keys.transpose(-2,-1))/math.sqrt(keys.shape[-1])
    # Positions above the diagonal are "future" tokens: set them to -inf so
    # they receive zero weight after the softmax.
    is_future=self.tril[:seq_len,:seq_len]==0
    scores=scores.masked_fill(is_future,float('-inf'))
    weights=torch.softmax(scores,dim=-1)
    return torch.matmul(weights,values)

In [None]:
class MultiHeadAttention(nn.Module):
  """Several causal self-attention heads run in parallel, concatenated and projected.

  Args:
    num_heads: number of attention heads; must be positive and divide embed_dim.
    embed_dim: width of the input and output embeddings.
    context_len: maximum sequence length each head's causal mask supports.
      Defaults to the module-level `block_size` when omitted.

  Raises:
    ValueError: if num_heads is not positive or does not divide embed_dim.
  """
  def __init__(self, num_heads,embed_dim,context_len=None):
    super().__init__()
    # Without this check the concatenated head outputs would be narrower than
    # embed_dim and the failure would only surface later inside `proj`.
    if num_heads<=0 or embed_dim%num_heads!=0:
      raise ValueError(
          f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
      )
    if context_len is None:
      context_len=block_size
    head_size=embed_dim//num_heads
    self.heads=nn.ModuleList([self_attention(embed_dim,head_size,context_len) for _ in range(num_heads)])
    self.proj=nn.Linear(embed_dim,embed_dim)
  def forward(self,x):
    """Apply every head to `x`, concatenate along the last dimension, then project."""
    out=torch.cat([head(x) for head in self.heads],dim=-1)
    return self.proj(out)

In [None]:
class FeedForwardnn(nn.Module):
  """Two-layer MLP: expand to 4x the embedding width, apply ReLU, project back."""
  def __init__(self,embed_dim):
    super().__init__()
    hidden_dim=4*embed_dim
    expand=nn.Linear(embed_dim,hidden_dim)
    activation=nn.ReLU()
    contract=nn.Linear(hidden_dim,embed_dim)
    self.net=nn.Sequential(expand,activation,contract)
  def forward(self,x):
    """Run `x` through the MLP; the last dimension stays embed_dim."""
    return self.net(x)

In [None]:
class Block(nn.Module):
  """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""
  def __init__(self,embed_dim,num_heads):
    super().__init__()
    self.multi_head_attention=MultiHeadAttention(num_heads,embed_dim)
    self.ffn=FeedForwardnn(embed_dim)
    self.ln1=nn.LayerNorm(embed_dim)
    self.ln2=nn.LayerNorm(embed_dim)
  def forward(self, x):
    """Apply the attention sub-layer, then the feed-forward sub-layer."""
    # Normalisation is applied to each sub-layer's input, not to the residual.
    attended=self.multi_head_attention(self.ln1(x))
    hidden=x+attended
    transformed=self.ffn(self.ln2(hidden))
    return hidden+transformed

In [None]:
class gpt(nn.Module):
  """Decoder-only transformer language model over integer token ids.

  Args:
    vocab_size: number of distinct token ids.
    embed_dim: width of the token and position embeddings.
    block_size: maximum sequence length (size of the position table).
    num_layers: number of stacked `Block`s.
    num_heads: number of attention heads per block.
  """
  def __init__(self,vocab_size,embed_dim,block_size,num_layers,num_heads):
    super().__init__()
    self.token_emb=nn.Embedding(vocab_size,embed_dim)
    self.pos_emb=nn.Embedding(block_size,embed_dim)
    self.blocks=nn.Sequential(*[Block(embed_dim,num_heads) for _ in range(num_layers)])
    self.ln=nn.LayerNorm(embed_dim)
    self.lm_head=nn.Linear(embed_dim,vocab_size)
  def forward(self,idx,targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B, T) integer tensor of token ids, with T <= block_size.
      targets: optional (B, T) integer tensor of target token ids.

    Returns:
      `logits` of shape (B, T, vocab_size) when `targets` is None, otherwise
      the tuple `(logits, loss)` where `loss` is the mean cross-entropy.

    Raises:
      IndexError: if T exceeds the size of the position-embedding table.
    """
    B,T=idx.shape
    max_len=self.pos_emb.num_embeddings
    # Fail early with a readable message instead of an out-of-range lookup
    # inside the position-embedding table.
    if T>max_len:
      raise IndexError(
          f"sequence length {T} exceeds block_size {max_len}"
      )
    tok=self.token_emb(idx)
    pos=self.pos_emb(torch.arange(T,device=idx.device))
    x=tok+pos  # pos has one row per position and broadcasts over the batch
    x=self.blocks(x)
    x=self.ln(x)
    logits=self.lm_head(x)
    if targets is None:
      return logits
    B,T,V=logits.shape
    # cross_entropy wants (N, classes) against (N,); reshape (rather than
    # view) also accepts targets that are not contiguous in memory.
    loss=F.cross_entropy(logits.view(B*T,V),targets.reshape(B*T))
    return logits,loss

In [None]:
# Model size, model instance and optimiser.
num_heads = 8
num_layers = 6
model=gpt(vocab_size,embed_dim,block_size,num_layers,num_heads).to(device)
optimiser=Adam(model.parameters(),lr=3e-4)
# Training loop: one freshly sampled batch per step; the loss of that batch
# is printed every 100 steps.
for step in range(15000):
  xb,yb=(part.to(device) for part in get_batch())
  # Clear gradients left over from the previous step before accumulating new ones.
  optimiser.zero_grad()
  logits,loss=model(xb,yb)
  loss.backward()
  optimiser.step()
  if step%100!=0:
    continue
  print(f"step{step} | loss{loss.item():.4f}")

step0 | loss4.6608
step100 | loss2.5049
step200 | loss2.4205
step300 | loss2.3466
step400 | loss2.1719
step500 | loss1.9963
step600 | loss1.8746
step700 | loss1.7755
step800 | loss1.6942
step900 | loss1.6069
step1000 | loss1.5823
step1100 | loss1.5423
step1200 | loss1.5080
step1300 | loss1.4479
step1400 | loss1.3887
step1500 | loss1.3368
step1600 | loss1.3621
step1700 | loss1.3166
step1800 | loss1.2950
step1900 | loss1.2585
step2000 | loss1.2570
step2100 | loss1.1958
step2200 | loss1.1904
step2300 | loss1.2041
step2400 | loss1.1591
step2500 | loss1.1402
step2600 | loss1.1253
step2700 | loss1.1127
step2800 | loss1.0672
step2900 | loss1.0492
step3000 | loss1.0615
step3100 | loss1.0238
step3200 | loss1.0038
step3300 | loss0.9657
step3400 | loss0.9820
step3500 | loss0.9570
step3600 | loss0.9209
step3700 | loss0.9117
step3800 | loss0.8956
step3900 | loss0.8736
step4000 | loss0.8265
step4100 | loss0.8228
step4200 | loss0.8026
step4300 | loss0.7613
step4400 | loss0.7634
step4500 | loss0.7284


In [None]:
def generate(model,idx,max_new_tokens,context_size=None):
  """Autoregressively sample `max_new_tokens` tokens after the prompt `idx`.

  Args:
    model: an nn.Module that maps a (B, T) tensor of token ids to logits of
      shape (B, T, vocab_size).
    idx: (B, T0) integer tensor holding the prompt token ids.
    max_new_tokens: number of tokens to append to every row.
    context_size: maximum number of trailing tokens fed to the model at each
      step. Defaults to the module-level `block_size` when omitted.

  Returns:
    (B, T0 + max_new_tokens) tensor: the prompt followed by the sampled tokens.
  """
  if context_size is None:
    context_size=block_size
  # Sample in eval mode and without autograd: no graph is recorded for the
  # forward passes, and the caller's train/eval mode is restored afterwards.
  was_training=model.training
  model.eval()
  try:
    with torch.no_grad():
      for _ in range(max_new_tokens):
        # Keep only the most recent tokens so the model never sees more
        # positions than it supports.
        idx_cond=idx[:,-context_size:]
        logits=model(idx_cond)
        logits=logits[:,-1,:]  # only the last position predicts the next token
        probs=F.softmax(logits,dim=-1)
        idx_next=torch.multinomial(probs,num_samples=1)
        idx=torch.cat((idx,idx_next),dim=1)
  finally:
    model.train(was_training)
  return idx


In [None]:
# Encode a short prompt, sample 100 further characters from the trained
# model, and print the decoded result.
prompt = "sherlock holmes"
prompt_ids=encode(prompt)
# Wrapping the id list in another list gives the (1, T) batch shape directly.
idx=torch.tensor([prompt_ids],dtype=torch.long,device=device)
out=generate(model,idx,100)
output=decode(out[0].tolist())
print("".join(output))

sherlock holmes.
Our five got good-day, with a farnish and shown signs once the top of
his hand, pulled on with his


Raw bytes length: 587798
First 200 bytes: b'The Adventures of Sherlock Holmes\r\n\r\nby Arthur Conan Doyle\r\n\r\n\r\nContents\r\n\r\n   I.     A Scandal in Bohemia\r\n   II.    The Red-Headed League\r\n   III.   A Case of Identity\r\n   IV.    The Boscombe Valley'
