In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Import the datasets

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('./input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(text[:100])

--2024-08-04 07:37:16--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-08-04 07:37:17 (28.3 MB/s) - ‘input.txt’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Total unique characters:", vocab_size, end="\n\n")
print(chars)
print(''.join(chars))

Total unique characters: 65

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# encoder and decoder

In [4]:
char_to_number = {ch:i for i,ch in enumerate(chars)}
number_to_char = {i:ch for i,ch in enumerate(chars)}

# encoder: take a string, output a list of integers
encode = lambda s: [char_to_number[c] for c in s]

# decoder: take a list of integers, output a string
decode = lambda l: ''.join([number_to_char[i] for i in l])


# Train Val split

In [5]:
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Classes

In [6]:
class BigramLM(nn.Module):

  def __init__(self):
    super().__init__()
    self.embedding_table = nn.Embedding(vocab_size, n_embd)
    ## add position embeddings
    self.position_embeddings = nn.Embedding(block_size, n_embd)
    #self attention_head
                      # self.sa_heads = MultiHeadAttention(4, n_embd//4) # to make shape workout with the next layer...
                      # self.add_norm_1 = nn.LayerNorm(n_embd)
                      # self.ffwd = FeedForward(n_embd)
                      # self.add_norm_2 = nn.LayerNorm(n_embd)
    self.blocks = nn.Sequential(
        Block(n_embd,n_head = 4),
        Block(n_embd,n_head = 4),
        Block(n_embd,n_head = 4),
        nn.LayerNorm(n_embd)
    )
    self.lm_head = nn.Linear(n_embd, vocab_size)  # added a layer

  def forward(self, idx, targets=None):
    B,T = idx.shape

    tok_emb = self.embedding_table(idx) # B, T, n_emb
    pos_emb = self.position_embeddings(torch.arange(T, device = device))
    x = tok_emb + pos_emb
          #residual connections and add_norm
          # x = x +  self.add_norm_1(self.sa_heads(x))
          # x = x + self.add_norm_2(self.ffwd(x))
    x = self.blocks(x)
    logits = self.lm_head(x) # B, T, vocab_size

    if targets is None:
        loss=None
    else:
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

    return logits, loss


  def generate(self, idx, max_new_tokens):



    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:] # make sure we only have size of `T`
      logits, loss = self(idx_cond)
      logits = logits[:, -1, :]

      probs = F.softmax(logits, dim=-1) ## dim=-1: along the last dimension ~ here `1`

      next_idx = torch.multinomial(probs, num_samples=1)

      idx = torch.cat((idx, next_idx), dim=1)

    return idx

In [7]:
class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size = n_embd//n_head
    self.sa_heads = MultiHeadAttention(n_head, head_size) # to make shape workout with the next layer...
    self.add_norm_1 = nn.LayerNorm(n_embd)
    self.ffwd = FeedForward(n_embd)
    self.add_norm_2 = nn.LayerNorm(n_embd)


  def forward(self, x):

    x = x + self.add_norm_1(self.sa_heads(x))
    x = x + self.add_norm_2(self.ffwd(x))
    return x

In [8]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)#projcection layer
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    out =  torch.cat([h(x) for h in self.heads], dim = -1)
    out = self.dropout(self.proj(out))
    return out

In [9]:
class Head(nn.Module):
  # this class will create query, key and column tensors and also a registor buffer to mask the future tokens so they don't interact with the past tokens

  def __init__(self, head_size):
    super().__init__()
    self.query = nn.Linear(n_embd, head_size, bias = False)
    self.key = nn.Linear(n_embd, head_size, bias = False)
    self.value = nn.Linear(n_embd, head_size, bias = False)

    self.register_buffer('tril', torch.tril(torch.ones(block_size,block_size, device = device)))
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    # here te x input is the positions
    B,T,C = x.shape
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)

    wei = q @ k.transpose(-2,-1) * C**-0.05  # the C**-0.5 is used to control the variance
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float("-inf"))
    wei = F.softmax(wei,dim = -1);
    wei = self.dropout(wei)
    out = wei@ v;

    return out



In [10]:
class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),# projection layer
        nn.Dropout(dropout)
    )

  def forward(self, x):

    return self.net(x)


## Training functions

In [11]:
def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

In [12]:
@torch.no_grad()
def estimate_loss():
  '''
  This function takes the random samples from the dataset (based on the batch size)
  for `eval_iter` times. Records loss and takes the mean loss. And reports back.

  Which means, if we have the `eval_iter = 10` and `batch_size=32` then it will take
  32 random samples from training data and then validation data for 10 times and takes
  the means of these 10 losses.
  '''
  out = {}

  model.eval()

  for split in ['train', 'val']:
      losses = torch.zeros(eval_iters)
      for k in range(eval_iters):
          X, Y = get_batch(split)
          logits, loss = model(X, Y)
          losses[k] = loss.item()
      out[split] = losses.mean()


  model.train()
  return out

#Training

In [13]:
batch_size = 64      # samples we will use for the single forward pass
block_size = 256     # the context window (significantly bigger than our toy examples)
max_iters = 5000     # total forward-backward passes

eval_interval = 500  # after how many steps we want to print the loss?
learning_rate = 3e-4 # learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'

eval_iters = 200    # when printing the loss, how many samples to consider for validation?
n_embd = 384        # embedding size of each token
n_head = 8          # `n` multi heads for the self-attention
n_layers = 6        # `n` for `Nx` which shows how many blocks to use
dropout = 0.2

In [14]:
model = BigramLM()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

In [15]:
sum(len(i) for i in model.parameters())

23107

In [16]:
print(model)

BigramLM(
  (embedding_table): Embedding(65, 384)
  (position_embeddings): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (query): Linear(in_features=384, out_features=96, bias=False)
            (key): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (add_norm_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )

In [19]:
for step in range(max_iters):

  if step % eval_interval == 0:
    losses = estimate_loss()
    print(f"[Step {step}]: Train Loss~{losses['train']:.4f}, Val Loss~{losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

[Step 0]: Train Loss~2.0133, Val Loss~2.1434
[Step 500]: Train Loss~1.9977, Val Loss~2.1301
[Step 1000]: Train Loss~1.9573, Val Loss~2.0967
[Step 1500]: Train Loss~1.8014, Val Loss~1.9597
[Step 2000]: Train Loss~1.6768, Val Loss~1.8463
[Step 2500]: Train Loss~1.6123, Val Loss~1.7983
[Step 3000]: Train Loss~1.5799, Val Loss~1.7669
[Step 3500]: Train Loss~1.5513, Val Loss~1.7535
[Step 4000]: Train Loss~1.5309, Val Loss~1.7269
[Step 4500]: Train Loss~1.5143, Val Loss~1.7200


In [20]:
output = decode(
    model.generate(
        idx = torch.zeros((1, 1),
                          dtype=torch.long,
                          device=device),  ### 🗽 Transfer to device 🗽 ###
        max_new_tokens=500)[0].tolist()
)
print(output)



PARIS:
Take her, tongue; only; sconce, some at that but
Like it is the his on you deachonters, s; the nalidenanst to see grall afty han sold first
Here say solding that I will him!

GLOUCIO:
Sinatize arenceiend.

Lord.

I graciore an hidwine their c.
And fot of islewise
much ther? I for that it as the s,
all I land that d incal not your cour k t spile the to this may isu?
'
That's shepherer:
I am his for from and ravarl of am, is is blore,
And, onld nig to mads whethise
And at it to-merchat to 


In [21]:
torch.save(model.state_dict(), 'Final_GPT.pth')

# Loading the state dictionary
model = BigramLM()  # Initialize the model
model.load_state_dict(torch.load('Final_GPT.pth'))
model = model.to(device)
model.eval()

BigramLM(
  (embedding_table): Embedding(65, 384)
  (position_embeddings): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (sa_heads): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (query): Linear(in_features=384, out_features=96, bias=False)
            (key): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (add_norm_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )

In [24]:
torch.tensor(encode("Thou art so"))

tensor([32, 46, 53, 59,  1, 39, 56, 58,  1, 57, 53])

In [29]:
output = decode(
    model.generate(
        idx = torch.tensor(encode("Thou art so beau"), dtype=torch.long, device = device).unsqueeze(0),  ### 🗽 Transfer to device 🗽 ###
        max_new_tokens=10)[0].tolist()
)
print(output)

Thou art so beauty thy be



Well, i wanted it to say beautiful, this is also okay 😀

In [34]:
output = decode(
    model.generate(
        idx = torch.tensor(encode("To be or not to"), dtype=torch.long, device = device).unsqueeze(0),  ### 🗽 Transfer to device 🗽 ###
        max_new_tokens=3)[0].tolist()
)
print(output)

To be or not to be


##this is nice