# Char-GPT Implementation from Scratch

In [None]:
# Download the train data text
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-10-30 17:44:53--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-10-30 17:44:53 (16.8 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# Load the whole downloaded corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Report the corpus size in characters (thousands separated by commas).
num_chars = len(text)
print(f"The length of the Dataset is: {num_chars:,}")

The length of the Dataset is: 1,115,394


In [None]:
# Preview the first 1000 characters of the corpus under a small header.
header = "Tiny-Shakespear Dataset"
rule = "----------------------------------------------\n"
print(header, rule, text[:1000], sep="\n")

Tiny-Shakespear Dataset
----------------------------------------------

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the 

In [None]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
unique_chars = list(set(text))
unique_chars.sort()
vocab_size = len(unique_chars)

print(f"Vocabulary Size: {vocab_size}\n")
print("The unique characters in the vocabulary are:", ''.join(unique_chars), sep="\n")

Vocabulary Size: 65

The unique characters in the vocabulary are:

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
# Two lookup tables over the vocabulary: id -> character and character -> id.
itos = dict(enumerate(unique_chars))
stoi = {ch: idx for idx, ch in itos.items()}

def encode(text):
    """Return the list of integer ids for the characters of `text`, looked up in `stoi`."""
    ids = []
    for ch in text:
        ids.append(stoi[ch])
    return ids

def decode(indexes):
    """Return the string obtained by mapping each id in `indexes` through `itos`."""
    chars = (itos[idx] for idx in indexes)
    return "".join(chars)

# Round-trip check: encode a sample word, then decode the ids back to text.
sample_ids = encode('raghav')
print(sample_ids)
print(decode(sample_ids))

[56, 39, 45, 46, 39, 60]
raghav


In [None]:
# now lets apply encode to whole data text and form our data as PyTorch Tensors
import torch

# Encode the full corpus once and keep it as a 1-D tensor of int64 token ids.
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)

print(f"Data Shpae: {len(data):,} \n Data Type: {data.dtype}")
print("A sample of data", data[:100], sep="\n")

Data Shpae: 1,115,394 
 Data Type: torch.int64
A sample of data
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [None]:
# Hold out the tail of the corpus for validation: first 90% is train, the rest is val.
train_size = int(len(data) * 0.9)
train_data, val_data = data[:train_size], data[train_size:]

In [None]:
# Maximum context length (in tokens) used for predictions.
block_size = 8
# Show the first block_size + 1 training tokens.
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x_train, y_train = train_data[:block_size], train_data[1:block_size + 1]

# Each prefix of x_train is a context; its target is the token that follows it.
for t, output in enumerate(y_train):
    context = x_train[:t + 1]
    print(f"For the context/input {context}, we need output/generated word {output}")

For the context/input tensor([18]), we need output/generated word 47
For the context/input tensor([18, 47]), we need output/generated word 56
For the context/input tensor([18, 47, 56]), we need output/generated word 57
For the context/input tensor([18, 47, 56, 57]), we need output/generated word 58
For the context/input tensor([18, 47, 56, 57, 58]), we need output/generated word 1
For the context/input tensor([18, 47, 56, 57, 58,  1]), we need output/generated word 15
For the context/input tensor([18, 47, 56, 57, 58,  1, 15]), we need output/generated word 47
For the context/input tensor([18, 47, 56, 57, 58,  1, 15, 47]), we need output/generated word 58


In [None]:
# Batch configuration read by get_batch.
batch_size = 4 # number of independent sequences processed in parallel
block_size = 8 # maximum context length for predictions

# Seed torch's global RNG so the sampled batches are reproducible.
# The value encodes when this code was written (DDMMYYhhmm): 28-10-2024, 04:09.
seed_val = 2810240409
torch.manual_seed(seed_val)

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    `split` selects `train_data` when it equals 'train'; any other value
    selects `val_data`. Returns `(x_b, y_b)`: each stacks `batch_size`
    windows of `block_size` tokens, and every `y_b` window is the matching
    `x_b` window shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # Start offsets are capped so the shifted target window still fits inside `source`.
    starts = torch.randint(low=0, high=len(source) - block_size, size=(batch_size, ))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i + block_size])
        targets.append(source[i + 1:i + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

x_b, y_b = get_batch("train")

# Show the sampled batch: inputs first, then the shifted targets.
print(f'Input Shape: {x_b.shape}', x_b, sep='\n')
print(f'\nOutput Shape: {y_b.shape}', y_b, sep='\n')

Input Shape: torch.Size([4, 8])
tensor([[ 0, 31, 46, 39, 50, 50,  1, 40],
        [15, 17, 10,  0, 21,  1, 46, 43],
        [ 1, 54, 56, 43, 57, 57, 43, 57],
        [44,  1, 41, 59, 56, 57, 43, 42]])

Output Shape: torch.Size([4, 8])
tensor([[31, 46, 39, 50, 50,  1, 40, 43],
        [17, 10,  0, 21,  1, 46, 43, 39],
        [54, 56, 43, 57, 57, 43, 57,  1],
        [ 1, 41, 59, 56, 57, 43, 42,  1]])


In [None]:
# Spell out every (context, target) pair contained in the sampled batch.
for b in range(batch_size):
    print(f'Batch Number - {b}')
    row_in, row_out = x_b[b], y_b[b]
    for t in range(block_size):
        context, output = row_in[:t + 1], row_out[t]
        print(f"For the context {context}, we need output {output}")


Batch Number - 0
For the context tensor([0]), we need output 31
For the context tensor([ 0, 31]), we need output 46
For the context tensor([ 0, 31, 46]), we need output 39
For the context tensor([ 0, 31, 46, 39]), we need output 50
For the context tensor([ 0, 31, 46, 39, 50]), we need output 50
For the context tensor([ 0, 31, 46, 39, 50, 50]), we need output 1
For the context tensor([ 0, 31, 46, 39, 50, 50,  1]), we need output 40
For the context tensor([ 0, 31, 46, 39, 50, 50,  1, 40]), we need output 43
Batch Number - 1
For the context tensor([15]), we need output 17
For the context tensor([15, 17]), we need output 10
For the context tensor([15, 17, 10]), we need output 0
For the context tensor([15, 17, 10,  0]), we need output 21
For the context tensor([15, 17, 10,  0, 21]), we need output 1
For the context tensor([15, 17, 10,  0, 21,  1]), we need output 46
For the context tensor([15, 17, 10,  0, 21,  1, 46]), we need output 43
For the context tensor([15, 17, 10,  0, 21,  1, 46, 43

For each batch, our model will have batch_size * block_size examples to train on, i.e., <br>
4 * 8 = 32 training examples.

In [None]:
# Display the sampled batch of token ids again for reference.
x_b # Input to our model

tensor([[ 0, 31, 46, 39, 50, 50,  1, 40],
        [15, 17, 10,  0, 21,  1, 46, 43],
        [ 1, 54, 56, 43, 57, 57, 43, 57],
        [44,  1, 41, 59, 56, 57, 43, 42]])

## Training a Baseline LM

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLM(nn.Module):
    """Bigram character-level language model.

    Each token id indexes one row of a (vocab_size, vocab_size) embedding
    table, and that row is used directly as the logits for the next token.
    """

    def __init__(self, vocal_size):
        # NOTE: the parameter is spelled `vocal_size` (sic); the name is kept so
        # existing callers keep working. It is the vocabulary size.
        super().__init__()
        self.model_embeddings = nn.Embedding(vocal_size, vocal_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids (B = batch dim, T = time dim).
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, `logits` is (B, T, vocab_size)
            and `loss` is None. With targets, `logits` is flattened to
            (B*T, vocab_size) and `loss` is the cross-entropy over all B*T positions.
        """
        logits = self.model_embeddings(idx) # (B, T, vocab_size)

        # Identity check: `targets == None` would go through Tensor.__eq__.
        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad() # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_tokens):
        """Autoregressively append `max_tokens` sampled tokens to `idx`.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_tokens: number of tokens to sample.

        Returns:
            (B, T + max_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_tokens):
            logits, _ = self(idx) # (B, T, C)
            logits = logits[:, -1, :] # keep only the last timestep -> (B, C)
            probs = F.softmax(logits, dim=-1)
            next_idx = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, next_idx), dim=1) # (B, T + 1)
        return idx

In [None]:
# Untrained model: inspect the logits shape and the loss on the sample batch.
m = BigramLM(vocab_size)
logits, loss = m(x_b, y_b)
print(logits.shape, loss, sep='\n')

# Sample 100 tokens, starting from a single token with id 0.
starting_index = torch.zeros((1,1), dtype=torch.long)
generated_text = m.generate(idx=starting_index, max_tokens=100)
sampled_ids = generated_text[0].tolist()
print(decode(sampled_ids))

torch.Size([32, 65])
tensor(4.4184, grad_fn=<NllLossBackward0>)

ZJE&;ChYFMQy'JX hXChATmVPUuweVLoX r-?U y,AUZwk.SLvKPUXDL CiBvlDRIhApZYKtpemgSLZC.ea.iLEaNafoqU:UqP;j


In [None]:
# Look up the character behind one token id of the sample batch (row 0, position 2).
token_id = x_b[0, 2]
token = itos[int(token_id)]

print(f"\nEmbeddings for the token: {token} with token id {token_id}")


Embeddings for the token: h with token id 46


In [None]:
# AdamW over all parameters of the bigram model.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [None]:
# Train the bigram model with a larger batch; get_batch reads the global batch_size.
batch_size = 32
max_iters = 10000

# `step` rather than `iter`: a global named `iter` would shadow the builtin
# iter() for every cell that runs afterwards.
for step in range(max_iters):
    # Sample a fresh random batch of training windows.
    xb, yb = get_batch('train')

    # Forward pass, then a standard zero-grad / backward / update step.
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last sampled batch only (not an average over the run).
print(loss.item())

2.603006601333618


In [None]:
# Sample 300 tokens from the model, again starting from a single token with id 0.
starting_index = torch.zeros((1,1), dtype=torch.long)
generated_text = m.generate(idx=starting_index, max_tokens=300)
sampled_ids = generated_text[0].tolist()
print(decode(sampled_ids))



Shag.
Q:
pereatFitoomatis.
Whimat lomt tr:
RYo mby may nd,DWjowa ro Whisend stha ug burth; ILashay thand ARI ndghaggimang dmoQWhe thare o s mbym h o stciombloous?
LELE:

NIONG homak ts on de f send belps y htVI lof ne sscitediriarve aturerthy sthend tVed I:
NGamet, vis d thealoule m'ThaveFiakne o'd


In [None]:
# Re-seed torch's global RNG before the experiments that follow.
new_seed = 3010242318
torch.manual_seed(seed=new_seed)

<torch._C.Generator at 0x7cedda578510>

In [None]:
# Toy example: a row-normalised lower-triangular matrix turns a matmul into
# a running average over the rows of `b`.
a = torch.tril(torch.ones(3, 3))
a /= a.sum(dim=1, keepdim=True)  # each row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)  # row i of c is the mean of rows 0..i of b
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[0., 9.],
        [4., 0.],
        [6., 0.]])
--
c=
tensor([[0.0000, 9.0000],
        [2.0000, 4.5000],
        [3.3333, 3.0000]])


In [None]:
# Method 1: explicit loops. xbow[b, t] is the mean of x[b, 0..t], i.e. each
# position averages itself with everything before it.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t + 1].mean(dim=0)

In [None]:
# Method 2: the same running average as one matrix multiplication.
weights = torch.ones((T, T)).tril()
weights = weights / weights.sum(dim=1, keepdim=True)  # rows sum to 1
xbow2 = torch.matmul(weights, x)   # (T, T) @ (B, T, C) -> (B, T, C)

# Check the result against the loop-based version.
torch.allclose(xbow2, xbow)

True

In [None]:
# Method 3: the same averaging weights obtained from a masked softmax.
tril = torch.tril(torch.ones(T, T))
# Start from all-zero scores and set every masked (upper-triangle) entry to -inf.
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # -inf entries become 0; the rest are uniform per row
xbow3 = wei @ x

# Check the result against the loop-based version.
torch.allclose(xbow3, xbow)

True

In [None]:
# Self-Attention Mechanism: a single causal head using scaled dot-product attention.
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C) # (B, T, C)

head_size = 16

# Bias-free linear projections from C channels to head_size (H) features.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, H)
q = query(x) # (B, T, H)
v = value(x) # (B, T, H)

# Scores are scaled by 1/sqrt(head_size) so their variance does not grow with
# the head size; unscaled scores push the softmax towards one-hot rows.
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # (B, T, H) @ (B, H, T) --> (B, T, T)
# Causal mask: position t may only attend to positions <= t.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # every row is a distribution over the visible positions

out = wei @ v # (B, T, T) @ (B, T, H) --> (B, T, H)
print(out.shape)
print(out[0])

torch.Size([4, 8, 16])
tensor([[ 0.4408, -0.0845, -0.2914, -0.0040, -0.1214, -0.0151, -0.0061,  0.0641,
         -0.5656, -0.3850,  0.1446,  0.2683,  0.3358,  0.2524,  0.0348, -0.0530],
        [ 0.2834,  0.0955,  0.0221,  0.2278,  0.1417, -0.1201, -0.0363, -0.0793,
          0.0639,  0.0893,  0.1220,  0.1180,  0.1415,  0.0967,  0.2284,  0.1327],
        [ 0.2286,  0.1927,  0.1720,  0.2678,  0.3195, -0.0509,  0.0742, -0.1928,
          0.3251,  0.3427,  0.1004,  0.0201, -0.0050,  0.0344,  0.3790,  0.0696],
        [ 0.2327,  0.1634,  0.1187, -0.1020,  0.2671,  0.1542,  0.3511, -0.1045,
          0.1681,  0.2750,  0.0609,  0.1405, -0.1215,  0.0770,  0.3897, -0.2454],
        [-0.0530, -0.0823,  0.2314, -0.4032, -0.2570, -0.3784,  0.2028,  0.5928,
          0.7820,  0.2318, -0.1211,  0.8496, -0.1356,  0.0364, -0.1285,  0.3486],
        [-0.4204, -0.4193,  0.1771, -0.6056,  0.1721, -0.3711, -0.1882,  0.1966,
          0.3316,  0.3388, -0.3928, -0.3039, -0.5216, -0.0107,  0.3781,  0.0348],

In [None]:
class LayerNorm1d: # (used to be BatchNorm1d)
  """Minimal layer normalisation over the last (feature) dimension.

  Every example is normalised independently across its `dim` features,
  then scaled by `gamma` and shifted by `beta`.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    # `momentum` is accepted but never used; presumably a leftover from the
    # batch-norm version of this class. Kept so the signature stays unchanged.
    self.eps = eps # added to the variance for numerical stability
    self.gamma = torch.ones(dim) # per-feature scale
    self.beta = torch.zeros(dim) # per-feature shift

  def __call__(self, x):
    """Normalise `x` along its last dimension; the output has the shape of `x`."""
    # Statistics are taken over the feature axis (-1), not the batch axis. This
    # matches the old hard-coded dim 1 for (N, dim) inputs and also handles
    # (B, T, dim) inputs, where dim 1 would have been the time axis.
    xmean = x.mean(-1, keepdim=True) # per-example mean over features
    xvar = x.var(-1, keepdim=True) # per-example variance over features (torch's default estimator)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the parameter tensors `[gamma, beta]`."""
    return [self.gamma, self.beta]

In [None]:
# Normalise a random (32, 100) batch and inspect statistics along each axis.
layer_norm = LayerNorm1d(100)
x = layer_norm(torch.randn(32, 100))
first_row, first_col = x[0, :], x[:, 0]
print(first_row.mean(), first_row.std())  # one example across its features
print(first_col.mean(), first_col.std())  # one feature across the batch

tensor(1.3113e-08) tensor(1.0000)
tensor(-0.3759) tensor(1.0856)


tensor([[-0.0379, -1.8235,  0.5097,  ...,  0.8475, -1.5947,  0.7036],
        [-2.2047,  0.8130, -1.9337,  ..., -0.4588, -0.1052, -0.3987],
        [-0.5901, -0.2190, -0.0819,  ..., -0.3653,  0.3196, -0.3808],
        ...,
        [-1.0341,  0.9762,  0.3826,  ...,  1.6429,  0.1085,  0.4423],
        [-0.2063,  1.5523,  0.2752,  ..., -0.1825,  0.2933,  0.3834],
        [ 0.4230, -1.2092, -0.0720,  ...,  0.9585, -0.6663, -1.0341]])