In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
with open('/content/attention-gpt-input.txt','r',encoding='utf-8') as f:
  text = f.read()

In [4]:
len(text)

1115394

In [None]:
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [None]:
chars=sorted(list(set(text)))
vocab_size=len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


#Step 1: Text Processing & Tokenization

**What I implemented:**

Calculated the number of unique characters in the text.

Created stoi (string-to-index) and itos (index-to-string) dictionaries.

Defined encode and decode functions to convert text to numeric sequences and back.

**Why this step is important:**

Neural networks cannot process raw text, they require numbers.

Encoding characters as integers allows the model to learn patterns in sequences.

Decoding lets us convert the model’s output back to readable text for evaluation.

In [None]:
len(text)
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))



 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [None]:
import torch
data=torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

#Step 2: Creating Input-Target Pairs for Training

What I implemented:

Split the dataset into training (90%) and validation (10%).

Defined a block size (sequence length the model sees at once).

Created a preview of context-target pairs:

context = tokens the model sees

target = next token the model should predict

Why this step is important:

GPT is autoregressive: it predicts the next token based on previous tokens.

Generating context-target pairs is how the model learns the sequence structure.

Previewing the pairs ensures that batching and sequence slicing are working correctly before training.

In [None]:
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]

In [None]:
block_size=8
train_data[:block_size+1]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
x=train_data[:block_size]
y=train_data[1:block_size+1]
for t in range(block_size):
  context=x[:t+1]
  target=y[t]
  print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


#Step 3: Creating Mini-Batches
What I implemented:

Set a batch size (batch_size = 4) and block size (block_size = 8).

Created a function get_batch to generate mini-batches for training/validation:

Randomly select starting positions in the data

Slice sequences of length block_size for inputs (x)

Slice the next token for targets (y)

Visualized the context-target pairs for every batch and token to verify correctness.

Why this step is important:

Deep learning models train on batches to speed up computation and stabilize gradient updates.

Each batch provides multiple sequences for the model to learn in parallel.

Shifting x and y by 1 token ensures the model learns next-token prediction (autoregressive learning).

Visualizing context-target pairs helps debug and understand what the model sees.

In [None]:
torch.manual_seed(1337)
batch_size=4
block_size=8

def get_batch(split):
  data=train_data if split=='train' else val_data
  ix=torch.randint(len(data)-block_size,(batch_size,))
  x=torch.stack([data[i:i+block_size] for i in ix])
  y=torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x,y





xb,yb=get_batch('train')
print("inputs: ")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("----")

for b in range(batch_size):
  for t in range(block_size):
    context=xb[b,:t+1]
    target=yb[b,t]
    print(f"when input is {context.tolist()} the target: {target}")


inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 5

In [None]:
xb

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

#Step 4: Bigram Language Model
**What I implemented:**

I developed a Bigram language model using a single embedding layer.

**Embedding Layer**: The embedding layer learns to map each token (word or character) into a vector space. Each token is associated with a unique vector, which is then used to predict the next token. These embeddings are initially random but get adjusted during training.

**Logits**: From the embeddings, the model calculates logits—raw scores that represent the likelihood of each token being the next one in the sequence. These logits are not probabilities yet, but they can be converted to probabilities using the softmax function.

**Forward Pass**: In the training loop, the model performs a forward pass to compute the logits for each token. Then, it calculates cross-entropy loss, which measures the difference between the model's predicted token distribution and the actual next token in the sequence.

**Text Generation (Autoregressive Sampling)**: To generate new text:

The model predicts the next token based on the current context (the sequence of tokens so far).

The predicted token is added to the context.

This process repeats, and the model continues to generate new tokens until the desired length of the sequence (max_new_tokens) is reached.


**Why this step is important:**

Serves as a baseline model: captures simple character-to-character dependencies.

Helps verify the data pipeline and training loop before building more complex models.

Introduces autoregressive generation, the key idea behind GPT: predicting the next token given previous tokens.

Shows how softmax + multinomial sampling can produce new text.

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table=nn.Embedding(vocab_size,vocab_size)

  def forward(self,idx,targets=None):
    logits=self.token_embedding_table(idx)
    if targets is None:
      loss=None
    else:
      B,T,C=logits.shape
      logits=logits.view(B*T,C)
      targets=targets.view(B*T)
      loss=F.cross_entropy(logits,targets)

    return logits ,loss

  def generate(self,idx,max_new_tokens):
    #idx is (B,T) array of the indices in the current context
    for _ in range(max_new_tokens):
      #get the predictions
      logits,loss=self(idx)
      # focus only on the last time step
      logits=logits[:,-1,:] #becomes (B,C)
      probs=F.softmax(logits,dim=-1)#(B,c)
      idx_next=torch.multinomial(probs,num_samples=1)#(B,1)
      # print(f"idx_next={idx_next.shape}")
      idx=torch.cat((idx,idx_next),dim=1)#(B,T+1)
    return idx


model = BigramLanguageModel()
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [None]:
decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())

'\npJ:Bpm&yiltNCjeO3:Cx&vvMYW-txjuAd IRFbTpJ$zkZelxZtTlHNzdXXUiQQY:qFINTOBNLI,&oTigq z.c:Cq,SDXzetn3XVj'

#Step 5: Training the Model

**What I implemented:**

Initialized the Bigram model and optimizer (AdamW).

Wrote a loss evaluation function (estimate_loss) to calculate average training and validation loss without updating model parameters.

Implemented the training loop:

Every iteration, sample a batch with get_batch

Forward pass to compute logits and loss

Backpropagation with loss.backward()

Update parameters using optimizer.step()

Periodically evaluated the model on training and validation data.

Printed the final loss and generated sample text to observe learning progress.

**Why this step is important:**

Training loop is the core of model learning.

Evaluating both training and validation loss helps monitor overfitting.

Using optimizer.zero_grad() and loss.backward() implements gradient descent, allowing the model to learn from data.

Sampling new text after training shows the model’s ability to generate sequences, validating that it learned patterns.

This step demonstrates end-to-end training, from data batching to prediction.

In [None]:
model = BigramLanguageModel()
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
optimizer=torch.optim.AdamW(model.parameters(),learning_rate)

In [None]:
@torch.no_grad()
def estimate_loss():
  out={}
  model.eval()
  for split in ['train','val']:
    losses=torch.zeros(eval_iters)
    for k in range(eval_iters):
      X,Y=get_batch(split)
      logits,loss=model(X,Y)
      losses[k]=loss.item()
    out[split]=losses.mean()
  model.train()
  return out


In [None]:
for iter in range(max_iters):
  if iter%eval_interval==0:
    losses=estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb,yb=get_batch('train')

  logits,loss=model(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

step 0: train loss 4.7364, val loss 4.7156
step 500: train loss 4.3640, val loss 4.3970
step 1000: train loss 4.0319, val loss 4.0540
step 1500: train loss 3.7849, val loss 3.7783
step 2000: train loss 3.5476, val loss 3.5450
step 2500: train loss 3.3429, val loss 3.3521
step 3000: train loss 3.2083, val loss 3.2211
step 3500: train loss 3.0894, val loss 3.0913
step 4000: train loss 2.9662, val loss 2.9910
step 4500: train loss 2.8842, val loss 2.9064
step 5000: train loss 2.8083, val loss 2.8542
step 5500: train loss 2.7675, val loss 2.7911
step 6000: train loss 2.7200, val loss 2.7098
step 6500: train loss 2.6643, val loss 2.6798
step 7000: train loss 2.6811, val loss 2.6621
step 7500: train loss 2.6318, val loss 2.6400
step 8000: train loss 2.5938, val loss 2.6144
step 8500: train loss 2.5892, val loss 2.5764
step 9000: train loss 2.5758, val loss 2.5973
step 9500: train loss 2.5740, val loss 2.5681
step 10000: train loss 2.5684, val loss 2.5385
step 10500: train loss 2.5405, val lo

In [None]:
idx=torch.zeros((1,1),dtype=torch.long)
print(decode(model.generate(idx,max_new_tokens=1000)[0].tolist()))



e winatllorer lmye
Ana dive m s inem e tat f:
ALEdnt tha mitrks ban? w.
ULAngseou:

Wrchumed th pul F s, d moind se is he, ty t bu
DWAsullo kinghou at qSuledes cabre velerkearthe I y
NARICArey me fr'd nd t GSh pth t y lige n tencoulf ad ded,
ARisea ad hean, tagr; mad LEEno ge at ver;
wirwn;

CHac,
Wave as the irn igit d.
HNGayave sed a-garesammy sand t s-VINom'sp n:
Bothow,
Nosathomy.
Fur owind yorise biuppee w ls he thout t; is RD ig w an
A:
tl ane kee eve,
Artcke m as h weney, e.
CHowes f?

Ha a't GHAur.
APolld.


Ge pas
Hathidaw, wilvewounowh lloularoulourithertorer ig'daven'sle,
YCHot:
Wht Bouredik YOn
Tomernorik, an,
NGid w'dy; tly ad han,
S incke s mokere ifot tis has me:
Allcceere, wrery, shasstigsout f verne nre sous, be to e tor othot'Whtit keal tr k,
Thin
bu.
Ant nelads:
RYR:

ETous tht, illen din'st, wilpr l that!q'sifast d:
To brithin dau ns, ak!
Whar's-amy; vel whimyothevewhalloler p IO:
WI s wak Fld st, thak, w s; adore!
A:
VO, IAGen Y sthuid la, the, me wshar frrt sh'do

#Step 6: Understanding the Math Behind Self-Attention
What I Implemented:

**Lower-Triangular Matrix (a):**
I created a lower-triangular matrix a to ensure each token only attends to previous tokens (and not future tokens), following the autoregressive nature of self-attention.

**Normalization:**
I normalized each row of the matrix a so that the values sum to 1. This step turns the values into attention weights, representing how much each token should attend to other tokens.

**Matrix Multiplication (with b):**
I multiplied the lower-triangular matrix a with a matrix b (representing token embeddings) to get a new matrix c. This matrix c contains the weighted sum of the token features, where each token’s representation is a combination of the previous tokens, based on the attention weights.

**Why This Is Important:**

**Attention as Weighted Sum:** This process demonstrates how self-attention works by calculating a weighted sum of token representations. The weights are determined by the attention mechanism.

**Autoregressive Process:** By using the lower-triangular matrix, tokens only attend to previous tokens, ensuring that the model doesn’t cheat by looking ahead in the sequence.

**Efficient Computation:** This approach shows how self-attention can be computed using matrix multiplication, a highly efficient method that allows models like Transformers to process long sequences in parallel.

In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"

torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
torch.manual_seed(1337)
B,T,C=4,8,2
x=torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
xbow=torch.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev=x[b,:t+1]
    xbow[b,t]=torch.mean(xprev,0)


In [None]:
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

**What I Implemented:**

I replaced the nested loops with matrix multiplication, allowing me to compute all cumulative averages in a single operation.

**Why This Is Important:**

This highlights a key efficiency improvement in self-attention:

Rather than iterating over tokens one by one, **matrix multiplication** enables us to compute all weighted sums in parallel with a single operation.

This is exactly how **modern Transformer models** handle attention calculations efficiently, leveraging matrix operations to process large sequences quickly.

In [None]:
#version 2:using matrix multiply for weighted aggregation

wei=torch.tril(torch.ones(T,T))
print(f"wei={wei}")
wei=wei/wei.sum(1,keepdim=True)
print(f"wei={wei}")
xbow2=wei@x
xbow2

wei=tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
wei=tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])


**What I Implemented:**

**Masking Future Positions:**
I applied a mask to future positions in the attention matrix by setting them to -infinity (-∞). This prevents the model from attending to future tokens when processing the current token, maintaining the autoregressive property.

**Softmax Normalization:**
I used the softmax function to normalize the attention scores. This converts the raw attention scores (which could be any real number) into a probability distribution. The resulting values are weights that represent how much focus each token should have on the others.

**Weighted Aggregation:**
After applying softmax, I multiplied the attention weights by the value vectors (which represent the token features). This step computes the final weighted sum of the token features, aggregating information from tokens that the current token is attending to.

**Why This Is Important:**

This step demonstrates the core mathematical trick behind self-attention:

**Compute Attention Scores:** The first step is to calculate how much attention each token should give to others. This is done through similarity measures (like dot products).

**Mask Future Tokens:** We then apply the mask to future tokens to ensure that each token only "sees" the tokens before it (and itself), preventing any leakage of future information.

**Apply Softmax:** The softmax function turns these attention scores into probabilities, essentially creating attention weights. These weights determine how much influence each token has on the current token’s representation.

**Multiply by Value Vectors:** Finally, the attention weights are used to compute a weighted sum of the value vectors, which aggregates the relevant information for each token.

This entire process allows each token to focus selectively on the previous tokens in the sequence, gathering relevant context while respecting the order of the sequence. This is essential for tasks like language modeling, where each token needs to make predictions based only on its prior context.

In [None]:
#version 3:use softmax

tril=torch.tril(torch.ones(T,T))
print(f"tril={tril}")
wei=torch.zeros((T,T))
wei=wei.masked_fill(tril==0,float('-inf'))
print(f"wei={wei}")
wei=F.softmax(wei,dim=-1)
print(wei)
wei=wei@x


tril=tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
wei=tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500

#Step 7: Self-Attention Head

In this code, we’re implementing a basic **self-attention mechanism**. The goal is to allow each token in a sequence to "attend" to all other tokens, including itself, based on the key, query, and value vectors. Here's a simplified breakdown of what's happening in each step:

**Key, Query, Value:**

**Key**: Encodes information about the token that other tokens will use to decide how much they should "attend" to it.

**Query**: Represents the current token and how much attention it should give to other tokens.

**Value**: Represents the actual information or content that will be passed forward after attention is applied.

**Self-Attention:**
The self-attention mechanism compares the query of each token to the keys of all tokens (including itself) to calculate an attention score. This score determines how much focus (weight) each token should place on others. The weighted sum of values produces the final output for each token.

**Masking Future Tokens:**
In this case, the model is performing autoregressive self-attention, so we need to prevent future tokens from influencing the current token’s output. This is done by applying a mask.

**Final Output:**
After applying the attention mechanism, we get a new representation for each token, which is a weighted sum of the values, based on the attention scores.

In [None]:
# Set the random seed for reproducibility
torch.manual_seed(1337)

# Define the shape of the input tensor
B, T, C = 4, 8, 32  # Batch size, sequence length, feature dimension
x = torch.randn(B, T, C)  # Random input tensor with shape (B, T, C)

# Define the size of the attention head (output size for each token's attention)
head_size = 16

# Define the Linear layers to project input x into key, query, and value representations
key = nn.Linear(C, head_size, bias=False)  # Projects input into key representation
query = nn.Linear(C, head_size, bias=False)  # Projects input into query representation
value = nn.Linear(C, head_size, bias=False)  # Projects input into value representation

# Compute the key, query, and value tensors by applying the linear layers
k = key(x)  # Shape: (B, T, head_size) => (4, 8, 16)
q = query(x)  # Shape: (B, T, head_size) => (4, 8, 16)

# Compute the attention scores using the dot product between query and key
wei = q @ k.transpose(-2, -1)  # Shape: (B, T, T) => (4, 8, 8)


torch.Size([4, 8, 16])

In [None]:
out[0]

tensor([[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007, -0.5239,
         -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,  0.2862,  0.5710],
        [ 0.6764, -0.5477, -0.2478,  0.3143, -0.1280, -0.2952, -0.4296, -0.1089,
         -0.0493,  0.7268,  0.7130, -0.1164,  0.3266,  0.3431, -0.0710,  1.2716],
        [ 0.4823, -0.1069, -0.4055,  0.1770,  0.1581, -0.1697,  0.0162,  0.0215,
         -0.2490, -0.3773,  0.2787,  0.1629, -0.2895, -0.0676, -0.1416,  1.2194],
        [ 0.1971,  0.2856, -0.1303, -0.2655,  0.0668,  0.1954,  0.0281, -0.2451,
         -0.4647,  0.0693,  0.1528, -0.2032, -0.2479, -0.1621,  0.1947,  0.7678],
        [ 0.2510,  0.7346,  0.5939,  0.2516,  0.2606,  0.7582,  0.5595,  0.3539,
         -0.5934, -1.0807, -0.3111, -0.2781, -0.9054,  0.1318, -0.1382,  0.6371],
        [ 0.3428,  0.4960,  0.4725,  0.3028,  0.1844,  0.5814,  0.3824,  0.2952,
         -0.4897, -0.7705, -0.1172, -0.2541, -0.6892,  0.1979, -0.1513,  0.7666],
        [ 0.1866, -0.0

#Step 8: Scaled Dot-Product Attention

**What I Implemented:**

1.Created **random key and query vectors** for a simple example to simulate attention.

2.Computed **attention scores **with the dot product q @ k^T and **scaled** them by dividing by √head_size.

3.Applied **causal masking** using a **lower-triangular matrix** to prevent tokens from attending to future positions.

4.Used **softmax** to turn the attention scores into **normalized weights** (wei).

5.Checked basic statistics (like variance) of keys, queries, and weights, and looked at the first row of attention weights to see how tokens focus on each other.

**Why This Is Important:**

**Scaling by √head_size**: Prevents dot products from getting too large, which keeps the softmax stable and gradients well-behaved.

**Causal Masking:** Ensures the model respects the autoregressive property—it can only attend to previous tokens, not future ones.

**Softmax:** Converts raw attention scores into probabilities, showing how much each token “attends” to others.

**Checking Statistics**: Looking at variance and weights helps understand how attention distributes across tokens and can be used for debugging.

**In short:** This step demonstrates how scaled dot-product attention computes weighted combinations of tokens, while controlling for numerical stability and respecting the order of sequences.

In [None]:
k=torch.randn(B,T,head_size)
q=torch.randn(B,T,head_size)
wei=q@k.transpose(-2,-1)/head_size**0.5
tril=torch.tril(torch.ones(T,T))
wei=wei.masked_fill(tril==0,float('-inf'))
wei=F.softmax(wei,dim=-1)
wei.shape

torch.Size([4, 8, 8])

In [None]:
k.var()

tensor(0.9006)

In [None]:
q.var()

tensor(1.0037)

In [None]:
wei.var()

tensor(0.0417)

In [None]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6370, 0.3630, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1234, 0.5156, 0.3610, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0348, 0.0261, 0.6100, 0.3292, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2465, 0.3402, 0.0333, 0.3690, 0.0110, 0.0000, 0.0000, 0.0000],
        [0.0940, 0.3110, 0.1043, 0.3475, 0.0998, 0.0434, 0.0000, 0.0000],
        [0.5549, 0.0228, 0.1602, 0.0460, 0.0605, 0.0840, 0.0717, 0.0000],
        [0.0740, 0.0305, 0.1435, 0.1113, 0.4445, 0.0929, 0.0562, 0.0471]])

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [None]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

#Step 9: Implementing a Single Self-Attention Head
**What I Implemented:**

Created a **PyTorch module** for a single self-attention head.

Added **linear layers** to project the input into **key, query, and value vectors**.

Computed **scaled dot-product attention**, applied **causal masking**, and **softmax** to get attention weights.

Added **dropout** to the attention weights to prevent overfitting.

Multiplied the weights by the value vectors to produce **context-aware representations** for each token.

**Why This Is Important:**

This is **the core building block **of Transformer models like GPT.

Each attention head lets the model focus on **different aspect**s of the previous tokens.

Scaling keeps values stable, **masking** preserves autoregressive behavior, and **dropout** helps generalization.

Wrapping it in a **module** makes it reusable for **multi-head attention** and full Transformer layers.

Core Idea in Simple Terms:
A self-attention head allows each token to look at previous tokens, decide which ones are important, and use that information to create a context-aware representation. This is the fundamental mechanism behind how Transformers understand sequences.

In [None]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

#Step 10: Multi-Head Attention
**What I Implemented:**

Combined multiple **self-attention heads** in parallel using nn.ModuleList.

**Concatenated** the outputs from all heads along the embedding dimension.

Applied a **linear projection** (self.proj) to mix information across heads.

Added **dropout** for **regularization**.


**Why This Is Important:**

**Multiple Heads**:

Each head can focus on different aspects of the sequence—one might capture short-term patterns, another long-range dependencies.

**Concatenation + Linear Projection:**

 Merges the information from all heads into a unified, richer token representation.

**Dropout**:
Reduces overfitting, especially in large models with many parameters.

**Core of Transformer Blocks:**
 Multi-head attention is essential for Transformers to understand complex relationships in sequences and generate context-aware outputs.

**Core Idea in Simple Terms:**

Multi-head attention lets the model look at the sequence in multiple ways at once, combine the insights, and produce more powerful token representations, which is what enables Transformers to learn intricate patterns in language or other sequential data.

In [None]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

#Step 11: FeedForward Layer
**What I Implemented:**

Built a two-layer **feedforward network** with a **ReLU** **activation** in between.

Expanded the embedding dimension from n_embd → 4 * n_embd in the hidden layer, then projected back to n_embd.

Added **dropout** for regularization.

Applied the network **token-wise**, independently for each position in the sequence.

**Why This Is Important:**

**Non-Linearity:** The feedforward layer introduces non-linear transformations, letting the model learn **patterns beyond what attention alone can capture**.

**Dimension Expansion:** Expanding the embedding dimension allows the model to **extract richer features** before compressing back to the original size.

**Dropout**: Prevents overfitting, especially in large models with many parameters.

**Core Part of Transformer Blocks:** Along with multi-head attention and residual connections, the feedforward layer forms a complete Transformer block, which is the fundamental building block of GPT.

**Core Idea in Simple Terms:**
The feedforward layer lets the model process each token independently to learn complex transformations, making token representations more powerful when combined with attention.

In [None]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

#Step 12: Full Transformer Block
**What I Implemented:**

**1**.**LayerNorm + Multi-Head Attention + Residual:**

Normalize embeddings first (ln1).

Apply multi-head attention (sa).

Add a residual connection to preserve the original input.

2.**LayerNorm + FeedForward + Residual:**

Normalize again (ln2).

Apply the **feedforward network** (ffwd).

Add another **residual connection**.

3.Combined these steps into a **single Transformer block**, which can be stacked to build deep Transformer models.

**Why This Is Important:**

**Residual Connections:**

 Help **prevent vanishing gradients** and allow information to flow through deep networks.

**Layer Normalization:**

 Stabilizes training by keeping input distributions consistent.

**Multi-Head Attention**:

 Lets the model aggregate information from multiple tokens in parallel, **capturing complex dependencies**.

**FeedForward Layer:**

 Adds non-linear transformations to enrich token representations.

**Stacking Blocks**:

Repeating these Transformer blocks forms the backbone of GPT-like architectures, enabling the model to learn complex sequence patterns effectively.

**Core Idea in Simple Terms:**
A Transformer block combines attention, feedforward transformations, normalization, and residual connections into a single unit. Stacking these blocks lets the model build deep, context-aware representations for sequences, which is the foundation of GPT-like models.

In [None]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

#Step 13: Mini GPT Language Model
**What I Implemented:**

**Token and Positional Embeddings:**

Token embeddings convert discrete tokens into continuous vectors.

Positional embeddings encode the position of each token so the model knows the order of the sequence.

**Stacked Transformer Blocks:**

Each block contains multi-head attention, feedforward layers, residual connections, and layer normalization.

These blocks build context-aware token representations by combining information from both nearby and distant tokens.

**Final LayerNorm:**

Stabilizes the output distribution before generating predictions.

**Linear Head (lm_head):**

Maps the final token representations to logits over the vocabulary, which can be used for prediction.

**Forward Pass:**

Computes the logits and optionally calculates cross-entropy loss if targets are provided.

**Generate Function:**

Samples new text autoregressively, predicting one token at a time while respecting the block_size context.

**Why This Is Important:**

**Combines All Components:** This step integrates embeddings, attention, feedforward layers, normalization, and residual connections into a full GPT-style language model.

**Embeddings**: Make discrete tokens compatible with neural network computations.

**Transformer Blocks**: Capture both local and long-range dependencies, allowing the model to understand context.

**Residual + LayerNorm**:
 Ensure stable and deep training, preventing gradient issues.

**Autoregressive Generation:**
 Enables the model to produce coherent sequences starting from a prompt, which is the core of language modeling.

**Core Idea in Simple Terms:**

A mini GPT takes tokens, encodes their meaning and position, processes them through stacked Transformer blocks to understand context, and predicts the next token step by step. This is how it generates coherent and context-aware text, combining all the building blocks implemented earlier.

In [None]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

#Step 14: Training and Generating Text
**What I Implemented:**

**Setup**:

Moved the model to the appropriate device (CPU or GPU) for computation.

Counted the total number of parameters to understand model size and complexity.

**Optimizer**:

Used AdamW, a gradient-based optimizer, to update model weights during training.

**Training Loop**:

Periodically evaluated loss on training and validation sets to monitor performance.

Sampled batches of data using get_batch.

Performed a forward pass to compute logits and cross-entropy loss.

Backpropagated gradients and updated model weights using the optimizer.

**Autoregressive Text Generation:**

Started from a prompt (context).

Iteratively predicted the next token using the model’s generate method.

Converted predicted token indices back into characters to form readable text.

**Why This Is Important:**

**Training Loop:** Updates the model to minimize prediction loss, enabling it to learn language patterns from the data.

**Periodic Evaluation:** Monitors training progress and prevents overfitting.

**Autoregressive Generation:** Shows that the model can produce meaningful sequences based on what it has learned.

**Parameter Count**: Gives insight into model complexity and the computational resources required.

**Core Idea in Simple Terms:**
Training adjusts the model so it can predict the next token accurately. Once trained, autoregressive generation lets the model create coherent text from a starting prompt, showing that it has learned meaningful patterns in the data.

In [None]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

0.209729 M parameters
step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5090, val loss 2.5058
step 300: train loss 2.4194, val loss 2.4334
step 400: train loss 2.3501, val loss 2.3568
step 500: train loss 2.2963, val loss 2.3129
step 600: train loss 2.2410, val loss 2.2501
step 700: train loss 2.2057, val loss 2.2191
step 800: train loss 2.1633, val loss 2.1860
step 900: train loss 2.1242, val loss 2.1498
step 1000: train loss 2.1027, val loss 2.1298
step 1100: train loss 2.0692, val loss 2.1183
step 1200: train loss 2.0386, val loss 2.0797
step 1300: train loss 2.0276, val loss 2.0652
step 1400: train loss 1.9925, val loss 2.0370
step 1500: train loss 1.9702, val loss 2.0302
step 1600: train loss 1.9645, val loss 2.0487
step 1700: train loss 1.9421, val loss 2.0143
step 1800: train loss 1.9091, val loss 1.9953
step 1900: train loss 1.9085, val loss 1.9874
step 2000: train loss 1.8861, val loss 1.9957
step 2100: train loss 1.

In [None]:
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Fireathfend
Awhalf arms as your usurn helper
To soul Lession
Her tale the doth, brother Mothands:
Perther, he is compe redion even trundededinue.

COMINA:
Hot shower:
But hy master in creat forth, like you,
Hat togetery in plast's divings.

RICHARD:
Do you love,
And my dight use classs?
Why, and that soul's make will upity is
thou hast, not my flowed sir, he got,
Rome that made to hery fool you,---
Livet in my basent
Yought nour tood some and mastry breive prousiten enuper'd;
On confuch leave whileh nemim you.

LORD'ELIZhour, this I beling for it. He'll thou,, death flive adot,
And for our stries sem with us
nevor not toughts such of Rome;
When is behone him by his beceence, lenjers.

HENRY BOLINGBLIO:
My schume, them? whose them his,
On lifettimentlend, but your lay us,
What therefunds dickngness.
But aftunest that wad beek tes woman,
My then dout on with a brothed's at that most ove you is banim.

First I smun in
proyer states heligher beit threaty.

BOMVOLIO:
Aft? If
Far my lose in