In [1]:
import numpy as np
import torch

In [2]:
"""-----------------------------From Scratch Example----------------------------- """

# ---------- helper ----------
def softmax(x, axis=-1):
    """نسخه پایدار عددیِ softmax."""
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

np.set_printoptions(precision=5, suppress=True)

# ---------- 1) ورودیِ مثال ----------
x = np.array([
    [1., 2., 3., 4.],  # توکن ۱
    [5., 6., 7., 8.]   # توکن ۲
])

# ---------- 2) Q, K, V (لایه‌های خطی = ماتریس همانی) ----------
W_q = W_k = W_v = np.eye(4)

Q = x @ W_q.T
K = x @ W_k.T
V = x @ W_v.T

# ---------- 3) تقسیم به ۲ هدِ سایز ۲ ----------
num_heads = 2
head_dim   = 2
seq_len    = x.shape[0]   # 2
d_model    = x.shape[1]   # 4

# تغییر شکل به (num_heads, seq_len, head_dim)
Q_heads = Q.reshape(seq_len, num_heads, head_dim).transpose(1, 0, 2)
K_heads = K.reshape(seq_len, num_heads, head_dim).transpose(1, 0, 2)
V_heads = V.reshape(seq_len, num_heads, head_dim).transpose(1, 0, 2)

print("Q heads:\n", Q_heads, "\n")

# ---------- 4-5) محاسبه attention برای هر هد ----------
head_outputs = []
for h in range(num_heads):
    Q_h, K_h, V_h = Q_heads[h], K_heads[h], V_heads[h]

    # نمره‌ها = Q_h @ K_hᵀ   (برای سادگی بدون scaling)
    scores = Q_h @ K_h.T
    probs  = softmax(scores, axis=-1)
    out_h  = probs @ V_h

    print(f"Head {h+1} — scores:\n{scores}")
    print(f"Head {h+1} — softmax probs:\n{probs}")
    print(f"Head {h+1} — output:\n{out_h}\n")

    head_outputs.append(out_h)

head_outputs = np.stack(head_outputs)  # (num_heads, seq_len, head_dim)

# ---------- 6) چسباندن هدها ----------
concat = head_outputs.transpose(1, 0, 2).reshape(seq_len, d_model)
print("Concatenated heads:\n", concat, "\n")

# ---------- 7) لایه out_proj (همانی) ----------
W_o = np.eye(d_model)  # در عمل trainable است
final_output = concat @ W_o.T

print("Final output:\n", final_output)

Q heads:
 [[[1. 2.]
  [5. 6.]]

 [[3. 4.]
  [7. 8.]]] 

Head 1 — scores:
[[ 5. 17.]
 [17. 61.]]
Head 1 — softmax probs:
[[0.00001 0.99999]
 [0.      1.     ]]
Head 1 — output:
[[4.99998 5.99998]
 [5.      6.     ]]

Head 2 — scores:
[[ 25.  53.]
 [ 53. 113.]]
Head 2 — softmax probs:
[[0. 1.]
 [0. 1.]]
Head 2 — output:
[[7. 8.]
 [7. 8.]]

Concatenated heads:
 [[4.99998 5.99998 7.      8.     ]
 [5.      6.      7.      8.     ]] 

Final output:
 [[4.99998 5.99998 7.      8.     ]
 [5.      6.      7.      8.     ]]


In [3]:
x = torch.arange(1, 13)
print("Input tensor:\n", x)

x = torch.arange(1, 13, dtype=torch.float32).view(3, 4)  # (3, 4    # tokens, 4 features)
print("Input tensor:\n", x)

Input tensor:
 tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
Input tensor:
 tensor([[ 1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.],
        [ 9., 10., 11., 12.]])


In [4]:
x = torch.arange(1, 13, dtype=torch.float32).view(3, 4)  # (3, 4    # tokens, 4 features)

x[-1, -1] = 1000 # (3, 4    # tokens, 4 features)
print("Input tensor with large value:\n", x)

Input tensor with large value:
 tensor([[   1.,    2.,    3.,    4.],
        [   5.,    6.,    7.,    8.],
        [   9.,   10.,   11., 1000.]])


In [5]:
x = torch.arange(1, 13, dtype=torch.float32).view(2, 2, 3)  # (3, 4    # tokens, 4 features)
print("Input tensor reshaped to (2, 2, 3):\n", x)

x = torch.arange(1, 13, dtype=torch.float32).view(6, 2) # (6, 2    # tokens, 2 features)
print("Input tensor reshaped to (6, 2):\n", x)

Input tensor reshaped to (2, 2, 3):
 tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.]],

        [[ 7.,  8.,  9.],
         [10., 11., 12.]]])
Input tensor reshaped to (6, 2):
 tensor([[ 1.,  2.],
        [ 3.,  4.],
        [ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.],
        [11., 12.]])


In [6]:
x = torch.arange(1, 13, dtype=torch.float32).view(2, 2, 3)  # (3, 4    # tokens, 4 features)
print("Input tensor reshaped to (2, 2, 3):\n", x)

x = x.transpose(1, 2)  # (2, 3, 2) --تبدیل به (batch, num_heads, tokens, head_dim)
print(x)

Input tensor reshaped to (2, 2, 3):
 tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.]],

        [[ 7.,  8.,  9.],
         [10., 11., 12.]]])
tensor([[[ 1.,  4.],
         [ 2.,  5.],
         [ 3.,  6.]],

        [[ 7., 10.],
         [ 8., 11.],
         [ 9., 12.]]])


In [7]:
x = torch.arange(1,13).view(6,2)
print("Input tensor with shape (6, 2):\n", x)

x_1 = x.stride()  # Get the strides of the tensor
print("Strides of the tensor:", x_1)


print("Shape of the tensor:", x.shape)
print("Number of dimensions:", x.dim())
print("Number of elements:", x.numel())
print("Data type of the tensor:", x.dtype)
print("Device of the tensor:", x.device)
print("Is the tensor contiguous?", x.is_contiguous())
# Define a simple attention mechanism
print("\n--- Simple Attention Mechanism ---\n")

Input tensor with shape (6, 2):
 tensor([[ 1,  2],
        [ 3,  4],
        [ 5,  6],
        [ 7,  8],
        [ 9, 10],
        [11, 12]])
Strides of the tensor: (2, 1)
Shape of the tensor: torch.Size([6, 2])
Number of dimensions: 2
Number of elements: 12
Data type of the tensor: torch.int64
Device of the tensor: cpu
Is the tensor contiguous? True

--- Simple Attention Mechanism ---



In [8]:
x = torch.arange(0,12).view(2,2,3)
print("Input tensor:\n", x)

x_2 = x.stride()  # Get the strides of the tensor
print("Strides of the tensor:", x_2)

y = x.transpose(0,2)
print("Transposed tensor:\n", y)
# Check the shape and strides of the transposed tensor
print("Shape of the transposed tensor:", y.shape)
print("Strides of the transposed tensor:", y.stride())
# Check the shape and strides of the transposed tensor
print("Shape of the transposed tensor:", y.shape)
print("Strides of the transposed tensor:", y.stride())
# Define a simple attention mechanism
print("\n--- Simple Attention Mechanism ---\n")

y.storage()
# Check the storage of the transposed tensor
print("Storage of the transposed tensor:", y.storage())

Input tensor:
 tensor([[[ 0,  1,  2],
         [ 3,  4,  5]],

        [[ 6,  7,  8],
         [ 9, 10, 11]]])
Strides of the tensor: (6, 3, 1)
Transposed tensor:
 tensor([[[ 0,  6],
         [ 3,  9]],

        [[ 1,  7],
         [ 4, 10]],

        [[ 2,  8],
         [ 5, 11]]])
Shape of the transposed tensor: torch.Size([3, 2, 2])
Strides of the transposed tensor: (1, 3, 6)
Shape of the transposed tensor: torch.Size([3, 2, 2])
Strides of the transposed tensor: (1, 3, 6)

--- Simple Attention Mechanism ---

Storage of the transposed tensor:  0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 12]


  y.storage()


In [9]:
x = torch.arange(0,12).view(2,6)
# Print the input tensor
print("Is It contiguous?",x.is_contiguous())

y = x.transpose(0, 1)
# Print the transposed tensor
print("Is It contiguous?",y.is_contiguous())

# Check the storage of the transposed tensor
print(y.shape)
#stride shows the step size to move to the next element in each dimension.
print(y.stride())
#storage shows the underlying data buffer and shows how the data is stored in memory.
print("Storage of the transposed tensor:", y.storage())

Is It contiguous? True
Is It contiguous? False
torch.Size([6, 2])
(1, 6)
Storage of the transposed tensor:  0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 12]


In [10]:
x = torch.arange(1,13)
y = x.reshape(4,3)
# Print the reshaped tensor
print("Reshaped tensor:\n", y)

y[0,0] = 100
# Print the modified tensor
print("Modified tensor:\n", y)
# Summary
print("Original tensor after modification:\n", x)
# A ‘View’ uses the same chunk of memory block as the original tensor, and thus any changes among this memory chunk will affect all the views and the original tensor that’s associated with it.</br>
# A ‘View’ can be contiguous or non-contiguous.</br>
# A non-contiguous tensor view can be converted to a contiguous one, and it would make a copy of it, so the data will not be associated with the original data chunk anymore.</br>
# Stride Position Formula: Given a strides (A, B, C), the position of
# the index (j, k, v) in the 1D data array is (A *j + B*k + C*v)</br>
# Difference between view() and reshape():</br>
# view() cannot apply on ‘non-contiguous’ tensor /view. It returns a view.
# reshape() can apply on both ‘contiguous’ and ‘non-contiguous’ tensor/view
# When possible, it will return a view; When the data is non-contiguous, it makes a new copy of it.
# Reshape() is more flexible than view() but may involve copying data.

Reshaped tensor:
 tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])
Modified tensor:
 tensor([[100,   2,   3],
        [  4,   5,   6],
        [  7,   8,   9],
        [ 10,  11,  12]])
Original tensor after modification:
 tensor([100,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12])


In [11]:
x = torch. randn(2, 3)
# Print the random tensor
print("Random tensor:\n", x)

y = x.transpose(1, 0)
# Print the transposed tensor
print("Transposed tensor:\n", y)
# Check if the transposed tensor is contiguous
print("Is the transposed tensor contiguous?", y.is_contiguous())

# Reshape the transposed tensor to a new shape
# Note: The shape must be compatible with the number of elements in the tensor
#when we know tensor is contiguous, we can use view() to reshape it
# If the tensor is not contiguous, we can use contiguous() to make it contiguous first
# we also can use reshape() to reshape it
# but reshape() may create a copy if the tensor is not contiguous
# In this case, we will use contiguous() to ensure the tensor is contiguous before reshaping
z= y.contiguous().view(2,3)
# Print the reshaped tensor
print("Reshaped tensor:\n", z)
#transpose() does not change the original tensor, it returns a new tensor with the specified dimensions swapped
# The original tensor remains unchanged

Random tensor:
 tensor([[ 1.3911,  1.5115,  0.7717],
        [-0.5112,  0.2792, -0.5493]])
Transposed tensor:
 tensor([[ 1.3911, -0.5112],
        [ 1.5115,  0.2792],
        [ 0.7717, -0.5493]])
Is the transposed tensor contiguous? False
Reshaped tensor:
 tensor([[ 1.3911, -0.5112,  1.5115],
        [ 0.2792,  0.7717, -0.5493]])


In [12]:
x= torch.randn(1, 4, 6)  # Example input tensor with shape (batch_size=1, token=2, d_in=4
batch, num_token, dim_in = x.shape  # Example shape (batch_size=1, token=2, d_in=4)

# Print the input tensor
print("Input tensor:\n", x, "\n")

num_heads = 3
head_dim   = 2

# Reshape the input tensor to (batch_size, num_heads, num_tokens, head_dim)
# This is necessary for multi-head attention to work correctly
x = x.view(batch, num_token, num_heads, head_dim)
# Print the reshaped tensor
print("Reshaped tensor:\n", x)

num_heads = 2
head_dim   = 3

# Reshape the input tensor to (batch_size, num_heads, num_tokens, head_dim)
# This is necessary for multi-head attention to work correctly
x = x.view(batch, num_token, num_heads, head_dim)
# Print the reshaped tensor
print("Reshaped tensor:\n", x)

Input tensor:
 tensor([[[-0.3301,  0.8041, -0.6608,  2.1843, -0.3718, -1.2365],
         [-1.7740,  1.9076,  0.6946, -1.1267,  1.2110, -1.3329],
         [ 0.6670,  0.8639, -0.3981, -1.0714, -0.5151, -1.9603],
         [ 0.6216,  0.7644, -0.7178,  0.5979,  0.2320, -0.3983]]]) 

Reshaped tensor:
 tensor([[[[-0.3301,  0.8041],
          [-0.6608,  2.1843],
          [-0.3718, -1.2365]],

         [[-1.7740,  1.9076],
          [ 0.6946, -1.1267],
          [ 1.2110, -1.3329]],

         [[ 0.6670,  0.8639],
          [-0.3981, -1.0714],
          [-0.5151, -1.9603]],

         [[ 0.6216,  0.7644],
          [-0.7178,  0.5979],
          [ 0.2320, -0.3983]]]])
Reshaped tensor:
 tensor([[[[-0.3301,  0.8041, -0.6608],
          [ 2.1843, -0.3718, -1.2365]],

         [[-1.7740,  1.9076,  0.6946],
          [-1.1267,  1.2110, -1.3329]],

         [[ 0.6670,  0.8639, -0.3981],
          [-1.0714, -0.5151, -1.9603]],

         [[ 0.6216,  0.7644, -0.7178],
          [ 0.5979,  0.2320, -0.3983]

In [13]:
a = torch.tensor([[[[0.2745, 0.6584, 0.2775, 0.8573], #1
 [0.8993, 0.0390, 0.9268, 0.7388],
[0.7179, 0.7058, 0.9156, 0.4340]],
 [[0.0772, 0.3565, 0.1479, 0.5331],
 [0.4066, 0.2318, 0.4545, 0.9737],
[0.4606, 0.5159, 0.4220, 0.5786]]]])

# Print the input tensor
# Transpose the last two dimensions of the tensor
# This is useful for operations like matrix multiplication or attention mechanisms  
# where we need to swap the dimensions for correct alignment
y = a.transpose(2, 3)  # Transpose the last two dimensions

# Print the transposed tensor
print("Transposed tensor:\n", a @ y)

Transposed tensor:
 tensor([[[[1.3208, 1.1631, 1.2879],
          [1.1631, 2.2150, 1.8424],
          [1.2879, 1.8424, 2.0402]],

         [[0.4391, 0.7003, 0.5903],
          [0.7003, 1.3737, 1.0620],
          [0.5903, 1.0620, 0.9912]]]])


In [14]:
first_head = a[0, 0, :, :]
# Print the first head of the tensor
print("First head of the tensor:\n", first_head)

second_head = a[0, 1, :, :]
# Print the second head of the tensor
print("Second head of the tensor:\n", second_head)

First head of the tensor:
 tensor([[0.2745, 0.6584, 0.2775, 0.8573],
        [0.8993, 0.0390, 0.9268, 0.7388],
        [0.7179, 0.7058, 0.9156, 0.4340]])
Second head of the tensor:
 tensor([[0.0772, 0.3565, 0.1479, 0.5331],
        [0.4066, 0.2318, 0.4545, 0.9737],
        [0.4606, 0.5159, 0.4220, 0.5786]])


In [15]:
x = torch.arange(1, 13)
print("Input tensor:\n", x)

x = x.view(3, 4)  # Reshape to (3, 4)
print("Reshaped tensor:\n", x)

x = x.unsqueeze(0)  # Remove the first dimension
print("Tensor after unsqueeze:\n", x)


Input tensor:
 tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
Reshaped tensor:
 tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]])
Tensor after unsqueeze:
 tensor([[[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]])


In [16]:
x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
print("Input tensor:\n", x)

x= x.view(3, 4)  # Reshape to (3, 4)
print("Reshaped tensor:\n", x)

x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
print("Input tensor:\n", x.shape)
x = x.unsqueeze(0) # add a new dimension at the start
print("Tensor after unsqueeze:\n", x.shape)

Input tensor:
 tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
Reshaped tensor:
 tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]])
Input tensor:
 torch.Size([12])
Tensor after unsqueeze:
 torch.Size([1, 12])


In [17]:
x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
print("Input tensor:\n", x)
x = x.unsqueeze(0).expand(-1, -1) # unsqueeze add a new dimension at the start and expand it to match the shape
# Print the tensor after unsqueeze
print("Tensor after unsqueeze:\n", x)

Input tensor:
 tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
Tensor after unsqueeze:
 tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])


In [18]:
GPT_CONFIG_124M = {
 "vocab_size": 50257, # Vocabulary size
 "context_length": 1024, # Context length
 "emb_dim": 768, # Embedding dimension
 "n_heads": 12, # Number of attention heads
 "n_layers": 12, # Number of layers
 "drop_rate": 0.1, # Dropout rate
 "qkv_bias": False # Query-Key-Value bias
}

In [19]:
import torch
import torch.nn as nn
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential( #1
        *[DummyTransformerBlock(cfg) #1
        for _ in range(cfg["n_layers"])] #1
        ) #1
        self.final_norm = DummyLayerNorm(cfg["emb_dim"]) #2
        self.out_head = nn.Linear(
        cfg["emb_dim"], cfg["vocab_size"], bias=False
        )


    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
        torch.arange(seq_len, device=in_idx.device)
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
    
class DummyTransformerBlock(nn.Module): #3
    def __init__(self, cfg):
        super().__init__()

    def forward(self, x): #4
        return x
    
class DummyLayerNorm(nn.Module): #5
    def __init__(self, normalized_shape, eps=1e-5): #6
        super().__init__()
        
    def forward(self, x):
        return x


In [20]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [21]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)
