In [1]:
import sys
sys.path.append('..')

In [2]:
from minbpe.gpt4 import GPT4Tokenizer
from minbpe.basic import BasicTokenizer

# Build a BasicTokenizer and restore its state from the saved model file.
# The path is relative, so it resolves against the notebook's working directory.
tokenizer = BasicTokenizer()
tokenizer.load(model_file='../output/tokenizer/temp_tokenizer.model')

In [19]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model hyperparameters. Presumably these are passed to the model classes
# defined in later cells when the model is instantiated (not shown here).
# block_size: presumably the maximum context length in tokens — TODO confirm.
block_size = 512
# Width of each token's embedding vector.
embedding_dimension = 128
# Number of attention heads per attention layer.
no_of_attention_heads = 8
# Output width of the query/key projections inside each attention head.
key_query_reduced_dimensionality = 8
# Presumably the number of stacked transformer blocks — TODO confirm.
no_of_layers = 4
# Vocabulary size: regular vocabulary entries plus special tokens.
# NOTE(review): if tokenizer.vocab already contains the special tokens after
# load(), this sum counts them twice — confirm against the tokenizer.
vocab_size = len(tokenizer.vocab)+len(tokenizer.special_tokens)

In [5]:
torch.__version__

'2.9.1+cu126'

In [55]:
from torch import nn
from torch.nn import functional as F
import math
from typing import Optional


In [49]:
class AttentionHead(nn.Module):
    """A single causal (masked) self-attention head.

    Projects the input into query, key and value spaces, computes scaled
    dot-product attention where each position may only attend to itself and
    earlier positions, and returns the attention-weighted sum of the values.

    Args:
        key_query_reduced_dimensionality: Output width of the query, key and
            value projections (the head size).
        embedding_dimension: Width of the incoming embedding vectors.
    """

    def __init__(self, key_query_reduced_dimensionality, embedding_dimension):
        super().__init__()
        self.key_query_reduced_dimensionality = key_query_reduced_dimensionality
        self.embedding_dimension = embedding_dimension
        self.query = nn.Linear(self.embedding_dimension, self.key_query_reduced_dimensionality, bias = False)
        self.key = nn.Linear(self.embedding_dimension, self.key_query_reduced_dimensionality, bias = False)
        self.value = nn.Linear(self.embedding_dimension, self.key_query_reduced_dimensionality, bias = False)

    def forward(self, embeddedVector: torch.Tensor) -> torch.Tensor:
        """Apply causal self-attention.

        Args:
            embeddedVector: Tensor of shape (..., T, embedding_dimension),
                e.g. (B, T, C) for a batch or (T, C) for a single sequence.

        Returns:
            Tensor of shape (..., T, key_query_reduced_dimensionality).
        """
        T = embeddedVector.shape[-2]
        Q = self.query(embeddedVector)
        K = self.key(embeddedVector)
        V = self.value(embeddedVector)

        # Row i of the score matrix holds query i scored against every key.
        # Transpose only the last two dims so a leading batch dim is untouched,
        # and scale by sqrt(head size), which is the width of Q and K.
        R = Q @ K.transpose(-2, -1) / (self.key_query_reduced_dimensionality ** 0.5)

        # Lower-triangular mask built on the input's device: position i may
        # only look at positions j <= i. Future positions get -inf so softmax
        # assigns them zero weight.
        causal_mask = torch.tril(torch.ones(T, T, dtype=torch.bool, device=embeddedVector.device))
        R = R.masked_fill(~causal_mask, float('-inf'))
        R = F.softmax(R, dim = -1)
        change_in_embedded_vector = R @ V

        return change_in_embedded_vector

In [46]:
# Scratch walk-through of one causal self-attention head on a single,
# unbatched sequence of 50 tokens with 128-dimensional embeddings.
head_size = 8
query = nn.Linear(128, head_size, bias = False)
key = nn.Linear(128, head_size, bias = False)
value = nn.Linear(128, 128, bias = False)
embeddedVector = torch.randn(50, 128)
# The first dimension is the sequence length; the second is the feature size.
T, _ = embeddedVector.shape
Q = query(embeddedVector)
K = key(embeddedVector)
V = value(embeddedVector)

KT = torch.transpose(K, 0, 1)
# Row i holds query i scored against every key; scale by sqrt(head size).
R = torch.matmul(Q, KT) / (head_size**0.5)
# Positional causal mask: hide every key j > i from query i. It is built from
# positions, not from the score values, so a score that is exactly 0 is kept.
R = R.masked_fill(torch.tril(torch.ones(T, T)) == 0, float('-inf'))
R = F.softmax(R, dim = -1)
(R @ V).shape

torch.Size([50, 128])

In [53]:
class AttentionLayer(nn.Module):
    """Multi-head self-attention: several AttentionHead modules run in
    parallel, followed by an output projection and dropout.

    Args:
        no_of_attention_heads: Number of parallel attention heads.
        key_query_reduced_dimensionality: Output width of each head.
        embedding_dimension: Width of the input and output embeddings.
    """

    def __init__(self, no_of_attention_heads, key_query_reduced_dimensionality, embedding_dimension):
        super().__init__()
        self.no_of_attention_heads = no_of_attention_heads
        self.key_query_reduced_dimensionality = key_query_reduced_dimensionality
        self.embedding_dimension = embedding_dimension
        self.heads = nn.ModuleList([
            AttentionHead(key_query_reduced_dimensionality, embedding_dimension)
            for _ in range(no_of_attention_heads)
        ])
        # A full value matrix per head would be (embedding_dimension, embedding_dimension),
        # which is very large. Instead it is factored into two smaller matrices:
        # (embedding_dimension, reduced_dim) x (reduced_dim, embedding_dimension).
        # The first factor is each head's own value projection; the second
        # factors of all heads are stacked into this single output projection.
        self.output_weight = nn.Linear(self.no_of_attention_heads * self.key_query_reduced_dimensionality, self.embedding_dimension)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, embeddedVector: torch.Tensor) -> torch.Tensor:
        """Run every head on the same input, concatenate the head outputs
        along the feature dimension, then project back to
        embedding_dimension and apply dropout.
        """
        output = torch.cat([attentionHead(embeddedVector) for attentionHead in self.heads], dim = -1)
        output = self.dropout(self.output_weight(output))
        return output

#Multilayer Perceptron Layer
class MLPLayer(nn.Module):
    """Position-wise feed-forward network: expand, apply ReLU, project back,
    then dropout.

    Args:
        embedding_dimension: Width of the input and output embeddings.
        expansion_factor: Multiplier for the hidden layer width; the hidden
            layer has expansion_factor * embedding_dimension units (default 4).
    """

    def __init__(self, embedding_dimension, expansion_factor = 4):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.expansion_factor = expansion_factor

        hidden_dimension = self.expansion_factor * self.embedding_dimension
        self.neural_net = nn.Sequential(
            # Expand to the wider hidden layer.
            nn.Linear(self.embedding_dimension, hidden_dimension),
            # The class is spelled nn.ReLU; nn.ReLu does not exist.
            nn.ReLU(),
            # Project back down to the embedding width.
            nn.Linear(hidden_dimension, self.embedding_dimension),
            nn.Dropout(p = 0.1)
        )

    def forward(self, embeddedVector: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward network to the last dimension of the input."""
        return self.neural_net(embeddedVector)




In [52]:
class TransformerBlock(nn.Module):
    """One transformer block: multi-head attention followed by an MLP, each
    applied to a layer-normalised input and added back through a residual
    connection.

    Args:
        embedding_dimension: Width of the input and output embeddings.
        key_query_reduced_dimensionality: Output width of each attention head.
        no_of_attention_heads: Number of attention heads.
        expansion_factor: Hidden-width multiplier for the MLP (default 4).
    """

    def __init__(self, embedding_dimension, key_query_reduced_dimensionality, no_of_attention_heads, expansion_factor = 4):
        super().__init__()
        self.embedding_dimension = embedding_dimension

        # No trailing comma here: a trailing comma would turn the attribute
        # into a 1-tuple, which is neither registered as a submodule nor callable.
        self.attention_layer = AttentionLayer(no_of_attention_heads, key_query_reduced_dimensionality, self.embedding_dimension)
        self.normal_1 = nn.LayerNorm(self.embedding_dimension)
        self.mlp_layer = MLPLayer(self.embedding_dimension, expansion_factor)
        self.normal_2 = nn.LayerNorm(self.embedding_dimension)

    def forward(self, embeddedVector: torch.Tensor) -> torch.Tensor:
        """Apply the attention and MLP sub-layers with residual connections.

        The additions are out-of-place so the caller's tensor is not mutated
        and autograd does not see an in-place modification of a tensor it
        still needs for the backward pass.
        """
        embeddedVector = embeddedVector + self.attention_layer(self.normal_1(embeddedVector))
        embeddedVector = embeddedVector + self.mlp_layer(self.normal_2(embeddedVector))
        return embeddedVector

In [56]:
class GPTTransformer(nn.Module):
    """Decoder-only transformer language model.

    Token ids are embedded, summed with learned position embeddings, passed
    through a stack of TransformerBlock modules, layer-normalised and
    projected to vocabulary logits.

    The vocabulary size is read from the module-level ``vocab_size`` variable,
    which must be defined before the model is constructed.

    Args:
        context_size: Number of positions in the position-embedding table.
        no_of_blocks: Number of stacked transformer blocks.
        embedding_dimension: Width of the token and position embeddings.
        key_query_reduced_dimensionality: Output width of each attention head.
        no_of_attention_heads: Number of attention heads per block.
        expansion_factor: Hidden-width multiplier for each block's MLP.
    """

    def __init__(self, context_size, no_of_blocks, embedding_dimension, key_query_reduced_dimensionality, no_of_attention_heads, expansion_factor = 4):
        super().__init__()
        self.context_size = context_size
        self.no_of_blocks = no_of_blocks

        # Embedding first
        self.token_embedder = nn.Embedding(vocab_size, embedding_dimension)
        self.position_embedder = nn.Embedding(context_size, embedding_dimension)
        # All transformations. nn.Sequential takes modules as positional
        # arguments, so the list has to be unpacked.
        self.transformer_blocks = nn.Sequential(*[
            TransformerBlock(embedding_dimension, key_query_reduced_dimensionality, no_of_attention_heads, expansion_factor)
            for _ in range(no_of_blocks)
        ])

        self.normal = nn.LayerNorm(embedding_dimension)
        self.final_layer = nn.Linear(embedding_dimension, vocab_size)

    def forward(self, tokens: torch.Tensor, ideal_value: Optional[torch.Tensor] = None) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            tokens: Integer token ids of shape (B, T).
            ideal_value: Optional target token ids of shape (B, T).

        Returns:
            A ``(output, loss)`` tuple. Without targets, ``output`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``output``
            is flattened to (B * T, vocab_size) and ``loss`` is the scalar
            cross-entropy over all positions.
        """
        # Token ids are 2-D (batch, time); there is no feature dimension yet.
        _, T = tokens.shape
        token_embedding = self.token_embedder(tokens)
        # Build the position indices on the same device as the input so the
        # model works wherever it has been moved, independent of any global.
        position_embedding = self.position_embedder(torch.arange(T, device = tokens.device))

        # (B, T, C) + (T, C): the position embedding broadcasts over the batch.
        embeddedVector = token_embedding + position_embedding

        output = self.transformer_blocks(embeddedVector)
        output = self.normal(output)
        output = self.final_layer(output)

        if ideal_value is None:
            loss = None
        else:
            B, T, C = output.shape
            output = output.view(B * T, C)
            # Targets are class indices with one id per position, so they
            # flatten to (B * T,), not (B * T, C).
            ideal_value = ideal_value.view(B * T)

            loss = F.cross_entropy(output, ideal_value)

        return output, loss