<a href="https://colab.research.google.com/github/NoCodeProgram/deepLearning/blob/main/transformer/TransformerBlock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!git clone https://github.com/NoCodeProgram/deepLearning.git

fatal: destination path 'deepLearning' already exists and is not an empty directory.


In [21]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Read the text file
with open('deepLearning/transformer/shakespeare.txt', 'r') as file:
    text = file.read()

# Tokenize the text  (this is very simple tokenizer, in reality you would use a more advanced one)
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer(text)
unique_tokens = set(tokens)


In [22]:
stoi = { s:i for i,s in enumerate(unique_tokens)}
itos = { i:s for i,s in enumerate(unique_tokens)}
# print(stoi)
# print(itos)

vocab_size = len(unique_tokens)
print(vocab_size)

3129


In [23]:
sentence = "i love you all"
indices = [stoi[word] for word in sentence.split()]
print(indices)

import torch.nn as nn

embedding_dim = 20
embedding = nn.Embedding(vocab_size, embedding_dim)

embedded_sentence = embedding(torch.tensor(indices))
print(embedded_sentence)


[703, 2324, 2988, 736]
tensor([[ 0.5823,  0.2879, -0.6389,  0.5345,  0.2990, -0.5058, -0.6320,  0.7645,
         -1.0935, -0.2994,  0.5853,  0.0207,  0.0556, -0.6536, -0.7247, -0.9703,
          0.9994, -1.1219,  0.1807, -1.0605],
        [ 2.0182,  1.8420, -0.9888,  1.2260, -0.1556,  0.6729,  0.0403,  0.1177,
         -0.5989, -0.7594, -0.1346,  0.3288,  1.4488,  0.8282, -0.5927,  1.4262,
          1.5454,  0.5162,  0.3055, -0.6019],
        [ 0.8747,  0.4409, -0.0992,  0.8930,  0.6899,  0.3536,  0.0701,  0.2687,
          0.6222, -0.9365,  0.3176, -0.9085, -1.2387,  0.4030, -1.1117,  0.4290,
          1.0424,  0.1438, -1.4887, -0.2270],
        [-0.1886, -0.2868,  0.4766, -0.9368,  0.3664,  0.3415, -0.1182, -1.2735,
         -0.1879,  0.5929,  2.7290, -0.2996,  1.3779,  0.6140,  2.0823,  0.1850,
          0.1959, -0.2307, -0.4035,  0.4000]], grad_fn=<EmbeddingBackward0>)


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, embed_dim, atten_dim):
        super().__init__()
        self.query = nn.Linear(embed_dim, atten_dim, bias=False)
        self.key = nn.Linear(embed_dim, atten_dim, bias=False)
        self.value = nn.Linear(embed_dim, atten_dim, bias=False)

    def forward(self, x):
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        scores = torch.matmul(query, key.transpose(-2, -1))
        scores = scores / key.size(-1)**0.5

        attention_weights = F.softmax(scores, dim=-1)
        weighted_values = torch.matmul(attention_weights, value)

        return weighted_values

In [25]:
class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        attention_dim = embed_dim // num_heads
        self.attentions = nn.ModuleList([SelfAttention(embed_dim, attention_dim) for _ in range(num_heads)])
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        head_outputs = []
        for attention in self.attentions:
            head_output = attention(x)
            head_outputs.append(head_output)

        concatenated_heads = torch.cat(head_outputs, dim=-1)
        print("concatenated_heads", concatenated_heads.shape)
        output = self.fc(concatenated_heads)
        print("output", output.shape)
        return output


In [26]:
class FeedFoward(nn.Module):
    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
    def forward(self, x):
        return self.net(x)

In [27]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, n_head):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.multihead_atten = MultiheadAttention(embed_dim, n_head)

        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = FeedFoward(embed_dim, 4*embed_dim)

    def forward(self, x):
        x = x + self.multihead_atten(self.layer_norm1(x))
        x = x + self.feed_forward(self.layer_norm2(x))
        return x

In [28]:

num_heads = 4

output = TransformerBlock(embedding_dim, num_heads)(embedded_sentence)
print("output shape", output.shape)

concatenated_heads torch.Size([4, 20])
output torch.Size([4, 20])
output shape torch.Size([4, 20])
