In [1]:
import torch
from torch import nn
import math

## Tokenizer and Embedding

In [2]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size) -> None:
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.d_model)

    def forward(self, x):
        x = self.lut(x) * (self.d_model) ** 0.5
        return x

In [3]:
sentences = ["My name is Ravi", "I live in Bangalore"]

In [4]:
vocab = [sentence.split(" ") for sentence in sentences]


In [5]:
vocab = list(set([word.lower() for sentence in vocab for word in sentence]))


In [6]:
vocab


['ravi', 'name', 'in', 'bangalore', 'i', 'live', 'is', 'my']

In [7]:
lut = {word: index for index, word in enumerate(vocab)}


In [8]:
lut


{'ravi': 0,
 'name': 1,
 'in': 2,
 'bangalore': 3,
 'i': 4,
 'live': 5,
 'is': 6,
 'my': 7}

In [9]:
sentence_tokens = [
    [lut[word.lower()] for word in sentence.split(" ")] for sentence in sentences
]


In [10]:
sentence_tokens

[[7, 1, 6, 0], [4, 5, 2, 3]]

In [11]:
vocab_size = len(lut)
d_model = 10


In [12]:
embedding = Embeddings(d_model=d_model, vocab_size=vocab_size)

In [13]:
token_tensors = torch.tensor(sentence_tokens)

In [14]:
embedding_tokens = embedding(token_tensors)


In [15]:
embedding_tokens.shape


torch.Size([2, 4, 10])

In [16]:
token_tensors.shape


torch.Size([2, 4])

## Positional Encoding

In [17]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len) -> None:
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        self.positions = torch.arange(0, max_len).unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(self.positions * div_term)
        pe[:, 1::2] = torch.cos(self.positions * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return x


## Feedforward

In [18]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1) -> None:
        super().__init__()
        self.fcn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )

    def forward(self, x):
        x = self.fcn(x)
        return x


## Encoder and Decoder

In [19]:
class Decoder(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout) -> None:
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim=d_model, num_heads=num_heads, batch_first=True
        )

        self.ln1 = nn.LayerNorm(normalized_shape=d_model)
        self.ln2 = nn.LayerNorm(normalized_shape=d_model)

        self.feedforward = PositionwiseFeedForward(
            d_model=d_model, d_ff=d_ff, dropout=dropout
        )

    def forward(self, x):
        x1 = self.ln1(x)
        x = x + self.mha(query=x1, key=x1, value=x1)[0]

        x1 = self.ln2(x)
        x = x + self.feedforward(x1)
        return x


In [20]:
class Encoder(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout) -> None:
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim=d_model, num_heads=num_heads, dropout=dropout
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.feedforward = PositionwiseFeedForward(
            d_model=d_model, d_ff=d_ff, dropout=dropout
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        x1 = self.ln1(x)
        x = x + self.mha(query=x1, key=x1, value=x1, attn_mask=mask)[0]

        x1 = self.ln2(x)
        x = x + self.feedforward(x1)
        return x

## BERT, GPT & VIT

In [21]:
class BERT(nn.Module):
    def __init__(
        self, num_layers, max_len, vocab_size, d_model, num_heads, d_ff, dropout
    ) -> None:
        super().__init__()
        self.embedding = Embeddings(d_model=d_model, vocab_size=vocab_size)
        self.pe = PositionalEncoding(d_model=d_model, max_len=max_len)

        self.encoder_layers = nn.ModuleList(
            [
                Encoder(
                    d_model=d_model, num_heads=num_heads, d_ff=d_ff, dropout=dropout
                )
                for _ in range(num_layers)
            ]
        )
        self.lm_head = nn.Linear(d_model, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.pe(x)
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, mask)
        x = self.lm_head(x)
        x = self.softmax(x)
        return x


In [22]:
class GPT(nn.Module):
    def __init__(
        self, num_layers, max_len, vocab_size, d_model, num_heads, d_ff, dropout
    ) -> None:
        super().__init__()
        self.embedding = Embeddings(d_model=d_model, vocab_size=vocab_size)
        self.pe = PositionalEncoding(d_model=d_model, max_len=max_len)

        self.decoder_layers = nn.ModuleList(
            [
                Decoder(
                    d_model=d_model, num_heads=num_heads, d_ff=d_ff, dropout=dropout
                )
                for _ in range(num_layers)
            ]
        )
        self.lm_head = nn.Linear(d_model, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.embedding(x)
        x = self.pe(x)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x)
        x = self.lm_head(x)
        x = self.softmax(x)
        return x

In [23]:
gpt = GPT(
    num_layers=6,
    max_len=4,
    vocab_size=len(lut),
    d_model=10,
    num_heads=5,
    d_ff=2048,
    dropout=0.1,
)


In [24]:
gpt(token_tensors).shape


torch.Size([2, 4, 8])

## Patchify

In [25]:
images = torch.randn(2, 1, 28, 28)

In [26]:
conv = nn.Conv2d(1, 768, 4, 4)

In [27]:
conv(images).shape

torch.Size([2, 768, 7, 7])

In [28]:
images.unfold(2, 4, 4).shape

torch.Size([2, 1, 7, 28, 4])

In [29]:
images.unfold(3, 4, 4).shape

torch.Size([2, 1, 28, 7, 4])

In [30]:
patches = images.unfold(2, 4, 4).unfold(3, 4, 4)

In [31]:
patches.shape

torch.Size([2, 1, 7, 7, 4, 4])

In [32]:
patches = patches.contiguous().view(2, 1, 7, 7, -1)

In [33]:
patches.shape


torch.Size([2, 1, 7, 7, 16])

In [34]:
flatten = nn.Flatten(2, 3)


In [35]:
flatten(patches).shape


torch.Size([2, 1, 49, 16])

In [36]:
images.shape


torch.Size([2, 1, 28, 28])

In [56]:
class PatchEncoding(nn.Module):
    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size
        self.flatten = nn.Flatten(1, 2)

    def forward(self, images):
        B, C, H, W = images.shape

        assert (
            H % self.patch_size == 0
        ), f"Image size ({H}X{W}) should be divisible by Patch size: {self.patch_size}"

        num_patches = H // self.patch_size

        patches = images.unfold(2, self.patch_size, self.patch_size).unfold(
            3, self.patch_size, self.patch_size
        )
        patches = patches.contiguous().view(B, num_patches, num_patches, -1)
        vectors = self.flatten(patches)
        return vectors

In [69]:
patcher = PatchEncoding(16)


In [70]:
images = torch.randn(2, 3, 224, 224)
images.shape  # 2, 49, 16


torch.Size([2, 3, 224, 224])

In [71]:
patcher(images).shape


torch.Size([2, 196, 768])

## VIT

In [81]:
class VIT(nn.Module):
    def __init__(
        self,
        img_size,
        patch_size,
        d_model,
        n_heads,
        n_layers,
        input_channels,
        num_classes,
        d_ff,
        dropout,
    ):
        super().__init__()

        num_patches = img_size // patch_size
        self.class_embedding = nn.Parameter(
            data=torch.randn(1, 1, d_model), requires_grad=True
        )

        self.pe = PositionalEncoding(d_model=d_model, max_len=num_patches**2 + 1)
        self.embedding_dropout = nn.Dropout(dropout)

        self.patch_embedding = PatchEncoding(patch_size=patch_size)

        self.linear_mapper = nn.Linear(
            input_channels * patch_size * patch_size, d_model
        )

        self.encoder_layers = nn.ModuleList(
            [
                Encoder(d_model=d_model, num_heads=n_heads, d_ff=d_ff, dropout=dropout)
                for _ in range(n_layers)
            ]
        )

        self.lm_head = nn.Linear(d_model, num_classes)

    def forward(self, x):
        batch_size = x.shape[0]
        class_tokens = self.class_embedding.expand(batch_size, -1, -1)
        x = self.patch_embedding(x)
        x = self.linear_mapper(x)
        x = torch.cat((class_tokens, x), dim=1)

        x = self.pe(x)

        x = self.embedding_dropout(x)

        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, None)

        x = self.lm_head(x[:, 0])
        return x


In [82]:
vit = VIT(
    img_size=28,
    patch_size=4,
    d_model=768,
    n_heads=12,
    input_channels=1,
    n_layers=6,
    num_classes=10,
    d_ff=3072,
    dropout=0.1,
)

In [83]:
images = torch.randn(2, 1, 28, 28)


In [85]:
vit(images).shape


torch.Size([2, 10])

In [132]:
decoder = Decoder(d_model=10, num_heads=5, d_ff=2048, dropout=0.1)

In [109]:
mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=5)

ln1 = nn.LayerNorm(normalized_shape=d_model)
ln2 = nn.LayerNorm(normalized_shape=d_model)

feedforward = PositionwiseFeedForward(d_model=d_model, d_ff=2048, dropout=0.1)


In [113]:
ln1(embedding_tokens).shape

torch.Size([2, 4, 10])

In [134]:
decoder(embedding_tokens).shape

torch.Size([2, 4, 10])

In [124]:
mha(embedding_tokens, embedding_tokens, embedding_tokens, need_weights=False)[0]

tensor([[[ 1.3566,  1.3481,  0.3900, -0.2565,  0.6645,  0.2565,  1.6117,
          -0.0749, -1.5442, -0.2403],
         [ 1.0730,  2.3211,  0.8189, -0.9364,  0.2452, -0.5776,  1.4549,
          -0.3478, -0.8684, -1.4045],
         [-0.3467,  1.6977, -2.1283,  0.7379,  3.0187,  0.6735, -1.0704,
          -0.4979, -0.7778,  1.6215],
         [ 0.6983, -1.6496,  0.8389, -0.8093,  2.5265, -1.9784,  1.1612,
          -4.4373, -0.5092, -0.8990]],

        [[-0.3854,  0.2776, -0.0261, -0.3913,  1.2209, -0.0146,  0.8130,
           0.1094, -0.7632,  0.6077],
         [ 0.7142,  1.6135,  1.3742, -0.9422, -0.2616, -0.8659,  1.6678,
           0.3526, -1.2527, -1.1231],
         [ 1.5792, -0.2583, -2.6291,  1.8322,  0.4059,  0.6565, -1.2448,
           1.1136,  2.2788,  0.0665],
         [ 1.2114, -0.5534,  1.3787, -1.0389,  1.8711, -2.4168,  2.0970,
          -3.5441, -0.5248, -1.8513]]], grad_fn=<ViewBackward0>)

In [77]:
embedding_tokens.shape

torch.Size([2, 4, 10])

In [86]:
positional_embedding = PositionalEncoding(10, 4)

In [88]:
positional_embedding(embedding_tokens).shape


torch.Size([2, 4, 10])

In [50]:
positions = torch.arange(0, 4).unsqueeze(1)

In [57]:
div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))

div_term


tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04])

In [58]:
positions

tensor([[0],
        [1],
        [2],
        [3]])

In [59]:
div_term

tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04])

In [60]:
positions * div_term

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04],
        [2.0000e+00, 3.1698e-01, 5.0238e-02, 7.9621e-03, 1.2619e-03],
        [3.0000e+00, 4.7547e-01, 7.5357e-02, 1.1943e-02, 1.8929e-03]])

In [61]:
pe = torch.zeros(4, 10)

In [62]:
pe[:, 0::2] = torch.sin(positions * div_term)

In [63]:
pe[:, 1::2] = torch.cos(positions * div_term)

In [64]:
pe

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  1.5783e-01,  9.8747e-01,  2.5116e-02,
          9.9968e-01,  3.9811e-03,  9.9999e-01,  6.3096e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  3.1170e-01,  9.5018e-01,  5.0217e-02,
          9.9874e-01,  7.9621e-03,  9.9997e-01,  1.2619e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  4.5775e-01,  8.8908e-01,  7.5285e-02,
          9.9716e-01,  1.1943e-02,  9.9993e-01,  1.8929e-03,  1.0000e+00]])

In [66]:
pe = pe.unsqueeze(0)

In [67]:
pe.shape

torch.Size([1, 4, 10])

In [71]:
(embedding_tokens + pe[:, : embedding_tokens.size(1)]).shape


torch.Size([2, 4, 10])

In [73]:
pe[:, : embedding_tokens.size(1)].shape


torch.Size([1, 4, 10])

In [74]:
pe.shape

torch.Size([1, 4, 10])

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import hydra
from hydra import initialize, compose

import torch
import matplotlib.pyplot as plt
import lightning as L
from lightning.pytorch.tuner import Tuner
from lightning.pytorch.callbacks import ModelSummary, ModelCheckpoint

In [5]:
from src.data.bert_data import BertDataModule
from src.data.gpt_data import GPTDataModule
from src.data.vit_data import ViTDataModule
from src.models.transformer import Transformer


In [4]:
try:
    initialize(version_base="1.3", config_path=".", job_name="all")
except ValueError:
    hydra.core.global_hydra.GlobalHydra.instance().clear()
    initialize(version_base="1.3", config_path=".", job_name="all")

cfg = compose(config_name="config")


In [5]:
cfg.bert.data

'/home/ravi.naik/learning/era/s17/s17lit/data/bert'

In [6]:
datamodule = BertDataModule(
    trainpth=f"{cfg.bert.data}/training.txt", vocabpth=f"{cfg.bert.data}/vocab.txt"
)
datamodule.setup()


In [7]:
datamodule.dataset[0]["input"]


tensor([   29,    15,     3, 14963,   446,  3423, 23947, 23947,   435,   468,
        23947,     4,    15,  3670,    13, 11729,    21,   144,   477,     0])

In [8]:
model = Transformer(
    arch="bert",
    n_layers=8,
    n_heads=8,
    embedding_dim=128,
    dim_feedforward=512,
    n_embeddings=40000,
    seq_len=20,
    ignore_index=None,
    rvocab=None,
)

In [15]:
output = model.forward(datamodule.dataset[0]["input"].unsqueeze(0))

In [16]:
output.shape

torch.Size([1, 20, 40000])

In [18]:
masked_target = datamodule.dataset[0]["target"]

In [19]:
output_v = output.view(-1, output.shape[-1])
target_v = masked_target.view(-1, 1).squeeze()


In [21]:
output_v.shape, target_v.shape

(torch.Size([20, 40000]), torch.Size([20]))

In [22]:
from torch import nn

In [23]:
loss_model = nn.CrossEntropyLoss()
loss = loss_model(output_v, target_v)


In [24]:
loss

tensor(10.1337, grad_fn=<NllLossBackward0>)

In [6]:
datamodule = GPTDataModule(f"{cfg.gpt.data}/english.txt")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


In [7]:
datamodule.setup()


In [10]:
datamodule.train_dataset[0][1].shape


torch.Size([64])

In [14]:
model = Transformer(
    arch="gpt",
    n_layers=8,
    n_heads=8,
    embedding_dim=128,
    dim_feedforward=512,
    n_embeddings=40000,
    seq_len=64,
    ignore_index=None,
    rvocab=None,
)


In [15]:
X, y = datamodule.train_dataset[0]


In [16]:
model.forward(X)


TypeError: TransformerDecoder.forward() missing 1 required positional argument: 'memory'