In [44]:
!pip install datasets torchvision sentencepiece

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [45]:
import torch
from torch import nn
from torch.nn import functional as F
from typing import Tuple

from torch.utils.data import DataLoader
from torchvision import transforms
from datasets import load_dataset
from PIL import Image
import sentencepiece as spm
from tqdm import tqdm
import numpy as np
import re

## Converting Image to a Sequence of Patches

In [46]:
class PatchEmbeddings(nn.Module):
    def __init__(
        self, img_size: int = 224, patch_size: int = 16, hidden_dim: int = 512
    ) -> None:
        super().__init__()
        # Store the input image size, the patch size and hidden dimension
        self.img_size = img_size
        self.patch_size = patch_size
        self.hidden_dim = hidden_dim

        # Calculate the total number of patches
        self.num_patches = (self.img_size // self.patch_size) ** 2

        # Create a convolution to extract patch embeddings
        # in_channels=3 asummes a 3-channel image (RGB)
        # outp_channels=hidden_dim sets the number of output channels to match the hidden dimension
        # kernel_size=patch_size and stride=patch_size ensuring each patch is embedded separately
        self.conv = nn.Conv2d(
            in_channels=3,
            out_channels=self.hidden_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        X = self.conv(X)

        # Flatten the spatial dimensions (height and width) of the patch embeddings
        # This step flattens the patch dimensions to a single dimension
        # Output shape: (batch_size, hidden_dim, self.num_patches)
        X = X.flatten(2)

        # Transpose the dimensions to obtain the shape (batch_size, num_patches, hidden_dim)
        # This step brings the num_patches dimension to the second position
        # Output shape: (batch_size, self.num_patches, hidden_dim)
        X = X.transpose(1, 2)

        return X

In [47]:
B, C, H, W = 128, 3, 224, 224  # Batch size, Channels, Height, Width
X = torch.randn(B, C, H, W)

patch_size = 16
hidden_dim = 512

patch_embeddings = PatchEmbeddings(
    img_size=H, patch_size=patch_size, hidden_dim=hidden_dim
)
patches = patch_embeddings(X)
print(f"Shape of image patches: {patches.shape}")

Shape of image patches: torch.Size([128, 196, 512])


In [48]:
num_patches = (H // patch_size) ** 2
assert patches.shape == (B, num_patches, hidden_dim), "Output shape is incorrect"
print("Test passed!")

Test passed!


## Attention Mechanism
Attention Mechanism across both the vision encoder and language decoder

### The implementation of the Attention Head

In [49]:
class Head(nn.Module):
    def __init__(
        self,
        n_embed: int,
        head_size: int,
        dropout: float = 0.1,
        is_decoder: bool = False,
    ) -> None:
        super().__init__()

        # Linear layer for Key projection
        self.key = nn.Linear(in_features=n_embed, out_features=head_size, bias=False)

        # Linear layer for Query projection
        self.query = nn.Linear(in_features=n_embed, out_features=head_size, bias=False)

        # Linear layer for Value projection
        self.value = nn.Linear(in_features=n_embed, out_features=head_size, bias=False)

        # Dropout layer for regularization to prevent overfitting
        self.dropout = nn.Dropout(p=dropout)

        # Flag indicating wheter the head is used as a decoder
        self.is_decoder = is_decoder

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Get batch size (B), sequence length (T), and embedding dimension (C) from the input tensor
        B, T, C = x.shape

        # Compute Key, Query, and Value projections
        k = self.key(x)  # Shape: (B, T, head_size)
        q = self.query(x)  # Shape: (B, T, head_size)
        v = self.value(x)  # SHape: (B, T, head_size)
        wei = q @ k.transpose(-2, -1) * (C**-0.5)  # Shape: (B, T, T)

        if self.is_decoder:
            tril = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
            wei = wei.masked_fill(mask=tril == 0, value=float("-inf"))

        wei = F.softmax(input=wei, dim=-1)  # Shape: (B, T, T)

        # Apply Dropout to the attention probabilities for regularization
        wei = self.dropout(wei)

        # Perform a weighted aggregation of values using the attention probabilities
        out = wei @ v  # Shape: (B, T, head_size)

        return out

In [50]:
B, T, C = patches.shape  # Batch size, Sequence length, Embedding dimension
head_size = 16  # Size of the attention head

head = Head(n_embed=C, head_size=head_size)
output = head(patches)
print(f"Shape of output tensor: {output.shape}")

Shape of output tensor: torch.Size([128, 196, 16])


In [51]:
assert output.shape == (B, T, head_size), "Output shape is incorrect"
print("Test passed!")

Test passed!


### The implementation of Multihead Attention

In [52]:
class MultiHeadAttention(nn.Module):
    def __init__(
        self,
        n_embed: int,
        num_heads: int,
        dropout: float = 0.1,
        is_decoder: bool = False,
    ) -> None:
        super().__init__()

        # Ensure that the embedding dimension is divisible by the number of heads
        assert n_embed % num_heads == 0, "n_embed must be divisible by num_heads!"

        # Create a ModuleList of attention heads
        self.heads = nn.ModuleList(
            modules=[
                Head(
                    n_embed=n_embed,
                    head_size=n_embed // num_heads,
                    dropout=dropout,
                    is_decoder=is_decoder,
                )
                for _ in range(num_heads)
            ]
        )

        # Linear layer for projecting the concatenated head outputs
        self.proj = nn.Linear(in_features=n_embed, out_features=n_embed)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Apply each attention head to the input tensor
        head_outputs = [
            h(x) for h in self.heads
        ]  # Shape: num_heads * (B, T, head_size)

        # Concatenate the outputs from all heads along the last dimension
        out = torch.cat(tensors=head_outputs, dim=-1)  # Shape: (B, T, m_embed)

        # Apply the projection layer to the concatenated outputs
        out = self.proj(out)  # Shape: (B, T, m_embed)

        # Apply Dropout to the projected outputs for regularization
        out = self.dropout(out)

        return out

In [53]:
num_heads = 2
dropout = 0.1
mha = MultiHeadAttention(n_embed=C, num_heads=num_heads, dropout=dropout)

In [54]:
output = mha(patches)
print(f"Shape of output tensor: {output.shape}")

Shape of output tensor: torch.Size([128, 196, 512])


In [55]:
assert output.shape == (B, T, C), "Output shape is incorrect"
print("Test passed!")

Test passed!


### The Multilayer Perceptron

In [56]:
class MLP(nn.Module):
    def __init__(
        self, n_embed: int, dropout: float = 0.1, is_decoder: bool = False
    ) -> None:
        super().__init__()

        # Define the layers of the MLP
        layers = [
            # First linear layer that expands the input dimension from n_embed to 4 * n_embed
            nn.Linear(in_features=n_embed, out_features=4 * n_embed),
            # Activation function: ReLU if is_decoder is True, else GELU
            nn.ReLU() if is_decoder else nn.GELU(),
            # Second linear layer that projects the intermediate dimension back to n_embed
            nn.Linear(in_features=4 * n_embed, out_features=n_embed),
            # Dropout layer for regularization
            nn.Dropout(p=dropout),
        ]

        # Create the MLP as a sequence of layers
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pass the input through the MLP layers
        return self.net(x)

In [57]:
dropout = 0.1
mlp = MLP(n_embed=C, dropout=dropout)

In [58]:
output = mlp(output)  # Previous output of the Multihead Attention
print(f"Shape of output tensor: {output.shape}")

Shape of output tensor: torch.Size([128, 196, 512])


In [59]:
assert output.shape == (B, T, C), "Output shape is incorrect"
print("Test passed!")

Test passed!


### Transformer Blocks

In [60]:
class Block(nn.Module):
    def __init__(
        self,
        n_embed: int,
        num_heads: int,
        dropout: float = 0.1,
        is_decoder: bool = False,
    ) -> None:
        super().__init__()

        # Layer normalization for the input to the attention layer
        self.ln1 = nn.LayerNorm(normalized_shape=n_embed)

        # Multi-head attention module
        self.mhattn = MultiHeadAttention(
            n_embed=n_embed, num_heads=num_heads, dropout=dropout, is_decoder=is_decoder
        )

        # Layer normalization for the input to the FFN
        self.ln2 = nn.LayerNorm(normalized_shape=n_embed)

        # Feed-forward neural network (FFN)
        self.ffn = nn.Sequential(
            nn.Linear(in_features=n_embed, out_features=4 * n_embed),
            nn.GELU(),  # Activation function
            nn.Linear(
                in_features=4 * n_embed, out_features=n_embed
            ),  # Projection back to the original dimension
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Saving the input for residual connection
        original_x = x

        # Apply layer normalization to the input
        x = self.ln1(x)

        # Apply multi-head attention
        mhattn_output = self.mhattn(x)

        # Add the residual connection (original input) to the attention output
        x = original_x + mhattn_output

        # Apply later normalization to the input to the FFN
        x = self.ln2(x)

        # Apply the FFN
        ffn_output = self.ffn(x)

        # Apply the residual connection (input to the FFN) to the FFN output
        x = x + ffn_output

        return x

In [61]:
num_heads = 2
dropout = 0.1
block = Block(n_embed=C, num_heads=num_heads, dropout=dropout)

In [62]:
output = block(patches)
print(f"Shape of output tensor: {output.shape}")

Shape of output tensor: torch.Size([128, 196, 512])


In [63]:
assert output.shape == (B, T, C), "Output shape is incorrect"
print("Test passed!")

Test passed!


## Vision Encoder - Vision Transformer (ViT)

Combining patchification logic and attention block in to ViT

In [64]:
class ViT(nn.Module):
    def __init__(
        self,
        img_size: int,
        patch_size: int,
        num_hiddens: int,
        num_heads: int,
        num_blocks: int,
        emb_dropout: float,
        block_dropout: float,
    ) -> None:
        super().__init__()

        # Patch embedding layer to convert the input image into patches
        self.patch_embedding = PatchEmbeddings(
            img_size=img_size, patch_size=patch_size, hidden_dim=num_hiddens
        )

        # Learnable classification token
        self.cls_token = nn.Parameter(data=torch.zeros(size=(1, 1, num_hiddens)))

        # Calculate the number of patches
        num_patches = (img_size // patch_size) ** 2

        # Learnable position embedding
        self.pos_embedding = nn.Parameter(
            data=torch.randn(size=(1, num_patches + 1, num_hiddens))
        )

        # Dropout layer for the embeddings
        self.dropout = nn.Dropout(p=emb_dropout)

        # Stack of transformer blocks
        self.blocks = nn.ModuleList(
            [
                Block(
                    n_embed=num_hiddens,
                    num_heads=num_heads,
                    dropout=block_dropout,
                    is_decoder=False,
                )
                for _ in range(num_blocks)
            ]
        )

        # Layer normalization for the final representation
        self.layer_norm = nn.LayerNorm(normalized_shape=num_hiddens)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        # Convert the input image into patch embeddings
        x = self.patch_embedding(X)  # Shape: (B, num_patches, num_hiddens)

        # Expand the classification token to match the batch size
        cls_tokens = self.cls_token.expand(
            x.shape[0], -1, -1
        )  # Shape: (B, 1, num_hiddens)

        # Concatenate the classification token with the patch embeddings
        x = torch.cat(
            tensors=(cls_tokens, x), dim=1
        )  # Shape: (B, num_patches + 1, num_hiddens)

        # Add the position embedding to the patch embeddings
        x += self.pos_embedding  # Shape: (B, num_patches + 1, num_hiddens)

        # Apply dropout to the embeddings
        x = self.dropout(x)  # Shape: (B, num_patches + 1, num_hiddens)

        # Pass the embeddings through the transformer blocks
        for block in self.blocks:
            x = block(x)  # Shape: (B, num_patches + 1, num_hiddens)

        # Apply layer normalization to the `[CLS]` token's final representation
        x = self.layer_norm(x[:, 0])  # Shape: (B, num_hiddens)

        return x

In [65]:
B, C, H, W = 2, 3, 224, 224  # Batch size, Channels, Height, Width
XX = torch.randn(B, C, H, W)
vit = ViT(
    img_size=H,
    patch_size=16,
    num_hiddens=64,
    num_heads=2,
    num_blocks=2,
    emb_dropout=0.1,
    block_dropout=0.1,
)

In [66]:
output_image = vit(XX)
print(f"Output shape: {output_image.shape}")

Output shape: torch.Size([2, 64])


In [67]:
assert output_image.shape == (B, 64), "Output shape is incorrect"
print("Test passed!")

Test passed!


## Vision-Language Projection Module

Unfortunatelly, we can't directly concatenate ViT output to the text embeddings. <br>
We need to project this from dimensionality of image embeddings from the vision transformer to the dimensionality of text embeddings.

Why MLP for this part? If you want to train VLM with low resources you can do so by keeping both the pretrained vision encoder and language decoder frozen during the VLM training. Therefore, allocating more parameters to the connection module could enhance the overall VLM's ability to generalize and help in the downstream instruction-tuning process.

In [68]:
class MultiModalProjector(nn.Module):
    def __init__(
        self,
        n_embed: int,
        img_embed_dim: int,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()

        # Define the projection network
        self.net = nn.Sequential(
            # Linear layer to expand the image embedding dimension
            nn.Linear(in_features=img_embed_dim, out_features=4 * img_embed_dim),
            # GELU activation function
            nn.GELU(),
            # Linear layer to project the expanded image embeddings to the text embedding dimension
            nn.Linear(in_features=4 * img_embed_dim, out_features=n_embed),
            # Dropout layer for regularization
            nn.Dropout(p=dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pass the input through the projection network
        x = self.net(x)  # Shape: (B, img_embed_dim) --> (B, n_embed)
        return x

In [69]:
B, n_embed, img_embed_dim = 2, 64, 128
X = torch.randn(size=(B, img_embed_dim))

projector = MultiModalProjector(
    n_embed=n_embed, img_embed_dim=img_embed_dim, dropout=0.1
)

In [70]:
output = projector(X)
print(f"Output shape: {output.shape}")

Output shape: torch.Size([2, 64])


In [71]:
assert output.shape == (B, n_embed), "Output shape is incorrect"
print("Test passed!")

Test passed!


## Building the Decoder Language Model

Only thing that deviates from origianl implementation is that here projection module is integrated into decoder model class. <br>
In contrary, when using pretrained models with HuggingFace (or any other library), you can directly feed embeddings as input to the model.

In [81]:
class DecoderLanguageModel(nn.Module):
    def __init__(
        self,
        n_embed: int,
        img_embed_dim: int,
        vocab_size: int,
        num_heads: int,
        n_layer: int,
        num_labels: int,
        
    ) -> None:
        super().__init__()

        # self.use_images = use_images

        # Token embedding table
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=n_embed
        )

        # Position embedding table
        self.position_embedding_table = nn.Embedding(
            num_embeddings=1000, embedding_dim=n_embed
        )

            # Image projection layer to align image embeddings with text embeddings
        self.image_projection = MultiModalProjector(
            n_embed=n_embed, img_embed_dim=img_embed_dim
        )

        # Stack of transformer decoder blocks
        self.blocks = nn.Sequential(
            *[
                Block(n_embed=n_embed, num_heads=num_heads, is_decoder=True)
                for _ in range(n_layer)
            ]
        )

        # # Final layer normalization
        # self.ln_f = nn.LayerNorm(normalized_shape=n_embed)
        
        # # Language modeling head
        # self.lm_head = nn.Linear(in_features=n_embed, out_features=num_labels)
        
        # self.pool =  # Global averaging
        self.ln_f = nn.LayerNorm(n_embed)
        # Output layer: input is flattened (seq_len * embed)
        self.lm_head = nn.Linear(n_embed*257, num_labels)

    def forward(
        self,
        idx: torch.Tensor,
        img_embeds: torch.Tensor = None,
        targets: torch.Tensor = None,
        use_images: bool = False,
    ) -> torch.Tensor:
        # Get token embeddings from the input indices
        tok_emb = self.token_embedding_table(idx)

        if use_images:
            # Project and concatenate image embeddings with token embeddings
            img_emb = self.image_projection(img_embeds).unsqueeze(1)
            tok_emb = torch.cat([img_emb, tok_emb], dim=1)

        # Get position embeddings
        pos_emb = self.position_embedding_table(
            torch.arange(tok_emb.size(1), device=idx.device)
        )

        # Add position embeddings to token embeddings
        x = tok_emb + pos_emb

        # Pass through the transformer decoder blocks
        x = self.blocks(x)

        # Apply final layer normalization
        x = self.ln_f(x)
        
        
        x = x.flatten(start_dim=1)
        logits = self.lm_head(x)    # (batch, num_labels)
        probs = torch.softmax(logits, dim=-1)

        if targets is not None:
        #     if use_images and img_embeds is not None:
        #         # Prepare targets by concatenating a dummy target for the image embedding
        #         batch_size = idx.size(0)
        #         targets = torch.cat(
        #             [
        #                 torch.full(
        #                     (batch_size, 1), -100, dtype=torch.long, device=idx.device
        #                 ),
        #                 targets,
        #             ],
        #             dim=1,
        #         )
            # Compute the cross-entropy loss
            loss = F.cross_entropy(
                input=probs,
                target=targets,
                # ignore_index=-100,
            )
            return probs, loss

        return probs


Testing

In [82]:
n_embed, img_embed_dim, vocab_size, num_heads, n_layer = 128, 256, 1000, 8, 6
# `n_layer` is used to represent number of decoder transformer blocks and num_blocks for the vision encoder to avoid confusion
device = 'cpu'
model = DecoderLanguageModel(
    n_embed=n_embed,
    img_embed_dim=img_embed_dim,
    vocab_size=vocab_size,
    num_heads=num_heads,
    n_layer=n_layer,
    # use_images=True,
    num_labels= 4
)


# Dummy input
B, T = 10, 256
idx = torch.randint(low=0, high=vocab_size, size=(B, T)).to(device)
image_embeds = torch.randn(B, 256).to(device)  # Assume img_embed_dim is 256

targets = torch.randint(0, 4, (B,)).to(device)

# targets = None
# Test forward pass
# Check if you need to calculate loss by providing targets
if targets is not None:
    logits, loss = model(idx, image_embeds, targets, True)
    print(f"Logits shape: {logits.shape}, Loss: {loss}")
    print(logits)
else:
    logits = model(idx, image_embeds, True)  # Call without targets
    print(f"Logits shape: {logits.shape}")
    print(logits)

# # Test generation
# generated = model.generate(idx, image_embeds, max_new_tokens=20)
# print(f"Generated sequence shape: {generated.shape}")

Logits shape: torch.Size([10, 4]), Loss: 1.3475735187530518
tensor([[0.2271, 0.1391, 0.1618, 0.4719],
        [0.1752, 0.2080, 0.1337, 0.4831],
        [0.2805, 0.2453, 0.1593, 0.3149],
        [0.2415, 0.4050, 0.1362, 0.2173],
        [0.1907, 0.2563, 0.1449, 0.4080],
        [0.3991, 0.1554, 0.1956, 0.2499],
        [0.1089, 0.3734, 0.1058, 0.4119],
        [0.2338, 0.2527, 0.1154, 0.3981],
        [0.4718, 0.2302, 0.1137, 0.1843],
        [0.3386, 0.1819, 0.1926, 0.2868]], grad_fn=<SoftmaxBackward0>)


## Putting everything together: Simple Vision Language Model

In [83]:
class VisionLanguageModel(nn.Module):
    def __init__(
        self,
        n_embed: int,
        img_embed_dim: int,
        vocab_size: int,
        n_layer: int,
        img_size: int,
        patch_size: int,
        num_heads: int,
        num_blocks: int,
        emb_dropout: float,
        block_dropout: float,
        num_labels : int,
    ) -> None:
        super().__init__()

        # Set num_hiddens equal to img_embed_dim
        num_hiddens = img_embed_dim

        # Assert that num_hiddens is divisible by num_heads
        assert num_hiddens % num_heads == 0, ValueError(
            "num_hiddens must be divisible by num_heads!"
        )

        # Initialize the Vision Transformer (ViT) encoder
        self.vision_encoder = ViT(
            img_size=img_size,
            patch_size=patch_size,
            num_hiddens=num_hiddens,
            num_heads=num_heads,
            num_blocks=num_blocks,
            emb_dropout=emb_dropout,
            block_dropout=block_dropout,
        )

        # Initialize the Language Model Decoder (DecoderLanguageModel)
        self.decoder = DecoderLanguageModel(
            n_embed=n_embed,
            img_embed_dim=img_embed_dim,
            vocab_size=vocab_size,
            num_heads=num_heads,
            n_layer=n_layer,
            # use_images=True,
            num_labels= num_labels
        )

    def _check_image_embeddings(self, image_embeds: torch.Tensor) -> None:
        """Chek if image embeddings are valid."""
        if image_embeds.nelement() == 0 or image_embeds.shape[1] == 0:
            raise ValueError(
                "Something is wrong with the ViT model. It's returning an empty tensor or the embedding dimension is empty."
            )

    def forward(
        self, img_array: torch.Tensor, idx: torch.Tensor, targets: torch.Tensor = None, use_images: bool = False
    ) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor]:
        # Get the image embeddings from the Vision Encoder
        image_embeds = self.vision_encoder(img_array)

        # Check if image embeddings are valid
        self._check_image_embeddings(image_embeds)

        if targets is not None:
            # If targets are provided, compute the logits and loss
            logits, loss = self.decoder(idx, image_embeds, targets, use_images = use_images)
            return logits, loss
        else:
            # If targets are not provided, compute only the logits
            logits = self.decoder(idx, image_embeds, use_images = use_images)
            return logits

    # def generate(
    #     self, img_array: torch.Tensor, idx: torch.Tensor, max_new_tokens: int
    # ) -> torch.Tensor:
    #     # Get the image embeddings from the Vision Encoder
    #     image_embeds = self.vision_encoder(img_array)

    #     # Check if image embeddings are valid
    #     self._check_image_embeddings(image_embeds)

    #     # Generate new tokens using the Language Model Decoder
    #     generated_tokens = self.decoder.generate(
    #         idx=idx, img_embeds=image_embeds, max_new_tokens=max_new_tokens
    #     )
    #     return generated_tokens

Testing

In [86]:
n_embed, num_hiddens, vocab_size, num_heads, n_layer = 128, 512, 1000, 8, 8
image_embed_dim = num_hiddens
img_size = 224
patch_size = 16
num_blocks = 2
device = 'cpu'

n_layer, block_size, num_hiddens = 8, 32, 512

model = VisionLanguageModel(
    n_embed=n_embed,
    img_embed_dim=image_embed_dim,
    vocab_size=vocab_size,
    n_layer=n_layer,
    img_size=img_size,
    patch_size=patch_size,
    num_heads=num_heads,
    num_blocks=num_blocks,
    emb_dropout=0.1,
    block_dropout=0.1,
    num_labels = 4
)

dummy_img = torch.randn(16, 3, img_size, img_size).to(
    device
)  

dummy_idx = torch.randint(0, vocab_size, (16, 256)).to(
    device
)  

try:
    output = model(dummy_img, dummy_idx, use_images = True)  # Output for debugging
    print(output)
    predictions = torch.argmax(output, dim=-1)
    print("Output from initialization forward pass:", predictions)
except RuntimeError as e:
    print(f"Runtime Error during forward pass: {str(e)}")
    print("Check layer configurations and input shapes.")

tensor([[0.0905, 0.2780, 0.2719, 0.3597],
        [0.1561, 0.0926, 0.2483, 0.5030],
        [0.1294, 0.3380, 0.3448, 0.1878],
        [0.1849, 0.1907, 0.3336, 0.2908],
        [0.0865, 0.2823, 0.1952, 0.4360],
        [0.1445, 0.2229, 0.4012, 0.2315],
        [0.1664, 0.2331, 0.2865, 0.3139],
        [0.0610, 0.2035, 0.2895, 0.4460],
        [0.2309, 0.1821, 0.2397, 0.3473],
        [0.1294, 0.1271, 0.2939, 0.4496],
        [0.2071, 0.2833, 0.2012, 0.3084],
        [0.0940, 0.4006, 0.2629, 0.2424],
        [0.1859, 0.4425, 0.0983, 0.2733],
        [0.2073, 0.2840, 0.3508, 0.1579],
        [0.0700, 0.4396, 0.2003, 0.2900],
        [0.2860, 0.2504, 0.2210, 0.2425]], grad_fn=<SoftmaxBackward0>)
Output from initialization forward pass: tensor([3, 3, 2, 2, 3, 2, 3, 3, 3, 3, 3, 1, 1, 2, 1, 0])


In [None]:
print(output.shape)

torch.Size([16, 4])


## Train

In [None]:
# assert 1 == 0

In [87]:
# Load tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 256  # gi·ªõi h·∫°n token

In [88]:
# Image preprocessing
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [89]:
# Load dataset
dataset = load_dataset("HuggingFaceM4/the_cauldron", "ai2d", split="train")

# L·ªçc nh·ªØng entry c√≥ ·∫£nh v√† text h·ª£p l·ªá
dataset = dataset.filter(lambda x: x["images"] and x["texts"] and "user" in x["texts"][0])

# # L·∫•y 100 sample ƒë·∫ßu ti√™n, l·∫•y full dataset th√¨ b·ªè qua d√≤ng n√†y
# dataset = dataset.select(range(500))

In [None]:
# dataset.map(lambda x: {'image': image_transform(x['images'][0])})

In [None]:
print(image_transform(dataset['images'][0][0]))

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])


In [None]:
dataset['images'][0]

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=299x227>]

In [90]:
list_map = ['A', 'B', 'C', 'D']
# pad_id = tokenizer.pad_id() if tokenizer.pad_id() >= 0 else 0
max_len = 256
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
dataset = dataset.map(lambda x: {'targets': list_map.index(re.search(r"Answer:\s*([A-Z])",x['texts'][0]['assistant']).group(1)),
                                 'input': 
                                     tokenizer(
                                         re.sub(r'Answer with the letter\.', 'Answer with the number only.'
                                                           ,  re.sub(r'\b([A-D])\.', lambda m: f"{ord(m.group(1)) - ord('A')}.", x['texts'][0]['user']))
                                                    , truncation=True, padding='max_length', max_length=max_len)['input_ids']
                                                    ,
                                 'image': image_transform(x['images'][0])
                                 })
# assert 1 == 0

In [91]:
dataset

Dataset({
    features: ['images', 'texts', 'targets', 'input', 'image'],
    num_rows: 2434
})

In [None]:
# inputs = sum(dataset.map(lambda x: {
#     'input': list(
#         map(
#             lambda t: list_map.index(re.search(r"Answer:\s*([A-Z])", t['assistant']).group(1)),
#             x['texts']
#         )
#     )
# })['input'], [])
# print(inputs)

In [None]:
# # Preprocessing function
# def preprocess(example):
#     img_data = example["images"][0]

#     # ƒê·∫£m b·∫£o ·∫£nh l√† PIL.Image
#     if isinstance(img_data, Image.Image):
#         img = img_data.convert("RGB")
#     elif isinstance(img_data, np.ndarray):
#         img = Image.fromarray(img_data).convert("RGB")
#     else:
#         raise ValueError(f"Unsupported image format: {type(img_data)}")

#     # Transform ƒë·ªÉ ra Tensor
#     image = image_transform(img)  # Tensor (3, 224, 224)

#     # Tokenize prompt v√† target
#     prompt = example["texts"][0]["user"]
#     target = example["texts"][0].get("assistant", "")

#     # full_input = prompt + "\n" + target if target else prompt

#     pad_id = tokenizer.pad_id() if tokenizer.pad_id() >= 0 else 0
#     tokens = tokenizer.encode(prompt)
#     tokens = tokens[:max_len]
#     tokens += [pad_id] * (max_len - len(tokens))
#     input_ids = torch.tensor(tokens, dtype=torch.long)

#     # Tokenize target
#     if target:
#         target_tokens = tokenizer.encode(target)
#         target_tokens = target_tokens[:max_len]
#         target_tokens += [pad_id] * (max_len - len(target_tokens))
#         target_ids = torch.tensor(target_tokens, dtype=torch.long)
#     else:
#         target_ids = torch.full_like(input_ids, fill_value=pad_id)

#     return {
#         "image": image,
#         "input_ids": input_ids,
#         "target_ids": target_ids
#     }

# # Apply preprocessing
# # print(preprocess(dataset[0]))
# dataset = dataset.map(preprocess)

In [92]:
# B·∫Øt bu·ªôc ƒë·ªÉ gi·ªØ tensor thay v√¨ list!
dataset.set_format(type="torch")

In [None]:
dataset['input'][0]

tensor([  101,  3160,  1024,  2054,  2079, 24501, 16781,  1998, 16513,  2507,
         2041,  9804,  1024,  1014,  1012,  7722,  1015,  1012,  6351, 14384,
         1016,  1012, 14114,  1017,  1012,  3684,  3437,  2007,  1996,  2193,
         2069,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
# Ki·ªÉm tra xem data c√≥ ƒë√∫ng tensor ko
example = dataset[0]
print(type(example['images']))          # <class 'torch.Tensor'>
print(example['image'])          # torch.Size([3, 224, 224])
print(example['input'])      # torch.Size([256])
print(example['targets'])

<class 'torch.Tensor'>
tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])
tensor([  101,  3160,  1024,  2054,  2079, 24501, 16781,  1998, 16513,  2507,
         2041,  9804,  1024,  1014,  1012,  7722,  1015,  1012,  6351, 14384,
         1016,  1012, 14114,  1017,

In [93]:
def collate_fn(batch):
    # ƒê·∫£m b·∫£o chuy·ªÉn v·ªÅ tensor ƒë√∫ng shape
    imgs = torch.stack([torch.tensor(item['images']) if not isinstance(item['image'], torch.Tensor) else item['image'] for item in batch])
    input_ids = torch.stack([item['input'] for item in batch])
    target_ids = torch.stack([item['targets'] for item in batch])
    return imgs, input_ids, target_ids

In [94]:
n_embed, num_hiddens, num_heads, n_layer = 128, 512, 8, 8
image_embed_dim = num_hiddens
img_size = 224
patch_size = 16
num_blocks = 2

n_layer, block_size, num_hiddens = 8, 32, 512

# Initialize the model
vlm = VisionLanguageModel(
    n_embed=n_embed,
    img_embed_dim=image_embed_dim,
    vocab_size=tokenizer.vocab_size,
    n_layer=n_layer,
    img_size=img_size,
    patch_size=patch_size,
    num_heads=num_heads,
    num_blocks=num_blocks,
    emb_dropout=0.1,
    block_dropout=0.1,
    num_labels=4
)
device = torch.device('cpu')
vlm.to(device)

# Optimizer, ch·ªçn b·ªô ph√π h·ª£p, ch∆∞a th·ª≠ nhi·ªÅu n√™n kh√¥ng bt b·ªô n√†o t·ªët
# optimizer = torch.optim.AdamW(vlm.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(vlm.parameters(), lr=0.001, momentum=0.9)


In [95]:
# Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [96]:
# Training loop
vlm.train()
for epoch in range(10):
    pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}")
    total_loss = 0
    for imgs, input_ids, target_ids in pbar:
        input_ids = input_ids.to(device)
        imgs = imgs.to(device)
        target_ids = target_ids.to(device)

        optimizer.zero_grad()
        _, loss = vlm(imgs, input_ids, targets=target_ids, use_images = True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({"loss": loss.item()})

    print(f"Epoch {epoch+1} - Avg Loss: {total_loss / len(dataloader):.4f}")

Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:57<00:00,  1.03it/s, loss=1.24]


Epoch 1 - Avg Loss: 1.4788


Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.74] 


Epoch 2 - Avg Loss: 1.4806


Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.24] 


Epoch 3 - Avg Loss: 1.4793


Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.24]


Epoch 4 - Avg Loss: 1.4793


Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.74] 


Epoch 5 - Avg Loss: 1.4806


Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.74] 


Epoch 6 - Avg Loss: 1.4806


Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.74] 


Epoch 7 - Avg Loss: 1.4806


Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.74]


Epoch 8 - Avg Loss: 1.4806


Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:55<00:00,  1.03it/s, loss=1.74]


Epoch 9 - Avg Loss: 1.4806


Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 305/305 [04:54<00:00,  1.03it/s, loss=1.74] 

Epoch 10 - Avg Loss: 1.4806





In [None]:
torch.save(vlm.state_dict(), "model_encoder.pth")
print("Model saved to model.pth")

Model saved to model.pth


## Eval
### define model ph·∫£i gi·ªëng v·ªõi l√∫c train, img_size=224, n·∫øu ƒë·ªïi img_size ph·∫£i ƒë·ªïi ·ªü h√†m def preprocess(example) v√† image_transform


In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda")
device

device(type='cuda')

In [None]:
tokenizer = spm.SentencePieceProcessor(model_file='./spm.model')

In [None]:
n_embed, num_hiddens, num_heads, n_layer = 128, 512, 8, 8
image_embed_dim = num_hiddens
img_size = 224
patch_size = 16
num_blocks = 2

n_layer, block_size, num_hiddens = 8, 32, 512

# Initialize the model
model = VisionLanguageModel(
    n_embed=n_embed,
    img_embed_dim=image_embed_dim,
    vocab_size=tokenizer.vocab_size,
    n_layer=n_layer,
    img_size=img_size,
    patch_size=patch_size,
    num_heads=num_heads,
    num_blocks=num_blocks,
    emb_dropout=0.1,
    block_dropout=0.1,
    num_labels = 4
)
# model.to(device)
# checkpoint = torch.load(best_ckpt_path)
model.load_state_dict(torch.load("./model_encoder.pth"))
model.eval()  # set to eval mode if you're going to do inference

# Load image
img_path = './image-1d100e9.jpg'  # üîÅ Replace with your actual image path
image = Image.open(img_path).convert("RGB")

# Preprocessing image
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # make sure this matches ViT input
    transforms.ToTensor(),
])
img_tensor = transform(image).unsqueeze(0)  # shape: [1, 3, 224, 224]

# Move img_tensor to the same device as the model
# img_tensor = img_tensor.to(device)

# Prepare prompt and tokenize it
prompt = "Question: What do respiration and combustion give out\nChoices:\n0. Oxygen\n1. Carbon dioxide\n2. Nitrogen\n3. Heat\nAnswer with the number."
tokens = tokenizer.encode(prompt)

# Convert tokens list to a PyTorch tensor
tokens_tensor = torch.tensor(tokens).unsqueeze(0) # Add batch dimension and move to device

# --- 4. Run inference ---
with torch.no_grad():
    output_tokens = model(
        img_array=img_tensor,
        idx=tokens_tensor,  # Pass tensor instead of list
        # max_new_tokens=50
    )
    predictions = torch.argmax(output_tokens, dim=-1)  # shape: (batch_size, seq_len)   '
    # print(predictions.shape)
    # print(tokens_tensor.shape)
    print(predictions)
    # out = torch.cat([tokens_tensor, predictions], dim=1)
    # print(out.view(-1).shape)
    # print(out.view(-1))
    # print(predictions)
    # predictions = output_tokens

    # Convert the tensor to a list and filter out pad_id
    # output_tokens_list = [token.item() for token in predictions.flatten() if token.item() != tokenizer.pad_id()]
    # print(tokenizer.decode(output_tokens_list))
    # # print(output_tokens.shape)


RuntimeError: Error(s) in loading state_dict for VisionLanguageModel:
	size mismatch for vision_encoder.pos_embedding: copying a param with shape torch.Size([1, 37, 512]) from checkpoint, the shape in current model is torch.Size([1, 197, 512]).

In [None]:
# --- 5. Decode and handle special tokens ---
# Convert tensor to list and decode
output_tokens_list = output_tokens[0].cpu().numpy().tolist()

print(output_tokens_list)

# # Remove special tokens manually (if needed)
# # For example, let's assume that 0 is the token for padding (common in many models)
# # Modify the list to remove any special tokens, if necessary
# output_tokens_list = [token for token in output_tokens_list if token != tokenizer.pad_id()]

# # Now, decode the remaining tokens
# output_text = tokenizer.decode(output_tokens_list)
# print("Answer:", output_text)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

