From 1a9b0a0ff2c881fced4e61d0768a26dbbfa9de91 Mon Sep 17 00:00:00 2001 From: NANDAGOPALNG Date: Tue, 7 Oct 2025 23:10:25 +0530 Subject: [PATCH 01/10] Added Vision_Transformer.py , it fixes #13326 --- computer_vision/Vision_Tranformer.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 computer_vision/Vision_Tranformer.py diff --git a/computer_vision/Vision_Tranformer.py b/computer_vision/Vision_Tranformer.py new file mode 100644 index 000000000000..e69de29bb2d1 From 551cb8c3d43bb3a204b1648a76405521eda4d2b6 Mon Sep 17 00:00:00 2001 From: NANDAGOPALNG Date: Tue, 7 Oct 2025 23:15:18 +0530 Subject: [PATCH 02/10] updated the error --- computer_vision/Vision_Tranformer.py | 385 +++++++++++++++++++++++++++ 1 file changed, 385 insertions(+) diff --git a/computer_vision/Vision_Tranformer.py b/computer_vision/Vision_Tranformer.py index e69de29bb2d1..693ab35baf5e 100644 --- a/computer_vision/Vision_Tranformer.py +++ b/computer_vision/Vision_Tranformer.py @@ -0,0 +1,385 @@ +""" +Vision Transformer (ViT) Implementation + +This module contains a PyTorch implementation of the Vision Transformer (ViT) +architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". + +Key Components: +- Patch Embedding +- Multi-Head Self Attention +- MLP Block +- Transformer Encoder +- Vision Transformer Model +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from typing import Optional, Tuple +import math + + +class PatchEmbedding(nn.Module): + """ + Creates patch embeddings from input images as described in Equation 1 of ViT paper. + + Args: + img_size (int): Size of input image (assumed square) + patch_size (int): Size of each patch (assumed square) + in_channels (int): Number of input channels + embed_dim (int): Dimension of embedding + """ + + def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): + super().__init__() + self.img_size = img_size + self.patch_size = patch_size + self.n_patches = (img_size // patch_size) ** 2 + + self.proj = nn.Conv2d( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size + ) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for patch embedding. + + Args: + x (Tensor): Input tensor of shape (B, C, H, W) + + Returns: + Tensor: Patch embeddings of shape (B, n_patches, embed_dim) + """ + x = self.proj(x) # (B, embed_dim, H//patch_size, W//patch_size) + x = x.flatten(2) # (B, embed_dim, n_patches) + x = x.transpose(1, 2) # (B, n_patches, embed_dim) + return x + + +class MultiHeadSelfAttention(nn.Module): + """ + Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. + + Args: + embed_dim (int): Dimension of embedding + num_heads (int): Number of attention heads + dropout (float): Dropout rate + """ + + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + + self.qkv = nn.Linear(embed_dim, embed_dim * 3) + self.attn_dropout = nn.Dropout(dropout) + self.proj = nn.Linear(embed_dim, embed_dim) + self.proj_dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for multi-head self attention. 
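As a rough sketch of the tensor shapes this forward pass produces (the batch size of 2 and the 197-token sequence, i.e. 196 patches plus the class token, are illustrative choices rather than values fixed by the patch):

    import torch

    batch, tokens, embed_dim, num_heads = 2, 197, 768, 12
    head_dim = embed_dim // num_heads                              # 64 channels per head
    x = torch.randn(batch, tokens, embed_dim)
    qkv = torch.nn.Linear(embed_dim, embed_dim * 3)(x)             # (2, 197, 2304)
    qkv = qkv.reshape(batch, tokens, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]                               # each (2, 12, 197, 64)
    attn = (q @ k.transpose(-2, -1)) * head_dim**-0.5              # (2, 12, 197, 197)
    out = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(batch, tokens, embed_dim)
    assert out.shape == (2, 197, 768)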
+ + Args: + x (Tensor): Input tensor of shape (B, n_patches, embed_dim) + + Returns: + Tensor: Output tensor of same shape as input + """ + B, N, C = x.shape + + # Create Q, K, V + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) + + # Scaled dot-product attention + attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) + attn = F.softmax(attn, dim=-1) + attn = self.attn_dropout(attn) + + # Apply attention to values + x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) + + # Projection + x = self.proj(x) + x = self.proj_dropout(x) + + return x + + +class MLPBlock(nn.Module): + """ + Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. + + Args: + embed_dim (int): Dimension of embedding + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + """ + + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + super().__init__() + hidden_dim = int(embed_dim * mlp_ratio) + + self.fc1 = nn.Linear(embed_dim, hidden_dim) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_dim, embed_dim) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for MLP block. + + Args: + x (Tensor): Input tensor of shape (B, n_patches, embed_dim) + + Returns: + Tensor: Output tensor of same shape as input + """ + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class TransformerEncoderBlock(nn.Module): + """ + Transformer Encoder Block combining MSA and MLP with residual connections. + + Args: + embed_dim (int): Dimension of embedding + num_heads (int): Number of attention heads + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + """ + + def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): + super().__init__() + + self.norm1 = nn.LayerNorm(embed_dim) + self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) + self.norm2 = nn.LayerNorm(embed_dim) + self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for transformer encoder block. + + Args: + x (Tensor): Input tensor of shape (B, n_patches, embed_dim) + + Returns: + Tensor: Output tensor of same shape as input + """ + # Multi-head self attention with residual connection + x = x + self.attn(self.norm1(x)) + + # MLP with residual connection + x = x + self.mlp(self.norm2(x)) + + return x + + +class VisionTransformer(nn.Module): + """ + Vision Transformer (ViT) model. 
+ + Args: + img_size (int): Input image size + patch_size (int): Patch size + in_channels (int): Number of input channels + num_classes (int): Number of output classes + embed_dim (int): Embedding dimension + depth (int): Number of transformer blocks + num_heads (int): Number of attention heads + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + emb_dropout (float): Embedding dropout rate + """ + + def __init__( + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + num_classes: int = 1000, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + emb_dropout: float = 0.1 + ): + super().__init__() + + self.img_size = img_size + self.patch_size = patch_size + self.in_channels = in_channels + + # Patch embedding + self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) + n_patches = self.patch_embed.n_patches + + # Class token and position embedding + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) + self.pos_dropout = nn.Dropout(emb_dropout) + + # Transformer encoder blocks + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) + + # Layer normalization and classifier + self.norm = nn.LayerNorm(embed_dim) + self.head = nn.Linear(embed_dim, num_classes) + + # Initialize weights + self._init_weights() + + def _init_weights(self): + """Initialize weights for the ViT model.""" + # Initialize patch embedding like a linear layer + nn.init.xavier_uniform_(self.patch_embed.proj.weight) + if self.patch_embed.proj.bias is not None: + nn.init.zeros_(self.patch_embed.proj.bias) + + # Initialize class token and position embedding + nn.init.trunc_normal_(self.cls_token, std=0.02) + nn.init.trunc_normal_(self.pos_embed, std=0.02) + + # Initialize linear layers + self.apply(self._init_linear_weights) + + def _init_linear_weights(self, module): + """Initialize weights for linear layers.""" + if isinstance(module, nn.Linear): + nn.init.trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for Vision Transformer. + + Args: + x (Tensor): Input tensor of shape (B, C, H, W) + + Returns: + Tensor: Output logits of shape (B, num_classes) + """ + B = x.shape[0] + + # Create patch embeddings + x = self.patch_embed(x) # (B, n_patches, embed_dim) + + # Add class token + cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) + x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) + + # Add position embedding and apply dropout + x = x + self.pos_embed + x = self.pos_dropout(x) + + # Apply transformer blocks + for block in self.blocks: + x = block(x) + + # Apply final normalization and get class token output + x = self.norm(x) + cls_token_final = x[:, 0] # Use class token for classification + + # Classifier + x = self.head(cls_token_final) + + return x + + +def create_vit_model( + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + num_classes: int = 1000, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + emb_dropout: float = 0.1 +) -> VisionTransformer: + """ + Factory function to create a Vision Transformer model. 
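Typical usage mirrors the demo at the bottom of this file; the 3-class head below is simply the same toy setting the demo uses:

    model = create_vit_model(img_size=224, patch_size=16, num_classes=3)
    logits = model(torch.randn(1, 3, 224, 224))   # -> torch.Size([1, 3])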
+ + Args: + img_size (int): Input image size + patch_size (int): Patch size + in_channels (int): Number of input channels + num_classes (int): Number of output classes + embed_dim (int): Embedding dimension + depth (int): Number of transformer blocks + num_heads (int): Number of attention heads + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + emb_dropout (float): Embedding dropout rate + + Returns: + VisionTransformer: Configured ViT model + """ + return VisionTransformer( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + num_classes=num_classes, + embed_dim=embed_dim, + depth=depth, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + dropout=dropout, + emb_dropout=emb_dropout + ) + + + + + +def count_parameters(model: nn.Module) -> int: + """ + Count the number of trainable parameters in a model. + + Args: + model (nn.Module): PyTorch model + + Returns: + int: Number of trainable parameters + """ + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +if __name__ == "__main__": + # Example usage + model = create_vit_model( + img_size=224, + patch_size=16, + num_classes=3, # pizza, steak, sushi + embed_dim=768, + depth=12, + num_heads=12 + ) + + print(f"Model created with {count_parameters(model):,} parameters") + + # Test forward pass + x = torch.randn(2, 3, 224, 224) + out = model(x) + print(f"Input shape: {x.shape}") + print(f"Output shape: {out.shape}") \ No newline at end of file From 126ceba50dfaee09362874fef5b594e8ee077f4d Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Tue, 7 Oct 2025 23:24:56 +0530 Subject: [PATCH 03/10] Rename Vision_Tranformer.py to vision_tranformer.py --- computer_vision/{Vision_Tranformer.py => vision_tranformer.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename computer_vision/{Vision_Tranformer.py => vision_tranformer.py} (99%) diff --git a/computer_vision/Vision_Tranformer.py b/computer_vision/vision_tranformer.py similarity index 99% rename from computer_vision/Vision_Tranformer.py rename to computer_vision/vision_tranformer.py index 693ab35baf5e..215aff1d6d1e 100644 --- a/computer_vision/Vision_Tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -382,4 +382,4 @@ def count_parameters(model: nn.Module) -> int: x = torch.randn(2, 3, 224, 224) out = model(x) print(f"Input shape: {x.shape}") - print(f"Output shape: {out.shape}") \ No newline at end of file + print(f"Output shape: {out.shape}") From d464be933fe625aba6098b7adac89624715506db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Oct 2025 17:58:22 +0000 Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 177 +++++++++++++++------------ 1 file changed, 98 insertions(+), 79 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index 215aff1d6d1e..bc84e4852569 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,7 +1,7 @@ """ Vision Transformer (ViT) Implementation -This module contains a PyTorch implementation of the Vision Transformer (ViT) +This module contains a PyTorch implementation of the Vision Transformer (ViT) architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". 
Key Components: @@ -23,34 +23,40 @@ class PatchEmbedding(nn.Module): """ Creates patch embeddings from input images as described in Equation 1 of ViT paper. - + Args: img_size (int): Size of input image (assumed square) patch_size (int): Size of each patch (assumed square) in_channels (int): Number of input channels embed_dim (int): Dimension of embedding """ - - def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): + + def __init__( + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, + ): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 - + self.proj = nn.Conv2d( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for patch embedding. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Patch embeddings of shape (B, n_patches, embed_dim) """ @@ -63,83 +69,91 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads dropout (float): Dropout rate """ - + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - + + assert self.head_dim * num_heads == embed_dim, ( + "embed_dim must be divisible by num_heads" + ) + self.qkv = nn.Linear(embed_dim, embed_dim * 3) self.attn_dropout = nn.Dropout(dropout) self.proj = nn.Linear(embed_dim, embed_dim) self.proj_dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for multi-head self attention. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ B, N, C = x.shape - + # Create Q, K, V - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) - + # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) attn = F.softmax(attn, dim=-1) attn = self.attn_dropout(attn) - + # Apply attention to values x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) - + # Projection x = self.proj(x) x = self.proj_dropout(x) - + return x class MLPBlock(nn.Module): """ Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. 
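With the defaults above the hidden layer is mlp_ratio * embed_dim = 4.0 * 768 = 3072 units wide; a minimal shape check (the 197-token input is illustrative):

    mlp = MLPBlock(embed_dim=768, mlp_ratio=4.0)   # fc1: 768 -> 3072, fc2: 3072 -> 768
    out = mlp(torch.randn(2, 197, 768))
    assert out.shape == (2, 197, 768)              # shape is preserved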
- + Args: embed_dim (int): Dimension of embedding mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) - + self.fc1 = nn.Linear(embed_dim, hidden_dim) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_dim, embed_dim) self.dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for MLP block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ @@ -154,45 +168,51 @@ def forward(self, x: Tensor) -> Tensor: class TransformerEncoderBlock(nn.Module): """ Transformer Encoder Block combining MSA and MLP with residual connections. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): + + def __init__( + self, + embed_dim: int = 768, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + ): super().__init__() - + self.norm1 = nn.LayerNorm(embed_dim) self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) self.norm2 = nn.LayerNorm(embed_dim) self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for transformer encoder block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ # Multi-head self attention with residual connection x = x + self.attn(self.norm1(x)) - + # MLP with residual connection x = x + self.mlp(self.norm2(x)) - + return x class VisionTransformer(nn.Module): """ Vision Transformer (ViT) model. 
- + Args: img_size (int): Input image size patch_size (int): Patch size @@ -205,7 +225,7 @@ class VisionTransformer(nn.Module): dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate """ - + def __init__( self, img_size: int = 224, @@ -217,91 +237,93 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() - + self.img_size = img_size self.patch_size = patch_size self.in_channels = in_channels - + # Patch embedding self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) n_patches = self.patch_embed.n_patches - + # Class token and position embedding self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) self.pos_dropout = nn.Dropout(emb_dropout) - + # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) - + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) + # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) self.head = nn.Linear(embed_dim, num_classes) - + # Initialize weights self._init_weights() - + def _init_weights(self): """Initialize weights for the ViT model.""" # Initialize patch embedding like a linear layer nn.init.xavier_uniform_(self.patch_embed.proj.weight) if self.patch_embed.proj.bias is not None: nn.init.zeros_(self.patch_embed.proj.bias) - + # Initialize class token and position embedding nn.init.trunc_normal_(self.cls_token, std=0.02) nn.init.trunc_normal_(self.pos_embed, std=0.02) - + # Initialize linear layers self.apply(self._init_linear_weights) - + def _init_linear_weights(self, module): """Initialize weights for linear layers.""" if isinstance(module, nn.Linear): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.zeros_(module.bias) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for Vision Transformer. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Output logits of shape (B, num_classes) """ B = x.shape[0] - + # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) - + # Add class token cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) - + # Add position embedding and apply dropout x = x + self.pos_embed x = self.pos_dropout(x) - + # Apply transformer blocks for block in self.blocks: x = block(x) - + # Apply final normalization and get class token output x = self.norm(x) cls_token_final = x[:, 0] # Use class token for classification - + # Classifier x = self.head(cls_token_final) - + return x @@ -315,11 +337,11 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. 
- + Args: img_size (int): Input image size patch_size (int): Patch size @@ -331,7 +353,7 @@ def create_vit_model( mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate - + Returns: VisionTransformer: Configured ViT model """ @@ -345,20 +367,17 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) - - - def count_parameters(model: nn.Module) -> int: """ Count the number of trainable parameters in a model. - + Args: model (nn.Module): PyTorch model - + Returns: int: Number of trainable parameters """ @@ -373,11 +392,11 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) - + print(f"Model created with {count_parameters(model):,} parameters") - + # Test forward pass x = torch.randn(2, 3, 224, 224) out = model(x) From 2d69ac4b7625c73bc60ee8870bcdc88f4461552d Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Wed, 8 Oct 2025 19:23:06 +0530 Subject: [PATCH 05/10] Update vision_tranformer.py --- computer_vision/vision_tranformer.py | 204 ++++++++++++++------------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index bc84e4852569..aa43743dbe7a 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,7 +1,7 @@ """ Vision Transformer (ViT) Implementation -This module contains a PyTorch implementation of the Vision Transformer (ViT) +This module contains a PyTorch implementation of the Vision Transformer (ViT) architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". Key Components: @@ -23,40 +23,34 @@ class PatchEmbedding(nn.Module): """ Creates patch embeddings from input images as described in Equation 1 of ViT paper. - + Args: img_size (int): Size of input image (assumed square) patch_size (int): Size of each patch (assumed square) in_channels (int): Number of input channels embed_dim (int): Dimension of embedding """ - - def __init__( - self, - img_size: int = 224, - patch_size: int = 16, - in_channels: int = 3, - embed_dim: int = 768, - ): + + def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 - + self.proj = nn.Conv2d( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size, + stride=patch_size ) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for patch embedding. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Patch embeddings of shape (B, n_patches, embed_dim) """ @@ -69,91 +63,83 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. 
- + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads dropout (float): Dropout rate """ - + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - - assert self.head_dim * num_heads == embed_dim, ( - "embed_dim must be divisible by num_heads" - ) - + + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + self.qkv = nn.Linear(embed_dim, embed_dim * 3) self.attn_dropout = nn.Dropout(dropout) self.proj = nn.Linear(embed_dim, embed_dim) self.proj_dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for multi-head self attention. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ B, N, C = x.shape - + # Create Q, K, V - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, self.head_dim) - .permute(2, 0, 3, 1, 4) - ) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) - + # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) attn = F.softmax(attn, dim=-1) attn = self.attn_dropout(attn) - + # Apply attention to values x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) - + # Projection x = self.proj(x) x = self.proj_dropout(x) - + return x class MLPBlock(nn.Module): """ Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. - + Args: embed_dim (int): Dimension of embedding mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__( - self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 - ): + + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) - + self.fc1 = nn.Linear(embed_dim, hidden_dim) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_dim, embed_dim) self.dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for MLP block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ @@ -168,51 +154,45 @@ def forward(self, x: Tensor) -> Tensor: class TransformerEncoderBlock(nn.Module): """ Transformer Encoder Block combining MSA and MLP with residual connections. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__( - self, - embed_dim: int = 768, - num_heads: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, - ): + + def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): super().__init__() - + self.norm1 = nn.LayerNorm(embed_dim) self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) self.norm2 = nn.LayerNorm(embed_dim) self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for transformer encoder block. 
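The two residual additions below implement the pre-norm arrangement of the paper: LayerNorm is applied before MSA (Eq. 2) and before the MLP (Eq. 3), with an identity skip around each; schematically:

    # z' = z  + MSA(LN(z))      (Eq. 2)
    # z  = z' + MLP(LN(z'))     (Eq. 3)
    block = TransformerEncoderBlock(embed_dim=768, num_heads=12)
    z = torch.randn(2, 197, 768)
    assert block(z).shape == z.shape    # the block never changes the token shape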
- + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ # Multi-head self attention with residual connection x = x + self.attn(self.norm1(x)) - + # MLP with residual connection x = x + self.mlp(self.norm2(x)) - + return x class VisionTransformer(nn.Module): """ Vision Transformer (ViT) model. - + Args: img_size (int): Input image size patch_size (int): Patch size @@ -225,7 +205,7 @@ class VisionTransformer(nn.Module): dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate """ - + def __init__( self, img_size: int = 224, @@ -237,93 +217,91 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ): super().__init__() - + self.img_size = img_size self.patch_size = patch_size self.in_channels = in_channels - + # Patch embedding self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) n_patches = self.patch_embed.n_patches - + # Class token and position embedding self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) self.pos_dropout = nn.Dropout(emb_dropout) - + # Transformer encoder blocks - self.blocks = nn.ModuleList( - [ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ] - ) - + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) + # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) self.head = nn.Linear(embed_dim, num_classes) - + # Initialize weights self._init_weights() - + def _init_weights(self): """Initialize weights for the ViT model.""" # Initialize patch embedding like a linear layer nn.init.xavier_uniform_(self.patch_embed.proj.weight) if self.patch_embed.proj.bias is not None: nn.init.zeros_(self.patch_embed.proj.bias) - + # Initialize class token and position embedding nn.init.trunc_normal_(self.cls_token, std=0.02) nn.init.trunc_normal_(self.pos_embed, std=0.02) - + # Initialize linear layers self.apply(self._init_linear_weights) - + def _init_linear_weights(self, module): """Initialize weights for linear layers.""" if isinstance(module, nn.Linear): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.zeros_(module.bias) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for Vision Transformer. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Output logits of shape (B, num_classes) """ B = x.shape[0] - + # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) - + # Add class token cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) - + # Add position embedding and apply dropout x = x + self.pos_embed x = self.pos_dropout(x) - + # Apply transformer blocks for block in self.blocks: x = block(x) - + # Apply final normalization and get class token output x = self.norm(x) cls_token_final = x[:, 0] # Use class token for classification - + # Classifier x = self.head(cls_token_final) - + return x @@ -337,11 +315,11 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. 
- + Args: img_size (int): Input image size patch_size (int): Patch size @@ -353,7 +331,7 @@ def create_vit_model( mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate - + Returns: VisionTransformer: Configured ViT model """ @@ -367,17 +345,47 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout, + emb_dropout=emb_dropout ) +def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: + """ + Load a pretrained ViT model from torchvision. + + Args: + model_name (str): Name of the pretrained model + num_classes (int): Number of output classes (for fine-tuning) + + Returns: + nn.Module: Pretrained ViT model + """ + try: + import torchvision.models as models + + if hasattr(models, model_name): + model = getattr(models, model_name)(pretrained=True) + if num_classes != 1000: + # Replace the head for fine-tuning + if hasattr(model, 'heads'): + model.heads = nn.Linear(model.heads.in_features, num_classes) + elif hasattr(model, 'head'): + model.head = nn.Linear(model.head.in_features, num_classes) + return model + else: + raise ValueError(f"Model {model_name} not found in torchvision.models") + + except ImportError: + raise ImportError("torchvision is required to load pretrained models") + + def count_parameters(model: nn.Module) -> int: """ Count the number of trainable parameters in a model. - + Args: model (nn.Module): PyTorch model - + Returns: int: Number of trainable parameters """ @@ -392,11 +400,11 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12, + num_heads=12 ) - + print(f"Model created with {count_parameters(model):,} parameters") - + # Test forward pass x = torch.randn(2, 3, 224, 224) out = model(x) From 224ec8896b7a66cf5c24b04c239fd288baa4f859 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:53:25 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 190 +++++++++++++++------------ 1 file changed, 107 insertions(+), 83 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index aa43743dbe7a..688fb9c60f14 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,7 +1,7 @@ """ Vision Transformer (ViT) Implementation -This module contains a PyTorch implementation of the Vision Transformer (ViT) +This module contains a PyTorch implementation of the Vision Transformer (ViT) architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". Key Components: @@ -23,34 +23,40 @@ class PatchEmbedding(nn.Module): """ Creates patch embeddings from input images as described in Equation 1 of ViT paper. 
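For the default 224x224 input and 16x16 patches this gives (224 / 16)^2 = 196 patches, each projected to a 768-dimensional token by the strided Conv2d; a small illustrative check:

    embed = PatchEmbedding(img_size=224, patch_size=16, in_channels=3, embed_dim=768)
    tokens = embed(torch.randn(2, 3, 224, 224))
    assert tokens.shape == (2, 196, 768)   # (batch, n_patches, embed_dim)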
- + Args: img_size (int): Size of input image (assumed square) patch_size (int): Size of each patch (assumed square) in_channels (int): Number of input channels embed_dim (int): Dimension of embedding """ - - def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): + + def __init__( + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, + ): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 - + self.proj = nn.Conv2d( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for patch embedding. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Patch embeddings of shape (B, n_patches, embed_dim) """ @@ -63,83 +69,91 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads dropout (float): Dropout rate """ - + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - + + assert self.head_dim * num_heads == embed_dim, ( + "embed_dim must be divisible by num_heads" + ) + self.qkv = nn.Linear(embed_dim, embed_dim * 3) self.attn_dropout = nn.Dropout(dropout) self.proj = nn.Linear(embed_dim, embed_dim) self.proj_dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for multi-head self attention. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ B, N, C = x.shape - + # Create Q, K, V - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) - + # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) attn = F.softmax(attn, dim=-1) attn = self.attn_dropout(attn) - + # Apply attention to values x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) - + # Projection x = self.proj(x) x = self.proj_dropout(x) - + return x class MLPBlock(nn.Module): """ Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. - + Args: embed_dim (int): Dimension of embedding mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) - + self.fc1 = nn.Linear(embed_dim, hidden_dim) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_dim, embed_dim) self.dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for MLP block. 
- + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ @@ -154,45 +168,51 @@ def forward(self, x: Tensor) -> Tensor: class TransformerEncoderBlock(nn.Module): """ Transformer Encoder Block combining MSA and MLP with residual connections. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): + + def __init__( + self, + embed_dim: int = 768, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + ): super().__init__() - + self.norm1 = nn.LayerNorm(embed_dim) self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) self.norm2 = nn.LayerNorm(embed_dim) self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for transformer encoder block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ # Multi-head self attention with residual connection x = x + self.attn(self.norm1(x)) - + # MLP with residual connection x = x + self.mlp(self.norm2(x)) - + return x class VisionTransformer(nn.Module): """ Vision Transformer (ViT) model. - + Args: img_size (int): Input image size patch_size (int): Patch size @@ -205,7 +225,7 @@ class VisionTransformer(nn.Module): dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate """ - + def __init__( self, img_size: int = 224, @@ -217,91 +237,93 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() - + self.img_size = img_size self.patch_size = patch_size self.in_channels = in_channels - + # Patch embedding self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) n_patches = self.patch_embed.n_patches - + # Class token and position embedding self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) self.pos_dropout = nn.Dropout(emb_dropout) - + # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) - + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) + # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) self.head = nn.Linear(embed_dim, num_classes) - + # Initialize weights self._init_weights() - + def _init_weights(self): """Initialize weights for the ViT model.""" # Initialize patch embedding like a linear layer nn.init.xavier_uniform_(self.patch_embed.proj.weight) if self.patch_embed.proj.bias is not None: nn.init.zeros_(self.patch_embed.proj.bias) - + # Initialize class token and position embedding nn.init.trunc_normal_(self.cls_token, std=0.02) nn.init.trunc_normal_(self.pos_embed, std=0.02) - + # Initialize linear layers self.apply(self._init_linear_weights) - + def _init_linear_weights(self, module): """Initialize weights for linear layers.""" if isinstance(module, nn.Linear): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.zeros_(module.bias) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for Vision Transformer. 
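Concretely, with the defaults the 196 patch tokens are prepended with one learnable class token (197 tokens total), the positional embedding is added, the encoder blocks run, and the classifier reads only the class token; a rough end-to-end check (the shallow depth is just to keep it quick):

    vit = VisionTransformer(img_size=224, patch_size=16, num_classes=10, depth=2)
    logits = vit(torch.randn(2, 3, 224, 224))
    assert logits.shape == (2, 10)   # one logit vector per image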
- + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Output logits of shape (B, num_classes) """ B = x.shape[0] - + # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) - + # Add class token cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) - + # Add position embedding and apply dropout x = x + self.pos_embed x = self.pos_dropout(x) - + # Apply transformer blocks for block in self.blocks: x = block(x) - + # Apply final normalization and get class token output x = self.norm(x) cls_token_final = x[:, 0] # Use class token for classification - + # Classifier x = self.head(cls_token_final) - + return x @@ -315,11 +337,11 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. - + Args: img_size (int): Input image size patch_size (int): Patch size @@ -331,7 +353,7 @@ def create_vit_model( mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate - + Returns: VisionTransformer: Configured ViT model """ @@ -345,36 +367,38 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) -def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: +def get_pretrained_vit( + model_name: str = "vit_base_patch16_224", num_classes: int = 1000 +) -> nn.Module: """ Load a pretrained ViT model from torchvision. - + Args: model_name (str): Name of the pretrained model num_classes (int): Number of output classes (for fine-tuning) - + Returns: nn.Module: Pretrained ViT model """ try: import torchvision.models as models - + if hasattr(models, model_name): model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, 'heads'): + if hasattr(model, "heads"): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, 'head'): + elif hasattr(model, "head"): model.head = nn.Linear(model.head.in_features, num_classes) return model else: raise ValueError(f"Model {model_name} not found in torchvision.models") - + except ImportError: raise ImportError("torchvision is required to load pretrained models") @@ -382,10 +406,10 @@ def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: in def count_parameters(model: nn.Module) -> int: """ Count the number of trainable parameters in a model. 
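As in the demo at the bottom of the file; for the default base-sized configuration the count lands around the ~86M parameters reported for ViT-Base in the original paper:

    vit = create_vit_model()   # default ViT-Base-like configuration
    print(f"{count_parameters(vit):,} trainable parameters")   # roughly 86M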
- + Args: model (nn.Module): PyTorch model - + Returns: int: Number of trainable parameters """ @@ -400,11 +424,11 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) - + print(f"Model created with {count_parameters(model):,} parameters") - + # Test forward pass x = torch.randn(2, 3, 224, 224) out = model(x) From 7dde47711afd8fe8ecd819196780cd1348e8833f Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Wed, 8 Oct 2025 19:30:15 +0530 Subject: [PATCH 07/10] Update vision_tranformer.py --- computer_vision/vision_tranformer.py | 86 ++++++++++++---------------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index 688fb9c60f14..f4e45c76244b 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,8 +1,9 @@ """ -Vision Transformer (ViT) Implementation +Vision Transformer (ViT) Implementation. This module contains a PyTorch implementation of the Vision Transformer (ViT) -architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". +architecture based on the paper "An Image is Worth 16x16 Words: +Transformers for Image Recognition at Scale". Key Components: - Patch Embedding @@ -12,17 +13,13 @@ - Vision Transformer Model """ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from typing import Optional, Tuple -import math +from torch import Tensor, nn +import torch.nn.functional as functional class PatchEmbedding(nn.Module): """ - Creates patch embeddings from input images as described in Equation 1 of ViT paper. + Creates patch embeddings from input images as described in Equation 1. Args: img_size (int): Size of input image (assumed square) @@ -32,11 +29,8 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, - img_size: int = 224, - patch_size: int = 16, - in_channels: int = 3, - embed_dim: int = 768, + self, img_size: int = 224, patch_size: int = 16, + in_channels: int = 3, embed_dim: int = 768 ): super().__init__() self.img_size = img_size @@ -47,7 +41,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size, + stride=patch_size ) def forward(self, x: Tensor) -> Tensor: @@ -68,7 +62,7 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ - Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. + Multi-Head Self Attention (MSA) block as described in Equation 2. 
Args: embed_dim (int): Dimension of embedding @@ -101,23 +95,23 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Output tensor of same shape as input """ - B, N, C = x.shape + batch_size, num_patches, channels = x.shape # Create Q, K, V qkv = ( self.qkv(x) - .reshape(B, N, 3, self.num_heads, self.head_dim) + .reshape(batch_size, num_patches, 3, self.num_heads, self.head_dim) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) - attn = F.softmax(attn, dim=-1) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) + attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) # Apply attention to values - x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) + x = (attn @ v).transpose(1, 2).reshape(batch_size, num_patches, channels) # Projection x = self.proj(x) @@ -128,7 +122,7 @@ def forward(self, x: Tensor) -> Tensor: class MLPBlock(nn.Module): """ - Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. + Multilayer Perceptron (MLP) block as described in Equation 3. Args: embed_dim (int): Dimension of embedding @@ -136,9 +130,7 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__( - self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 - ): + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -177,11 +169,8 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, - embed_dim: int = 768, - num_heads: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, + self, embed_dim: int = 768, num_heads: int = 12, + mlp_ratio: float = 4.0, dropout: float = 0.1 ): super().__init__() @@ -237,7 +226,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ): super().__init__() @@ -255,12 +244,10 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList( - [ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ] - ) + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -300,14 +287,14 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Output logits of shape (B, num_classes) """ - B = x.shape[0] + batch_size = x.shape[0] # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) # Add class token - cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) - x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) # Add position embedding and apply dropout x = x + self.pos_embed @@ -337,7 +324,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. 
@@ -367,13 +354,11 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout, + emb_dropout=emb_dropout ) -def get_pretrained_vit( - model_name: str = "vit_base_patch16_224", num_classes: int = 1000 -) -> nn.Module: +def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: """ Load a pretrained ViT model from torchvision. @@ -385,19 +370,20 @@ def get_pretrained_vit( nn.Module: Pretrained ViT model """ try: - import torchvision.models as models + from torchvision import models if hasattr(models, model_name): model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, "heads"): + if hasattr(model, 'heads'): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, "head"): + elif hasattr(model, 'head'): model.head = nn.Linear(model.head.in_features, num_classes) return model else: - raise ValueError(f"Model {model_name} not found in torchvision.models") + error_msg = f"Model {model_name} not found in torchvision.models" + raise ValueError(error_msg) except ImportError: raise ImportError("torchvision is required to load pretrained models") @@ -424,7 +410,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12, + num_heads=12 ) print(f"Model created with {count_parameters(model):,} parameters") From 141f3ca7e4ba10f01f60cb7cc300156b97d896fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:00:36 +0000 Subject: [PATCH 08/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 48 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index f4e45c76244b..44f08adfe9b9 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -29,8 +29,11 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, img_size: int = 224, patch_size: int = 16, - in_channels: int = 3, embed_dim: int = 768 + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, ): super().__init__() self.img_size = img_size @@ -41,7 +44,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) def forward(self, x: Tensor) -> Tensor: @@ -106,7 +109,7 @@ def forward(self, x: Tensor) -> Tensor: q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) @@ -130,7 +133,9 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -169,8 +174,11 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, embed_dim: int = 768, num_heads: int = 12, - mlp_ratio: float = 4.0, dropout: float = 0.1 + self, + embed_dim: int = 768, + num_heads: int = 12, + 
mlp_ratio: float = 4.0, + dropout: float = 0.1, ): super().__init__() @@ -226,7 +234,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() @@ -244,10 +252,12 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -324,7 +334,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. @@ -354,11 +364,13 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) -def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: +def get_pretrained_vit( + model_name: str = "vit_base_patch16_224", num_classes: int = 1000 +) -> nn.Module: """ Load a pretrained ViT model from torchvision. @@ -376,9 +388,9 @@ def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: in model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, 'heads'): + if hasattr(model, "heads"): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, 'head'): + elif hasattr(model, "head"): model.head = nn.Linear(model.head.in_features, num_classes) return model else: @@ -410,7 +422,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) print(f"Model created with {count_parameters(model):,} parameters") From 96143f7c3e0dd0964aa70193410ec08bf949345f Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Wed, 8 Oct 2025 19:35:49 +0530 Subject: [PATCH 09/10] Update vision_tranformer.py --- computer_vision/vision_tranformer.py | 51 +++++++++++----------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index 44f08adfe9b9..e52d5a92db94 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -13,8 +13,9 @@ - Vision Transformer Model """ +import torch from torch import Tensor, nn -import torch.nn.functional as functional +from torch.nn import functional class PatchEmbedding(nn.Module): @@ -29,11 +30,8 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, - img_size: int = 224, - patch_size: int = 16, - in_channels: int = 3, - embed_dim: int = 768, + self, img_size: int = 224, patch_size: int = 16, + in_channels: int = 3, embed_dim: int = 768 ): super().__init__() self.img_size = img_size @@ -44,7 +42,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size, + stride=patch_size ) def forward(self, x: Tensor) -> Tensor: @@ -109,7 +107,7 @@ def forward(self, x: Tensor) -> Tensor: q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) + attn = (q @ k.transpose(-2, -1)) * 
(self.head_dim ** -0.5) attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) @@ -133,9 +131,7 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__( - self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 - ): + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -174,11 +170,8 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, - embed_dim: int = 768, - num_heads: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, + self, embed_dim: int = 768, num_heads: int = 12, + mlp_ratio: float = 4.0, dropout: float = 0.1 ): super().__init__() @@ -234,7 +227,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ): super().__init__() @@ -252,12 +245,10 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList( - [ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ] - ) + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -334,7 +325,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. @@ -364,13 +355,11 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout, + emb_dropout=emb_dropout ) -def get_pretrained_vit( - model_name: str = "vit_base_patch16_224", num_classes: int = 1000 -) -> nn.Module: +def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: """ Load a pretrained ViT model from torchvision. 
@@ -388,9 +377,9 @@ def get_pretrained_vit( model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, "heads"): + if hasattr(model, 'heads'): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, "head"): + elif hasattr(model, 'head'): model.head = nn.Linear(model.head.in_features, num_classes) return model else: @@ -422,7 +411,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12, + num_heads=12 ) print(f"Model created with {count_parameters(model):,} parameters") From 634248719fee313e8cb19ef69a08f3ad3be6f593 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:06:10 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 48 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index e52d5a92db94..b5acddc6ad2d 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -30,8 +30,11 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, img_size: int = 224, patch_size: int = 16, - in_channels: int = 3, embed_dim: int = 768 + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, ): super().__init__() self.img_size = img_size @@ -42,7 +45,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) def forward(self, x: Tensor) -> Tensor: @@ -107,7 +110,7 @@ def forward(self, x: Tensor) -> Tensor: q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) @@ -131,7 +134,9 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -170,8 +175,11 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, embed_dim: int = 768, num_heads: int = 12, - mlp_ratio: float = 4.0, dropout: float = 0.1 + self, + embed_dim: int = 768, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, ): super().__init__() @@ -227,7 +235,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() @@ -245,10 +253,12 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -325,7 +335,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + 
emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. @@ -355,11 +365,13 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) -def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: +def get_pretrained_vit( + model_name: str = "vit_base_patch16_224", num_classes: int = 1000 +) -> nn.Module: """ Load a pretrained ViT model from torchvision. @@ -377,9 +389,9 @@ def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: in model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, 'heads'): + if hasattr(model, "heads"): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, 'head'): + elif hasattr(model, "head"): model.head = nn.Linear(model.head.in_features, num_classes) return model else: @@ -411,7 +423,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) print(f"Model created with {count_parameters(model):,} parameters")
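
The patches above only reformat PatchEmbedding and MultiHeadSelfAttention (argument wrapping, trailing commas, quote style), so the tensor shapes they produce are unchanged: a 224x224 RGB image cut into 16x16 patches gives (224 // 16) ** 2 = 196 patch tokens of dimension 768, and self-attention preserves that shape. The sketch below checks those shapes directly. It assumes the committed file is importable as computer_vision.vision_tranformer (the file name as spelled in the patches), e.g. when run from the repository root.

# Shape sanity check for the patch embedding and attention blocks.
# Assumes computer_vision.vision_tranformer is importable from the repo root.
import torch

from computer_vision.vision_tranformer import MultiHeadSelfAttention, PatchEmbedding

images = torch.randn(2, 3, 224, 224)  # (batch, channels, height, width)

patch_embed = PatchEmbedding(img_size=224, patch_size=16, in_channels=3, embed_dim=768)
tokens = patch_embed(images)  # (2, 196, 768), since 196 = (224 // 16) ** 2
assert tokens.shape == (2, patch_embed.n_patches, 768)

attention = MultiHeadSelfAttention(embed_dim=768, num_heads=12)
out = attention(tokens)  # self-attention keeps the (batch, n_patches, embed_dim) shape
assert out.shape == tokens.shape
print(tuple(tokens.shape), tuple(out.shape))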
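
The demo block at the bottom of the file builds a full ViT-Base (roughly 86M parameters) only to print a parameter count. A smaller configuration exercises the same code path end to end much faster. This is a sketch, not part of the patches: it uses only the create_vit_model arguments visible above (num_classes, embed_dim, depth, num_heads), assumes the remaining factory arguments keep the 224/16/3 image-size, patch-size, and channel defaults used elsewhere in the file, and assumes the model's forward pass returns class logits of shape (batch, num_classes), which the architecture implies but is not shown in full here.

# Minimal end-to-end smoke test with a deliberately small configuration (a sketch).
import torch

from computer_vision.vision_tranformer import count_parameters, create_vit_model

model = create_vit_model(
    num_classes=3,  # e.g. pizza, steak, sushi, as in the demo block
    embed_dim=192,  # smaller than ViT-Base to keep the forward pass cheap
    depth=4,
    num_heads=4,  # 192 is divisible by 4, satisfying the embed_dim/num_heads assertion
)
model.eval()

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))  # assumed default 224x224 RGB input

print(f"{count_parameters(model):,} parameters, logits shape {tuple(logits.shape)}")
# Expected: logits shape (1, 3), assuming forward returns per-class logits.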
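
get_pretrained_vit passes pretrained=True and looks up names such as "vit_base_patch16_224", which is a timm-style identifier; torchvision names its ViT constructors vit_b_16, vit_b_32, vit_l_16, and so on, and recent torchvision releases select pretrained weights through a weights= enum rather than pretrained=True. The sketch below shows one way to load and re-head a torchvision ViT under those assumptions (torchvision >= 0.13, classifier stored at model.heads.head); it is an illustration of the current torchvision API, not a drop-in replacement for the function in the patch.

# Hedged sketch: loading a pretrained ViT-B/16 via torchvision (assumes torchvision >= 0.13).
from torch import nn
from torchvision.models import ViT_B_16_Weights, vit_b_16

num_classes = 3
model = vit_b_16(weights=ViT_B_16_Weights.DEFAULT)  # replaces the deprecated pretrained=True

# torchvision's VisionTransformer keeps its classifier in an nn.Sequential named `heads`,
# with the final linear layer stored under the key `head`.
in_features = model.heads.head.in_features
model.heads.head = nn.Linear(in_features, num_classes)  # re-head for fine-tuning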