From 1a9b0a0ff2c881fced4e61d0768a26dbbfa9de91 Mon Sep 17 00:00:00 2001 From: NANDAGOPALNG Date: Tue, 7 Oct 2025 23:10:25 +0530 Subject: [PATCH 01/10] Added Vision_Transformer.py , it fixes #13326 --- computer_vision/Vision_Tranformer.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 computer_vision/Vision_Tranformer.py diff --git a/computer_vision/Vision_Tranformer.py b/computer_vision/Vision_Tranformer.py new file mode 100644 index 000000000000..e69de29bb2d1 From 551cb8c3d43bb3a204b1648a76405521eda4d2b6 Mon Sep 17 00:00:00 2001 From: NANDAGOPALNG Date: Tue, 7 Oct 2025 23:15:18 +0530 Subject: [PATCH 02/10] updated the error --- computer_vision/Vision_Tranformer.py | 385 +++++++++++++++++++++++++++ 1 file changed, 385 insertions(+) diff --git a/computer_vision/Vision_Tranformer.py b/computer_vision/Vision_Tranformer.py index e69de29bb2d1..693ab35baf5e 100644 --- a/computer_vision/Vision_Tranformer.py +++ b/computer_vision/Vision_Tranformer.py @@ -0,0 +1,385 @@ +""" +Vision Transformer (ViT) Implementation + +This module contains a PyTorch implementation of the Vision Transformer (ViT) +architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". + +Key Components: +- Patch Embedding +- Multi-Head Self Attention +- MLP Block +- Transformer Encoder +- Vision Transformer Model +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from typing import Optional, Tuple +import math + + +class PatchEmbedding(nn.Module): + """ + Creates patch embeddings from input images as described in Equation 1 of ViT paper. + + Args: + img_size (int): Size of input image (assumed square) + patch_size (int): Size of each patch (assumed square) + in_channels (int): Number of input channels + embed_dim (int): Dimension of embedding + """ + + def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): + super().__init__() + self.img_size = img_size + self.patch_size = patch_size + self.n_patches = (img_size // patch_size) ** 2 + + self.proj = nn.Conv2d( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_size + ) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for patch embedding. + + Args: + x (Tensor): Input tensor of shape (B, C, H, W) + + Returns: + Tensor: Patch embeddings of shape (B, n_patches, embed_dim) + """ + x = self.proj(x) # (B, embed_dim, H//patch_size, W//patch_size) + x = x.flatten(2) # (B, embed_dim, n_patches) + x = x.transpose(1, 2) # (B, n_patches, embed_dim) + return x + + +class MultiHeadSelfAttention(nn.Module): + """ + Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. + + Args: + embed_dim (int): Dimension of embedding + num_heads (int): Number of attention heads + dropout (float): Dropout rate + """ + + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + + self.qkv = nn.Linear(embed_dim, embed_dim * 3) + self.attn_dropout = nn.Dropout(dropout) + self.proj = nn.Linear(embed_dim, embed_dim) + self.proj_dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for multi-head self attention. 
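As a rough sketch of the tensor shapes this forward pass produces (the batch size of 2 and the 197-token sequence, i.e. 196 patches plus the class token, are illustrative choices rather than values fixed by the patch):

    import torch

    batch, tokens, embed_dim, num_heads = 2, 197, 768, 12
    head_dim = embed_dim // num_heads                              # 64 channels per head
    x = torch.randn(batch, tokens, embed_dim)
    qkv = torch.nn.Linear(embed_dim, embed_dim * 3)(x)             # (2, 197, 2304)
    qkv = qkv.reshape(batch, tokens, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]                               # each (2, 12, 197, 64)
    attn = (q @ k.transpose(-2, -1)) * head_dim**-0.5              # (2, 12, 197, 197)
    out = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(batch, tokens, embed_dim)
    assert out.shape == (2, 197, 768)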
+ + Args: + x (Tensor): Input tensor of shape (B, n_patches, embed_dim) + + Returns: + Tensor: Output tensor of same shape as input + """ + B, N, C = x.shape + + # Create Q, K, V + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) + + # Scaled dot-product attention + attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) + attn = F.softmax(attn, dim=-1) + attn = self.attn_dropout(attn) + + # Apply attention to values + x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) + + # Projection + x = self.proj(x) + x = self.proj_dropout(x) + + return x + + +class MLPBlock(nn.Module): + """ + Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. + + Args: + embed_dim (int): Dimension of embedding + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + """ + + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + super().__init__() + hidden_dim = int(embed_dim * mlp_ratio) + + self.fc1 = nn.Linear(embed_dim, hidden_dim) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_dim, embed_dim) + self.dropout = nn.Dropout(dropout) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for MLP block. + + Args: + x (Tensor): Input tensor of shape (B, n_patches, embed_dim) + + Returns: + Tensor: Output tensor of same shape as input + """ + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +class TransformerEncoderBlock(nn.Module): + """ + Transformer Encoder Block combining MSA and MLP with residual connections. + + Args: + embed_dim (int): Dimension of embedding + num_heads (int): Number of attention heads + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + """ + + def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): + super().__init__() + + self.norm1 = nn.LayerNorm(embed_dim) + self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) + self.norm2 = nn.LayerNorm(embed_dim) + self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for transformer encoder block. + + Args: + x (Tensor): Input tensor of shape (B, n_patches, embed_dim) + + Returns: + Tensor: Output tensor of same shape as input + """ + # Multi-head self attention with residual connection + x = x + self.attn(self.norm1(x)) + + # MLP with residual connection + x = x + self.mlp(self.norm2(x)) + + return x + + +class VisionTransformer(nn.Module): + """ + Vision Transformer (ViT) model. 
+ + Args: + img_size (int): Input image size + patch_size (int): Patch size + in_channels (int): Number of input channels + num_classes (int): Number of output classes + embed_dim (int): Embedding dimension + depth (int): Number of transformer blocks + num_heads (int): Number of attention heads + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + emb_dropout (float): Embedding dropout rate + """ + + def __init__( + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + num_classes: int = 1000, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + emb_dropout: float = 0.1 + ): + super().__init__() + + self.img_size = img_size + self.patch_size = patch_size + self.in_channels = in_channels + + # Patch embedding + self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) + n_patches = self.patch_embed.n_patches + + # Class token and position embedding + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) + self.pos_dropout = nn.Dropout(emb_dropout) + + # Transformer encoder blocks + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) + + # Layer normalization and classifier + self.norm = nn.LayerNorm(embed_dim) + self.head = nn.Linear(embed_dim, num_classes) + + # Initialize weights + self._init_weights() + + def _init_weights(self): + """Initialize weights for the ViT model.""" + # Initialize patch embedding like a linear layer + nn.init.xavier_uniform_(self.patch_embed.proj.weight) + if self.patch_embed.proj.bias is not None: + nn.init.zeros_(self.patch_embed.proj.bias) + + # Initialize class token and position embedding + nn.init.trunc_normal_(self.cls_token, std=0.02) + nn.init.trunc_normal_(self.pos_embed, std=0.02) + + # Initialize linear layers + self.apply(self._init_linear_weights) + + def _init_linear_weights(self, module): + """Initialize weights for linear layers.""" + if isinstance(module, nn.Linear): + nn.init.trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + def forward(self, x: Tensor) -> Tensor: + """ + Forward pass for Vision Transformer. + + Args: + x (Tensor): Input tensor of shape (B, C, H, W) + + Returns: + Tensor: Output logits of shape (B, num_classes) + """ + B = x.shape[0] + + # Create patch embeddings + x = self.patch_embed(x) # (B, n_patches, embed_dim) + + # Add class token + cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) + x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) + + # Add position embedding and apply dropout + x = x + self.pos_embed + x = self.pos_dropout(x) + + # Apply transformer blocks + for block in self.blocks: + x = block(x) + + # Apply final normalization and get class token output + x = self.norm(x) + cls_token_final = x[:, 0] # Use class token for classification + + # Classifier + x = self.head(cls_token_final) + + return x + + +def create_vit_model( + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + num_classes: int = 1000, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + emb_dropout: float = 0.1 +) -> VisionTransformer: + """ + Factory function to create a Vision Transformer model. 
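Typical usage mirrors the demo at the bottom of this file; the 3-class head below is simply the same toy setting the demo uses:

    model = create_vit_model(img_size=224, patch_size=16, num_classes=3)
    logits = model(torch.randn(1, 3, 224, 224))   # -> torch.Size([1, 3])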
+ + Args: + img_size (int): Input image size + patch_size (int): Patch size + in_channels (int): Number of input channels + num_classes (int): Number of output classes + embed_dim (int): Embedding dimension + depth (int): Number of transformer blocks + num_heads (int): Number of attention heads + mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim + dropout (float): Dropout rate + emb_dropout (float): Embedding dropout rate + + Returns: + VisionTransformer: Configured ViT model + """ + return VisionTransformer( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + num_classes=num_classes, + embed_dim=embed_dim, + depth=depth, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + dropout=dropout, + emb_dropout=emb_dropout + ) + + + + + +def count_parameters(model: nn.Module) -> int: + """ + Count the number of trainable parameters in a model. + + Args: + model (nn.Module): PyTorch model + + Returns: + int: Number of trainable parameters + """ + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +if __name__ == "__main__": + # Example usage + model = create_vit_model( + img_size=224, + patch_size=16, + num_classes=3, # pizza, steak, sushi + embed_dim=768, + depth=12, + num_heads=12 + ) + + print(f"Model created with {count_parameters(model):,} parameters") + + # Test forward pass + x = torch.randn(2, 3, 224, 224) + out = model(x) + print(f"Input shape: {x.shape}") + print(f"Output shape: {out.shape}") \ No newline at end of file From 126ceba50dfaee09362874fef5b594e8ee077f4d Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Tue, 7 Oct 2025 23:24:56 +0530 Subject: [PATCH 03/10] Rename Vision_Tranformer.py to vision_tranformer.py --- computer_vision/{Vision_Tranformer.py => vision_tranformer.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename computer_vision/{Vision_Tranformer.py => vision_tranformer.py} (99%) diff --git a/computer_vision/Vision_Tranformer.py b/computer_vision/vision_tranformer.py similarity index 99% rename from computer_vision/Vision_Tranformer.py rename to computer_vision/vision_tranformer.py index 693ab35baf5e..215aff1d6d1e 100644 --- a/computer_vision/Vision_Tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -382,4 +382,4 @@ def count_parameters(model: nn.Module) -> int: x = torch.randn(2, 3, 224, 224) out = model(x) print(f"Input shape: {x.shape}") - print(f"Output shape: {out.shape}") \ No newline at end of file + print(f"Output shape: {out.shape}") From d464be933fe625aba6098b7adac89624715506db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Oct 2025 17:58:22 +0000 Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 177 +++++++++++++++------------ 1 file changed, 98 insertions(+), 79 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index 215aff1d6d1e..bc84e4852569 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,7 +1,7 @@ """ Vision Transformer (ViT) Implementation -This module contains a PyTorch implementation of the Vision Transformer (ViT) +This module contains a PyTorch implementation of the Vision Transformer (ViT) architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". 
Key Components: @@ -23,34 +23,40 @@ class PatchEmbedding(nn.Module): """ Creates patch embeddings from input images as described in Equation 1 of ViT paper. - + Args: img_size (int): Size of input image (assumed square) patch_size (int): Size of each patch (assumed square) in_channels (int): Number of input channels embed_dim (int): Dimension of embedding """ - - def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): + + def __init__( + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, + ): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 - + self.proj = nn.Conv2d( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for patch embedding. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Patch embeddings of shape (B, n_patches, embed_dim) """ @@ -63,83 +69,91 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads dropout (float): Dropout rate """ - + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - + + assert self.head_dim * num_heads == embed_dim, ( + "embed_dim must be divisible by num_heads" + ) + self.qkv = nn.Linear(embed_dim, embed_dim * 3) self.attn_dropout = nn.Dropout(dropout) self.proj = nn.Linear(embed_dim, embed_dim) self.proj_dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for multi-head self attention. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ B, N, C = x.shape - + # Create Q, K, V - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) - + # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) attn = F.softmax(attn, dim=-1) attn = self.attn_dropout(attn) - + # Apply attention to values x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) - + # Projection x = self.proj(x) x = self.proj_dropout(x) - + return x class MLPBlock(nn.Module): """ Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. 
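With the defaults above the hidden layer is mlp_ratio * embed_dim = 4.0 * 768 = 3072 units wide; a minimal shape check (the 197-token input is illustrative):

    mlp = MLPBlock(embed_dim=768, mlp_ratio=4.0)   # fc1: 768 -> 3072, fc2: 3072 -> 768
    out = mlp(torch.randn(2, 197, 768))
    assert out.shape == (2, 197, 768)              # shape is preserved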
- + Args: embed_dim (int): Dimension of embedding mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) - + self.fc1 = nn.Linear(embed_dim, hidden_dim) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_dim, embed_dim) self.dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for MLP block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ @@ -154,45 +168,51 @@ def forward(self, x: Tensor) -> Tensor: class TransformerEncoderBlock(nn.Module): """ Transformer Encoder Block combining MSA and MLP with residual connections. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): + + def __init__( + self, + embed_dim: int = 768, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + ): super().__init__() - + self.norm1 = nn.LayerNorm(embed_dim) self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) self.norm2 = nn.LayerNorm(embed_dim) self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for transformer encoder block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ # Multi-head self attention with residual connection x = x + self.attn(self.norm1(x)) - + # MLP with residual connection x = x + self.mlp(self.norm2(x)) - + return x class VisionTransformer(nn.Module): """ Vision Transformer (ViT) model. 
- + Args: img_size (int): Input image size patch_size (int): Patch size @@ -205,7 +225,7 @@ class VisionTransformer(nn.Module): dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate """ - + def __init__( self, img_size: int = 224, @@ -217,91 +237,93 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() - + self.img_size = img_size self.patch_size = patch_size self.in_channels = in_channels - + # Patch embedding self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) n_patches = self.patch_embed.n_patches - + # Class token and position embedding self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) self.pos_dropout = nn.Dropout(emb_dropout) - + # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) - + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) + # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) self.head = nn.Linear(embed_dim, num_classes) - + # Initialize weights self._init_weights() - + def _init_weights(self): """Initialize weights for the ViT model.""" # Initialize patch embedding like a linear layer nn.init.xavier_uniform_(self.patch_embed.proj.weight) if self.patch_embed.proj.bias is not None: nn.init.zeros_(self.patch_embed.proj.bias) - + # Initialize class token and position embedding nn.init.trunc_normal_(self.cls_token, std=0.02) nn.init.trunc_normal_(self.pos_embed, std=0.02) - + # Initialize linear layers self.apply(self._init_linear_weights) - + def _init_linear_weights(self, module): """Initialize weights for linear layers.""" if isinstance(module, nn.Linear): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.zeros_(module.bias) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for Vision Transformer. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Output logits of shape (B, num_classes) """ B = x.shape[0] - + # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) - + # Add class token cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) - + # Add position embedding and apply dropout x = x + self.pos_embed x = self.pos_dropout(x) - + # Apply transformer blocks for block in self.blocks: x = block(x) - + # Apply final normalization and get class token output x = self.norm(x) cls_token_final = x[:, 0] # Use class token for classification - + # Classifier x = self.head(cls_token_final) - + return x @@ -315,11 +337,11 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. 
- + Args: img_size (int): Input image size patch_size (int): Patch size @@ -331,7 +353,7 @@ def create_vit_model( mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate - + Returns: VisionTransformer: Configured ViT model """ @@ -345,20 +367,17 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) - - - def count_parameters(model: nn.Module) -> int: """ Count the number of trainable parameters in a model. - + Args: model (nn.Module): PyTorch model - + Returns: int: Number of trainable parameters """ @@ -373,11 +392,11 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) - + print(f"Model created with {count_parameters(model):,} parameters") - + # Test forward pass x = torch.randn(2, 3, 224, 224) out = model(x) From 2d69ac4b7625c73bc60ee8870bcdc88f4461552d Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Wed, 8 Oct 2025 19:23:06 +0530 Subject: [PATCH 05/10] Update vision_tranformer.py --- computer_vision/vision_tranformer.py | 204 ++++++++++++++------------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index bc84e4852569..aa43743dbe7a 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,7 +1,7 @@ """ Vision Transformer (ViT) Implementation -This module contains a PyTorch implementation of the Vision Transformer (ViT) +This module contains a PyTorch implementation of the Vision Transformer (ViT) architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". Key Components: @@ -23,40 +23,34 @@ class PatchEmbedding(nn.Module): """ Creates patch embeddings from input images as described in Equation 1 of ViT paper. - + Args: img_size (int): Size of input image (assumed square) patch_size (int): Size of each patch (assumed square) in_channels (int): Number of input channels embed_dim (int): Dimension of embedding """ - - def __init__( - self, - img_size: int = 224, - patch_size: int = 16, - in_channels: int = 3, - embed_dim: int = 768, - ): + + def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 - + self.proj = nn.Conv2d( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size, + stride=patch_size ) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for patch embedding. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Patch embeddings of shape (B, n_patches, embed_dim) """ @@ -69,91 +63,83 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. 
- + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads dropout (float): Dropout rate """ - + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - - assert self.head_dim * num_heads == embed_dim, ( - "embed_dim must be divisible by num_heads" - ) - + + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + self.qkv = nn.Linear(embed_dim, embed_dim * 3) self.attn_dropout = nn.Dropout(dropout) self.proj = nn.Linear(embed_dim, embed_dim) self.proj_dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for multi-head self attention. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ B, N, C = x.shape - + # Create Q, K, V - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, self.head_dim) - .permute(2, 0, 3, 1, 4) - ) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) - + # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) attn = F.softmax(attn, dim=-1) attn = self.attn_dropout(attn) - + # Apply attention to values x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) - + # Projection x = self.proj(x) x = self.proj_dropout(x) - + return x class MLPBlock(nn.Module): """ Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. - + Args: embed_dim (int): Dimension of embedding mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__( - self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 - ): + + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) - + self.fc1 = nn.Linear(embed_dim, hidden_dim) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_dim, embed_dim) self.dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for MLP block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ @@ -168,51 +154,45 @@ def forward(self, x: Tensor) -> Tensor: class TransformerEncoderBlock(nn.Module): """ Transformer Encoder Block combining MSA and MLP with residual connections. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__( - self, - embed_dim: int = 768, - num_heads: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, - ): + + def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): super().__init__() - + self.norm1 = nn.LayerNorm(embed_dim) self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) self.norm2 = nn.LayerNorm(embed_dim) self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for transformer encoder block. 
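The two residual additions below implement the pre-norm arrangement of the paper: LayerNorm is applied before MSA (Eq. 2) and before the MLP (Eq. 3), with an identity skip around each; schematically:

    # z' = z  + MSA(LN(z))      (Eq. 2)
    # z  = z' + MLP(LN(z'))     (Eq. 3)
    block = TransformerEncoderBlock(embed_dim=768, num_heads=12)
    z = torch.randn(2, 197, 768)
    assert block(z).shape == z.shape    # the block never changes the token shape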
- + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ # Multi-head self attention with residual connection x = x + self.attn(self.norm1(x)) - + # MLP with residual connection x = x + self.mlp(self.norm2(x)) - + return x class VisionTransformer(nn.Module): """ Vision Transformer (ViT) model. - + Args: img_size (int): Input image size patch_size (int): Patch size @@ -225,7 +205,7 @@ class VisionTransformer(nn.Module): dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate """ - + def __init__( self, img_size: int = 224, @@ -237,93 +217,91 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ): super().__init__() - + self.img_size = img_size self.patch_size = patch_size self.in_channels = in_channels - + # Patch embedding self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) n_patches = self.patch_embed.n_patches - + # Class token and position embedding self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) self.pos_dropout = nn.Dropout(emb_dropout) - + # Transformer encoder blocks - self.blocks = nn.ModuleList( - [ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ] - ) - + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) + # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) self.head = nn.Linear(embed_dim, num_classes) - + # Initialize weights self._init_weights() - + def _init_weights(self): """Initialize weights for the ViT model.""" # Initialize patch embedding like a linear layer nn.init.xavier_uniform_(self.patch_embed.proj.weight) if self.patch_embed.proj.bias is not None: nn.init.zeros_(self.patch_embed.proj.bias) - + # Initialize class token and position embedding nn.init.trunc_normal_(self.cls_token, std=0.02) nn.init.trunc_normal_(self.pos_embed, std=0.02) - + # Initialize linear layers self.apply(self._init_linear_weights) - + def _init_linear_weights(self, module): """Initialize weights for linear layers.""" if isinstance(module, nn.Linear): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.zeros_(module.bias) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for Vision Transformer. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Output logits of shape (B, num_classes) """ B = x.shape[0] - + # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) - + # Add class token cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) - + # Add position embedding and apply dropout x = x + self.pos_embed x = self.pos_dropout(x) - + # Apply transformer blocks for block in self.blocks: x = block(x) - + # Apply final normalization and get class token output x = self.norm(x) cls_token_final = x[:, 0] # Use class token for classification - + # Classifier x = self.head(cls_token_final) - + return x @@ -337,11 +315,11 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. 
- + Args: img_size (int): Input image size patch_size (int): Patch size @@ -353,7 +331,7 @@ def create_vit_model( mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate - + Returns: VisionTransformer: Configured ViT model """ @@ -367,17 +345,47 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout, + emb_dropout=emb_dropout ) +def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: + """ + Load a pretrained ViT model from torchvision. + + Args: + model_name (str): Name of the pretrained model + num_classes (int): Number of output classes (for fine-tuning) + + Returns: + nn.Module: Pretrained ViT model + """ + try: + import torchvision.models as models + + if hasattr(models, model_name): + model = getattr(models, model_name)(pretrained=True) + if num_classes != 1000: + # Replace the head for fine-tuning + if hasattr(model, 'heads'): + model.heads = nn.Linear(model.heads.in_features, num_classes) + elif hasattr(model, 'head'): + model.head = nn.Linear(model.head.in_features, num_classes) + return model + else: + raise ValueError(f"Model {model_name} not found in torchvision.models") + + except ImportError: + raise ImportError("torchvision is required to load pretrained models") + + def count_parameters(model: nn.Module) -> int: """ Count the number of trainable parameters in a model. - + Args: model (nn.Module): PyTorch model - + Returns: int: Number of trainable parameters """ @@ -392,11 +400,11 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12, + num_heads=12 ) - + print(f"Model created with {count_parameters(model):,} parameters") - + # Test forward pass x = torch.randn(2, 3, 224, 224) out = model(x) From 224ec8896b7a66cf5c24b04c239fd288baa4f859 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:53:25 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 190 +++++++++++++++------------ 1 file changed, 107 insertions(+), 83 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index aa43743dbe7a..688fb9c60f14 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,7 +1,7 @@ """ Vision Transformer (ViT) Implementation -This module contains a PyTorch implementation of the Vision Transformer (ViT) +This module contains a PyTorch implementation of the Vision Transformer (ViT) architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". Key Components: @@ -23,34 +23,40 @@ class PatchEmbedding(nn.Module): """ Creates patch embeddings from input images as described in Equation 1 of ViT paper. 
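For the default 224x224 input and 16x16 patches this gives (224 / 16)^2 = 196 patches, each projected to a 768-dimensional token by the strided Conv2d; a small illustrative check:

    embed = PatchEmbedding(img_size=224, patch_size=16, in_channels=3, embed_dim=768)
    tokens = embed(torch.randn(2, 3, 224, 224))
    assert tokens.shape == (2, 196, 768)   # (batch, n_patches, embed_dim)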
- + Args: img_size (int): Size of input image (assumed square) patch_size (int): Size of each patch (assumed square) in_channels (int): Number of input channels embed_dim (int): Dimension of embedding """ - - def __init__(self, img_size: int = 224, patch_size: int = 16, in_channels: int = 3, embed_dim: int = 768): + + def __init__( + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, + ): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 - + self.proj = nn.Conv2d( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for patch embedding. - + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Patch embeddings of shape (B, n_patches, embed_dim) """ @@ -63,83 +69,91 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads dropout (float): Dropout rate """ - + def __init__(self, embed_dim: int = 768, num_heads: int = 12, dropout: float = 0.0): super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - + + assert self.head_dim * num_heads == embed_dim, ( + "embed_dim must be divisible by num_heads" + ) + self.qkv = nn.Linear(embed_dim, embed_dim * 3) self.attn_dropout = nn.Dropout(dropout) self.proj = nn.Linear(embed_dim, embed_dim) self.proj_dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for multi-head self attention. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ B, N, C = x.shape - + # Create Q, K, V - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) - + # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) # (B, num_heads, N, N) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) attn = F.softmax(attn, dim=-1) attn = self.attn_dropout(attn) - + # Apply attention to values x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) - + # Projection x = self.proj(x) x = self.proj_dropout(x) - + return x class MLPBlock(nn.Module): """ Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. - + Args: embed_dim (int): Dimension of embedding mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) - + self.fc1 = nn.Linear(embed_dim, hidden_dim) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_dim, embed_dim) self.dropout = nn.Dropout(dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for MLP block. 
- + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ @@ -154,45 +168,51 @@ def forward(self, x: Tensor) -> Tensor: class TransformerEncoderBlock(nn.Module): """ Transformer Encoder Block combining MSA and MLP with residual connections. - + Args: embed_dim (int): Dimension of embedding num_heads (int): Number of attention heads mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate """ - - def __init__(self, embed_dim: int = 768, num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1): + + def __init__( + self, + embed_dim: int = 768, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, + ): super().__init__() - + self.norm1 = nn.LayerNorm(embed_dim) self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout) self.norm2 = nn.LayerNorm(embed_dim) self.mlp = MLPBlock(embed_dim, mlp_ratio, dropout) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for transformer encoder block. - + Args: x (Tensor): Input tensor of shape (B, n_patches, embed_dim) - + Returns: Tensor: Output tensor of same shape as input """ # Multi-head self attention with residual connection x = x + self.attn(self.norm1(x)) - + # MLP with residual connection x = x + self.mlp(self.norm2(x)) - + return x class VisionTransformer(nn.Module): """ Vision Transformer (ViT) model. - + Args: img_size (int): Input image size patch_size (int): Patch size @@ -205,7 +225,7 @@ class VisionTransformer(nn.Module): dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate """ - + def __init__( self, img_size: int = 224, @@ -217,91 +237,93 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() - + self.img_size = img_size self.patch_size = patch_size self.in_channels = in_channels - + # Patch embedding self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim) n_patches = self.patch_embed.n_patches - + # Class token and position embedding self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter(torch.zeros(1, n_patches + 1, embed_dim)) self.pos_dropout = nn.Dropout(emb_dropout) - + # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) - + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) + # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) self.head = nn.Linear(embed_dim, num_classes) - + # Initialize weights self._init_weights() - + def _init_weights(self): """Initialize weights for the ViT model.""" # Initialize patch embedding like a linear layer nn.init.xavier_uniform_(self.patch_embed.proj.weight) if self.patch_embed.proj.bias is not None: nn.init.zeros_(self.patch_embed.proj.bias) - + # Initialize class token and position embedding nn.init.trunc_normal_(self.cls_token, std=0.02) nn.init.trunc_normal_(self.pos_embed, std=0.02) - + # Initialize linear layers self.apply(self._init_linear_weights) - + def _init_linear_weights(self, module): """Initialize weights for linear layers.""" if isinstance(module, nn.Linear): nn.init.trunc_normal_(module.weight, std=0.02) if module.bias is not None: nn.init.zeros_(module.bias) - + def forward(self, x: Tensor) -> Tensor: """ Forward pass for Vision Transformer. 
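Concretely, with the defaults the 196 patch tokens are prepended with one learnable class token (197 tokens total), the positional embedding is added, the encoder blocks run, and the classifier reads only the class token; a rough end-to-end check (the shallow depth is just to keep it quick):

    vit = VisionTransformer(img_size=224, patch_size=16, num_classes=10, depth=2)
    logits = vit(torch.randn(2, 3, 224, 224))
    assert logits.shape == (2, 10)   # one logit vector per image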
- + Args: x (Tensor): Input tensor of shape (B, C, H, W) - + Returns: Tensor: Output logits of shape (B, num_classes) """ B = x.shape[0] - + # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) - + # Add class token cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) - + # Add position embedding and apply dropout x = x + self.pos_embed x = self.pos_dropout(x) - + # Apply transformer blocks for block in self.blocks: x = block(x) - + # Apply final normalization and get class token output x = self.norm(x) cls_token_final = x[:, 0] # Use class token for classification - + # Classifier x = self.head(cls_token_final) - + return x @@ -315,11 +337,11 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. - + Args: img_size (int): Input image size patch_size (int): Patch size @@ -331,7 +353,7 @@ def create_vit_model( mlp_ratio (float): Ratio of MLP hidden dimension to embed_dim dropout (float): Dropout rate emb_dropout (float): Embedding dropout rate - + Returns: VisionTransformer: Configured ViT model """ @@ -345,36 +367,38 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) -def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: +def get_pretrained_vit( + model_name: str = "vit_base_patch16_224", num_classes: int = 1000 +) -> nn.Module: """ Load a pretrained ViT model from torchvision. - + Args: model_name (str): Name of the pretrained model num_classes (int): Number of output classes (for fine-tuning) - + Returns: nn.Module: Pretrained ViT model """ try: import torchvision.models as models - + if hasattr(models, model_name): model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, 'heads'): + if hasattr(model, "heads"): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, 'head'): + elif hasattr(model, "head"): model.head = nn.Linear(model.head.in_features, num_classes) return model else: raise ValueError(f"Model {model_name} not found in torchvision.models") - + except ImportError: raise ImportError("torchvision is required to load pretrained models") @@ -382,10 +406,10 @@ def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: in def count_parameters(model: nn.Module) -> int: """ Count the number of trainable parameters in a model. 
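As in the demo at the bottom of the file; for the default base-sized configuration the count lands around the ~86M parameters reported for ViT-Base in the original paper:

    vit = create_vit_model()   # default ViT-Base-like configuration
    print(f"{count_parameters(vit):,} trainable parameters")   # roughly 86M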
- + Args: model (nn.Module): PyTorch model - + Returns: int: Number of trainable parameters """ @@ -400,11 +424,11 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) - + print(f"Model created with {count_parameters(model):,} parameters") - + # Test forward pass x = torch.randn(2, 3, 224, 224) out = model(x) From 7dde47711afd8fe8ecd819196780cd1348e8833f Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Wed, 8 Oct 2025 19:30:15 +0530 Subject: [PATCH 07/10] Update vision_tranformer.py --- computer_vision/vision_tranformer.py | 86 ++++++++++++---------------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index 688fb9c60f14..f4e45c76244b 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -1,8 +1,9 @@ """ -Vision Transformer (ViT) Implementation +Vision Transformer (ViT) Implementation. This module contains a PyTorch implementation of the Vision Transformer (ViT) -architecture based on the paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". +architecture based on the paper "An Image is Worth 16x16 Words: +Transformers for Image Recognition at Scale". Key Components: - Patch Embedding @@ -12,17 +13,13 @@ - Vision Transformer Model """ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from typing import Optional, Tuple -import math +from torch import Tensor, nn +import torch.nn.functional as functional class PatchEmbedding(nn.Module): """ - Creates patch embeddings from input images as described in Equation 1 of ViT paper. + Creates patch embeddings from input images as described in Equation 1. Args: img_size (int): Size of input image (assumed square) @@ -32,11 +29,8 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, - img_size: int = 224, - patch_size: int = 16, - in_channels: int = 3, - embed_dim: int = 768, + self, img_size: int = 224, patch_size: int = 16, + in_channels: int = 3, embed_dim: int = 768 ): super().__init__() self.img_size = img_size @@ -47,7 +41,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size, + stride=patch_size ) def forward(self, x: Tensor) -> Tensor: @@ -68,7 +62,7 @@ def forward(self, x: Tensor) -> Tensor: class MultiHeadSelfAttention(nn.Module): """ - Multi-Head Self Attention (MSA) block as described in Equation 2 of ViT paper. + Multi-Head Self Attention (MSA) block as described in Equation 2. 
Args: embed_dim (int): Dimension of embedding @@ -101,23 +95,23 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Output tensor of same shape as input """ - B, N, C = x.shape + batch_size, num_patches, channels = x.shape # Create Q, K, V qkv = ( self.qkv(x) - .reshape(B, N, 3, self.num_heads, self.head_dim) + .reshape(batch_size, num_patches, 3, self.num_heads, self.head_dim) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) # (B, num_heads, N, N) - attn = F.softmax(attn, dim=-1) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) + attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) # Apply attention to values - x = (attn @ v).transpose(1, 2).reshape(B, N, C) # (B, N, embed_dim) + x = (attn @ v).transpose(1, 2).reshape(batch_size, num_patches, channels) # Projection x = self.proj(x) @@ -128,7 +122,7 @@ def forward(self, x: Tensor) -> Tensor: class MLPBlock(nn.Module): """ - Multilayer Perceptron (MLP) block as described in Equation 3 of ViT paper. + Multilayer Perceptron (MLP) block as described in Equation 3. Args: embed_dim (int): Dimension of embedding @@ -136,9 +130,7 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__( - self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 - ): + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -177,11 +169,8 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, - embed_dim: int = 768, - num_heads: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, + self, embed_dim: int = 768, num_heads: int = 12, + mlp_ratio: float = 4.0, dropout: float = 0.1 ): super().__init__() @@ -237,7 +226,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ): super().__init__() @@ -255,12 +244,10 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList( - [ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ] - ) + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -300,14 +287,14 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Output logits of shape (B, num_classes) """ - B = x.shape[0] + batch_size = x.shape[0] # Create patch embeddings x = self.patch_embed(x) # (B, n_patches, embed_dim) # Add class token - cls_tokens = self.cls_token.expand(B, -1, -1) # (B, 1, embed_dim) - x = torch.cat((cls_tokens, x), dim=1) # (B, n_patches + 1, embed_dim) + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) # Add position embedding and apply dropout x = x + self.pos_embed @@ -337,7 +324,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. 
@@ -367,13 +354,11 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout, + emb_dropout=emb_dropout ) -def get_pretrained_vit( - model_name: str = "vit_base_patch16_224", num_classes: int = 1000 -) -> nn.Module: +def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: """ Load a pretrained ViT model from torchvision. @@ -385,19 +370,20 @@ def get_pretrained_vit( nn.Module: Pretrained ViT model """ try: - import torchvision.models as models + from torchvision import models if hasattr(models, model_name): model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, "heads"): + if hasattr(model, 'heads'): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, "head"): + elif hasattr(model, 'head'): model.head = nn.Linear(model.head.in_features, num_classes) return model else: - raise ValueError(f"Model {model_name} not found in torchvision.models") + error_msg = f"Model {model_name} not found in torchvision.models" + raise ValueError(error_msg) except ImportError: raise ImportError("torchvision is required to load pretrained models") @@ -424,7 +410,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12, + num_heads=12 ) print(f"Model created with {count_parameters(model):,} parameters") From 141f3ca7e4ba10f01f60cb7cc300156b97d896fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:00:36 +0000 Subject: [PATCH 08/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 48 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index f4e45c76244b..44f08adfe9b9 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -29,8 +29,11 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, img_size: int = 224, patch_size: int = 16, - in_channels: int = 3, embed_dim: int = 768 + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, ): super().__init__() self.img_size = img_size @@ -41,7 +44,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) def forward(self, x: Tensor) -> Tensor: @@ -106,7 +109,7 @@ def forward(self, x: Tensor) -> Tensor: q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) @@ -130,7 +133,9 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -169,8 +174,11 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, embed_dim: int = 768, num_heads: int = 12, - mlp_ratio: float = 4.0, dropout: float = 0.1 + self, + embed_dim: int = 768, + num_heads: int = 12, + 
mlp_ratio: float = 4.0, + dropout: float = 0.1, ): super().__init__() @@ -226,7 +234,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() @@ -244,10 +252,12 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -324,7 +334,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. @@ -354,11 +364,13 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) -def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: +def get_pretrained_vit( + model_name: str = "vit_base_patch16_224", num_classes: int = 1000 +) -> nn.Module: """ Load a pretrained ViT model from torchvision. @@ -376,9 +388,9 @@ def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: in model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, 'heads'): + if hasattr(model, "heads"): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, 'head'): + elif hasattr(model, "head"): model.head = nn.Linear(model.head.in_features, num_classes) return model else: @@ -410,7 +422,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) print(f"Model created with {count_parameters(model):,} parameters") From 96143f7c3e0dd0964aa70193410ec08bf949345f Mon Sep 17 00:00:00 2001 From: "NANDA GOPAL.D" Date: Wed, 8 Oct 2025 19:35:49 +0530 Subject: [PATCH 09/10] Update vision_tranformer.py --- computer_vision/vision_tranformer.py | 51 +++++++++++----------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index 44f08adfe9b9..e52d5a92db94 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -13,8 +13,9 @@ - Vision Transformer Model """ +import torch from torch import Tensor, nn -import torch.nn.functional as functional +from torch.nn import functional class PatchEmbedding(nn.Module): @@ -29,11 +30,8 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, - img_size: int = 224, - patch_size: int = 16, - in_channels: int = 3, - embed_dim: int = 768, + self, img_size: int = 224, patch_size: int = 16, + in_channels: int = 3, embed_dim: int = 768 ): super().__init__() self.img_size = img_size @@ -44,7 +42,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size, + stride=patch_size ) def forward(self, x: Tensor) -> Tensor: @@ -109,7 +107,7 @@ def forward(self, x: Tensor) -> Tensor: q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) + attn = (q @ k.transpose(-2, -1)) * 
(self.head_dim ** -0.5) attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) @@ -133,9 +131,7 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__( - self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 - ): + def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -174,11 +170,8 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, - embed_dim: int = 768, - num_heads: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, + self, embed_dim: int = 768, num_heads: int = 12, + mlp_ratio: float = 4.0, dropout: float = 0.1 ): super().__init__() @@ -234,7 +227,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ): super().__init__() @@ -252,12 +245,10 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList( - [ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ] - ) + self.blocks = nn.ModuleList([ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ]) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -334,7 +325,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1, + emb_dropout: float = 0.1 ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. @@ -364,13 +355,11 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout, + emb_dropout=emb_dropout ) -def get_pretrained_vit( - model_name: str = "vit_base_patch16_224", num_classes: int = 1000 -) -> nn.Module: +def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: """ Load a pretrained ViT model from torchvision. 
@@ -388,9 +377,9 @@ def get_pretrained_vit( model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, "heads"): + if hasattr(model, 'heads'): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, "head"): + elif hasattr(model, 'head'): model.head = nn.Linear(model.head.in_features, num_classes) return model else: @@ -422,7 +411,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12, + num_heads=12 ) print(f"Model created with {count_parameters(model):,} parameters") From 634248719fee313e8cb19ef69a08f3ad3be6f593 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:06:10 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- computer_vision/vision_tranformer.py | 48 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/computer_vision/vision_tranformer.py b/computer_vision/vision_tranformer.py index e52d5a92db94..b5acddc6ad2d 100644 --- a/computer_vision/vision_tranformer.py +++ b/computer_vision/vision_tranformer.py @@ -30,8 +30,11 @@ class PatchEmbedding(nn.Module): """ def __init__( - self, img_size: int = 224, patch_size: int = 16, - in_channels: int = 3, embed_dim: int = 768 + self, + img_size: int = 224, + patch_size: int = 16, + in_channels: int = 3, + embed_dim: int = 768, ): super().__init__() self.img_size = img_size @@ -42,7 +45,7 @@ def __init__( in_channels=in_channels, out_channels=embed_dim, kernel_size=patch_size, - stride=patch_size + stride=patch_size, ) def forward(self, x: Tensor) -> Tensor: @@ -107,7 +110,7 @@ def forward(self, x: Tensor) -> Tensor: q, k, v = qkv[0], qkv[1], qkv[2] # (B, num_heads, N, head_dim) # Scaled dot-product attention - attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5) + attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5) attn = functional.softmax(attn, dim=-1) attn = self.attn_dropout(attn) @@ -131,7 +134,9 @@ class MLPBlock(nn.Module): dropout (float): Dropout rate """ - def __init__(self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0): + def __init__( + self, embed_dim: int = 768, mlp_ratio: float = 4.0, dropout: float = 0.0 + ): super().__init__() hidden_dim = int(embed_dim * mlp_ratio) @@ -170,8 +175,11 @@ class TransformerEncoderBlock(nn.Module): """ def __init__( - self, embed_dim: int = 768, num_heads: int = 12, - mlp_ratio: float = 4.0, dropout: float = 0.1 + self, + embed_dim: int = 768, + num_heads: int = 12, + mlp_ratio: float = 4.0, + dropout: float = 0.1, ): super().__init__() @@ -227,7 +235,7 @@ def __init__( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + emb_dropout: float = 0.1, ): super().__init__() @@ -245,10 +253,12 @@ def __init__( self.pos_dropout = nn.Dropout(emb_dropout) # Transformer encoder blocks - self.blocks = nn.ModuleList([ - TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) - for _ in range(depth) - ]) + self.blocks = nn.ModuleList( + [ + TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout) + for _ in range(depth) + ] + ) # Layer normalization and classifier self.norm = nn.LayerNorm(embed_dim) @@ -325,7 +335,7 @@ def create_vit_model( num_heads: int = 12, mlp_ratio: float = 4.0, dropout: float = 0.1, - emb_dropout: float = 0.1 + 
emb_dropout: float = 0.1, ) -> VisionTransformer: """ Factory function to create a Vision Transformer model. @@ -355,11 +365,13 @@ def create_vit_model( num_heads=num_heads, mlp_ratio=mlp_ratio, dropout=dropout, - emb_dropout=emb_dropout + emb_dropout=emb_dropout, ) -def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: int = 1000) -> nn.Module: +def get_pretrained_vit( + model_name: str = "vit_base_patch16_224", num_classes: int = 1000 +) -> nn.Module: """ Load a pretrained ViT model from torchvision. @@ -377,9 +389,9 @@ def get_pretrained_vit(model_name: str = "vit_base_patch16_224", num_classes: in model = getattr(models, model_name)(pretrained=True) if num_classes != 1000: # Replace the head for fine-tuning - if hasattr(model, 'heads'): + if hasattr(model, "heads"): model.heads = nn.Linear(model.heads.in_features, num_classes) - elif hasattr(model, 'head'): + elif hasattr(model, "head"): model.head = nn.Linear(model.head.in_features, num_classes) return model else: @@ -411,7 +423,7 @@ def count_parameters(model: nn.Module) -> int: num_classes=3, # pizza, steak, sushi embed_dim=768, depth=12, - num_heads=12 + num_heads=12, ) print(f"Model created with {count_parameters(model):,} parameters")
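
The patches above only reformat PatchEmbedding and MultiHeadSelfAttention (argument wrapping, trailing commas, quote style), so the tensor shapes they produce are unchanged: a 224x224 RGB image cut into 16x16 patches gives (224 // 16) ** 2 = 196 patch tokens of dimension 768, and self-attention preserves that shape. The sketch below checks those shapes directly. It assumes the committed file is importable as computer_vision.vision_tranformer (the file name as spelled in the patches), e.g. when run from the repository root.

# Shape sanity check for the patch embedding and attention blocks.
# Assumes computer_vision.vision_tranformer is importable from the repo root.
import torch

from computer_vision.vision_tranformer import MultiHeadSelfAttention, PatchEmbedding

images = torch.randn(2, 3, 224, 224)  # (batch, channels, height, width)

patch_embed = PatchEmbedding(img_size=224, patch_size=16, in_channels=3, embed_dim=768)
tokens = patch_embed(images)  # (2, 196, 768), since 196 = (224 // 16) ** 2
assert tokens.shape == (2, patch_embed.n_patches, 768)

attention = MultiHeadSelfAttention(embed_dim=768, num_heads=12)
out = attention(tokens)  # self-attention keeps the (batch, n_patches, embed_dim) shape
assert out.shape == tokens.shape
print(tuple(tokens.shape), tuple(out.shape))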
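
The demo block at the bottom of the file builds a full ViT-Base (roughly 86M parameters) only to print a parameter count. A smaller configuration exercises the same code path end to end much faster. This is a sketch, not part of the patches: it uses only the create_vit_model arguments visible above (num_classes, embed_dim, depth, num_heads), assumes the remaining factory arguments keep the 224/16/3 image-size, patch-size, and channel defaults used elsewhere in the file, and assumes the model's forward pass returns class logits of shape (batch, num_classes), which the architecture implies but is not shown in full here.

# Minimal end-to-end smoke test with a deliberately small configuration (a sketch).
import torch

from computer_vision.vision_tranformer import count_parameters, create_vit_model

model = create_vit_model(
    num_classes=3,  # e.g. pizza, steak, sushi, as in the demo block
    embed_dim=192,  # smaller than ViT-Base to keep the forward pass cheap
    depth=4,
    num_heads=4,  # 192 is divisible by 4, satisfying the embed_dim/num_heads assertion
)
model.eval()

with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))  # assumed default 224x224 RGB input

print(f"{count_parameters(model):,} parameters, logits shape {tuple(logits.shape)}")
# Expected: logits shape (1, 3), assuming forward returns per-class logits.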
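
get_pretrained_vit passes pretrained=True and looks up names such as "vit_base_patch16_224", which is a timm-style identifier; torchvision names its ViT constructors vit_b_16, vit_b_32, vit_l_16, and so on, and recent torchvision releases select pretrained weights through a weights= enum rather than pretrained=True. The sketch below shows one way to load and re-head a torchvision ViT under those assumptions (torchvision >= 0.13, classifier stored at model.heads.head); it is an illustration of the current torchvision API, not a drop-in replacement for the function in the patch.

# Hedged sketch: loading a pretrained ViT-B/16 via torchvision (assumes torchvision >= 0.13).
from torch import nn
from torchvision.models import ViT_B_16_Weights, vit_b_16

num_classes = 3
model = vit_b_16(weights=ViT_B_16_Weights.DEFAULT)  # replaces the deprecated pretrained=True

# torchvision's VisionTransformer keeps its classifier in an nn.Sequential named `heads`,
# with the final linear layer stored under the key `head`.
in_features = model.heads.head.in_features
model.heads.head = nn.Linear(in_features, num_classes)  # re-head for fine-tuning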