In [1]:
import torch
import torch.nn as nn

In [2]:
class PatchEmbed(nn.Module):
    """Split image into patches and then embed them.

    Parameters
    ----------
    img_size : int
        Size of the image (it is a square).

    patch_size : int
        Size of the patch (it is a square).

    in_chans : int
        Number of input channels.

    embed_dim : int
        The embedding dimension.

    Attributes
    ----------
    n_patches : int
        Number of patches inside of our image.

    proj : nn.Conv2d
        Convolutional layer that does both the splitting into patches and their embedding.
    """
    def __init__(self, img_size, patch_size, in_channels = 3, embed_dim = 768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size = patch_size, stride = patch_size,)

    def forward(self, x):
        """ Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape '(n_samples, in_chans, img_size, img_size)'.

        Returns
        --------
        torch.Tensor
            Shape '(n_samples, n_patches, embed_dim)'.
        """
        x = self.proj(x) # (n_samples, emed_dim, n_patches ** 0.5, n_patches ** 0.5)
       # print(x.shape)
        x = x.flatten(2) # (n_samples, emed_dim, n_patches)
       # print(x.shape)
        x = x.transpose(1,2) # n_samples, n_patches, embed_dim)
        #print(x.shape)

        return x

In [3]:
x = torch.randn(1,3,224,224)


model = PatchEmbed(img_size=224,patch_size=3,embed_dim=768,in_channels=3)
x1 =model(x)

In [4]:
class Attention(nn.Module):
    """ Attention mechanism

    Parameters
    ----------
    dim : int
        The input and out dimension of per token features.

    n_heads: int
        Number of attention heads.

    qkv_bias : bool
        If True, then we include bias to the query, key and value projections.

    attn_p : float
        Dropout probability applied to the query, key and value tensors.

    proj_p : float
        Dropout probability applied to the output tensor.
    """
    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim//n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim*3, bias = qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """ Run forward pass.

        Parameters
        ----------
        x: torch.Tensor
            Shape '(n_samples, n_patches +1, dim)'.

        Returns 
        ------
        torch.Tensor
            Shape '(n_samples, n_patches +1, dim)'.
        """
        n_samples, n_tokens, dim = x.shape
        print(x.shape)

        if dim != self.dim:
            raise ValueError

        qkv = self.qkv(x) # (n_samples, n_patches +1, 3 * dim)
        print(qkv.shape)
        qkv = qkv.reshape(
                n_samples, n_tokens, 3, self.n_heads, self.head_dim)
        print(qkv.shape)
        qkv = qkv.permute(
            2, 0, 3, 1, 4)
        # (3, n_samples, n_heads, n_patches + 1,head_dim )
        q, k, v = qkv[0], qkv[1], qkv[2]

        k_t = k.transpose(-2, -1) # (n_samples, n_heads, head_dim, n_patches +1)

        dp = (q @ k_t) * self.scale

        attn = dp.softmax(dim=-1)
        attn = self.attn_drop(attn)
        weighted_avg = attn @ v
        weighted_avg = weighted_avg.transpose(1, 2)
        weighted_avg = weighted_avg.flatten(2)

        x = self.proj(weighted_avg)
        x = self.proj_drop(x)

        return x
    

In [22]:
class MLP(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        return self.fc2(self.drop(self.act(self.fc1(x)))) 

In [25]:
class Block(nn.Module):
    """Transformer block.

    Parameters
    ----------
    dim : int
        Embedding dimensions.

    n_heads : int
        number of attention heads.

    mlp_ratio : float
        Determines the hidden dimension size of the "MLP' module with rewspect to 'dim'.
    """
    def __init__(self, dim, n_heads, mlp_ratio= 4.0, qkv_bias=True, p=0., attn_p = 0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(dim,n_heads=n_heads, qkv_bias = qkv_bias,attn_p = attn_p, proj_p = p)
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(in_features=dim, hidden_features = hidden_features, out_features=dim)

    def forward(self,x):
        x += self.attn(self.norm1(x))
        x += self.mlp(self.norm2(x))

        return x
    

In [19]:
import torch
import torch.nn as nn
from timm.models.layers import DropPath
import natten
from natten import NeighborhoodAttention2D as NeighborhoodAttention

class NATransformerLayer(nn.Module):
    """Neighborhood Attention Transformer Layer with MLP."""
    def __init__(self, dim, num_heads, kernel_size=7, dilation=1, drop_path=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = NeighborhoodAttention(dim, kernel_size, dilation, num_heads)
        self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, 4 * dim),
            nn.GELU(),
            nn.Linear(4 * dim, dim)
        )

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

class PatchEmbed(nn.Module):
    """Converts input image into patches."""
    def __init__(self, patch_size=4, in_chans=3, embed_dim=96):
        super().__init__()
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = self.proj(x).flatten(2).transpose(1, 2)
        return self.norm(x)

class DiNAT(nn.Module):
    """Simplified DiNAT model."""
    def __init__(self, img_size=224, patch_size=4, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24]):
        super().__init__()
        self.patch_embed = PatchEmbed(patch_size, 3, embed_dim)
        self.layers = nn.ModuleList([
            NATransformerLayer(embed_dim * (2**i), num_heads[i]) for i in range(len(depths))
        ])
        self.norm = nn.LayerNorm(embed_dim * (2**(len(depths) - 1)))
        self.head = nn.Linear(embed_dim * (2**(len(depths) - 1)), 1000)

    def forward(self, x):
        x = self.patch_embed(x)
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x.mean(dim=1))
        return self.head(x)

# Example usage
model = DiNAT()
img = torch.randn(1, 3, 224, 224)
output = model(img)
print(output.shape)  # Should be (1, 1000)


ModuleNotFoundError: No module named 'timm'

In [5]:
mod = Attention(768)
mod(x1)

torch.Size([1, 5476, 768])
torch.Size([1, 5476, 2304])
torch.Size([1, 5476, 3, 12, 64])


tensor([[[-0.0835,  0.0374,  0.0326,  ..., -0.0293, -0.0212, -0.0847],
         [-0.0843,  0.0316,  0.0285,  ..., -0.0309, -0.0163, -0.0860],
         [-0.0800,  0.0344,  0.0293,  ..., -0.0298, -0.0223, -0.0860],
         ...,
         [-0.0826,  0.0343,  0.0286,  ..., -0.0394, -0.0215, -0.0875],
         [-0.0852,  0.0263,  0.0308,  ..., -0.0275, -0.0236, -0.0843],
         [-0.0801,  0.0323,  0.0303,  ..., -0.0304, -0.0171, -0.0851]]],
       grad_fn=<ViewBackward0>)

In [7]:
class Block(nn.Module):
    

IndentationError: expected an indented block (1090054001.py, line 2)

In [None]:
def min(list1):
    list1.sort()
    return list1[0]
    

In [None]:
min([1,2,1,1,-1])

In [8]:
nums = [-1,0,1,2,-1,-4]

l = []
for i in range(len(nums) - 2):  # Step 2: Iterate over the array
        if i > 0 : #and nums[i] == nums[i - 1]:
            print(i)
    
#Output: [[-1,-1,2],[-1,0,1]]

1
2
3


In [9]:
(224/16) **2

196.0

In [10]:
224 * 2

448

In [11]:
x.softmax(dim=-1)

tensor([[[[0.0017, 0.0028, 0.0016,  ..., 0.0015, 0.0035, 0.0019],
          [0.0046, 0.0038, 0.0026,  ..., 0.0045, 0.0021, 0.0049],
          [0.0057, 0.0009, 0.0025,  ..., 0.0009, 0.0015, 0.0025],
          ...,
          [0.0008, 0.0137, 0.0059,  ..., 0.0019, 0.0012, 0.0075],
          [0.0027, 0.0179, 0.0078,  ..., 0.0026, 0.0029, 0.0016],
          [0.0025, 0.0016, 0.0030,  ..., 0.0005, 0.0011, 0.0045]],

         [[0.0015, 0.0010, 0.0102,  ..., 0.0014, 0.0011, 0.0050],
          [0.0004, 0.0061, 0.0025,  ..., 0.0053, 0.0006, 0.0011],
          [0.0053, 0.0087, 0.0018,  ..., 0.0044, 0.0005, 0.0041],
          ...,
          [0.0013, 0.0034, 0.0025,  ..., 0.0003, 0.0017, 0.0007],
          [0.0025, 0.0070, 0.0012,  ..., 0.0103, 0.0150, 0.0017],
          [0.0022, 0.0013, 0.0013,  ..., 0.0004, 0.0057, 0.0118]],

         [[0.0067, 0.0021, 0.0043,  ..., 0.0007, 0.0003, 0.0017],
          [0.0022, 0.0174, 0.0058,  ..., 0.0034, 0.0139, 0.0026],
          [0.0052, 0.0098, 0.0132,  ..., 0

In [12]:
 x= conv(x)
x.shape

NameError: name 'conv' is not defined

In [None]:
x = x.flatten(2)
x.shape

In [None]:
x = x.transpose(1,2)
x.shape

In [None]:
28*28

In [None]:
torch.sqrt(torch.tensor(768))

In [13]:
p = nn.Dropout2d(0.5)
p(x)

tensor([[[[-0.0000e+00,  0.0000e+00, -0.0000e+00,  ..., -0.0000e+00,
            0.0000e+00, -0.0000e+00],
          [ 0.0000e+00,  0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           -0.0000e+00,  0.0000e+00],
          [ 0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., -0.0000e+00,
           -0.0000e+00, -0.0000e+00],
          ...,
          [-0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -0.0000e+00,
           -0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00, -0.0000e+00],
          [ 0.0000e+00, -0.0000e+00,  0.0000e+00,  ..., -0.0000e+00,
           -0.0000e+00,  0.0000e+00]],

         [[-0.0000e+00, -0.0000e+00,  0.0000e+00,  ..., -0.0000e+00,
           -0.0000e+00,  0.0000e+00],
          [-0.0000e+00,  0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           -0.0000e+00, -0.0000e+00],
          [ 0.0000e+00,  0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           -0.0000e+00,  0.0000e+00],
          ...,
     

https://towardsdatascience.com/position-embeddings-for-vision-transformers-explained-a6f9add341d5

In [14]:
p.training
        

True

In [15]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, dim, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Create a tensor of shape [max_len, dim] for encoding
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).float().unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))  # [dim/2]

        # Apply sin and cos functions for positional encoding
        pe[:, 0::2] = torch.sin(position * div_term)  # even indices (sine)
        pe[:, 1::2] = torch.cos(position * div_term)  # odd indices (cosine)

        pe = pe.unsqueeze(0)  # Shape becomes [1, max_len, dim]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add positional encoding to the input tensor
        return x + self.pe[:, :x.size(1)]  # x.size(1) is the length of the sequence

# Example usage for ViT
class VisionTransformer(nn.Module):
    def __init__(self, img_size=224, patch_size=16, dim=768, num_classes=1000):
        super(VisionTransformer, self).__init__()
        
        self.patch_size = patch_size
        self.dim = dim
        self.num_classes = num_classes

        # Calculate the number of patches
        self.num_patches = (img_size // patch_size) ** 2

        # Define the embedding layer for patches
        self.patch_embeddings = nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size)

        # Positional encoding
        self.positional_encoding = PositionalEncoding(dim, max_len=self.num_patches)

        # Transformer layers (simplified version)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim, nhead=8),
            num_layers=12
        )

        # Classification head
        self.fc = nn.Linear(dim, num_classes)

    def forward(self, x):
        # Extract patches and embed them
        x = self.patch_embeddings(x)  # Shape: [batch_size, dim, num_patches, num_patches]
        x = x.flatten(2).transpose(1, 2)  # Shape: [batch_size, num_patches, dim]

        # Add positional encoding
        x = self.positional_encoding(x)

        # Pass through transformer encoder
        x = self.encoder(x)

        # Classification head (take the output of the [CLS] token)
        x = x.mean(dim=1)  # Global average pooling

        # Final classification layer
        x = self.fc(x)
        return x

# Example input (batch_size=2, img_size=224)
model = VisionTransformer(img_size=224, patch_size=16)
sample_input = torch.randn(2, 3, 224, 224)  # Batch of 2 images with 3 channels (RGB)
output = model(sample_input)
print(output.shape)  # Should output: torch.Size([2, 1000])




torch.Size([2, 1000])


In [16]:
torch.zeros(5000, 768).shape

torch.Size([5000, 768])

In [17]:
position = torch.arange(0, 5000).float().unsqueeze(1)
position.shape

torch.Size([5000, 1])

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DilatedNeighborhoodAttention(nn.Module):
    def __init__(self, dim, kernel_size=7, dilation=1, num_heads=8):
        super().__init__()
        self.dim = dim
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.proj = nn.Linear(dim, dim)
        
    def forward(self, x):
        B, H, W, C = x.shape
        qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(3, 0, 4, 1, 2, 5)  # (3, B, num_heads, H, W, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]  # Split into queries, keys, and values
        
        # Compute attention with dilation
        attn_map = self.compute_dilated_attention(q, k, H, W)
        attn_output = (attn_map @ v).permute(0, 2, 3, 1, 4).reshape(B, H, W, C)
        return self.proj(attn_output)
    
    def compute_dilated_attention(self, q, k, H, W):
        """Computes dilated attention weights"""
        pad = self.dilation * (self.kernel_size // 2)
        k_padded = F.pad(k, (0, 0, pad, pad, pad, pad))
        attn_weights = []
        
        for i in range(self.kernel_size):
            for j in range(self.kernel_size):
                k_slice = k_padded[:, :, i::self.dilation, j::self.dilation, :]
                attn_weights.append((q * k_slice).sum(-1))
        
        attn_weights = torch.stack(attn_weights, dim=-1)
        attn_weights = F.softmax(attn_weights * self.scale, dim=-1)
        return attn_weights

# Example Usage
B, H, W, C = 1, 32, 32, 64  # Batch size, Height, Width, Channels
x = torch.randn(B, H, W, C)
attn_layer = DilatedNeighborhoodAttention(dim=C, kernel_size=7, dilation=2, num_heads=8)
out = attn_layer(x)
print(out.shape)  # Expected output: (B, H, W, C)


RuntimeError: The size of tensor a (32) must match the size of tensor b (22) at non-singleton dimension 3

In [None]:
l = [-1,