In [None]:
# prompt: sync with drive
# Colab-only: mounts Google Drive under /content/drive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#extract patch of given size from an image
import torch

def extraxt_patch(img, patch_size):
  """Cut a (C, H, W) image tensor into non-overlapping square patches.

  Patches are gathered row by row (top to bottom, left to right within a
  row) and stacked along a new leading axis.

  Args:
    img: tensor with exactly three dimensions (C, H, W).
    patch_size: side length of a patch; must divide both H and W.

  Returns:
    Tensor of shape (num_patches, C, patch_size, patch_size).

  Raises:
    AssertionError: if H or W is not a multiple of patch_size.
  """
  _, height, width = img.shape
  assert height % patch_size == 0 and width % patch_size == 0
  tiles = [
      img[:, top:top + patch_size, left:left + patch_size]
      for top in range(0, height, patch_size)
      for left in range(0, width, patch_size)
  ]
  return torch.stack(tiles)

# Demo: split a random 3x32x32 image into 8x8 patches and print the stacked shape.
img = torch.randn(3,32,32)
patch_size = 8
patches = extraxt_patch(img, patch_size)
print(patches.shape)

torch.Size([16, 3, 8, 8])


In [None]:
# Create image from non-overlapping patch
def patch2image(patches, patch_size, img_size=32, num_channel = 3):
  """Reassemble row-major, non-overlapping patches into one square image.

  Args:
    patches: tensor of shape (N, num_channel, patch_size, patch_size) where
      N == (img_size // patch_size) ** 2 and patches are ordered row by row.
    patch_size: side length of each square patch.
    img_size: side length of the reconstructed square image.
    num_channel: number of image channels.

  Returns:
    Tensor of shape (num_channel, img_size, img_size).

  Raises:
    ValueError: if img_size is not a multiple of patch_size, or if `patches`
      does not have the shape implied by the other arguments.
  """
  N, C, H, W = patches.shape
  if img_size % patch_size != 0:
    raise ValueError(f"img_size ({img_size}) must be a multiple of patch_size ({patch_size})")
  grid = img_size // patch_size
  expected_shape = (grid * grid, num_channel, patch_size, patch_size)
  # Without this check a tensor with the right element count but the wrong
  # layout would be viewed into a scrambled image without any error.
  if (N, C, H, W) != expected_shape:
    raise ValueError(f"expected patches of shape {expected_shape}, got {(N, C, H, W)}")
  print(patches.shape)

  # (N, C, p, p) -> (grid_rows, grid_cols, C, p, p)
  patches = patches.view(grid, grid, num_channel, patch_size, patch_size)
  print(patches.shape)

  # -> (C, grid_rows, p, grid_cols, p): each patch row sits next to its grid row.
  patches = patches.permute(2, 0, 3, 1, 4)
  print(patches.shape)

  # Merging (grid_rows, p) and (grid_cols, p) yields full image height/width.
  img = patches.contiguous().view(num_channel, img_size, img_size)
  print(img.shape)
  return img


# Demo: rebuild an image from the `patches` / `patch_size` globals set by the
# patch-extraction cell (default img_size=32, num_channel=3). Overwrites `img`.
img = patch2image(patches, patch_size)
print(img.shape)

torch.Size([16, 3, 8, 8])
torch.Size([4, 4, 3, 8, 8])
torch.Size([3, 4, 8, 4, 8])
torch.Size([3, 32, 32])
torch.Size([3, 32, 32])


In [None]:
# design a function to generate linear embeddings of patches
import torch
import torch.nn as nn
class LinearEmbeddings(nn.Module):
  """Split an image batch into non-overlapping patches and project each linearly.

  Args:
    img_size: expected height and width of the (square) input images.
    patch_size: side length of each square patch.
    embed_dim: output size of the linear projection.
    in_channels: number of channels in the input images.
  """
  def __init__(self, img_size = 224, patch_size=8, embed_dim=768, in_channels=3):
    super().__init__()
    self.img_size = img_size
    self.patch_size = patch_size
    self.embed_dim = embed_dim
    self.in_channels = in_channels
    self.num_patches = (img_size // patch_size) ** 2
    self.LinearProjection = nn.Linear(patch_size * patch_size * in_channels, embed_dim)

  def extract_patches(self, x):
    """Return flattened patches of shape (B, num_patches, C * patch_size**2).

    Row k of the result holds every channel of the k-th patch (patches are
    numbered row by row), flattened in (channel, row, column) order.
    """
    p = self.patch_size
    # (B, C, H, W) -> (B, C, nH, nW, p, p)
    patches = x.unfold(2, p, p).unfold(3, p, p)
    # Move the patch-grid axes ahead of the channel axis. Without this the
    # flatten below would group consecutive single-channel patches together
    # instead of gathering all channels of one patch.
    patches = patches.permute(0, 2, 3, 1, 4, 5)  # (B, nH, nW, C, p, p)
    return patches.flatten(3).flatten(1, 2)


  def forward(self, x):
    """Embed a batch of shape (B, C, img_size, img_size) to (B, num_patches, embed_dim)."""
    _, _, h, w = x.shape
    assert h == self.img_size and w == self.img_size, f"Input image size ({h}*{w}) doesn't match model ({self.img_size}*{self.img_size})"
    patches = self.extract_patches(x)
    embeddings = self.LinearProjection(patches)
    return embeddings

# Demo: embed two random 224x224 RGB images with the default settings
# (patch_size=8 -> 28*28 = 784 patches, embed_dim=768).
LE = LinearEmbeddings()
x = torch.randn(2, 3, 224, 224)
embeddings = LE(x)
print(embeddings.shape)

torch.Size([2, 784, 768])


In [None]:
# # prompt: import image '/content/drive/MyDrive/GraPix POSTER.png' and resize and do the same
# import torch
# from PIL import Image
# from torchvision import transforms
# import matplotlib.pyplot as plt

# def extract_patch(img, patch_size):
#   patches = []
#   c,h,w = img.shape
#   print('Image Shape: ',img.shape)
#   for i in range(0, h, patch_size):
#     for j in range(0, w, patch_size):
#       patch = img[:, i:i+patch_size, j:j+patch_size]
#       patches.append(patch)
#   patches = torch.stack(patches)
#   return patches


# img_path = '/content/drive/MyDrive/GraPix POSTER.png'
# img = Image.open(img_path).convert('RGB')
# # # Resize the image (example: resize to 256x256)
# # new_size = (256, 256)
# # resized_img = img.resize(new_size)
# transform = transforms.Compose([
#     transforms.Resize((256,256)),
#     transforms.ToTensor()
# ])
# resized_img = transform(img)
# patch_size = 64
# patches = extract_patch(resized_img, patch_size)
# print(patches.shape)

# #plot patches
# # fig, axs = plt.subplots(patches.shape[0], 1, figsize=(10, 10))
# # for i in range(patches.shape[0]):
# #   axs[i].imshow(patches[i].permute(1,2,0))
# #   axs[i].axis('off')
# #plt.show()

In [None]:
# design a function for assigning positional embedding to patch embeddings
import torch.nn as nn

class PositionalEmbeddings(nn.Module):
  """Learnable additive positional table with one `dim`-sized vector per position."""

  def __init__(self, seq_len, dim):
    super().__init__()
    # The leading singleton axis lets one table broadcast over the batch axis.
    initial_table = torch.randn(1, seq_len, dim)
    self.pos_embeddings = nn.Parameter(initial_table)

  def forward(self,x):
    """Return `x` plus the positional table (broadcast over dim 0)."""
    shifted = x + self.pos_embeddings
    return shifted

# Demo: add learnable positional embeddings to one random (1, 10, 128) sequence.
seq_len = 10
dim = 128
pe = PositionalEmbeddings(seq_len, dim)
x = torch.randn(1, seq_len, dim)
y = pe(x)
print(y.shape)

torch.Size([1, 10, 128])


In [None]:
# sinusoidal positional embeddings for patch tokens
import torch
import torch.nn as nn
import math

class SinusoidalPositionalEmbeddings(nn.Module):
    """Fixed (non-learned) sine/cosine positional encoding added to the input.

    Args:
        seq_len: number of positions to precompute.
        dim: embedding size; both even and odd values are supported.
    """

    def __init__(self, seq_len, dim):
        super().__init__()
        # Non-persistent buffer: follows .to()/.cuda() but is not written to the state dict.
        self.register_buffer('pos_embeddings', self._build_positional_encoding(seq_len, dim), persistent=False)

    def _build_positional_encoding(self, seq_len, dim):
        """Build the (1, seq_len, dim) table: sine on even columns, cosine on odd ones."""
        pe = torch.zeros(seq_len, dim)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))  # (ceil(dim/2),)
        angles = position * div_term  # (seq_len, ceil(dim/2))

        pe[:, 0::2] = torch.sin(angles)  # even indices
        # For odd `dim` there is one fewer odd column than even columns, so the
        # last frequency has no cosine slot; trim it to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : dim // 2])  # odd indices
        return pe.unsqueeze(0)  # shape (1, seq_len, dim)

    def forward(self, x):
        """Add the first `x.size(1)` rows of the table to `x` of shape (B, N, dim)."""
        return x + self.pos_embeddings[:, :x.size(1), :]

# Demo: add sinusoidal positional embeddings to one random (1, 10, 128) sequence.
seq_len = 10
dim = 128
spe = SinusoidalPositionalEmbeddings(seq_len, dim)
x = torch.randn(1, seq_len, dim)
y = spe(x)
# Bare last expression so the notebook displays the shape.
y.shape

In [None]:
# design multi-head attention for ViT
import torch.nn as nn
class MultiheadAttention(nn.Module):
  """Self-attention wrapper around a batch-first `nn.MultiheadAttention`."""

  def __init__(self, dim, num_heads):
    super().__init__()
    self.attention = nn.MultiheadAttention(
        embed_dim=dim,
        num_heads=num_heads,
        batch_first=True,
    )

  def forward(self, x):
    """Attend `x` to itself and return only the attended output (weights dropped)."""
    attended, _weights = self.attention(x, x, x)
    return attended

# Demo: 8-head self-attention over one random (1, 10, 128) sequence.
attention = MultiheadAttention(dim=128, num_heads=8)
x = torch.randn(1, 10, 128)
y = attention(x)
print(y.shape)

torch.Size([1, 10, 128])


In [None]:
# design multi-head attention with layer normalization for ViT
import torch
import torch.nn as nn

class MultiheadAttentionNormDropout(nn.Module):
  """LayerNorm -> multi-head self-attention -> dropout (no residual connection)."""

  def __init__(self, dim, num_heads, dropout):
    super().__init__()
    self.norm = nn.LayerNorm(dim)
    self.MHA = nn.MultiheadAttention(
        embed_dim=dim,
        num_heads=num_heads,
        batch_first=True,
    )
    self.Dropout = nn.Dropout(dropout)

  def forward(self,x):
    """Normalize `x`, self-attend over it, then apply dropout to the result."""
    normed = self.norm(x)
    attended, _ = self.MHA(normed, normed, normed)
    return self.Dropout(attended)

# Demo: the module is in training mode here, so dropout (p=0.1) is active.
MHAND = MultiheadAttentionNormDropout(dim=128, num_heads=8, dropout=0.1)
x = torch.randn(1, 10, 128)
y = MHAND(x)
print(y.shape)

torch.Size([1, 10, 128])


In [None]:
# design transformer encoder block
class TransformerEncoderBlock(nn.Module):
  """Pre-norm transformer encoder block: self-attention and an MLP, each with a residual.

  Args:
    dim: token embedding size.
    num_heads: number of attention heads.
    mlp_ratio: hidden width of the MLP as a multiple of `dim`.
  """
  def __init__(self, dim, num_heads, mlp_ratio):
    super().__init__()
    hidden_dim = int(dim * mlp_ratio)
    self.norm1 = nn.LayerNorm(dim)
    self.atten = nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, batch_first=True)
    self.norm2 = nn.LayerNorm(dim)
    self.mlp = nn.Sequential(
        nn.Linear(dim, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, dim)
    )

  def forward(self, x):
    """Apply the block to `x` of shape (B, N, dim); the shape is preserved."""
    # Normalize once and reuse the result for query, key and value. Passing the
    # same tensor also lets nn.MultiheadAttention use its packed
    # self-attention projection instead of three separate ones.
    normed = self.norm1(x)
    x = x + self.atten(normed, normed, normed)[0]
    x = x + self.mlp(self.norm2(x))
    return x

# Demo: one encoder block applied to a random (1, 10, 128) sequence; shape is preserved.
TEB = TransformerEncoderBlock(dim=128, num_heads=8, mlp_ratio=4)
x = torch.randn(1, 10, 128)
y = TEB(x)
print(y.shape)

torch.Size([1, 10, 128])


In [None]:
# design a module that pre-processes an image (patch projection + CLS token + positional embeddings) before sending it to the ViT encoder
class PatchEmbedding(nn.Module):
  """Turn an image batch into ViT input tokens.

  A strided convolution projects each patch to `embed_dim`, a learnable CLS
  token is prepended, and learnable positional embeddings are added.

  NOTE(review): a later cell defines another class with this same name; once
  that cell runs, it replaces this one in the notebook namespace.
  """

  def __init__(self, img_size = 224, in_channels = 3, embed_dim = 768, patch_size =16):
    super().__init__()
    # One position per patch plus one for the CLS token.
    num_tokens = (img_size // patch_size) ** 2 + 1
    self.projection = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
    self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
    self.pos_embeddings = nn.Parameter(torch.randn(1, num_tokens, embed_dim))

  def forward(self, x):
    """Map (B, C, H, W) images to (B, num_patches + 1, embed_dim) tokens."""
    # (B, D, gh, gw) -> (B, D, gh*gw) -> (B, gh*gw, D)
    patch_tokens = self.projection(x).flatten(2).transpose(1, 2)
    cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
    tokens = torch.cat((cls_tokens, patch_tokens), dim=1)
    return tokens + self.pos_embeddings

# Demo: default settings give 14*14 patches + 1 CLS token = 197 tokens of size 768.
PE = PatchEmbedding()
x = torch.randn(1, 3, 224, 224)
y = PE(x)
print(y.shape)

torch.Size([1, 197, 768])


In [None]:
#Implement vanilla ViT for classification
import torch
import torch.nn as nn

class PatchEmbedding(nn.Module):
  """Project non-overlapping image patches to `embed_dim` with a strided convolution.

  Unlike the earlier class of the same name in this notebook, this one adds
  neither a CLS token nor positional embeddings.
  """

  def __init__(self, img_size = 224, in_channels = 3, embed_dim = 768, patch_size = 16):
    super().__init__()
    patches_per_side = img_size // patch_size
    self.patch_size = patch_size
    self.grid_size = patches_per_side
    self.num_patches = patches_per_side ** 2
    # kernel == stride == patch_size, so each output location sees exactly one patch.
    self.projection = nn.Conv2d(
        in_channels,
        embed_dim,
        kernel_size=patch_size,
        stride=patch_size,
    )

  def forward(self, x):
    """Map (B, C, H, W) images to (B, num_patches, embed_dim) tokens."""
    feature_map = self.projection(x)   # (B, D, gh, gw)
    tokens = feature_map.flatten(2)    # (B, D, gh*gw)
    return tokens.transpose(1, 2)      # (B, gh*gw, D)

class MLP(nn.Module):
  """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

  Args:
    in_features: input width.
    hidden_features: hidden width; falls back to `in_features` when falsy.
    out_features: output width; falls back to `in_features` when falsy.
    dropout: dropout probability used after the activation and after the output.
  """

  def __init__(self, in_features, hidden_features=None, out_features=None, dropout=0.1):
    super().__init__()
    if not hidden_features:
      hidden_features = in_features
    if not out_features:
      out_features = in_features
    layers = [
        nn.Linear(in_features, hidden_features),
        nn.GELU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_features, out_features),
        nn.Dropout(dropout),
    ]
    self.fc = nn.Sequential(*layers)

  def forward(self, x):
    """Run `x` through the feed-forward stack."""
    out = self.fc(x)
    return out

class TransformerEncoderBlock(nn.Module):
  """Pre-norm transformer encoder block with a dropout-regularized MLP.

  Args:
    dim: token embedding size.
    num_heads: number of attention heads.
    mlp_ratio: hidden width of the MLP as a multiple of `dim`.
    dropout: dropout probability passed to the MLP (the attention layer is
      built without dropout).
  """
  def __init__(self, dim, num_heads, mlp_ratio, dropout):
    super().__init__()
    self.norm1 = nn.LayerNorm(dim)
    self.atten = nn.MultiheadAttention(embed_dim = dim, num_heads=num_heads, batch_first=True)
    self.norm2 = nn.LayerNorm(dim)
    self.mlp = MLP(dim, int(dim*mlp_ratio), dropout=dropout)

  def forward(self, x):
    """Apply the block to `x` of shape (B, N, dim); the shape is preserved."""
    # Normalize once and reuse the result for query, key and value. Passing the
    # same tensor also lets nn.MultiheadAttention use its packed
    # self-attention projection instead of three separate ones.
    normed = self.norm1(x)
    x = x + self.atten(normed, normed, normed)[0]
    x = x + self.mlp(self.norm2(x))
    return x

class VisionTransformer(nn.Module):
  """Vanilla ViT classifier: patch embedding, CLS token, encoder stack, linear head.

  Relies on the `PatchEmbedding` (the variant exposing `num_patches`) and the
  four-argument `TransformerEncoderBlock` being in scope.
  """

  def __init__(self, img_size=224, patch_size = 16, in_channels = 3, embed_dim = 768, num_heads = 12,
               mlp_ratio = 4.0, dropout = 0.1, depth= 12, num_classes = 1000):
    super().__init__()
    self.patch_embeddings = PatchEmbedding(img_size, in_channels, embed_dim, patch_size)
    self.num_patches = self.patch_embeddings.num_patches

    # One extra position for the CLS token that is prepended in forward().
    self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
    self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))
    self.posDrop = nn.Dropout(p=dropout)

    encoder_layers = []
    for _ in range(depth):
      encoder_layers.append(TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout))
    self.blocks = nn.Sequential(*encoder_layers)
    self.norm = nn.LayerNorm(embed_dim)
    self.head = nn.Linear(embed_dim, num_classes)

    self._init_weights()

  def _init_weights(self):
    """Re-initialize positional table, CLS token and head weight (trunc-normal, std 0.02); zero the head bias."""
    for tensor in (self.pos_embed, self.cls_token, self.head.weight):
      nn.init.trunc_normal_(tensor, std=0.02)
    head_bias = self.head.bias
    if head_bias is not None:
      nn.init.zeros_(head_bias)

  def forward(self, x):
    """Return class logits of shape (B, num_classes) for an image batch."""
    tokens = self.patch_embeddings(x)
    cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
    tokens = torch.cat((cls_tokens, tokens), dim = 1) + self.pos_embed
    tokens = self.posDrop(tokens)

    encoded = self.norm(self.blocks(tokens))
    # Only the CLS token (index 0) feeds the classifier.
    return self.head(encoded[:, 0])

In [None]:
# Smoke test: a 10-class ViT on two random 224x224 RGB images -> logits of shape (2, 10).
model = VisionTransformer(img_size = 224, patch_size=16, num_classes=10)
dummy_input = torch.randn(2, 3, 224, 224)
output = model(dummy_input)
print(output.shape)

torch.Size([2, 10])


In [None]:
# Vit encoder for object detection
import torch.nn.functional as F
class DetectionVit(nn.Module):
  """ViT encoder with an MLP head that regresses boxes and class scores from the CLS token.

  The head emits `num_box * (4 + num_classes)` values per image. Relies on
  `PatchEmbedding` (the variant exposing `num_patches`) and the four-argument
  `TransformerEncoderBlock` being in scope.
  """

  def __init__(self, img_size=224, patch_size=16, embed_dim=768,
               num_heads = 12, depth=6, mlp_ratio=4.0, num_box = 1, num_classes =1, in_channel=3, dropout=0.1):
    super().__init__()
    self.patch_embed = PatchEmbedding(img_size, in_channel, embed_dim, patch_size)
    self.num_patches = self.patch_embed.num_patches
    # One extra position for the CLS token that is prepended in forward().
    self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
    self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

    encoder_layers = []
    for _ in range(depth):
      encoder_layers.append(TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout))
    self.blocks = nn.Sequential(*encoder_layers)
    self.norm = nn.LayerNorm(embed_dim)
    self.num_box = num_box
    self.num_classes = num_classes

    # 4 box coordinates plus one score per class, for every predicted box.
    head_out = self.num_box * (4 + self.num_classes)
    self.mlp_head = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        nn.ReLU(),
        nn.Linear(embed_dim, head_out)
    )

  def forward(self, x):
    """Return raw predictions of shape (B, num_box * (4 + num_classes))."""
    tokens = self.patch_embed(x)
    cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
    tokens = torch.cat([cls_tokens, tokens], dim=1) + self.pos_embed
    encoded = self.norm(self.blocks(tokens))
    # Only the CLS token (index 0) feeds the detection head.
    return self.mlp_head(encoded[:, 0])

# Smoke test with the defaults (num_box=1, num_classes=1): output is (2, 5).
Detector = DetectionVit()
dummy_input = torch.randn(2, 3, 224, 224)
output = Detector(dummy_input)
print(output.shape)

# Split the head output: first 4 columns are the box, the rest are class logits.
# NOTE(review): this slicing only matches the head layout for num_box == 1.
bbox_pred = output[:,:4]
cls_pred = output[:,4:]
print(bbox_pred.shape)
print(cls_pred.shape)

def detection_loss(bbox_pre, bbox_gt, cls_pred, cls_gt):
  """Sum of box-regression MSE and classification BCE-with-logits.

  Args:
    bbox_pre: predicted box values.
    bbox_gt: target box values, same shape as `bbox_pre`.
    cls_pred: raw (pre-sigmoid) class logits.
    cls_gt: class targets, same shape as `cls_pred`.

  Returns:
    Scalar tensor: mean squared box error plus mean binary cross-entropy.
  """
  regression_term = F.mse_loss(bbox_pre, bbox_gt)
  classification_term = F.binary_cross_entropy_with_logits(cls_pred, cls_gt)
  total = regression_term + classification_term
  return total

# Dummy targets for a smoke test of the loss.
target = torch.randn(2,4)
# Binary cross-entropy targets must lie in [0, 1]; draw random 0/1 labels
# rather than unbounded normal samples, which give a meaningless loss.
target_cls = torch.randint(0, 2, (2,1)).float()
loss = detection_loss(bbox_pred, target, cls_pred, target_cls)
print(loss)

torch.Size([2, 5])
torch.Size([2, 4])
torch.Size([2, 1])
tensor(1.3038, grad_fn=<AddBackward0>)


In [None]:
# Vit encoder for segmentation
class SegmentVit(nn.Module):
  """ViT encoder with a convolutional head that outputs per-pixel class logits.

  Patch tokens are reshaped to a square feature map, bilinearly upsampled to
  the input resolution, then passed through the conv head. Relies on
  `PatchEmbedding` (the variant exposing `num_patches`), the four-argument
  `TransformerEncoderBlock`, and `F` (torch.nn.functional) being in scope.
  """

  def __init__(self, img_size=224, patch_size=16, embed_dim=768,
               num_heads = 12, depth=6, mlp_ratio=4.0, num_box = 1, num_classes =2, in_channel=3, dropout=0.1):
    super().__init__()
    self.patch_embed = PatchEmbedding(img_size, in_channel, embed_dim, patch_size)
    self.num_patches = self.patch_embed.num_patches
    # One extra position for the CLS token that is prepended in forward().
    self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
    self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

    encoder_layers = []
    for _ in range(depth):
      encoder_layers.append(TransformerEncoderBlock(embed_dim, num_heads, mlp_ratio, dropout))
    self.blocks = nn.Sequential(*encoder_layers)
    self.norm = nn.LayerNorm(embed_dim)
    self.num_box = num_box  # stored but not used by this class
    self.num_classes = num_classes
    self.patch_size = patch_size
    self.img_size = img_size

    # Convolutional decoder, despite the `mlp_head` attribute name.
    head_layers = [
        nn.Conv2d(embed_dim, 256, kernel_size=3, padding=1),
        nn.BatchNorm2d(256),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 128, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(128, num_classes, kernel_size=1),
    ]
    self.mlp_head = nn.Sequential(*head_layers)

  def forward(self, x):
    """Return logits of shape (B, num_classes, img_size, img_size)."""
    batch = x.shape[0]
    tokens = self.patch_embed(x)
    cls_tokens = self.cls_token.expand(batch, -1, -1)
    tokens = torch.cat([cls_tokens, tokens], dim=1) + self.pos_embed
    tokens = self.norm(self.blocks(tokens))

    patch_tokens = tokens[:, 1:, :]  # drop the CLS token -> (B, N, D)
    _, n, _ = patch_tokens.shape
    side = int(n**0.5)  # assumes a square patch grid
    feature_map = patch_tokens.permute(0, 2, 1).reshape(batch, -1, side, side)
    feature_map = F.interpolate(
        feature_map,
        size=(self.img_size, self.img_size),
        mode = 'bilinear',
        align_corners=True,
    )
    return self.mlp_head(feature_map)

# Smoke test with the defaults (num_classes=2): output is (2, 2, 224, 224).
Segmentor = SegmentVit()
dummy_input = torch.randn(2, 3, 224, 224)
output = Segmentor(dummy_input)
print(output.shape)

torch.Size([2, 2, 224, 224])


LOAD pretrained ViT for classification on CIFAR-10

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

import timm

#Load Model
# Fetches pretrained weights through timm (pretrained=True).
model = timm.create_model('vit_base_patch16_224', pretrained = True)
#print(model)

# Modify Classifier
# Swap in a fresh 10-output linear head with the same input width as the original head.
model.head = nn.Linear(model.head.in_features, 10)

#Freeze Model except classifier
# Every parameter whose name does not start with "head" stops receiving gradients.
for name, param in model.named_parameters():
  if not name.startswith("head"):
    param.requires_grad = False


#Dataset Handling
# Resize to 224, convert to tensor, then normalize each channel with mean 0.5 and std 0.5.
transform = transforms.Compose([transforms.Resize(224),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR-10 is downloaded into ./data if missing; train=False selects the held-out split.
train_dataset = datasets.CIFAR10(root = './data', train = True, download=True, transform=transform)
val_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle= False)


#Training Pipeline
# The device string must be exactly "cuda"; a trailing space is rejected by
# torch.device, which would break every run on a GPU machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# Hand the optimizer only the parameters that are still trainable.
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr = 0.001)

for epoch in range(10):
  model.train()
  total_loss = 0.0
  correct = 0

  for images, labels in train_loader:
    images, labels = images.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(images)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # .item() detaches the value; summing the loss tensors themselves would
    # keep autograd history alive for the whole epoch.
    total_loss += loss.item()
    correct += (outputs.argmax(1) == labels).sum().item()

  # Mean loss per batch and accuracy over all training samples.
  train_loss = total_loss / len(train_loader)
  train_acc = correct / len(train_dataset)

  print(f"Epoch: {epoch+1} Train Loss: {train_loss:.4f} Train Acc: {train_acc:.4f}")


100%|██████████| 170M/170M [00:02<00:00, 73.4MB/s]


In [None]:
# Validation accuracy of the fine-tuned model.
model.eval()
correct = 0

# No gradients are needed for evaluation; disabling autograd saves memory and time.
with torch.no_grad():
  for images, target in val_loader:
    images, target = images.to(device), target.to(device)
    output = model(images)
    correct += (output.argmax(1) == target).sum().item()

val_acc = correct / len(val_dataset)

# Printed as a percentage.
print(f"Accuracy: {100. * val_acc:.4f}")

USER-DEFINED CODE FOR MHA BLOCK

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """Feed-forward sub-layer: Linear -> GELU -> Dropout -> Linear -> Dropout.

    `hidden_features` and `out_features` fall back to `in_features` when falsy.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, dropout=0.1):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        expand = nn.Linear(in_features, hidden_features)
        activation = nn.GELU()
        hidden_drop = nn.Dropout(dropout)
        contract = nn.Linear(hidden_features, out_features)
        output_drop = nn.Dropout(dropout)
        self.fc = nn.Sequential(expand, activation, hidden_drop, contract, output_drop)

    def forward(self, x):
        """Run `x` through the feed-forward stack."""
        result = self.fc(x)
        return result

class TransformerEncoderBlock(nn.Module):
    """Pre-norm encoder block with hand-written multi-head self-attention.

    Args:
        dim: token embedding size; must be divisible by `num_heads`.
        num_heads: number of attention heads.
        mlp_ratio: hidden width of the MLP as a multiple of `dim`.
        dropout: probability for the one dropout layer shared by the attention
            weights and the attention output, and passed on to the MLP.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4.0, dropout=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.dim = dim
        self.head_dim = dim // num_heads
        assert self.head_dim * num_heads == dim, "embed_dim must be divisible by num_heads"

        self.norm1 = nn.LayerNorm(dim)

        # Separate learnable projections for queries, keys and values.
        self.q_proj = nn.Linear(dim, dim)
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)

        self.out_proj = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, int(dim * mlp_ratio), dropout=dropout)

    def _split_heads(self, tensor, batch, tokens):
        """Reshape (B, N, D) into (B, num_heads, N, head_dim)."""
        return tensor.view(batch, tokens, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply attention and MLP sub-layers, each with a residual; shape is preserved."""
        B, N, D = x.shape
        normed = self.norm1(x)

        queries = self._split_heads(self.q_proj(normed), B, N)
        keys = self._split_heads(self.k_proj(normed), B, N)
        values = self._split_heads(self.v_proj(normed), B, N)

        # Scaled dot-product attention -> weights of shape (B, num_heads, N, N).
        scale = self.head_dim ** 0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / scale
        weights = self.dropout(F.softmax(scores, dim=-1))
        context = torch.matmul(weights, values)  # (B, num_heads, N, head_dim)

        # Merge the heads back into one D-sized vector per token.
        merged = context.transpose(1, 2).reshape(B, N, D)
        attended = self.dropout(self.out_proj(merged))

        x = x + attended
        return x + self.mlp(self.norm2(x))


In [None]:
class TransformerEncoderBlock(nn.Module):
  """Pre-norm encoder block with hand-written multi-head self-attention and a plain MLP.

  Args:
    dim: token embedding size; must be divisible by `num_heads`.
    num_heads: number of attention heads.
    mlp_ratio: hidden width of the MLP as a multiple of `dim`.
    dropout: probability for the dropout layer shared by the attention weights
      and the attention output.
  """
  def __init__(self, dim, num_heads, mlp_ratio, dropout=0.1):
    super().__init__()

    # forward() reads self.num_heads when splitting heads, so it must be stored.
    self.num_heads = num_heads
    self.head_dim = dim // num_heads
    assert self.head_dim * num_heads == dim, "embed_dim must be divisible by num_heads"

    self.norm1 = nn.LayerNorm(dim)
    # Separate learnable projections replace nn.MultiheadAttention here.
    self.q_proj = nn.Linear(dim,dim)
    self.k_proj = nn.Linear(dim,dim)
    self.v_proj = nn.Linear(dim,dim)

    self.out_proj = nn.Linear(dim,dim)
    self.dropout = nn.Dropout(dropout)

    self.norm2 = nn.LayerNorm(dim)
    hidden_dim = int(dim * mlp_ratio)
    self.mlp = nn.Sequential(
        nn.Linear(dim, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, dim)
    )

  def forward(self, x):
    """Apply attention and MLP sub-layers, each with a residual; shape (B, N, dim) is preserved."""
    B, N, D = x.shape
    x_norm = self.norm1(x)

    q = self.q_proj(x_norm)
    k = self.k_proj(x_norm)
    v = self.v_proj(x_norm)

    # (B, N, D) -> (B, num_heads, N, head_dim)
    q = q.view(B, N, self.num_heads, self.head_dim).transpose(1,2)
    k = k.view(B, N, self.num_heads, self.head_dim).transpose(1,2)
    v = v.view(B, N, self.num_heads, self.head_dim).transpose(1,2)

    # Scaled dot-product attention -> weights of shape (B, num_heads, N, N).
    atten_scores = (q @ k.transpose(-2,-1)) / (self.head_dim ** 0.5)
    atten_probs = F.softmax(atten_scores, dim=-1)
    atten_probs = self.dropout(atten_probs)
    atten_out = atten_probs @ v

    # Merge heads: (B, num_heads, N, head_dim) -> (B, N, D).
    atten_out = atten_out.transpose(1,2).contiguous().view(B, N, D)
    atten_out = self.out_proj(atten_out)
    atten_out = self.dropout(atten_out)

    x = x + atten_out
    x = x + self.mlp(self.norm2(x))
    return x

# Demo: run the hand-written attention block on a random (1, 10, 128) sequence.
TEB = TransformerEncoderBlock(dim=128, num_heads=8, mlp_ratio=4)
x = torch.randn(1, 10, 128)
y = TEB(x)
print(y.shape)

LOAD CLIP TO EXTRACT TEXT EMBEDDINGS

In [None]:
# !pip install -q open_clip_torch transformers
# !pip install -q ftfy

import open_clip
import torch
from PIL import Image

# NOTE: this rebinds the notebook-global `model` used by earlier cells.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model.eval()  # inference only: switch off any training-mode behaviour
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Edit this list to choose the prompts whose embeddings you want.
text = tokenizer(["a dog", "a cat", "a horse"])
# Embedding extraction needs no gradients.
with torch.no_grad():
  text_features = model.encode_text(text)

# Bare last expression so the notebook displays the shape.
text_features.shape