<a href="https://colab.research.google.com/github/Saibhossain/visual-Transformers_-ViT-/blob/main/ViT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vision Transformer (ViT): patch and position embeddings

In [None]:
from PIL import Image

# Location of the sample picture inside the Colab runtime.
IMAGE_PATH = "/content/cat.jpeg"
img = Image.open(IMAGE_PATH)

# Leaving the image as the cell's last expression makes the notebook display it.
img


In [None]:
from transformers import AutoProcessor, SiglipVisionModel, SiglipVisionConfig

# One checkpoint id feeds both the image processor and the vision model.
CHECKPOINT = "google/siglip-base-patch16-224"

processor = AutoProcessor.from_pretrained(CHECKPOINT)

# NOTE(review): vision_use_head=False presumably drops the pooling head so the
# model exposes per-patch features only — confirm against the installed
# transformers version.
vision_config = SiglipVisionConfig(vision_use_head=False)
vision_model = SiglipVisionModel.from_pretrained(CHECKPOINT, config=vision_config)

# Final expression: show the module tree of the loaded model.
vision_model

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from dataclasses import dataclass
from torchvision import transforms

def preprocess_image(image, image_size=224):
  """Convert a PIL image into a normalised, batched tensor.

  Args:
    image: Input image. A PIL image in a non-RGB mode (e.g. RGBA, L, CMYK, P)
      is converted to RGB first, because the three-channel normalisation
      below raises on any other channel count.
    image_size: Side length, in pixels, of the square the image is resized
      to. Defaults to 224.

  Returns:
    For a PIL input, a float tensor of shape
    (1, 3, image_size, image_size).
  """
  # Duck-typed so that only objects offering PIL's `convert` are touched;
  # anything else reaches the transform pipeline unchanged.
  convert = getattr(image, "convert", None)
  if callable(convert) and getattr(image, "mode", None) != "RGB":
    image = convert("RGB")

  preprocess = transforms.Compose([
      transforms.Resize((image_size, image_size)),
      transforms.ToTensor(),
      # Commonly used ImageNet channel statistics.
      # NOTE(review): the SigLIP processor used elsewhere in this notebook may
      # normalise differently — confirm before comparing the two pipelines.
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  # unsqueeze(0) prepends the batch dimension.
  return preprocess(image).unsqueeze(0)

# Patch-grid configuration, defined before first use so that the same
# image_size drives both the preprocessing and the patch count.
embed_dim = 768
patch_size = 16
image_size = 224
num_patches = (image_size // patch_size) ** 2

# Both layers below are randomly initialised; a fixed seed keeps the resulting
# embeddings (and any plot of them) identical across kernel restarts.
torch.manual_seed(0)

image_tensor = preprocess_image(img, image_size=image_size)
print("image shape",image_tensor.shape)

# A convolution whose kernel and stride both equal patch_size visits every
# non-overlapping patch exactly once and projects it to embed_dim channels.
patch_embadding = nn.Conv2d(in_channels=3, out_channels=embed_dim, kernel_size=patch_size, stride=patch_size)
with torch.no_grad():
  patches = patch_embadding(image_tensor)
  print(patches.shape)

# The convolution's output grid must contain exactly num_patches cells,
# otherwise the position embeddings below would not line up with the patches.
grid_cells = patches.shape[-2] * patches.shape[-1]
assert grid_cells == num_patches, f"expected {num_patches} patches, got {grid_cells}"

# One learnable vector per patch position; ids are 0..num_patches-1 with a
# leading batch dimension of 1.
position_embedding = nn.Embedding(num_patches, embed_dim)
position_ids = torch.arange(num_patches).expand((1,-1))

print(position_ids.shape)

embeddings = patches.flatten(start_dim=2) # Flatten the spatial dimensions (14x14) into one
embeddings = embeddings.transpose(1, 2)  # Transpose to get (batch_size, num_patches, embed_dim)
embeddings = embeddings + position_embedding(position_ids)
print(embeddings.shape)

In [None]:
import matplotlib.pyplot as plt

# Heatmap of the embeddings built from the randomly initialised layers:
# one row per patch, one column per embedding dimension.
patches_viz = embeddings[0].detach().numpy()

fig, ax = plt.subplots(figsize=(15, 8))
heatmap = ax.imshow(patches_viz, aspect='auto', cmap='viridis')
fig.colorbar(heatmap, ax=ax)
# Labels mirror the trained-model plot so the two figures compare directly.
ax.set_title('Untrained Model: All Patch Embeddings')
ax.set_xlabel('Embedding Dimension')
ax.set_ylabel('Patch Number')
plt.show()

In [None]:
# Put the pretrained model into evaluation mode before running it.
vision_model.eval()

# The checkpoint's processor prepares the image and returns PyTorch tensors.
inputs = processor(images=img, return_tensors="pt")

# Only the embedding layer is run, with gradient tracking switched off.
with torch.no_grad():
    patch_embeddings = vision_model.vision_model.embeddings(inputs["pixel_values"])

print(patch_embeddings.shape)

# First (only) item of the batch as a NumPy array for plotting.
patches_viz = patch_embeddings.detach()[0].numpy()

fig, ax = plt.subplots(figsize=(15, 8))
heatmap = ax.imshow(patches_viz, aspect='auto', cmap='viridis')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Trained Model: All Patch Embeddings')
ax.set_xlabel('Embedding Dimension')
ax.set_ylabel('Patch Number')
plt.show()