<a href="https://colab.research.google.com/github/Sassotek/Mechanistic-Interpretability-for-Vision-Models-Optimization/blob/main/Mechanistic_Interpretability_for_Vision_Models_Optimization_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Notes

##########Dataset Description
File descriptions

    train.images.zip - the training set (images distributed into class labeled folders)
    test.zip - the unlabeled 10,000 test images
   
    wnids.txt - list of the used ids from the original full set of ImageNet
    words.txt - description of all ids of ImageNet


+++++ https://www.kaggle.com/datasets/wissamsalam/tiny-imagenet-cleaned-for-classification


https://viso.ai/deep-learning/vision-transformer-vit/

https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch

https://docs.pytorch.org/vision/main/models/vision_transformer.html

https://github.com/lucidrains/vit-pytorch

https://arxiv.org/pdf/2010.11929.pdf

https://www.youtube.com/watch?v=j3VNqtJUoz0

https://www.youtube.com/watch?v=vJF3TBI8esQ

https://www.youtube.com/watch?v=nZ22Ecg9XCQ

**link sbagliato**

https://www.kaggle.com/c/tiny-imagenet

#####GuidaEinops

https://nbviewer.org/github/arogozhnikov/einops/blob/main/docs/1-einops-basics.ipynb

**latent_size** (o anche detto embedding_dim, hidden_dim, ecc.) è la dimensione del vettore che rappresenta ogni patch dopo la proiezione lineare, ovvero la dimensione dello spazio latente in cui il modello "lavora".

**class token** per ogni batch, un vettore speciale che sarà usato dal ViT per l'output della classificazione.

**positional embedding** iniziale. Verrà poi ripetuta per il numero di patch + 1 (per il class token). Serve a dare informazioni sulla posizione dei patch nel Transformer.

**patches = einops.rearrange**(
    input,
    'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
    p1=self.patch_size, p2=self.patch_size
)*
Usa Einops per dividere ogni immagine in patch:

    Input: (b, c, H, W)

    Output: (b, N_patch, patch_dim), dove:

        N_patch = (H // patch_size) * (W // patch_size)

        patch_dim = patch_size * patch_size * c

Esempio: immagine 64x64, patch_size=16 → 16 patch da 16x16x3

In [None]:
#save_path_model= ................./ciao.pth
#save_path_opt= ..../ott.pth

#Imports

In [2]:
import math
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from matplotlib.gridspec import GridSpec
from matplotlib.patches import FancyArrowPatch
import numpy as np
from PIL import Image
import cv2
from google.colab.patches import cv2_imshow
from skimage import io, color

#Kaggle
!pip install kagglehub --quiet
import kagglehub

#PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import ModuleList
from torch.utils.data import Dataset,DataLoader, SubsetRandomSampler, ConcatDataset
import torchvision
from torchvision import transforms,datasets
from torchsummary import summary
import torch.optim as optim

import random
import os
import pandas as pd..
from tqdm.notebook import tqdm
import time
import seaborn as sns
import einops
import sys
import requests
import urllib.request
from io import BytesIO
from prettytable import PrettyTable
from scipy import signal
from scipy.fft import fft, fftfreq, fftshift




from IPython.display import display, Markdown, clear_output
from IPython.display import HTML
import ipywidgets as widgets
from ipywidgets import interact, IntSlider, FloatSlider, FloatRangeSlider, Dropdown




import sklearn
from sklearn.metrics import confusion_matrix, classification_report



## GDrive settings
from google.colab import drive
drive.mount('/content/drive')


# Setting the seed
torch.manual_seed(240700)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print("Device",device)
if torch.cuda.is_available():
   torch.cuda.manual_seed(240700)
   torch.cuda.manual_seed_all(240700)

#ensure that all operations are deterministic on GPU if used,for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



Mounted at /content/drive
Device cuda


# Globals

In [3]:
directory_file_path="/content/drive/File_projects_CV"

RED = "\033[31m"
GREEN = "\033[32m"
BLUE = "\033[34m"
RESET = "\033[0m"
CYAN="\033[36m"
MAGENTA="\033[35m"

#Transformer hyperparameters
patch_size = 16
latent_size = 768
n_channels = 3
n_heads = 12
n_encoders = 12
dropout = 0.1
n_classes = 200 #See tinyimagenet classes
image_size = 64 #See tinyimagenet classes

epochs = 10
batch_size = 4
threshold = 10e-3
weight_decay = 0.03

#Data

In [14]:
#data_path = kagglehub.dataset_download("akash2sharma/tiny-imagenet")
data_path= kagglehub.dataset_download("wissamsalam/tiny-imagenet-cleaned-for-classification")
data_path= data_path+'/tiny-imagenet-200/'
print("Path to dataset files:", data_path)

Path to dataset files: /kaggle/input/tiny-imagenet-cleaned-for-classification/tiny-imagenet-200/


In [24]:
def print_dataset_structure(root_dir, max_classes=3, max_images=3):
    print(f"📁 Directory: {root_dir}\n")

    root_items = sorted(os.listdir(root_dir))
    for item in root_items:
        item_path = os.path.join(root_dir, item)

        if os.path.isfile(item_path):
            print(f"📄 {item}")

        elif os.path.isdir(item_path) and item in ['train', 'val', 'test']:
            print(f"\n📂 {item}/")

            class_dirs_all = sorted([
                d for d in os.listdir(item_path)
                if os.path.isdir(os.path.join(item_path, d))
            ])
            class_dirs = class_dirs_all[:max_classes]

            if not class_dirs:
                print(f"  (No subdirectory found into {item}/)")
                continue

            for cls in class_dirs:
                cls_path = os.path.join(item_path, cls)
                print(f"  ├── {cls}/")

                image_files_all = sorted([
                    f for f in os.listdir(cls_path)
                    if os.path.isfile(os.path.join(cls_path, f))
                ])
                image_files = image_files_all[:max_images]

                for img in image_files:
                    print(f"  │   ├── {img}")

                if len(image_files_all) > max_images:
                    print(f"  │   └── ...")

            if len(class_dirs_all) > max_classes:
                print(f"  └── ...")


print_dataset_structure(data_path)



📁 Directory: /kaggle/input/tiny-imagenet-cleaned-for-classification/tiny-imagenet-200/


📂 test/
  ├── n01443537/
  │   ├── n01443537_0.JPEG
  │   ├── n01443537_1.JPEG
  │   ├── n01443537_101.JPEG
  │   └── ...
  ├── n01629819/
  │   ├── n01629819_0.JPEG
  │   ├── n01629819_1.JPEG
  │   ├── n01629819_10.JPEG
  │   └── ...
  ├── n01641577/
  │   ├── n01641577_0.JPEG
  │   ├── n01641577_1.JPEG
  │   ├── n01641577_104.JPEG
  │   └── ...
  └── ...

📂 train/
  ├── n01443537/
  │   ├── n01443537_10.JPEG
  │   ├── n01443537_100.JPEG
  │   ├── n01443537_102.JPEG
  │   └── ...
  ├── n01629819/
  │   ├── n01629819_100.JPEG
  │   ├── n01629819_101.JPEG
  │   ├── n01629819_102.JPEG
  │   └── ...
  ├── n01641577/
  │   ├── n01641577_10.JPEG
  │   ├── n01641577_100.JPEG
  │   ├── n01641577_101.JPEG
  │   └── ...
  └── ...

📂 val/
  ├── n01443537/
  │   ├── val_1230.JPEG
  │   ├── val_1267.JPEG
  │   ├── val_1284.JPEG
  │   └── ...
  ├── n01629819/
  │   ├── val_1054.JPEG
  │   ├── val_1167.JPEG
  │ 

In [None]:
##function to check dim and stuff

#Network

##Input Embedding

In [None]:
class InputEmbedding(nn.Module):
    def __init__(self, patch_size=patch_size, n_channels=n_channels, latent_size=latent_size,
                batch_size=1, device=device):
        super(InputEmbedding, self).__init__()
        self.patch_size = patch_size
        self.n_channels = n_channels
        self.latent_size = latent_size
        self.batch_size = batch_size
        self.device = device

        self.input_size = self.patch_size*self.patch_size*self.n_channels
        self.LinearProjection = nn.Linear(self.input_size, self.latent_size) #Linear projection
        self.class_token = nn.Parameter(torch.randn(self.batch_size, 1, self.latent_size)).to(self.device) #Class token
        self.positional_embedding = nn.Parameter(torch.randn(self.batch_size, 1, self.latent_size)).to(self.device) #Positional embedding


    def forward(self, input):
        input = input.to(self.device)

        #Patchification of input image
        patches = einops.rearrange(
            input,
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=self.patch_size, p2=self.patch_size
            )

        print(input.size())
        print(patches.size())

        LinearProjection = self.LinearProjection(patches).to(self.device)
        batch, n_patches, _ = LinearProjection.shape
        LinearProjection = torch.cat((self.class_token, LinearProjection), dim=1)

        #controllare erpché tutti i token di ogni immagine hanno la stessa positional embedding!
        positional_embedding = einops.repeat(
            self.positional_embedding,
            'b 1 d -> b m d',
            m=n_patches+1
            )

        print(LinearProjection.size())
        print(positional_embedding.size())

        LinearProjection += positional_embedding
        return LinearProjection

##Encoder

In [None]:
class Encoder(nn.Module):
    def __init__(self, latent_size=latent_size, n_heads=n_heads, dropout=dropout, device = device):
        super(Encoder, self).__init__()
        self.latent_size = latent_size
        self.n_heads = n_heads
        self.dropout = dropout
        self.device = device

        #norm layer
        self.norm = nn.LayerNorm(self.latent_size)

        #multihead atention
        self.multihead_attention = nn.MultiheadAttention(self.latent_size, self.n_heads, dropout=self.dropout)

        #add input
        #norm
        #MLP
        self.MLP = nn.Sequential(
            nn.Linear(self.latent_size, self.latent_size*4),
            nn.GELU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.latent_size*4, self.latent_size),
            nn.Dropout(self.dropout)
        )

    def forward(self, embedded_patches):
        first_norm_out = self.norm(embedded_patches)
        attention_out = self.multihead_attention(first_norm_out, first_norm_out, first_norm_out)[0]

        first_add = attention_out + embedded_patches

        second_norm_out = self.norm(first_add)
        MLP_out = self.MLP(second_norm_out)

        out = MLP_out + first_add

        #print('embed: ', embedded_patches.size())
        #print('output', out.size())

        return out


##Transformer

(put everything together)

In [None]:
class Vit(nn.Module):
    def __init__(self, n_encoders=n_encoders, latent_size=latent_size, n_classes=n_classes, dropout = dropout, device=device):
        super(Vit, self).__init__()
        self.n_encoders = n_encoders
        self.latent_size = latent_size
        self.n_classes = n_classes
        self.dropout = dropout
        self.device = device

        self.Embedding = InputEmbedding()

        self.EncoderStack = nn.ModuleList([Encoder() for i in range(self.n_encoders)])

        self = MLPHead = nn.Sequential(
            nn.LayerNorm(self.latent_size),
            nn.Linear(self.latent_size, self.latent_size),
            nn.Linear(self.latent_size, self.n_classes)
        )

    def Forward(self, input):
        encoder_out = self.Embedding(input)

        for enc in self.EncoderStack:
            encoder_out = enc(encoder_out)

        cls_token = encoder_out[:, 0]
        MLPHead_out = self.MLPHead(cls_token)

        return MLPHead_out



In [None]:
test_input = torch.randn(1,3,64,64)
test_class = InputEmbedding().to(device)
test_class(test_input)

test_encoder = Encoder().to(device)
test_encoder(test_class(test_input))

torch.Size([1, 3, 64, 64])
torch.Size([1, 16, 768])
torch.Size([1, 17, 768])
torch.Size([1, 17, 768])
torch.Size([1, 3, 64, 64])
torch.Size([1, 16, 768])
torch.Size([1, 17, 768])
torch.Size([1, 17, 768])
embed:  torch.Size([1, 17, 768])
output torch.Size([1, 17, 768])


tensor([[[-0.8716, -1.0013, -1.4098,  ...,  0.8561, -0.5967,  0.6804],
         [-0.9453,  0.2115, -1.2030,  ..., -1.0250, -1.8071,  2.2308],
         [-0.9233,  0.0763, -0.2545,  ..., -0.3028, -2.5491,  0.7847],
         ...,
         [-1.6709, -1.4999, -0.9490,  ..., -0.5014, -0.5639,  0.9106],
         [-0.6949, -0.4929, -0.3836,  ..., -1.6464, -2.2312,  0.9835],
         [-0.7364, -1.2531, -1.0976,  ..., -1.9165, -2.3608,  1.7134]]],
       device='cuda:0', grad_fn=<AddBackward0>)