In [1]:
import socket
def find_free_port():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("",0))
        s.listen(1)
        port = s.getsockname()[1]
        return port

print(find_free_port())

54533


In [1]:
import torch
import torchvision
print(f"torch version: {torch.__version__}")
print(f"torchvision version: {torchvision.__version__}")

torch version: 2.2.0
torchvision version: 0.17.0


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

In [5]:
train_dir = "/net/polaris/storage/deeplearning/sur_data/binary_rgb_daa/split_0/train"
val_dir = "/net/polaris/storage/deeplearning/sur_data/binary_rgb_daa/split_0/val"
test_dir = "/net/polaris/storage/deeplearning/sur_data/binary_rgb_daa/split_0/test"

In [6]:
import torch

dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')


Using cache found in /home/sur06423/.cache/torch/hub/facebookresearch_dinov2_main


In [7]:
# Print a summary using torchinfo (uncomment for actual output)
summary(model=dinov2_vitb14,
        input_size=(1024,3,224,224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]      
)

Layer (type (var_name))                            Input Shape          Output Shape         Param #              Trainable
DinoVisionTransformer (DinoVisionTransformer)      [1024, 3, 224, 224]  [1024, 768]          1,053,696            True
├─PatchEmbed (patch_embed)                         [1024, 3, 224, 224]  [1024, 256, 768]     --                   True
│    └─Conv2d (proj)                               [1024, 3, 224, 224]  [1024, 768, 16, 16]  452,352              True
│    └─Identity (norm)                             [1024, 256, 768]     [1024, 256, 768]     --                   --
├─ModuleList (blocks)                              --                   --                   --                   True
│    └─NestedTensorBlock (0)                       [1024, 257, 768]     [1024, 257, 768]     --                   True
│    │    └─LayerNorm (norm1)                      [1024, 257, 768]     [1024, 257, 768]     1,536                True
│    │    └─MemEffAttention (attn)           

In [8]:
import torch.nn as nn

class LinearClassifier(nn.Module):
    """Linear layer to train on top of frozen features"""

    def __init__(self, backbone, num_features=768, num_classes=2):
        super().__init__()
        self.backbone = backbone
        self.num_classes = num_classes
        self.linear = nn.Linear(num_features, num_classes)
        self.linear.weight.data.normal_(mean=0.0, std=0.01)
        self.linear.bias.data.zero_()

    def forward(self, x):
        features = self.backbone(x)
        output = self.linear(features)
        return output

In [9]:
model = LinearClassifier(backbone=dinov2_vitb14)
for param in model.backbone.parameters():
    param.requires_grad = False

In [10]:
summary(model=model,
        input_size=(1024,3,224,224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]      
)

Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
LinearClassifier (LinearClassifier)           [1024, 3, 224, 224]  [1024, 2]            --                   Partial
├─DinoVisionTransformer (backbone)            [1024, 3, 224, 224]  [1024, 768]          1,053,696            False
│    └─PatchEmbed (patch_embed)               [1024, 3, 224, 224]  [1024, 256, 768]     --                   False
│    │    └─Conv2d (proj)                     [1024, 3, 224, 224]  [1024, 768, 16, 16]  (452,352)            False
│    │    └─Identity (norm)                   [1024, 256, 768]     [1024, 256, 768]     --                   --
│    └─ModuleList (blocks)                    --                   --                   --                   False
│    │    └─NestedTensorBlock (0)             [1024, 257, 768]     [1024, 257, 768]     (7,089,408)          False
│    │    └─NestedTensorBlock (1)             [1024, 257, 768]     [1024, 257

In [11]:
for parameters in model.backbone.parameters():
    print(parameters)
    break

Parameter containing:
tensor([[[-6.9142e-04, -2.0539e-04, -4.6968e-02, -1.4960e-03, -2.0410e-02,
           4.7294e-03,  2.6455e-03, -4.9413e-03, -4.9912e-03, -1.1096e-03,
           2.4690e-03,  7.5610e-03,  7.1985e-03, -3.0513e-04, -3.3493e-03,
          -3.6020e-04,  1.2004e-02,  1.3525e-03, -1.4590e-02, -8.5887e-03,
          -7.0904e-04,  2.0706e-04, -1.8931e-03, -1.6581e-03, -1.7453e-03,
           1.5737e-03, -3.3249e-03, -3.5086e-04,  4.2582e-03, -6.0191e-03,
          -1.9831e-03,  3.4883e-04, -5.7279e-03,  3.4933e-02,  4.3051e-03,
           2.9389e-03, -9.7192e-04,  8.2199e-03,  3.6512e-03, -1.9048e-03,
           2.6950e-03, -4.7313e-02, -1.9887e-02, -7.3382e-04,  7.6832e-03,
          -2.8708e-02, -1.3646e-03, -1.8689e-04, -1.7721e-03, -4.0460e-03,
          -3.8712e-04,  8.3599e-03,  1.1057e-03, -2.7552e-03,  1.3896e-02,
          -2.5532e-03,  1.4683e-02, -3.9614e-03,  9.5134e-04,  3.6621e-03,
           3.0747e-04, -2.3839e-03, -1.7077e-03,  2.7554e-03,  2.9600e-03,
   