In [1]:
import os
# Absolute path handed to os.chdir below.
# NOTE(review): machine-specific absolute path; this cell will fail on any
# machine where the directory does not exist.
dst_path = '/p/openvocabdustr/probing_midlevel_vision/code/probing-mid-level-vision'
# Switch the working directory to dst_path before the project-local import.
# NOTE(review): the `models.util` import below presumably resolves relative to
# the working directory, which is why the chdir comes first — confirm.
os.chdir(dst_path)

import torch

from models.util import load_checkpoint, initialize_backbone, prepare_state_dict

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Per-architecture checkpoint table. load_model looks an entry up by arch name
# and passes it to load_checkpoint as keyword arguments (url=..., filename=...).
checkpoints = dict(
    vitb16=dict(
        url="https://dl.fbaipublicfiles.com/moco-v3/vit-b-300ep/vit-b-300ep.pth.tar",
        filename="mocov3_vitb16.pth.tar",
    ),
    resnet50=dict(
        url="https://dl.fbaipublicfiles.com/moco-v3/r-50-1000ep/r-50-1000ep.pth.tar",
        filename="mocov3_resnet50.pth.tar",
    ),
)

In [3]:
def load_model(arch: str, **kwargs):
    """Build the backbone for ``arch`` and load its checkpoint weights into it.

    Args:
        arch: Key into the module-level ``checkpoints`` table.
        **kwargs: Forwarded unchanged to ``initialize_backbone``.

    Returns:
        The object returned by ``initialize_backbone``, after
        ``load_state_dict`` has been called on it with the processed weights.

    Raises:
        AssertionError: If ``arch`` is not a key of ``checkpoints``.
    """
    assert arch in checkpoints, f"Invalid arch: {arch}"

    # The backbone is created first, then the checkpoint is fetched.
    backbone = initialize_backbone(arch, **kwargs)
    weights = load_checkpoint(**checkpoints[arch])["state_dict"]

    # Two prepare_state_dict passes, applied in this order.
    # NOTE(review): presumably these strip the encoder prefixes and drop the
    # predictor entries; the exact semantics live in models.util — confirm.
    passes = (
        {"remove_prefix": "module.base_encoder.", "delete_prefixes": ["module.predictor."]},
        {"remove_prefix": "module.momentum_encoder."},
    )
    for options in passes:
        weights = prepare_state_dict(weights, **options)

    backbone.load_state_dict(weights)
    return backbone

In [7]:
# Smoke test: build a backbone via load_model and push a random batch through it.
for arch in checkpoints.keys():
    # Only the "vitb16" entry is exercised; every other entry is skipped.
    if arch == "vitb16":
        model = load_model(arch)
        print(model)
        # Only the output shape is inspected, so run the forward pass without
        # recording an autograd graph.
        with torch.no_grad():
            out = model(torch.randn(2, 3, 224, 224))
        print(out.shape)

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [1]:
# Example of instantiating the feature extractor and extracting features
import os
import torch
# NOTE(review): machine-specific absolute path; this cell will fail on any
# machine where the directory does not exist.
dst_path = '/p/openvocabdustr/probing_midlevel_vision/code/probe3d'
# Change to the destination directory before the project-local import below.
# NOTE(review): the `evals.models.mocov3` import presumably resolves relative
# to the working directory — confirm.
os.chdir(dst_path)

from evals.models.mocov3 import MoCoV3

feature_extractor = MoCoV3(arch="vitb16", return_multilayer=True)

# Testing with random input
images = torch.randn(1, 3, 224, 224)
# Only the feature shapes are printed, so run the forward pass without
# recording an autograd graph.
with torch.no_grad():
    features = feature_extractor(images)

# The call above returns an iterable of tensors; print one shape per entry.
for i, feature in enumerate(features):
    print(f"Feature from layer {i}: shape {feature.shape}")

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


Feature from layer 0: shape torch.Size([1, 768, 14, 14])
Feature from layer 1: shape torch.Size([1, 768, 14, 14])
Feature from layer 2: shape torch.Size([1, 768, 14, 14])
Feature from layer 3: shape torch.Size([1, 768, 14, 14])


In [3]:
# Number of spatial positions in one of the 14 x 14 feature maps printed above.
14 * 14

196