In [10]:
import torch
import torchvision

from torchvision import models
import transformers
from transformers import ConvNextV2Model, AutoImageProcessor

import src.datasets.deep_fashion_ctsrbm

import os
import pathlib

import src.comps.backbones_cnn_pyramid

from tqdm import tqdm

from torchsummary import summary

---

In [89]:
# Load the ConvNeXt V2 model from the Hugging Face checkpoint
# "facebook/convnextv2-tiny-22k-224"; the next cell dumps its module tree.
model = ConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-22k-224")

In [90]:
# Write the printed module tree of `model` to a text file for offline reading.
# The parent directory is created first so the cell also works when "misc/"
# does not exist yet, and the encoding is pinned so the dump does not depend
# on the platform's default locale.
model_arch_path = pathlib.Path("misc/convnext_v2_tiny_model_arch.txt")
model_arch_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_arch_path, "w", encoding="utf-8") as out_file:
    print(model, file=out_file)

---

In [91]:
# Load the same pretrained checkpoint again; this instance is the one used for
# the forward pass on a dataset sample a few cells below.
model = ConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-22k-224")

In [92]:
# Image processor published alongside the checkpoint.
ctsrbm_image_transform = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-22k-224")


def ctsrbm_image_transform_corr(ten):
    """Run the image processor on ``ten`` and return only its ``pixel_values``.

    The processor is asked for PyTorch tensors (``return_tensors="pt"``); this
    wrapper strips the surrounding container so the dataset loader receives
    the tensor directly.

    NOTE(review): ``ten`` is whatever the dataset loader passes to its
    transform - presumably a decoded image; confirm in
    ``src.datasets.deep_fashion_ctsrbm``.
    """
    return ctsrbm_image_transform(ten, return_tensors="pt").pixel_values


# The dataset is expected under the user's home directory.
ctsrbm_dataset_dir = os.path.join(pathlib.Path.home(), "data", "DeepFashion", "Consumer-to-shop Clothes Retrieval Benchmark")
ctsrbm_dataset = src.datasets.deep_fashion_ctsrbm.ConsToShopClothRetrBmkImageLoader(ctsrbm_dataset_dir, ctsrbm_image_transform_corr)

In [93]:
# Display the processor's configuration (resize, rescale and normalisation settings).
ctsrbm_image_transform

ConvNextImageProcessor {
  "crop_pct": 0.875,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "ConvNextImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

In [94]:
# Take the first dataset sample and split it into its two components:
# position 0 is kept as the image tensor, position 1 as the item-id tensor.
dataset_item = ctsrbm_dataset[0]
img_tensor, item_id_tensor = dataset_item[0], dataset_item[1]

In [95]:
# Forward the sample through the model and keep the final feature map.
# The result is only inspected, never back-propagated through, so autograd
# tracking is disabled to avoid building and retaining a computation graph.
with torch.no_grad():
    feature_tensor = model(img_tensor).last_hidden_state

In [96]:
# Compare the input resolution with the resolution of the final feature map.
print(img_tensor.shape, feature_tensor.shape, sep="\n")

torch.Size([1, 3, 224, 224])
torch.Size([1, 768, 7, 7])


In [97]:
# Registry filled by the forward hooks: hook name -> most recent module output.
activation = {}


def get_activation(name):
    """Build a forward hook that records a module's output under ``name``.

    The returned callable takes ``(module, inputs, output)`` positionally, as
    ``register_forward_hook`` calls it, and overwrites ``activation[name]``
    with ``output`` every time it fires. It returns ``None``.
    """
    def _record_output(module, inputs, output):
        activation[name] = output

    return _record_output

# Fresh model instance so that only the hooks registered below are attached.
model = ConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-22k-224")

# Keep every handle returned by register_forward_hook: the hooks can then be
# detached later with `handle.remove()`, and the cell no longer ends on a
# bare handle expression that gets echoed as output.
hook_handles = [
    model.encoder.stages[stage_idx].register_forward_hook(
        get_activation("encoder.stages[{:d}]".format(stage_idx))
    )
    for stage_idx in range(4)  # the four encoder stages hooked individually
]

# Also capture the output of the whole encoder and of the final layer norm.
hook_handles.append(model.encoder.register_forward_hook(get_activation("encoder")))
hook_handles.append(model.layernorm.register_forward_hook(get_activation("layernorm")))

<torch.utils.hooks.RemovableHandle at 0x7f4790d7ed60>

In [98]:
# Fetch the first dataset sample again for the hooked forward pass:
# position 0 is kept as the image tensor, position 1 as the item-id tensor.
dataset_item = ctsrbm_dataset[0]
img_tensor, item_id_tensor = dataset_item[0], dataset_item[1]

In [99]:
# Run the hooked model once. Besides producing the final feature map, this
# fires the registered forward hooks, which fill `activation`.
# Autograd is disabled because the captured tensors are only inspected;
# otherwise each stored activation would keep a computation graph alive.
with torch.no_grad():
    feature_tensor = model(img_tensor).last_hidden_state

In [100]:
# Tabulate the tensor shape at the input, at every hooked module, and at the
# model output, using one shared row layout.
row_format = "{:20s} {:}"
separator = "---"

print(row_format.format("img_tensor", img_tensor.shape))
print(separator)

for key, item in activation.items():
    # The hook on the whole encoder captures an output object rather than a
    # bare tensor, so its feature map is read from `.last_hidden_state`.
    hooked_tensor = item.last_hidden_state if key == "encoder" else item
    print(row_format.format(key, hooked_tensor.shape))

print(separator)
print(row_format.format("feature_tensor", feature_tensor.shape))

img_tensor           torch.Size([1, 3, 224, 224])
---
encoder.stages[0]    torch.Size([1, 96, 56, 56])
encoder.stages[1]    torch.Size([1, 192, 28, 28])
encoder.stages[2]    torch.Size([1, 384, 14, 14])
encoder.stages[3]    torch.Size([1, 768, 7, 7])
encoder              torch.Size([1, 768, 7, 7])
layernorm            torch.Size([1, 768])
---
feature_tensor       torch.Size([1, 768, 7, 7])


---

In [101]:
# Image processor published alongside the checkpoint.
ctsrbm_image_transform = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-22k-224")


def ctsrbm_image_transform_corr(ten):
    """Run the image processor on ``ten`` and return only its ``pixel_values``.

    The processor is asked for PyTorch tensors (``return_tensors="pt"``); this
    wrapper strips the surrounding container so the dataset loader receives
    the tensor directly.

    NOTE(review): ``ten`` is whatever the dataset loader passes to its
    transform - presumably a decoded image; confirm in
    ``src.datasets.deep_fashion_ctsrbm``.
    """
    return ctsrbm_image_transform(ten, return_tensors="pt").pixel_values


# The dataset is expected under the user's home directory.
ctsrbm_dataset_dir = os.path.join(pathlib.Path.home(), "data", "DeepFashion", "Consumer-to-shop Clothes Retrieval Benchmark")
ctsrbm_dataset = src.datasets.deep_fashion_ctsrbm.ConsToShopClothRetrBmkImageLoader(ctsrbm_dataset_dir, ctsrbm_image_transform_corr)

In [102]:
# Start from an empty registry: hook name -> last output seen by that hook.
activation = {}


def get_activation(name):
    """Return a forward hook that stores the hooked module's output.

    The hook is invoked positionally as ``(module, inputs, output)`` and
    writes ``output`` to ``activation[name]``, replacing any previous entry.
    It returns ``None``.
    """
    def _store_output(module, inputs, output):
        activation[name] = output

    return _store_output

# Fresh model instance so that only the stage hooks registered below are attached.
model = ConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-22k-224")

# Keep every handle returned by register_forward_hook: the hooks can then be
# detached later with `handle.remove()`, and the cell no longer ends on a
# bare handle expression that gets echoed as output.
hook_handles = [
    model.encoder.stages[stage_idx].register_forward_hook(
        get_activation("encoder.stages[{:d}]".format(stage_idx))
    )
    for stage_idx in range(4)  # the four encoder stages probed by the sweep below
]

<torch.utils.hooks.RemovableHandle at 0x7f47f258c310>

In [103]:
# Sweep square input resolutions and record, for each one, the output shape of
# every hooked encoder stage (as captured in `activation` by the forward hooks).
size_list = list(range(32, 512))
feature_1_shape_list = []
feature_2_shape_list = []
feature_3_shape_list = []
feature_4_shape_list = []

# Only shapes are collected, so autograd is disabled for the whole sweep; this
# avoids building a computation graph on each of the forward passes.
with torch.no_grad():
    for size in tqdm(size_list):

        # Zero-filled dummy batch: the values do not matter for shape probing,
        # but unlike torch.empty they are deterministic and never contain
        # NaN/Inf garbage. The name avoids shadowing the builtin `input`.
        dummy_input = torch.zeros(1, 3, size, size)
        model(dummy_input)  # return value unused; the hooks capture what is needed

        feature_1_shape_list.append(tuple(activation["encoder.stages[0]"].shape))
        feature_2_shape_list.append(tuple(activation["encoder.stages[1]"].shape))
        feature_3_shape_list.append(tuple(activation["encoder.stages[2]"].shape))
        feature_4_shape_list.append(tuple(activation["encoder.stages[3]"].shape))

100%|██████████| 480/480 [00:35<00:00, 13.41it/s]


In [104]:
# For the first hooked stage, list each image size at which the feature map
# reaches a new, larger width (i.e. the smallest input producing that width).
print("Img size | Ftr size 1")

last_perm_size = 0
for idx, size in enumerate(size_list):
    perm_size = feature_1_shape_list[idx][-1]

    # Skip sizes that do not enlarge the feature map.
    if perm_size <= last_perm_size:
        continue

    print("     {:3d} |        {:3d}".format(size, perm_size))
    last_perm_size = perm_size

Img size | Ftr size 1
      32 |          8
      36 |          9
      40 |         10
      44 |         11
      48 |         12
      52 |         13
      56 |         14
      60 |         15
      64 |         16
      68 |         17
      72 |         18
      76 |         19
      80 |         20
      84 |         21
      88 |         22
      92 |         23
      96 |         24
     100 |         25
     104 |         26
     108 |         27
     112 |         28
     116 |         29
     120 |         30
     124 |         31
     128 |         32
     132 |         33
     136 |         34
     140 |         35
     144 |         36
     148 |         37
     152 |         38
     156 |         39
     160 |         40
     164 |         41
     168 |         42
     172 |         43
     176 |         44
     180 |         45
     184 |         46
     188 |         47
     192 |         48
     196 |         49
     200 |         50
     204 |         51
     208 |

In [105]:
# For the second hooked stage, list each image size at which the feature map
# reaches a new, larger width (i.e. the smallest input producing that width).
print("Img size | Ftr size 2")

last_perm_size = 0
for idx, size in enumerate(size_list):
    perm_size = feature_2_shape_list[idx][-1]

    # Skip sizes that do not enlarge the feature map.
    if perm_size <= last_perm_size:
        continue

    print("     {:3d} |        {:3d}".format(size, perm_size))
    last_perm_size = perm_size

Img size | Ftr size 2
      32 |          4
      40 |          5
      48 |          6
      56 |          7
      64 |          8
      72 |          9
      80 |         10
      88 |         11
      96 |         12
     104 |         13
     112 |         14
     120 |         15
     128 |         16
     136 |         17
     144 |         18
     152 |         19
     160 |         20
     168 |         21
     176 |         22
     184 |         23
     192 |         24
     200 |         25
     208 |         26
     216 |         27
     224 |         28
     232 |         29
     240 |         30
     248 |         31
     256 |         32
     264 |         33
     272 |         34
     280 |         35
     288 |         36
     296 |         37
     304 |         38
     312 |         39
     320 |         40
     328 |         41
     336 |         42
     344 |         43
     352 |         44
     360 |         45
     368 |         46
     376 |         47
     384 |

In [106]:
# For the third hooked stage, list each image size at which the feature map
# reaches a new, larger width (i.e. the smallest input producing that width).
print("Img size | Ftr size 3")

last_perm_size = 0
for idx, size in enumerate(size_list):
    perm_size = feature_3_shape_list[idx][-1]

    # Skip sizes that do not enlarge the feature map.
    if perm_size <= last_perm_size:
        continue

    print("     {:3d} |        {:3d}".format(size, perm_size))
    last_perm_size = perm_size

Img size | Ftr size 3
      32 |          2
      48 |          3
      64 |          4
      80 |          5
      96 |          6
     112 |          7
     128 |          8
     144 |          9
     160 |         10
     176 |         11
     192 |         12
     208 |         13
     224 |         14
     240 |         15
     256 |         16
     272 |         17
     288 |         18
     304 |         19
     320 |         20
     336 |         21
     352 |         22
     368 |         23
     384 |         24
     400 |         25
     416 |         26
     432 |         27
     448 |         28
     464 |         29
     480 |         30
     496 |         31


In [107]:
# For the fourth hooked stage, list each image size at which the feature map
# reaches a new, larger width (i.e. the smallest input producing that width).
print("Img size | Ftr size 4")

last_perm_size = 0
for idx, size in enumerate(size_list):
    perm_size = feature_4_shape_list[idx][-1]

    # Skip sizes that do not enlarge the feature map.
    if perm_size <= last_perm_size:
        continue

    print("     {:3d} |        {:3d}".format(size, perm_size))
    last_perm_size = perm_size

Img size | Ftr size 4
      32 |          1
      64 |          2
      96 |          3
     128 |          4
     160 |          5
     192 |          6
     224 |          7
     256 |          8
     288 |          9
     320 |         10
     352 |         11
     384 |         12
     416 |         13
     448 |         14
     480 |         15


In [108]:
def feature_1_shape_formula(size):
    """Predict the side length of the first-stage feature map for a square input.

    The measured table shows this stage shrinking the input side by a factor
    of 4, rounded down (e.g. 224 -> 56).
    """
    downsampling_factor = 4
    return size // downsampling_factor

# Cross-check the formula against every measured shape of the sweep.
# Nothing is printed when all sizes agree; each disagreement gets one line.
mismatch_template = "ERROR: size = {:4d} | perm_size = {:4d} | pred_perm_size = {:4d}"

for idx, size in enumerate(size_list):
    perm_size = feature_1_shape_list[idx][-1]
    pred_perm_size = feature_1_shape_formula(size)

    if perm_size == pred_perm_size:
        continue

    print(mismatch_template.format(size, perm_size, pred_perm_size))

In [109]:
def feature_2_shape_formula(size):
    """Predict the side length of the second-stage feature map for a square input.

    The measured table shows this stage shrinking the input side by a factor
    of 8, rounded down (e.g. 224 -> 28).
    """
    downsampling_factor = 8
    return size // downsampling_factor

# Cross-check the formula against every measured shape of the sweep.
# Nothing is printed when all sizes agree; each disagreement gets one line.
mismatch_template = "ERROR: size = {:4d} | perm_size = {:4d} | pred_perm_size = {:4d}"

for idx, size in enumerate(size_list):
    perm_size = feature_2_shape_list[idx][-1]
    pred_perm_size = feature_2_shape_formula(size)

    if perm_size == pred_perm_size:
        continue

    print(mismatch_template.format(size, perm_size, pred_perm_size))

In [110]:
def feature_3_shape_formula(size):
    """Predict the side length of the third-stage feature map for a square input.

    The measured table shows this stage shrinking the input side by a factor
    of 16, rounded down (e.g. 224 -> 14).
    """
    downsampling_factor = 16
    return size // downsampling_factor

# Cross-check the formula against every measured shape of the sweep.
# Nothing is printed when all sizes agree; each disagreement gets one line.
mismatch_template = "ERROR: size = {:4d} | perm_size = {:4d} | pred_perm_size = {:4d}"

for idx, size in enumerate(size_list):
    perm_size = feature_3_shape_list[idx][-1]
    pred_perm_size = feature_3_shape_formula(size)

    if perm_size == pred_perm_size:
        continue

    print(mismatch_template.format(size, perm_size, pred_perm_size))

In [111]:
def feature_4_shape_formula(size):
    """Predict the side length of the fourth-stage feature map for a square input.

    The measured table shows this stage shrinking the input side by a factor
    of 32, rounded down (e.g. 224 -> 7).
    """
    downsampling_factor = 32
    return size // downsampling_factor

# Cross-check the formula against every measured shape of the sweep.
# Nothing is printed when all sizes agree; each disagreement gets one line.
mismatch_template = "ERROR: size = {:4d} | perm_size = {:4d} | pred_perm_size = {:4d}"

for idx, size in enumerate(size_list):
    perm_size = feature_4_shape_list[idx][-1]
    pred_perm_size = feature_4_shape_formula(size)

    if perm_size == pred_perm_size:
        continue

    print(mismatch_template.format(size, perm_size, pred_perm_size))

---

In [11]:
# Instantiate the project's multilevel backbone; the cells below show it
# returning one feature map per level for a single input.
# NOTE(review): presumably wraps the ConvNeXt V2 Tiny checkpoint explored
# above - confirm in src.comps.backbones_cnn_pyramid.
model = src.comps.backbones_cnn_pyramid.ConvNeXtV2TinyMultilevelBackbone()

In [12]:
# Use the image transform provided by the backbone itself as the dataset's
# transform.
ctsrbm_image_transform = model.get_image_transform()

# The dataset is expected under the user's home directory.
dataset_path_parts = ("data", "DeepFashion", "Consumer-to-shop Clothes Retrieval Benchmark")
ctsrbm_dataset_dir = os.path.join(pathlib.Path.home(), *dataset_path_parts)

ctsrbm_dataset = src.datasets.deep_fashion_ctsrbm.ConsToShopClothRetrBmkImageLoader(
    ctsrbm_dataset_dir,
    ctsrbm_image_transform,
)

In [13]:
# Take the first sample produced with the backbone's own transform:
# position 0 is kept as the image tensor, position 1 as the item-id tensor.
dataset_item = ctsrbm_dataset[0]
img_tensor, item_id_tensor = dataset_item[0], dataset_item[1]

In [14]:
# Display the shape of the transformed sample that is fed to the backbone.
img_tensor.shape

torch.Size([1, 3, 224, 224])

In [15]:
# Forward the sample through the multilevel backbone; the result is indexed
# per level in the next cell.
# The outputs are only inspected, so autograd tracking is disabled to avoid
# building and retaining a computation graph.
with torch.no_grad():
    feature_tensors = model(img_tensor)

In [16]:
# Print the shape of each of the four feature levels, one per line.
print(*(feature_tensors[level].shape for level in range(4)), sep="\n")

torch.Size([1, 96, 56, 56])
torch.Size([1, 192, 28, 28])
torch.Size([1, 384, 14, 14])
torch.Size([1, 768, 7, 7])


In [18]:
# Print torchsummary's per-layer table (output shape and parameter count) for
# a single 3x224x224 input, evaluated on the CPU.
summary(model, input_size=(3, 224, 224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 96, 56, 56]           4,704
ConvNextV2LayerNorm-2           [-1, 96, 56, 56]             192
ConvNextV2Embeddings-3           [-1, 96, 56, 56]               0
          Identity-4           [-1, 96, 56, 56]               0
            Conv2d-5           [-1, 96, 56, 56]           4,800
ConvNextV2LayerNorm-6           [-1, 56, 56, 96]             192
            Linear-7          [-1, 56, 56, 384]          37,248
    GELUActivation-8          [-1, 56, 56, 384]               0
     ConvNextV2GRN-9          [-1, 56, 56, 384]             768
           Linear-10           [-1, 56, 56, 96]          36,960
         Identity-11           [-1, 96, 56, 56]               0
  ConvNextV2Layer-12           [-1, 96, 56, 56]               0
           Conv2d-13           [-1, 96, 56, 56]           4,800
ConvNextV2LayerNorm-14           [-