In [1]:
import argparse
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from lavis.common.utils import now

# imports modules for registration
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import *

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Minimal command-line parser for the notebook: one required config path and
# an optional list of key=value overrides.
parser = argparse.ArgumentParser(description="Training")
parser.add_argument("--cfg-path", required=True, help="path to configuration file.")

options_help = (
    "override some settings in the used config, the key-value pair "
    "in xxx=yyy format will be merged into config file (deprecate), "
    "change to --cfg-options instead."
)
parser.add_argument("--options", nargs="+", help=options_help)

# Parse a fixed argv instead of sys.argv, since there is no real command line
# inside a notebook kernel.
args = parser.parse_args(["--cfg-path", "ret_flickr_eval.yaml"])
args

Namespace(cfg_path='ret_flickr_eval.yaml', options=None)

In [3]:
# Build the run configuration from the parsed arguments, set up the task from
# it, and let the task construct the model.
# NOTE(review): presumably Config reads the YAML named by args.cfg_path —
# confirm in lavis.common.config.
cfg = Config(args)
task = tasks.setup_task(cfg)
model = task.build_model(cfg)
# The full model repr is deliberately not displayed here; individual
# sub-modules are inspected in the following cells.

Position interpolate from 16x16 to 26x26


In [4]:
# Display the module tree of the model's vision encoder (repr appears as the cell output).
model.visual_encoder

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-38): 39 x Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1408, out_features=4224, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1408, out_features=1408, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1408, out_features=6144, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
)

In [5]:
# Display the module tree of the model's Q-Former (repr appears as the cell output).
model.Qformer

BertLMHeadModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30523, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
     

In [6]:
def _trainable_named_params(module):
    """Return the ``(name, parameter)`` pairs of ``module`` that require gradients.

    Args:
        module: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        A list of ``(name, parameter)`` tuples whose parameter has
        ``requires_grad`` set. A list is returned (rather than a one-shot
        ``filter`` iterator) so the result can be iterated more than once.
    """
    return [(name, p) for name, p in module.named_parameters() if p.requires_grad]


# Trainable parameters of the vision encoder and their total element count.
# ``numel()`` already returns the element count of a tensor, so a plain integer
# sum is enough; no NumPy reduction is needed.
vit_model_parameters = _trainable_named_params(model.visual_encoder)
vit_params = sum(p.numel() for _, p in vit_model_parameters)

# Same tally for the Q-Former.
qformer_model_parameters = _trainable_named_params(model.Qformer)
qformer_params = sum(p.numel() for _, p in qformer_model_parameters)

In [7]:
# Combined parameter count of the Q-Former and the vision encoder tallies.
qformer_params + vit_params

1172768699

In [23]:
# Element counts of the vision-encoder parameters, grouped by layer kind.
vit_params_dict = {
    "attn_head_linear_params": 0,
    "mlp_linear_params": 0,
    "layernorm": 0,
    "patch_embed": 0,
}

# Ordered (name fragments, bucket) rules: a parameter goes into the bucket of
# the first rule that has a fragment occurring in its name. "attn.q" is also a
# substring of "attn.qkv", so that name needs no fragment of its own.
vit_bucket_rules = (
    (("attn.q", "attn.v", "attn.k", "attn.proj"), "attn_head_linear_params"),
    (("mlp.fc1", "mlp.fc2"), "mlp_linear_params"),
    (("norm1", "norm2"), "layernorm"),
    (("patch_embed.proj",), "patch_embed"),
)

# Parameters whose name matches no rule are left out of every bucket.
for name, tensor in model.visual_encoder.named_parameters():
    for fragments, bucket in vit_bucket_rules:
        if any(fragment in name for fragment in fragments):
            vit_params_dict[bucket] += tensor.numel()
            break

# NOTE(review): `vit_params` (the denominator) is computed in an earlier cell
# from parameters with requires_grad=True only, whereas the loop above visits
# every parameter. The percentages are only meaningful while no vision-encoder
# parameter is frozen.
for param_name, params in vit_params_dict.items():
    print(f"{param_name} params: {params} -> {100*params/vit_params:.3f}%")

quantizable_vit_params = sum(vit_params_dict.values())

print()
print(f"{quantizable_vit_params} quantizable params in ViT -> {100*quantizable_vit_params/vit_params:.3f}%")

attn_head_linear_params params: 309429120 -> 31.367%
mlp_linear_params params: 675053184 -> 68.430%
layernorm params: 219648 -> 0.022%
patch_embed params: 829312 -> 0.084%

985531264 quantizable params in ViT -> 99.903%


In [22]:
# Element counts of the Q-Former parameters, grouped by layer kind. The key
# order below is also the order in which the buckets are printed.
qformer_params_dict = dict.fromkeys(
    (
        "qformer_selfattn_linears",
        "qformer_crossattention_linears",
        "qformer_image_ff_linears",
        "qformer_text_ff_linears",
        "output_dense",
        "output_query_dense",
        "layernorm",
        "word_embeddings",
        "position_embeddings",
        "MLMHead_transform_dense",
    ),
    0,
)

# Ordered (name fragments, bucket) rules; the first rule with a fragment that
# occurs in the parameter name wins. Order matters: the "crossattention.self.*"
# fragments contain the "attention.self.*" ones, so they must be tried first.
# "output.dense" is a plain substring test, so it also catches names such as
# "attention.output.dense" that no earlier rule has claimed.
qformer_bucket_rules = (
    (
        ("crossattention.self.query", "crossattention.self.key", "crossattention.self.value"),
        "qformer_crossattention_linears",
    ),
    (
        ("attention.self.query", "attention.self.key", "attention.self.value"),
        "qformer_selfattn_linears",
    ),
    (("intermediate_query.dense",), "qformer_image_ff_linears"),
    (("intermediate.dense",), "qformer_text_ff_linears"),
    (("output.dense",), "output_dense"),
    (("output_query.dense",), "output_query_dense"),
    (("LayerNorm",), "layernorm"),
    (("word_embeddings",), "word_embeddings"),
    (("position_embeddings",), "position_embeddings"),
    (("transform.dense",), "MLMHead_transform_dense"),
)

# Parameters whose name matches no rule are left out of every bucket.
for name, tensor in model.Qformer.named_parameters():
    for fragments, bucket in qformer_bucket_rules:
        if any(fragment in name for fragment in fragments):
            qformer_params_dict[bucket] += tensor.numel()
            break

# Report each bucket as an absolute count and as a share of `qformer_params`.
for param_name, params in qformer_params_dict.items():
    print(f"{param_name} params: {params} -> {100*params/qformer_params:.3f}%")

quantizable_qformer_params = sum(qformer_params_dict.values())

print()
print(f"{quantizable_qformer_params} quantizable params in Qformer -> {100*quantizable_qformer_params/qformer_params:.3f}%")

qformer_selfattn_linears params: 21261312 -> 11.413%
qformer_crossattention_linears params: 16528896 -> 8.873%
qformer_image_ff_linears params: 28348416 -> 15.218%
qformer_text_ff_linears params: 28348416 -> 15.218%
output_dense params: 38951424 -> 20.910%
output_query_dense params: 28320768 -> 15.203%
layernorm params: 67584 -> 0.036%
word_embeddings params: 23441664 -> 12.584%
position_embeddings params: 393216 -> 0.211%
MLMHead_transform_dense params: 590592 -> 0.317%

186252288 quantizable params in Qformer -> 99.984%
