In [2]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

In [3]:
import argparse
import random

import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from lavis.common.utils import now

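# The wildcard imports below are intentional: importing these packages runs
# their registration code (dataset builders, models, processors, tasks).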
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import *
from layers.nbitlineardynamic import NBitLinearDynamic

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from args_parser import parse_args

# Parse the arguments into a namespace first, then expose its attributes as a
# plain dict; this dict is what gets handed to quantize() further down.
parsed_namespace = parse_args()
args = vars(parsed_namespace)
args

{'cfg_path': '/nfshomes/vla/scratch/LAVIS/ret_flickr_eval.yaml',
 'options': None,
 'visual_encoder_block_modules': ['qkv', 'proj', 'fc1', 'fc2'],
 'visual_encoder_block_indices': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38],
 'visual_encoder_block_weight_bits': 8,
 'qformer_layer_indices': None,
 'qformer_self_attention_modules': ['query', 'key', 'value', 'dense'],
 'qformer_self_attention_weight_bits': 8,
 'qformer_cross_attention_modules': ['query', 'key', 'value', 'dense'],
 'qformer_cross_attention_weight_bits': 8,
 'qformer_text_ff_intermediate': None,
 'qformer_text_ff_output': None,
 'qformer_text_ff_weight_bits': None,
 'qformer_img_ff_intermediate': None,
 'qformer_img_ff_output': None,
 'qformer_img_ff_weight_bits': None,
 'qformer_output_modules': None,
 'qformer_vision_proj_weight_bits': None,
 'qf

In [5]:
# Build the LAVIS config from a fresh parse of the arguments. Config receives
# the namespace object itself, not the `args` dict.
fresh_namespace = parse_args()
cfg = Config(fresh_namespace)
cfg

<lavis.common.config.Config at 0x7f3d3c834bb0>

In [6]:
# Instantiate the task selected by the config; for the configuration used here
# the cell output shows it resolves to a RetrievalTask.
task = tasks.setup_task(cfg)
# Bare expression so the notebook displays the task object.
task

<lavis.tasks.retrieval.RetrievalTask at 0x7f3d60fa7970>

In [15]:
# Let the task construct the model described by `cfg`; the output below shows
# a Blip2Qformer whose visual encoder still uses plain nn.Linear layers.
model = task.build_model(cfg)
# Bare expression so the notebook prints the module tree.
model

Position interpolate from 16x16 to 26x26


Blip2Qformer(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )


In [16]:
# Display the model again. No cell between the build above and this one touches
# `model`, so this repeats the previous output (structure before quantization).
model

Blip2Qformer(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )


In [17]:
# Import only the entry point instead of `from quantize import *`: a wildcard
# import can silently rebind notebook globals (e.g. `model`, `args`, `cfg`)
# right before they are used, and hides where names come from.
from quantize import quantize

# Swap the layers selected in `args` for quantized replacements. The printed
# trace and the following cell show Linear modules inside `model` being replaced
# by NBitLinearDynamic, i.e. `model` is modified in place.
# NOTE(review): presumably not idempotent - rebuild `model` before re-running
# this cell; confirm against quantize.py.
quantize(model, args)

parent:  Attention(
  (qkv): Linear(in_features=1408, out_features=4224, bias=False)
  (attn_drop): Dropout(p=0.0, inplace=False)
  (proj): Linear(in_features=1408, out_features=1408, bias=True)
  (proj_drop): Dropout(p=0.0, inplace=False)
)
child:  qkv
weight_bits:  8
parent:  Attention(
  (qkv): NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
  (attn_drop): Dropout(p=0.0, inplace=False)
  (proj): Linear(in_features=1408, out_features=1408, bias=True)
  (proj_drop): Dropout(p=0.0, inplace=False)
)
child:  proj
weight_bits:  8
parent:  Mlp(
  (fc1): Linear(in_features=1408, out_features=6144, bias=True)
  (act): GELU(approximate='none')
  (fc2): Linear(in_features=6144, out_features=1408, bias=True)
  (drop): Dropout(p=0.0, inplace=False)
)
child:  fc1
weight_bits:  8
parent:  Mlp(
  (fc1): NBitLinearDynamic(in_features=1408, out_features=6144, bias=True)
  (act): GELU(approximate='none')
  (fc2): Linear(in_features=6144, out_features=1408, bias=True)
  (drop): Dropo

In [19]:
# Inspect the vision-transformer blocks after quantization: in the output the
# qkv, proj, fc1 and fc2 layers appear as NBitLinearDynamic.
model.visual_encoder.blocks

ModuleList(
  (0-38): 39 x Block(
    (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): NBitLinearDynamic(in_features=1408, out_features=1408, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (drop_path): Identity()
    (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): NBitLinearDynamic(in_features=1408, out_features=6144, bias=True)
      (act): GELU(approximate='none')
      (fc2): NBitLinearDynamic(in_features=6144, out_features=1408, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
)

In [21]:
# Print the names of the Q-Former's direct submodules, one per line.
qformer_children = dict(model.Qformer.named_children())
for child_name in qformer_children:
    print(child_name)

bert
cls


In [27]:
# Show the module tree of the Q-Former's `bert` submodule; the output lists the
# attention Linear layers (query/key/value/dense) by name.
model.Qformer.bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30523, 768)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (crossattention): BertAttention(
         