In [1]:
import argparse
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from lavis.common.utils import now

# imports modules for registration
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_args(argv=None):
    """Parse the command-line style options used to configure this notebook.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            the notebook's built-in argument list is used so that calling
            ``parse_args()`` with no arguments behaves as before.

    Returns:
        argparse.Namespace with ``cfg_path``, ``options`` and the four
        ``{img,text}_submodule_FF_{weight,activation}_bits`` attributes.
        The bit-width attributes are ``int`` when supplied and ``None``
        otherwise.
    """
    parser = argparse.ArgumentParser(description="Training")

    parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )

    # Register the bit-width options uniformly. Every one of them is parsed
    # as an int so the image and text settings have the same type (the text
    # options previously came back as strings).
    for submodule in ("img", "text"):
        for quantity in ("weight_bits", "activation_bits"):
            parser.add_argument(
                f"--{submodule}-submodule-FF-{quantity}",
                required=False,
                default=None,
                type=int,
            )

    if argv is None:
        # Notebook default: there is no real command line inside a kernel,
        # so fall back to the argument string this notebook was written with.
        argv = '--cfg-path /nfshomes/vla/scratch/LAVIS/ret_flickr_eval.yaml --img-submodule-FF-weight_bits 8 --img-submodule-FF-activation_bits 32'.split()

    return parser.parse_args(argv)

In [3]:
# Build the LAVIS Config from the parsed arguments; the bare `cfg` on the
# last line displays its repr as the cell output.
cfg = Config(parse_args())
cfg

<lavis.common.config.Config at 0x7fc70a4097e0>

In [4]:
# Sanity check: the image weight bit-width parsed from the arguments.
cfg.args.img_submodule_FF_weight_bits

8

In [5]:
# Dump the loaded configuration via Config.pretty_print() for inspection.
cfg.pretty_print()

In [6]:
# Create the task object described by the config and display it
# (the recorded output for this run is a RetrievalTask).
task = tasks.setup_task(cfg)
task

<lavis.tasks.retrieval.RetrievalTask at 0x7fc70a2482b0>

In [42]:
# Build the model through the task object, using the same config.
model = task.build_model(cfg)



Position interpolate from 16x16 to 26x26


In [43]:
# Display the module tree of the freshly built model.
model

Blip2Qformer(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )


In [44]:
# Input width of the vision projection layer (768 in the recorded output).
model.vision_proj.in_features

768

In [45]:
# Bit widths intended for the replacement projection layer.
# NOTE(review): these are the same values passed as the
# --img-submodule-FF-* arguments in parse_args; consider reading them from
# cfg.args so the two cannot drift apart — confirm intent.
weight_bits = 8
activation_bits = 32

In [46]:
# NOTE(review): wildcard import kept for compatibility with any other cells
# that may rely on names from this module; only NBitLinearDynamic is used here.
from layers.nbitlineardynamic import *

# Build an n-bit replacement for the model's vision projection layer with
# the same input/output widths, then copy the trained parameters into it.
src_proj = model.vision_proj
# Only request (and copy) a bias when the source layer actually has one.
has_bias = src_proj.bias is not None

Q_layer = NBitLinearDynamic(
    src_proj.in_features,
    src_proj.out_features,
    bias=has_bias,
    # Use the bit widths defined in the configuration cell instead of
    # repeating the literals here.
    weight_bits=weight_bits,
    activation_bits=activation_bits,
)

# Parameter copies are done without autograd tracking.
with torch.no_grad():
    Q_layer.weight.copy_(src_proj.weight)
    if has_bias:
        Q_layer.bias.copy_(src_proj.bias)

Q_layer

NBitLinearDynamic(in_features=768, out_features=256, bias=True)

In [47]:
# Replace the model's vision projection with the n-bit layer (mutates `model`).
model.vision_proj = Q_layer

In [48]:
# Confirm the attribute now refers to the NBitLinearDynamic layer.
model.vision_proj

NBitLinearDynamic(in_features=768, out_features=256, bias=True)

In [49]:
# Re-display the module tree after the vision projection was replaced.
model

Blip2Qformer(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )


In [30]:
# Materialize every (sub)module of the model into a list for inspection.
list(model.modules())

[Blip2Qformer(
   (visual_encoder): VisionTransformer(
     (patch_embed): PatchEmbed(
       (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
     )
     (pos_drop): Dropout(p=0.0, inplace=False)
     (blocks): ModuleList(
       (0-38): 39 x Block(
         (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
         (attn): Attention(
           (qkv): Linear(in_features=1408, out_features=4224, bias=False)
           (attn_drop): Dropout(p=0.0, inplace=False)
           (proj): Linear(in_features=1408, out_features=1408, bias=True)
           (proj_drop): Dropout(p=0.0, inplace=False)
         )
         (drop_path): Identity()
         (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
         (mlp): Mlp(
           (fc1): Linear(in_features=1408, out_features=6144, bias=True)
           (act): GELU(approximate='none')
           (fc2): Linear(in_features=6144, out_features=1408, bias=True)
           (drop): Dropout(p=0.0, inplace=False)
  

In [36]:
# Inspect only the visual encoder sub-module of the model.
model.visual_encoder

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-38): 39 x Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1408, out_features=4224, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1408, out_features=1408, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1408, out_features=6144, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
)