In [78]:
import argparse
import random

import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
    LinearWarmupCosineLRScheduler,
    LinearWarmupStepLRScheduler,
)
from lavis.common.utils import now

# imports modules for registration
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners.runner_base import RunnerBase
from lavis.tasks import *
from layers.nbitlineardynamic import NBitLinearDynamic

In [165]:
def parse_args():
    parser = argparse.ArgumentParser(description="Training")

    parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )
    
    # parser.add_argument('--img-submodule-FF-weight_bits', required = False, default = None, type = int)
    # parser.add_argument('--img-submodule-FF-activation_bits', required = False, default = None, type = int)
    
    # parser.add_argument('--text-submodule-FF-weight_bits', required = False, default = None)
    # parser.add_argument('--text-submodule-FF-activation_bits', required = False, default = None)
    
    parser.add_argument('--visual-encoder-blocks', 
                        required=False,
                        nargs="*",
                        choices= ['qkv', 'proj', 'fc1', 'fc2'],
                        default=None,                         
                        help='modules of visual-encoder blocks to quantize')
    
    parser.add_argument('--visual-encoder-block-indices',
                         required=False,
                         nargs='*',
                         type=int,
                         choices= [i for i in range(39)],   # NOTE: hard-coded number of possible blocks for ViT
                         default=None,      
                         help = 'indices of visual-encoder blocks to quantize'
                         )                         


    CLI_INPUT = f'''
                --cfg-path /nfshomes/vla/scratch/LAVIS/ret_flickr_eval.yaml \
                --visual-encoder-blocks qkv fc1
                --visual-encoder-block-indices {' '.join([str(i) for i in range(39)])}
                
                '''
                
    
    args = parser.parse_args(CLI_INPUT.split())
    # if 'LOCAL_RANK' not in os.environ:
    #     os.environ['LOCAL_RANK'] = str(args.local_rank)
    
    args_dict = vars(args)
    
    if bool(args_dict['visual_encoder_blocks']) ^ bool(args_dict['visual_encoder_block_indices']):
        parser.error('--visual-encoder-blocks and --visual-encoder-block-indices must be given together')
    

    return args

args = vars(parse_args())
args

{'cfg_path': '/nfshomes/vla/scratch/LAVIS/ret_flickr_eval.yaml',
 'options': None,
 'visual_encoder_blocks': ['qkv', 'fc1'],
 'visual_encoder_block_indices': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38]}

In [166]:
cfg = Config(parse_args())
cfg

<lavis.common.config.Config at 0x7fc7fa6f18a0>

In [167]:
task = tasks.setup_task(cfg)
task

<lavis.tasks.retrieval.RetrievalTask at 0x7fc7fa853520>

In [179]:
model = task.build_model(cfg)



Position interpolate from 16x16 to 26x26


In [180]:
# model_parts = {name:m.__class__.__name__ for name,m in model.named_children()}
# print(model_parts)

'''
Takes in nn.Linear and returns equivalent NBitLinearDynamic replacement
'''
def quantize_layer(module:nn.Linear, weight_bits = 32, activation_bits=32):
    
    with torch.no_grad():
        
        bias = True if module.bias != None else False
        
        Q_layer = NBitLinearDynamic(module.in_features, 
                    module.out_features, 
                    bias=bias,
                    weight_bits = weight_bits,
                    activation_bits = activation_bits)

        # copy over weights
        Q_layer.weight.copy_(module.weight)
        if bias:
            Q_layer.bias.copy_(module.bias)

    return Q_layer


def quantize_visual_encoder_block(module_parent):
    for name, module in module_parent.named_children():
        if name in args['visual_encoder_blocks']:
            print('parent: ', module_parent)
            print('child: ', name)
            print(quantize_layer(module))
            setattr(module_parent, name, quantize_layer(module))
        else:
            quantize_visual_encoder_block(module)
            

def quantize_visual_encoder_blocks(blocks):
    for name, module in blocks.named_children():
        # print(name)
        if int(name) in args['visual_encoder_block_indices']:
            # print('here')
            quantize_visual_encoder_block(module)
         



# def apply_quant_to_selected_modules(model: nn.Module, target_modules: List[str], bits: int = 4, apply=None):
    
#     for name, module in model.named_children():
#         if (apply is None):
#             if name in target_modules:
#                 print(f"Applying GPTQ to {name} module")
#                 apply_quant_to_selected-modules(module, target_modules, bits, True)
#             else:
#                 apply_quant_to_selected-modules(module, target_modules, bits, False)
#         else:
#             if isinstance(module, nn.Linear):
#                 print(f"Found a layer to quantize {name}")
#                 gptq_quantize_layer(module, bits)
#             elif isinstance(module, nn.Module):
#                 apply_quant_to_selected-modules(module, target_modules, bits, apply)
#     return model

# quantized_model = apply_quant_to_selected-modules(model, target_modules, bits=8)

In [181]:
quantize_visual_encoder_blocks(model.visual_encoder.blocks)

parent:  Linear(in_features=1408, out_features=4224, bias=False)
child:  qkv
NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
parent:  Linear(in_features=1408, out_features=6144, bias=True)
child:  fc1
NBitLinearDynamic(in_features=1408, out_features=6144, bias=True)
parent:  Linear(in_features=1408, out_features=4224, bias=False)
child:  qkv
NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
parent:  Linear(in_features=1408, out_features=6144, bias=True)
child:  fc1
NBitLinearDynamic(in_features=1408, out_features=6144, bias=True)
parent:  Linear(in_features=1408, out_features=4224, bias=False)
child:  qkv
NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
parent:  Linear(in_features=1408, out_features=6144, bias=True)
child:  fc1
NBitLinearDynamic(in_features=1408, out_features=6144, bias=True)
parent:  Linear(in_features=1408, out_features=4224, bias=False)
child:  qkv
NBitLinearDynamic(in_features=1408, out_features=4224, bias=False

In [182]:
model.visual_encoder.blocks

ModuleList(
  (0-38): 39 x Block(
    (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(
        in_features=1408, out_features=4224, bias=False
        (qkv): NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
      )
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=1408, out_features=1408, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (drop_path): Identity()
    (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): Linear(
        in_features=1408, out_features=6144, bias=True
        (fc1): NBitLinearDynamic(in_features=1408, out_features=6144, bias=True)
      )
      (act): GELU(approximate='none')
      (fc2): Linear(in_features=6144, out_features=1408, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
)

In [123]:
args['visual_encoder_blocks']

['qkv', 'proj', 'fc1', 'fc2']

In [141]:
module_cur = model.visual_encoder.blocks

for name, module in module_cur.named_children():
    if int(name) in args['visual_encoder_blocks']:
        print(name)
        print(module)
        print(quantize_layer(module))
        setattr(module_cur, name, quantize_layer(module))
    print(name)
         

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38


In [142]:
module_cur

ModuleList(
  (0): Block(
    (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): NBitLinearDynamic(in_features=1408, out_features=4224, bias=False)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): NBitLinearDynamic(in_features=1408, out_features=1408, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (drop_path): Identity()
    (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): Linear(in_features=1408, out_features=6144, bias=True)
      (act): GELU(approximate='none')
      (fc2): Linear(in_features=6144, out_features=1408, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
  (1-38): 38 x Block(
    (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(in_features=1408, out_features=4224, bias=False)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=1408, out_featu

In [95]:
dir(model.visual_encoder.blocks[0].attn)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_compiled_call_impl',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_se

In [81]:
for x in model.visual_encoder.named_children():
    print(x)

('patch_embed', PatchEmbed(
  (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
))
('pos_drop', Dropout(p=0.0, inplace=False))
('blocks', ModuleList(
  (0-38): 39 x Block(
    (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(in_features=1408, out_features=4224, bias=False)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=1408, out_features=1408, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (drop_path): Identity()
    (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): Linear(in_features=1408, out_features=6144, bias=True)
      (act): GELU(approximate='none')
      (fc2): Linear(in_features=6144, out_features=1408, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
))


In [31]:
for name, module in model.visual_encoder.named_children():
    print(name, module)

patch_embed PatchEmbed(
  (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
)
pos_drop Dropout(p=0.0, inplace=False)
blocks ModuleList(
  (0-38): 39 x Block(
    (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (attn): Attention(
      (qkv): Linear(in_features=1408, out_features=4224, bias=False)
      (attn_drop): Dropout(p=0.0, inplace=False)
      (proj): Linear(in_features=1408, out_features=1408, bias=True)
      (proj_drop): Dropout(p=0.0, inplace=False)
    )
    (drop_path): Identity()
    (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
    (mlp): Mlp(
      (fc1): Linear(in_features=1408, out_features=6144, bias=True)
      (act): GELU(approximate='none')
      (fc2): Linear(in_features=6144, out_features=1408, bias=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
  )
)


In [26]:
for param in model.visual_encoder.named_parameters():
    print(param)

('cls_token', Parameter containing:
tensor([[[ 0.3700,  0.1081, -0.0608,  ...,  0.1736, -0.0758, -0.1867]]],
       requires_grad=True))
('pos_embed', Parameter containing:
tensor([[[ 0.3617,  0.0957, -0.0570,  ...,  0.1747, -0.0557, -0.1614],
         [ 0.3302,  0.9622,  0.2345,  ...,  0.2431,  0.0179, -0.9291],
         [ 0.2311,  1.4527,  0.1351,  ...,  0.1499,  0.0300, -0.7355],
         ...,
         [-0.4153,  0.0396,  0.4716,  ..., -0.1404,  0.1401,  1.0388],
         [-0.1875, -0.1835,  0.5680,  ...,  0.0724,  0.0063,  0.7481],
         [-0.0598, -0.4010,  0.7227,  ...,  0.2676, -0.0519,  0.4881]]],
       requires_grad=True))
('patch_embed.proj.weight', Parameter containing:
tensor([[[[ 1.0929e-02,  5.9179e-03,  3.5793e-03,  ...,  1.4401e-02,
            1.5265e-03,  1.8531e-03],
          [ 1.3361e-02,  1.0748e-02,  8.9903e-03,  ..., -2.4197e-03,
           -2.7329e-03,  4.8976e-04],
          [ 1.3349e-02,  9.7972e-03,  7.4245e-03,  ..., -1.3339e-03,
            4.0807e-03, 

In [None]:
# TODO:
# arg --> list of blocks to quantize for ViT/Q-Former

In [45]:
weight_bits = 8
activation_bits = 32

In [46]:
from layers.nbitlineardynamic import *
Q_layer = NBitLinearDynamic(model.vision_proj.in_features, 
                            model.vision_proj.out_features, 
                            bias=True,
                            weight_bits = 8,
                            activation_bits = 32)

with torch.no_grad():
    Q_layer.weight.copy_(model.vision_proj.weight)
    Q_layer.bias.copy_(model.vision_proj.bias)
    

Q_layer

NBitLinearDynamic(in_features=768, out_features=256, bias=True)

In [47]:
model.vision_proj = Q_layer

In [48]:
model.vision_proj

NBitLinearDynamic(in_features=768, out_features=256, bias=True)

In [49]:
model

Blip2Qformer(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )


In [30]:
[module for module in model.modules()]

[Blip2Qformer(
   (visual_encoder): VisionTransformer(
     (patch_embed): PatchEmbed(
       (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
     )
     (pos_drop): Dropout(p=0.0, inplace=False)
     (blocks): ModuleList(
       (0-38): 39 x Block(
         (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
         (attn): Attention(
           (qkv): Linear(in_features=1408, out_features=4224, bias=False)
           (attn_drop): Dropout(p=0.0, inplace=False)
           (proj): Linear(in_features=1408, out_features=1408, bias=True)
           (proj_drop): Dropout(p=0.0, inplace=False)
         )
         (drop_path): Identity()
         (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
         (mlp): Mlp(
           (fc1): Linear(in_features=1408, out_features=6144, bias=True)
           (act): GELU(approximate='none')
           (fc2): Linear(in_features=6144, out_features=1408, bias=True)
           (drop): Dropout(p=0.0, inplace=False)
  

In [36]:
model.visual_encoder

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-38): 39 x Block(
      (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1408, out_features=4224, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1408, out_features=1408, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1408, out_features=6144, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=6144, out_features=1408, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
)