In [1]:
import json
import random
from types import SimpleNamespace

import numpy as np
import torch
import torch.backends.cudnn as cudnn
from PIL import Image

from minigpt4.common.config import Config
from minigpt4.common.dist_utils import get_rank
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import Chat, CONV_VISION

# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *

from deepspeed.profiling.flops_profiler import get_model_profile
from deepspeed.accelerator import get_accelerator

  warn(f"Failed to load image Python extension: {e}")
  from .autonotebook import tqdm as notebook_tqdm


Setting ds_accelerator to cuda (auto detect)


In [2]:
# Use one seed for Python's `random`, NumPy and PyTorch so repeated runs of the
# notebook start from the same random state.
seed = 2023
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

# cuDNN flags: autotuning off, deterministic kernels requested.
cudnn.benchmark = False
cudnn.deterministic = True

In [3]:
def ask_with_image(chat: Chat, question: str, image_id: int):
    """Ask one question about one OK-VQA (COCO val2014) image.

    Every call starts from a fresh copy of ``CONV_VISION`` and an empty image
    list, so no conversation state is shared between calls.

    Args:
        chat: ``Chat`` wrapper used to upload the image, register the question
            and generate the reply.
        question: Question text handed to ``chat.ask``.
        image_id: COCO image id; it is zero-padded to 12 digits to build the
            file name ``COCO_val2014_<id>.jpg``.

    Returns:
        The first element of what ``chat.answer`` returns.
        NOTE(review): presumably the generated answer text -- confirm against
        ``Chat.answer``.

    Raises:
        FileNotFoundError: if the image file for ``image_id`` does not exist.
    """
    chat_state = CONV_VISION.copy()
    img_list = []
    image_path = f'../datasets/OK-VQA/image/val2014/COCO_val2014_{str(image_id).zfill(12)}.jpg'

    # The context manager releases the file handle even if the upload raises.
    with Image.open(image_path) as img:
        # convert() returns an in-memory RGB copy, so it stays valid after the
        # file is closed. NOTE(review): assumes the visual processor expects
        # 3-channel input (some COCO files are grayscale) -- confirm.
        # The status message returned by upload_img is intentionally ignored.
        chat.upload_img(img.convert('RGB'), chat_state, img_list)

    chat.ask(question, chat_state)
    return chat.answer(
        conv=chat_state,
        img_list=img_list,
        num_beams=1,
        temperature=1,
        max_new_tokens=300,
        max_length=2000
    )[0]

In [4]:
# Attribute bag handed to Config.
# NOTE(review): presumably stands in for the parsed command-line arguments of
# the MiniGPT-4 scripts -- confirm against Config.
args = SimpleNamespace(
    cfg_path='eval_configs/minigpt4_eval.yaml',
    gpu_id=0,
    options=None,
)
cfg = Config(args)

# Look up the model class named by the config, build it, and move it to the
# GPU selected by args.gpu_id.
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(f'cuda:{args.gpu_id}')

# Build the visual processor from the cc_sbu_align "train" processor config
# and wrap model + processor in a Chat on the same GPU.
vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor_cls = registry.get_processor_class(vis_processor_cfg.name)
vis_processor = vis_processor_cls.from_config(vis_processor_cfg)
chat = Chat(model, vis_processor, device=f'cuda:{args.gpu_id}')

Loading VIT
Loading VIT Done
Loading Q-Former
Loading Q-Former Done
Loading LLAMA


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.48s/it]


Loading LLAMA Done
Load 4 training prompts
Prompt Example 
###Human: <Img><ImageHere></Img> Could you describe the contents of this image for me? ###Assistant: 
Load BLIP2-LLM Checkpoint: /home/chengzhang/Multimodal-Quantization/MiniGPT-4/checkpoints/prerained_minigpt4_7b.pth


In [5]:
def calc_param_size(param: torch.nn.Parameter):
    """Return the size of ``param`` in bytes.

    The size is the element count multiplied by the byte width of a single
    element.
    """
    element_count = param.nelement()
    bytes_per_element = param.element_size()
    return element_count * bytes_per_element

def calc_mem_size(module: torch.nn.Module):
    """Return the summed byte size of everything ``module.parameters()`` yields.

    Each parameter is measured with ``calc_param_size``; a module without
    parameters gives 0.
    """
    total_bytes = 0
    for param in module.parameters():
        total_bytes += calc_param_size(param)
    return total_bytes

In [6]:
# Report the parameter footprint of each model component in MB (bytes / 1024 / 1024).
# Each entry pairs an attribute of `model` with the function that measures it:
# query_tokens is a bare parameter, the others are modules.
# llama_tokenizer is not measured here.
_sized_components = (
    ('visual_encoder', calc_mem_size),
    ('ln_vision', calc_mem_size),
    ('Qformer', calc_mem_size),
    ('query_tokens', calc_param_size),
    ('llama_model', calc_mem_size),
)
for _component_name, _size_of in _sized_components:
    print(f'{_component_name}:', _size_of(getattr(model, _component_name)) / 1024 / 1024, 'MB')

visual_encoder: 1881.765869140625 MB
ln_vision: 0.0107421875 MB
Qformer: 401.068359375 MB
query_tokens: 0.09375 MB
llama_model: 25705.046875 MB


In [7]:
# Read the OK-VQA val2014 question file and keep only its 'questions' entry.
with open('../datasets/OK-VQA/question/OpenEnded_mscoco_val2014_questions.json') as questions_file:
    questions = json.load(questions_file)['questions']

In [9]:
# Profile one forward pass of the model on the first OK-VQA question.
q = questions[0]
question = q['question']
image_id = q['image_id']

# Follow the GPU chosen in args.gpu_id (the one the model was moved to) rather
# than hard-coding the default 'cuda' device and accelerator index 0.
profile_device = f'cuda:{args.gpu_id}'

image_path = f'../datasets/OK-VQA/image/val2014/COCO_val2014_{str(image_id).zfill(12)}.jpg'
# The context manager closes the image file once the tensor has been built.
with Image.open(image_path) as pil_image:
    # NOTE(review): RGB conversion assumes the visual processor expects
    # 3-channel input (some COCO files are grayscale) -- confirm.
    # unsqueeze(0) adds a leading batch dimension; the tensor is cast to
    # float16 before being moved to the GPU.
    image = chat.vis_processor(pil_image.convert('RGB')).unsqueeze(0).to(torch.float16).to(profile_device)

# The sample dict is passed through `args` as a one-element tuple, i.e. as the
# model's single positional argument, instead of using `input_shape`.
with get_accelerator().device(args.gpu_id), model.maybe_autocast():
    flops, macs, params = get_model_profile(
        model,
        args=({
            'image': image,
            'text_input': [question]
        }, ),
        print_profile=True,
        detailed=True,
    )


-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 1:
Notations:
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)

params per gpu:                                               7832.63 M
params of model = params per GPU * mp_size:                   7832.63 M
fwd MACs per GPU:                                             1406.86 GMACs
fwd flops per GPU:                                            2814.2 G
fwd flops of model = fwd flops per GPU * mp_size:             2814.2 G
fwd latency:                                                  255.06 ms
fwd FLOPS per GPU = fwd flops per GPU / fwd latency