In [11]:
import argparse
import os
import random
from collections import defaultdict

import cv2
import re

import numpy as np
from PIL import Image
import torch
import html
import gradio as gr

import torchvision.transforms as T
import torch.backends.cudnn as cudnn

from minigpt4.common.config import Config

from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import Conversation, SeparatorStyle, Chat

# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *

In [12]:
class CommandLineArguments:
    def __init__(self, cfg_path='eval_configs/minigptv2_eval.yaml', gpu_id=0, options=None):
        self.cfg_path = cfg_path
        self.gpu_id = gpu_id
        self.options = options

In [13]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

cudnn.benchmark = False
cudnn.deterministic = True

In [14]:
print('Initializing Chat')
args = CommandLineArguments()
cfg = Config(args)

device = 'cuda:{}'.format(args.gpu_id)

model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
bounding_box_size = 100

vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

model = model.eval()

CONV_VISION = Conversation(
    system="",
    roles=(r"<s>[INST] ", r" [/INST]"),
    messages=[],
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="",
)

Initializing Chat


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 33,816,576 || all params: 6,772,232,192 || trainable%: 0.49934165045237716
Position interpolate from 16x16 to 32x32
Load Minigpt-4-LLM Checkpoint: minigpt4/outputs/20240416202/checkpoint_1.pth


In [15]:
def gradio_reset(chat_state, img_list):
    if chat_state is not None:
        chat_state.messages = []
    if img_list is not None:
        img_list = []
    return None, gr.update(value=None, interactive=True), gr.update(placeholder='Upload your image and chat',
                                                                    interactive=True), chat_state, img_list


def image_upload_trigger(upload_flag, replace_flag, img_list):
    # set the upload flag to true when receive a new image.
    # if there is an old image (and old conversation), set the replace flag to true to reset the conv later.
    upload_flag = 1
    if img_list:
        replace_flag = 1
    return upload_flag, replace_flag


def example_trigger(text_input, image, upload_flag, replace_flag, img_list):
    # set the upload flag to true when receive a new image.
    # if there is an old image (and old conversation), set the replace flag to true to reset the conv later.
    upload_flag = 1
    if img_list or replace_flag == 1:
        replace_flag = 1

    return upload_flag, replace_flag


def gradio_ask(user_message, chatbot, chat_state, gr_img, img_list, upload_flag, replace_flag):
    if len(user_message) == 0:
        text_box_show = 'Input should not be empty!'
    else:
        text_box_show = ''

    if isinstance(gr_img, dict):
        gr_img, mask = gr_img['image'], gr_img['mask']
    else:
        mask = None

    if '[identify]' in user_message:
        # check if user provide bbox in the text input
        integers = re.findall(r'-?\d+', user_message)
        if len(integers) != 4:  # no bbox in text
            bbox = mask2bbox(mask)
            user_message = user_message + bbox

    if chat_state is None:
        chat_state = CONV_VISION.copy()

    if upload_flag:
        if replace_flag:
            chat_state = CONV_VISION.copy()  # new image, reset everything
            replace_flag = 0
            chatbot = []
        img_list = []
        llm_message = chat.upload_img(gr_img, chat_state, img_list)
        upload_flag = 0

    chat.ask(user_message, chat_state)

    chatbot = chatbot + [[user_message, None]]

    if '[identify]' in user_message:
        visual_img, _ = visualize_all_bbox_together(gr_img, user_message)
        if visual_img is not None:
            file_path = save_tmp_img(visual_img)
            chatbot = chatbot + [[(file_path,), None]]

    return text_box_show, chatbot, chat_state, img_list, upload_flag, replace_flag


def gradio_answer(chatbot, chat_state, img_list, temperature):
    llm_message = chat.answer(conv=chat_state,
                              img_list=img_list,
                              temperature=temperature,
                              max_new_tokens=500,
                              max_length=2000)[0]
    chatbot[-1][1] = llm_message
    return chatbot, chat_state


def gradio_stream_answer(chatbot, chat_state, img_list, temperature):
    if len(img_list) > 0:
        if not isinstance(img_list[0], torch.Tensor):
            chat.encode_img(img_list)
    streamer = chat.stream_answer(conv=chat_state,
                                  img_list=img_list,
                                  temperature=temperature,
                                  max_new_tokens=500,
                                  max_length=2000)
    output = ''
    for new_output in streamer:
        escapped = escape_markdown(new_output)
        output += escapped
        chatbot[-1][1] = output
        yield chatbot, chat_state
    chat_state.messages[-1][1] = '</s>'
    return chatbot, chat_state

In [16]:
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import Levenshtein
import json

transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
])

In [17]:
with open('medvqa_dataset/Slake/Slake1.0/test_en_w_captions_medicap.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

In [18]:
vis_root = 'medvqa_dataset/Slake/Slake1.0/imgs'

In [19]:
chat = Chat(model, vis_processor, device=device)

In [22]:
num_beams = 1
temperature = 0.6
new_obj = []
bleu_scores = []
meteor_scores = []
levenshtein_similarities = []
try:
    if chat_state is not None:
        chat_state.messages = []
    if img_list is not None:
        img_list = []
except:
    pass
for obj in data:
    image_path = os.path.join(vis_root, obj['img_name'])
    image = Image.open(image_path).convert("RGB")
    img = transform(image)
    
    chat_state = CONV_VISION.copy()
    img_list = []
    llm_message = chat.upload_img(img, chat_state, img_list)
    chat.encode_img(img_list)
    
    # user_message = f'Answer the question succinctly and directly, avoiding any details or explanations. Your answers should be straight to the point and as short as possible: {obj["question"]}'
    # user_message = f"Based on the image and the following caption {obj['question']} Answer the following question: {obj['caption']}"
    user_message = "Take this caption into account but don't rely on it solely '{}' {}".format(obj['caption'], obj['question'])
    chat.ask(user_message, chat_state)
    
    llm_message = chat.answer(conv=chat_state,
                              img_list=img_list,
                              num_beams=num_beams,
                              temperature=temperature,
                              max_new_tokens=300,
                              max_length=2000)[0]
    
    
    real_answer = obj['answer']
    
    new_obj.append({'question': obj["question"],
                 'pred': llm_message,
                 'gt': real_answer,
                 'answer_type': obj['answer_type']})
    
    print('Q: ', user_message[59:])
    print('A\': ', llm_message)
    print('A: ', real_answer)
    print()
    
    chat_state.messages = []
    img_list = []

Q:  'Computed tomography scan of the chest showing a nodular opacity in the left lung.' What modality is used to take this image?
A':  theBXL-1 and 2s2 Venti (D Ch. on'kina-flest, orash’N-o 1 (As0c.(pp (D 1 and a H the BOMGbetation to 1 and 1 and 1 and take'ok field-39A Anone/w.hub, g,", of, orase asbagger 1 and be_bro-39a, reelim / dM-31 and 1 and to 1 and 1 and be_9A An’in’ in a and to 2ch 1 and be_statid the B.Ћp> publicof many for - the situation. in a and 1 (As02brri is 1 and 1 and 1 and a: (D Ch. PAd Center (DOn theBxna (DCh.'shave (DTo4amer (Dxna and 1 and 1 and to 1 and 1 and 578n1 and to 1 and a future2brret peri-2se (DOn GI-fl�es.Ћ fiary De. 1 and be_fl(com, reCollections 1 and 1 and 2s two-o, or c: (As01 and be_freter and to 1 and
A:  CT



KeyboardInterrupt: 

In [None]:
with open('test_slake_finetune_w_mid_proj_w_caption_medicap_1.json', "w") as json_file:
    json.dump(new_obj, json_file, indent=4)

In [None]:
###GEMENI

In [132]:
with open('test_slake_finetune_w_mid_proj_w_caption_medicap_1.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

In [133]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part

def generate_text(txt) -> str:#project_id: str, location: str) -> str:
    multimodal_model = GenerativeModel("gemini-1.0-pro")
    response = multimodal_model.generate_content(txt)
    return response.text

In [137]:
yes_no = []

In [138]:
for i, obj in enumerate(data):
    input_text = f"""
    You are an assistant who's job is to post-process outputs from an LLM. The LLM is designed
    to tackle a medical visual question answering task. The outputs of the model are very long and
    detailed, while the ground truth answers are very short. For example, a question like 'Which organ
    is abnormal, heart or lung?' should have the very short answer 'Lung', but the model's output is
    'The lungs are abnormal in this image.'. Another example is a question like 'Where is the abnormality
    in this image?', and the real answer would be 'Upper Left Lung', but the model output would be something
    like 'The abnormality is in the upper left lung, where there is an area of consolidation. This is a
    possible example of pneumonia. However, other possibilities include pulmonary embolism,
    pneumothorax, and pleural effusion, among others.'. This extra information is not needed at all.
    If part of the predicted answer is in the ground truth or vice versa, then consider it correct.

    Now the question at hand is: {obj['question']}
    Your job here is to output 'Yes' if both the model output and the ground truth are semantically the same, and 'No' otherwise.
    Model output: {obj['pred']}
    Ground truth: {obj['gt']}
    """
    try:
        corrected_pred = generate_text(input_text)
    except e:
        print(e)
        corrected_pred = 'No'
    print('PRED: ', obj['pred'])
    print('GT: ', obj['gt'])
    print('Y/N : ', corrected_pred)
    print()
    yes_no.append(corrected_pred)

PRED:  ct
GT:  CT
Y/N :  Yes

PRED:  chest
GT:  Chest
Y/N :  Yes

PRED:  lung
GT:  Lung, Spinal Cord
Y/N :  Yes

PRED:  lung
GT:  Lung
Y/N :  Yes

PRED:  yes
GT:  No
Y/N :  No

PRED:  yes
GT:  Yes
Y/N :  Yes

PRED:  no
GT:  No
Y/N :  Yes

PRED:  chronic obstructive pulmonary disease and pneumonia
GT:  Lung Cancer
Y/N :  No

PRED:  no
GT:  No
Y/N :  Yes

PRED:  right lung
GT:  Left Lung, Right
Y/N :  Yes

PRED:  lung
GT:  Lung
Y/N :  Yes

PRED:  ct
GT:  CT
Y/N :  Yes

PRED:  chest
GT:  Chest
Y/N :  Yes

PRED:  lung
GT:  Lung
Y/N :  Yes

PRED:  spleen
GT:  Lung
Y/N :  No

PRED:  yes
GT:  No
Y/N :  No

PRED:  yes
GT:  Yes
Y/N :  Yes

PRED:  lung cancer and pneumothorax
GT:  Lung Cancer
Y/N :  Yes

PRED:  no
GT:  No
Y/N :  Yes

PRED:  right lung
GT:  Left Lung, Right
Y/N :  No

PRED:  heart
GT:  Lung
Y/N :  No

PRED:  ct scan
GT:  CT
Y/N :  Yes

PRED:  thorax
GT:  Chest
Y/N :  Yes

PRED:  lung
GT:  Lung
Y/N :  Yes

PRED:  lungs
GT:  Lung
Y/N :  Yes

PRED:  yes
GT:  No
Y/N :  No

PRED:  no


NameError: name 'e' is not defined

In [None]:
yes_no = np.array(yes_no)

In [None]:
yes_no[0] = 'Yes'

In [None]:
(sum(yes_no == 'Yes') / len(yes_no)) * 100

In [None]:
open_closed = []

In [None]:
for i, obj in enumerate(data):
    if obj['answer_type'] == 'OPEN': 
        open_closed.append(yes_no[i])
    

In [None]:
open_closed = np.array(open_closed)

In [None]:
(sum(open_closed == 'Yes') / len(open_closed)) * 100

# cnt = 0
for obj in data:
    if obj['answer_type'] == 'OPEN':
        cnt += 1

In [57]:
cnt

645

In [47]:
((179/451)*64.39) + ((272/451)*81.98)

74.99860310421286

In [60]:
((645/1061)*84.71) + ((416/1061)*83.17)

84.10619227144204

In [35]:
!pip install rouge-score



In [117]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import Levenshtein


In [118]:
bleu_scores = []
meteor_scores = []
levenshtein_similarities = []

for i, obj in enumerate(data):
    real_answer = obj['gt']
    llm_message = obj['pred']
    
    # BLEU score
    bleu_score = sentence_bleu([real_answer.split()], 
                               llm_message.split(), 
                               smoothing_function=SmoothingFunction().method1)
    bleu_scores.append(bleu_score)
    
    # METEOR score
    meteor_score_ = meteor_score([real_answer.split()], llm_message.split())
    meteor_scores.append(meteor_score_)
    
    
    # Levenshtein similarity
    levenshtein_distance = Levenshtein.distance(real_answer, llm_message)
    max_len = max(len(real_answer), len(llm_message))
    levenshtein_similarity = 1 - (levenshtein_distance / max_len)
    levenshtein_similarities.append(levenshtein_similarity)

In [119]:
bleu_scores = np.array(bleu_scores)
meteor_scores = np.array(meteor_scores)
levenshtein_similarities = np.array(levenshtein_similarities)
print('bleu_scores', bleu_scores.mean() * 100)
print('meteor_scores', meteor_scores.mean() * 100)
print('levenshtein_similarities', levenshtein_similarities.mean() * 100)

bleu_scores 2.920153483923626
meteor_scores 24.983686717127775
levenshtein_similarities 39.32752186932412


In [121]:
bleu_scores_oc = []
meteor_scores_oc = []
levenshtein_similarities_oc = []

for i, obj in enumerate(data):
    if obj['answer_type'] == 'CLOSED': 
        bleu_scores_oc.append(bleu_scores[i])
        meteor_scores_oc.append(meteor_scores[i])
        levenshtein_similarities_oc.append(levenshtein_similarities[i])
        
bleu_scores_oc = np.array(bleu_scores_oc)    
meteor_scores_oc = np.array(meteor_scores_oc)    
levenshtein_similarities_oc = np.array(levenshtein_similarities_oc)  

print('bleu_scores', bleu_scores_oc.mean() * 100)
print('meteor_scores', meteor_scores_oc.mean() * 100)
print('levenshtein_similarities', levenshtein_similarities_oc.mean() * 100)

bleu_scores 3.20351805484953
meteor_scores 30.594743878412604
levenshtein_similarities 42.372052987861814
