In [82]:
import networkx as nx
import argparse
import os
import sys
import numpy as np
import json
import torch
import torchvision
from PIL import Image
import gc
sys.path.append('/root/autodl-tmp/GroundedSAM_src/Grounded-Segment-Anything')
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
import pickle

import subprocess
import os

import mimetypes
import os
from io import BytesIO
from typing import Union
import cv2
import requests
import torch
import transformers
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from tqdm import tqdm
import sys

### 工具函数

### 普通工具

In [83]:
def load_pickle_dictionary(filename):
    with open(filename, 'rb') as file:
        loaded_dict = pickle.load(file)
    return loaded_dict

def save_pickle(data, filename):
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

#### GDINO函数

In [84]:
def load_image(image_path):
    # load image
    image_pil = Image.open(image_path).convert("RGB")  # load image

    transform = T.Compose(
        [
            # T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)  # 3, h, w
    return image_pil, image

def load_model(model_config_path, model_checkpoint_path, device):
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)
    _ = model.eval()
    return model

def get_grounding_output(model, image, caption, box_threshold, text_threshold,device="cpu"):
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
    logits.shape[0]

    # filter output
    logits_filt = logits.clone()
    boxes_filt = boxes.clone()
    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
    logits_filt = logits_filt[filt_mask]  # num_filt, 256
    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
    logits_filt.shape[0]

    # get phrase
    tokenlizer = model.tokenizer
    tokenized = tokenlizer(caption)
    # build pred
    pred_phrases = []
    scores = []
    for logit, box in zip(logits_filt, boxes_filt):
        pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
        pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
        scores.append(logit.max().item())

    return boxes_filt, torch.Tensor(scores), pred_phrases

def save_obj_image(full_image_path, boxes_filt, pred_phrases, topk=5):
    mid_res_dir = '/root/autodl-tmp/mmcot_test/mid_res_test'
    image_dir, image_name = os.path.split(full_image_path)
    image_name_without_ext = os.path.splitext(image_name)[0]
    folder_path = os.path.join(mid_res_dir, image_name_without_ext)

    image_pil, image = load_image(full_image_path)
    # 如果文件夹不存在，则创建
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    idx=0
    # 遍历边界框和短语
    for idx, (bbox, phrase) in enumerate(zip(boxes_filt, pred_phrases)):
        if idx >= topk:
            break
        # phrase=re.sub(r'\([^)]*\)', '', phrase)
        bbox=[int(num) for num in bbox]
        # 裁剪图像
        cropped_image = image_pil.crop(bbox)

        # 构建保存路径并保存图像
        # 在文件名中加入索引（id）以确保唯一性
        save_path = os.path.join(folder_path, f"{phrase}_{idx}.jpg")
        cropped_image.save(save_path)

In [85]:
def get_det(
    model,
    image_path, 
    phrase,
    box_threshold = 0.25,
    text_threshold = 0.2,
    iou_threshold = 0.38):
    image_pil, image = load_image(image_path)
    boxes_filt, scores, pred_phrases = get_grounding_output(
        model, image, phrase, box_threshold, text_threshold, device=device
    )
    size = image_pil.size
    H, W = size[1], size[0]
    for i in range(boxes_filt.size(0)):
            boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
            boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
            boxes_filt[i][2:] += boxes_filt[i][:2]

    boxes_filt = boxes_filt.cpu()
    # use NMS to handle overlapped boxes
    # print(f"Before NMS: {boxes_filt.shape[0]} boxes")
    nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
    boxes_filt = boxes_filt[nms_idx]
    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
    return boxes_filt, pred_phrases, image_pil, image

#### Otter函数

In [86]:
sys.path.append("/root/autodl-tmp/Otter/Otter/src/otter_ai")
os.chdir('/root/autodl-tmp/Otter/Otter/src/otter_ai')
sys.path.append("..")
from models.otter.modeling_otter import OtterForConditionalGeneration


# Disable warnings
requests.packages.urllib3.disable_warnings()

# ------------------- Utility Functions -------------------


def get_content_type(file_path):
    content_type, _ = mimetypes.guess_type(file_path)
    return content_type


# ------------------- Image and Video Handling Functions -------------------

def get_image(url: str) -> Union[Image.Image, list]:
    if "://" not in url:  # Local file
        content_type = get_content_type(url)
    else:  # Remote URL
        content_type = requests.head(url, stream=True, verify=False).headers.get("Content-Type")

    if "image" in content_type:
        if "://" not in url:  # Local file
            return Image.open(url)
        else:  # Remote URL
            return Image.open(requests.get(url, stream=True, verify=False).raw)
    else:
        raise ValueError("Invalid content type. Expected image or video.")


# ------------------- OTTER Prompt and Response Functions -------------------


def get_formatted_prompt(prompt: str, in_context_prompts: list = []) -> str:
    in_context_string = ""
    for in_context_prompt, in_context_answer in in_context_prompts:
        in_context_string += f"<image>User: {in_context_prompt} GPT:<answer> {in_context_answer}<|idfix|>"
    return f"{in_context_string}<image>User: {prompt} GPT:<answer>"


def get_response(image_list, prompt: str, model=None, image_processor=None, in_context_prompts: list = []) -> str:
    input_data = image_list

    if isinstance(input_data, Image.Image):
        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
    elif isinstance(input_data, list):  # list of video frames
        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
    else:
        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")

    print(get_formatted_prompt(prompt, in_context_prompts))
    
    lang_x = model.text_tokenizer(
        [
            get_formatted_prompt(prompt, in_context_prompts),
        ],
        return_tensors="pt",
    )
    bad_words_id = tokenizer(["User:", "GPT1:", "GFT:", "GPT:"], add_special_tokens=False).input_ids
    generated_text = model.generate(
        vision_x=vision_x.to(model.device),
        lang_x=lang_x["input_ids"].to(model.device),
        attention_mask=lang_x["attention_mask"].to(model.device),
        max_new_tokens=512,
        num_beams=3,
        no_repeat_ngram_size=3,
        bad_words_ids=bad_words_id,
    )
    parsed_output = (
        model.text_tokenizer.decode(generated_text[0])
        .split("<answer>")[-1]
        .lstrip()
        .rstrip()
        .split("<|idfix|>")[0]
        .lstrip()
        .rstrip()
        .lstrip('"')
        .rstrip('"')
    )
    return parsed_output

def get_response_test(image_list, prompt: str, model=None, image_processor=None) -> str:
    input_data = image_list

    if isinstance(input_data, Image.Image):
        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
    elif isinstance(input_data, list):  # list of video frames
        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
    else:
        raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")

    lang_x = model.text_tokenizer(
        [
            prompt
        ],
        return_tensors="pt",
    )
    bad_words_id = tokenizer(["User:", "GPT1:", "GFT:", "GPT:"], add_special_tokens=False).input_ids
    generated_text = model.generate(
        vision_x=vision_x.to(model.device),
        lang_x=lang_x["input_ids"].to(model.device),
        attention_mask=lang_x["attention_mask"].to(model.device),
        max_new_tokens=512,
        num_beams=3,
        no_repeat_ngram_size=3,
        bad_words_ids=bad_words_id,
    )
    parsed_output = (
        model.text_tokenizer.decode(generated_text[0])
        .split("<answer>")[-1]
        .lstrip()
        .rstrip()
        .split("<|endofchunk|>")[0]
        .lstrip()
        .rstrip()
        .lstrip('"')
        .rstrip('"')
    )
    return parsed_output

In [87]:
def get_predict(urls, prompt, thismodel, thisprocessor):
    image_list = []
    for url in urls:
        image_list.append(get_image(url))
    return get_response_test(image_list, prompt, model=thismodel, image_processor=thisprocessor)

### 网络加速

In [88]:
# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value

### Otter加载

In [89]:
predict_model = OtterForConditionalGeneration.from_pretrained("/root/autodl-tmp/cacheFile/.cache/huggingface/hub/OTTER-Image-LLaMA7B-LA-InContext", device_map="auto")
predict_model.text_tokenizer.padding_side = "left"
tokenizer = predict_model.text_tokenizer
image_processor = transformers.CLIPImageProcessor()
predict_model.eval()

The current model version is configured for Otter-Image with max_num_frames set to None.
Parameter: lang_encoder.model.embed_tokens.weight, Size: 131.088384 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn_gate, Size: 0.000001 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.ff_gate, Size: 0.000001 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.weight, Size: 0.004096 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.norm.bias, Size: 0.004096 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_q.weight, Size: 2.097152 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_kv.weight, Size: 1.048576 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.attn.to_out.weight, Size: 2.097152 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.0.weight, Size: 0.004096 M
Parameter: lang_encoder.model.layers.3.gated_cross_attn_layer.feed_forward.

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OtterForConditionalGeneration(
  (lang_encoder): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32004, 4096, padding_idx=0)
      (layers): ModuleList(
        (0): OtterLayer(
          (decoder_layer): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
              (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
              (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
              (act_fn): SiLU()
            )

In [90]:
# import openxlab
# openxlab.login(ak='y67rgxmam1lkooaxjqav', sk='gplk4xadnmybodjmbz4o6wgb2mk5zvp2arbwlzxo') #进行登录，输入对应的AK/SK

# from openxlab.dataset import get
# get(dataset_repo='OpenDataLab/GQA', target_path='/root/autodl-tmp/datasets/gqa')  # 数据集下载

# # from openxlab.dataset import download
# # download(dataset_repo='OpenDataLab/GQA',source_path='/README.md', target_path='/path/to/local/folder') #数据集文件下载

### 中间结果加载

In [91]:
q_categories=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_data/mmcot/decomp_cate.pkl')
q_answers=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_data/mmcot/qs2ans.pkl')
q_to_img=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_data/mmcot/qs2img.pkl')
q_midres=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_data/mmcot/decomp_pth.pkl')

In [92]:
q_answers

defaultdict(dict,
            {'Does the motorcycle that is made of metal look silver?': {'answer': 'yes',
              'full_answer': 'Yes, the motorcycle is silver.'},
             'Do the cabinets to the right of the drawers have light brown color?': {'answer': 'yes',
              'full_answer': 'Yes, the cabinets are light brown.'},
             'Do you see bookcases or tables that are wooden?': {'answer': 'yes',
              'full_answer': 'Yes, the bookcase is wooden.'},
             'Do you see any toilets near the clean sink?': {'answer': 'yes',
              'full_answer': 'Yes, there is a toilet near the sink.'},
             'Are there either any plates or kites in the photograph?': {'answer': 'no',
              'full_answer': 'No, there are no plates or kites.'},
             'Does the woman to the right of the other woman look Asian and young?': {'answer': 'no',
              'full_answer': 'No, the woman is young but caucasian.'},
             'Is the black vegetable 

In [93]:
from pathlib import Path
def replace_paths_with_strings(data, old_path, new_path):
    for key, value in data.items():
        if isinstance(value, dict):
            replace_paths_with_strings(value, old_path, new_path)
        elif isinstance(value, list):
            new_values = []
            for v in value:
                if isinstance(v, str) or isinstance(v, Path):
                    str_path = str(v).replace(old_path, new_path)
                    new_values.append(str_path)  # 保存为字符串
            data[key] = new_values
        elif isinstance(value, str):
            data[key] = value.replace(old_path, new_path)
    return data

q_midres=replace_paths_with_strings(q_midres, '/root/projects/mmcot/', '/root/autodl-tmp/mmcot_data/mmcot/')
q_midres['What is the color of the striped shirt?']

{'Is the shirt striped?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n466319.jpg',
   '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n466319/shirt(0.49)_0.jpg'],
  'prompt': 'According to image0: <image0>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 Is the shirt <image1>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 striped?'},
 'What is the color of the striped shirt?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n466319.jpg'],
  'prompt': 'According to image0: <image0>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 What is the color of the striped shirt?'}}

In [94]:
text_to_remove = 'According to image0: <image0>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图'

# 更新嵌套字典中的prompt字段
updated_nested_dict = {}

for outer_key, inner_dict in q_midres.items():
    updated_inner_dict = {}
    for inner_key, value in inner_dict.items():
        # 使用字符串替换去除指定文本
        new_prompt = value['prompt'].replace('According to image0: <image0>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图', '').strip()
        updated_inner_dict[inner_key] = {
            'images': value['images'],
            'prompt': new_prompt
        }
    updated_nested_dict[outer_key] = updated_inner_dict

updated_nested_dict

{'Does the motorcycle that is made of metal look silver?': {'Is the motorcycle made of metal?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n9181.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n9181/motorcycle(0.72)_0.jpg'],
   'prompt': 'Is the motorcycle <image1>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 made of metal?'},
  'Does the motorcycle look silver?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n9181.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n9181/motorcycle(0.72)_0.jpg'],
   'prompt': 'Does the motorcycle <image1>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 look silver?'}},
 'Do the cabinets to the right of the drawers have light brown color?': {'Are there cabinets to the right of the drawers?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n35676.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n35676/cabinets(0.51)_0.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n35676/drawers(0.36)_0.jpg'],


In [95]:
# 定义一个函数来替换所有形式的<image数字>图...图
import re

def replace_image_tags(prompt):
    # 使用正则表达式匹配和替换所有形式的<image数字>图...图为<image>
    new_prompt = re.sub(r'<image\d+>图+', '<image>', prompt)
    return new_prompt

# 应用这个函数更新嵌套字典
def update_nested_dict_with_regex(nested_dict):
    updated_dict = {}
    for outer_key, inner_dict in nested_dict.items():
        updated_inner_dict = {}
        for inner_key, value in inner_dict.items():
            # 替换指定文本并且处理<image数字>图...图的情况
            updated_prompt = replace_image_tags(value['prompt'])
            updated_inner_dict[inner_key] = {
                'images': value['images'],
                'prompt': updated_prompt
            }
        updated_dict[outer_key] = updated_inner_dict
    return updated_dict

updated_nested_dict_with_regex = update_nested_dict_with_regex(updated_nested_dict)
updated_nested_dict_with_regex

{'Does the motorcycle that is made of metal look silver?': {'Is the motorcycle made of metal?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n9181.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n9181/motorcycle(0.72)_0.jpg'],
   'prompt': 'Is the motorcycle <image> made of metal?'},
  'Does the motorcycle look silver?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n9181.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n9181/motorcycle(0.72)_0.jpg'],
   'prompt': 'Does the motorcycle <image> look silver?'}},
 'Do the cabinets to the right of the drawers have light brown color?': {'Are there cabinets to the right of the drawers?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n35676.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n35676/cabinets(0.51)_0.jpg',
    '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n35676/drawers(0.36)_0.jpg'],
   'prompt': 'Are there cabinets <image> to the right of the drawe

In [96]:
updated_nested_dict_with_regex['Do the shorts that are made of denim and the shorts made of cloth both look short?']

{'Do the shorts made of denim look short?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n119944.jpg'],
  'prompt': 'Do the shorts made of denim look short?'},
 'Do the shorts made of cloth look short?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n119944.jpg',
   '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n119944/shorts(0.35)_0.jpg'],
  'prompt': 'Do the shorts <image> made of cloth look short?'},
 'Do both the shorts made of denim and the shorts made of cloth look short?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n119944.jpg'],
  'prompt': 'Do both the shorts made of denim and the shorts made of cloth look short?'}}

In [97]:
q_categories['Do the shorts that are made of denim and the shorts made of cloth both look short?']

{'Are there shorts made of denim?': defaultdict(list,
             {'category': ['detection'], 'objects': ['shorts, denim.']}),
 'Are there shorts made of cloth?': defaultdict(list,
             {'category': ['detection'], 'objects': ['shorts']}),
 'Do the shorts made of denim look short?': defaultdict(list,
             {'category': ['attribute'],
              'objects': ['shorts made of denim.']}),
 'Do the shorts made of cloth look short?': defaultdict(list,
             {'category': ['attribute'], 'objects': ['shorts, cloth']}),
 'Do both the shorts made of denim and the shorts made of cloth look short?': defaultdict(list,
             {'category': ['attribute'],
              'objects': ['shorts made of denim, shorts made of cloth']})}

### 直接-otter问答

In [98]:
# prompt_template='<image>User: <questions> GPT:<answer>'
# ref_file=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_res/2024-2-22-Res.pkl')
# savedict={}
# for ori_q, ref in ref_file.items():
#     image_path=ref['image']
#     urls=[image_path]
    
#     base_prompt=prompt_template.replace('<questions>', ori_q)
#     base_res=get_predict(urls, base_prompt, predict_model, image_processor)
#     print(base_res)
#     savedict[ori_q]=base_res

# save_pickle(savedict, '/root/autodl-tmp/mmcot_res/2024-2-22-Base-Res.pkl')

In [99]:
from pathlib import Path
def replace_paths_with_strings(data, old_path, new_path):
    for key, value in data.items():
        if isinstance(value, dict):
            replace_paths_with_strings(value, old_path, new_path)
        elif isinstance(value, list):
            new_values = []
            for v in value:
                if isinstance(v, str) or isinstance(v, Path):
                    str_path = str(v).replace(old_path, new_path)
                    new_values.append(str_path)  # 保存为字符串
            data[key] = new_values
        elif isinstance(value, str):
            data[key] = value.replace(old_path, new_path)
    return data

q_midres=replace_paths_with_strings(q_midres, '/root/projects/mmcot/', '/root/autodl-tmp/mmcot_data/mmcot/')
q_midres['What is the color of the striped shirt?']

{'Is the shirt striped?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n466319.jpg',
   '/root/autodl-tmp/mmcot_data/mmcot/gqa/intermediate/n466319/shirt(0.49)_0.jpg'],
  'prompt': 'According to image0: <image0>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 Is the shirt <image1>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 striped?'},
 'What is the color of the striped shirt?': {'images': ['/root/autodl-tmp/mmcot_data/mmcot/gqa/images/n466319.jpg'],
  'prompt': 'According to image0: <image0>图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图图 What is the color of the striped shirt?'}}

### 问题合并-otter问答

In [108]:
sub_ress=load_pickle_dictionary('/root/autodl-tmp/mmcot_res/2024-2-23-subquestions-result.pkl')

In [109]:
q_temp='''
Example: (输入: (Are the bookcases <image> on top of the rug? <image> & No, the bookcase is on the floor)   输出:(The bookcases <image> are on the floor, not on top of the rug <image>.)) 其中每个<image>与前面的那个名词对应, 输入的问答以 '&'分隔。要求合并结果(1.保留所有<image>及其对应的名词,不允许丢弃2.不能改变各个<image>间的相对位置关系。)现在 输入:(<tobedone>) 输出:
'''

### 陈述句生成

In [110]:
import openai
from openai import OpenAI

client = OpenAI(api_key='sk-wNShTVgz1n20rFAyBaAaE4F8Ec934b75Ae4cF036479857C5', base_url='https://openkey.cloud/v1')
# openai.api_base = "https://api.openai.com/v1" # 换成代理，一定要加v1
openai.api_base = "https://openkey.cloud/v1" # 换成代理，一定要加v1
# openai.api_key = "API_KEY"
openai.api_key = "sk-wNShTVgz1n20rFAyBaAaE4F8Ec934b75Ae4cF036479857C5"

In [111]:
def get_comb(input):
    response = client.chat.completions.create(
                                    model="gpt-3.5-turbo-1106",
                                    messages=[
                                      {"role": "user", "content": input}
                                    ],
                                    # 流式输出
                                    stream = False)
    print(input)
    print(response)
    print('*'*100)
    return response.choices[0].message.content

In [112]:
after_dict={}
temp_prompt_q=updated_nested_dict_with_regex

for ori_q, subs in sub_ress.items():
  after_sub={}
  for subq, subq_res in subs.items():
    ans=subq_res
    subprompt=temp_prompt_q[ori_q][subq]['prompt']
    
    inputp=subprompt+' & '+ans
    # print(inputp)
    promptsi=q_temp.replace('<tobedone>', inputp)
    combed_=get_comb(promptsi)
    after_sub[subq]=combed_
    
  after_dict[ori_q]=after_sub

save_pickle(after_dict, '/root/autodl-tmp/mmcot_res/合并子问题答案v2.pkl')
    


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Example: (输入: (Are the bookcases <image> on top of the rug? <image> & No, the bookcase is on the floor)   输出:(The bookcases <image> are on the floor, not on top of the rug <image>.)) 其中每个<image>与前面的那个名词对应, 输入的问答以 '&'分隔。要求合并结果(1.保留所有<image>及其对应的名词,不允许丢弃2.不能改变各个<image>间的相对位置关系。)现在 输入:(Is the motorcycle <image> made of metal? & No, the motorbike is made of plastic.) 输出:

ChatCompletion(id='chatcmpl-8velbDxYYoL5H3EfAuk0oFgDkvkWE', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='(The motorcycle <image> is made of plastic, not of metal.)', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1708752095, model='gpt-3.5-turbo-1106', object='chat.completion', system_fingerprint='fp_406be318f3', usage=CompletionUsage(completion_tokens=14, prompt_tokens=177, total_tokens=191))
****************************************************************************************************

Example: (输入: (Are the bookcases <image> on top of the

In [126]:
subprom_dict=load_pickle_dictionary('/root/autodl-tmp/mmcot_res/合并子问题答案v2.pkl')
print(subprom_dict)



In [133]:
prompt_template='<image>User: <questions> GPT:<answer>'
globalidx=1
save_dict={}
sub_qs_dicts={}

with open("/root/autodl-tmp/mmcot_res/2024-2-24-减少完整图像-不带qa提示-简短-陈述句v2-Result.txt", "w") as file:
    for origin_question, de_questions in updated_nested_dict_with_regex.items():
        global_dict={}
        intermidiate_list=[]
        subq_dict={}
        
        global_dict['question']=origin_question
        
        global_image_path=''
        print("*"*100)
        print(f"{globalidx}.问题：",origin_question)
        print("-"*10)
        
        file.write("*" * 100 + "\n")
        file.write(f"{globalidx}.问题：{origin_question}\n")
        file.write("-" * 10 + "\n")
        
        full_context='Given the facts: '
        all_urls=[]
        idx=1
        for sub_q, subq_resource in de_questions.items():
            
            cate=q_categories[sub_q]
            # sub_context=''
            if cate==['detection']:
                continue
            sub_urls=subq_resource['images']
            
            if global_image_path=='':
                for sub_url in sub_urls:
                    if '/root/autodl-tmp/mmcot_data/mmcot/gqa/images' in sub_url:
                        global_image_path=sub_url
                        break 
                
            all_urls.extend(sub_urls)
            
            sub_prompt=prompt_template.replace('<questions>', subq_resource['prompt'])
            
            # sub_res=get_predict(sub_urls, sub_prompt, predict_model, image_processor)
            sub_res=sub_ress[origin_question][sub_q]
            
            subq_dict[sub_q]=sub_res
            
            intermidiate_list.append((sub_q, sub_res))
            # print(sub_prompt, sub_res)
            
            # sub_context=f"{idx}."+subq_resource['prompt']+" "+sub_res+" "
            sub_context=f"{idx}." + subprom_dict[origin_question][sub_q]
            full_context+=sub_context
            idx+=1
        
        sub_qs_dicts[origin_question]=subq_dict
        
        global_dict['intermidiate']=intermidiate_list
        global_dict['image']=global_image_path
        global_dict['inter_images']=all_urls
        
        # print(full_context)
        # print(all_urls)

        final_prompt=prompt_template.replace('<questions>', full_context+". In the first image, "+ origin_question + ", answer briefly")
        final_res=get_predict(all_urls, final_prompt, predict_model, image_processor)
        global_dict['combined_prompt']=final_prompt
        
        all_urls.insert(0, global_image_path)
        
        # 然后构建一个新列表，只包含不与第一个元素相同的元素
        all_urls = [element for element in all_urls if element != global_image_path]

        all_urls.insert(0, global_image_path)
            
            
        label_count=final_prompt.count("<image>")
        print(label_count,len(all_urls))
        # print(origin_question)
        print(f"{globalidx}.输入：",final_prompt)
        print("-"*20)
        print(f"{globalidx}.结果：",final_res)
        
        global_dict['result']=final_res
        # file.write(f"{label_count}, {len(all_urls)}\n")
        # file.write(origin_question + "\n") # 如果需要再次打印origin_question
        file.write(f"{globalidx}.输入：{final_prompt}\n")
        file.write("-" * 20 + "\n")
        file.write(f"{globalidx}.结果：{final_res}\n")
        globalidx+=1
        
        save_dict[origin_question]=global_dict
        # print(save_dict)

# save_pickle(sub_qs_dicts, '/root/autodl-tmp/mmcot_res/2024-2-23-subquestions-result.pkl')
save_pickle(save_dict,'/root/autodl-tmp/mmcot_res/2024-2-24-减少完整图像-不带qa提示-简短-陈述句v2-Res.pkl')

****************************************************************************************************
1.问题： Does the motorcycle that is made of metal look silver?
----------
3 3
1.输入： <image>User: Given the facts: 1.(The motorcycle <image> is made of plastic, not of metal.)2.(The motorcycle <image> looks silver, as the motorbike is silver.). In the first image, Does the motorcycle that is made of metal look silver?, answer briefly GPT:<answer>
--------------------
1.结果： Yes, the answer is yes.
****************************************************************************************************
2.问题： Do the cabinets to the right of the drawers have light brown color?
----------
5 5
2.输入： <image>User: Given the facts: 1.(The cabinets <image> are to the left of the drawers <image>, not to the right.)2.(The color of the cabinets <image> is brown.)3.(The cabinets <image> have a light brown color, not dark brown.). In the first image, Do the cabinets to the right of the drawers have light brow

In [None]:
q_answers

defaultdict(dict,
            {'Does the motorcycle that is made of metal look silver?': {'answer': 'yes',
              'full_answer': 'Yes, the motorcycle is silver.'},
             'Do the cabinets to the right of the drawers have light brown color?': {'answer': 'yes',
              'full_answer': 'Yes, the cabinets are light brown.'},
             'Do you see bookcases or tables that are wooden?': {'answer': 'yes',
              'full_answer': 'Yes, the bookcase is wooden.'},
             'Do you see any toilets near the clean sink?': {'answer': 'yes',
              'full_answer': 'Yes, there is a toilet near the sink.'},
             'Are there either any plates or kites in the photograph?': {'answer': 'no',
              'full_answer': 'No, there are no plates or kites.'},
             'Does the woman to the right of the other woman look Asian and young?': {'answer': 'no',
              'full_answer': 'No, the woman is young but caucasian.'},
             'Is the black vegetable 

In [None]:
from nltk.translate.bleu_score import sentence_bleu
all_result=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_res/2024-2-24-减少完整图像-不带qa提示-简短-陈述句v2-Res.pkl')
print(all_result)



### GPT 打分

In [134]:
import openai
from openai import OpenAI

client = OpenAI(api_key='sk-wNShTVgz1n20rFAyBaAaE4F8Ec934b75Ae4cF036479857C5', base_url='https://openkey.cloud/v1')
# openai.api_base = "https://api.openai.com/v1" # 换成代理，一定要加v1
openai.api_base = "https://openkey.cloud/v1" # 换成代理，一定要加v1
# openai.api_key = "API_KEY"
openai.api_key = "sk-wNShTVgz1n20rFAyBaAaE4F8Ec934b75Ae4cF036479857C5"

In [153]:
score_template='''
Question: <ori_question>
Standard Answer: <gt>
My Answer: <pred_answer>
Scoring Criteria: Score based on the similarity between the My answer and the standard answer. A perfect match receives a full score (10 points), partially similar answers are awarded points based on the degree of similarity, and answers that do not match receive low or zero points.
Please evaluate the similarity between the My answer and the standard answer according to the scoring criteria provided above and assign an appropriate score from 0.0 to 10.0. Provide the numeric score only without any english word.
'''

In [149]:
gpt_scores={}
gpt_scores_list=[]
for ori_q, res in all_result.items():
    print(ori_q)
    gt=q_answers[ori_q]['full_answer']
    gt_short=q_answers[ori_q]['answer']
    pred=res['result']
    # print(res['combined_prompt'])
    print(gt)
    print(pred)
    print("*"*100)
    ask=score_template.replace('<ori_question>', ori_q).replace('<gt>', gt).replace('<pred_answer>', pred)
    sc=get_comb(ask)
    gpt_scores[ori_q]=sc
    gpt_scores_list.append(float(sc))


print(sum(gpt_scores_list)/len(gpt_scores_list))
save_pickle(gpt_scores, r'/root/autodl-tmp/mmcot_res/2024-2-24-减少完整图像-不带qa提示-简短-陈述句v2-GPT_Score.pkl')

Does the motorcycle that is made of metal look silver?
Yes, the motorcycle is silver.
Yes, the answer is positive. The motorcycle is silver.
****************************************************************************************************

Question: Does the motorcycle that is made of metal look silver?
Standard Answer: Yes, the motorcycle is silver.
My Answer: Yes, the answer is positive. The motorcycle is silver.
Scoring Criteria: Score based on the similarity between the My answer and the standard answer. A perfect match receives a full score (10 points), partially similar answers are awarded points based on the degree of similarity, and answers that do not match receive low or zero points.
Please evaluate the similarity between the My answer and the standard answer according to the scoring criteria provided above and assign an appropriate score from 0.0 to 10.0. Provide the numeric score only without any word.

ChatCompletion(id='chatcmpl-8wAJjBI9rr0yRHVazF2W7DIU5L2Is', choices=

In [150]:
base_result=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_res/2024-2-22-Base-Res.pkl')

In [154]:
gpt_scores_base={}
gpt_scores_base_list=[]
for ori_q, res in base_result.items():
    print(ori_q)
    gt=q_answers[ori_q]['full_answer']
    gt_short=q_answers[ori_q]['answer']
    pred=res
    # print(res['combined_prompt'])
    print(gt)
    print(pred)
    print("*"*100)
    ask=score_template.replace('<ori_question>', ori_q).replace('<gt>', gt).replace('<pred_answer>', pred)
    sc=get_comb(ask)
    gpt_scores_base[ori_q]=sc
    gpt_scores_base_list.append(float(sc))


print(sum(gpt_scores_base_list)/len(gpt_scores_base_list))
save_pickle(gpt_scores_base, r'/root/autodl-tmp/mmcot_res/2024-2-24-Baseline-GPT_Score.pkl')

Does the motorcycle that is made of metal look silver?
Yes, the motorcycle is silver.
Yes, the motorbike is silver.
****************************************************************************************************

Question: Does the motorcycle that is made of metal look silver?
Standard Answer: Yes, the motorcycle is silver.
My Answer: Yes, the motorbike is silver.
Scoring Criteria: Score based on the similarity between the My answer and the standard answer. A perfect match receives a full score (10 points), partially similar answers are awarded points based on the degree of similarity, and answers that do not match receive low or zero points.
Please evaluate the similarity between the My answer and the standard answer according to the scoring criteria provided above and assign an appropriate score from 0.0 to 10.0. Provide the numeric score only without any english word.

ChatCompletion(id='chatcmpl-8wAtO4tGrce2ph6YOnlYrKyg2s5oo', choices=[Choice(finish_reason='stop', index=0, mes

### BLEU2

In [None]:
score_list_full=[]
score_list_short=[]
bleu2_save_dict_full={}
bleu2_save_dict_short={}

for ori_q, res in all_result.items():
    print(ori_q)
    gt=q_answers[ori_q]['full_answer']
    gt_short=q_answers[ori_q]['answer']
    pred=res['result']
    # print(res['combined_prompt'])
    print(gt)
    print(pred)
    print("*"*100)
    
    gt_split=gt.split()
    pred_split=pred.split()

    score = sentence_bleu([gt.split()], pred_split, weights=(0.5, 0.5))
    score_short=sentence_bleu([gt_short.split()], pred_split, weights=(0.5, 0.5))
    score_list_full.append(score)
    score_list_short.append(score_short)
    
    bleu2_save_dict_full[ori_q]=score
    bleu2_save_dict_short[ori_q]=score_short 

save_pickle(bleu2_save_dict_full, '/root/autodl-tmp/mmcot_res/2024-2-24-减少完整图像-不带qa提示-简短-陈述句v2-BLEU2-Full.pkl')
save_pickle(bleu2_save_dict_short, '/root/autodl-tmp/mmcot_res/2024-2-24-减少完整图像-不带qa提示-简短-陈述句v2-BLEU2-Short.pkl')
print("BLEU2(Full Answer):", sum(score_list_full)/len(score_list_full))
print("BLEU2(Short Answer):", sum(score_list_short)/len(score_list_short))
# print(score_list)

Does the motorcycle that is made of metal look silver?
Yes, the motorcycle is silver.
Yes, the answer is positive. The motorcycle is silver.
****************************************************************************************************
Do the cabinets to the right of the drawers have light brown color?
Yes, the cabinets are light brown.
The answer to the question is "no." The cabinets are dark brown, not light brown.
****************************************************************************************************
Do you see bookcases or tables that are wooden?
Yes, the bookcase is wooden.
Yes, there is a wooden table.
****************************************************************************************************
Do you see any toilets near the clean sink?
Yes, there is a toilet near the sink.
No, there is a toilet near the sink.
****************************************************************************************************
Does the woman to the right of the other wom

In [None]:
base_result=load_pickle_dictionary(r'/root/autodl-tmp/mmcot_res/2024-2-22-Base-Res.pkl')
basescore_list_full=[]
basescore_list_short=[]
basebleu2_save_dict_full={}
basebleu2_save_dict_short={}
for ori_q, res in base_result.items():
    # print(ori_q)
    gt=q_answers[ori_q]['full_answer']
    gt_short=q_answers[ori_q]['answer']
    pred=res
    print(gt)
    print(pred)
    print("*"*100)
    
    gt_split=gt.split()
    pred_split=pred.split()

    score = sentence_bleu([gt.split()], pred_split, weights=(0.5, 0.5))
    score_short=sentence_bleu([gt_short.split()], pred_split, weights=(0.5, 0.5))
    basescore_list_full.append(score)
    basescore_list_short.append(score_short)
    
    basebleu2_save_dict_full[ori_q]=score
    basebleu2_save_dict_short[ori_q]=score_short 

save_pickle(basebleu2_save_dict_full, '/root/autodl-tmp/mmcot_res/2024-2-22-base-BLEU2-Full.pkl')
save_pickle(basebleu2_save_dict_short, '/root/autodl-tmp/mmcot_res/2024-2-22-base-BLEU2-Short.pkl')
print("BLEU2(Full Answer):", sum(basescore_list_full)/len(basescore_list_full))
print("BLEU2(Short Answer):", sum(basescore_list_short)/len(basescore_list_short))

Yes, the motorcycle is silver.
Yes, the motorbike is silver.
****************************************************************************************************
Yes, the cabinets are light brown.
No, the cabinet is dark brown.
****************************************************************************************************
Yes, the bookcase is wooden.
Yes, the bookcase is wooden.
****************************************************************************************************
Yes, there is a toilet near the sink.
Yes, there is a toilet near the sink.
****************************************************************************************************
No, the woman is young but caucasian.
No, the woman is young but blond.
****************************************************************************************************
Yes, the olive is on the pizza.
Yes, the black veggie is on top of the peppers.
**********************************************************************************