In [None]:
from autogen.agentchat.contrib.llava_agent import LLaVAAgent  
from taco.agent import UserAgent
from taco.prompt import DirectAnswerPrompt, CoTAPrompt, FeedbackPrompt
from taco.parser import Parser
from taco.executor import Executor
from taco.data import AgentDataset, format_query
from taco.action import *

[2024-12-19 05:39:50,701] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

KeyError: 'OPENAI_API_KEY'

In [None]:
from taco.config import *

In [None]:
def checks_terminate_message(msg):
    """Tell whether a chat message contains the literal text "Terminate".

    Args:
        msg: Either the raw message text (``str``) or a message ``dict``
            that carries a ``'content'`` entry. The content may be a string,
            ``None`` (treated as "no termination request"), or a list/tuple
            of parts where each part is a string or a dict with a ``'text'``
            entry (non-text parts are ignored).

    Returns:
        bool: ``True`` when the case-sensitive substring "Terminate" occurs
        in the message text, ``False`` otherwise.

    Raises:
        NotImplementedError: If ``msg`` is neither a ``str`` nor a ``dict``
            with a ``'content'`` key, or if the content has an unsupported
            type.
    """
    if isinstance(msg, str):
        content = msg
    elif isinstance(msg, dict) and 'content' in msg:
        content = msg['content']
    else:
        raise NotImplementedError

    # A message without textual content cannot ask for termination; calling
    # .find() on None used to crash here with AttributeError.
    if content is None:
        return False
    if isinstance(content, str):
        return "Terminate" in content
    if isinstance(content, (list, tuple)):
        # Multi-part content: only look at the textual parts.
        for part in content:
            if isinstance(part, dict):
                part = part.get('text')
            if isinstance(part, str) and "Terminate" in part:
                return True
        return False
    raise NotImplementedError

In [None]:
# Set up the input and result directories for action execution
dataset = "example"
model = "agentstudio-family/taco-mantis-llama3-siglip-8b"
run_id = "example_run"
full_input_path = os.path.join(INPUT_IMAGE_PATH, dataset.lower())
# The model id is used as a path component as-is, so its "/" adds one more
# directory level under RESULT_PATH on POSIX systems.
full_result_path = os.path.join(RESULT_PATH, model, dataset.lower(), run_id)
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate os.path.exists() check is redundant (and racy between check and create).
os.makedirs(full_result_path, exist_ok=True)

In [None]:
# Executor wired to the input/result directories prepared in the previous cell.
executor = Executor(
    input_folder=full_input_path,
    result_folder=full_result_path,
)

# Actions made available to the agent. The list order is kept as-is because
# the action names are later joined, in this order, into the system prompt.
actions = [
    OCR(),
    LocalizeObjects(),
    GetObjects(),
    EstimateRegionDepth(),
    EstimateObjectDepth(),
    Crop(),
    ZoomIn(),
    QueryLanguageModel(),
    GetImageToImagesSimilarity(),
    GetImageToTextsSimilarity(),
    GetTextToImagesSimilarity(),
    DetectFaces(),
    QueryKnowledgeBase(),
    Calculate(),
    SolveMathEquation(),
    Terminate(),
]

# The prompt generator is built from the action list; the parser is built
# from that prompt generator; the feedback prompt takes no arguments.
prompt_generator = CoTAPrompt(actions=actions)
parser = Parser(prompt_generator=prompt_generator)
feedback_generator = FeedbackPrompt()

# User-side agent: never prompts a human, stops after at most 10 consecutive
# automatic replies or when checks_terminate_message() flags a message.
user = UserAgent(
    name="user_agent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=checks_terminate_message,
    prompt_generator=prompt_generator,
    feedback_generator=feedback_generator,
    parser=parser,
    executor=executor,
    code_execution_config=dict(use_docker=False),
)

# Names of every configured action, in list order, and their comma-separated form.
all_action_names = [tool.name for tool in actions]
all_actions_str = ", ".join(all_action_names)

# System prompt for the multimodal agent; the adjacent literals concatenate
# into one single-line string that ends with the list of action names.
system_prompt = (
    "[BEGIN OF GOAL] You are a helpful assistant, and your goal is to solve the # USER REQUEST #. "
    "You can either rely on your own capabilities or perform actions with external tools to help you. "
    f"You can use these actions: {all_actions_str} [END OF GOAL]"
)
# system_prompt = prompt_generator.get_task_prompt_only()



In [None]:
# Choose the LLaVAAgent backend ("llava_mode") and the checkpoint location
# from the model identifier set in the configuration cell.
if "mllava" in model or "mantis" in model.lower():
    # Mantis
    llava_mode = "mantis-hf"
    # "siglip" itself contains the substring "clip", so a plain search for
    # "clip" sent SigLIP models to the CLIP checkpoint directory. Drop
    # "siglip" from the name before looking for a CLIP marker.
    if "clip" in model.lower().replace("siglip", ""):
        base_path = os.path.join(MANTIS_CKPT_PATH, "Mantis-8B-clip-llama3-pretraind")
    else:
        # SigLIP is also the fallback when the name mentions neither backbone.
        base_path = os.path.join(MANTIS_CKPT_PATH, "Mantis-8B-siglip-llama3-pretraind")
    model_name = model
    # The "/" in the model id is flattened to "_" for the on-disk directory name.
    model_path = os.path.join(base_path, model_name.replace("/", "_"), "checkpoint-final")
else:
    # LLAVA
    llava_mode = "llava-ov-hf"
    # append "_qwen" to the model name to make sure the qwen-based llava models are used in the llava codebase
    model_name = f"{model}_qwen"
    model_path = os.path.join(LLAVA_OV_CKPT_PATH, model)

In [None]:
# Single model entry built from the names resolved in the previous cell.
config_list = [
    dict(
        model=model_name,
        model_path=model_path,
        api_key="None",
        base_url="",
        llava_mode=llava_mode,
    ),
]

# LLM configuration handed to the agent: fixed seed, caching seed unset,
# temperature 0 with sampling disabled, first CUDA device, 2000 new tokens max.
default_config = dict(
    seed=42,
    config_list=config_list,
    cache_seed=None,
    temperature=0,
    do_sample=False,
    device_id="cuda:0",
    max_new_tokens=2000,
)

In [15]:
 # Multimodal agent that answers the user agent. The recorded output of this
 # cell shows checkpoint shards being loaded when it is constructed.
 # NOTE(review): the name is spelled "mm_agnet" and shows up in the chat log
 # headers; kept unchanged here.
 mm_agent = LLaVAAgent(
     name="mm_agnet",
     system_message=system_prompt,
     human_input_mode="NEVER",
     max_consecutive_auto_reply=10,
     llm_config=default_config,
 )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 39.39 GiB of which 2.88 MiB is free. Process 14734 has 0 bytes memory in use. Process 18739 has 0 bytes memory in use. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 20.60 GiB is allocated by PyTorch, and 108.57 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# One sample task: an index, the question text and the list of image paths.
example = dict(
    index=0,
    question="How many gallons of supreme gasoline can I get with $50?",
    images=["examples/gasoline.png"],
)

In [None]:
# Combine the example's images and question into the message sent to the agent.
query = example["question"]
formatted_query = format_query(example["images"], query)

# Start the conversation from the user agent; the example is passed along as `task`.
user.initiate_chat(mm_agent, message=formatted_query, task=example)

# Keep a handle on the multimodal agent's recorded chat messages.
all_messages = mm_agent.chat_messages

[33muser_agent[0m (to mm_agnet):


# USER REQUEST #:
image-0: <image>
How many gallons of supreme gasoline can I get with $50?

Now please generate your response:


--------------------------------------------------------------------------------
Error: CUDA out of memory. Tried to allocate 1.29 GiB. GPU 0 has a total capacty of 39.39 GiB of which 368.88 MiB is free. Process 14734 has 0 bytes memory in use. Process 18739 has 0 bytes memory in use. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 19.30 GiB is allocated by PyTorch, and 1.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


TypeError: object of type 'NoneType' has no len()

In [None]:
# Dump the chat messages taken from mm_agent.chat_messages in the previous cell.
print(all_messages)