In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import re
from multObjGenContext import *
from multObjGenFunctions import *
import time
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def log_vram_usage(stage):
    """Print the current CUDA memory counters, tagged with ``stage``.

    Reports ``torch.cuda.memory_allocated()`` and
    ``torch.cuda.memory_reserved()`` for the current device, each divided by
    1024**3 and shown with two decimals.

    Args:
        stage: Label placed in square brackets at the start of the line.
    """
    bytes_per_gib = 1024 ** 3
    allocated = torch.cuda.memory_allocated() / bytes_per_gib
    reserved = torch.cuda.memory_reserved() / bytes_per_gib
    print(f"[{stage}] VRAM Usage - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")

def log_peak_vram_usage(stage):
    """Print the peak allocated CUDA memory, tagged with ``stage``.

    Reads ``torch.cuda.max_memory_allocated()`` for the current device,
    divides it by 1024**3 and prints it with two decimals. The line is
    preceded by a blank line.

    Args:
        stage: Label inserted into the ``[Peak ... usage]`` prefix.
    """
    bytes_per_gib = 1024 ** 3
    peak = torch.cuda.max_memory_allocated() / bytes_per_gib
    print(f"\n[Peak {stage} usage] VRAM usage - Peak: {peak:.2f} GB")

def clear_memory():
    """Release unreferenced Python objects and cached CUDA memory.

    Order matters:

    1. ``gc.collect()`` runs first so objects that are only kept alive by
       reference cycles are destroyed and their tensors are handed back to
       PyTorch's caching allocator.
    2. ``torch.cuda.empty_cache()`` then returns the now-unused cached blocks
       to the driver, and ``torch.cuda.ipc_collect()`` releases memory that
       was shared through CUDA IPC.
    3. ``torch.cuda.reset_peak_memory_stats()`` runs last so the next peak
       measurement starts from the cleaned-up state.

    When CUDA is not available only the Python garbage collection is
    performed; the CUDA calls are skipped.
    """
    gc.collect()

    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

In [3]:
# ---------------------------------------------------------------------------
# Evaluation configuration
# ---------------------------------------------------------------------------

# Text file that receives the per-model results.
output_file = "evaluation_results.txt"

# Hugging Face model ids to evaluate. Only uncommented entries are run;
# the remaining ids are kept here so they can be re-enabled quickly.
model_names = [
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    # "TheBloke/deepseek-coder-1.3b-instruct-GPTQ",
    # "TechxGenus/gemma-2b-GPTQ",
    # "TheBloke/stable-code-3b-GPTQ",
    # "TheBloke/phi-2-GPTQ",
    # "TheBloke/phi-2-orange-GPTQ",
    # "TheBloke/deepseek-llm-7B-base-GPTQ",
    # "TheBloke/llama-deus-7b-v3-GPTQ",
    # "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ",
    # "TheBloke/Llama-2-7b-Chat-GPTQ",
    # "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ",
    # "TechxGenus/Meta-Llama-3-8B-GPTQ",
    # "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit",
    # "TheBloke/LLaMA-Pro-8B-GPTQ",
]

# Stage 1 inputs: one free-text request per case.
test_cases_object_list = [
    "Complete a kitchen setup",
    "Complete batman's utility belt",
    "Complete american car set",
    "Complete a medieval weapons set",
    "Complete a bedroom setup",
    "Complete a fruit bowl",
    "Complete a pirate ship deck",
    "Complete a modern living room setup",
    "Complete a medieval marketplace",
    "Complete a science lab workspace",
]

# Stage 2 inputs: one list of four object names per case.
test_cases_spatial = [
    ["keyboard", "monitor", "mouse", "pc"],
    ["guitar", "amplifier", "microphone", "music stand"],
    ["pan", "stove", "spatula", "cutting board"],
    ["helmet", "armor", "sword", "shield"],
    ["camera", "tripod", "lens", "flash"],
    ["tree", "bench", "fountain", "lamp post"],
    ["basketball", "hoop", "scoreboard", "bleachers"],
    ["train", "track", "station", "ticket booth"],
    ["painting", "easel", "paintbrush", "palette"],
    ["fish tank", "filter", "air pump", "plants"],
]

# Stage 3 inputs: three "<object> <relation> <object>" strings per case, using
# the relation keywords infrontof / totherightof / totheleftof.
# NOTE(review): the "tree"/"bench" row and the "fish tank"/"filter" row relate
# the same pair of objects twice (infrontof and totherightof) and never mention
# the fourth object of the matching stage 2 row ("lamp post", "plants").
# Confirm whether that is intentional before relying on those two cases.
test_cases_coordinates = [
    ["keyboard infrontof monitor", "mouse totherightof keyboard", "pc totheleftof monitor"],
    ["guitar infrontof amplifier", "microphone totherightof guitar", "music stand totheleftof guitar"],
    ["pan infrontof stove", "spatula totherightof stove", "cutting board totheleftof stove"],
    ["helmet infrontof armor", "sword totherightof helmet", "shield totheleftof armor"],
    ["camera infrontof tripod", "lens totherightof camera", "flash totheleftof camera"],
    ["tree infrontof bench", "bench totherightof tree", "fountain totheleftof bench"],
    ["basketball infrontof hoop", "hoop totherightof scoreboard", "scoreboard totheleftof bleachers"],
    ["train infrontof station", "track totherightof train", "ticket booth totheleftof station"],
    ["painting infrontof easel", "paintbrush totherightof painting", "palette totheleftof easel"],
    ["fish tank infrontof filter", "filter totherightof fish tank", "air pump totheleftof filter"],
]

In [4]:
# Seed applied before each model's test run. Generation uses do_sample=True,
# so without a seed two runs of this cell are not comparable.
# NOTE(review): quantised GPU kernels may still introduce small run-to-run
# differences; the seed only fixes the sampling RNG.
EVAL_SEED = 42


def _run_suite(title, header, input_label, assistant, cases, report):
    """Run one assistant over its test cases and record the outcome.

    Prints a start line, appends ``header`` to ``report``, calls
    ``assistant.process_request(case)`` for every entry of ``cases`` and
    appends one formatted entry per case, then prints the elapsed time.

    Args:
        title: Human-readable assistant name used in the console messages.
        header: Section header line written to the report.
        input_label: Label that precedes each case in the report
            (e.g. ``"Prompt"``).
        assistant: Object exposing ``process_request(case)``.
        cases: Iterable of inputs passed one by one to the assistant.
        report: List of report lines; extended in place.
    """
    print(f"-Testing {title}")
    report.append(header)

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during a long run.
    started = time.perf_counter()
    for case in cases:
        outcome = assistant.process_request(case)
        report.append(f"{input_label}: {case}\nResult: {outcome}\n")
    elapsed = time.perf_counter() - started

    print(f"{title} finished in {elapsed:.2f} seconds\n")


def _evaluate_model(pipe, model_name):
    """Run the three assistant suites on ``pipe`` and return the report lines.

    The assistants are created inside this function on purpose: they are
    constructed with ``pipe`` and later called without it, so they presumably
    keep a reference to it. Keeping them local means those references vanish
    when the function returns, and the caller can actually release the model.

    Args:
        pipe: Text-generation pipeline shared by all three assistants.
        model_name: Model id written into the report header.

    Returns:
        list[str]: Report lines for this model, without the closing separator.
    """
    object_list_assistant = ObjectListAssistant(pipe, objlist_context)
    spatial_assistant = RelationalMappingAssistant(pipe, relational_context)
    coordinate_assistant = GridPlacementAssistant(pipe, grid_context)

    report = [f"========== Model: {model_name} ==========\n"]
    _run_suite("Object List Assistant", "object_list_test:\n", "Prompt",
               object_list_assistant, test_cases_object_list, report)
    _run_suite("Relational Mapping Assistant", "\nrelational_test:\n", "Objects",
               spatial_assistant, test_cases_spatial, report)
    _run_suite("Grid Placement Assistant", "\ngrid_test:\n", "Relations",
               coordinate_assistant, test_cases_coordinates, report)
    return report


# Start every run of this cell with an empty results file. The encoding is
# explicit because model output may contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open(output_file, "w", encoding="utf-8"):
    pass

for model_name in model_names:
    print(f"Testing model: {model_name}")
    clear_memory()
    log_vram_usage(f"Before {model_name} load")

    # Pre-bind the names so the cleanup in `finally` works even when loading
    # fails part-way through.
    model = tokenizer = pipe = None
    try:
        # SECURITY: trust_remote_code=True executes Python code shipped in the
        # model repository; it is required by some models, so only list
        # repositories you trust in `model_names`. revision="main" follows the
        # branch head and therefore does not pin a specific commit.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cuda",
            trust_remote_code=True,
            revision="main",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

        log_vram_usage("After model load")

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repetition_penalty=1.1
        )

        # Same starting RNG state for every model.
        torch.manual_seed(EVAL_SEED)
        model_results = _evaluate_model(pipe, model_name)

        log_vram_usage("After all tests")

        # The peak counter was reset by clear_memory() before loading, so this
        # value covers model load plus all three suites.
        peak_memory = torch.cuda.max_memory_allocated() / 1024**3
        peak_log = f"[Peak VRAM] {model_name}: {peak_memory:.2f} GB\n"
        print(peak_log)

        model_results.append("\n============================================\n\n")

        with open(output_file, "a", encoding="utf-8") as f:
            print("Writing...")
            f.writelines(model_results)
    finally:
        # Drop every reference to the model before clearing the CUDA cache,
        # also when a step above raised, so the next model (or the next run of
        # this cell) does not share VRAM with this one.
        del model, tokenizer, pipe
        clear_memory()

    print(f"Finished testing {model_name}\n")

Testing model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ
[Before TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ load] VRAM Usage - Allocated: 0.00 GB, Reserved: 0.00 GB


CUDA extension not installed.
CUDA extension not installed.
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


[After model load] VRAM Usage - Allocated: 0.73 GB, Reserved: 0.78 GB
-Testing Object List Assistant


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Object List Assistant finished in 43.34 seconds

-Testing Relational Mapping Assistant
Relational Mapping Assistant finished in 38.40 seconds

-Testing Grid Placement Assistant
Grid Placement Assistant finished in 31.78 seconds

[After all tests] VRAM Usage - Allocated: 0.74 GB, Reserved: 1.09 GB
[Peak VRAM] TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ: 0.93 GB

Writing...
Finished testing TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ

