In [1]:
import sys
import json
import os
sys.path.append( '/home/lodes/uni/3.semester/project/LLaVA-3D/open3dsg' )
from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration, BitsAndBytesConfig
import torch
import numpy as np
import gc
from const import CONF_PATH_R3SCAN_RAW, CONF_PATH_R3SCAN_PROCESSED
from open_dataset import Open2D3DSGDataset
import random
from tqdm import tqdm

2025-01-19 11:52:03.362312: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_scan(base_path, file_path):
    """Load the ``"scans"`` entry of a JSON relationships file.

    Args:
        base_path: Directory that contains the file.
        file_path: File name (or relative path) below ``base_path``.

    Returns:
        The value stored under the top-level ``"scans"`` key of the JSON document.

    Raises:
        FileNotFoundError: If the joined path does not exist.
        KeyError: If the document has no ``"scans"`` key.
    """
    # Context manager so the handle is closed deterministically instead of
    # being left to the garbage collector.
    with open(os.path.join(base_path, file_path)) as scan_file:
        return json.load(scan_file)["scans"]

def get_queries(data_dict, classes_path=None):
    """Build one relationship prompt per ground-truth edge of a scene.

    Args:
        data_dict: Mapping that provides ``'objects_count'`` and
            ``'predicate_count'`` (objects exposing ``.item()``),
            ``'objects_cat'`` (per-object class indices) and ``'edges'``
            (rows of object-index pairs). Presumably one sample of the
            dataset -- verify against the caller.
        classes_path: Optional path to the class-name list, one name per line.
            Defaults to ``classes.txt`` inside ``CONF_PATH_R3SCAN_RAW``.

    Returns:
        A list of prompt strings, one per edge, in edge order. When both
        endpoints share a class name the second one is phrased as
        "the other <name>".
    """
    if classes_path is None:
        classes_path = os.path.join(CONF_PATH_R3SCAN_RAW, "classes.txt")
    # Close the class list explicitly instead of leaking the handle.
    with open(classes_path, "r") as classes_file:
        obj_class_dict = [line.rstrip() for line in classes_file]

    # Both counters are used as slice bounds, so force them to plain ints.
    obj_count = int(data_dict['objects_count'].item())
    rel_count = int(data_dict["predicate_count"].item())
    objects_gt = data_dict['objects_cat']
    edges = data_dict['edges'][:rel_count]

    # Edge endpoints (object indices) -> class indices -> class names.
    object_edges = np.array(objects_gt[:obj_count][edges], dtype=np.int32)
    object_edges = np.array(obj_class_dict)[object_edges]
    # 'socket' is queried as 'wall'.
    object_edges[object_edges == 'socket'] = 'wall'

    # A single template covers both cases; only the "other " qualifier
    # depends on whether the two class names coincide.
    return [
        f"Describe the relationship between the {pair[0]} and the "
        f"{'other ' if pair[0] == pair[1] else ''}{pair[1]}. "
        f"Start the response with: the {pair[0]}"
        for pair in object_edges
    ]

def is_black_image(image_array):
    """
    Tell whether every pixel of ``image_array`` equals 0.

    Returns the result of ``np.all`` over the element-wise comparison with
    zero, so an array without any elements also counts as black.
    """
    zero_mask = image_array == 0
    return np.all(zero_mask)

In [3]:
# Restrict the training relationships to a single scan and build the dataset on it.
scan_id = '754e884c-ea24-2175-8b34-cead19d4198d'
all_scans = load_scan(CONF_PATH_R3SCAN_RAW, "relationships_train.json")

# Filter into a new list instead of rebinding the list that is being iterated.
matching_scans = [entry for entry in all_scans if entry['scan'] == scan_id]
if not matching_scans:
    # Fail loudly: otherwise the dataset would silently be built over every
    # training scan and `dataset[0]` would be an unrelated scene.
    raise ValueError(f"scan {scan_id!r} not found in relationships_train.json")
# Keep only the last matching entry, as the original loop (no `break`) did.
D3SSG = matching_scans[-1:]

dataset = Open2D3DSGDataset(
    relationships_R3SCAN=D3SSG,
    relationships_scannet=None,
    openseg=False,
    img_dim=224,
    rel_img_dim=224,
    top_k_frames=5,
    scales=3,
    mini=False,
    load_features=None,
    blip=True,
    llava=False,
    half=False,
    max_objects=9,
    max_rels=72
)
data_dict = dataset[0]

  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Keyword arguments for loading the model later: device placement plus a
# 4-bit NF4 quantisation config with float16 compute and double quantisation.
kwargs = {"device_map": 0}
kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
cache_dir = '../models/BLIP'
# Use the same cache directory as the model load so processor and model files
# of this checkpoint end up in one place.
processor = InstructBlipVideoProcessor.from_pretrained(
    "Salesforce/instructblip-vicuna-7b",
    cache_dir=cache_dir,
)

In [5]:
# All-zero uint8 frame used as padding when a relation has too few images.
# NOTE(review): the shape is hard-coded; presumably it matches the frames in
# data_dict['blip_images'] -- confirm.
image_shape = (960, 540, 3)
black_image = np.zeros(shape=image_shape, dtype=np.uint8)

# Per-relation image collections taken from the dataset sample.
rel_images = data_dict['blip_images']

In [6]:
# Number of frames handed to the model per relation.
num_frames = 4
# Dedicated, seeded generator: the sampled frames are identical on every
# run and the global `random` state is left untouched.
frame_rng = random.Random(0)

processed_images = []
for image_list in rel_images:
    # Copy into a real list so padding is a concatenation whatever sequence
    # type the dataset returns.
    frames = list(image_list)

    # Pad with black images if there are fewer than `num_frames` frames
    if len(frames) < num_frames:
        frames += [black_image] * (num_frames - len(frames))

    # Select a random subset of `num_frames` frames
    random_subset = [np.array(frame) for frame in frame_rng.sample(frames, num_frames)]

    # Check if all images in the subset are black
    if all(is_black_image(image) for image in random_subset):
        # Empty array marks "no usable frames" for this relation
        processed_images.append(np.empty((0,)))
    else:
        # np.stack needs equally shaped frames -- assumes every frame matches
        # the shape of `black_image`; TODO confirm.
        processed_images.append(np.stack(random_subset))

In [7]:
# One text prompt per relation of the loaded sample.
queries = get_queries(data_dict)

# Load the checkpoint with the loading options (kwargs) and cache directory
# defined in an earlier cell.
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
    "Salesforce/instructblip-vicuna-7b",
    cache_dir=cache_dir,
    **kwargs,
)

You are using a model of type instructblip to instantiate a model of type instructblipvideo. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Generate one description per relation and checkpoint the results to JSON.
results = []
results_path = "./blip_relationships/results.json"
# Create the output directory up front so the first write cannot fail after
# a long generation step.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
assert len(queries) == len(processed_images), (
    f"{len(queries)} queries vs {len(processed_images)} image sets"
)
# Wrap the list (not the enumerate object) so tqdm knows the total.
for idx, image_set in enumerate(tqdm(processed_images)):
    prompt = queries[idx]
    # Relations without usable frames are stored as an empty array, and
    # np.all over an empty comparison is True, so they take this branch.
    if np.all(image_set == 0):
        results.append("No relationship")
    else:
        inputs = processor(text=prompt, images=image_set, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            do_sample=False,
            num_beams=5,
            max_length=256,
            repetition_penalty=1.5,
            length_penalty=1.0,
        )
        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        results.append(generated_text)
    # Rewrite the file after every relation so partial results survive an
    # interrupted run.
    with open(results_path, "w") as json_file:
        json.dump(results, json_file, indent=4)

56it [23:26, 25.12s/it]


In [25]:
# Release the model's GPU memory. Order matters: the reference has to be
# dropped and collected *before* the CUDA caching allocator is flushed,
# otherwise the weights are still alive and nothing is returned.
try:
    del model
except NameError:
    # Already deleted (cell re-run) -- nothing to drop.
    pass
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()