In [1]:
import numpy as np
import glob
import time
import os 
import cv2
import ollama
import re
import json
from google import genai
from pydantic import BaseModel
import torch 
import torch.nn as  nn 
import requests
from PIL import Image

from transformers import (
    AutoModelForCausalLM,
    AutoModelForMaskGeneration,
    AutoModelForZeroShotObjectDetection,
    AutoProcessor,
    GenerationConfig,
)
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from transformers import SamModel, SamProcessor
from scipy.stats import entropy
from torch import Tensor
from torch.nn.functional import cosine_similarity

try:
    from sklearn.cluster import DBSCAN
    from sklearn.neighbors import NearestNeighbors
    SKLEARN_AVAILABLE = True
except Exception:
    SKLEARN_AVAILABLE = False

In [2]:
# -------------------------
# Object Node
# -------------------------
class ObjectNode:
    def __init__(self, obj_id, points, feature, mask, object_name, frame_path=None, seen=1, caption=None, bbox=None, extra=None, parent=None):
        self.id = obj_id
        self.points = points
        self.feature = feature
        self.seen = seen
        self.caption = caption                # define whether the node is parent or child 
        self.parent = parent                  # define the parent node name 
        self.mask = mask                      # mask of the object on the image
        self.name = object_name               # name of the node
        self.frame_path = frame_path          # this is the image path containing these objects for uniqueness 
        self.bbox = bbox
        self.extra = extra if extra is not None else {}
        self.update_bbox()

    def update_bbox(self):
        if self.points.size == 0:
            self.bbox = None
            return
        mn = self.points.min(axis=0)
        mx = self.points.max(axis=0)
        self.bbox = (mn, mx)


In [3]:
# -------------------------
# Scene Graph
# -------------------------
class SceneGraph:
    def __init__(self):
        self.objects = {}
        self.edges = []

    def add_object(self, node):
        self.objects[node.id] = node

    def add_edge(self, a, b, relation):
        self.edges.append((a, b, relation))


In [4]:
# -------------------------
# Camera helpers
# -------------------------
def backproject_mask_to_points(depth, mask, K, max_depth=10.0):
    assert depth.ndim == 2 and mask.shape == depth.shape, "mask/depth shape mismatch"
    fy, fx, cx, cy = K[1, 1], K[0, 0], K[0, 2], K[1, 2]
    ys, xs = np.where(mask)
    zs = depth[ys, xs]
    valid = np.isfinite(zs) & (zs > 0) & (zs < max_depth)
    xs, ys, zs = xs[valid], ys[valid], zs[valid]
    X = (xs - cx) * zs / fx
    Y = (ys - cy) * zs / fy
    pts = np.stack([X, Y, zs], axis=1)
    return pts

def transform_points(points_cam, T_cam2world):
    if points_cam.size == 0:
        return points_cam
    pts_h = np.hstack([points_cam, np.ones((points_cam.shape[0], 1))])
    pts_w = (T_cam2world @ pts_h.T).T[:, :3]
    return pts_w



In [5]:
# -------------------------
# Denoising & Downsampling
# -------------------------
def dbscan_largest_cluster(points, eps=0.03, min_samples=20):
    if points.size == 0:
        return points
    if SKLEARN_AVAILABLE:
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(points)
        labels = db.labels_
        if len(labels) == 0 or np.all(labels == -1):
            return points
        unique, counts = np.unique(labels[labels >= 0], return_counts=True)
        if unique.size == 0:
            return points
        largest = unique[np.argmax(counts)]
        return points[labels == largest]
    return points

def voxel_downsample(points, voxel_size=0.02):
    if points.size == 0:
        return points
    vox = np.floor(points / voxel_size)
    _, idx = np.unique(vox, axis=0, return_index=True)
    return points[idx]



In [6]:
# -------------------------
# Features
# -------------------------
def l2_normalize(x, eps=1e-8):
    n = np.linalg.norm(x) + eps
    return x / n

def fallback_feature(rgb, mask, pts_world):
    sel = rgb[mask]
    if sel.size == 0:
        color_feat = np.zeros(6, dtype=np.float32)
    else:
        mean = sel.reshape(-1, 3).mean(axis=0)
        std = sel.reshape(-1, 3).std(axis=0)
        color_feat = np.hstack([mean, std]).astype(np.float32)
    if pts_world.shape[0] >= 3:
        mu = pts_world.mean(axis=0, keepdims=True)
        C = np.cov((pts_world - mu).T)
        vals = np.sqrt(np.clip(np.linalg.eigvalsh(C), 0, None))
    else:
        vals = np.zeros(3, dtype=np.float32)
    size = np.array([float(pts_world.shape[0])], dtype=np.float32)
    feat = np.hstack([color_feat, vals, size]).astype(np.float32)
    return l2_normalize(feat)



In [7]:
# -------------------------
# Similarity
# -------------------------
def cosine_sim(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def nnratio(new_pts, obj_pts, radius=0.05):
    if new_pts.size == 0 or obj_pts.size == 0:
        return 0.0
    if SKLEARN_AVAILABLE:
        nbrs = NearestNeighbors(radius=radius, algorithm='kd_tree').fit(obj_pts)
        ind = nbrs.radius_neighbors(new_pts, radius=radius, return_distance=False)
        hits = sum(len(ix) > 0 for ix in ind)
        return hits / len(new_pts)
    diffs = new_pts[:, None, :] - obj_pts[None, :, :]
    d2 = np.sum(diffs * diffs, axis=2)
    hits = np.any(d2 <= radius * radius, axis=1).sum()
    return float(hits) / float(new_pts.shape[0])


In [8]:
# -------------------------
# Assoc Params & Greedy
# -------------------------
class AssocParams:
    def __init__(self, delta_nn=0.05, delta_sim=0.7, w_sem=1.0, w_geo=1.0):
        self.delta_nn = delta_nn
        self.delta_sim = delta_sim
        self.w_sem = w_sem
        self.w_geo = w_geo

class GreedyAssociator:
    def __init__(self, params):
        self.params = params

    def match(self, detections, graph): 
        #print("graphs = ", graph.objects, graph.edges)
        results = []
        for i, (pts_i, feat_i, meta_i, parent, object_name, frame_path) in enumerate(detections): 
            best_obj, best_score = None, -1e9
            for obj in graph.objects.values(): 
                #print("obj = ", obj.points)
                geo = nnratio(pts_i, obj.points, radius=self.params.delta_nn)
                sem = cosine_sim(feat_i, obj.feature)
                sem_n = 0.5 * (sem + 1.0)
                score = self.params.w_geo * geo + self.params.w_sem * sem_n
                if score > best_score:
                    best_score = score
                    best_obj = obj.id
            if best_score >= self.params.delta_sim:
                results.append((i, best_obj, meta_i, parent, object_name, frame_path))
            else:
                results.append((i, None, meta_i, parent, object_name, frame_path))
        return results



In [9]:
# -------------------------
# Fusion & Caption hooks
# -------------------------
def fuse_object(existing, new_points, new_feature, voxel_size=0.02):
    fused_feat = (existing.seen * existing.feature + new_feature) / (existing.seen + 1)
    fused_feat = l2_normalize(fused_feat)
    fused_pts = np.vstack([existing.points, new_points])
    fused_pts = voxel_downsample(fused_pts, voxel_size=voxel_size)
    existing.points = fused_pts
    existing.feature = fused_feat
    existing.seen += 1
    existing.update_bbox()
    return existing

def caption_with_lvml(rgb, mask):
    return None



In [10]:
# -------------------------
# Relation inference
# -------------------------
def infer_relations_spatial(graph, dist_thresh=0.2, height_eps=0.05):
    ids = list(graph.objects.keys())
    centers, bboxes = {}, {}
    for oid in ids:
        obj = graph.objects[oid]
        if obj.bbox is None:
            obj.update_bbox()
        bboxes[oid] = obj.bbox
        mn, mx = obj.bbox if obj.bbox is not None else (np.zeros(3), np.zeros(3))
        centers[oid] = 0.5 * (mn + mx)
    graph.edges = []
    for i in range(len(ids)):
        for j in range(i + 1, len(ids)):
            a, b = ids[i], ids[j] 
            name_a, name_b = graph.objects[a].name, graph.objects[b].name 
            parent_a, parent_b = graph.objects[a].parent, graph.objects[b].parent 

            if parent_a == None and parent_b == None:
                graph.add_edge(a, b, "No Relation")
            else:
                if parent_a != None and parent_b == None and parent_a == name_b:
                    graph.add_edge(a, b, "a is child of b")
                if parent_b != None and parent_a == None and parent_b == name_a:
                    graph_add_edge(a, b, "b is child of a")


In [11]:
# -------------------------
# Mapper Config & Mapper
# -------------------------
class MapperConfig:
    def __init__(self, max_depth_m=10.0, dbscan_eps_m=0.03, dbscan_min_samples=20,
                 voxel_size_m=0.02, assoc=None, use_clip_dino=False):
        self.max_depth_m = max_depth_m
        self.dbscan_eps_m = dbscan_eps_m
        self.dbscan_min_samples = dbscan_min_samples
        self.voxel_size_m = voxel_size_m
        self.assoc = assoc if assoc else AssocParams()
        self.use_clip_dino = use_clip_dino


In [12]:
class ConceptGraphsMapper:
    def __init__(self, K, cfg=None):
        self.graph = SceneGraph()
        self.cfg = cfg if cfg else MapperConfig()
        self.K = K.astype(np.float32)
        self._next_id = 0
        self._associator = GreedyAssociator(self.cfg.assoc)

    def _new_id(self):
        nid = self._next_id
        self._next_id += 1
        return nid

    def process_frame(self, rgb, depth, T_cam2world, masks, parent_list, object_name, frame_path, caption):
        detections = []
        for i  in range(len(masks)):
            m = masks[i]
            pts_cam = backproject_mask_to_points(depth, m, self.K, self.cfg.max_depth_m)
            if pts_cam.size == 0:
                continue
            pts_cam = dbscan_largest_cluster(pts_cam, eps=self.cfg.dbscan_eps_m,
                                             min_samples=self.cfg.dbscan_min_samples)
            pts_w = transform_points(pts_cam, T_cam2world)
            pts_w = voxel_downsample(pts_w, voxel_size=self.cfg.voxel_size_m)
            feat = fallback_feature(rgb, m, pts_w)
            detections.append((pts_w, feat, {"mask": m}, parent_list[i], object_name[i], frame_path))

        #print("graph=", self.graph.objects, self.graph.edges)
        #print("associator=",self._associator.params.delta_nn)
        matches = self._associator.match(detections, self.graph)
        for (det_idx, obj_id, meta, parent, object_name, frame_path) in matches:
            pts_w, feat, _, _, _, _ = detections[det_idx]
            if obj_id is None:
                nid = self._new_id()
                node = ObjectNode(nid, pts_w, feat, meta, object_name)
                cap = caption
                node.caption = cap
                node.parent = parent
                node.frame_path = frame_path
                self.graph.add_object(node)
                #print(" first frame graph=", self.graph.objects, self.graph.edges)
            else:
                node = self.graph.objects[obj_id]
                node.frame_path = frame_path
                fuse_object(node, pts_w, feat, voxel_size=self.cfg.voxel_size_m) 
                #print(" After first frame graph=", self.graph.objects, self.graph.edges)

        infer_relations_spatial(self.graph) 
        #print(" Relation =", self.graph.edges)
        return self.graph



In [13]:
rgb_path = "/media/parvez/One_Touch/Scenefun3D/train_val_set/420673/42445198/hires_wide" 
depth_path = "/media/parvez/One_Touch/Scenefun3D/train_val_set/420673/42445198/hires_depth" 
traj_path = "/media/parvez/One_Touch/Scenefun3D/train_val_set/420673/42445198/hires_poses.traj"
intrinsic_files_path = "/media/parvez/One_Touch/Scenefun3D/train_val_set/420673/42445198/hires_wide_intrinsics"

In [14]:
def convert_angle_axis_to_matrix3(angle_axis):
    """
    Converts a rotation from angle-axis representation to a 3x3 rotation matrix.

    Args:
        angle_axis (numpy.ndarray): A 3-element array representing the rotation in angle-axis form.

    Returns:
        (numpy.ndarray): A 3x3 rotation matrix representing the same rotation as the input angle-axis.

    Raises:
        ValueError: If the input is not a valid 3-element numpy array.
    """
    # Check if input is a numpy array
    if not isinstance(angle_axis, np.ndarray):
        raise ValueError("Input must be a numpy array.")
    
    # Check if the input is of shape (3,)
    if angle_axis.shape != (3,):
        raise ValueError("Input must be a 3-element array representing the rotation in angle-axis representation.")
    
    matrix, jacobian = cv2.Rodrigues(angle_axis)
    return matrix


In [15]:
def TrajStringToMatrix(traj_str):
        """ 
        Converts a line from the camera trajectory file into translation and rotation matrices.

        Args:
            traj_str (str): A space-delimited string where each line represents a camera pose at a particular timestamp. 
                            The line consists of seven columns:
                - Column 1: timestamp
                - Columns 2-4: rotation (axis-angle representation in radians)
                - Columns 5-7: translation (in meters)

        Returns:
            (tuple): A tuple containing:
                - ts (str): Timestamp.
                - Rt (numpy.ndarray): 4x4 transformation matrix representing rotation and translation.

        Raises:
            AssertionError: If the input string does not have exactly seven columns.
        """
        tokens = traj_str.split()
        assert len(tokens) == 7
        ts = tokens[0]

        # Rotation in angle axis
        angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])]
        r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis))

        # Translation
        t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])])
        extrinsics = np.eye(4, 4)
        extrinsics[:3, :3] = r_w_to_p
        extrinsics[:3, -1] = t_w_to_p
        Rt = np.linalg.inv(extrinsics)

        return (ts, Rt)

In [16]:
def get_camera_trajectory(traj_path):
        """
        Retrieve the camera trajectory from a file and convert it into a dictionary whose keys are timestamps and 
        values are the corresponding camera poses.

        Args:
            visit_id (str): The identifier of the scene.
            video_id (str): The identifier of the video sequence.
            pose_source (str, optional): Specifies the trajectory asset type, either "colmap" or "arkit". Defaults to "colmap".

        Returns:
            (dict): A dictionary where keys are timestamps (rounded to 3 decimal points) and values are 4x4 transformation matrices representing camera poses.

        Raises:
            AssertionError: If an unsupported trajectory asset type is provided.
        """
        #assert pose_source in ["colmap", "arkit"], f"Unknown option {pose_source}"

        #data_asset_identifier = "hires_poses" if pose_source == "colmap" else "lowres_poses"
        traj_file_path = traj_path
        #print("traj_file_path = ", traj_file_path)

        with open(traj_file_path) as f:
            traj = f.readlines()

        # Convert trajectory to a dictionary
        poses_from_traj = {}
        for line in traj:
            traj_timestamp = line.split(" ")[0] 


            poses_from_traj[f"{traj_timestamp}"] = np.array(TrajStringToMatrix(line)[1].tolist())
            

        return poses_from_traj

In [17]:
def get_camera_intrinsics(intrinsics_path):
        """
        Retrieve the camera intrinsics for a given scene and video sequence.

        Args:
            visit_id (str): The identifier of the scene.
            video_id (str): The identifier of the video sequence.
            data_asset_identifier (str, optional): The data asset type for camera intrinsics.
                                                   Can be either "hires_wide_intrinsics" or "lowres_wide_intrinsics". 
                                                   Defaults to "hires_wide_intrinsics".

        Returns:
            (dict): A dictionary mapping timestamps to file paths of camera intrinsics data.

        Raises:
            ValueError: If an unsupported data asset identifier is provided.
            FileNotFoundError: If no intrinsics files are found at the specified path.
        """
        intrinsics_mapping = {}
        

        intrinsics = sorted(glob.glob(os.path.join(intrinsics_path, "*.pincam")))
        

        intrinsics_timestamps = [os.path.basename(x).split(".pincam")[0].split("_")[1] for x in intrinsics]

        # Create mapping from timestamp to full path
        intrinsics_mapping = {timestamp: cur_intrinsics for timestamp, cur_intrinsics in zip(intrinsics_timestamps, intrinsics)}

        return intrinsics_mapping

In [18]:
def read_camera_intrinsics(intrinsics_file_path, format="tuple"):
        """
        Parses a file containing camera intrinsic parameters and returns them in the specified format.

        Args:
            intrinsics_file_path (str): The path to the file containing camera intrinsic parameters.
            format (str, optional): The format in which to return the camera intrinsic parameters.
                                    Supported formats are "tuple" and "matrix". Defaults to "tuple".

        Returns:
            (Union[tuple, numpy.ndarray]): Camera intrinsic parameters in the specified format.

                - If format is "tuple", returns a tuple \\(w, h, fx, fy, hw, hh\\).
                - If format is "matrix", returns a 3x3 numpy array representing the camera matrix.
        
        Raises:
            ValueError: If an unsupported format is specified.
        """
        w, h, fx, fy, hw, hh = np.loadtxt(intrinsics_file_path)

        if format == "tuple":
            return (w, h, fx, fy, hw, hh)
        elif format == "matrix":
            return np.asarray([[fx, 0, hw], [0, fy, hh], [0, 0, 1]])
        else:
            raise ValueError(f"Unknown format {format}")

In [19]:
rgb_files = sorted(os.listdir(rgb_path))
depth_files = sorted(os.listdir(depth_path))

In [20]:
#intrinsics_path = get_camera_intrinsics(intrinsic_file_path)
#print(intrinsics_path)
#width, height, _, _, _, _ = read_camera_intrinsics(intrinsics_path)
#poses_from_traj = get_camera_trajectory(traj_path)

In [21]:
class DataParser:
    """
    A class for parsing data files in the SceneFun3D dataset.
    """

    def __init__(self, root, split):
        """
        Initialize the DataParser instance with the root path.

        Args:
            data_root_path (str): The root path where data is located.
        """
        self.root = root
        self.split = split
        self.data_root_path = os.path.join(root, split)


    def get_data_asset_path(self, data_asset_identifier, visit_id, video_id=None):
        """
        Get the file path for a specified data asset.

        Args:
            data_asset_identifier (str): A string identifier for the data asset.
            visit_id (str or int): The identifier for the visit (scene).
            video_id (str or int, optional): The identifier for the video sequence. Required if specified data asset requires a video identifier.

        Returns:
            (Path): A Path object representing the file path to the specified data asset.

        Raises:
            AssertionError: If the `data_asset_identifier` is not valid or if `video_id` is required but not provided.
        """
        assert (
            data_asset_identifier in data_asset_to_path
        ), f"Data asset identifier '{data_asset_identifier}' is not valid"

        data_path = data_asset_to_path[data_asset_identifier]

        if ("<video_id>" in data_path) and (video_id is None):
            assert (
                False
            ), f"video_id must be specified for the data asset identifier '{data_asset_identifier}'"

        visit_id = str(visit_id)

        data_path = data_path.replace("<data_dir>", self.data_root_path).replace(
            "<visit_id>", visit_id
        )

        if "<video_id>" in data_path:
            video_id = str(video_id)
            data_path = data_path.replace("<video_id>", video_id)

        return data_path
    def get_descriptions(self, visit_id):
        """
        Retrieve the natural language task descriptions for a specified scene.

        Args:
            visit_id (str or int): The identifier for the scene.

        Returns:
            (list): A list of descriptions, each represented as a dictionary.
        """
        descriptions_path = self.get_data_asset_path(
            data_asset_identifier="descriptions", visit_id=visit_id
        )

        with open(descriptions_path, "r") as f:
            descriptions_data = json.load(f)["descriptions"]

        return descriptions_data


    def get_descriptions_list(self, visit_id: str):
        """
        List of descriptions given a visit_id
        """

        descs = self.get_descriptions(visit_id)
        desc_ids = {desc["desc_id"]: desc["description"] for desc in descs}
        return desc_ids


    def get_visits(self) -> list:
        """
        Given a split, returns a dict associating each visit id to the list of video ids
        """

        with open(
            os.path.join(f"{self.root}/benchmark_file_lists/{self.split}_set.csv")
        ) as f:
            # skip csv header
            visit_video = f.readlines()[1:]

        visits = list()
        for line in visit_video:
            visit_id = line.strip("\n").split(",")[0]
            visits.append(visit_id)

        return visits
        


In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [23]:
client = genai.Client(api_key="AIzaSyDsWhaBPW5fHBZz-R3t4ImQ0MFTCBPtw2I")

In [24]:
sam_model = SamModel.from_pretrained("jadechoghari/robustsam-vit-large").to(device)
sam_processor = SamProcessor.from_pretrained("jadechoghari/robustsam-vit-large")

In [25]:
owl_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
owl_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [26]:
def generate_mask(objects, rgb):
    H, W, _ = rgb.shape 
    #create the parent node 
    text_labels = [objects] 
    inputs = owl_processor(text=text_labels, images=rgb, return_tensors="pt")
    outputs = owl_model(**inputs)
    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
    target_sizes = torch.tensor([(H, W)]) 
    # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
    results = owl_processor.post_process_grounded_object_detection(
                  outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
       )
    # Retrieve predictions for the first image for the corresponding text queries
    result = results[0]
    boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] 

    #print("text_labels = ", text_labels)

    #for box, score, text_label in zip(boxes, scores, text_labels):
    #   #print(box)
    #   box = [round(i, 2) for i in box.tolist()]
    #   print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")

    bb_boxes_list = [] 
    boxes = boxes.detach().numpy()
    for bb in boxes:
       bb_boxes_list.append(list(bb))

    model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
    processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

    input_boxes = [bb_boxes_list] # 2D localization of a window 
    image = rgb

    inputs = processor(image, input_boxes=input_boxes, return_tensors="pt").to(device)
    outputs = model(**inputs)

    masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
    scores = outputs.iou_scores 

    return masks , text_labels


In [27]:
def get_llm_response(image_path):
    OLLAMA_PORT = 11434
    client = ollama.Client(host=f"localhost:{OLLAMA_PORT}") 
    response = ollama.chat(
    model='llama3.2-vision',
    messages=[
         {
                "role": "system",
                "content":  "You are an AI system that receives an image of a scene and outputs a JSON\n representation of all the objects present and their functional parts \n that can be acted upon by a robot. \n Your task is to:\n1. Identify all objects in the given image.\n2. For each object, identify its functional parts.\n3. Output them in JSON format.",
            },{
        'role': 'user',
        'content': 'Respond directly with the following JSON format: \n {{{"object": the visible object, "functional_part": list of functional part of the object}}}',
        'images': [image_path]
    }]
    )
    return response["message"]["content"]


In [28]:
start_time = time.time()
for i in range(len(rgb_files)):
       time_stamp = rgb_files[i].split("_")[1].split(".jpg")[0] 
       intrinsic_path = get_camera_intrinsics(intrinsic_files_path)[time_stamp]
       width, height, fx, fy, ox, oy = read_camera_intrinsics(intrinsic_path)
       pose = get_camera_trajectory(traj_path)[time_stamp] 
       K = np.zeros((3,3), dtype=np.float32)
       K[0][0] = fx 
       K[1][1] = fy 
       K[2][2] = 1.0 
       K[0][2] = ox 
       K[1][2] = oy 

       mapper = ConceptGraphsMapper(K) 
       rgb_file_path = os.path.join(rgb_path, rgb_files[i])
       depth_file_path = os.path.join(depth_path, depth_files[i]) 

       #gemini code 
       my_file = client.files.upload(file=rgb_file_path) 
       raw_output = get_llm_response(rgb_file_path) 
       # Find all {...} JSON objects using regex
       objects = re.findall(r'\{.*?\}', raw_output, re.DOTALL)

       # Parse each JSON object individually
       parsed_objects = [json.loads(obj) for obj in objects]

       # Wrap into one valid JSON structure
       final_json = {"objects": parsed_objects}

       final_json = json.loads(json.dumps(final_json, indent=4))
        
    
       functional_output = final_json["objects"]
       #print(functional_output)

       # create the list of object and their children functional part 
       objects = []
       children_list = []
       for k in range(len(functional_output)):
          obj_dict = functional_output[k]
          obj = obj_dict["object"]            # string 
          children = obj_dict["functional_part"]  # list of childer list 
          objects.append(obj) 
          children_list.append(children) 
          

       rgb = cv2.imread(rgb_file_path) 
       depth = cv2.imread(depth_file_path)[:, :, 0] 
      
       #create node of the objects 
       masks, text_labels = generate_mask(objects=objects, rgb=rgb )
       parent_list = []
       
       T = pose
       masks = masks[0][:, 0, :, :]
       # generate the parent list for the objects (None)
       total_mask, _, _ = masks.shape 
       for k in range(total_mask):
           parent_list.append(None)
       graph = mapper.process_frame(rgb, depth, T, masks, parent_list, text_labels, rgb_file_path, None)

      
        
print(f"Frame  objects={len(graph.objects)}, edges={len(graph.edges)}")
print("time = ", time.time() - start_time)
    



JSONDecodeError: Invalid control character at: line 1 column 33 (char 32)