In [22]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
import os
# Ask CUDA to order devices by PCI bus ID (see issue #152).
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only the listed GPU index to CUDA; edit the value to select a different device.
# NOTE(review): presumably set here, ahead of the remaining imports, so that it
# takes effect before any CUDA-using library initialises — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0" #! specify gpu here

import matplotlib
import matplotlib.pyplot as plt

import json
from pathlib import Path

import cv2

# from scipy import ndimage
import natsort
import numpy as np
from tqdm import tqdm
from rich import print

from vision_pipeline.llm_data_generator.labelme_importer import LabelMeImporter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Build a conversation-style LLM dataset from a LabelMe-annotated image folder.
# Every annotation file in `labelme/` is paired with its colour image in
# `images/`, converted to detections + graph relations, and stored as one
# human/gpt exchange. The result is written to `<dataset_dir>/llm_data.json`.
labelme_importer = LabelMeImporter()

dataset_dir = Path(os.path.expanduser("~/datasets2/reconcycle/2023-05-23_synthetic_dataset/output"))

image_folder = "images"
labelme_folder = "labelme"

labelme_dir = dataset_dir / labelme_folder
images_dir = dataset_dir / image_folder

json_paths = list(labelme_dir.glob('*.json'))
json_paths = natsort.os_sorted(json_paths)

image_paths = list(images_dir.glob('*.png')) + list(images_dir.glob('*.jpg'))
image_paths = natsort.os_sorted(image_paths)

# Index the colour images once instead of rescanning `image_paths` for every
# annotation. The key is the part of the image stem before the first "_",
# which is compared against the annotation stem. Images with "depth" in their
# stem are ignored, and `setdefault` keeps the first candidate in sorted order.
colour_img_by_id = {}
for _img_path in image_paths:
    if "depth" not in _img_path.stem:
        colour_img_by_id.setdefault(_img_path.stem.split('_')[0], _img_path)

llm_output_list = []

tqdm_json_paths = tqdm(json_paths)
for idx, json_path in enumerate(tqdm_json_paths):
    filename = json_path.stem
    tqdm_json_paths.set_description(filename)

    # Context manager so the annotation file handle is closed deterministically.
    with open(json_path) as fp:
        json_data = json.load(fp)

    colour_img_path = colour_img_by_id.get(filename)
    if colour_img_path is None:
        # Fail with an explicit message instead of passing "None" to
        # cv2.imread and crashing later on `None.relative_to(...)`.
        raise FileNotFoundError(
            f"no colour image found in {images_dir} for annotation {json_path}"
        )

    print("json_path", json_path)
    print("colour_img_path", colour_img_path)

    colour_img = cv2.imread(str(colour_img_path))

    detections, graph_relations = labelme_importer.labelme_to_detections(json_data, colour_img)

    graph_relations_text = graph_relations.to_text()

    # print("detections", detections)
    print("graph relations:")
    print(graph_relations_text)

    llm_item = {
        "id": idx,
        # Stored relative to the dataset root so the JSON does not embed an absolute path.
        "image": str(colour_img_path.relative_to(dataset_dir)),
        "conversations": [
            {
                "from": "human",
                "value": "<image>\nWhat are the objects and relations you see in this image?"
            },
            {
                "from": "gpt",
                "value": graph_relations_text
            },
        ]
    }
    llm_output_list.append(llm_item)

    # if idx > 2:
    #     break #! debug


llm_data_path = dataset_dir / "llm_data.json"
with open(llm_data_path, "w") as fp:
    json.dump(llm_output_list, fp, indent=4)

# Summarise instead of dumping the whole list into the cell output.
print(f"wrote {len(llm_output_list)} items to {llm_data_path}")

# TODO: I think we should use smaller images, with the device taking up most of the image. Because with CLIP the image will be scaled down a lot


00000001:   0%|          | 0/100 [00:00<?, ?it/s]

00000002:   0%|          | 0/100 [00:00<?, ?it/s]

00000003:   0%|          | 0/100 [00:00<?, ?it/s]

00000004:   0%|          | 0/100 [00:00<?, ?it/s]

00000005:   4%|▍         | 4/100 [00:00<00:02, 39.88it/s]

00000006:   4%|▍         | 4/100 [00:00<00:02, 39.88it/s]

00000007:   4%|▍         | 4/100 [00:00<00:02, 39.88it/s]

00000008:   4%|▍         | 4/100 [00:00<00:02, 39.88it/s]

00000009:   8%|▊         | 8/100 [00:00<00:02, 39.61it/s]

00000010:   8%|▊         | 8/100 [00:00<00:02, 39.61it/s]

00000011:   8%|▊         | 8/100 [00:00<00:02, 39.61it/s]

00000012:   8%|▊         | 8/100 [00:00<00:02, 39.61it/s]

00000013:   8%|▊         | 8/100 [00:00<00:02, 39.61it/s]

00000014:  13%|█▎        | 13/100 [00:00<00:02, 41.95it/s]

00000015:  13%|█▎        | 13/100 [00:00<00:02, 41.95it/s]

00000016:  13%|█▎        | 13/100 [00:00<00:02, 41.95it/s]

00000017:  13%|█▎        | 13/100 [00:00<00:02, 41.95it/s]

00000018:  13%|█▎        | 13/100 [00:00<00:02, 41.95it/s]

00000019:  18%|█▊        | 18/100 [00:00<00:01, 42.85it/s]

00000020:  18%|█▊        | 18/100 [00:00<00:01, 42.85it/s]

00000021:  18%|█▊        | 18/100 [00:00<00:01, 42.85it/s]

00000022:  18%|█▊        | 18/100 [00:00<00:01, 42.85it/s]

00000023:  18%|█▊        | 18/100 [00:00<00:01, 42.85it/s]

00000024:  23%|██▎       | 23/100 [00:00<00:01, 42.31it/s]

00000025:  23%|██▎       | 23/100 [00:00<00:01, 42.31it/s]

00000026:  23%|██▎       | 23/100 [00:00<00:01, 42.31it/s]

00000027:  23%|██▎       | 23/100 [00:00<00:01, 42.31it/s]

00000028:  23%|██▎       | 23/100 [00:00<00:01, 42.31it/s]

00000029:  28%|██▊       | 28/100 [00:00<00:01, 39.68it/s]

00000030:  28%|██▊       | 28/100 [00:00<00:01, 39.68it/s]

00000031:  28%|██▊       | 28/100 [00:00<00:01, 39.68it/s]

00000032:  28%|██▊       | 28/100 [00:00<00:01, 39.68it/s]

00000033:  28%|██▊       | 28/100 [00:00<00:01, 39.68it/s]

00000034:  33%|███▎      | 33/100 [00:00<00:01, 40.70it/s]

00000035:  33%|███▎      | 33/100 [00:00<00:01, 40.70it/s]

00000036:  33%|███▎      | 33/100 [00:00<00:01, 40.70it/s]

00000037:  33%|███▎      | 33/100 [00:00<00:01, 40.70it/s]

00000038:  33%|███▎      | 33/100 [00:00<00:01, 40.70it/s]

00000039:  38%|███▊      | 38/100 [00:00<00:01, 42.76it/s]

00000040:  38%|███▊      | 38/100 [00:00<00:01, 42.76it/s]

00000041:  38%|███▊      | 38/100 [00:00<00:01, 42.76it/s]

00000042:  38%|███▊      | 38/100 [00:00<00:01, 42.76it/s]

00000043:  38%|███▊      | 38/100 [00:01<00:01, 42.76it/s]

00000044:  43%|████▎     | 43/100 [00:01<00:01, 42.53it/s]

00000045:  43%|████▎     | 43/100 [00:01<00:01, 42.53it/s]

00000046:  43%|████▎     | 43/100 [00:01<00:01, 42.53it/s]

00000047:  43%|████▎     | 43/100 [00:01<00:01, 42.53it/s]

00000048:  43%|████▎     | 43/100 [00:01<00:01, 42.53it/s]

00000049:  48%|████▊     | 48/100 [00:01<00:01, 44.55it/s]

00000050:  48%|████▊     | 48/100 [00:01<00:01, 44.55it/s]

00000051:  48%|████▊     | 48/100 [00:01<00:01, 44.55it/s]

00000052:  48%|████▊     | 48/100 [00:01<00:01, 44.55it/s]

00000053:  48%|████▊     | 48/100 [00:01<00:01, 44.55it/s]

00000054:  53%|█████▎    | 53/100 [00:01<00:01, 45.63it/s]

00000055:  53%|█████▎    | 53/100 [00:01<00:01, 45.63it/s]

00000056:  53%|█████▎    | 53/100 [00:01<00:01, 45.63it/s]

00000057:  53%|█████▎    | 53/100 [00:01<00:01, 45.63it/s]

00000058:  53%|█████▎    | 53/100 [00:01<00:01, 45.63it/s]

00000059:  58%|█████▊    | 58/100 [00:01<00:01, 40.92it/s]

00000060:  58%|█████▊    | 58/100 [00:01<00:01, 40.92it/s]

00000061:  58%|█████▊    | 58/100 [00:01<00:01, 40.92it/s]

00000062:  58%|█████▊    | 58/100 [00:01<00:01, 40.92it/s]

00000063:  58%|█████▊    | 58/100 [00:01<00:01, 40.92it/s]

00000064:  63%|██████▎   | 63/100 [00:01<00:00, 39.55it/s]

00000065:  63%|██████▎   | 63/100 [00:01<00:00, 39.55it/s]

00000066:  63%|██████▎   | 63/100 [00:01<00:00, 39.55it/s]

00000067:  63%|██████▎   | 63/100 [00:01<00:00, 39.55it/s]

00000068:  63%|██████▎   | 63/100 [00:01<00:00, 39.55it/s]

00000069:  68%|██████▊   | 68/100 [00:01<00:00, 40.28it/s]

00000070:  68%|██████▊   | 68/100 [00:01<00:00, 40.28it/s]

00000071:  68%|██████▊   | 68/100 [00:01<00:00, 40.28it/s]

00000072:  68%|██████▊   | 68/100 [00:01<00:00, 40.28it/s]

00000073:  68%|██████▊   | 68/100 [00:01<00:00, 40.28it/s]

00000074:  73%|███████▎  | 73/100 [00:01<00:00, 40.09it/s]

00000075:  73%|███████▎  | 73/100 [00:01<00:00, 40.09it/s]

00000076:  73%|███████▎  | 73/100 [00:01<00:00, 40.09it/s]

00000077:  73%|███████▎  | 73/100 [00:01<00:00, 40.09it/s]

00000078:  73%|███████▎  | 73/100 [00:01<00:00, 40.09it/s]

00000079:  78%|███████▊  | 78/100 [00:01<00:00, 37.69it/s]

00000080:  78%|███████▊  | 78/100 [00:01<00:00, 37.69it/s]

00000081:  78%|███████▊  | 78/100 [00:01<00:00, 37.69it/s]

00000082:  78%|███████▊  | 78/100 [00:01<00:00, 37.69it/s]

00000083:  78%|███████▊  | 78/100 [00:02<00:00, 37.69it/s]

00000084:  83%|████████▎ | 83/100 [00:02<00:00, 39.29it/s]

00000085:  83%|████████▎ | 83/100 [00:02<00:00, 39.29it/s]

00000086:  83%|████████▎ | 83/100 [00:02<00:00, 39.29it/s]

00000087:  83%|████████▎ | 83/100 [00:02<00:00, 39.29it/s]

00000088:  87%|████████▋ | 87/100 [00:02<00:00, 38.83it/s]

00000089:  87%|████████▋ | 87/100 [00:02<00:00, 38.83it/s]

00000090:  87%|████████▋ | 87/100 [00:02<00:00, 38.83it/s]

00000091:  87%|████████▋ | 87/100 [00:02<00:00, 38.83it/s]

00000092:  91%|█████████ | 91/100 [00:02<00:00, 36.58it/s]

00000093:  91%|█████████ | 91/100 [00:02<00:00, 36.58it/s]

00000094:  91%|█████████ | 91/100 [00:02<00:00, 36.58it/s]

00000095:  91%|█████████ | 91/100 [00:02<00:00, 36.58it/s]

00000096:  91%|█████████ | 91/100 [00:02<00:00, 36.58it/s]

00000097:  96%|█████████▌| 96/100 [00:02<00:00, 38.57it/s]

00000098:  96%|█████████▌| 96/100 [00:02<00:00, 38.57it/s]

00000099:  96%|█████████▌| 96/100 [00:02<00:00, 38.57it/s]

00000100:  96%|█████████▌| 96/100 [00:02<00:00, 38.57it/s]

00000100: 100%|██████████| 100/100 [00:02<00:00, 40.31it/s]
