In [77]:
import sys
sys.path.insert(0,'../GroundingDINO')
sys.path.insert(0,'../SAM')
import os
import torch
import json
from pytorch3d.io import IO
import numpy as np
from src.utils import normalize_pc,save_colored_pc
from src.render_pc import render_pc
from src.gen_superpoint import gen_superpoint
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2
from torchvision.ops import nms
import json
from segment_anything import sam_model_registry, SamPredictor
import matplotlib.pyplot as plt

In [2]:
def yolobbox2bbox(yolobox):
    """Convert centre-format boxes to corner-format boxes.

    Args:
        yolobox: 2-D array-like whose columns 0..3 are read as
            (centre x, centre y, width, height).

    Returns:
        An array created with ``np.zeros_like(yolobox)`` whose columns 0..3
        hold (x_min, y_min, x_max, y_max).
    """
    centre_x, centre_y = yolobox[:, 0], yolobox[:, 1]
    half_w, half_h = yolobox[:, 2] / 2, yolobox[:, 3] / 2

    corners = np.zeros_like(yolobox)
    # Top-left corner.
    corners[:, 0] = centre_x - half_w
    corners[:, 1] = centre_y - half_h
    # Bottom-right corner.
    corners[:, 2] = centre_x + half_w
    corners[:, 3] = centre_y + half_h
    return corners

In [5]:
def InferDINO(input_pc_file, category, part_names, zero_shot=False, save_dir="tmp"):
    """Render a point cloud, run GroundingDINO on each view and lift the boxes to 3D.

    Args:
        input_pc_file: Path of the point cloud passed to ``normalize_pc``.
        category: Object category name. Currently unused by this function.
        part_names: Part names forwarded to ``bbox2seg``.
        zero_shot: Only selects which banner is printed; the same
            GroundingDINO checkpoint is loaded in both cases.
        save_dir: Directory handed to the helpers for their outputs; the
            annotated detections are written to ``<save_dir>/dino_pred``.

    Returns:
        A list with one dict per rendered view, holding ``image_id``,
        ``category_id``, ``bbox`` and ``score``.
    """
    if zero_shot:
        print("-----Zero-shot inference of %s-----" % input_pc_file)
    else:
        print("-----Few-shot inference of %s-----" % input_pc_file)

    print("[loading GLIP model...]")
    model = load_model("../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", "../GroundingDINO/weights/groundingdino_swint_ogc.pth")

    print("[creating tmp dir...]")
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")
    io = IO()
    os.makedirs(save_dir, exist_ok=True)
    # cv2.imwrite does not create missing directories; it just returns False,
    # so the annotated frames would be silently lost without this.
    pred_dir = f"{save_dir}/dino_pred"
    os.makedirs(pred_dir, exist_ok=True)

    print("[normalizing input point cloud...]")
    xyz, rgb = normalize_pc(input_pc_file, save_dir, io, device)

    print("[rendering input point cloud...]")
    img_dir, pc_idx, screen_coords = render_pc(xyz, rgb, save_dir, device)
    # NOTE(review): the prompt is hard-coded and ignores `category` / `part_names`.
    TEXT_PROMPT = "chair . person . dog ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25

    preds = []
    for i in range(10):
        image_source, image = load_image(img_dir+f"/{i}.png")

        print("[glip infrence...]")
        boxes, logits, phrases = predict(
            model=model,
            image=image,
            caption=TEXT_PROMPT,
            box_threshold=BOX_TRESHOLD,
            text_threshold=TEXT_TRESHOLD,
        )
        # NOTE(review): every view gets a single entry with a fixed
        # category_id of 2, whatever phrases were detected.
        preds.append({'image_id': i, 'category_id': 2,
                      'bbox': boxes*image.shape[-1],
                      'score': logits}
                    )
        annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
        cv2.imwrite(f"{pred_dir}/{i}.png", annotated_frame)
    print('[generating superpoints...]')
    superpoint = gen_superpoint(xyz, rgb, visualize=True, save_dir=save_dir)

    print('[converting bbox to 3D segmentation...]')
    # NOTE(review): `bbox2seg` is not imported in the visible part of this
    # notebook; confirm it is in scope before calling this function.
    sem_seg, ins_seg = bbox2seg(xyz, superpoint, preds, screen_coords, pc_idx, part_names, save_dir, solve_instance_seg=True)

    print("[finish!]")
    return preds

In [6]:
# Run normalize_pc on the example chair, then render_pc on its result; both
# calls are given "examples/zeroshot_Chair" as their directory argument.
io = IO()
xyz, rgb = normalize_pc(
    "examples/Chair.ply",
    "examples/zeroshot_Chair",
    io,
    "cuda",
)
img_dir, pc_idx, screen_coords = render_pc(
    xyz,
    rgb,
    "examples/zeroshot_Chair",
    "cuda",
)

An exception occurred in telemetry logging.Disabling telemetry to prevent further exceptions.
Traceback (most recent call last):
  File "/rhome/kamburoglu/miniconda3/envs/partslip/lib/python3.9/site-packages/iopath/common/file_io.py", line 946, in __log_tmetry_keys
    handler.log_event()
  File "/rhome/kamburoglu/miniconda3/envs/partslip/lib/python3.9/site-packages/iopath/common/event_logger.py", line 97, in log_event
    del self._evt
AttributeError: _evt


In [10]:
# Inspect the first entry of the screen coordinates returned by render_pc.
screen_coords[0]

array([[301.90833, 154.62723],
       [302.001  , 154.86508],
       [301.68314, 155.22375],
       ...,
       [505.16672, 159.29083],
       [503.9417 , 159.11203],
       [502.51263, 158.6364 ]], dtype=float32)

In [20]:
# Shape of the index map returned by render_pc.
pc_idx.shape

(10, 800, 800)

In [122]:
# Distinct values of pc_idx at index 1, with the smallest one dropped
# (presumably the "no point" background value - TODO confirm).
np.unique(pc_idx[1])[1:]

array([    71,    118,    121, ..., 301393, 301397, 301400], dtype=int32)

In [3]:
 # Load GroundingDINO with the Swin-B config and checkpoint; the inference
 # loop cell passes this `model` to predict().
 model = load_model("../GroundingDINO/groundingdino/config/GroundingDINO_SwinB_cfg.py", "../GroundingDINO/weights/groundingdino_swinb_cogcoor.pth")

final text_encoder_type: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Build the SAM model from its checkpoint, move it to the GPU and wrap it in
# a SamPredictor for box-prompted mask prediction.
model_type = "vit_h"
sam_checkpoint = "../SAM/weights/sam_vit_h_4b8939.pth"

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device="cuda")
predictor = SamPredictor(sam)

In [4]:
# Load the category -> part-name table. The context manager closes the file
# handle, which a bare open() inside json.load() would leave open.
with open("./PartNetE_meta.json") as meta_file:
    metaData = json.load(meta_file)
metaData

{'Bottle': ['lid'],
 'Box': ['lid'],
 'Bucket': ['handle'],
 'Camera': ['button', 'lens'],
 'Cart': ['wheel'],
 'Chair': ['arm', 'back', 'leg', 'seat', 'wheel'],
 'Clock': ['hand'],
 'CoffeeMachine': ['button', 'container', 'knob', 'lid'],
 'Dishwasher': ['door', 'handle'],
 'Dispenser': ['head', 'lid'],
 'Display': ['base', 'screen', 'support'],
 'Door': ['frame', 'door', 'handle'],
 'Eyeglasses': ['body', 'leg'],
 'Faucet': ['spout', 'switch'],
 'FoldingChair': ['seat'],
 'Globe': ['sphere'],
 'Kettle': ['lid', 'handle', 'spout'],
 'Keyboard': ['cord', 'key'],
 'KitchenPot': ['lid', 'handle'],
 'Knife': ['blade'],
 'Lamp': ['base', 'body', 'bulb', 'shade'],
 'Laptop': ['keyboard', 'screen', 'shaft', 'touchpad', 'camera'],
 'Lighter': ['lid', 'wheel', 'button'],
 'Microwave': ['display', 'door', 'handle', 'button'],
 'Mouse': ['button', 'cord', 'wheel'],
 'Oven': ['door', 'knob'],
 'Pen': ['cap', 'button'],
 'Phone': ['lid', 'button'],
 'Pliers': ['leg'],
 'Printer': ['button'],
 'Ref

In [65]:
def toDinoPrompt(metaData,className):
    """Build the detection prompt and phrase-to-index map for one category.

    Args:
        metaData: Mapping from category name to its list of part names.
        className: Key of ``metaData`` to build the prompt for.

    Returns:
        A ``(prompt, partList)`` tuple. ``prompt`` concatenates the lowercase
        "<className> <part>." phrases in list order; ``partList`` maps each
        lowercase "<className> <part>" phrase to its position in the list.
    """
    phrases = [f"{className} {part}".lower() for part in metaData[className]]
    prompt = "".join(f"{phrase}." for phrase in phrases)
    partList = {phrase: position for position, phrase in enumerate(phrases)}
    return prompt, partList

In [66]:
# Category to segment, plus the prompt and phrase -> index map built from it.
className = "Chair"
TEXT_PROMPT,partList = toDinoPrompt(metaData, className)
# Thresholds passed to predict() as box_threshold / text_threshold.
BOX_TRESHOLD = 0.2
TEXT_TRESHOLD = 0.3

In [67]:
# For every rendered view: detect part boxes with GroundingDINO, drop
# overlapping boxes with NMS, keep only phrases that are prompted parts, and
# turn each surviving box into a pixel mask with SAM.

# cv2.imwrite does not create missing directories (it just returns False).
os.makedirs("examples/zeroshot_Chair/dino_pred", exist_ok=True)

preds = []
for i in range(10):
    image_source, image = load_image(f"examples/zeroshot_Chair/rendered_img/{i}.png")
    predictor.set_image(image_source)
    print("[dino infrence...]")
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=TEXT_PROMPT,
        box_threshold=BOX_TRESHOLD,
        text_threshold=TEXT_TRESHOLD,
    )
    phrases = np.array(phrases)

    # Boxes are normalised (cx, cy, w, h). Scale them by the size of the
    # original image (the one SAM sees), not of the resized tensor fed to
    # the detector, so non-square renders are handled too.
    img_h, img_w = image_source.shape[:2]
    box_scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=boxes.dtype)
    xyxy = yolobbox2bbox(boxes) * box_scale.numpy()

    nms_indexes = nms(torch.tensor(xyxy), logits, 0.5).numpy()
    # Keep only detections whose phrase is exactly one of the prompted parts.
    nms_indexes = np.array(
        [index for index in nms_indexes if phrases[index].lower() in partList],
        dtype=np.int64,
    )

    # Skip SAM when nothing survived the filtering.
    if len(nms_indexes) > 0:
        input_boxes = torch.tensor(xyxy[nms_indexes], device=predictor.device)
        transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, image_source.shape[:2])
        masks, _, _ = predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=transformed_boxes,
            multimask_output=False,
        )

        for index, j in enumerate(nms_indexes):
            # The key is 'masks' because the aggregation cell reads
            # prediction["masks"].
            preds.append({'image_id': i, 'category_id': phrases[j],
                          'bbox': boxes[j] * box_scale,
                          'score': logits[j],
                          'masks': masks[index, 0]
                         }
                        )
    annotated_frame = annotate(image_source=image_source, boxes=boxes[nms_indexes], logits=logits[nms_indexes], phrases=phrases[nms_indexes])
    cv2.imwrite(f"examples/zeroshot_Chair/dino_pred/{i}.png", annotated_frame)

[dino infrence...]




[dino infrence...]
[dino infrence...]
[dino infrence...]
[dino infrence...]
[dino infrence...]
[dino infrence...]
[dino infrence...]
[dino infrence...]
[dino infrence...]


In [39]:
# Display the predictions collected by the inference loop.
preds

[{'image_id': 0,
  'category_id': 'chair seat',
  'bbox': tensor([400.7036, 465.8534, 316.9041,  98.0243]),
  'score': tensor(0.4861),
  'masks': tensor([[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False]], device='cuda:0')},
 {'image_id': 0,
  'category_id': 'chair wheel',
  'bbox': tensor([384.7600, 728.8214,  45.7331,  50.1014]),
  'score': tensor(0.4780),
  'masks': tensor([[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
 

In [124]:
# Score voting: each 2D mask adds its detection score to its part's column
# for every 3D point that pc_idx places under the mask.
num_points = xyz.shape[0]  # one row per point, not a hard-coded count
pc_aggMask = torch.zeros((num_points, len(partList) + 1))
# The extra last column is the fallback class: a point is assigned to a part
# only if that part accumulates more than 0.1 of score.
pc_aggMask[:, -1] = 0.1
for prediction in preds:
    maskedPC_idx = pc_idx[prediction["image_id"], prediction["masks"].cpu().numpy()]
    # Keep real point indices only. Negative entries are assumed to mark
    # pixels with no point (-1 in PyTorch3D rasterizer output - TODO confirm
    # for render_pc). Filtering by sign does not discard a real point when
    # the mask happens to contain no background pixel.
    index_pcMasked = np.unique(maskedPC_idx[maskedPC_idx >= 0])
    point_rows = torch.as_tensor(index_pcMasked, dtype=torch.long)
    part_column = partList[prediction["category_id"].lower()]
    pc_aggMask[point_rows, part_column] += prediction["score"]
pc_seg_classes = torch.argmax(pc_aggMask, dim=-1)

In [127]:
# Smallest class index present in the per-point argmax result.
pc_seg_classes.min()

tensor(0)

In [126]:
# Shape of the point array returned by normalize_pc.
xyz.shape

(301402, 3)

In [128]:
# Write one .ply per part: points assigned to that part are red, all other
# points stay black.
# Create the output directory up front in case save_colored_pc does not.
os.makedirs("examples/zeroshot_Chair/semantic_segDino", exist_ok=True)
for part, part_id in partList.items():
    rgb_sem = np.zeros((xyz.shape[0], 3))
    rgb_sem[torch.where(pc_seg_classes == part_id)] = [1, 0, 0]
    save_colored_pc("examples/zeroshot_Chair/semantic_segDino/%s.ply" % (part), xyz, rgb_sem)

In [91]:
# Indices of the points assigned to class 4 ('chair wheel' in partList).
torch.where(pc_seg_classes==4)

(tensor([ 11903,  11905,  11985,  ..., 294842, 294843, 294844]),)

In [97]:
# Show the phrase -> class-index map built by toDinoPrompt.
partList

{'chair arm': 0,
 'chair back': 1,
 'chair leg': 2,
 'chair seat': 3,
 'chair wheel': 4}

In [92]:
# Display the collected predictions again.
preds

[{'image_id': 0,
  'category_id': 'chair seat',
  'bbox': tensor([400.7036, 465.8534, 316.9041,  98.0243]),
  'score': tensor(0.4861),
  'masks': tensor([[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False]], device='cuda:0')},
 {'image_id': 0,
  'category_id': 'chair wheel',
  'bbox': tensor([384.7600, 728.8214,  45.7331,  50.1014]),
  'score': tensor(0.4780),
  'masks': tensor([[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          ...,
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
 