## Pixel coordinates extraction

### Handle click event

In [68]:
import cv2
import numpy as np
# Open camera index 0; the display loop further down this cell reads frames from it.
cap = cv2.VideoCapture(0)
# Clicked positions collected by click_event, stored as (x, y) tuples in click order.
pixel_coordinates = []

# Track mouse clicks
def click_event(event, x, y, flags, param):
    """Mouse callback that records and echoes every left-button press.

    Meant to be registered with ``cv2.setMouseCallback``. Only ``event``, ``x``
    and ``y`` are used; ``flags`` and ``param`` are accepted but ignored.
    Each press appends ``(x, y)`` to the module-level ``pixel_coordinates`` list.
    """
    # Anything other than a left-button press is ignored.
    if event != cv2.EVENT_LBUTTONDOWN:
        return
    clicked_point = (x, y)
    pixel_coordinates.append(clicked_point)
    print(f"Clicked at: ({x}, {y})")

cv2.namedWindow("Pixel Coordinates")
cv2.setMouseCallback("Pixel Coordinates", click_event)

# Display the video feed until 'q' is pressed or the camera stops delivering frames.
# The try/finally guarantees the camera and the window are released even when the
# cell is interrupted (e.g. kernel interrupt) or an OpenCV call raises.
try:
    if not cap.isOpened():
        print("Error: could not open the camera; no frames will be shown")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        cv2.imshow("Pixel Coordinates", frame)

        # waitKey also pumps the GUI event loop, which is what delivers mouse clicks.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

## Calculate the homogeneous transform (homography) matrix from the camera's frame of reference to the end effector's frame of reference

In [2]:
import numpy as np
import cv2

# Raw output of the click-capture cell for the six calibration points:
# Clicked at: (239, 115)
# Clicked at: (376, 126)
# Clicked at: (303, 189)
# Clicked at: (285, 199)
# Clicked at: (226, 250)
# Clicked at: (367, 260)

# Calibration correspondences: entry i of pixel_coordinates pairs with entry i of
# robot_coordinates. NOTE(review): the robot-frame units are not stated here
# (presumably millimetres) — confirm.
pixel_coordinates = [[238, 115], [376, 125], [300, 190], [286, 200], [226, 247], [367, 261]]
robot_coordinates = [[328.1, 8.5], [165.6, 8.5], [246.5, 90.9], [263.8, 107.5], [325.6, 168.8], [165.6, 168.8]]
pixel_points = np.array(pixel_coordinates, dtype=np.float32)
robot_points = np.array(robot_coordinates, dtype=np.float32)

# Estimate the 3x3 matrix that maps pixel points onto robot points.
homography_matrix, _ = cv2.findHomography(pixel_points, robot_points)
if homography_matrix is None:
    # Fail here rather than later inside pixel_to_robot with an unrelated-looking error.
    raise RuntimeError("cv2.findHomography could not estimate a homography from the calibration points")
print("Homography Matrix:", homography_matrix)

def pixel_to_robot(x, y, matrix):
    """Map a pixel position through a 3x3 homography.

    The point is lifted to homogeneous coordinates ``[x, y, 1]``, multiplied by
    ``matrix`` and divided by the resulting third component.

    Args:
        x: Pixel x coordinate.
        y: Pixel y coordinate.
        matrix: 3x3 array-like homography. Any numeric dtype is accepted; the
            computation is carried out in float64.

    Returns:
        Tuple ``(mapped_x, mapped_y)`` after the perspective division.

    Raises:
        ValueError: If the homogeneous scale of the mapped point is zero, i.e.
            the point has no finite image under ``matrix``.
    """
    # Work in float64 so an integer-typed matrix cannot break the division.
    homography = np.asarray(matrix, dtype=np.float64)
    mapped = homography @ np.array([x, y, 1.0], dtype=np.float64)

    scale = mapped[2]
    if scale == 0:
        # Dividing would yield inf/nan, which must never be sent on as a target position.
        raise ValueError(f"pixel ({x}, {y}) maps to a point at infinity (homogeneous scale is 0)")

    return mapped[0] / scale, mapped[1] / scale

Homography Matrix: [[-1.21974072e+00 -4.47658707e-02  6.45627843e+02]
 [-9.80664918e-02  1.32604981e+00 -1.20234612e+02]
 [ 1.60982444e-04  2.56874582e-04  1.00000000e+00]]


## Box coordinates from camera reference to end effector reference frame

In [3]:
# Map the pixel (280, 399) into robot coordinates; pick_up_and_drop uses
# box_x / box_y as the position where the object is released.
box = pixel_to_robot(280, 399, homography_matrix)
box_x = box[0]
box_y = box[1]

## Initialize arm

In [15]:
import cv2
import numpy as np
from xarm.wrapper import XArmAPI


# Create the controller handle for the arm at this address.
arm = XArmAPI('192.168.1.155')
# Enable motion, then select mode 0 and state 0.
# NOTE(review): the recorded output shows "[motion_enable], xArm is not ready to move"
# and "ControllerError, code: 2" for this cell — confirm the arm's state before running.
arm.motion_enable(enable=True)
arm.set_mode(0)
arm.set_state(0)
# NOTE(review): connect() is issued after the enable/mode/state calls — confirm this
# order is intended.
arm.connect()
# Last expression of the cell: its return value is what the notebook displays.
arm.move_gohome()

ROBOT_IP: 192.168.1.155, VERSION: v2.2.0, PROTOCOL: V1, DETAIL: 6,9,LI1006,DL1000,v2.2.0, TYPE1300: [0, 0]
change protocol identifier to 3
ControllerError, code: 2
[motion_enable], xArm is not ready to move
[set_state], xArm is ready to move


0

ControllerError had clean


ControllerError, code: 2


## Start window listener stream
1. Click object to pick
2. Transform pixel to end effector coordinate frame (P)
3. Move arm to the point (P) with a fixed height
4. Pick the object
5. Go to dropping destination
6. Lower arm and drop the object
7. Go back to home

## Pick up and drop sub-routine

In [None]:
# Start with an empty click queue: pixel_coordinates was last assigned the
# calibration points, and the click-to-move loop pops every entry it finds as a pick target.
pixel_coordinates = []

In [None]:
def pick_up_and_drop(x_robot, y_robot):
    """Run one pick-and-place cycle with the global ``arm``.

    Sequence: move above ``(x_robot, y_robot)`` at z=70, lower to z=27, switch the
    suction cup, travel to ``(box_x, box_y)`` at z=200, lower to z=100, switch the
    suction cup again, rise back to z=200 and return home.

    Args:
        x_robot: X coordinate handed to ``arm.set_position`` for the pick.
        y_robot: Y coordinate handed to ``arm.set_position`` for the pick.

    NOTE(review): the suction cup is set to ``False`` at the pick position and
    ``True`` at the drop position — confirm this polarity matches the gripper in use.
    """
    def move_to(target_x, target_y, target_z):
        # Every move in this routine uses the same speed.
        arm.set_position(target_x, target_y, target_z, speed=100)

    # Approach from above, then descend onto the object.
    move_to(x_robot, y_robot, 70)
    move_to(x_robot, y_robot, 27)
    arm.set_suction_cup(False)

    # Carry to the drop position, lower, release, and lift clear.
    move_to(box_x, box_y, 200)
    move_to(box_x, box_y, 100)
    arm.set_suction_cup(True)
    move_to(box_x, box_y, 200)

    arm.move_gohome()

In [19]:
import time

cap = cv2.VideoCapture(1)
cv2.namedWindow("Click to Move")
cv2.setMouseCallback("Click to Move", click_event)

# Show the feed and serve queued clicks until 'q' is pressed or the camera stops.
# try/finally releases the camera and window even if an arm command raises or
# the cell is interrupted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imshow("Click to Move", frame)

        # Handle at most one queued click per frame, oldest first. The display
        # does not refresh while pick_up_and_drop is running.
        if pixel_coordinates:
            x_pixel, y_pixel = pixel_coordinates.pop(0)
            x_robot, y_robot = pixel_to_robot(x_pixel, y_pixel, homography_matrix)
            print(f"Moving to: ({x_robot}, {y_robot})")
            pick_up_and_drop(x_robot, y_robot)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Clicked at: (365, 260)
Moving to: (167.72602544189203, 167.69112189774935)


## Gemini API

In [11]:
import dotenv
import os
from google import genai

# Pull variables from a local .env file into the environment, then read the key.
dotenv.load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

client = genai.Client(api_key=GEMINI_API_KEY)

prompt = "Detect items, with no more than 20 items. Output a json list where each entry contains the 2D bounding box in box_2d and a text label in label."

img_path = "imgs/WhatsApp Image 2025-03-05 at 16.53.00_17d1376b.jpg"

# Upload the image first, then send the uploaded file together with the text prompt.
file_ref = client.files.upload(file=img_path)
response = client.models.generate_content(model="gemini-2.0-flash", contents=[file_ref, prompt])

print(response.text)

Here are the bounding box detections:
```json
[
  {"box_2d": [434, 392, 490, 507], "label": "cube"},
  {"box_2d": [341, 535, 407, 682], "label": "cube"},
  {"box_2d": [277, 314, 351, 453], "label": "cube"},
  {"box_2d": [426, 117, 492, 257], "label": "cube"},
  {"box_2d": [556, 264, 610, 375], "label": "cube"},
  {"box_2d": [589, 610, 657, 746], "label": "cube"},
  {"box_2d": [471, 757, 534, 903], "label": "cube"}
]
```


## Integrate the Gemini API and the pick-up and drop sub-routine

In [69]:
import time

cap = cv2.VideoCapture(1)
cv2.namedWindow("Click to Move")
# click_event stays attached, so clicks in this window are still appended to pixel_coordinates.
cv2.setMouseCallback("Click to Move", click_event)

# Preview the feed until 'q' is pressed; the most recent frame is what gets saved.
last_frame = None
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imshow("Click to Move", frame)
        last_frame = frame

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
    # Write the image once instead of re-encoding a JPEG on every frame; the file
    # ends up holding the same last frame as before.
    if last_frame is not None:
        if not cv2.imwrite("imgs/captured_frame.jpg", last_frame):
            # imwrite reports failure through its return value, not an exception.
            print("Error: could not write imgs/captured_frame.jpg")

## Gemini Vision Transformer Model



In [70]:
import json
from google import genai
from google.genai import types
import dotenv
import os

# Read the API key from the environment (optionally populated from a local .env
# file). Credentials must never be written into the notebook itself; any key that
# was ever committed here should be treated as compromised and rotated.
dotenv.load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY is not set; define it in the environment or in a .env file")

client = genai.Client(api_key=GEMINI_API_KEY)

def store_json(data, file_name):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serialisable object.
        file_name: Destination path; an existing file is overwritten.

    Prints a confirmation line once the file has been written.
    """
    with open(file_name, mode='w') as out_handle:
        json.dump(data, out_handle, indent=4)
    print("Data stored in {}".format(file_name))


def calculate_centroid(box):
    """Return the midpoint of a four-number box.

    Args:
        box: Sequence of exactly four numbers, read as ``(x1, y1, x2, y2)``.

    Returns:
        Tuple ``(centroid_x, centroid_y)`` as floats.

    NOTE(review): ``plot_bounding_boxes`` in this notebook reads ``box_2d`` as
    ``[y1, x1, y2, x2]``; a caller passing a raw ``box_2d`` here gets the two
    axes swapped — confirm the order at each call site.
    """
    left, top, right, bottom = box
    return (left + right) / 2, (top + bottom) / 2


def process_json(input_file, output_file):
    """Compute the centroid of every detection and write them to a JSON file.

    Each ``box_2d`` is read as ``[y1, x1, y2, x2]`` — the order in which
    ``plot_bounding_boxes`` in this notebook interprets the model output — so
    ``centroid_x`` is the mean of the two x values and ``centroid_y`` the mean of
    the two y values. Centroids stay in the same units as the input boxes; no
    scaling to image pixels is done here.

    Args:
        input_file: Path to a JSON list of detections, each with a ``box_2d`` list.
        output_file: Path that receives a JSON list of
            ``{"centroid_x": ..., "centroid_y": ...}`` objects.

    Entries without a four-element ``box_2d`` list are skipped.
    """
    with open(input_file, 'r') as f:
        data = json.load(f)

    centroids = []
    for item in data:
        box = item.get("box_2d") if isinstance(item, dict) else None
        # One malformed detection should not abort the whole file.
        if not isinstance(box, list) or len(box) != 4:
            continue
        y1, x1, y2, x2 = box
        centroids.append({"centroid_x": (x1 + x2) / 2, "centroid_y": (y1 + y2) / 2})

    with open(output_file, 'w') as f:
        json.dump(centroids, f, indent=4)

    print(f"Centroids have been written to {output_file}")

def get_api_response(image_path, prompt):
    """Upload an image, query the model, and decode the JSON in its reply.

    Args:
        image_path: Path of the image uploaded through ``client.files.upload``.
        prompt: Text prompt sent together with the uploaded file.

    Returns:
        The decoded JSON value on success. If no JSON can be decoded, the
        ``json.JSONDecodeError`` is RETURNED (not raised); callers must check
        for it before using the result.

    The raw reply is printed. The contents of the first markdown-fenced block
    are tried first, so replies that add prose around the fence still decode;
    if that fails, the whole reply with fence markers removed is tried.
    """
    import re  # local import: only this function needs it

    file_ref = client.files.upload(file=image_path)
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[file_ref, prompt]
    )
    print(response.text)
    # response.text may be None; treat that as an empty reply.
    raw_text = response.text or ""

    candidates = []
    fenced = re.search(r"```(?:json)?\s*(.*?)```", raw_text, flags=re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    # Fallback: drop every fence marker and parse whatever remains.
    candidates.append(raw_text.replace("```json", "").replace("```", ""))

    decode_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            decode_error = e

    print("Error: Output is not a valid JSON")
    return decode_error

# Forward slash: the backslash form contained an invalid escape sequence and only
# resolved on Windows.
image_path = "imgs/img2.jpg"
prompt = (
    "Detect items, with no more than 20 items. Output a JSON where each entry contains the 2D bounding box in 'box_2d' "
    "and a text label in 'label'. Mainly focus on detecting garbage related materials. "
    "Ensure the output is a valid JSON without additional text or formatting."
)

detected_objects = get_api_response(image_path, prompt)
# get_api_response hands back the decode error instead of raising it; stop here
# rather than trying to serialise an exception object.
if isinstance(detected_objects, json.JSONDecodeError):
    raise detected_objects
print(detected_objects)
input_json_file = "detected_items.json"
output_json_file = "centroids.json"

store_json(detected_objects, input_json_file)

process_json(input_json_file, output_json_file)

```json
[
  {"box_2d": [46, 437, 174, 528], "label": "crumpled paper"},
  {"box_2d": [443, 435, 517, 493], "label": "crumpled paper"},
  {"box_2d": [179, 638, 671, 801], "label": "water bottle"},
  {"box_2d": [144, 129, 655, 373], "label": "cardboard box"},
  {"box_2d": [426, 485, 815, 638], "label": "cardboard box"},
  {"box_2d": [684, 203, 896, 444], "label": "cup"},
  {"box_2d": [357, 196, 408, 220], "label": "text"},
  {"box_2d": [354, 196, 402, 232], "label": "text"},
  {"box_2d": [315, 249, 471, 308], "label": "text"}
]
```
[{'box_2d': [46, 437, 174, 528], 'label': 'crumpled paper'}, {'box_2d': [443, 435, 517, 493], 'label': 'crumpled paper'}, {'box_2d': [179, 638, 671, 801], 'label': 'water bottle'}, {'box_2d': [144, 129, 655, 373], 'label': 'cardboard box'}, {'box_2d': [426, 485, 815, 638], 'label': 'cardboard box'}, {'box_2d': [684, 203, 896, 444], 'label': 'cup'}, {'box_2d': [357, 196, 408, 220], 'label': 'text'}, {'box_2d': [354, 196, 402, 232], 'label': 'text'}, {'box_2d': 

In [71]:

import cv2
def overlay_centroids(image_path, centroids_file, output_path):
    """Draw a filled red dot for every stored centroid and save the result.

    Args:
        image_path: Image to draw on (read with ``cv2.imread``).
        centroids_file: JSON list of ``{"centroid_x": ..., "centroid_y": ...}``.
        output_path: Where the annotated image is written.

    Centroid values are rounded and used directly as pixel coordinates.
    NOTE(review): if the centroids come from ``box_2d`` values on a 0-1000 scale
    they need converting to pixels before being drawn — confirm.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        print(f"Error: Could not load image from {image_path}")
        return

    with open(centroids_file, 'r') as f:
        centroids = json.load(f)

    for centroid in centroids:
        x, y = int(round(centroid["centroid_x"])), int(round(centroid["centroid_y"]))
        cv2.circle(image, (x, y), radius=5, color=(0, 0, 255), thickness=-1)  # Red dot (BGR)

    cv2.imwrite(output_path, image)
    print(f"Image with centroids saved to {output_path}")
overlay_centroids(image_path, output_json_file, 'image.jpg')

Image with centroids saved to image.jpg


In [72]:
def overlay_detections(image_path, detections_file, output_path):
    """Draw box, centroid and label for every detection and save the result.

    Args:
        image_path: Image to draw on (read with ``cv2.imread``).
        detections_file: JSON list of objects with ``box_2d`` and ``label``.
        output_path: Where the annotated image is written.

    ``box_2d`` is read as ``(x1, y1, x2, y2)``. If all four values lie in
    ``[0, 1]`` they are treated as fractions of the image size; otherwise they
    are used as pixel values. A detection that cannot be drawn is reported and
    skipped; the others are still drawn.

    NOTE(review): ``plot_bounding_boxes`` in this notebook reads ``box_2d`` as
    ``[y1, x1, y2, x2]`` on a 0-1000 scale — confirm which convention the
    detections passed here follow.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Could not load image from {image_path}")
        return

    height, width, _ = image.shape  # Get the current image dimensions

    with open(detections_file, 'r') as f:
        detections = json.load(f)

    for detection in detections:
        try:
            box = detection["box_2d"]

            # Scale bounding box coordinates if they are normalized. All four
            # values are checked so a pixel box starting at the origin is not
            # mistaken for a normalized one.
            x1, y1, x2, y2 = box
            if all(0 <= value <= 1 for value in (x1, y1, x2, y2)):
                x1 = int(x1 * width)
                y1 = int(y1 * height)
                x2 = int(x2 * width)
                y2 = int(y2 * height)
            else:
                x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

            label = detection["label"]
            centroid_x, centroid_y = calculate_centroid([x1, y1, x2, y2])
            # cv2.circle needs integer pixel coordinates and calculate_centroid
            # returns floats; without this the call raised and the except below
            # skipped both the dot and the label.
            centroid_x, centroid_y = int(round(centroid_x)), int(round(centroid_y))

            # Ensure bounding boxes are within image boundaries
            x1, x2 = max(0, min(x1, width)), max(0, min(x2, width))
            y1, y2 = max(0, min(y1, height)), max(0, min(y2, height))

            # Draw bounding box
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Green box

            # Draw centroid
            if 0 <= centroid_x < width and 0 <= centroid_y < height:
                cv2.circle(image, (centroid_x, centroid_y), radius=5, color=(0, 0, 255), thickness=-1)  # Red dot

            # Put label text
            cv2.putText(image, label, (x1, max(y1 - 10, 0)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        except Exception as e:
            # Best effort: report the bad detection and continue with the rest.
            print(f"Error drawing detection {detection}: {e}")

    cv2.imwrite(output_path, image)
    print(f"Image with detections saved to {output_path}")



In [73]:
from PIL import Image, ImageDraw, ImageFont, ImageColor
import json

# Every colour name PIL knows, in colormap order; used to extend the box palette.
additional_colors = list(ImageColor.colormap)

def parse_json(json_str):
    """
    Normalise a model reply to plain JSON text (nothing is decoded here).

    Args:
        json_str: A string that may contain markdown fences, an already
            decoded dict/list, or any other object.

    Returns:
        A string: the input with every fence marker removed when it is a
        string, ``json.dumps`` of the input when it is a dict or list, and
        ``str()`` of the input otherwise.
    """
    if isinstance(json_str, (dict, list)):
        # Already decoded: serialise it back to JSON text.
        return json.dumps(json_str)
    if not isinstance(json_str, str):
        return str(json_str)

    # The longer marker goes first so its "json" suffix is removed with it.
    for fence_marker in ("```json", "```"):
        json_str = json_str.replace(fence_marker, "")
    return json_str

def plot_bounding_boxes(im, bounding_boxes):
    """
    Draw labelled, colour-coded bounding boxes on a PIL image and display it.

    The image is drawn on in place and opened with ``show()``; nothing is returned.

    Args:
        im: A PIL image; it is modified directly.
        bounding_boxes: Detections as JSON text (markdown fences allowed) or as
            an already decoded list. Each entry holds ``box_2d`` in
            ``[y1, x1, y2, x2]`` order on a 0-1000 scale and, optionally, ``label``.
    """
    img = im
    width, height = img.size
    print(img.size)
    draw = ImageDraw.Draw(img)

    # Five basic colours first, then every named colour known to PIL.
    palette = ['red', 'green', 'blue', 'yellow', 'orange'] + additional_colors

    # parse_json strips markdown fencing and always returns JSON text.
    bounding_boxes = parse_json(bounding_boxes)
    print(bounding_boxes)

    font = ImageFont.load_default()

    def to_pixels(value, extent):
        # Convert a 0-1000 normalised coordinate to an absolute pixel position.
        return int(value / 1000 * extent)

    for index, entry in enumerate(json.loads(bounding_boxes)):
        # Cycle through the palette when there are more boxes than colours.
        color = palette[index % len(palette)]

        coords = entry["box_2d"]
        top = to_pixels(coords[0], height)
        left = to_pixels(coords[1], width)
        bottom = to_pixels(coords[2], height)
        right = to_pixels(coords[3], width)

        # Make sure the first corner is the upper-left one.
        left, right = min(left, right), max(left, right)
        top, bottom = min(top, bottom), max(top, bottom)

        draw.rectangle(((left, top), (right, bottom)), outline=color, width=4)

        if "label" in entry:
            draw.text((left + 8, top + 6), entry["label"], fill=color, font=font)

    img.show()


In [74]:
# Load the stored detections; the context manager closes the file handle,
# which a bare json.load(open(...)) left open.
with open('detected_items.json') as detections_file:
    detected_objects = json.load(detections_file)
# Draw the bounding boxes on the source image
plot_bounding_boxes(Image.open('imgs/img2.jpg'), detected_objects)


(4000, 3000)
[{"box_2d": [46, 437, 174, 528], "label": "crumpled paper"}, {"box_2d": [443, 435, 517, 493], "label": "crumpled paper"}, {"box_2d": [179, 638, 671, 801], "label": "water bottle"}, {"box_2d": [144, 129, 655, 373], "label": "cardboard box"}, {"box_2d": [426, 485, 815, 638], "label": "cardboard box"}, {"box_2d": [684, 203, 896, 444], "label": "cup"}, {"box_2d": [357, 196, 408, 220], "label": "text"}, {"box_2d": [354, 196, 402, 232], "label": "text"}, {"box_2d": [315, 249, 471, 308], "label": "text"}]
