## Pixel coordinates extraction

### Handle click event

In [1]:
import cv2
import numpy as np
# Open the camera at device index 0 for collecting calibration clicks.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Without this guard the display loop below would simply exit on the
    # first failed read and give no hint that the camera never opened.
    raise RuntimeError("Could not open camera at index 0")

# Queue of (x, y) pixel positions; the mouse callback appends one per left-click.
pixel_coordinates = []

# Track mouse clicks
def click_event(event, x, y, flags, param):
    """OpenCV mouse callback: record and echo the position of each left-click.

    Only ``cv2.EVENT_LBUTTONDOWN`` events are handled; every other event is
    ignored. The clicked ``(x, y)`` pixel is appended to the module-level
    ``pixel_coordinates`` list. ``flags`` and ``param`` are accepted because
    the callback signature requires them, but they are not used.
    """
    if event != cv2.EVENT_LBUTTONDOWN:
        return

    clicked_point = (x, y)
    pixel_coordinates.append(clicked_point)
    print(f"Clicked at: ({x}, {y})")

cv2.namedWindow("Pixel Coordinates")
cv2.setMouseCallback("Pixel Coordinates", click_event)

# Display the video feed until the stream ends or 'q' is pressed.
# try/finally guarantees the camera and the window are released even when the
# cell is interrupted (e.g. a kernel interrupt) or a frame operation raises;
# otherwise the device can stay locked until the kernel is restarted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        cv2.imshow("Pixel Coordinates", frame)

        # waitKey also services the GUI event loop, which delivers mouse clicks.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Clicked at: (504, 151)
Clicked at: (420, 107)
Clicked at: (319, 111)


## Calculate the homogeneous transform (homography) matrix from the camera's frame of reference to the end effector's frame of reference

In [2]:
import numpy as np
import cv2

# Clicked at: (239, 115)
# Clicked at: (376, 126)
# Clicked at: (303, 189)
# Clicked at: (285, 199)
# Clicked at: (226, 250)
# Clicked at: (367, 260)

# Calibration correspondences: the pixel point at index i pairs with the robot
# point at index i.
# NOTE: this rebinds the global `pixel_coordinates` that click_event appends
# to, so the list must be reset before it is used as a click queue again.
# NOTE(review): robot coordinates are presumably millimetres in the arm's XY
# plane — confirm.
pixel_coordinates = [[238, 115], [376, 125], [300, 190], [286, 200], [226, 247], [367, 261]]
robot_coordinates = [[328.1, 8.5], [165.6, 8.5], [246.5, 90.9], [263.8, 107.5], [325.6, 168.8], [165.6, 168.8]]
pixel_points = np.array(pixel_coordinates, dtype=np.float32)
robot_points = np.array(robot_coordinates, dtype=np.float32)
if pixel_points.shape != robot_points.shape:
    raise ValueError("pixel_coordinates and robot_coordinates must pair up one-to-one")

# Default method (0): every correspondence contributes to a least-squares fit.
homography_matrix, _ = cv2.findHomography(pixel_points, robot_points)
if homography_matrix is None:
    # findHomography returns no matrix when the estimate fails; stop here
    # rather than let a later pixel_to_robot call fail obscurely.
    raise RuntimeError("cv2.findHomography could not estimate a homography from the calibration points")
print("Homography Matrix:", homography_matrix)

def pixel_to_robot(x, y, matrix):
    """Map an image pixel to robot coordinates through a 3x3 homography.

    The pixel is lifted to homogeneous coordinates ``(x, y, 1)``, multiplied
    by ``matrix`` and normalised by the resulting third component.

    Args:
        x: Pixel x coordinate.
        y: Pixel y coordinate.
        matrix: 3x3 homography (any array-like, integer or float), e.g. the
            matrix returned by ``cv2.findHomography``.

    Returns:
        Tuple ``(robot_x, robot_y)`` of Python floats.

    Raises:
        ValueError: if ``matrix`` is not 3x3 (including ``None``), or if the
            pixel maps to a point at infinity (homogeneous scale of zero).
    """
    # Force float64 so integer inputs cannot break the normalisation below.
    homography = np.asarray(matrix, dtype=np.float64)
    if homography.shape != (3, 3):
        raise ValueError(f"matrix must be 3x3, got shape {homography.shape}")

    pixel = np.array([x, y, 1.0], dtype=np.float64)
    mapped_x, mapped_y, scale = homography @ pixel

    if scale == 0:
        # Dividing would give inf/nan, which must never reach the arm as a target.
        raise ValueError(f"pixel ({x}, {y}) maps to a point at infinity")

    return float(mapped_x / scale), float(mapped_y / scale)

Homography Matrix: [[-1.21974072e+00 -4.47658707e-02  6.45627843e+02]
 [-9.80664918e-02  1.32604981e+00 -1.20234612e+02]
 [ 1.60982444e-04  2.56874582e-04  1.00000000e+00]]


## Box coordinates from camera reference to end effector reference frame

In [3]:
# Convert the box's pixel position (280, 399) to robot coordinates once,
# keeping both the pair (`box`) and its unpacked components.
box_x, box_y = box = pixel_to_robot(280, 399, homography_matrix)

## Initialize arm

In [15]:
import cv2
import numpy as np
from xarm.wrapper import XArmAPI


ROBOT_IP = '192.168.1.155'

arm = XArmAPI(ROBOT_IP)
# Establish the connection before issuing any command to the controller.
arm.connect()
arm.motion_enable(enable=True)
# NOTE(review): mode 0 / state 0 are presumably position-control mode and the
# "ready" state per the xArm SDK — confirm against the SDK docs.
arm.set_mode(0)
arm.set_state(0)
# NOTE(review): the recorded output of this cell shows "ControllerError, code: 2";
# the controller error may need to be cleared before motion is accepted — confirm.
arm.move_gohome()

ROBOT_IP: 192.168.1.155, VERSION: v2.2.0, PROTOCOL: V1, DETAIL: 6,9,LI1006,DL1000,v2.2.0, TYPE1300: [0, 0]
change protocol identifier to 3
ControllerError, code: 2
[motion_enable], xArm is not ready to move
[set_state], xArm is ready to move


0

ControllerError had clean


ControllerError, code: 2


## Start window listener stream
1. Click object to pick
2. Transform pixel to end effector coordinate frame (P)
3. Move arm to the point (P) with a fixed height
4. Pick the object
5. Go to dropping destination
6. Lower arm and drop the object
7. Go back to home

## Pick up and drop sub-routine

In [None]:
# Start from an empty click queue. The calibration cell rebinds
# `pixel_coordinates` to its reference points, and the click-to-move loop pops
# entries from this name, so stale values would be treated as pick targets.
pixel_coordinates = list()

In [None]:
def pick_up_and_drop(x_robot, y_robot, *, approach_z=70, pick_z=27,
                     travel_z=200, drop_z=100, speed=100):
    """Pick the object at ``(x_robot, y_robot)`` and release it over the box.

    Sequence: move above the object, descend to it, toggle the suction cup,
    travel to the box position (module-level ``box_x``, ``box_y``), descend,
    toggle the suction cup again, rise, and return home.

    Args:
        x_robot: Target x in robot coordinates.
        y_robot: Target y in robot coordinates.
        approach_z: Height of the first waypoint above the object.
        pick_z: Height at which the object is gripped.
        travel_z: Height used above the box before and after the drop.
        drop_z: Height at which the object is released.
        speed: Speed passed to every ``arm.set_position`` call.

    The defaults reproduce the previously hard-coded values. Uses the
    module-level ``arm`` and issues motions without ``wait=True``.
    NOTE(review): whether suction toggles stay in sync with the queued
    motions depends on the SDK — confirm.
    """
    # Approach from above, then descend onto the object.
    arm.set_position(x_robot, y_robot, approach_z, speed=speed)
    arm.set_position(x_robot, y_robot, pick_z, speed=speed)
    # NOTE(review): False is sent at the pick point and True at the drop point.
    # In the xArm SDK `on=True` presumably switches suction on — confirm the
    # polarity matches this gripper's wiring.
    arm.set_suction_cup(False)

    # Carry the object to the box, lower it, and release.
    arm.set_position(box_x, box_y, travel_z, speed=speed)
    arm.set_position(box_x, box_y, drop_z, speed=speed)
    arm.set_suction_cup(True)

    # Clear the box before homing.
    arm.set_position(box_x, box_y, travel_z, speed=speed)
    arm.move_gohome()

In [19]:
import time

# NOTE(review): this cell opens camera index 1 while the click-collection cell
# uses index 0 — confirm the homography was calibrated on this camera.
cap = cv2.VideoCapture(1)
cv2.namedWindow("Click to Move")
cv2.setMouseCallback("Click to Move", click_event)

# try/finally releases the camera and window even if the cell is interrupted
# or an arm command raises in the middle of a pick.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imshow("Click to Move", frame)

        # Serve queued clicks oldest-first, one per displayed frame. The
        # routine runs inside this loop, so no new frame is shown until it returns.
        if pixel_coordinates:
            x_pixel, y_pixel = pixel_coordinates.pop(0)
            x_robot, y_robot = pixel_to_robot(x_pixel, y_pixel, homography_matrix)
            print(f"Moving to: ({x_robot}, {y_robot})")
            pick_up_and_drop(x_robot, y_robot)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Clicked at: (365, 260)
Moving to: (167.72602544189203, 167.69112189774935)


## Gemini API

In [11]:
import dotenv
import os
from google import genai

# Load variables from a local .env file (if present), then read the API key.
dotenv.load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

client = genai.Client(api_key=GEMINI_API_KEY)

# Instruction asking for a JSON list of labelled 2D bounding boxes.
prompt = "Detect items, with no more than 20 items. Output a json list where each entry contains the 2D bounding box in box_2d and a text label in label."

img_path = "imgs/WhatsApp Image 2025-03-05 at 16.53.00_17d1376b.jpg"

# Upload the still image, then query the model with the image and the prompt.
file_ref = client.files.upload(file=img_path)
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[file_ref, prompt],
)

# NOTE(review): box_2d is presumably [ymin, xmin, ymax, xmax] normalised to
# 0-1000 per the Gemini docs — confirm before passing values to pixel_to_robot.
print(response.text)

Here are the bounding box detections:
```json
[
  {"box_2d": [434, 392, 490, 507], "label": "cube"},
  {"box_2d": [341, 535, 407, 682], "label": "cube"},
  {"box_2d": [277, 314, 351, 453], "label": "cube"},
  {"box_2d": [426, 117, 492, 257], "label": "cube"},
  {"box_2d": [556, 264, 610, 375], "label": "cube"},
  {"box_2d": [589, 610, 657, 746], "label": "cube"},
  {"box_2d": [471, 757, 534, 903], "label": "cube"}
]
```


## Integrate the Gemini API with the pick-up and drop sub-routine

In [None]:
import time

cap = cv2.VideoCapture(1)
cv2.namedWindow("Click to Move")
cv2.setMouseCallback("Click to Move", click_event)

# try/finally releases the camera and window even if the cell is interrupted
# or a frame operation raises.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imshow("Click to Move", frame)

        # Overwrite the snapshot on every iteration so the file always holds
        # the most recent frame.
        # NOTE(review): presumably this is the image intended for the Gemini
        # detection step — confirm; writing every frame is disk-heavy.
        cv2.imwrite("imgs/captured_frame.jpg", frame)

        # Click-driven picking is disabled in this cell:
        # if len(pixel_coordinates) > 0:
        #     x_pixel, y_pixel = pixel_coordinates.pop(0)
        #     x_robot, y_robot = pixel_to_robot(x_pixel, y_pixel, homography_matrix)
        #     print(f"Moving to: ({x_robot}, {y_robot})")
        #     pick_up_and_drop(x_robot, y_robot)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()