1) Quick evaluation: compute MSE, MAE, and pixel error

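This assumes you have images in final_dataset/images/*.jpg and corresponding label files in final_dataset/labels/*.txt, each containing `class x y w h px py` (with px, py in crop-normalized coordinates). It also assumes you have a trained model checkpoint, hrnet_pluck.pth. Note that the code below reads from Dataset/images and Dataset/labels; adjust img_dir and lbl_dir to match your folder layout.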

In [None]:
import os
import math
import torch
import cv2
import numpy as np
from glob import glob
from train_hrnet_pluck import HRNetPluckRegressor, predict_on_crop  # or import model class

import timm
import torchvision.transforms as T
from timm.data import resolve_data_config

# --- model -----------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = HRNetPluckRegressor(backbone_name="hrnet_w18", pretrained=False).to(device)
ck = torch.load("hrnet_pluck.pth", map_location=device)
# The checkpoint is either a bare state_dict or a dict wrapping it under "model".
model.load_state_dict(ck["model"] if "model" in ck else ck)
model.eval()

# --- data ------------------------------------------------------------------
img_dir = "Dataset/images"
lbl_dir = "Dataset/labels"
img_files = sorted(f for f in os.listdir(img_dir) if f.endswith(".jpg"))

# Side length (in pixels) of the square input fed to the model.
INPUT_SIZE = 256

# Build the preprocessing pipeline ONCE. The temporary timm model is only used
# to look up the backbone's mean/std; creating it per image (as before) was
# pure overhead. Must match the transform used during training.
tmp = timm.create_model("hrnet_w18", pretrained=False, num_classes=0)
cfg = resolve_data_config({}, model=tmp)
transform = T.Compose([
    T.ToPILImage(),
    T.Resize((INPUT_SIZE, INPUT_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=cfg["mean"], std=cfg["std"]),
])

mse_sum = 0.0
mae_sum = 0.0
pixel_errs = []

with torch.no_grad():
    for fname in img_files:
        img_path = os.path.join(img_dir, fname)
        # splitext only swaps the extension; str.replace would also rewrite
        # any ".jpg" occurring earlier in the file name.
        lbl_path = os.path.join(lbl_dir, os.path.splitext(fname)[0] + ".txt")
        if not os.path.exists(lbl_path):
            continue

        img = cv2.imread(img_path)
        if img is None:  # cv2.imread returns None instead of raising
            print(f"Skipping unreadable image: {img_path}")
            continue

        # Label layout: class x y w h px py  -> px, py are fields 5 and 6.
        with open(lbl_path) as fh:
            fields = fh.read().split()
        if len(fields) < 7:
            print(f"Skipping malformed label (expected 7 fields): {lbl_path}")
            continue
        gt = np.array([float(fields[5]), float(fields[6])], dtype=float)

        # OpenCV loads BGR; convert to RGB before the torchvision pipeline.
        inp = transform(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).unsqueeze(0).to(device)

        # Assumes the model returns one (px, py) pair per sample in normalized
        # coordinates -- TODO confirm against HRNetPluckRegressor.forward.
        pred = model(inp)[0].cpu().numpy()
        err = pred - gt

        # metrics in normalized space
        mse_sum += np.mean(err ** 2)
        mae_sum += np.mean(np.abs(err))

        # Pixel error w.r.t. the model input size. If crops were annotated at
        # varying sizes, scale by the actual crop width/height instead.
        pixel_errs.append(np.linalg.norm(err * INPUT_SIZE))

# Average over the samples that were actually evaluated; images skipped above
# must not dilute the mean.
n = len(pixel_errs)
if n == 0:
    print("No image/label pairs were evaluated.")
else:
    print(f"MSE (avg): {mse_sum/n:.6f}, MAE (avg): {mae_sum/n:.6f}")
    print(f"Pixel error — mean: {np.mean(pixel_errs):.3f}, med: {np.median(pixel_errs):.3f}, std: {np.std(pixel_errs):.3f}")


2) Visualize predictions on crops and original images

Useful for qualitative debugging.

In [None]:
import cv2, os, torch, numpy as np
from ultralytics import YOLO
from train_hrnet_pluck import HRNetPluckRegressor  # model class

# Detector: trained YOLO weights (adjust the path to your run directory).
yolo = YOLO("runs/detect/train/weights/best.pt")

# Regressor: HRNet keypoint model, placed on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
hrnet = HRNetPluckRegressor(backbone_name="hrnet_w18", pretrained=False)
hrnet = hrnet.to(device)

# The checkpoint is either a bare state_dict or a dict holding it under "model".
ck = torch.load("hrnet_pluck.pth", map_location=device)
state_dict = ck.get("model", ck)
hrnet.load_state_dict(state_dict)
hrnet.eval()

_CROP_TRANSFORM = None


def _get_crop_transform():
    """Return the crop preprocessing pipeline, building it on first use.

    A temporary timm model is created only to look up the backbone's mean/std;
    the result is cached so this cost is paid once rather than per detection.
    """
    global _CROP_TRANSFORM
    if _CROP_TRANSFORM is None:
        import timm
        import torchvision.transforms as T
        from timm.data import resolve_data_config

        tmp = timm.create_model("hrnet_w18", pretrained=False, num_classes=0)
        cfg = resolve_data_config({}, model=tmp)
        _CROP_TRANSFORM = T.Compose([
            T.ToPILImage(),
            T.Resize((256, 256)),
            T.ToTensor(),
            T.Normalize(mean=cfg["mean"], std=cfg["std"]),
        ])
    return _CROP_TRANSFORM


def predict_and_draw(img_path, out_path=None):
    """Run YOLO on an image, regress a pluck point per "side" box, and draw both.

    Args:
        img_path: Path of the image to read with OpenCV.
        out_path: Optional path to write the annotated image to. Missing
            parent directories are created.

    Returns:
        A copy of the image (as loaded by OpenCV) with every detected box
        drawn in green and, for class-1 ("side") boxes, the predicted point
        drawn in red with a "pluck" label.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    orig = cv2.imread(img_path)
    if orig is None:  # cv2.imread returns None instead of raising
        raise FileNotFoundError(f"Could not read image: {img_path}")
    h, w = orig.shape[:2]

    res = yolo(img_path)[0]
    out_img = orig.copy()
    for box, cls in zip(res.boxes.xyxy, res.boxes.cls):
        cls = int(cls)
        x1, y1, x2, y2 = map(int, box.tolist())
        cv2.rectangle(out_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        if cls != 1:  # only "side" detections get a pluck point
            continue

        # Clamp to the image: a negative coordinate would otherwise wrap
        # around in the NumPy slice and silently yield the wrong crop.
        cx1, cy1 = max(0, x1), max(0, y1)
        cx2, cy2 = min(w, x2), min(h, y2)
        crop = orig[cy1:cy2, cx1:cx2]  # crop from the untouched image
        if crop.size == 0:
            continue

        crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        inp = _get_crop_transform()(crop_rgb).unsqueeze(0).to(device)
        with torch.no_grad():
            # Assumes the model outputs crop-normalized (px, py) -- TODO confirm.
            px, py = hrnet(inp)[0].cpu().numpy()

        # Map crop-normalized coordinates back to full-image pixels.
        px_abs = int(px * (cx2 - cx1) + cx1)
        py_abs = int(py * (cy2 - cy1) + cy1)
        cv2.circle(out_img, (px_abs, py_abs), 5, (0, 0, 255), -1)
        cv2.putText(out_img, "pluck", (px_abs + 5, py_abs - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1)

    if out_path:
        # cv2.imwrite fails silently (returns False) when the directory is missing.
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        if not cv2.imwrite(out_path, out_img):
            print(f"Warning: could not write {out_path}")
    return out_img

# Example: annotate one test image, save the result, and show it until a key is pressed.
res_img = predict_and_draw(
    "test_images/001.jpg",
    out_path="debug/001_pred.jpg",
)
cv2.imshow("pred", res_img)
cv2.waitKey(0)
cv2.destroyAllWindows()


3) Integrate YOLO → HRNet in a real-time loop (video / camera)

If you want a live pipeline for robotics:

In [None]:
import cv2
from ultralytics import YOLO
# load models like above: yolo, hrnet

cap = cv2.VideoCapture(0)  # or video file
try:
    while True:
        ret, frame = cap.read()
        if not ret:  # camera unavailable or end of the video
            break
        results = yolo(frame)[0]
        # Draw on a copy so crops handed to the regressor come from the clean
        # frame and never contain the green box outlines.
        display = frame.copy()
        for box, cls in zip(results.boxes.xyxy, results.boxes.cls):
            cls = int(cls)
            x1, y1, x2, y2 = map(int, box.tolist())
            cv2.rectangle(display, (x1, y1), (x2, y2), (0, 255, 0), 2)
            if cls == 1:  # "side" detections get a pluck-point prediction
                crop = frame[y1:y2, x1:x2]
                # preprocess + predict same as above; draw point on `display`
                # ...
        cv2.imshow("live", display)
        if cv2.waitKey(1) & 0xFF == 27:  # Esc quits
            break
finally:
    # Always free the capture device and close windows, even if inference raises.
    cap.release()
    cv2.destroyAllWindows()


4) Export HRNet to TorchScript / ONNX for faster deployment

TorchScript:

In [None]:
# Rebuild the regressor and restore its trained weights on the CPU.
model = HRNetPluckRegressor(backbone_name="hrnet_w18", pretrained=False)
ck = torch.load("hrnet_pluck.pth", map_location="cpu")
# The checkpoint is either a bare state_dict or a dict holding it under "model".
state_dict = ck.get("model", ck)
model.load_state_dict(state_dict)
model.eval()
model.cpu()

# Trace with a dummy 1x3x256x256 input and save the TorchScript module.
example = torch.rand(1, 3, 256, 256)
traced = torch.jit.trace(model, example)
traced.save("hrnet_pluck_ts.pt")


ONNX (useful for some embedded accelerators):

In [None]:
import torch
# Export on CPU in inference mode.
model.eval()
model.cpu()

dummy = torch.randn(1, 3, 256, 256)
# Mark dimension 0 of both tensors as a dynamic "batch" axis.
torch.onnx.export(
    model,
    dummy,
    "hrnet_pluck.onnx",
    input_names=["input"],
    output_names=["output"],
    opset_version=12,
    dynamic_axes={
        "input": {0: "batch"},
        "output": {0: "batch"},
    },
)


5) Improvements & ideas to try (practical list)

Heatmap supervision: predict a small Gaussian heatmap and use soft-argmax — often more stable than direct regression.

Data augmentations: brightness, contrast, small rotation, scaling, cutout. Ensure keypoint transforms are applied consistently.

Loss: mix MSE + MAE, or Huber loss to be robust to outliers.

Model ensembling: average predictions from multiple checkpoints.

Test-time augmentation (flip & average).

Use a small bounding-box margin when cropping (include some context).

Create a per-sample uncertainty estimate (dropout at inference, or predict a variance) to decide when to fall back or ask for a human in the loop.

If stem is occluded in some side cases, add a mask/flag to skip or mark low confidence.

6) Debug checklist (if predictions look off)

Are label px,py correctly normalized relative to crop (0..1)?

Are you using exactly the same normalization/resize pipeline during inference as training? (common bug)

Is the image channel order correct (RGB vs. BGR)?

Are you accidentally using full-image bbox normalization when the model expects crop-normalized coordinates?

Visualize a random set of GT vs Pred on crops to spot biases (systematically off-center, always low, etc).