In [None]:
import cv2
import torch
from ultralytics import YOLO
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Realtime webcam demo: detect objects with the YOLO ONNX model, caption each
# detected crop with BLIP, and draw the box plus caption on the live frame.
# Press "q" in the preview window to stop.

# Load YOLO26 ONNX model (use the exported ONNX file).
# TODO: machine-specific absolute path; point this at your own copy of the model.
MODEL_PATH = "C:\\Users\\ruthv\\OneDrive\\Desktop\\Summer project\\ROD_IDS\\Model_Training_Testing\\yolo26n.onnx"
detector = YOLO(MODEL_PATH)

# Load BLIP caption model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# Run captioning on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
caption_model.to(device)

# Open webcam (device index 0)
cap = cv2.VideoCapture(0)

# Not read anywhere in this cell; kept so the name stays defined for other cells.
frame_count = 0

try:
    if not cap.isOpened():
        # The loop below exits on the first failed read; say why instead of
        # finishing silently.
        print("Could not open webcam (device 0); no frames to process.")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_h, frame_w = frame.shape[:2]

        # Run YOLO detection (first result corresponds to this single frame)
        results = detector(frame)[0]

        for box in results.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])

            # Clamp to the frame: a negative index would wrap around in NumPy
            # slicing and silently crop the wrong region.
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, frame_w), min(y2, frame_h)

            # Crop detected object; skip degenerate (empty) boxes
            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                continue

            # OpenCV frames are BGR; convert to RGB before handing to PIL/BLIP.
            pil_image = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

            # Generate caption
            inputs = processor(pil_image, return_tensors="pt").to(device)
            output = caption_model.generate(**inputs, max_new_tokens=20)
            caption = processor.decode(output[0], skip_special_tokens=True)

            # Draw box + description. Keep the text baseline inside the frame
            # when the box touches the top edge.
            text_y = max(y1 - 10, 15)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, caption, (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        cv2.imshow("Realtime Detection + Description", frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always free the camera and close the preview window, including when the
    # kernel is interrupted or inference raises mid-loop.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
from ultralytics import YOLO

# Training hyperparameters: dataset config file, epoch count, input image size
train_options = {"data": "kitti.yaml", "epochs": 100, "imgsz": 640}

# Start from the pretrained YOLO26 weights and train with the options above
model = YOLO("yolo26n.pt")
results = model.train(**train_options)

In [None]:
from ultralytics import YOLO

# Load the trained weights (best.pt)
model = YOLO("runs/detect/train/weights/best.pt")

# Options for the ONNX export
export_options = dict(
    format="onnx",
    imgsz=640,
    opset=12,       # ONNX opset version
    dynamic=True,   # dynamic input shape
    simplify=True,  # simplify ONNX graph
)

model.export(**export_options)