In [None]:
import cv2
import numpy as np
from ultralytics import YOLO

# --- CONFIG ---
NUM_ROWS, NUM_COLS = 3, 3  # Grid config
CAM_SOURCE = 1  # or your IP stream e.g. 'http://192.168.x.x:4747/video'

# --- Load YOLOv8-Segmentation Model ---    
model = YOLO("yolov8n-seg.pt")  # Replace with custom if trained on floor classes

# --- Utils ---
def draw_grid(image, mask, rows, cols):
    zones = []
    x, y, w, h = cv2.boundingRect(mask)
    cell_w, cell_h = w // cols, h // rows

    for i in range(rows):
        for j in range(cols):
            x1, y1 = x + j * cell_w, y + i * cell_h
            x2, y2 = x1 + cell_w, y1 + cell_h
            zones.append(((x1, y1), (x2, y2)))
            cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)

    return zones

# --- Main Loop ---
cap = cv2.VideoCapture(CAM_SOURCE)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame)[0]
    annotated = frame.copy()

    # --- Floor Mask Logic ---
    for i, seg in enumerate(results.masks.data if results.masks else []):
        mask = seg.cpu().numpy().astype(np.uint8) * 255
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area > 10000:  # Only large masks
                cv2.drawContours(annotated, [cnt], -1, (0, 255, 0), 2)
                zones = draw_grid(annotated, cnt, NUM_ROWS, NUM_COLS)

    # --- Show Output ---
    cv2.imshow("VisionixAI Zone Detection", annotated)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



0: 480x640 (no detections), 155.4ms
Speed: 6.4ms preprocess, 155.4ms inference, 6.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 102.4ms
Speed: 1.9ms preprocess, 102.4ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 91.3ms
Speed: 1.4ms preprocess, 91.3ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 78.2ms
Speed: 1.1ms preprocess, 78.2ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 77.3ms
Speed: 0.9ms preprocess, 77.3ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 99.1ms
Speed: 1.0ms preprocess, 99.1ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 83.9ms
Speed: 1.0ms preprocess, 83.9ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 86.9ms
Speed: 1.3ms preprocess, 86.9

In [27]:
# Install dependencies (run in your terminal first)
# pip install transformers torchvision opencv-python

from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
import torch
import cv2
from PIL import Image
import numpy as np

# Load the SegFormer model & extractor (ADE20K pretrained)
extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

# Load a test frame from your webcam
cap = cv2.VideoCapture(1)  # Or your IP cam

while True:
    ret, frame = cap.read()
    if not ret:
        break

    img_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = extractor(images=img_pil, return_tensors="pt")
    outputs = model(**inputs)
    seg = outputs.logits.argmax(dim=1).squeeze().numpy()

    # ADE20K class 3 is usually "floor" (you can change this based on output)
    floor_mask = (seg == 3).astype(np.uint8) * 255
    floor_mask_colored = cv2.merge([floor_mask, np.zeros_like(floor_mask), np.zeros_like(floor_mask)])

    # Resize mask to match frame size
    floor_mask_colored_resized = cv2.resize(floor_mask_colored, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)

    overlay = cv2.addWeighted(frame, 0.7, floor_mask_colored_resized, 0.3, 0)

    cv2.imshow("SegFormer Floor Detection", overlay)

    if cv2.waitKey(1) == 27:
        break

cap.release()
cv2.destroyAllWindows()


In [23]:
import cv2
import numpy as np
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from PIL import Image
import torch

# 1. Load segmentation model
# This model is pre-trained on the ADE20K dataset, which includes a 'floor' class (typically class ID 3).
extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model.eval() # Set the model to evaluation mode (disables dropout, etc.)

# 2. Set up video capture
# cap = cv2.VideoCapture(0) # Use 0 for the default webcam
cap = cv2.VideoCapture(1) # Use 1 for an external webcam, if available

if not cap.isOpened():
    print("Error: Could not open video stream. Check camera index or connection.")
    exit()

def divide_floor_into_zones(floor_mask, num_zones=5):
    """
    Divide the largest detected floor area into horizontal zones.

    Args:
        floor_mask (np.array): A binary mask (255 for floor, 0 for non-floor).
        num_zones (int): The number of horizontal zones to divide the floor into.

    Returns:
        list: A list of tuples, where each tuple contains (top_left, bottom_right)
              coordinates for a zone. Returns an empty list if no floor is detected.
    """
    zones = []
    # Find contours of the detected floor area
    contours, _ = cv2.findContours(floor_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        # Find the largest contour, assuming it's the main floor
        largest_contour = max(contours, key=cv2.contourArea)
        
        # Get the bounding rectangle for the largest floor contour
        x, y, w, h = cv2.boundingRect(largest_contour)
        
        # Ensure the height is sufficient for division
        if h < num_zones: # Prevent division by zero or overly thin zones
            return []

        # Calculate the height of each horizontal zone
        zone_height = h // num_zones

        for i in range(num_zones):
            # Calculate top-left and bottom-right corners for each zone
            top_left = (x, y + i * zone_height)
            bottom_right = (x + w, y + (i + 1) * zone_height)
            
            # Ensure the last zone extends to the full height of the bounding box
            if i == num_zones - 1:
                bottom_right = (x + w, y + h)

            zones.append((top_left, bottom_right))
    return zones

## Main Loop for Video Processing ##
while True:
    ret, frame = cap.read() # Read a frame from the camera
    if not ret:
        print("Failed to grab frame, exiting...")
        break

    # Convert the OpenCV BGR frame to PIL RGB image for the model
    img_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    
    # Preprocess the image for the SegFormer model
    inputs = extractor(images=img_pil, return_tensors="pt")
    
    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the semantic segmentation map (class ID for each pixel)
    # The output logits are resized to the original input image size
    seg = outputs.logits.argmax(dim=1).squeeze().numpy()
    
    # Create a binary mask for the 'floor' class
    # ADE20K dataset: class ID 3 typically corresponds to 'floor'
    floor_mask = (seg == 3).astype(np.uint8) * 255 
    
    # Resize the floor mask back to the original frame dimensions for drawing
    floor_mask_resized = cv2.resize(floor_mask, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)

    # Divide the detected floor into zones
    zones = divide_floor_into_zones(floor_mask_resized, num_zones=5)

    # Create an overlay to draw zones
    overlay = frame.copy()
    
    # Draw each zone on the overlay
    for idx, (tl, br) in enumerate(zones):
        # Draw a green rectangle for the zone
        cv2.rectangle(overlay, tl, br, (0, 255, 0), 2) # Green color, 2 pixels thick
        
        # Put a label for the zone
        label_pos = (tl[0] + 10, tl[1] + 30)
        cv2.putText(overlay, f"Zone {idx+1}", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    # Blend the original frame with the overlay to show zones
    combined = cv2.addWeighted(frame, 0.7, overlay, 0.3, 0) # 70% original, 30% overlay

    # Display the result
    cv2.imshow("Floor Zones Detection", combined)

    # Break the loop if 'ESC' key is pressed
    if cv2.waitKey(1) == 27: 
        break

# Release the video capture object and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()

In [24]:
import cv2
import numpy as np
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from PIL import Image
import torch

# Load segmentation model
extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model.eval()

# Set video capture
cap = cv2.VideoCapture(1)

def divide_floor_into_zones(floor_mask, num_zones=5):
    """Divide floor area into horizontal zones inside the floor contour."""
    zones = []
    contours, _ = cv2.findContours(floor_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        largest_contour = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest_contour)
        zone_height = h // num_zones
        for i in range(num_zones):
            top_left = (x, y + i * zone_height)
            bottom_right = (x + w, y + (i + 1) * zone_height)
            zones.append((top_left, bottom_right))
    return zones

while True:
    ret, frame = cap.read()
    if not ret:
        break

    img_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = extractor(images=img_pil, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    seg = outputs.logits.argmax(dim=1).squeeze().numpy()
    floor_mask = (seg == 3).astype(np.uint8) * 255  # Class 3 = floor
    floor_mask_resized = cv2.resize(floor_mask, (frame.shape[1], frame.shape[0]), interpolation=cv2.INTER_NEAREST)

    zones = divide_floor_into_zones(floor_mask_resized)

    overlay = frame.copy()
    for idx, (tl, br) in enumerate(zones):
        cv2.rectangle(overlay, tl, br, (0, 255, 0), 2)
        label_pos = (tl[0] + 10, tl[1] + 30)
        cv2.putText(overlay, f"Zone {idx+1}", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    combined = cv2.addWeighted(frame, 0.7, overlay, 0.3, 0)
    cv2.imshow("Floor Zones", combined)

    if cv2.waitKey(1) == 27:  # ESC to break
        break

cap.release()
cv2.destroyAllWindows()