In [1]:
from ultralytics import YOLO
import cv2
import os
import pandas as pd

View Ultralytics Settings with 'yolo settings' or at 'C:\Users\Rajarshi\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [2]:
# Load models
# Both YOLO models are instantiated once here and reused by the helper
# functions below. The weight paths are relative ("../Models/..."), so they
# are resolved against the process's current working directory.
model_ped = YOLO("../Models/computer_vision_models/pedestrian_detect_model_v8n_24_jul.pt")
model_apparel = YOLO("../Models/computer_vision_models/apparel2_v8n_model_13_aug_2025.pt")

def detect_pedestrian(image):
    """Run the module-level pedestrian model on ``image``.

    Args:
        image: Input forwarded unchanged to ``model_ped``.

    Returns:
        Whatever ``model_ped(image)`` returns (the raw model results).
    """
    pedestrian_results = model_ped(image)
    return pedestrian_results

def detect_apparel(image):
    """Run the module-level apparel model on ``image``.

    Args:
        image: Input forwarded unchanged to ``model_apparel``.

    Returns:
        Whatever ``model_apparel(image)`` returns (the raw model results).
    """
    apparel_results = model_apparel(image)
    return apparel_results

In [3]:
def pedestrian_crop(pedestrian_results, original_image, apparel_detector=None):
    """
    Crop every detected pedestrian out of an image and run apparel detection
    on each crop.

    Args:
        pedestrian_results: Iterable of detection results. Each item must
            expose ``boxes.xyxy`` supporting ``.cpu().numpy()`` and yielding
            one ``(x1, y1, x2, y2)`` row per box.
        original_image: Array indexed as ``image[y, x]`` (rows first) whose
            ``shape[:2]`` is ``(height, width)``.
        apparel_detector: Optional callable taking a cropped image and
            returning an iterable of results, each with ``boxes`` (items
            exposing ``cls[0]`` and ``conf[0]``) and a ``names`` mapping.
            Defaults to the module-level ``detect_apparel``.

    Returns:
        dict mapping a sequential pedestrian id (0, 1, ...) to
        ``{'bbox': (x1, y1, x2, y2), 'apparel': [(class_name, confidence), ...]}``.
        Boxes are truncated to ints and clamped to the image bounds; boxes
        whose crop is empty are skipped and do not consume an id.
    """
    if apparel_detector is None:
        apparel_detector = detect_apparel

    img_h, img_w = original_image.shape[:2]

    def _clamp(value, upper):
        # Keep a coordinate inside [0, upper].
        return max(0, min(value, upper))

    all_detections = {}

    for result in pedestrian_results:
        for box in result.boxes.xyxy.cpu().numpy():
            x1, y1, x2, y2 = map(int, box)

            # Clamp before slicing: a negative index would otherwise be
            # interpreted by NumPy as "from the end", yielding an empty or
            # wrong crop instead of the visible part of the pedestrian.
            x1, x2 = _clamp(x1, img_w), _clamp(x2, img_w)
            y1, y2 = _clamp(y1, img_h), _clamp(y2, img_h)

            cropped_pedestrian = original_image[y1:y2, x1:x2]
            if cropped_pedestrian.size == 0:
                continue

            # Flatten apparel detections into (class_name, confidence) pairs.
            apparel_info = [
                (a_result.names[int(box_a.cls[0])], float(box_a.conf[0]))
                for a_result in apparel_detector(cropped_pedestrian)
                for box_a in a_result.boxes
            ]

            # Ids are assigned only to pedestrians that produced a crop.
            all_detections[len(all_detections)] = {
                'bbox': (x1, y1, x2, y2),
                'apparel': apparel_info
            }

    return all_detections

In [4]:
if __name__ == "__main__":
    # End-to-end demo: detect pedestrians in one image, show the annotated
    # frame, then print a table of apparel detected on each pedestrian.
    image_path = r"C:\Users\Rajarshi\Pictures\capstone_test_images\test1.jpg"
    image = cv2.imread(image_path)

    # cv2.imread signals failure by returning None rather than raising, so
    # fail here with a clear message instead of deep inside the model call.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Detect pedestrians
    ped_results = detect_pedestrian(image)

    # Annotated image with pedestrian bboxes
    annotated_image = ped_results[0].plot()  # YOLO's built-in plotting

    # Crop pedestrians and run apparel detection
    detections = pedestrian_crop(ped_results, image)

    # Display annotated image (blocks until a key is pressed in the window)
    cv2.imshow("Pedestrian Detection", annotated_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    # Prepare table for apparel detections: one row per (pedestrian, item)
    table_data = [
        {
            "Pedestrian ID": pid,
            "Apparel": apparel_item,
            "Confidence": round(conf, 3)
        }
        for pid, info in detections.items()
        for apparel_item, conf in info['apparel']
    ]

    # Convert to pandas DataFrame for a clean table
    df = pd.DataFrame(table_data)
    if not df.empty:
        print("\nDetected Apparel per Pedestrian:\n")
        print(df.to_string(index=False))
    else:
        print("\nNo apparel detected.\n")



0: 448x640 14 Pedestrians, 129.2ms
Speed: 21.4ms preprocess, 129.2ms inference, 204.7ms postprocess per image at shape (1, 3, 448, 640)

0: 640x288 1 pant, 128.2ms
Speed: 2.5ms preprocess, 128.2ms inference, 77.5ms postprocess per image at shape (1, 3, 640, 288)

0: 640x224 1 Tshirt, 1 pant, 109.3ms
Speed: 3.7ms preprocess, 109.3ms inference, 2.9ms postprocess per image at shape (1, 3, 640, 224)

0: 640x224 1 skirt, 10.4ms
Speed: 2.2ms preprocess, 10.4ms inference, 3.4ms postprocess per image at shape (1, 3, 640, 224)

0: 640x192 (no detections), 108.9ms
Speed: 1.9ms preprocess, 108.9ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 192)

0: 640x192 (no detections), 26.5ms
Speed: 1.9ms preprocess, 26.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 192)

0: 640x192 2 shirts, 1 skirt, 17.4ms
Speed: 2.1ms preprocess, 17.4ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 192)

0: 640x224 (no detections), 12.5ms
Speed: 1.6ms preprocess, 12.5ms infe