In [2]:
import torch
from torchvision import transforms
import torchvision
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
import json

In [4]:
# Load the 'yolov5s' model from the ultralytics/yolov5 torch.hub repository.
# detect_objects() calls this module-level model.
yolov5_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Mask R-CNN with a ResNet-50 FPN backbone and pretrained weights; segment_objects() calls it.
# NOTE(review): `pretrained=True` is the legacy torchvision argument (newer releases take
# `weights=`) — confirm against the installed torchvision version.
mask_rcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
# Put the model in evaluation (inference) mode.
mask_rcnn_model.eval()

def load_image(image_path):
    """Read an image from disk and return it in both BGR and RGB channel order.

    Args:
        image_path: Path passed to ``cv2.imread``.

    Returns:
        Tuple ``(bgr, rgb)``: the array exactly as OpenCV read it, and the
        same image converted with ``cv2.COLOR_BGR2RGB``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
    """
    bgr = cv2.imread(image_path)
    if bgr is not None:
        return bgr, cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    raise FileNotFoundError(f"Image file not found at {image_path}")

def detect_objects(image):
    """Run the module-level YOLOv5 model on ``image`` and return its detections.

    Args:
        image: Image passed straight to ``yolov5_model``.

    Returns:
        NumPy array built from ``results.xyxy[0]`` after moving it to the CPU.
        Callers in this file read columns 0-3 as box corners and column 5 as
        the class index.
    """
    detections = yolov5_model(image).xyxy[0]
    return detections.cpu().numpy()

def load_coco_annotations(annotation_file):
    """Load an annotation JSON file and return the parsed object.

    Args:
        annotation_file: Path to the JSON annotation file.

    Returns:
        Whatever ``json.load`` produces for the file (other functions in this
        file index it with the ``'annotations'`` key).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Read as UTF-8 explicitly: the default for open() is the platform locale
    # encoding, which mis-decodes non-ASCII text on some systems.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        return json.load(f)

def segment_objects(image, boxes, target_size=(224, 224)):
    """Run Mask R-CNN on each box crop and return one mask per box.

    Args:
        image: H x W x 3 image array (the RGB array from ``load_image``).
            Pixel values are taken to be in the 0..255 range; non-uint8 input
            is clipped to that range and cast to uint8.
        boxes: Iterable of box rows whose first four entries are
            ``x1, y1, x2, y2`` in pixels. Extra entries are ignored.
        target_size: Size every crop is resized to before it is fed to the
            network (passed to ``torchvision.transforms.Resize``).

    Returns:
        List with exactly one ``(x1, y1, x2, y2, mask)`` tuple per input box,
        in input order. The coordinates are ints clamped to the image bounds.
        ``mask`` is a uint8 array (0..255) at the resolution of the *resized
        crop*, not of the box, so it must be rescaled before being overlaid
        on the box region. The mask is all zeros when the box is empty after
        clamping or when the network returns no instance for the crop.
    """
    pixels = np.asarray(image)
    if pixels.dtype != np.uint8:
        pixels = np.clip(pixels, 0, 255).astype(np.uint8)
    img_h, img_w = pixels.shape[:2]

    # Feed the uint8 crop straight to ToPILImage: how float ndarrays are
    # handled there differs between torchvision versions, and the former
    # float round trip lost precision through truncation.
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(target_size, interpolation=torchvision.transforms.InterpolationMode.BILINEAR),
        torchvision.transforms.ToTensor()
    ])

    # Shape used for the all-zero mask of a degenerate box.
    if isinstance(target_size, (tuple, list)):
        blank_shape = tuple(int(side) for side in target_size)
    else:
        blank_shape = (int(target_size), int(target_size))

    masks = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box[:4])
        # Clamp to the image so negative values cannot wrap around in slicing.
        x1, x2 = max(0, min(x1, img_w)), max(0, min(x2, img_w))
        y1, y2 = max(0, min(y1, img_h)), max(0, min(y2, img_h))

        if x2 <= x1 or y2 <= y1:
            # Nothing to crop; keep the entry so masks stay aligned with boxes.
            masks.append((x1, y1, x2, y2, np.zeros(blank_shape, dtype=np.uint8)))
            continue

        crop = np.ascontiguousarray(pixels[y1:y2, x1:x2])
        batch = transform(crop).unsqueeze(0)
        with torch.no_grad():
            output = mask_rcnn_model(batch)

        instance_masks = output[0]['masks']
        if instance_masks.shape[0] == 0:
            # The network found no instance in this crop.
            mask = np.zeros(tuple(batch.shape[-2:]), dtype=np.uint8)
        else:
            # First instance only — presumably the highest-scoring one; confirm
            # against the torchvision detection output ordering.
            mask = instance_masks[0, 0].mul(255).byte().cpu().numpy()
        masks.append((x1, y1, x2, y2, mask))

    return masks

def extract_vehicle_heights(annotations, category_id):
    """Collect the annotations of one category that carry a real-world height.

    Args:
        annotations: Mapping with an ``'annotations'`` list; each entry is
            read for ``'category_id'``, ``'image_id'``, ``'bbox'`` and the
            optional ``'height'`` key.
        category_id: Only entries whose ``'category_id'`` equals this value
            are considered.

    Returns:
        ``pandas.DataFrame`` with one row per kept entry and the columns
        ``image_id``, ``bbox`` and ``height_real``. Entries whose ``'height'``
        is missing or ``None`` are dropped.
    """
    of_category = (
        entry for entry in annotations['annotations']
        if entry['category_id'] == category_id
    )
    fields = (
        (entry['image_id'], entry['bbox'], entry.get('height', None))
        for entry in of_category
    )
    records = [
        {'image_id': image_id, 'bbox': bbox, 'height_real': height_real}
        for image_id, bbox, height_real in fields
        if height_real is not None
    ]
    return pd.DataFrame(records)

def calculate_pixel_heights(image_path, vehicle_data):
    """Pair each annotated vehicle's pixel height with its real-world height.

    The pixel height is the vertical extent of the annotation's bounding box.
    ``bbox`` is read as COCO-style ``[x_min, y_min, width, height]``.

    Fixes relative to the previous version:
      * the real height is read from the ``height_real`` column (the column
        ``extract_vehicle_heights`` produces) instead of the non-existent
        ``height`` column;
      * ``bbox`` is no longer unpacked as ``x1, y1, x2, y2``;
      * the image is read once instead of once per row;
      * the per-row segmentation call is gone, because its mask was discarded
        and the height came from the box corners only.

    Args:
        image_path: Path of the image the annotations belong to. It is only
            loaded to keep the ``FileNotFoundError`` behaviour for a bad path.
        vehicle_data: DataFrame with ``image_id``, ``bbox`` and
            ``height_real`` columns.

    Returns:
        DataFrame with the columns ``image_id``, ``height_pixels`` and
        ``height_real`` (present even when there are no rows).

    Raises:
        FileNotFoundError: If there is at least one row and the image cannot
            be read.
    """
    # NOTE(review): every row is measured regardless of its image_id, although a
    # single image_path is supplied — confirm the annotations all belong to it.
    if len(vehicle_data) > 0:
        load_image(image_path)  # validation only; the pixels are not needed

    records = []
    for _, row in vehicle_data.iterrows():
        _x_min, y_min, _box_w, box_h = row['bbox']
        # Truncate the corners to ints, matching how detection boxes are
        # converted elsewhere in this file.
        y1 = int(y_min)
        y2 = int(y_min + box_h)
        records.append({
            'image_id': row['image_id'],
            'height_pixels': y2 - y1,
            'height_real': row['height_real']
        })

    return pd.DataFrame(records, columns=['image_id', 'height_pixels', 'height_real'])

def train_correction_factor_model(vehicle_data):
    """Fit a linear model that predicts real-height-per-pixel from pixel height.

    The target is ``height_real / height_pixels`` and the single feature is
    ``height_pixels``.

    Args:
        vehicle_data: DataFrame with ``height_pixels`` and ``height_real``
            columns. It is not modified; the correction factor is computed on
            an internal copy.

    Returns:
        A fitted ``LinearRegression`` whose only feature is named
        ``height_pixels``.

    Raises:
        ValueError: If no row has a positive pixel height and a real height.
    """
    samples = vehicle_data[['height_pixels', 'height_real']].astype(float).dropna()
    # A zero or negative pixel height would give an infinite or meaningless factor.
    samples = samples[samples['height_pixels'] > 0]
    if samples.empty:
        raise ValueError(
            "No samples with a positive pixel height to train the correction-factor model on"
        )

    samples = samples.assign(
        correction_factor=samples['height_real'] / samples['height_pixels']
    )
    X = samples[['height_pixels']]
    y = samples['correction_factor']

    if len(samples) >= 2:
        # Same 80/20 split as before; the held-out part is not evaluated here.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
    else:
        # train_test_split cannot split a single sample; fit on what there is.
        X_train, y_train = X, y

    model = LinearRegression()
    model.fit(X_train, y_train)

    return model

def draw_boxes_and_masks(image, boxes, masks, model):
    """Overlay each mask in green and estimate every vehicle's real height.

    Args:
        image: H x W x 3 uint8 image to draw on. The input array itself is
            not written to; blended copies are produced.
        boxes: Detection boxes. Only used to pair up with ``masks`` (iteration
            stops at the shorter of the two).
        masks: Sequence of ``(x1, y1, x2, y2, mask)`` tuples. ``mask`` may
            have any resolution; it is rescaled to the box region before it
            is thresholded at 127.
        model: Fitted regressor exposing ``predict``; given the pixel height
            it returns the correction factor (real height per pixel).

    Returns:
        Tuple ``(image, heights)``: the image with all overlays blended in,
        and the list of estimated real heights in box order.
    """
    img_h, img_w = image.shape[:2]
    # If the model was fitted on a DataFrame, predict with the same column
    # names to avoid scikit-learn's feature-name mismatch warning.
    feature_names = getattr(model, 'feature_names_in_', None)
    heights = []

    for _box, (x1, y1, x2, y2, mask) in zip(boxes, masks):
        # Clamp the drawing region so negative values cannot wrap in slicing.
        left, right = max(0, min(x1, img_w)), max(0, min(x2, img_w))
        top, bottom = max(0, min(y1, img_h)), max(0, min(y2, img_h))
        region_h, region_w = bottom - top, right - left

        if region_h > 0 and region_w > 0 and mask.size > 0:
            if mask.shape[:2] != (region_h, region_w):
                # The mask may be at another resolution than the box; a boolean
                # index must match the region exactly. cv2 takes (width, height).
                mask = cv2.resize(mask, (region_w, region_h), interpolation=cv2.INTER_LINEAR)

            colored_mask = np.zeros_like(image, dtype=np.uint8)
            colored_mask[top:bottom, left:right][mask > 127] = (0, 255, 0)

            image = cv2.addWeighted(image, 1, colored_mask, 0.5, 0)

        vehicle_height_pixels = y2 - y1

        if feature_names is not None:
            features = pd.DataFrame([[vehicle_height_pixels]], columns=list(feature_names))
        else:
            features = [[vehicle_height_pixels]]
        correction_factor = model.predict(features)[0]
        vehicle_height_real = vehicle_height_pixels * correction_factor
        heights.append(vehicle_height_real)

    return image, heights

def main(image_path, annotation_file, vehicle_category_id,
         vehicle_class_ids=(2, 5, 7), output_path='output_image_with_masks.jpg'):
    """Train the height-correction model, then detect, segment and annotate vehicles.

    Steps: load the annotations, build the pixel-height / real-height table,
    fit the correction-factor model, run detection and segmentation on the
    image, blend the masks in and write the result to disk.

    Args:
        image_path: Image to process.
        annotation_file: JSON annotation file with real-world heights.
        vehicle_category_id: Annotation ``category_id`` that marks vehicles.
        vehicle_class_ids: Detector class indices (column 5 of each detection)
            to keep. The default ``(2, 5, 7)`` is the previously hard-coded
            set — presumably car / bus / truck in the detector's label set;
            confirm against the model's class names.
        output_path: Where the annotated image is written.

    Returns:
        The annotated image in BGR channel order (the same array that is
        written to ``output_path``).
    """
    annotations = load_coco_annotations(annotation_file)
    vehicle_data = extract_vehicle_heights(annotations, vehicle_category_id)
    pixel_height_data = calculate_pixel_heights(image_path, vehicle_data)
    model = train_correction_factor_model(pixel_height_data)

    _, image_rgb = load_image(image_path)
    wanted_classes = set(vehicle_class_ids)
    vehicle_boxes = [box for box in detect_objects(image_rgb) if int(box[5]) in wanted_classes]
    masks = segment_objects(image_rgb, vehicle_boxes)
    # The estimated heights are computed but not part of this function's return value.
    image_with_masks, _heights = draw_boxes_and_masks(image_rgb, vehicle_boxes, masks, model)

    output_image = cv2.cvtColor(image_with_masks, cv2.COLOR_RGB2BGR)
    cv2.imwrite(output_path, output_image)

    return output_image

Using cache found in /home/vscode/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-5-22 Python-3.11.9 torch-2.2.1 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
