# YOLOv8s.pt

In [None]:
# Mount Google Drive at /content/drive so the photo folders stored on Drive
# can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import shutil
from pathlib import Path

# Source folder on Google Drive and the Colab-local folders that receive the copies
source_main_folder = '/content/drive/MyDrive/VCOM_TASK3/photos'
destination_images_path = '/content/data/images'
destination_annotations_path = '/content/data/annotations'

# Make sure both destination folders exist (no error if they already do)
for folder in (destination_images_path, destination_annotations_path):
    os.makedirs(folder, exist_ok=True)

# Walk every subfolder: .jpg files go to the images folder, .xml files to the
# annotations folder, everything else is ignored. The subfolder structure is
# flattened, so files are stored under their bare file name.
for root, dirs, files in os.walk(source_main_folder):
    for file in files:
        if file.endswith('.jpg'):
            target_folder = destination_images_path
        elif file.endswith('.xml'):
            target_folder = destination_annotations_path
        else:
            continue
        shutil.copy(os.path.join(root, file), os.path.join(target_folder, file))

# Print up to five entries of each folder as a quick sanity check
for label, folder in (("Images:", destination_images_path), ("Annotations:", destination_annotations_path)):
    print(label, os.listdir(folder)[:5])


In [None]:
# Install the Ultralytics package (it provides the YOLO class used below),
# then import it and call its `checks()` helper.
%pip install ultralytics
import ultralytics
ultralytics.checks()

In [None]:
import random
import xml.etree.ElementTree as ET
from ultralytics import YOLO

# Make sure the train/test output folders exist
for split_dir in (
    '/content/data/train/images',
    '/content/data/train/annotations',
    '/content/data/test/images',
    '/content/data/test/annotations',
):
    os.makedirs(split_dir, exist_ok=True)

# Collect every .jpg that was copied into the images folder
all_images = [name for name in os.listdir(destination_images_path) if name.endswith('.jpg')]

# Random 80/20 split: shuffle in place, then cut at the 80% mark
random.shuffle(all_images)
split_index = int(len(all_images) * 0.8)
train_images, test_images = all_images[:split_index], all_images[split_index:]

# Move each image together with its same-named .xml annotation into the
# folder pair of its split (train first, then test)
for subset, images_dir, annotations_dir in (
    (train_images, '/content/data/train/images/', '/content/data/train/annotations/'),
    (test_images, '/content/data/test/images/', '/content/data/test/annotations/'),
):
    for image in subset:
        xml_file = image.replace('.jpg', '.xml')
        shutil.move(os.path.join(destination_images_path, image), images_dir + image)
        shutil.move(os.path.join(destination_annotations_path, xml_file), annotations_dir + xml_file)

# Function to convert VOC to YOLO format
def convert_voc_to_yolo(voc_folder, yolo_folder, img_folder, class_id=0):
    """Convert Pascal VOC XML annotations into YOLO label files.

    For every ``.xml`` file in ``voc_folder`` a ``.txt`` file with the same
    base name is written to ``yolo_folder``. Each ``<object>`` element becomes
    one line ``class_id x_center y_center width height`` where the four box
    values are divided by the image size read from the ``<size>`` element.
    An annotation without objects produces an empty label file.

    Args:
        voc_folder: Folder containing the VOC ``.xml`` files.
        yolo_folder: Folder that receives the ``.txt`` files; created if it
            does not exist.
        img_folder: Folder with the matching images. Kept for interface
            compatibility; the image size comes from the XML, so the folder
            is not read.
        class_id: Class index written for every object. Defaults to 0, the
            single "lego" class.
    """
    os.makedirs(yolo_folder, exist_ok=True)

    for xml_file in os.listdir(voc_folder):
        if not xml_file.endswith('.xml'):
            continue

        root = ET.parse(os.path.join(voc_folder, xml_file)).getroot()

        # Parse as float: values written like "640.0" would make int() raise.
        width = float(root.find('size/width').text)
        height = float(root.find('size/height').text)

        # splitext swaps only the extension; str.replace would also rewrite
        # any other ".xml" occurring inside the file name.
        label_path = os.path.join(yolo_folder, os.path.splitext(xml_file)[0] + '.txt')

        with open(label_path, 'w') as yolo_file:
            for obj in root.findall('object'):
                bndbox = obj.find('bndbox')
                xmin, ymin, xmax, ymax = (
                    float(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                )

                # Clamp to the image so the normalised values stay within [0, 1].
                xmin, xmax = (min(max(value, 0.0), width) for value in (xmin, xmax))
                ymin, ymax = (min(max(value, 0.0), height) for value in (ymin, ymax))

                x_center = (xmin + xmax) / 2.0 / width
                y_center = (ymin + ymax) / 2.0 / height
                bbox_width = (xmax - xmin) / width
                bbox_height = (ymax - ymin) / height

                yolo_file.write(f"{class_id} {x_center} {y_center} {bbox_width} {bbox_height}\n")

# Generate YOLO label files for both splits (train first, then test)
for annotations_dir, labels_dir, images_dir in (
    ('/content/data/train/annotations', '/content/data/train/labels', '/content/data/train/images'),
    ('/content/data/test/annotations', '/content/data/test/labels', '/content/data/test/images'),
):
    convert_voc_to_yolo(annotations_dir, labels_dir, images_dir)


In [None]:
# Dataset description for YOLO training: the image folders used for training
# and validation, plus the single class name
dataset_yaml = """
train: /content/data/train/images
val: /content/data/test/images

nc: 1  # number of classes
names: ['lego']  # class names
"""

# Write the configuration to disk
with open('/content/dataset.yaml', 'w') as yaml_out:
    yaml_out.write(dataset_yaml)


In [None]:
# Load the 'yolov8s.pt' weights and train on the dataset described by
# /content/dataset.yaml for 20 epochs with a 320-pixel image size
model = YOLO('yolov8s.pt')
train_settings = {'data': '/content/dataset.yaml', 'epochs': 20, 'imgsz': 320}
model.train(**train_settings)


# Faster R-CNN

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import shutil
from pathlib import Path

# Where the photos live on Drive, and where the copies go on the Colab disk
source_main_folder = '/content/drive/MyDrive/VCOM_TASK3/photos'
destination_images_path = '/content/data/images'
destination_annotations_path = '/content/data/annotations'

# Both destination folders must exist before anything is copied
os.makedirs(destination_images_path, exist_ok=True)
os.makedirs(destination_annotations_path, exist_ok=True)

# File extension -> folder that receives files with that extension
destination_by_extension = (
    ('.jpg', destination_images_path),
    ('.xml', destination_annotations_path),
)

# Walk all subfolders and copy every matching file, flattened by file name;
# files with any other extension are skipped
for root, dirs, files in os.walk(source_main_folder):
    for file in files:
        for extension, destination in destination_by_extension:
            if file.endswith(extension):
                shutil.copy(os.path.join(root, file), os.path.join(destination, file))
                break

# Show up to five entries per folder to confirm the copy worked
print("Images:", os.listdir(destination_images_path)[:5])
print("Annotations:", os.listdir(destination_annotations_path)[:5])


Mounted at /content/drive
Images: ['IMG_20201208_032220.jpg', 'IMG_20201211_164736.jpg', 'IMG_20201211_170628.jpg', 'IMG_20201203_091548.jpg', '0_T123_original_3460_1609896039750.jpg']
Annotations: ['IMG_20201208_030904.xml', '0_EQwj_original_60479_1609884613634.xml', 'IMG_20201211_164201.xml', 'IMG_20201204_224650.xml', 'IMG_20201203_091340.xml']


In [None]:
# Install PyTorch and torchvision, which the Faster R-CNN cells import.
%pip install torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import os
import random
import shutil
import xml.etree.ElementTree as ET
from PIL import Image

import torch
import torchvision
import torchvision.transforms as T
from torchvision.models.detection.faster_rcnn import fasterrcnn_resnet50_fpn
from torch.utils.data import Dataset, DataLoader
from torchvision.ops import boxes as box_ops

# Image / annotation folder pair for each split
split_folders = {
    'train': ('/content/data/train/images', '/content/data/train/annotations'),
    'test': ('/content/data/test/images', '/content/data/test/annotations'),
}
for images_dir, annotations_dir in split_folders.values():
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(annotations_dir, exist_ok=True)

# Every .jpg currently sitting in the flat images folder
all_images = [entry for entry in os.listdir(destination_images_path) if entry.endswith('.jpg')]

# Shuffle in place, then take the first 80% for training and the rest for testing
random.shuffle(all_images)
split_index = int(len(all_images) * 0.8)
train_images = all_images[:split_index]
test_images = all_images[split_index:]

# Move each image and its same-named .xml annotation into its split folders
for split_name, split_images in (('train', train_images), ('test', test_images)):
    images_dir, annotations_dir = split_folders[split_name]
    for image in split_images:
        xml_file = image.replace('.jpg', '.xml')
        shutil.move(os.path.join(destination_images_path, image), images_dir + '/' + image)
        shutil.move(os.path.join(destination_annotations_path, xml_file), annotations_dir + '/' + xml_file)

# Define a custom dataset class
class LegoDataset(Dataset):
    """Detection dataset pairing ``.jpg`` images with Pascal VOC ``.xml`` boxes.

    Each item is ``(image, target)``. ``target`` is a dict with the keys
    ``boxes`` (float32, shape ``(N, 4)`` as ``[xmin, ymin, xmax, ymax]``),
    ``labels`` (int64 ones, the single "lego" class), ``image_id``
    (the sample index), ``area`` (box width times height) and ``iscrowd``
    (int64 zeros).

    Args:
        images_folder: Folder containing the ``.jpg`` images.
        annotations_folder: Folder containing one ``.xml`` file per image,
            named after the image.
        transforms: Optional callable applied to the loaded RGB image only;
            the target is returned unchanged.
    """

    def __init__(self, images_folder, annotations_folder, transforms=None):
        self.images_folder = images_folder
        self.annotations_folder = annotations_folder
        self.transforms = transforms
        # Only .jpg entries are samples; any other directory entry (for
        # example a checkpoint folder) would fail when opened as an image.
        self.imgs = sorted(name for name in os.listdir(images_folder) if name.endswith('.jpg'))

    @staticmethod
    def _read_boxes(annotation_path):
        """Return the usable boxes of one VOC file as a float32 ``(N, 4)`` tensor."""
        root = ET.parse(annotation_path).getroot()

        boxes = []
        for obj in root.findall('object'):
            bndbox = obj.find('bndbox')
            # float() also accepts coordinates written as "12.0"
            xmin, ymin, xmax, ymax = (
                float(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
            )
            # Drop boxes without positive width and height; torchvision's
            # detection models refuse such targets during training.
            if xmax > xmin and ymax > ymin:
                boxes.append([xmin, ymin, xmax, ymax])

        # reshape keeps the shape (0, 4) for annotations without objects, so
        # the column indexing used for the area still works.
        return torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)

    def __getitem__(self, idx):
        img_name = self.imgs[idx]
        img_path = os.path.join(self.images_folder, img_name)
        # splitext swaps only the extension, even if ".jpg" occurs elsewhere in the name
        annotation_path = os.path.join(self.annotations_folder, os.path.splitext(img_name)[0] + '.xml')

        img = Image.open(img_path).convert("RGB")

        boxes = self._read_boxes(annotation_path)
        num_boxes = boxes.shape[0]

        target = {
            'boxes': boxes,
            'labels': torch.ones((num_boxes,), dtype=torch.int64),  # all instances are "lego"
            'image_id': torch.tensor([idx]),
            'area': (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            'iscrowd': torch.zeros((num_boxes,), dtype=torch.int64),
        }

        if self.transforms is not None:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.imgs)

# The only preprocessing step is the conversion of the PIL image to a tensor
transform = T.Compose([T.ToTensor()])

# One dataset per split, both using the same transform
train_dataset = LegoDataset(
    '/content/data/train/images',
    '/content/data/train/annotations',
    transforms=transform,
)
test_dataset = LegoDataset(
    '/content/data/test/images',
    '/content/data/test/annotations',
    transforms=transform,
)


def _collate_detection_batch(batch):
    """Turn a list of (image, target) pairs into a (images, targets) tuple pair."""
    return tuple(zip(*batch))


# Loaders: batches of two, two worker processes; only the training data is shuffled
loader_options = dict(batch_size=2, num_workers=2, collate_fn=_collate_detection_batch)
train_data_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_data_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Pre-trained Faster R-CNN whose box predictor is swapped for a two-class head
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 2  # 1 class (lego) + background

# The new head must accept the same number of input features as the old one
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes
)

# Define training function
def train_model(model, data_loader, device, num_epochs=5, lr=0.005, momentum=0.9, weight_decay=0.0005):
    """Train a detection model with SGD and print the loss of every epoch.

    The model is put into training mode and called as
    ``model(images, targets)``; it must return a dict of loss tensors. Their
    sum is back-propagated after every batch.

    Args:
        model: Module to train; it is updated in place.
        data_loader: Iterable yielding ``(images, targets)`` batches, where
            ``images`` is a sequence of tensors and ``targets`` a sequence of
            dicts whose values are tensors.
        device: Device every image and target tensor is moved to.
        num_epochs: Number of passes over ``data_loader`` (default 5).
        lr: SGD learning rate (default 0.005).
        momentum: SGD momentum (default 0.9).
        weight_decay: SGD weight decay (default 0.0005).

    Returns:
        None. One line per epoch is printed with the loss summed over all
        batches of that epoch.
    """
    model.train()
    optimizer = torch.optim.SGD(
        model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay
    )

    for epoch in range(num_epochs):
        epoch_loss = 0
        for images, targets in data_loader:
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # In training mode the model returns its individual loss terms
            loss_dict = model(images, targets)
            losses = sum(loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            epoch_loss += losses.item()

        print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune on the training split
train_model(model, train_data_loader, device)

# Store only the model's state dict
weights_path = 'faster_rcnn_lego.pth'
torch.save(model.state_dict(), weights_path)

# Segmentation

In [None]:
# `os` is needed for os.makedirs below; importing it here lets this section
# run in a fresh runtime without relying on an earlier cell's import.
import os

from google.colab import drive

# force_remount=True remounts Drive even when it is already mounted
drive.mount('/content/drive', force_remount=True)

# Paths to main folder in Google Drive and the destination folder in Colab
# NOTE(review): the other sections read '/content/drive/MyDrive/VCOM_TASK3/photos';
# confirm that the shorter path below is the intended source folder.
segmentation_main_folder = '/content/drive/MyDrive/photos'
segmentation_images_path = '/content/data/segmentation/images'
segmentation_destination_folder = '/content/data/segmentation/segmented_images'

# Create destination directories if they don't exist
os.makedirs(segmentation_images_path, exist_ok=True)
os.makedirs(segmentation_destination_folder, exist_ok=True)

Mounted at /content/drive


In [None]:
import os
import shutil
from pathlib import Path

# Copy every .jpg found below the Drive folder into the local segmentation
# images folder; the subfolder structure is flattened and other files are skipped
for root, dirs, files in os.walk(segmentation_main_folder):
    jpg_names = [name for name in files if name.endswith('.jpg')]
    for file in jpg_names:
        source_path = os.path.join(root, file)
        target_path = os.path.join(segmentation_images_path, file)
        shutil.copy(source_path, target_path)

# Show up to five copied files as confirmation
copied_images = os.listdir(segmentation_images_path)
print("Images:", copied_images[:5])


KeyboardInterrupt: 

In [None]:
# Install Ultralytics and import the libraries used by the segmentation cells.
%pip install ultralytics
from ultralytics import YOLO
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Load the detector from a local weights file.
model = YOLO('/content/YOLO_model.pt')

# Get the first n image file paths
def get_first_n_images(folder_path, n=3):
    """Return the full paths of the first ``n`` ``.jpg`` files in a folder.

    File names are sorted so the selection is the same on every run.

    Args:
        folder_path: Folder to list.
        n: Maximum number of paths to return (default 3). Fewer are returned
            when the folder holds fewer ``.jpg`` files.

    Returns:
        A list of ``folder_path``-joined paths in sorted file-name order.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        # A negative slice bound would silently drop files from the end instead.
        raise ValueError(f"n must be non-negative, got {n}")

    image_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.jpg'))
    return [os.path.join(folder_path, image) for image in image_files[:n]]

# Sample images that the detection + segmentation loop at the end of this cell processes.
first_n_images = get_first_n_images(segmentation_images_path)

# Function to apply GrabCut
def apply_grabcut(image, bounding_box, quantize_colors=False):
    """Segment the content of one bounding box with GrabCut.

    The box region is blurred, segmented with GrabCut (initialised from a
    rectangle one pixel inside the crop), cleaned with a 3x3 morphological
    open and close, and written into a copy of ``image``. Pixels GrabCut
    marks as background are set to zero. The input array is left untouched,
    so several boxes of the same image can be processed independently.

    Args:
        image: ``H x W x 3`` uint8 array. The final conversion treats it as
            RGB.
        bounding_box: Object whose ``tolist()`` yields ``x1, y1, x2, y2``;
            the values are truncated to integers and clamped to the image.
        quantize_colors: When true, the segmented crop is reduced to five
            k-means colours before the morphological clean-up. Defaults to
            False, which leaves the crop's colours as GrabCut produced them.

    Returns:
        A new array of the same shape with the channel order reversed
        (RGB to BGR) and the box region replaced by its segmentation. Boxes
        smaller than 3x3 pixels after clamping are returned unsegmented,
        because the GrabCut rectangle would be empty.
    """
    x1, y1, x2, y2 = map(int, bounding_box.tolist())

    # Clamp to the image; a negative index would wrap around when slicing.
    img_height, img_width = image.shape[:2]
    x1, x2 = (max(0, min(value, img_width)) for value in (x1, x2))
    y1, y2 = (max(0, min(value, img_height)) for value in (y1, y2))

    # Work on a copy so the caller's image is not modified.
    output = image.copy()

    # The GrabCut rectangle is (1, 1, w - 2, h - 2); it needs w, h >= 3.
    if x2 - x1 < 3 or y2 - y1 < 3:
        return np.ascontiguousarray(output[:, :, ::-1])

    # Blur the crop to suppress noise before segmentation
    cropped_image = cv2.GaussianBlur(output[y1:y2, x1:x2], (5, 5), 0)

    # GrabCut segmentation
    mask = np.zeros(cropped_image.shape[:2], np.uint8)
    bgd_model = np.zeros((1, 65), np.float64)
    fgd_model = np.zeros((1, 65), np.float64)
    rect = (1, 1, cropped_image.shape[1] - 2, cropped_image.shape[0] - 2)
    cv2.grabCut(cropped_image, mask, rect, bgd_model, fgd_model, 5, cv2.GC_INIT_WITH_RECT)

    # Mask values 0 and 2 are background / probable background
    foreground = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')
    segmented_crop = cropped_image * foreground[:, :, np.newaxis]

    if quantize_colors:
        # k-means clustering of the crop's pixels into five colours
        pixel_values = np.float32(segmented_crop.reshape((-1, 3)))
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 0.2)
        _, labels, centers = cv2.kmeans(pixel_values, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
        segmented_crop = np.uint8(centers)[labels.flatten()].reshape(segmented_crop.shape)

    # Morphological open then close removes specks and fills small holes
    kernel = np.ones((3, 3), np.uint8)
    segmented_crop = cv2.morphologyEx(segmented_crop, cv2.MORPH_OPEN, kernel)
    segmented_crop = cv2.morphologyEx(segmented_crop, cv2.MORPH_CLOSE, kernel)

    output[y1:y2, x1:x2] = segmented_crop

    return cv2.cvtColor(output, cv2.COLOR_RGB2BGR)


def detect_and_segment(image_path, model):
    """Detect objects in an image and segment each detection with GrabCut.

    Detection runs on the image as loaded by ``cv2.imread``. Segmentation
    runs on a contrast-enhanced RGB version (histogram equalisation of the
    LAB lightness channel), because ``apply_grabcut`` treats its input as RGB.

    Args:
        image_path: Path of the image file to process.
        model: Callable detector; ``model(image)[0].boxes.xyxy`` must yield
            one ``x1, y1, x2, y2`` box per detection.

    Returns:
        A list with one ``apply_grabcut`` result per detected box, in
        detection order. The list is empty when nothing is detected.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None
        raise FileNotFoundError(f"Could not read image: {image_path}")

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Improve the contrast of the image
    image_lab = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2LAB)
    l, a, b = cv2.split(image_lab)
    l = cv2.equalizeHist(l)
    image_lab = cv2.merge((l, a, b))
    image_rgb = cv2.cvtColor(image_lab, cv2.COLOR_LAB2RGB)

    results = model(image)

    # apply_grabcut expects RGB and writes the segmented region into the array
    # it receives, so every box gets its own copy of the enhanced RGB image.
    segmented_images = [apply_grabcut(image_rgb.copy(), bbox) for bbox in results[0].boxes.xyxy]

    return segmented_images

# Run detection + segmentation on every sample photo and store one JPEG per
# detected box, named segmented_<photo number>_<box number>.jpg (both 1-based)
for photo_number, photo_path in enumerate(first_n_images, start=1):
    segmented_images = detect_and_segment(photo_path, model)

    for box_number, segmented_image in enumerate(segmented_images, start=1):
        segmented_image_filename = os.path.join(
            segmentation_destination_folder,
            f"segmented_{photo_number}_{box_number}.jpg",
        )
        cv2.imwrite(segmented_image_filename, segmented_image)



0: 640x480 1 bowl, 570.5ms
Speed: 6.5ms preprocess, 570.5ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 480)
