Create a Dataset for @HOME2024 using GroundingDino and SAM

In [1]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Defaulting to user installation because normal site-packages is not writeable
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook qtconsole run server
troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [2]:
import os
import random
import json
import numpy as np
import cv2
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageFont, ExifTags
from pycocotools import mask
import json
import yaml
import csv
import torch
import matplotlib.pyplot as plt
from pathlib import Path
import ultralytics
import time
import imutils
import argparse
from tqdm.notebook import tqdm

#grounding imports----------------

import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span


In [3]:
#resize images in a folder to a specific size
pathtofiles = "datasets/bags"
pathtoimage = [f.path for f in os.scandir(pathtofiles) if f.is_dir()]
if len(pathtoimage) == 0:
    print("No folders found in the directory")
new_width= 720

resize_progress = tqdm(total=len(pathtoimage), desc="Resizing images")
for filepath in pathtoimage:
    folder = filepath + "/"
    for filename in os.listdir(folder):
        img = Image.open(folder + filename)
        # Correct orientation
        for orientation in ExifTags.TAGS.keys() : 
            if ExifTags.TAGS[orientation]=='Orientation' : break 
        
        exif=dict(img._getexif().items())

        if   exif[orientation] == 3 : 
            img=img.rotate(180, expand=True)
        elif exif[orientation] == 6 : 
            img=img.rotate(270, expand=True)
        elif exif[orientation] == 8 : 
            img=img.rotate(90, expand=True)
        
        # Resize
        aspect_ratio = img.height / img.width
        new_height = int(new_width * aspect_ratio)
        img = img.resize((new_width, new_height))
        img.save(folder + filename)
        resize_progress.update(1)

Resizing images:   0%|          | 0/4 [00:00<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'items'

Auto label con Segment anyting y modelo de YOLOv8

In [3]:
# Setup SAM
import sys
sys.path.append("..")
from segment_anything import sam_model_registry, SamPredictor

# SELECT MODEL
sam_model = "b"

#use sam model
#wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
#wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth
if sam_model =="h":
  sam_checkpoint = "sam_vit_h_4b8939.pth"
  model_type = "vit_h"
elif sam_model =="l":
  sam_checkpoint = "sam_vit_l_0b3195.pth"
  model_type = "vit_l"
elif sam_model =="b":
  sam_checkpoint = "sam_vit_b_01ec64.pth"
  model_type = "vit_b"

In [4]:
# DINO setup
# cfg
config_file = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"  # change the path of the model config file

#wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
checkpoint_path = "GroundingDINO/groundingdino_swint_ogc.pth"  # change the path of the model
text_prompt = "bag"

def load_image(image_path):

    image_pil = Image.open(image_path).convert("RGB")  # load image

    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)  # 3, h, w
    return image_pil, image


def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    args = SLConfig.fromfile(model_config_path)
    args.device = "cuda" if not cpu_only else "cpu"
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    # print(load_res)
    _ = model.eval()
    return model


def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        # print("Running model...")
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)

    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4

        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        positive_maps = create_positive_map_from_span(
            model.tokenizer(text_prompt),
            token_span=token_spans
        ).to(image.device) # n_phrase, 256

        logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                all_phrases.extend([phrase for _ in range(len(filt_mask))])
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases


    return boxes_filt, pred_phrases

### RUNNING DINO + SEGMENTATION ###

In [5]:
pathtofiles = "datasets/bags" #path to images to process
resultspath = "datasets/bags_precut" #path to save results all ready processed and segmented images
if not os.path.exists(resultspath):
    os.makedirs(resultspath)


In [6]:
output_dir = resultspath
box_threshold = 0.3
text_threshold = 0.25
token_spans = None

device = "cuda"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)
predictor = SamPredictor(sam)

images=[]
annotations=[]
categories=[]

img_id=0
anno_id=0

#check if results directory exists, else create it
if not os.path.exists(resultspath):
  os.makedirs(resultspath)

#make a list of all the directories in the path
pathtoimage = [f.path for f in os.scandir(pathtofiles) if f.is_dir()]
print(f"Found {len(pathtoimage)} directories in the path")

#pathtoimage = os.listdir(pathtofiles)

for filepath in pathtoimage:
    imgPaths = os.listdir(filepath)
    # print(imgPaths)

    i=0
    progress_bar = tqdm(total=len(imgPaths), desc=f"Processing images in {filepath}")
    for imgPath in imgPaths:
        # print(f"Processing image: {imgPath}")
        progress_bar.update(1)
        img = imutils.resize(cv2.imread(f"{filepath}/{imgPath}"))
        if img is None:
            continue

    #------------------------start grounding----------------------------------------------
        #image_path = args.image_path

        # load image
        image_pil, image = load_image(f"{filepath}/{imgPath}")

        # load model
        model = load_model(config_file, checkpoint_path, cpu_only=False)

        # set the text_threshold to None if token_spans is set.
        if token_spans is not None:
            text_threshold = None
            # print("Using token_spans. Set the text_threshold to None.")

        # run model
        boxes_filt, pred_phrases = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold, cpu_only=False, token_spans=eval(f"{token_spans}")
        )

        #found bb dimensions

        size = image_pil.size
        pred_dict = {
            "boxes": boxes_filt,
            "size": [size[1], size[0]],  # H,W
            "labels": pred_phrases,
        }

        H, W = pred_dict["size"]
        boxes = pred_dict["boxes"]
        labels = pred_dict["labels"]
        assert len(boxes) == len(labels), "boxes and labels must have same length"

        draw = ImageDraw.Draw(image_pil)
        mask = Image.new("L", image_pil.size, 0)
        mask_draw = ImageDraw.Draw(mask)

        #change pil image to cv2 image
        img = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
        img2 = img.copy()
        # draw boxes and masks
        x0_max = 0
        y0_max = 0
        x1_min = np.max(np.array(img))
        y1_min = np.max(np.array(img))
        for box, label in zip(boxes, labels):
            # from 0..1 to 0..W, 0..H
            box = box * torch.Tensor([W, H, W, H])
            # from xywh to xyxy
            box[:2] -= box[2:] / 2
            box[2:] += box[:2]
            # random color
            color = tuple(np.random.randint(0, 255, size=1).tolist())
            # draw
            padding = 10
            x0, y0, x1, y1 = box
            x0, y0, x1, y1 = int(x0)-padding, int(y0)-padding, int(x1)+padding, int(y1)+padding

            #validate if the bounding box is inside the image
            if x0 < 0:
                x0 = 0
            if y0 < 0:
                y0 = 0
            if x1 > W:
                x1 = W
            if y1 > H:
                y1 = H
                
            #draw rectangles
            cv2.rectangle(img2, (x0, y0), (x1, y1), color, 2)

            draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
            # draw.text((x0, y0), str(label), fill=color)

            font = ImageFont.load_default()
            if hasattr(font, "getbbox"):
                bbox = draw.textbbox((x0, y0), str(label), font)
            else:
                w, h = draw.textsize(str(label), font)
                bbox = (x0, y0, w + x0, y0 + h)
            # bbox = draw.textbbox((x0, y0), str(label))
            draw.rectangle(bbox, fill=color)
            draw.text((x0, y0), str(label), fill="white")

            mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
        
    # ----------------End grounding ---------------------------------------------------------   
        
    # ----------------Start SAM--------------------------------------------------------------  
            
            class_name = filepath.split("/")[-1]
            #print x0, y0, x1, y1
            # print(f"Bounding box: {x0}, {y0}, {x1}, {y1}")
            
            #obtener el mas pequeño de los bounding boxes
            
            if x0 > x0_max and y0 > y0_max and x1 < x1_min and y1 < y1_min:
                x0_max = x0
                y0_max = y0
                x1_min = x1
                y1_min = y1
                
            sam_bounding_box = np.array([x0, y0, x1, y1])
            ran_sam = False
            #run sam
            if ran_sam == False:
                predictor.set_image(img)
                ran_sam = True

            mask, _, _ = predictor.predict(
                point_coords=None,
                point_labels=None,
                box=sam_bounding_box,
                multimask_output=False,
            )

            mask, _, _ = predictor.predict(box=sam_bounding_box, multimask_output=False)

            #Make png mask
            contours, _ = cv2.findContours(mask[0].astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # Your call to find the contours

            # threshold input image using otsu thresholding as mask and refine with morphology
            ret, pngmask = cv2.threshold(mask[0].astype(np.uint8), 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) 
            kernel = np.ones((9,9), np.uint8)
            pngmask = cv2.morphologyEx(pngmask, cv2.MORPH_CLOSE, kernel)
            pngmask = cv2.morphologyEx(pngmask, cv2.MORPH_OPEN, kernel)
            result = img.copy()
            result = cv2.cvtColor(result, cv2.COLOR_BGR2BGRA)
            result[:, :, 3] = pngmask                           

    # ----------------End SAM-----------------------------------------------------------------  
            #cv2.imwrite(f"{resultspath}/groundingcv2_{imgPath}", img2)

            #image_pil.save(f"{resultspath}/grounding_{imgPath}")

            if not os.path.exists(f"{resultspath}/{class_name}"):
                os.mkdir(f"{resultspath}/{class_name}")

            file_path = f"{resultspath}/{class_name}/{imgPath[:-4]}.png"
            
            if os.path.exists(file_path):
                if x0_max == x0 and y0_max == y0 and x1_min == x1 and y1_min == y1:
                    cv2.imwrite(file_path, result)
            else:
                cv2.imwrite(file_path, result)
            i=i+1
            ran_sam = False

Found 4 directories in the path


Processing images in datasets/bags/yellow_bag:   0%|          | 0/109 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


final text_encoder_type: bert-base-uncased




final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_

Processing images in datasets/bags/red_bag:   0%|          | 0/125 [00:00<?, ?it/s]

final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_

Processing images in datasets/bags/red_bag_precut: 0it [00:00, ?it/s]

Processing images in datasets/bags/green_bag:   0%|          | 0/149 [00:00<?, ?it/s]

final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_encoder_type: bert-base-uncased
final text_

### PNG PREPROCESSING ###

In [14]:
from PIL import Image
import os

precut_dataset_path = "datasets/bags_precut/"
png_dataset_result = "datasets/bags_png_dataset/"

# Name of object MUST be the same as the folder name it is contained inside the png_datasetpath
object_names = ["red_bag",
                 "green_bag",
                 "yellow_bag"]

#create a result folder if it doesn't exist
if not os.path.exists(png_dataset_result):
    print("Creating result folder: ", png_dataset_result)
    os.makedirs(png_dataset_result)
    for object_name in object_names:
        os.makedirs(os.path.join(png_dataset_result, object_name))
else:
    for object_name in object_names:
        if not os.path.exists(os.path.join(png_dataset_result, object_name)):
            os.makedirs(os.path.join(png_dataset_result, object_name))

for object_name in object_names:
    progress_bar = tqdm(total=len(os.listdir(os.path.join(precut_dataset_path, object_name))), desc=f"Processing images from object: {object_name}")
    for filename in os.listdir(os.path.join(precut_dataset_path, object_name)):
        progress_bar.update(1)
        try:
            filepath = os.path.join(precut_dataset_path, object_name, filename)
            myImage = Image.open(filepath)
            black = Image.new('RGBA', myImage.size)
            myImage = Image.composite(myImage, black, myImage)
            myCroppedImage = myImage.crop(myImage.getbbox())
            save_filepath = os.path.join(png_dataset_result, object_name, filename)
            myCroppedImage.save(save_filepath)
            #print(f"{filename} done")
        except:
            #print(f"{filename} failed")
            continue
print("All done")

Creating result folder:  datasets/bags_png_dataset/


Processing images from object: red_bag:   0%|          | 0/125 [00:00<?, ?it/s]

Processing images from object: green_bag:   0%|          | 0/144 [00:00<?, ?it/s]

Processing images from object: yellow_bag:   0%|          | 0/103 [00:00<?, ?it/s]

All done


### DEFINING DATASET ###

In [19]:
# Define the paths to the three folders containing the images
png_dataset = "datasets/bags_png_dataset/"

# Define paths to each png folder, which will be used as foregrounds (fg)
# has to be a list of tuples, (folder_path, object_name)
fg_folders = [(os.path.join(png_dataset, object_name), object_name) for object_name in object_names]
print(fg_folders)

# Define the path to the backgrounds folder
bg_folder = "datasets/backgrounds"

# Define the path to the output folder, which will be a YOLO-formatted dataset
output_folder = "datasets/bags_yolo_dataset"

[('datasets/bags_png_dataset/red_bag', 'red_bag'), ('datasets/bags_png_dataset/green_bag', 'green_bag'), ('datasets/bags_png_dataset/yellow_bag', 'yellow_bag')]


In [20]:
annotations_ID = {}
categories = []
for i, object in enumerate(object_names):
    annotations_ID[object] = i
    categories.append({"id": i, "name": object})

print(annotations_ID)
print(categories)

{'red_bag': 0, 'green_bag': 1, 'yellow_bag': 2}
[{'id': 0, 'name': 'red_bag'}, {'id': 1, 'name': 'green_bag'}, {'id': 2, 'name': 'yellow_bag'}]


In [21]:
# Load the list of files in each of the three folders
fg_files = {}
for folder, category in fg_folders:
    fg_files[category] = os.listdir(folder)

In [None]:
# check files are loaded correctly
fg_files

In [32]:
if not os.path.exists(output_folder):
    os.mkdir(output_folder)

trainfolder = os.path.join(output_folder, "train")
testfolder = os.path.join(output_folder, "test")
validfolder = os.path.join(output_folder, "valid")

os.mkdir(trainfolder)
os.mkdir(testfolder)
os.mkdir(validfolder)
os.mkdir(os.path.join(trainfolder, "images"))
os.mkdir(os.path.join(trainfolder, "labels"))
os.mkdir(os.path.join(testfolder, "images"))
os.mkdir(os.path.join(testfolder, "labels"))
os.mkdir(os.path.join(validfolder, "images"))
os.mkdir(os.path.join(validfolder, "labels"))

### DATASET GENERATION ###
Define the dataset augmentations, transformations and size 

In [33]:
images=[]
annotations=[]
annotations2=[]
annot_csv=[]

img_id=int(0)
anno_id=int(0)

rescaling_min = 0.20
rescaling_max = 0.70

# Maximum ratio at which these values will be modified
brightness_ratio = 0.05 # e.g. Brightness will be increased or decreased by up to brightness_ratio * 100%
saturation_ratio = 0.05
hue_ratio = 0.02

TOTAL_IMAGES = 50

progress_bar = tqdm(total=TOTAL_IMAGES, desc="Generating images")
for j in range(TOTAL_IMAGES):
    #create empty label file
    label_file = os.path.join(trainfolder, "labels", f"{img_id}.txt")
    with open(label_file, 'w') as file:
        pass
    #select hramdomly how many objects will be in an image
    num_objects = random.randint(0, 5)
    #print("number of objects",num_objects)
    # Select random foreground images from the three folders, with replacement
    fg_categories = random.choices(object_names, k=num_objects)
    
    fg_files_selected = []
    for category in fg_categories:
        fg_files_selected.append([category,random.choice(fg_files[category])])
    #print("seleccion",fg_files_selected)
    # Load the selected foreground images using Pillow
    fg_imgs = []
    for img in fg_files_selected:
        folder = [f[0] for f in fg_folders if f[1] == img[0]][0]
        image_file = os.path.join(folder, img[1])
        fg_imgs.append([img[0],Image.open(image_file),image_file])

    # Randomly resize and rotate the foreground images using Pillow's transform module
    # img[0] is category, img[1] is image, img[2] is path
    for img in fg_imgs:
        fg_img=img[1]
        angle = random.randint(-5, 5)
        scale = random.uniform(rescaling_min, rescaling_max)
        fg_img = fg_img.rotate(angle, resample=Image.BICUBIC, expand=True)
        fg_img = fg_img.resize((int(fg_img.width * scale), int(fg_img.height * scale)))
        fg_img = ImageEnhance.Brightness(fg_img).enhance(random.uniform(0.7, 1.3))
        fg_img = ImageEnhance.Contrast(fg_img).enhance(random.uniform(0.9, 1.1))
        fg_img = ImageEnhance.Color(fg_img).enhance(random.uniform(0.7, 1.3))
        fg_img = fg_img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.0, 0.5)))


        img[1] = fg_img

    # Load the background image using Pillow
    bg_files = os.listdir(bg_folder)
    bg_file = random.choice(bg_files)
    bg_img = Image.open(os.path.join(bg_folder, bg_file))

    # Define the maximum overlap as a percentage
    max_overlap_pct = 10

    # Define an array to keep track of occupied areas
    occupied = np.zeros((bg_img.height, bg_img.width))

    for img in fg_imgs:
        fg_img=img[1]

        # Calculate the maximum overlap area
        max_overlap_area = (fg_img.width * fg_img.height)

        seg_img = img[1]


        # Convert the image to a NumPy array
        img_arr = np.array(seg_img)
        # Create a binary mask of the non-transparent pixels
        mask = img_arr[:, :, 3] != 0

        # Convert the mask to a COCO format segmentation
        segmentation = []
        for i in range(mask.shape[0]):
            for j in range(mask.shape[1]):
                if mask[i, j]:
                    segmentation.append(j)
                    segmentation.append(i)
        segmentation = [segmentation]

        # Calculate the area of the segmentation
        area = 0
        for i in range(len(segmentation[0]) // 2):
            x1 = segmentation[0][2 * i]
            y1 = segmentation[0][2 * i + 1]
            x2 = segmentation[0][(2 * i + 2) % len(segmentation[0])]
            y2 = segmentation[0][(2 * i + 3) % len(segmentation[0])]
            area += x1 * y2 - x2 * y1
        area = abs(area) / 2
        
        # Draw the segmentation onto a copy of the original image
        #image_copy = image.copy()
        #cv2.fillPoly(image_copy, aux_segmentation, color=(0, 255, 0))

        # Display the image with segmentation overlay
        #cv2.imshow('Image with Segmentation', image_copy)
        #cv2.waitKey(0)
        #cv2.destroyAllWindows()

        # Calculate the maximum allowed position for the top-left corner
        max_x = bg_img.width - fg_img.width
        max_y = bg_img.height - fg_img.height
        area = fg_img.width * fg_img.height

        # Generate a random location until an unoccupied area is found that meets the overlap limit
        total_area = bg_img.width * bg_img.height
        overlap_area = total_area
        
        while overlap_area / area > max_overlap_pct / 100:
            try:
                x = random.randint(0, max_x)
                y = random.randint(0, max_y)
            except:
                x = random.randint(0, abs(max_x))
                y = random.randint(0, abs(max_y))

            # Calculate the overlap area
            overlap_area = np.sum(occupied[y:y+fg_img.height, x:x+fg_img.width])

            # Check if the area is unoccupied and the overlap limit is not exceeded
            if (max_overlap_area) >= overlap_area/10:
                break
            if i==10:
                continue
        
        for i in range(0, len(segmentation[0])):
            if i % 2:
                segmentation[0][i]=int(segmentation[0][i]+y)
            else :
                segmentation[0][i]=int(segmentation[0][i]+x)
        # Update the occupied array
        occupied[y:y+fg_img.height, x:x+fg_img.width] = 1

        bg_img.paste(fg_img, (x, y), fg_img)
        x_center_ann = (x+fg_img.width/2) / bg_img.width
        y_center_ann = (y+fg_img.height/2) / bg_img.height
        width_ann = fg_img.width / bg_img.width
        height_ann = fg_img.height / bg_img.height
        with open(label_file, 'a') as f:
            f.write(f"{annotations_ID[img[0]]} {x_center_ann} {y_center_ann} {width_ann} {height_ann}\n")
        annotations2.append({"id": anno_id,"image_id": img_id,"category_id": annotations_ID[img[0]],"bbox": [x, y, fg_img.width, fg_img.height],"segmentation": segmentation,"area": area,"iscrowd": 0})
        annotations.append({"id": anno_id,"image_id": img_id,"category_id": annotations_ID[img[0]],"bbox": [x, y, fg_img.width, fg_img.height],"segmentation": [],"area": fg_img.height*fg_img.width,"iscrowd": 0})
        annot_csv.append(["TRAIN", output_folder + str(img_id)+".jpg", img[0], x/bg_img.width, y/bg_img.height,"","",(x+fg_img.width)/bg_img.width, (y+fg_img.height)/bg_img.height])
        anno_id=anno_id+1
        #draw = ImageDraw.Draw(bg_img)
        #fdraw.rectangle((x, y, x+fg_img.width, y+fg_img.height), outline='red', width=3)
    final_image_path = os.path.join(trainfolder, "images", f"{img_id}.jpg")
    bg_img.save(final_image_path, quality=100)
    images.append({"id": img_id, "file_name": str(img_id)+".jpg","height": bg_img.height,"width": bg_img.width})
    img_id=img_id+1
    progress_bar.update(1)
    #print(img_id)

#making data.yaml
data = dict(
    train = f"{trainfolder}images",
    val = f"{validfolder}images",
    test = f"{validfolder}images",
    nc = len(annotations_ID),
    names = list(annotations_ID.keys())
    )
#storing
with open(f'{output_folder}data.yaml', 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

Generating images:   0%|          | 0/50 [00:00<?, ?it/s]

### SPLIT TRAIN AND VALIDATION ###

In [34]:
import os
import shutil
import random

validation = 0.1
test = 0.1

# Assumes test has 100% of data
output_folder = "datasets/bags_yolo_dataset/"
trainfolder = os.path.join(output_folder, "train")
testfolder = os.path.join(output_folder, "test")
validfolder = os.path.join(output_folder, "valid")

trainfolderimgs = os.path.join(trainfolder, "images")
trainfolderlabels = os.path.join(trainfolder, "labels")
testfolderimgs = os.path.join(testfolder, "images")
testfolderlabels = os.path.join(testfolder, "labels")
validfolderimgs = os.path.join(validfolder, "images")
validfolderlabels = os.path.join(validfolder, "labels")

fullSize = len(os.listdir(trainfolderimgs))
validSize = int(fullSize * validation)
testSize = int(fullSize * test)

for i in range(validSize):
    filelist = os.listdir(trainfolderimgs)
    #randomize file list, to not pick files in order
    random.shuffle(filelist)
    filetomove = filelist[i]
    #take out .jpg, .png, etc
    filetomovename = filetomove[:-4]
    #move images
    shutil.move(os.path.join(trainfolderimgs, filetomove), os.path.join(validfolderimgs, filetomove))
    #move labels
    shutil.move(os.path.join(trainfolderlabels, f"{filetomovename}.txt"), os.path.join(validfolderlabels, f"{filetomovename}.txt"))
for i in range(testSize):
    filetomove = os.listdir(trainfolderimgs)[i]
    #take out .jpg, .png, etc
    filetomovename = filetomove[:-4]
    #move images
    shutil.move(os.path.join(trainfolderimgs, filetomove), os.path.join(testfolderimgs, filetomove))
    #move labels
    shutil.move(os.path.join(trainfolderlabels, f"{filetomovename}.txt"), os.path.join(testfolderlabels, f"{filetomovename}.txt"))

#Validation
print(f"Train size is now: {len(os.listdir(trainfolderimgs))}")
print(f"Validation size is now: {len(os.listdir(validfolderimgs))}")
print(f"Test size is now: {len(os.listdir(testfolderimgs))}")

Train size is now: 40
Validation size is now: 5
Test size is now: 5
