# Make one big unified dataset from a group of already cleaned datasets

In [None]:
import tqdm.notebook as tqdm
from PIL import Image
import os, shutil
import yaml

def RecreateDatasetFolder(RootPaths, *args):
    """Wipe the dataset root and (re)create the requested folder structure.

    Args:
        RootPaths: Root directory of the dataset. If it exists it is removed
            together with everything it contains.
        *args: Directories to create afterwards; missing parent directories
            are created as well.

    Note:
        Only ``RootPaths`` is wiped. A directory listed in ``args`` that lives
        outside the root and already exists is left untouched, including
        whatever files it currently holds.
    """
    # If it already exists, remove the directory and all its files to create a fresh new dataset from scratch
    if os.path.exists(RootPaths):
        print("Removing Old Images Files and folders")
        shutil.rmtree(RootPaths)
        print("Finished")

    # Create new images and labels folders structure
    print("Creating New Images Files and folders")
    for path in args:
        # exist_ok=True: a folder that survived the wipe (because it is outside
        # the root, or was passed twice) must not abort the run with FileExistsError
        os.makedirs(path, exist_ok=True)
    print("Creation Done")

### Directory Path declaration

In [None]:
# Set a Dataset name for the new dataset we want to create
NEW_DATASET_NAME = "GigaDataset"

# Source directory that contains all the datasets we want to merge in the YOLOv8 format
SRC_DATASETS_PATH = "./CleanedDatasets"
# Destination Directory for the new created dataset
DST_DATASET_PATH = f"./{NEW_DATASET_NAME}"

DST_DATASET_IMAGES_PATH = os.path.join(DST_DATASET_PATH, "images")
DST_DATASET_LABELS_PATH = os.path.join(DST_DATASET_PATH, "labels")

# Root for everything that gets discarded. It is kept inside the dataset folder
# so that it is wiped together with the dataset on every fresh run, instead of
# accumulating stale files in the current working directory.
DST_DATASET_DISCARDED_PATH = os.path.join(DST_DATASET_PATH, "discarded")

# Paths for the images (and their label files) discarded because they have zero labels
DST_DATASET_DISCARDED_IMAGES_NO_LABELS_PATH = os.path.join(DST_DATASET_DISCARDED_PATH, "noLabels", "images")
DST_DATASET_DISCARDED_LABELS_NO_LABELS_PATH = os.path.join(DST_DATASET_DISCARDED_PATH, "noLabels", "labels")

# Paths for the images (and their label files) discarded because their bounding boxes are very small
DST_DATASET_DISCARDED_IMAGES_TOO_SMALL_PATH = os.path.join(DST_DATASET_DISCARDED_PATH, "tooSmall", "images")
DST_DATASET_DISCARDED_LABELS_TOO_SMALL_PATH = os.path.join(DST_DATASET_DISCARDED_PATH, "tooSmall", "labels")

# Minimum share of the image area a bounding box must cover, expressed in
# percent (0.05 means 0.05 %, not 5 %)
BB_COVERAGE_THRESHOLD = 0.05

Utility Functions

In [None]:
def discardBBdimension(imagePath, labelPath) -> bool:
    """Tell whether an image must be discarded because its boxes are too small.

    Every non-blank line of the label file is treated as one bounding box whose
    4th and 5th fields are the normalized box width and height (YOLO detection
    layout — TODO confirm before using it on segmentation/polygon labels).
    A box is "too small" when it covers less than ``BB_COVERAGE_THRESHOLD``
    percent of the image area.

    Args:
        imagePath: Path of the image; it is opened to read its resolution.
        labelPath: Path of the label file associated with the image.

    Returns:
        True when at least 50% of the boxes are too small, False otherwise.
        False is also returned when the file holds no box at all, since there
        is nothing to judge by size.
    """
    # Get the image resolution
    with Image.open(imagePath) as image:
        width, height = image.size

    # One flag per bounding box: True when the box is under the threshold
    bb_eval = []

    with open(labelPath, "r") as label_fin:
        for line in label_fin:
            # split() with no argument tolerates repeated spaces and the trailing newline
            data = line.split()
            if not data:
                # Blank line (e.g. trailing newline at the end of the file): not a box
                continue

            # Rescale the normalized BB values to pixels
            dim_x, dim_y = width * float(data[3]), height * float(data[4])

            # Box area over image area, in percent
            bb_eval.append((dim_x * dim_y) / (width * height) * 100 < BB_COVERAGE_THRESHOLD)

    # No box found: avoid dividing by zero
    if not bb_eval:
        return False

    # If the BBs under the threshold are at least the 50% of the total BB the image is discarded
    return sum(bb_eval) / len(bb_eval) * 100 >= 50


def isLabelFileEmpty(filePath) -> bool:
    """Return True when the file at ``filePath`` is exactly zero bytes long."""
    size_in_bytes = os.stat(filePath).st_size
    return size_in_bytes == 0

## Merge all datasets, discarding images whose label file is empty or whose bounding boxes are too small

In [None]:
# Start every run from a clean slate: the previous output is erased and the
# folder tree is rebuilt. Comment the call out to keep the existing output.
RecreateDatasetFolder(
    DST_DATASET_PATH,
    *(
        DST_DATASET_IMAGES_PATH,
        DST_DATASET_LABELS_PATH,
        DST_DATASET_DISCARDED_IMAGES_NO_LABELS_PATH,
        DST_DATASET_DISCARDED_LABELS_NO_LABELS_PATH,
        DST_DATASET_DISCARDED_IMAGES_TOO_SMALL_PATH,
        DST_DATASET_DISCARDED_LABELS_TOO_SMALL_PATH,
    ),
)

# Every sub-directory of the source folder is one dataset to merge
datasets_dir = list(
    filter(
        os.path.isdir,
        (os.path.join(SRC_DATASETS_PATH, entry) for entry in os.listdir(SRC_DATASETS_PATH)),
    )
)

# Class-index -> class-name mapping of the merged dataset, filled in while the
# datasets are processed and later written in the YOLOv8 configuration file
new_config_file = dict()

# Merge every source dataset into the flat destination folders, shifting the
# class ids of each dataset so that they do not clash in the merged mapping.
# NOTE(review): all the files end up in the same flat folders, so two source
# files with the same name (different dataset or split) overwrite each other.
for dataset_dir in datasets_dir:

    # TODO:: Remove this if statement
    if "Discarded" in dataset_dir:
        continue

    # Get the configuration file of the current analyzed dataset
    dataset_config_path = os.path.join(dataset_dir, "data.yaml")
    if not os.path.exists(dataset_config_path):
        # Without the class names the labels cannot be remapped: skip the whole
        # dataset instead of reusing the mapping of the previous one
        print(f"Configuration file Not found for dataset: {dataset_dir}")
        continue

    with open(dataset_config_path, "r") as config_stream:
        config_label_map = yaml.safe_load(config_stream)["names"]

    # "names" may be a plain list or an {id: name} mapping; normalize it to a
    # list ordered by class id (assumes ids are contiguous and start at 0)
    if isinstance(config_label_map, dict):
        config_label_map = [config_label_map[class_id] for class_id in sorted(config_label_map)]

    # Highest class id already used in the merged mapping (-1 when still empty)
    idx_offset = max(new_config_file.keys(), default=-1)

    for dataset_split in ["train", "valid", "test"]:

        # Defining the considered dataset directory path
        src_dataset_images_path = os.path.join(dataset_dir, dataset_split, "images")
        src_dataset_labels_path = os.path.join(dataset_dir, dataset_split, "labels")

        # Check if the dataset split path exist
        if not os.path.exists(src_dataset_images_path):
            continue

        print(f"Serving {dataset_dir}: {dataset_split}")

        #===========================================================================
        all_images_files = [im for im in os.listdir(src_dataset_images_path) if os.path.isfile(os.path.join(src_dataset_images_path, im))]
        all_labels_files_list = [lb for lb in os.listdir(src_dataset_labels_path) if os.path.isfile(os.path.join(src_dataset_labels_path, lb))]
        # The set gives O(1) membership tests in the loop below
        all_labels_files_set = set(all_labels_files_list)

        # Check that there are no duplicated labels file
        assert len(all_labels_files_set) == len(all_labels_files_list), \
            f"Duplicated label files in {src_dataset_labels_path}"
        #===========================================================================

        for image_name in tqdm.tqdm(all_images_files):

            # Check if the image has its correspondent match in the label files
            label_name = os.path.splitext(image_name)[0] + ".txt"
            assert label_name in all_labels_files_set, \
                f"Missing label file {label_name} for image {image_name}"

            src_image_path = os.path.join(src_dataset_images_path, image_name)
            src_label_path = os.path.join(src_dataset_labels_path, label_name)

            # Images with an empty label file are set aside
            if isLabelFileEmpty(src_label_path):
                shutil.copy(src_image_path, DST_DATASET_DISCARDED_IMAGES_NO_LABELS_PATH)
                shutil.copy(src_label_path, DST_DATASET_DISCARDED_LABELS_NO_LABELS_PATH)
                continue

            # Images whose bounding boxes are too small are set aside
            if discardBBdimension(imagePath=src_image_path, labelPath=src_label_path):
                shutil.copy(src_image_path, DST_DATASET_DISCARDED_IMAGES_TOO_SMALL_PATH)
                shutil.copy(src_label_path, DST_DATASET_DISCARDED_LABELS_TOO_SMALL_PATH)
                continue

            # Copy the image and label in the new folder
            shutil.copy(src_image_path, DST_DATASET_IMAGES_PATH)
            shutil.copy(src_label_path, DST_DATASET_LABELS_PATH)

            # Change the label file to be consistent with the new label mapping
            dst_label_path = os.path.join(DST_DATASET_LABELS_PATH, label_name)
            remapped_lines = []
            with open(dst_label_path, "r") as label_fin:
                for line in label_fin:
                    fields = line.split()
                    if not fields:
                        # Blank line: nothing to remap
                        continue
                    # The first value is the class id: shift it into the
                    # id range reserved to this dataset in the new mapping
                    fields[0] = str(int(fields[0]) + idx_offset + 1)
                    remapped_lines.append(" ".join(fields))

            # Rewrite the file with the updated content: same line order as the
            # source, no trailing newline after the last line
            with open(dst_label_path, "w") as label_fout:
                label_fout.write("\n".join(remapped_lines))
            #===========================================================================
        print(f"Done with {dataset_dir}: {dataset_split}")

    # Update the new config file for the merged dataset
    for i, name in enumerate(config_label_map, start=idx_offset + 1):
        new_config_file[i] = name
    print(new_config_file)


#===========================================================================
# Dump the configuration file of the merged dataset next to its folders.
# NOTE(review): the split paths below point to train/valid/test folders, while
# the merge step writes flat "images" and "labels" folders — confirm that the
# dataset is split afterwards, otherwise these paths do not exist.
merged_config_path = os.path.join(DST_DATASET_PATH, "data.yaml")

config_file = dict(
    train="../train/images",
    val="../valid/images",
    test="../test/images",
    # Number of classes and the class-index -> class-name mapping
    nc=len(new_config_file),
    names=new_config_file,
)

with open(merged_config_path, "w") as file:
    yaml.dump(config_file, stream=file)
    



# Grounding DINO: automatically label the unlabeled images

In [None]:
from autodistill_grounded_sam import GroundedSAM
from autodistill_grounding_dino import GroundingDINO

from autodistill.detection import CaptionOntology
from autodistill.helpers import sync_with_roboflow
import roboflow


# Folder the images to label are read from, and folder the labeled output is
# written to. Built with os.path.join so the paths also work outside Windows
# (on Windows the resulting strings are the same as the previous hard-coded ones).
UNLABELED_IMAGES_PATH = os.path.join("GigaDataset", "discarded", "noLabels", "images")
LABELED_IMAGES_PATH = os.path.join("GigaDataset", "re-labeled", "images_labeled")

# Kind of annotation to produce: "segmentation" selects GroundedSAM, anything
# else selects GroundingDINO
# TAKS = "detection"
TAKS = "segmentation"

# Maps the text prompt given to the model to the class name stored in the labels
CAPTION_ONTOLOGY = {
    "black robot with colored light": "robot"
}

# Thresholds forwarded as they are to the model constructor
BOX_THRESHOLD = 0.5
TEXT_THRESHOLD = 0.70

# Class id -> class name, following the order of the ontology values
classes = dict(enumerate(CAPTION_ONTOLOGY.values()))

mode_f = GroundedSAM if TAKS == "segmentation" else GroundingDINO
model = mode_f(
    ontology=CaptionOntology(CAPTION_ONTOLOGY),
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD,
)

To autolabel directly on Roboflow (VERY SLOW !!!!!!!!!!)

In [None]:
# roboflow.login(force=True)

# sync_with_roboflow(
#     workspace_id="yBkLwcSpuygMbFFpUWPp6nvZwbo1",
#     workspace_url="ilchrees",
#     project_id = "robotsegment",
#     batch_id = "R1kRVKKcVSdMDRYDtNG9",
#     model = model
# )

### Test Grounding Dino on random samples

Run this code before labeling all the images, to check that the process works without errors.

In [None]:
import cv2, os
import supervision as sv
import random

# From all the images choose one randomly
candidate_images = [im_name for im_name in os.listdir(UNLABELED_IMAGES_PATH) if os.path.isfile(os.path.join(UNLABELED_IMAGES_PATH, im_name))]
if not candidate_images:
    # random.choice would fail with an unhelpful IndexError on an empty list
    raise FileNotFoundError(f"No image found in {UNLABELED_IMAGES_PATH}")
IMAGE_NAME = random.choice(candidate_images)

image_path = os.path.join(UNLABELED_IMAGES_PATH, IMAGE_NAME)

predictions = model.predict(image_path)

print(f"Prediction struct length: {len(predictions)}")
# Read class ids and confidences from the detections attributes: unpacking each
# detection as a fixed-size tuple only works on the library releases whose
# tuple has exactly that many fields
labels = [
    f"{classes[class_id]} {confidence:0.2f}"
    for class_id, confidence in zip(predictions.class_id, predictions.confidence)
]

image = cv2.imread(image_path)

# Draw the predicted boxes, with their "class confidence" captions, on the image
annotator = sv.BoxAnnotator()
annotated_image = annotator.annotate(scene=image, detections=predictions, labels=labels)

sv.plot_image(annotated_image)

## Label The Unlabeled images using Grounding DINO

In [None]:
# Label the whole folder in one go: input is UNLABELED_IMAGES_PATH, output goes
# to LABELED_IMAGES_PATH (presumably created by the library if missing — TODO confirm)
model.label(input_folder=UNLABELED_IMAGES_PATH, output_folder=LABELED_IMAGES_PATH)