In [1]:
import os
import time
import json
import struct
import shutil

from numpy.random import default_rng

In [2]:
data_folder = "data"
destination_folder = "yolov5_data"

subfolders = ["Regensburg_Plant"]

proportions = {
    "train": 80,
    "val": 10,
    "test": 10
}

In [3]:
print("Creating dir: ", destination_folder)

dataset_imgs_train_p = os.path.join(destination_folder, 'images', 'train')
dataset_imgs_val_p = os.path.join(destination_folder, 'images', 'val')
dataset_imgs_test_p = os.path.join(destination_folder, 'images', 'test')
dataset_labels_train_p = os.path.join(destination_folder, 'labels', 'train')
dataset_labels_val_p = os.path.join(destination_folder, 'labels', 'val')
dataset_labels_test_p = os.path.join(destination_folder, 'labels', 'test')
os.makedirs(destination_folder, exist_ok = True)
os.makedirs(dataset_imgs_train_p, exist_ok = True)
os.makedirs(dataset_imgs_val_p, exist_ok = True)
os.makedirs(dataset_imgs_test_p, exist_ok = True)
os.makedirs(dataset_labels_train_p, exist_ok = True)
os.makedirs(dataset_labels_val_p, exist_ok = True)
os.makedirs(dataset_labels_test_p, exist_ok = True)

Creating dir:  yolov5_data


In [4]:
def get_image_size(file_path):
    """
    Return (width, height) for a given img file content - no external
    dependencies except the os and struct modules from core
    """
    size = os.path.getsize(file_path)

    with open(file_path, 'rb') as input:
        height = -1
        width = -1
        data = input.read(25)
        #data = str(data
        #data = data.decode('utf-8')
        #data = bytes(data)
        #print(data)
        if True:
            input.seek(0)
            input.read(2)
            b = input.read(1)
            while (b and ord(b) != 0xDA):
                while (ord(b) != 0xFF): b = input.read(1)
                while (ord(b) == 0xFF): b = input.read(1)
                if (ord(b) >= 0xC0 and ord(b) <= 0xC3):
                    input.read(3)
                    h, w = struct.unpack(">HH", input.read(4))
                    break
                else:
                    input.read(int(struct.unpack(">H", input.read(2))[0])-2)
                b = input.read(1)
            width = int(w)
            height = int(h)
            return width, height

In [7]:
id2path = {}

all_classes = []
all_classes_set = []
fileindex = 0

if isinstance(subfolders, list):
    folders = subfolders 
else:
    folders = os.listdir(data_folder)

for folder in folders:
    # print(folder)
    folder_path = os.path.join(data_folder, folder)
    if not os.path.isdir(folder_path):        
        continue
    print(os.path.join(folder_path, 'objectclasses.json'))
    a = time.time()
    with open(os.path.join(folder_path, 'objectclasses.json'), 'r') as clsf:
        classes = json.load(clsf)
    for objclass in classes:
        if objclass not in all_classes:
            all_classes.append(objclass)
        objname = objclass['Name']
        if objname not in all_classes_set:
            all_classes_set.append(objname)
    # print(all_classes)
    # print(all_classes_set)
    files = os.listdir(os.path.join(folder_path, 'images'))
    # print(files)
    for img_name in files:
        import random
        assigned = random.choices(["train", "val", "test"], weights=(proportions["train"], proportions["val"], proportions["test"]), k=1)
        b = time.time()
        img_path = os.path.join(folder_path, 'images', img_name)
        # print("Image Path: ", img_path)
        label_path = os.path.join(folder_path, 'labels', 'json', img_name.replace('jpg', 'json'))
        # print("Label path: ", label_path)
        if os.path.exists(img_path):
            print(img_path)
            source_image = img_path
            new_image_name = f"{folder}_{img_name}"
            print(new_image_name)
            destination_image = os.path.join(destination_folder, 'images', assigned[0], new_image_name)
            shutil.copyfile(source_image, destination_image)
            # shutil.copyfile(src_l, dst_l)
    #     #if not os.path.exists(os.path.join(dataset_imgs_train_p, str(fileindex) + '.jpg')):
    #     #    os.symlink(os.path.abspath(p_img), os.path.join(dataset_imgs_train_p, str(fileindex) + '.jpg'))
    #     #print(p_img)
    #     split = p_img.split('/')
    #     id2path[fileindex] = split[-3] + ',' + split[-1]
        w, h = get_image_size(img_path)
        with open(label_path, 'r') as annotation_file:
            annotations = json.load(annotation_file)
        dupcheck = []
        with open(os.path.join(destination_folder, 'labels', assigned, new_image_name.replace('jpg', '') + '.txt'), 'w') as txtf:
            for anno in annotations:
                yolo_class_index = all_classes_set.index(anno['ObjectClassName'])
                right, left, top, bottom = anno['Right'], anno['Left'], anno['Top'], anno['Bottom']
                x_center = ((left + right) / 2) / w
                y_center = ((top + bottom) / 2) / h
                w_normalized = (right - left) / w
                h_normalized = (bottom - top) / h
                area = (right - left) * (bottom - top)
                if area < 400:
                    continue

                yololine = str(yolo_class_index) + ' ' + str(x_center) + ' ' + str(y_center) + ' ' + str(w_normalized) + ' ' + str(h_normalized)
                if yololine in dupcheck:
                    continue
                dupcheck.append(yololine)
                txtf.write(yololine)
                txtf.write('\n')
        c = time.time()
    d = time.time()

with open(os.path.join(destination_folder, 'sordi.json'), 'w', encoding='utf-8') as f:
    json.dump(all_classes, f, ensure_ascii=False, indent=4)
    # print('folder:', (d-a))

with open(os.path.join(destination_folder, 'sordi.yaml'), 'w', encoding='utf-8') as f:
    f.write('path: ./sordi\n')
    f.write('train: images/train\n')
    f.write('val: images/val\n')
    f.write('test: images/test\n')
    f.write('names:\n')
    index = 0
    for anno_class in all_classes_set:
        f.write('  ' + str(index) + ': ' + anno_class + '\n')
        index += 1

data/Regensburg_Plant/objectclasses.json
data/Regensburg_Plant/images/952.jpg
Regensburg_Plant_952.jpg


FileNotFoundError: [Errno 2] No such file or directory: 'yolov5_data/images/train/Regensburg_Plant_952.jpg'