# Dataset Creation

This short script sets up the dataset for the binary classifier.
The dataset needs to be in the following format:
- root:
    - images:
        - image_1.jpg
        - ...
    - labels:
        - labels.csv

Each row of the labels.csv file must contain the name of an image and its corresponding label:
- 0: not approved image
- 1: approved image


In [None]:
import shutil, os
from tqdm import tqdm

# For every dataset split, rebuild Cleaned/<split>/labels so that it contains
# exactly one label file (copied from Original/<split>/labels) for each image
# that is present in Cleaned/<split>/images.
for dataset_type in ["train", "valid", "test"]:
    print(f"Working on {dataset_type}")

    # os.path.join keeps the cell usable on any OS, not only Windows.
    original_labels_dir = os.path.join("Original", dataset_type, "labels")
    cleaned_labels_dir = os.path.join("Cleaned", dataset_type, "labels")

    images_dir = os.path.join("Cleaned", dataset_type, "images")

    # A set gives O(1) membership tests in the copy loop below.
    labels_files = {
        f
        for f in os.listdir(original_labels_dir)
        if os.path.isfile(os.path.join(original_labels_dir, f))
    }
    images_files = [
        im
        for im in os.listdir(images_dir)
        if os.path.isfile(os.path.join(images_dir, im))
    ]
    print(len(labels_files), len(images_files))

    # Create the destination directory if it does not exist yet ...
    if not os.path.exists(cleaned_labels_dir):
        os.makedirs(cleaned_labels_dir)

    # ... otherwise remove the label files left over from a previous run, so
    # that the final count check only sees files copied by this run.
    else:
        old_labels_files = [
            f
            for f in os.listdir(cleaned_labels_dir)
            if os.path.isfile(os.path.join(cleaned_labels_dir, f))
        ]
        remove_bar = tqdm(old_labels_files)
        remove_bar.set_description("Removing old labels files")
        for old_file in remove_bar:
            os.remove(os.path.join(cleaned_labels_dir, old_file))

    # Iterate the bar that carries the description (the original created this
    # bar and then looped over a second, unlabeled one).
    images_without_label = []
    copy_bar = tqdm(images_files)
    copy_bar.set_description("Copy new labels files")
    for im in copy_bar:
        # Swap only the real extension; str.replace would also rewrite a
        # ".jpg" occurring in the middle of the name and would ignore other
        # image extensions.
        label_name = os.path.splitext(im)[0] + ".txt"
        if label_name in labels_files:
            shutil.copy(
                os.path.join(original_labels_dir, label_name),
                os.path.join(cleaned_labels_dir, label_name),
            )
        else:
            images_without_label.append(im)

    # Sanity check: every image must have ended up with exactly one label file.
    # An explicit raise (instead of assert) still runs under "python -O".
    new_labels_files = {
        f
        for f in os.listdir(cleaned_labels_dir)
        if os.path.isfile(os.path.join(cleaned_labels_dir, f))
    }
    if len(new_labels_files) != len(images_files):
        raise RuntimeError(
            f"{dataset_type}: {len(images_files)} images but "
            f"{len(new_labels_files)} label files were copied; "
            f"images without a label: {images_without_label[:10]}"
        )

In [None]:
import random, os, tqdm, shutil
# Build a class-balanced dataset for the binary classifier: copy the same
# number of discarded (label 0) and approved (label 1) images into
# <dataset_dir>/images and write one "<image name>,<label>" row per image to
# <dataset_dir>/labels/labels.csv.
random.seed(42)

bad_images_dir = "D:\\Projects\\RoboTO\\Cleaned\\discarded"
good_images_dir = "D:\\Projects\\RoboTO\\Cleaned\\train\\images"

dataset_dir = "D:\\Projects\\RoboTO\\BinaryClassifierDataset"
dataset_images_dir = os.path.join(dataset_dir, "images")
dataset_labels_dir = os.path.join(dataset_dir, "labels")

# Make sure the output layout exists; otherwise open() and shutil.copy()
# below fail on a fresh dataset directory.
os.makedirs(dataset_images_dir, exist_ok=True)
os.makedirs(dataset_labels_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort before shuffling:
# only then does the fixed seed give the same selection on every machine.
bad_images_files = sorted(
    im for im in os.listdir(bad_images_dir)
    if os.path.isfile(os.path.join(bad_images_dir, im))
)
good_images_files = sorted(
    im for im in os.listdir(good_images_dir)
    if os.path.isfile(os.path.join(good_images_dir, im))
)

random.shuffle(bad_images_files)
random.shuffle(good_images_files)

# Balance the classes by truncating the larger one to the smaller one's size.
length = min(len(bad_images_files), len(good_images_files))

bad_images_files = bad_images_files[:length]
good_images_files = good_images_files[:length]

# NOTE(review): both classes are copied into the same folder, so an image name
# present in both source directories would be overwritten and labelled twice.
# Confirm the two directories never share file names.
with open(os.path.join(dataset_labels_dir, "labels.csv"), "w") as fout:
    labelled_sources = (
        (0, bad_images_dir, bad_images_files),    # 0 = not approved
        (1, good_images_dir, good_images_files),  # 1 = approved
    )
    for label, source_dir, image_files in labelled_sources:
        for image_file in tqdm.tqdm(image_files):
            src_image_path = os.path.join(source_dir, image_file)
            dst_image_path = os.path.join(dataset_images_dir, image_file)

            shutil.copy(src_image_path, dst_image_path)
            print(f"{image_file},{label}", file=fout)