# Large Dataset Split
This file creates a smaller dataset from a large dataset. This is useful for manually labeling the smaller dataset and then training a model to create masks for the larger dataset. This file creates a new dataset containing a `raw` directory. The `raw` directory is populated with randomly selected files for each class. These files are then uploaded to LabelBox for manual annotation.

##### !!!!IF YOU DON'T WANT TO SPLIT THE DATASET
Use the `im_to_labelbox.ipynb` or `im_to_labelbox.py` files to upload your entire dataset to LabelBox.

In [None]:
import os, shutil, random
from labelbox_class import LabelBox, remove_all
from tqdm import tqdm

In [None]:
# --- LabelBox credentials and source data -----------------------------------
# API key used when constructing the LabelBox helper (empty placeholder here).
api_key = ''
# Root folder of the full dataset; its `raw` sub-folder is sampled from.
large_dataset = '.\\..\\data\\datasets\\birds_dataset'

# NOTE(review): dataset_size is not referenced anywhere in the visible cells;
# the per-class sample count actually passed to make_small_dataset is small_size.
dataset_size = 75    # Number of ims per class


# --- LabelBox project / ontology names ---------------------------------------
project_name = "Small_Birds"
ontology_name = "Small_Birds"

# --- Output locations --------------------------------------------------------
# TXT file that will record the project and ontology IDs.
proj_id_txt = f".\\..\\data\\projects\\{project_name}.txt"
# The small dataset sits next to the large one, named "small_<large name>".
small_dataset = '.\\..\\data\\datasets\\small_' + os.path.basename(large_dataset)
# Number of images sampled per class folder.
small_size = 8

# Raw-data directory of the small dataset (the folder that gets uploaded).
raw_dir = os.path.join(small_dataset, 'raw')
print(raw_dir)

In [None]:
def make_small_dataset(old_dir, new_dir, num_images):
    """Build a smaller copy of a dataset by sampling files from each class folder.

    Every sub-directory found under ``<old_dir>/raw`` is mirrored under
    ``<new_dir>/raw`` and filled with up to ``num_images`` randomly chosen
    files from the corresponding source folder. Afterwards every ``.txt``
    file found anywhere under ``old_dir`` is copied flat into ``new_dir``.

    Args:
        old_dir: Root of the large dataset (must contain a ``raw`` folder
            for any images to be sampled).
        new_dir: Root of the small dataset to create.
        num_images: Maximum number of files to copy per sub-directory. A
            folder holding fewer files is copied in full instead of raising.
    """
    old_raw = os.path.join(old_dir, 'raw')
    new_raw = os.path.join(new_dir, 'raw')

    for root, dirs, _ in os.walk(old_raw):
        for sub_dir in dirs:
            old_sub_dir = os.path.join(root, sub_dir)
            # Re-root the path via relpath rather than str.replace, which would
            # also rewrite any later repetition of the prefix inside the path.
            new_sub_dir = os.path.join(new_raw, os.path.relpath(old_sub_dir, old_raw))

            os.makedirs(new_sub_dir, exist_ok=True)
            images = [f for f in os.listdir(old_sub_dir)
                      if os.path.isfile(os.path.join(old_sub_dir, f))]

            # random.sample raises ValueError when asked for more items than
            # exist, so cap the request at the number of available files.
            selected_images = random.sample(images, min(num_images, len(images)))
            for image in selected_images:
                shutil.copy(os.path.join(old_sub_dir, image), new_sub_dir)

    # new_dir may not exist yet when raw has no sub-directories.
    os.makedirs(new_dir, exist_ok=True)
    for root, _, files in os.walk(old_dir):
        for file in files:
            if file.endswith(".txt"):
                shutil.copy(os.path.join(root, file), os.path.join(new_dir, file))


In [None]:
# Clear out any previous small-dataset raw folder so the new sample starts clean.
# raw_dir is a directory, and os.remove() only deletes files (it raises on a
# directory), so the whole tree is removed with shutil.rmtree instead.
if os.path.isdir(raw_dir):
    shutil.rmtree(raw_dir)
elif os.path.exists(raw_dir):
    # A stray plain file at that path is still removed, as before.
    os.remove(raw_dir)

In [None]:
# Sample small_size files per class folder from the large dataset into the small one.
make_small_dataset(
    old_dir=large_dataset,
    new_dir=small_dataset,
    num_images=small_size,
)

In [None]:
# Creates LabelBox Project, Datasets, and Ontology
# NOTE(review): LabelBox comes from the project-local labelbox_class module, which
# is not shown here; presumably the constructor uploads the contents of raw_dir --
# confirm against that module. Later cells read labels.project.uid and
# labels.ontology and call labels.find_error_sets() / labels.labelbox_dataset_lst().
labels = LabelBox(api_key, raw_dir, project_name,ontology_name)

In [None]:
# Finds all images that could not be uploaded to LabelBox
# For every entry reported by labels.find_error_sets(), the "-s " sequence is
# mapped back to "'s " to recover the folder name under raw_dir, and the full
# paths of that folder's entries are handed to labels.labelbox_dataset_lst().
rm_lst = []
for error in tqdm(labels.find_error_sets()):
    class_name = error.replace("-s ", "'s ")
    class_dir = os.path.join(raw_dir, class_name)
    data_lst = [os.path.join(class_dir, entry) for entry in os.listdir(class_dir)]
    rm_lst.append(labels.labelbox_dataset_lst(class_name, data_lst))

In [None]:
# Removes images that could not be uploaded from the dataset
# rm_lst holds one list of paths per failed set; walk them as one flat stream,
# printing each path before deleting it.
for im in (path for failed_set in rm_lst for path in failed_set):
    print(im)
    os.remove(im)

In [None]:
# Creates TXT file with project and ontology IDs
# If the target file already exists, pick the first free "<name><N>.txt" so the
# earlier file is not overwritten. The counter must advance on every attempt;
# otherwise the search never ends once "<name>0.txt" is also taken.
if os.path.exists(proj_id_txt):
    base, ext = os.path.splitext(proj_id_txt)
    count = 0
    new_name = f"{base}{count}{ext}"
    while os.path.exists(new_name):
        count += 1
        new_name = f"{base}{count}{ext}"
    proj_id_txt = new_name

# f-strings are used so that non-string ID values are formatted rather than
# raising on concatenation; for string values the output is unchanged.
with open(proj_id_txt, 'w') as f:
    f.write(f"Project_name:{project_name}, Project_id:{labels.project.uid}\n"
            f"Ontology_name:{ontology_name}, Ontology_id:{labels.ontology}")
print("Project ID: ",labels.project.uid)
print("Ontology ID: ", labels.ontology)
print("File made at: ", proj_id_txt)