In [None]:
# default_exp core

In [None]:
#hide
%%capture
from google.colab import drive
drive.mount("/content/drive")

In [None]:
#hide
%cd "/content/drive/MyDrive/Coding/ModelAssistedLabel"
%run "./00_ultralytics.ipynb"

Setup complete. Using torch 1.8.0+cu101 _CudaDeviceProperties(name='Tesla P100-PCIE-16GB', major=6, minor=0, total_memory=16280MB, multi_processor_count=56)
/content/drive/MyDrive/Coding/ModelAssistedLabel


In [None]:
%run "_Synch.ipynb"

autorelader
Converted 00_ultralytics.ipynb.
Converted 01_split.ipynb.
Converted 02_augment.ipynb.
Converted index.ipynb.
converting: /content/drive/My Drive/Coding/ModelAssistedLabel/01_split.ipynb


In [None]:
#hide 
%load_ext autoreload 
%autoreload 2

In [None]:
from ModelAssistedLabel.core import Defaults

In [None]:
# Instantiate the project defaults and display every attribute set on the instance.
d = Defaults()
vars(d)

{'resource_folder': '/content/drive/MyDrive/Coding/Roboflow/try it out',
 'root': '/content/drive/MyDrive/Coding/ModelAssistedLabel/',
 'split_ratio': {'test': 0.1, 'train': 0.7, 'valid': 0.2}}

In [None]:
#hide
import os
# define pathway to the weights
# Filename of the weights file for each detector, keyed by detector name.
weight_filenames = dict(
    lcd="21-2-20-94-universal-lcd.pt",
    digits="21-2-25 1k-digits YOLOv5-weights.pt",
)

# detectors = []
# for filename in weight_filenames:
# weights_path = os.path.join(resource_folder, weights_filename)



# FileSystems 01

> Handles the logistics of grouping, moving, and otherwise wrangling groups of images and their labels.

In [None]:
# File extension that identifies each kind of resource.
resource_map = dict(images=".jpg", labels=".txt")

# The individual extensions stay available under their own names as well.
images, labels = resource_map["images"], resource_map["labels"]

In [None]:
# export

import glob
import os
import shutil
from os.path import join

class FileUtilities:
  """
  Stateless helpers for finding resource files on disk and pairing them up.

  Every helper is a static method, so it can be called on the class
  (`FileUtilities.match_files(folder)`) or on an instance.
  """

  # Resource kind -> file extension. Used whenever the defining module has no
  # module-level `resource_map` of its own (e.g. in the exported library).
  DEFAULT_RESOURCE_MAP = {"images": ".jpg", "labels": ".txt"}

  @staticmethod
  def _resources():
    """
    Returns the active resource map: the module-level `resource_map` when one
    is defined next to this class, otherwise `DEFAULT_RESOURCE_MAP`.
    """
    return globals().get("resource_map", FileUtilities.DEFAULT_RESOURCE_MAP)

  @staticmethod
  def collect_files(walk_dir, recursive):
    """
    By default, returns all the ".jpg" and ".txt" files in a directory. The filetypes
    are specified by the :resource_map:.

    Args:
      walk_dir: directory from which to pull resources
      recursive: if `True`, recursively searches the folder (and every folder
      below it) for the desired resource. Otherwise only files located directly
      in `walk_dir` are returned.

    Returns:
      A dictionary keyed to the :resource_map: with each value being a list of
      dictionaries, one per file found, holding:
        "pair_id": the file's basename without the extension
        "path": the path as reported by glob
        "basename": the file's basename
    """
    # `**` only means "any depth" when glob runs recursively. Without the
    # recursive flag it behaves like `*`, which would skip the files sitting
    # directly in `walk_dir` and look exactly one level down instead.
    pattern_prefix = '/**/*' if recursive else '/*'

    res = {}
    for key, extension in FileUtilities._resources().items():
      found = []
      for path in glob.iglob(walk_dir + pattern_prefix + extension, recursive=recursive):
        basename = os.path.basename(path)
        found.append({
            # computed from the lengths so an empty extension keeps the full name
            "pair_id": basename[:len(basename) - len(extension)],
            "path": path,
            "basename": basename})
      res[key] = found
    return res

  @staticmethod
  def matched(file_collection):
    """
    Pairs up an image and label based on a shared resource name.

    Args:
      file_collection: the result of `collect_files`, i.e. a dictionary with
      one list of file records per resource kind.

    Returns:
      A list with one dictionary per `pair_id` that is present for every
      resource kind. Each dictionary maps the resource kind to its file record.
      When several files of one kind share a `pair_id`, the first one listed is
      used. Pairs are sorted by `pair_id` so the result is deterministic.
    """
    kinds = list(FileUtilities._resources())
    if not kinds:
      return []

    # One lookup table per kind, so pairing is linear rather than a rescan of
    # every list for every id.
    by_kind = {}
    for kind in kinds:
      lookup = {}
      for record in file_collection[kind]:
        lookup.setdefault(record["pair_id"], record)  # first occurrence wins
      by_kind[kind] = lookup

    shared_ids = set.intersection(*(set(by_kind[kind]) for kind in kinds))
    return [{kind: by_kind[kind][pair_id] for kind in kinds}
            for pair_id in sorted(shared_ids)]

  @staticmethod
  def match_files(walk_dir, recursive=True):
    """
    Collects the resources below `walk_dir` and returns them paired up.

    Args:
      walk_dir: directory from which to pull resources
      recursive: forwarded to `collect_files`

    Returns:
      The list of pairs produced by `matched`.
    """
    return FileUtilities.matched(FileUtilities.collect_files(walk_dir, recursive=recursive))

  @staticmethod
  def mkdir(dir):
    """
    Creates the directory `dir`, including missing parent directories. Does
    nothing when the path already exists.

    Args:
      dir: path of the directory to create
    """
    if not os.path.exists(dir):
      # exist_ok covers the case where the directory appears between the check
      # above and this call
      os.makedirs(dir, exist_ok=True)


In [None]:
# export
from ModelAssistedLabel.core import Defaults
from datetime import datetime
import math, random

class Generation:
  """
    Container and organizer of photos for a given repository.

    Typical use: `set_split` decides which image/label pairs belong to the
    train/valid/test sets, then `process_split` copies them into that folder
    layout, adds a `data.yaml`, zips everything and moves the archive to
    `out_dir`.
  """

  def __init__(self, repo, out_dir, data_yaml):
    """
      Args:
        repo: <string> path to the parent directory of the repository.
        out_dir: <string> directory that receives the finished zip archive.
        data_yaml: <string or iterable of strings> content written to the
        archive's `data.yaml` file.
    """
    self.repo = repo
    self.split = None
    self.data_yaml = data_yaml
    self.out_dir = out_dir

  def set_split(self, split_ratio = None, MAX_SIZE=None, seed=None):
    """
    Sets the value of `self.split` 

    The matched image/label pairs found in `self.repo` are shuffled and cut
    into three consecutive slices: "train" receives ceil(n * train ratio)
    pairs, "valid" the next ceil(n * valid ratio) pairs and "test" whatever
    remains.

    Args:
      split_ratio: relative fractions of split between test train and validation
      sets. Only the "train" and "valid" entries are read. Defaults to
      `Defaults().split_ratio`.
      MAX_SIZE: The total number of images to be used in the image set 
      seed: optional seed for the shuffle. When given, the pairs are put into a
      fixed order before shuffling so the same seed always yields the same
      split. When `None`, the module-level random generator is used.
    """
    if split_ratio is None:
      split_ratio = Defaults().split_ratio

    files = FileUtilities.match_files(self.repo)
    if seed is None:
      random.shuffle(files)
    else:
      # a seeded shuffle is only reproducible if its input order is fixed
      files.sort(key=lambda pair: sorted(record["path"] for record in pair.values()))
      random.Random(seed).shuffle(files)
    if MAX_SIZE:
      files = files[:MAX_SIZE]

    train = math.ceil(len(files) * split_ratio["train"])
    valid = train + math.ceil(len(files) * split_ratio["valid"])

    split =  {"train": files[:train],
    "valid": files[train: valid],
    "test": files[valid:]}

    assert sum([len(split[x]) for x in split]) == len(files)
    self.split = split
  

  def process_split(self, descriptor = "", autoname_output=True):
    """
    Takes the given `self.split` and writes the split of the data to disk. Also
    writes a data.yaml file to retain class label information.

    The intermediate folders are created in the current working directory. The
    finished archive is moved into `self.out_dir`, which is created if it does
    not exist yet.

    Args:
      descriptor: <str> a unique identifier for the output's filename
      autoname_output: <bool> if True, `descriptor` field is a component of the
      output's filename. Otherwise, it is the entire name.

    Returns:
      A path to the zipped information.
    """
    assert self.split is not None, "call `set_split` before `process_split`"

    if autoname_output:
      out_folder = self.default_filename(descriptor)
    else:
      assert len(descriptor) > 0, "need to provide a filename with `descriptor` argument"
      out_folder = descriptor
      
    dirs = self.write_images() #write images
    zipped = self.zip_dirs(out_folder, dirs) #zip folders

    # Without an existing target directory the archive would be renamed to
    # `out_dir` itself instead of being placed inside of it.
    os.makedirs(self.out_dir, exist_ok=True)
    destination = join(self.out_dir, os.path.basename(zipped))
    shutil.move(zipped, destination) #move the output
    return destination


  def zip_dirs(self, folder, dirs):
    """
    Takes an array of resources and places them all as the children in a specified
    `folder`.

    Args:
      folder: Ultimately will be transformed into `folder.zip`
      dirs: resources to become zipped. When `None`, the directories named
      after the keys of `self.split` in the working directory are used.

    Returns:
      the name of the zip file uniting the resources in `dirs`
    """
    if dirs is None:
      dirs = [join("./", name) for name in self.split]

    os.makedirs(folder, exist_ok=True)
    self.write_data_yaml(folder)
    for subdir in dirs:
      shutil.move(subdir, folder)

    # base_dir keeps `folder` as the top-level entry inside the archive
    shutil.make_archive(folder, "zip", base_dir=folder)
    shutil.rmtree(folder, ignore_errors=True)
    return f"{folder}.zip"

  
  def write_images(self):
    """
    If the dataset has already been split, then write the files to disk accordingly.
    All resources are present two levels deep. The top folders are named according
    to "test"/"train"/"valid". The mid-level folders are named "images" or "labels".
    Resources can be found in the corresponding folder.

    The folders are created in the current working directory. A file that
    already exists at its destination is left untouched.

    Returns:
      A list of directories to the test/train/valid split
    """
    assert self.split is not None, "call `set_split` before `write_images`"
    directories = []
    for dirname, pairs in self.split.items(): 
      split_dir = join("./", dirname) #test/valid/train
      os.makedirs(split_dir, exist_ok=True)
      directories.append(split_dir)
      for pair in pairs:
        for resource, data in pair.items():
          subdir = join(split_dir, resource)
          os.makedirs(subdir, exist_ok=True)

          destination = join(subdir, data["basename"])
          if not os.path.exists(destination): 
            shutil.copy(data["path"], destination)
    return directories
    
  def default_filename(self, prefix=""):
    """
    Helper to ease the burden of continually generating unique names or accidentally
    overwriting important data.

    Args:
      prefix: text inserted between the repository's folder name and the
      timestamp.

    Returns:
      `<last component of self.repo><prefix> <yy-mm-dd HH-MM-SS>`
    """
    timestamp = datetime.now().strftime(" %y-%m-%d %H-%M-%S") # current date and time
    # trailing slashes are dropped first, otherwise the last component is empty
    repo_name = self.repo.rstrip("/").split("/")[-1]
    return repo_name + prefix + timestamp

  def write_data_yaml(self, folder="./"):
    """
    Writes `self.data_yaml` to the file `data.yaml` inside of `folder`,
    replacing any previous content.

    Args:
      folder: directory that receives the file
    """
    with open(join(folder, "data.yaml"), "w") as f:
      f.writelines(self.data_yaml)
  

In [None]:
#hide
from fastcore.all import *

Test a use case. Define `repo`, the location of the images and their associated labels, and `out_dir`, the directory where the zipped and organized output is stored.

In [None]:
# NOTE(review): the recorded output of this cell is an AttributeError, and the
# `Defaults` instance displayed earlier in this notebook shows no `data_yaml`
# attribute -- confirm that `Defaults` actually provides it.
generation_args = dict(
    repo="/content/drive/MyDrive/Coding/Roboflow Export (841)",
    out_dir="/content/drive/MyDrive/Coding/01_train",
    data_yaml=Defaults().data_yaml,
)
g = Generation(**generation_args)

# Use at most four image/label pairs so the example stays small.
g.set_split(MAX_SIZE=4)

# The descriptor becomes part of the automatically generated archive name.
out_dir = g.process_split(descriptor=" processor")

AttributeError: ignored

Make sure that moving the archive into the working directory will not overwrite an existing file.

In [None]:
# Despite its name, `out_dir` holds the path of the zip file returned by
# `process_split`. Check that no file with the archive's name exists in the
# working directory before the archive is moved there.
test_eq_type(os.path.exists(os.path.basename(out_dir)), False)
# Name of the zip file; reused by the cells below.
z = os.path.basename(out_dir)
!mv "{out_dir}" . #help clean up

mv: cannot stat '/content/drive/MyDrive/Coding/01_train/Roboflow Export (841) processor 21-03-15 18-19-56.zip': No such file or directory


Unzip and check if the number of files is as expected according to `g.split`

In [None]:
# Extract the archive into the working directory.
!unzip "./{z}"
# `z[:-4]` drops the ".zip" suffix, which leaves the name of the extracted folder.
ls = !ls "{z[:-4]}/train/labels"
# With MAX_SIZE=4 and the default train ratio of 0.7 shown earlier in this
# notebook, ceil(4 * 0.7) = 3 pairs are expected in the train set.
test_eq(len(ls), 3)

Archive:  ./Roboflow Export (841) processor 21-03-15 18-19-56.zip
   creating: Roboflow Export (841) processor 21-03-15 18-19-56/
   creating: Roboflow Export (841) processor 21-03-15 18-19-56/train/
   creating: Roboflow Export (841) processor 21-03-15 18-19-56/train/images/
  inflating: Roboflow Export (841) processor 21-03-15 18-19-56/train/images/save_dirrtake-77-jpg-cropped-jpg-jpg_jpg.rf.fc755d0260916bf6d6126082a672aa3b.jpg  
  inflating: Roboflow Export (841) processor 21-03-15 18-19-56/train/images/digittake-357-jpg_jpg.rf.6446f16c5c7157a36793a382780bbb31.jpg  
  inflating: Roboflow Export (841) processor 21-03-15 18-19-56/train/images/digittake-33-jpg_jpg.rf.f1d00d9a9f1e64e8da2e528824bbb27f.jpg  
   creating: Roboflow Export (841) processor 21-03-15 18-19-56/train/labels/
  inflating: Roboflow Export (841) processor 21-03-15 18-19-56/train/labels/save_dirrtake-77-jpg-cropped-jpg-jpg_jpg.rf.fc755d0260916bf6d6126082a672aa3b.txt  
  inflating: Roboflow Export (841) processor 21-0

Remove the extracted test folder and the test zip file.

In [None]:
# Delete the extracted folder (`z` without its ".zip" suffix) ...
!rm -f -r "{z[:-4]}"
# ... and then the archive itself.
!rm "{z}"