## Install Libraries

In [None]:
pip install ultralytics

In [None]:
# Mount Google Drive at /content/drive so the project folder used below
# (PROJECT_DIR lives under /content/drive/MyDrive) is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

In [None]:
from ultralytics import YOLO
from ultralytics.data.utils import autosplit

## Define Constants

In [None]:
# Root of the project on the mounted Drive; split() reads the
# autosplit_*.txt list files from this folder.
PROJECT_DIR = "/content/drive/MyDrive/Verizon ML Project"
# Folder holding the images and their YOLO .txt label files; split() creates
# the images/ and labels/ sub-folders inside it.
DATASET_DIR = os.path.join(PROJECT_DIR, "dataset")

## Split and Merge Functions

In [None]:
split_paths = set() # element => (img_path, label_path)

def split():
    """Partition the flat dataset into train/val/test image and label folders.

    Runs ``autosplit`` on ``DATASET_DIR`` (70% / 20% / 10%, annotated images
    only), then reads the resulting ``autosplit_<split>.txt`` lists from
    ``PROJECT_DIR`` and moves every listed image into
    ``DATASET_DIR/images/<split>`` and its same-named ``.txt`` label into
    ``DATASET_DIR/labels/<split>``.

    Each ``(image_dir, label_dir)`` pair is recorded in the module-level
    ``split_paths`` set so that ``is_valid_split()`` and ``merge()`` know
    which folders to inspect.

    An image/label pair is only moved when both files exist, so a pair is
    never left half-moved by a missing file; skipped or failed entries are
    reported with ``print`` instead of being silently ignored.
    """

    # split dataset into train, val, and test
    autosplit(
        path=DATASET_DIR,
        weights=(0.7, 0.2, 0.1),
        annotated_only=True # only images w/ .txt files are split
    )

    images_path = os.path.join(DATASET_DIR, "images")
    labels_path = os.path.join(DATASET_DIR, "labels")

    # key (folder) => value (corresponding .txt from autosplit)
    splits = {"train":"autosplit_train.txt",
              "val":"autosplit_val.txt",
              "test":"autosplit_test.txt"}

    for split_type, split_txt in splits.items():

        # create partitioned directories (ex. ./dataset/images/train);
        # makedirs also creates the images/ and labels/ parents, and
        # exist_ok makes re-running the cell harmless.
        split_img_path = os.path.join(images_path, split_type)
        split_label_path = os.path.join(labels_path, split_type)
        split_paths.add((split_img_path, split_label_path))
        os.makedirs(split_img_path, exist_ok=True)
        os.makedirs(split_label_path, exist_ok=True)

        # read the list up front so the file handle is closed before moving
        with open(os.path.join(PROJECT_DIR, split_txt)) as f:
            listed = [line.strip() for line in f if line.strip()]

        # move data into partitions
        for rel_path in listed:
            img_path = os.path.join(PROJECT_DIR, rel_path)
            # swap only the real extension, whatever it is (.png, .jpg,
            # .jpeg, upper-case variants, ...), for .txt
            txt_path = os.path.splitext(img_path)[0] + ".txt"

            # CaseDesign_PhoneModels_Number.png / .txt
            new_img_path = os.path.join(split_img_path, os.path.basename(img_path))
            new_txt_path = os.path.join(split_label_path, os.path.basename(txt_path))

            # never move an image without its label (or vice versa)
            if not (os.path.isfile(img_path) and os.path.isfile(txt_path)):
                print(f"skipping {rel_path}: image or label file not found")
                continue

            try:
                os.rename(img_path, new_img_path)
                os.rename(txt_path, new_txt_path)
            except OSError as err:
                print(f"could not move {rel_path}: {err}")

def is_valid_split():
    """Check that every split folder pairs each image with exactly one label.

    For each ``(image_dir, label_dir)`` recorded in ``split_paths`` the image
    file names (extension removed) must equal the label file names (``.txt``
    removed).

    Returns:
        bool: ``False`` as soon as one split has mismatching names, ``True``
        otherwise. When ``split_paths`` is empty nothing can be checked; the
        function still returns ``True`` but prints a warning.
    """
    if not split_paths:
        # split_paths is only filled by split(); e.g. after a kernel restart
        print("warning: split_paths is empty, nothing was validated (run split() first)")

    for img_path, label_path in split_paths:
        # strip only the trailing extension so names containing ".png"/".jpg"
        # elsewhere, or using other image extensions, compare correctly
        img_files = {os.path.splitext(f)[0] for f in os.listdir(img_path)}
        # only a trailing ".txt" is removed; any other file in the label
        # folder keeps its full name and therefore shows up as a mismatch
        label_files = {
            f[:-len(".txt")] if f.endswith(".txt") else f
            for f in os.listdir(label_path)
        }
        if img_files != label_files:
            return False
    return True

def merge():
    """Undo ``split()``: move all split images and labels back into DATASET_DIR.

    Walks every ``(image_dir, label_dir)`` pair stored in ``split_paths`` and
    renames each file found there to ``DATASET_DIR/<file name>``. The split
    folders themselves are left in place.
    """
    for folder_pair in split_paths:
        # images first, then labels, for each recorded split
        for source_dir in folder_pair:
            for name in os.listdir(source_dir):
                origin = os.path.join(source_dir, name)
                destination = os.path.join(DATASET_DIR, name)
                os.rename(origin, destination)

In [None]:
# Partition the dataset into images/<split> and labels/<split> folders.
split()

In [None]:
# Sanity check: True when each split's image names match its label names.
is_valid_split()

In [None]:
# Undo the split: move every image and label back into DATASET_DIR.
# The relabelling cell below scans the flat dataset folder, so run this
# first if the data is currently split.
merge()

## Relabelling

#### Create Label Map

In [None]:
# pandas is needed for read_excel; it is not imported anywhere earlier in the
# notebook, so import it here to keep this cell runnable on its own.
import pandas as pd

# pull excel sheet
df = pd.read_excel(os.path.join(PROJECT_DIR, 'Verizon Phone Cases Spreadsheet.xlsx'))
df_design_label = df[['Case Design', 'Label']]

# initialize maps
label_map = {} # case name => label
yaml_map = {}  # label => case name
# NOTE(review): yaml_map values are one-element lists, kept as-is for
# compatibility; confirm whether a plain string was intended.

# create maps
# Rows with a blank design or label are skipped (they cannot be mapped), and
# labels are stored as plain ints so they are later written as "3", not "3.0".
for design, label in df_design_label.dropna().itertuples(index=False, name=None):
    design_key = str(design).replace(" ", "")  # file names carry no spaces
    class_id = int(label)
    label_map[design_key] = class_id
    yaml_map[class_id] = [design_key]

label_map

#### Use Case Design in Filename to Assign Label in .txt file

In [None]:
def _relabel_yolo_text(content, new_id):
    """Return ``content`` with the class id of every annotation replaced.

    A YOLO label file holds one object per line and the first
    whitespace-separated token of each line is the class id. That token is
    replaced by ``new_id`` on every non-blank line; the rest of each line
    (coordinates and line ending) is kept unchanged. Blank lines are kept
    as they are.
    """
    relabelled = []
    for line in content.splitlines(keepends=True):
        stripped = line.lstrip()
        if not stripped:
            relabelled.append(line)
            continue
        old_id = stripped.split(None, 1)[0]
        relabelled.append(new_id + stripped[len(old_id):])
    return "".join(relabelled)


# Rewrites the class id inside every label file of the dataset folder, using
# the case design encoded in the file name to look up the id in label_map.

# assumes all data is not split (run merge() first if split() was run)
folder_path = DATASET_DIR

# iterate over all files in the folder
for filename in os.listdir(folder_path):

    # only text (label) files are rewritten
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, filename)

    # obtain case design from file name
    # (assuming name is CaseDesign_PhoneModels_Number.txt)
    case_design = filename.split('_')[0]
    if case_design not in label_map:
        print(case_design + " not in map")
        continue

    # get the corresponding value from the label_map
    new_id = str(label_map[case_design])

    # read file
    with open(file_path, 'r') as file:
        content = file.read()

    # replace the class id on every annotation line, not just the first one
    modified_content = _relabel_yolo_text(content, new_id)

    # write back only when something changed (empty files stay untouched)
    if modified_content != content:
        with open(file_path, 'w') as file:
            file.write(modified_content)

print("All text files have been updated.")

## Final Dataset w/ `.yaml` File

#### dataset.yaml
```
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]

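# NOTE: this must be the same folder as DATASET_DIR in the notebook, which is
# defined under "Verizon ML Project" (spaces, not underscores) - keep the two in sync.
path: /content/drive/MyDrive/Verizon_ML_Project/dataset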
train: images/train  # train images (relative to 'path')
val: images/val  # val images (relative to 'path')
test: images/test # test images

# Classes (25 case styles)
names:
  0: OtterBoxSymmetryCore
  1: CASETiFYMirror
  2: IncipioCru
  3: Dahliaa-la
  4: UAGPathfinder
  5: AfternoonInVersailles
  6: DefenderSeriesProBlack
  7: RegencyEra
  8: CherryMonCheri
  9: BowPosiePink
  10: Penelope
  11: LadyWhistledown
  12: FrenchBlue
  13: SpigenCoreArmor
  14: TouchOfPearl
  15: SymmetrySeriesBlack
  16: SundayinSoho
  17: VioletFloralMix
  18: VelvetCaviarChecker-NudeVibe
  19: MoonandStars
  20: GoldFloral
  21: HollyhockFloralClear
  22: UAGCivilianCase
  23: Camo
  24: MoodyFloral
```