# YOLO training pipeline

Generic pipeline to train a classification model from observations with multiple images.


In [16]:
# Some basic setup:

import shutil
import random
import glob
import numpy as np
import zipfile
import pandas as pd

# Name of the dataset. It is reused for the CSV file name and for the
# raw_data/<dataset> and data/<dataset> folders in the cells below.
dataset = "litter"

# Share of observations assigned to each split.
ratios = {"train": 0.7, "validation": 0.15, "test": 0.15}

# Read the csv that belongs to the dataset (the columns used below are
# "object_id", "filename" and "label").
# NOTE: this variable shadows the stdlib module name `csv`; avoid
# `import csv` elsewhere in this notebook.
csv = pd.read_csv(f"datasets/{dataset}.csv")

# Label folders with fewer files than this are left out of the split.
min_samples = 40

This step assumes a raw-data folder with one subfolder per label, where each file name may start with an observation id followed by an underscore. Images from the same observation must all go to the same train/val/test set, so all images are copied into a data folder in which the sets have already been determined.

The dataset folder in raw_data contains a folder per label.

Files can be grouped into "observations" by having the same filename prefix up to the last underscore (files are named `[observation]_[index].jpg`).

These should be kept together in either train, validation or test set, with a ratio (specified above).

For each label, make a list of the observations, and copy all files of each observation to either the train, validation or test set in `./data/[dataset]/[train|val|test]/[label]` (the validation folder is named `val`).


In [None]:
import tqdm
import os

def _clean_field(value):
    """Return *value* as a stripped string, or None when it is missing or blank.

    The NaN test has to happen before the str() conversion: str(nan) is the
    non-empty string "nan", which would otherwise pass every emptiness check.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


# Rename each file so that it has the observation id in the name and move it
# from raw_data/<dataset>/ into raw_data/<dataset>/<label>/.
source_folder = f"raw_data/{dataset}"

for index, row in tqdm.tqdm(csv.iterrows(), total=len(csv)):
    observation = _clean_field(row["object_id"])
    image_url = _clean_field(row["filename"])
    taxon = _clean_field(row["label"])

    # Skip rows that lack a file name, an observation id or a label; a
    # missing label would otherwise end up in a folder literally named "nan".
    if image_url is None or observation is None or taxon is None:
        continue

    source_path = os.path.join(source_folder, image_url)
    taxon_folder = f"raw_data/{dataset}/{taxon}"
    # The row index keeps names unique when one observation has several images.
    out_path = os.path.join(taxon_folder, observation + "_" + str(index) + ".jpg")

    target_exists = os.path.exists(out_path)
    source_exists = os.path.exists(source_path)
    if target_exists or not source_exists:
        print("Skipping", out_path)
        print("File exists", target_exists)
        print("Source exists", source_path, source_exists)
        continue

    # Create the label folder only when a file is actually going to be moved,
    # so that rows with a missing source do not leave empty folders behind.
    os.makedirs(taxon_folder, exist_ok=True)
    shutil.move(source_path, out_path)

Now we have a folder per species, with subfolders containing "no track", "other track" and "snow track" images. File names start with the observation id for grouping.
Files that were classified wrong can be copied to a folder with "actually" appended, i.e. "snow track actually". Such files will be ignored in their original folder and included as the actual label, without having to remove or move anything.

The goal is to test different regimes, optimizing for snow track recognition

In [None]:
import os

if os.path.exists(f"data/{dataset}"):
    # Never re-split an existing dataset: that would reshuffle observations
    # between the sets.
    print(f"Folder data/{dataset} already exists, exiting")
else:

    def get_observation(filename):
        """Return the observation key of *filename*.

        The key is the base name up to and including the last underscore
        (e.g. "12_3_7.jpg" -> "12_3_"). A name without an underscore is its
        own key, so such a file forms a single-file observation.
        """
        filename = filename.split("/")[-1]
        if "_" not in filename:
            return filename
        return filename[: filename.rfind("_")] + "_"

    def decide_set(ratios=ratios):
        """Randomly pick "train", "val" or "test" according to *ratios*.

        Missing "train" or "validation" keys count as a share of 0;
        everything that is left over goes to "test".
        """
        r = random.random()
        train_share = ratios.get("train", 0)
        if r < train_share:
            return "train"
        if r < train_share + ratios.get("validation", 0):
            return "val"
        return "test"

    regime_name = dataset
    source_root = "raw_data/" + dataset
    folders = os.listdir(source_root)

    for folder in folders:
        path = source_root + "/" + folder

        # Loose files (e.g. images that were never moved into a label
        # folder) are not labels; os.listdir would fail on them.
        if not os.path.isdir(path):
            continue

        files = os.listdir(path)
        if len(files) < min_samples:
            print(f"Taxon {folder} has too few samples")
            continue

        label = folder.split("/")[-1]

        # Group on the exact observation key. Prefix matching would let the
        # observation "12_" also claim the files of "12_3_", copying them
        # into two different sets.
        files_by_observation = {}
        for filename in files:
            files_by_observation.setdefault(get_observation(filename), []).append(
                filename
            )

        for observation_files in files_by_observation.values():
            # One draw per observation keeps all of its images in one set.
            destination = decide_set()
            target_folder = "data/" + regime_name + "/" + destination + "/" + label
            os.makedirs(target_folder, exist_ok=True)

            for file in observation_files:
                shutil.copy(
                    os.path.join(path, file),
                    target_folder + "/" + os.path.basename(file),
                )

    print(f"Files copied to data/{regime_name}")



Now that we have the data, let's train a model!


In [None]:
import train as train

# Train the model for `dataset` through the project-local `train` module.
# NOTE(review): the second argument (10) is presumably the number of epochs;
# confirm against the signature of train.train.
train.train(dataset, 10)


In [None]:
from ultralytics import YOLO
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Evaluate every classification run on the test split and store the metrics
# next to the run as "test results.csv". Sorted so runs are handled in a
# stable order.
runs = sorted(glob.glob("runs/classify/train*/"))

# NOTE(review): not referenced in this cell; kept in case other cells read it.
dataset_index = 1

for run in runs:
    results_path = os.path.join(run, "test results.csv")

    # Runs that already have results are skipped before any file is read.
    if os.path.exists(results_path):
        continue

    # A run that stopped before writing weights must not abort the others.
    weights_path = os.path.join(run, "weights/best.pt")
    if not os.path.exists(weights_path):
        print(f"Skipping {run}: no weights/best.pt found")
        continue

    # Look up the "data:" entry by key instead of by line number, so a
    # different layout of args.yaml cannot raise an IndexError.
    # NOTE(review): data_path is informational only; the evaluation below
    # always uses test_sets/<dataset>.yaml.
    data_path = None
    args_path = os.path.join(run, "args.yaml")
    if os.path.exists(args_path):
        with open(args_path) as f:
            metadata = f.read()
        for line in metadata.split("\n"):
            if line.startswith("data: "):
                data_path = line.split("data: ", 1)[1]
                break

    model = YOLO(weights_path)  # initialize with best.pt
    dataset_config = f"test_sets/{dataset}.yaml"
    metrics = model.val(data=dataset_config, split="test")

    # The with-statement closes the file; no explicit close() is needed.
    with open(results_path, "w") as f:
        f.write("metric,value\n")
        f.write(f"top1,{metrics.results_dict.get('metrics/accuracy_top1')}\n")
        f.write(f"top5,{metrics.results_dict.get('metrics/accuracy_top5')}\n")
        f.write(f"dir,{str(metrics.save_dir)}\n")