# YOLO training pipeline

Generic pipeline to train a classification model from observations with multiple images.


In [1]:
# Some basic setup:

import shutil
import random
import glob
import numpy as np
import zipfile
import pandas as pd

# Name of the dataset folder inside raw_data/ and data/
dataset = "tracks_indeed_actually"

# Taxa (binomial names) that are kept from the iNaturalist export
taxa = [
    "Alces alces",
    "Canis lupus",
    "Capreolus capreolus",
    "Lutra lutra",
    "Meles meles",
    "Sus scrofa",
    "Vulpes vulpes",
]

# Share of the observations that goes to each subset
ratios = {"train": 0.7, "validation": 0.15, "test": 0.15}

# Read the csv from the zip file. The archive handle gets its own name so the
# `zipfile` module is not shadowed (re-running this cell would otherwise fail),
# and the context managers close the archive once the csv has been parsed.
with zipfile.ZipFile("datasets/observations-482131.csv.zip") as archive:
    with archive.open("observations-482131.csv") as csv_file:
        inat_csv = pd.read_csv(csv_file)
indeed_csv = pd.read_csv("datasets/indeed.csv")

# Sample count that, scaled by the subset ratios, a taxon must reach to be used
min_samples = 50

Assuming that we have a folder with raw data, separated into labels, where each file name may or may not start with an observation id followed by an underscore (so that images from the same observation can all go to the same train/val/test set), copy all images into a data folder in which the sets have been determined.

The dataset folder in raw_data contains a folder per label.

Files can be grouped into "observations" by having the same filename before the first underscore.

These should be kept together in either train, validation or test set, with a ratio (specified above).

For each label, make a list of the observations, and copy all files in the observation to either the train, validation or test set in `./data/[dataset]/[train|val|test]/[label]` (the validation folder is named `val`).


In [13]:
# Bring the iNaturalist export into the three-column layout used below:
# order the rows by id and discard the columns that are not needed
# (species_guess, common_name, iconic_taxon_name, taxon_id).
inat_csv = inat_csv.sort_values("id").drop(
    columns=["species_guess", "common_name", "iconic_taxon_name", "taxon_id"]
)

# The remaining columns are renamed by position
inat_csv.columns = ["observation", "image_url", "taxon"]

In [None]:
# Based on the list ["observation", "image_url", "taxon"], download the images into a folder structure
# loop over the rows

import requests
import tqdm
import os

import requests

from PIL import Image
from multiprocessing import Pool

from requests import ConnectTimeout
from urllib3.exceptions import MaxRetryError

# Folder that receives the raw downloads before they are sorted
target_folder = f"raw_data/{dataset}/unclassified_images"

# Drop all rows where the taxon does not contain a space.
# na=False treats a missing taxon as "no match" instead of producing a NaN
# mask that cannot be used for indexing, and .copy() makes the column
# assignment below act on an independent frame rather than on a slice.
inat_csv = inat_csv[inat_csv["taxon"].str.contains(" ", na=False)].copy()

# Trim all taxa to the first two words
inat_csv["taxon"] = inat_csv["taxon"].apply(lambda x: " ".join(x.split()[:2]))

# Drop all rows where the taxon is not in the list
inat_csv = inat_csv[inat_csv["taxon"].isin(taxa)]

# Refuse to continue when a taxon folder is already present
for taxon in taxa:
    if os.path.exists(f"raw_data/{dataset}/{taxon}"):
        raise FileExistsError(f"Folder {taxon} exists, not downloading anything")
    


def download(url):
    """Download one image into ``target_folder``.

    The file is stored as ``img_<id>.jpg``, where ``<id>`` is the
    second-to-last "/"-separated part of ``url``. The request is made with
    "medium" replaced by "large" in the url. Empty/NaN urls and files that
    already exist are skipped. Failures are printed, never raised, so a single
    bad url does not abort the surrounding ``pool.map``.

    Args:
        url: image url, or NaN/None/"" when the row has no image.

    Returns:
        None
    """
    try:
        # if the url is empty or nan, skip
        if pd.isna(url) or url == "":
            return

        basename = "img_" + url.split("/")[-2]
        out_path = os.path.join(target_folder, basename + ".jpg")
        if os.path.exists(out_path):
            return

        # Write to a temporary name first: an interrupted transfer must not
        # leave a truncated .jpg behind that later runs would skip as done.
        partial_path = f"{out_path}.{os.getpid()}.part"

        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original, confirm that it is really required.
        with requests.get(
            url.replace("medium", "large"), stream=True, timeout=(5, 30), verify=False
        ) as r:
            if r.status_code != 200:
                print(f"{url} failed to download")
                return

            # let urllib3 undo any content-encoding while copying the raw stream
            r.raw.decode_content = True
            with open(partial_path, "wb") as f:
                shutil.copyfileobj(r.raw, f)

        os.replace(partial_path, out_path)
    except MaxRetryError:
        print(f"MaxRetryError {url}")
    except ConnectTimeout:
        print(f"ConnectTimeout {url}")
    except Exception as error:
        # Exception (not a bare except) so that KeyboardInterrupt still stops the run
        print(f"Other error with {url}: {error!r}")


os.makedirs(target_folder, exist_ok=True)

# Download with 16 worker processes. The context manager shuts the workers
# down once map() has returned instead of leaving them alive in the kernel.
with Pool(16) as pool:
    results = pool.map(download, inat_csv.image_url)

Laurens made an annotation tool that separates different kinds of pictures. Use its annotations to label the files.


In [19]:
# Move each file to the folder as specified by the indeed.csv


def _parse_labels(values):
    """Return the comma-separated labels in ``values[0]``; [] when missing or NaN."""
    if len(values) == 0 or pd.isna(values[0]):
        return []
    return values[0].split(",")


# loop through all downloaded files that have not been sorted yet
files = glob.glob(f"raw_data/{dataset}/unclassified_images/img_*.jpg")

for index, file in enumerate(files):
    # os.path.basename handles whichever separator glob returns on this platform
    basename = os.path.basename(file).split(".")[0]

    if "[" in file:
        # a label written between square brackets in the path is used as-is
        label = file.split("[")[1].split("]")[0]
        basename = basename.split(" [")[0]
    else:
        indeed_row = indeed_csv[indeed_csv["uid"] == basename]
        label_true = _parse_labels(indeed_row["label_true"].values)
        label_predicted = _parse_labels(indeed_row["label_predicted"].values)

        # NOTE(review): a non-empty label_true that contains "Track" is still
        # overruled when label_predicted lacks "Track" (and a predicted "Snow"
        # wins over a label_true without it) - confirm this precedence is intended.
        if (label_true and "Track" not in label_true) or "Track" not in label_predicted:
            label = "no track"
        elif (label_true and "Snow" in label_true) or "Snow" in label_predicted:
            label = "snow track"
        else:
            label = "other track"

    # the part after the last underscore is the id that appears in the image url
    basename_id = basename.split("_")[-1]
    inat_row = inat_csv[
        inat_csv["image_url"].str.contains(
            "/" + basename_id + "/", na=False, regex=False
        )
    ]

    if len(inat_row) != 1:
        print(f"Error with {basename_id}: {len(inat_row)} rows found")
        continue

    taxon = inat_row["taxon"].values[0]

    if " " not in taxon:
        continue

    taxon = " ".join(taxon.split(" ")[:2])

    observation = inat_row["observation"].values[0]

    # make the folder if it does not exist
    destination_folder = f"raw_data/{dataset}/{taxon}/{label}"
    os.makedirs(destination_folder, exist_ok=True)

    # the observation id goes first in the name so files can be grouped later
    shutil.move(file, f"{destination_folder}/{observation}_{index}.jpg")

In [None]:
# Give every downloaded image a name that starts with its observation id and
# file it under the folder of its taxon
for index, row in tqdm.tqdm(inat_csv.iterrows(), total=len(inat_csv)):
    observation, image_url, taxon = row["observation"], row["image_url"], row["taxon"]

    # rows without a url have no image to move
    if pd.isna(image_url) or image_url == "":
        continue

    # create the folder
    os.makedirs(f"raw_data/{dataset}/{taxon}", exist_ok=True)

    # work out where the image currently is and where it should go
    basename = "img_" + image_url.split("/")[-2]
    source_folder = f"raw_data/{dataset}/unclassified_images"
    source_path = os.path.join(source_folder, basename + ".jpg")
    out_path = os.path.join(f"raw_data/{dataset}/{taxon}", f"{observation}_{index}.jpg")

    # move only when the download exists and the target name is still free
    if os.path.exists(source_path) and not os.path.exists(out_path):
        shutil.move(source_path, out_path)

Now we have a folder per species, with subfolders containing "no track", "other track" and "snow track" images. File names start with the observation id for grouping.
Files that were classified incorrectly can be copied to a folder with "actually" appended, e.g. "snow track actually". Such files will be ignored in their original folder and included under the actual label, without having to remove or move anything.

The goal is to test different regimes, optimizing for snow track recognition

In [None]:
import os

# Which image label supplies each subset of this regime
subsets = dict(train="other track", validation="other track", test="snow track")

# The regime name lists the labels in train - validation - test order
regime_labels = " - ".join(subsets[name] for name in ("train", "validation", "test"))
regime_name = f"{dataset} ({regime_labels})"

if os.path.exists(f"data/{regime_name}"):
    print(f"Folder data/{regime_name} already exists, exiting")
else:
    # Get the observation from the filename
    def get_observation(filename):
        """Return the observation prefix of a file path.

        The prefix is everything in the file name up to and including the
        first underscore. A file name without an underscore is returned whole.
        """
        name = filename.rsplit('/', 1)[-1]
        prefix, separator, _rest = name.partition('_')
        return prefix + separator if separator else name


    def decide_set(ratios=ratios):
        """Randomly pick the subset for one observation.

        Args:
            ratios: mapping with optional "train" and "validation" shares.
                A missing key counts as 0, so a mapping without "train" no
                longer raises a KeyError. Whatever is left goes to "test".

        Returns:
            "train", "val" or "test". Note that the validation subset is
            returned as "val", not "validation".
        """
        r = random.random()
        train_share = ratios.get("train", 0)
        validation_share = ratios.get("validation", 0)

        if r < train_share:
            return "train"
        if r < train_share + validation_share:
            return "val"
        return "test"
    
    # Removes all files that should have been classified as a different class, and adds all files that should have been classified as this class
    def get_cleaned_files(taxon, label):
        """List the image paths that count as ``label`` for ``taxon``.

        Files in the ``label`` folder are dropped when a file with the same
        name exists in any "* actually" folder of the taxon. All files in the
        "<label> actually" folder are added.

        Args:
            taxon: name of the taxon folder inside raw_data/<dataset>.
            label: name of the label folder inside the taxon folder.

        Returns:
            list of paths (kept originals followed by the "actually" files).
        """
        taxon_folder = 'raw_data/' + dataset + '/' + taxon

        # Names of all files that were re-filed under a corrected label.
        # A set keeps the membership test below cheap, and os.path.basename
        # handles whichever separator glob returns on this platform.
        relabelled = {
            os.path.basename(path)
            for path in glob.glob(taxon_folder + '/* actually/*.jpg')
        }

        kept = [
            path
            for path in glob.glob(taxon_folder + '/' + label + '/*.jpg')
            if os.path.basename(path) not in relabelled
        ]
        corrected = glob.glob(taxon_folder + '/' + label + ' actually/*.jpg')

        return kept + corrected


    # For each value in the subsets dict, sum the ratios of the corresponding label
    labels = set(subsets.values())

    # Candidate taxa are the sub-folders of the raw dataset. They get their own
    # name so the configured `taxa` list is not overwritten, and entries that
    # are not folders are ignored.
    dataset_root = 'raw_data/' + dataset
    candidate_taxa = sorted(
        entry
        for entry in os.listdir(dataset_root)
        if os.path.isdir(os.path.join(dataset_root, entry))
    )

    adjusted_ratios = {}

    for label in labels:
        # get the keys of the subsets dict that have the right label
        keys = [key for key, value in subsets.items() if value == label]
        total_ratio = np.sum([ratios[key] for key in keys])

        # rescale so the subsets fed by this label share all of its observations
        for key in keys:
            adjusted_ratios[key] = ratios[key] / total_ratio

        needed = np.sum([ratios[key] * min_samples for key in keys])

        # keep only the taxa with enough images for this label
        remaining_taxa = []
        for taxon in candidate_taxa:
            if len(get_cleaned_files(taxon, label)) < needed:
                print(f"Taxon {taxon} has too few samples for label {label}")
            else:
                remaining_taxa.append(taxon)
        candidate_taxa = remaining_taxa

    # NOTE(review): subsets are drawn per label, so an observation that has
    # images under several labels can end up in different subsets - confirm
    # that this is acceptable.
    for label in labels:
        # subsets that are not fed by this label get a ratio of 0
        label_ratios = {
            key: (ratio if subsets[key] == label else 0)
            for key, ratio in adjusted_ratios.items()
        }

        for taxon in candidate_taxa:
            # Group the files of this label by observation in a single pass
            files_by_observation = {}
            for file in get_cleaned_files(taxon, label):
                files_by_observation.setdefault(get_observation(file), []).append(file)

            # Copy all files of an observation to the same set
            for observation_files in files_by_observation.values():
                destination = decide_set(label_ratios)
                destination_folder = 'data/' + regime_name + '/' + destination + '/' + taxon

                # create the destination folder if it doesn't exist
                os.makedirs(destination_folder, exist_ok=True)
                for file in observation_files:
                    shutil.copy(file, destination_folder + '/' + os.path.basename(file))

    print(f"Files copied to data/{regime_name}")



Now that we have the data, let's train a model!


In [None]:
import train as train

# Pick one of the following regimes to train the model on

# The regimes that can be trained on; the entry that is selected below is the
# one that gets used
available_regimes = [
    f"{dataset} (other track train)",
    f"{dataset} (snow track train)",
    f"{dataset} (mix train)",
]
regime_name = available_regimes[-1]

# train the model
train.train(dataset, regime_name)


In [28]:
from ultralytics import YOLO
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Test sets every trained model is evaluated on, plus their short plot labels
options = ["snow track", "mix", "other track"]
option_labels = [label.replace(" track", "") for label in options]
runs = glob.glob("runs/classify/train*/")

# Running number that is appended to the renamed validation output folders
dataset_index = 1

for run in runs:
    with open(os.path.join(run, "args.yaml")) as f:
        metadata = f.read()

    lines = metadata.split("\n")
    if len(lines) < 4:
        # same idea as the length guard in the plotting loops: skip runs
        # whose args.yaml is incomplete instead of failing on lines[3]
        print(f"Skipping {run}: args.yaml is too short")
        continue

    # assumes the 4th line is the `data:` entry and that its value ends in
    # "(<source> train)" - TODO confirm for other ultralytics versions
    data_source = lines[3].split("(")[1].split(" train)")[0]

    for testlabel in options:
        results_path = os.path.join(run, "test results for " + testlabel + ".csv")

        # results from an earlier execution are kept
        if os.path.exists(results_path):
            continue

        model = YOLO(os.path.join(run, "weights/best.pt"))  # initialize with best.pt

        # a model is scored on the "test" selection of a label it was trained
        # on (or when it was trained on the mix), otherwise on the "all" selection
        selectionset = "all"
        if testlabel == data_source or data_source == "mix":
            selectionset = "test"

        dataset_config = f"test_sets/{selectionset} {testlabel}.yaml"

        metrics = model.val(data=dataset_config, split="test")

        # save the results to a csv file
        # (the `dir` row holds the folder name from before the rename below)
        with open(results_path, "w") as f:
            f.write("metric,value\n")
            f.write(f"top1,{metrics.results_dict.get('metrics/accuracy_top1')}\n")
            f.write(f"top5,{metrics.results_dict.get('metrics/accuracy_top5')}\n")
            f.write(f"dir,{str(metrics.save_dir)}\n")

        # rename the directory to include the test label
        save_dir = str(metrics.save_dir)
        os.rename(
            save_dir,
            f"{save_dir} {dataset_index} (train {data_source}, test {testlabel})",
        )
        dataset_index += 1

# --------------------------------------------
# Make the accuracy plot

plotdata = []

for run in runs:
    # read the results metadata yaml as a text file
    with open(f"{run}/args.yaml") as f:
        metadata = f.read()

    # runs whose args.yaml is too short to hold the lines used below are skipped
    lines = metadata.split("\n")
    if len(lines) < 9:
        continue

    # assumes fixed line positions for `model`, `data` and `imgsz` in
    # args.yaml - TODO confirm for other ultralytics versions
    data_source = lines[3].split("(")[1].split(" train)")[0]
    yolomodel = lines[2].split("model: yolo")[1].split("-cls.pt")[0]
    imgsize = lines[8].split("imgsz: ")[1]

    # The run paths come from a glob pattern that ends in "/", so the piece
    # after the last "/" is empty; normalise the path before taking its name.
    run_number = os.path.basename(os.path.normpath(run))

    run_csv = pd.read_csv(f"{run}/results.csv", sep=",")

    # presumably columns 3 and 4 of results.csv hold the top-1 / top-5
    # accuracy per epoch - verify against the csv header
    datarow = {
        "run": run_number,
        "data_sources": data_source,
        "yolomodel": yolomodel,
        "imgsize": imgsize,
        "top1_accuracy": run_csv[run_csv.columns[3]].values,
        "top5_accuracy": run_csv[run_csv.columns[4]].values,
    }

    plotdata.append(datarow)


# plot a graph of the top1_accuracies over time, one line per run

fig, ax = plt.subplots(layout="constrained", figsize=(7, 4))

ax.set_xlabel("Epoch")
ax.set_ylabel("Top 1 accuracy")

legend = []

for plotdata_row in plotdata:
    ax.plot(plotdata_row["top1_accuracy"])
    # add a legend with the data sources
    legend.append(plotdata_row["data_sources"].split(" - ")[0].replace(" track", ""))

ax.legend(legend)

# Save next to the run folders. The target is derived from `runs` rather than
# from the loop variable, which does not exist when there are no runs.
if runs:
    plot_folder = os.path.dirname(os.path.normpath(runs[-1]))
    fig.savefig(os.path.join(plot_folder, "top1_accuracy.png"))
else:
    print("No runs found, top1_accuracy.png was not saved")


# --------------------------------------------
# Make the TEST accuracy plot

plotdata = []

dataset_index = 1

for run in runs:
    # read the results metadata yaml as a text file
    with open(f"{run}/args.yaml") as f:
        metadata = f.read()

    # runs whose args.yaml is too short to hold the lines used below are skipped
    lines = metadata.split("\n")
    if len(lines) < 9:
        continue

    # assumes fixed line positions for `model`, `data` and `imgsz` in
    # args.yaml - TODO confirm for other ultralytics versions
    data_source = lines[3].split("(")[1].split(" train)")[0]
    yolomodel = lines[2].split("model: yolo")[1].split("-cls.pt")[0]
    imgsize = lines[8].split("imgsz: ")[1]

    # The run paths come from a glob pattern that ends in "/", so the piece
    # after the last "/" is empty; normalise the path before taking its name.
    run_number = os.path.basename(os.path.normpath(run))

    run_csv = pd.read_csv(f"{run}/results.csv", sep=",")

    for testlabel in options:
        results_path = os.path.join(run, "test results for " + testlabel + ".csv")
        if not os.path.exists(results_path):
            continue

        test_results = pd.read_csv(results_path, sep=",")
        top1_rows = test_results[test_results["metric"] == "top1"]
        top5_rows = test_results[test_results["metric"] == "top5"]

        datarow = {
            "run": run_number,
            "train": data_source,
            "test": testlabel,
            "yolomodel": yolomodel,
            "imgsize": imgsize,
            "top1_accuracy": run_csv[run_csv.columns[3]].values,
            "top5_accuracy": run_csv[run_csv.columns[4]].values,
            "top1_accuracy_test": float(top1_rows["value"].values[0]),
            "top5_accuracy_test": float(top5_rows["value"].values[0]),
        }

        plotdata.append(datarow)


# prepare a plot with two label rows on the x-axis: the upper row names the
# training data (snow, mix, other), the lower row the test data (snow, mix, other)
fig, ax = plt.subplots(layout="constrained", figsize=(7, 4))

group_size = len(options)
positions = list(range(group_size**2))

ax.set_xticks(positions, labels=option_labels * group_size)
ax.tick_params("x", length=0)

# dashed reference line at 1/7
ax.axhline(1 / 7, linestyle="--", color="black", linewidth=0.5)

# label the classes:
sec = ax.secondary_xaxis(location=0)
sec.set_xticks(
    [-1] + positions,
    labels=["TRAIN\n\nTEST"]
    + ["\n\n" + name for name in option_labels for _ in option_labels],
)
sec.tick_params("x", length=0)

# lines between the classes:
sec2 = ax.secondary_xaxis(location=0)
sec2.set_xticks([group * group_size - 0.5 for group in range(group_size)], labels=[])
sec2.tick_params("x", length=40, width=1.5)
ax.set_xlim(-1.5, group_size**2 - 0.5)
ax.set_ylim(0, 1)


for plotdata_row in plotdata:
    if plotdata_row["train"] not in options:
        # runs trained on a source that has no column in this plot
        print(
            f"Skipping run {plotdata_row['run']}: "
            f"unknown training source {plotdata_row['train']!r}"
        )
        continue

    # the test label selects the group, the training source the slot inside it
    x = (
        options.index(plotdata_row["train"])
        + options.index(plotdata_row["test"]) * group_size
    )
    ax.plot(x, plotdata_row["top1_accuracy_test"], marker="o", markersize=5)


# Save next to the run folders. The target is derived from `runs` rather than
# from the loop variable, which does not exist when there are no runs.
if runs:
    plot_folder = os.path.dirname(os.path.normpath(runs[-1]))
    fig.savefig(os.path.join(plot_folder, "top1_test_accuracy.png"))
else:
    print("No runs found, top1_test_accuracy.png was not saved")