# YOLOv5

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tarfile
import scipy.io
import os
from urllib.request import urlretrieve
import io
import shutil
import torch

In [None]:
# Clone the YOLOv5 repository into the current working directory.
! git clone https://github.com/ultralytics/yolov5  # clone repo
# Work from inside the clone: later cells call classify/train.py and
# classify/predict.py by relative path.
%cd yolov5
# Install the repository's requirements (errors here can be ignored), then
# install/upgrade the ultralytics package.
! pip install -qr requirements.txt
! pip install -U ultralytics

In [3]:
# Source URLs for the Oxford 102 Flowers images and their labels.
dataset = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz"
labels = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat"

# Local file names are the last path component of each URL.
tgz_file = dataset.rsplit('/', 1)[-1]
labels_file = labels.rsplit('/', 1)[-1]
urlretrieve(dataset, tgz_file)
urlretrieve(labels, labels_file)

# exist_ok=True keeps the cell re-runnable: without it a second execution
# fails with FileExistsError because the directory is already there.
os.makedirs("/model2/classes/", exist_ok=True)
with tarfile.open(tgz_file, 'r:gz') as archive:
    # Extract all files to the specified directory
    archive.extractall('/model2/classes/')

In [4]:
# Load the label file that was downloaded next to the notebook.
labels_mat_name = labels.rsplit('/', 1)[-1]
mat = scipy.io.loadmat(labels_mat_name)
label_row = mat['labels'][0]
# Use a 1-based index so that index k lines up with image_k.jpg, which is how
# copy_dataset builds file names from the index.
y = pd.Series(label_row, index=pd.RangeIndex(1, len(label_row) + 1))
unique_labels = y.unique()
print(f"Number of unique classes: {len(unique_labels)}")

Number of unique classes: 102


In [None]:
# Pair the alphabetically sorted image file names with their class labels
# (labels are stored as strings); the frame takes its index from y.
image_filenames = sorted(os.listdir('/model2/classes/jpg'))
class_labels = y.astype(str)
df = pd.DataFrame({"filename": image_filenames, "class": class_labels})
print(df)

To run the YOLOv5 model, the dataset needs to be organized in a specific way:

Start with a base directory, which in our case is /dataset.
Inside the base directory, create two subfolders: train and val. These folders will hold the training and validation images, respectively.
Within each of the train and val folders, create subfolders for each classification class. In our case, there are 102 classes numbered from 1 to 102. Each class folder will contain images belonging to that specific class.

To achieve this organization, the code performs the following steps:
- Unpacking all the images to the /model2/classes directory.
- Looping over a list that holds the class each image belongs to: This list provides the class label for each image.
- Copying all the images to the /dataset directory: Each image is copied to the /dataset directory under the corresponding class subfolder, based on its class label (e.g. /dataset/71/image_07483.jpg).
- Creating a list of all the images in the dataset: This list is created to facilitate the random splitting of the dataset.
- Dividing the image list into three random, stratified subsets: The dataset is split into train (50%), validation (25%), and test (25%) sets. These subsets are used for training, validation, and testing the YOLOv5 model.
- Creating the train and val subdirectories with the corresponding images: The train and val subdirectories are created within the /dataset directory. The images selected for training and validation are placed inside their respective subdirectories under the class subfolders.

In [6]:
def copy_dataset():
    """Copy every labelled image into a per-class folder under /dataset.

    Reads the module-level series ``y`` (index -> class label). For each
    entry the file ``image_<index, zero-padded to 5 digits>.jpg`` is copied
    from /model2/classes/jpg/ to /dataset/<class label>/, creating the class
    folder when needed.
    """
    images_root = '/model2/classes/jpg/'
    for image_number, label in y.items():
        # Assuming the image filenames follow the pattern image_#####.jpg
        filename = f"image_{image_number:05d}.jpg"
        class_dir = os.path.join('/dataset', str(label))
        os.makedirs(class_dir, exist_ok=True)
        shutil.copyfile(
            os.path.join(images_root, filename),
            os.path.join(class_dir, filename),
        )

def find_file_directory(file_name, directory):
    """Return the folder holding the first ``file_name`` found under ``directory``.

    The tree is walked with ``os.walk``; the search stops at the first folder
    whose file list contains ``file_name``. Returns ``None`` when no folder
    contains it.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        if file_name in filenames:
            # dirname(join(...)) rather than current_dir itself, so a trailing
            # separator on the walk root is normalised away.
            return os.path.dirname(os.path.join(current_dir, file_name))
    return None

def move_images_to_directory(lst, source_dir, target_dir):
    """Move the named images from under ``source_dir`` into ``target_dir``.

    Each image keeps its class: the name of the folder that currently holds
    it becomes the sub-folder used below ``target_dir``.

    Args:
        lst: iterable of image file names (no directory part).
        source_dir: root of the tree searched for the images.
        target_dir: root under which ``<class>/<image>`` is created.

    Raises:
        FileNotFoundError: if an image in ``lst`` is not found under
            ``source_dir``.
    """
    # Index the tree once instead of re-walking it for every image. The first
    # location found for a name wins, matching a first-match search.
    locations = {}
    for root, _subdirs, filenames in os.walk(source_dir):
        for filename in filenames:
            locations.setdefault(filename, os.path.join(root, filename))

    for image_filename in lst:
        img_curr_path = locations.get(image_filename)
        if img_curr_path is None:
            raise FileNotFoundError(
                f"{image_filename!r} not found under {source_dir!r}"
            )
        image_class = os.path.basename(os.path.dirname(img_curr_path))
        class_dir = os.path.join(target_dir, image_class)
        target_path = os.path.join(class_dir, image_filename)
        # Create the target directory with class folder if it doesn't exist
        os.makedirs(class_dir, exist_ok=True)
        shutil.move(img_curr_path, target_path)
        # Keep the index accurate in case the same name appears again in lst.
        locations[image_filename] = target_path

def copy_files_from_subdirectories(source_dir, target_dir):
    """Flatten ``source_dir`` into ``target_dir`` by copying every file.

    ``target_dir`` is created when missing. Every file found anywhere below
    ``source_dir`` is copied (with ``shutil.copy2``) directly into
    ``target_dir``; the sub-folder structure is not reproduced.
    """
    os.makedirs(target_dir, exist_ok=True)
    # Lazily produce the full path of each file as the tree is walked.
    all_file_paths = (
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(source_dir)
        for filename in filenames
    )
    for path in all_file_paths:
        shutil.copy2(path, target_dir)

def count_files_in_directory(directory):
    """Return how many files exist under ``directory``, sub-folders included."""
    return sum(len(filenames) for _root, _subdirs, filenames in os.walk(directory))


def sum_train_and_val_loss_per_epoch(i, runs_dir='/content/yolov5/runs/train-cls'):
    """Read one run's results.csv and total its metrics per epoch.

    Args:
        i: zero-based run number. Run 0 is read from the folder
            'flower_classifier', run i > 0 from 'flower_classifier<i+1>'.
        runs_dir: folder that contains the per-run folders. Defaults to the
            location used so far, so existing callers are unaffected.

    Returns:
        Three dicts keyed by integer epoch: summed 'train/loss', summed
        'val/loss' and summed 'metrics/accuracy_top1'. Rows sharing an epoch
        number are added together.
    """
    name = 'flower_classifier' if i == 0 else f'flower_classifier{i+1}'
    results = pd.read_csv(os.path.join(runs_dir, name, 'results.csv'))
    # Strip whitespace around the header names so the plain names below match.
    results.columns = results.columns.str.strip()

    train_loss_avg_epochs, val_loss_avg_epochs, val_acc_avg_epochs = {}, {}, {}
    rows = zip(
        results['epoch'],
        results['train/loss'],
        results['val/loss'],
        results['metrics/accuracy_top1'],
    )
    for epoch, train_loss, val_loss, acc in rows:
        epoch = int(epoch)
        train_loss_avg_epochs[epoch] = train_loss_avg_epochs.get(epoch, 0) + train_loss
        val_loss_avg_epochs[epoch] = val_loss_avg_epochs.get(epoch, 0) + val_loss
        val_acc_avg_epochs[epoch] = val_acc_avg_epochs.get(epoch, 0) + acc

    return train_loss_avg_epochs, val_loss_avg_epochs, val_acc_avg_epochs

def calc_train_val_loss_acc_avg(n, train_loss_avg_epochs, val_loss_avg_epochs, val_acc_avg_epochs):
    """Divide every per-epoch value in the three dicts by ``n``.

    The dicts are updated in place and the same three objects are returned,
    in the order they were passed in.
    """
    for per_epoch in (train_loss_avg_epochs, val_loss_avg_epochs, val_acc_avg_epochs):
        for epoch in per_epoch:
            per_epoch[epoch] /= n

    return train_loss_avg_epochs, val_loss_avg_epochs, val_acc_avg_epochs

def calculate_correct_pred(df_model, df_total):
    """Count predictions whose class equals the labelled class.

    Args:
        df_model: DataFrame with an 'image' column (file name, optionally
            ending in ':') and a 'max_class' column (predicted class).
        df_total: DataFrame with 'filename' and 'class' columns.

    Returns:
        Number of rows of ``df_model`` whose image is present in ``df_total``
        and whose 'max_class' equals that image's 'class'. Images that are
        not present in ``df_total`` are skipped.
    """
    # An empty frame may not even have the expected columns.
    if df_model.empty:
        return 0

    # Build the filename -> class lookup once instead of scanning df_total for
    # every prediction. setdefault keeps the first row for a repeated name.
    labelled_class = {}
    for filename, class_value in zip(df_total['filename'], df_total['class']):
        labelled_class.setdefault(filename, class_value)

    correct = 0
    for image, max_class in zip(df_model['image'], df_model['max_class']):
        image = image.rstrip(':')
        if image in labelled_class and max_class == labelled_class[image]:
            correct += 1
    return correct

def clean_runtime_env(to_del: list[str]):
    """Recursively delete each entry of ``to_del`` that is an existing directory.

    Entries that are not directories (missing paths, plain files) are left
    alone. A message is printed for every directory that is removed.
    """
    for candidate in to_del:
        if not os.path.isdir(candidate):
            continue
        print(f"delete path - {candidate}")
        shutil.rmtree(candidate)

In [7]:
def train_and_eval(df, n, num_epoch, seed, optimizer,lr, dropout, batch_size):
  # Delete old run folder if exists
  if os.path.isdir(os.path.join(os.getcwd(), 'runs')):
    shutil.rmtree(os.path.join(os.getcwd(), 'runs'))

  test_accuracy = []

  for i in range(n):
    copy_dataset()
    train_df, val_test_df = train_test_split(df, test_size=0.5, stratify=df['class'],  shuffle=True, random_state=seed[i])
    val_df, test_df = train_test_split(val_test_df, test_size=0.5, stratify=val_test_df['class'], shuffle=True, random_state=seed[i])

    move_images_to_directory(train_df['filename'], '/dataset', '/dataset/train')
    print(f"Number of training images: {count_files_in_directory('/dataset/train')}")
    move_images_to_directory(val_df['filename'], '/dataset', '/dataset/val')
    print(f"Number of validation images: {count_files_in_directory('/dataset/val')}")
    move_images_to_directory(test_df['filename'], '/dataset', '/test_dataset')
    print(f"Number of testing images: {count_files_in_directory('/test_dataset')}")
    # /test_dataset is the test directory with the image classification and /test directory is a directory with all image tests files (not splitted into classes)
    copy_files_from_subdirectories('/test_dataset', '/test')

    # optimizer options ['SGD', 'Adam', 'AdamW', 'RMSProp']
    !python classify/train.py --model yolov5s-cls.pt --data /dataset --epochs $num_epoch --pretrained yolov5s-cls.pt --name flower_classifier --lr0 $lr --seed "${seed[i]}" --batch-size $batch_size

    train_loss_avg_epochs,val_loss_avg_epochs, val_acc_avg_epochs = sum_train_and_val_loss_per_epoch(i)

    # calculate test score for iteration
    # Run the predict command according to the weights from the train command
    name = 'flower_classifier' if i == 0 else f'flower_classifier{i+1}'
    weights_path = f"runs/train-cls/{name}/weights/best.pt"

    output = !python classify/predict.py --weights $weights_path --source /test

    # Process the output and store it in a list
    data = []
    for line in output:
        if line.startswith('image'):
            parts = line.split()
            image_path = parts[2].replace('/test/', '').rstrip(':')
            max_class = parts[4]
            data.append({'image': image_path, 'max_class': max_class})

    # Create a DataFrame from the list of dictionaries
    df_model = pd.DataFrame(data)

    # calculate the number of correct predictions
    correct_pred = calculate_correct_pred(df_model, df)
    # number of total test samples
    total_test_samples = df_model.shape[0]
    accuracy = correct_pred/total_test_samples
    print(f"Test accuracy cycle {i}: {round(accuracy * 100, 3)}%")
    test_accuracy.append(accuracy)
    clean_runtime_env(['/dataset', '/test', '/test_dataset'])

  print(f"model accuracy = {round(np.mean(test_accuracy) * 100, 3)}%")
  #return the avg per epoch

  return calc_train_val_loss_acc_avg(n, train_loss_avg_epochs, val_loss_avg_epochs, val_acc_avg_epochs)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Run two repetitions of 50 epochs (one seed per repetition) and keep the
# per-epoch training-loss, validation-loss and validation-accuracy dicts.
# NOTE(review): optimizer='ADAM' and dropout=0 are passed here, but the
# train_and_eval body does not forward either value to classify/train.py —
# confirm whether they are meant to take effect.
train_loss_avg, val_loss_avg, val_acc_avg = train_and_eval(df, n=2, num_epoch=50, seed=[123, 42], optimizer='ADAM', lr=0.001, dropout=0, batch_size=64)
print(f'train loss {train_loss_avg}')
print(f'val loss {val_loss_avg}')
print(f'val acc {val_acc_avg}')

In [None]:
# Figure 1: training and validation loss per epoch. Epoch keys are shifted by
# one for display (presumably they are logged 0-based — confirm).
for history, curve_label in ((train_loss_avg, 'Training'), (val_loss_avg, 'Validation')):
    shifted_epochs = np.array(list(history)) + 1
    plt.plot(shifted_epochs, list(history.values()), label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss vs Epochs')
plt.legend()
plt.show()

# Figure 2: validation accuracy per epoch, with the same epoch shift.
accuracy_epochs = np.array(list(val_acc_avg)) + 1
accuracy_values = list(val_acc_avg.values())
plt.plot(accuracy_epochs, accuracy_values, label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Epochs')
plt.legend()
plt.show()