In [None]:
# names:
#     0: healthy
#     1: stressed

In [1]:
import datetime
import shutil
from pathlib import Path
from collections import Counter

import yaml
import numpy as np
import pandas as pd
from ultralytics import YOLO
from sklearn.model_selection import KFold
import glob, os
from PIL import Image

In [2]:
# Define the dataset path
dataset_path = Path(r'C:\Users\amalti\Desktop\sarina\test\train')

# Get all label files
labels = sorted(dataset_path.rglob("labels/*.txt"))

# Define classes (assuming 2 classes for example)
classes = ['stressed', 'healthy']
cls_idx = list(range(len(classes)))

# Create a DataFrame with indices from the filenames (using the full name)
indx = sorted([l.name for l in labels], key=lambda x: int(''.join(filter(str.isdigit, x.split('.')[0]))))
labels_df = pd.DataFrame(0.0, columns=cls_idx, index=indx)

for label in labels:
    lbl_counter = Counter()

    with open(label, 'r') as lf:
        lines = lf.readlines()

    for l in lines:
        try:
            class_index = int(l.split(' ')[0])
            lbl_counter[class_index] += 1
        
        except ValueError as e:
            print(f"Error: Invalid class index in line {line_num} of file {label}: {l.strip()}. Exception: {e}")

    labels_df.loc[label.stem] = lbl_counter

# labels_df = labels_df.fillna(0.0)  # Replace `NaN` values with `0.0`
# print(labels_df.head(10))  # Print only the head of the DataFrame


In [4]:
import os
from collections import Counter
import pandas as pd

# Define the dataset path
dataset_path = r'C:\Users\amalti\Desktop\sarina\test\train'

# Get all label files
labels = sorted([os.path.join(dp, f) for dp, dn, filenames in os.walk(dataset_path) 
                 for f in filenames if os.path.splitext(f)[1] == '.txt' and 'labels' in dp])

# Define classes (assuming 2 classes for example)
classes = ['stressed', 'healthy']
cls_idx = list(range(len(classes)))

# Create a DataFrame with indices from the filenames (using the full name without extension)
indx = sorted([os.path.splitext(os.path.basename(l))[0] for l in labels], 
              key=lambda x: int(''.join(filter(str.isdigit, x))))
labels_df = pd.DataFrame(0.0, columns=cls_idx, index=indx)

# Populate the DataFrame with label counts
for label in labels:
    lbl_counter = Counter()

    with open(label, 'r') as lf:
        lines = lf.readlines()

    for line_num, l in enumerate(lines, start=1):
        try:
            # Classes for YOLO label use integer at the first position of each line
            class_index = int(l.split()[0])
            lbl_counter[class_index] += 1
        except ValueError as e:
            print(f"Error: Invalid class index in line {line_num} of file {label}: {l.strip()}. Exception: {e}")

    labels_df.loc[os.path.splitext(os.path.basename(label))[0]] = lbl_counter

# Replace `NaN` values with `0.0`
# labels_df = labels_df.fillna(0.0)
print(labels_df.head(10))  # Print only the head of the DataFrame


           0    1
image1   4.0  2.0
image2   4.0  2.0
image3   4.0  4.0
image4   4.0  4.0
image5   3.0  5.0
image6   2.0  7.0
image7   5.0  4.0
image8   3.0  5.0
image9   5.0  5.0
image10  3.0  6.0


In [5]:
labels_df

Unnamed: 0,0,1
image1,4.0,2.0
image2,4.0,2.0
image3,4.0,4.0
image4,4.0,4.0
image5,3.0,5.0
...,...,...
image1496,3.0,7.0
image1497,2.0,10.0
image1498,4.0,10.0
image1499,3.0,8.0


In [6]:
ksplit = 3
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)   # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

In [8]:
folds = [f'split_{n}' for n in range(1, ksplit + 1)]
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1E-7)
    fold_lbl_distrb.loc[f'split_{n}'] = ratio

In [9]:
fold_lbl_distrb


Unnamed: 0,0,1
split_1,0.492632,0.487887
split_2,0.513558,0.4978
split_3,0.493991,0.514554


In [17]:
# K-fold Cross Validation Setup
base_path = r'C:\Users\amalti\Desktop\sarina\test'
kfold_base_path = os.path.join(base_path, 'kfold')

# Remove existing folder if it exists
if os.path.isdir(kfold_base_path):
    shutil.rmtree(kfold_base_path)

# Create new folder
os.makedirs(kfold_base_path)


In [29]:
# Store image and label paths for future use
TARGET_IMAGES_PATH = r'C:\Users\amalti\Desktop\sarina\test\train\images'
TARGET_LABELS_PATH = r'C:\Users\amalti\Desktop\sarina\test\train\labels'
image_paths = glob.glob(os.path.join(TARGET_IMAGES_PATH, "*.jpg"))
label_paths = glob.glob(os.path.join(TARGET_LABELS_PATH, "*.txt"))

In [33]:
print(len(image_paths))

1500


In [36]:
yaml_paths = list()
train_txt_paths = list()
val_txt_paths = list()


for i, (train_idx, val_idx) in enumerate(kfolds):
    # Get image paths for train-val split
    train_paths = [image_paths[j] for j in train_idx]
    val_paths = [image_paths[j] for j in val_idx]
    
    # Create text files to store image paths
    # Create text files to store image paths
    train_txt = os.path.join(kfold_base_path, f"train_{i}.txt")
    val_txt = os.path.join(kfold_base_path, f"val_{i}.txt")

    # Write images paths for training and validation in split i
    with open(str(train_txt), 'w') as f:
        f.writelines(s + '\n' for s in train_paths)
    with open(str(val_txt), 'w') as f:
        f.writelines(s + '\n' for s in val_paths)

    train_txt_paths.append(str(train_txt))
    val_txt_paths.append(str(val_txt))

    # Create yaml file
    yaml_path = os.path.join(kfold_base_path, f'data_{i}.yaml')
    with open(yaml_path, 'w') as ds_y:
        yaml.safe_dump({
            'train': os.path.basename(train_txt),
            'val': os.path.basename(val_txt),
            'names': classes
        }, ds_y)
    yaml_paths.append(yaml_path)
print("Yaml Paths")
print(yaml_paths)

Yaml Paths
['C:\\Users\\amalti\\Desktop\\sarina\\test\\kfold\\data_0.yaml', 'C:\\Users\\amalti\\Desktop\\sarina\\test\\kfold\\data_1.yaml', 'C:\\Users\\amalti\\Desktop\\sarina\\test\\kfold\\data_2.yaml']


In [8]:
for i in range(ksplit):
    model = YOLO('yolov8s.pt')
    dataset_yaml = yaml_paths[i]
    print(f"Training for fold={i} using {dataset_yaml}")
    model.train(data=dataset_yaml, batch=batch, project=project, epochs=100, verbose=False, workers=28)
    result = model.metrics # Metrics on validation set
    results.append(result) # save output metrics for further analysis
    clear_output()

In [9]:
metric_values = dict()

for result in results:
    for metric, metric_val in result.results_dict.items():
        if metric not in metric_values:
            metric_values[metric] = []
        metric_values[metric].append(metric_val)

metric_df = pd.DataFrame.from_dict(metric_values)
visualize_metric = ['mean', 'std', 'min', 'max']
metric_df.describe().loc[visualize_metric]

Unnamed: 0,metrics/precision(B),metrics/recall(B),metrics/mAP50(B),metrics/mAP50-95(B),fitness
mean,0.615071,0.623538,0.591094,0.449511,0.463669
std,0.35571,0.35058,0.420025,0.411607,0.412448
min,0.270952,0.273026,0.19918,0.067993,0.081112
max,0.948898,0.953456,0.979138,0.831956,0.846603


# Kfold RGB
## I tried to add patience and reduce number of epochs to resolve overfitting

In [None]:

# from IPython.display import clear_output
import time

project = 'kfold_demo'

# Specify the save directory for training runs
save_dir = r'C:\Users\amalti\Desktop\sarina\kfold_demo'
os.makedirs(save_dir, exist_ok=True)

# yaml paths
yaml_paths = ['C:\\Users\\amalti\\Desktop\\sarina\\test\\kfold\\data_0.yaml', 'C:\\Users\\amalti\\Desktop\\sarina\\test\\kfold\\data_1.yaml', 'C:\\Users\\amalti\\Desktop\\sarina\\test\\kfold\\data_2.yaml']
os.environ['WANDB_DISABLED'] = 'true'

ksplit = 3

# hyperparameters: 
batch = 16
epochs = 75
patience = 5
# freeze= 
weight_decay = 0.0005
# lr0=

results = list()
for i in range(ksplit):
    model = YOLO('yolov8s.pt')
    name = f'split{i}'
    dataset_yaml = yaml_paths[i]
    print(f"Training for fold={i} using {dataset_yaml}")
    model.train(data=dataset_yaml, batch=batch, project=project, epochs=epochs, verbose=True, workers=28, save_dir=save_dir, name=name)
    result = model.metrics # Metrics on validation set
    results.append(result) # save output metrics for further analysis
    # clear_output()

Training for fold=0 using C:\Users\amalti\Desktop\sarina\test\kfold\data_0.yaml
New https://pypi.org/project/ultralytics/8.2.27 available  Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.22  Python-3.12.3 torch-2.3.0 CUDA:0 (NVIDIA A40-12Q, 12288MiB)
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=yolov8s.pt, data=C:\Users\amalti\Desktop\sarina\test\kfold\data_0.yaml, epochs=75, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=28, project=kfold_demo, name=split0, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer

[34m[1mtrain: [0mScanning C:\Users\amalti\Desktop\sarina\test\train\lab[0m


[34m[1mtrain: [0mNew cache created: C:\Users\amalti\Desktop\sarina\test\train\labels.cache


[34m[1mval: [0mScanning C:\Users\amalti\Desktop\sarina\test\train\label[0m

[34m[1mval: [0mNew cache created: C:\Users\amalti\Desktop\sarina\test\train\labels.cache





In [None]:
metric_values = dict()

for result in results:
    for metric, metric_val in result.results_dict.items():
        if metric not in metric_values:
            metric_values[metric] = []
        metric_values[metric].append(metric_val)

metric_df = pd.DataFrame.from_dict(metric_values)
visualize_metric = ['mean', 'std', 'min', 'max']
metric_df.describe().loc[visualize_metric]

## validation on test dataset

In [None]:
model = YOLO(r'C:\Users\amalti\Desktop\sarina\kfold_demo\split0\weights\best.pt')  # load a custom model
model.overrides['conf'] = 0.7  # NMS confidence threshold
# model.overrides['iou'] = 0.45  # NMS IoU threshold
# Validate the model
metrics = model.val()  # no arguments needed, dataset and settings remembered
metrics.box.map  # map50-95
metrics.box.map50  # map50
metrics.box.map75  # map75
metrics.box.maps  # a list contains map50-95 of each category

# Kfold RGB
## I tried to add patience and reduce number of epochs to resolve overfitting and freezing layers