In [None]:
# Install/upgrade the packages this notebook uses into the kernel's environment.
# NOTE(review): `-U` with no version pins means a re-run may pick up different releases.
%pip install -U ultralytics ipywidgets jax jaxlib wandb
# Enable the ipywidgets notebook extension.
!jupyter nbextension enable --py widgetsnbextension


# Prepare Tomato data

<a href="https://www.kaggle.com/code/youssefelkilany/omdena-diseases-detection-yolo-model" target="_blank"><img alt="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"/></a> <a href="https://colab.research.google.com/github/OmdenaAI/KenyaChapter_EarlyDetectionofCropDiseases/blob/main/notebooks/YOLO-model.ipynb" target="_blank"><img alt="Open in Colab" src="https://colab.research.google.com/assets/colab-badge.svg"/></a>

This notebook has a companion repository for the Omdena challenge "Disease Early Detection in Kenya"; have a look at the [repo here](https://github.com/OmdenaAI/KenyaChapter_EarlyDetectionofCropDiseases).


In [None]:
import os
import shutil
import pandas as pd
from PIL import Image
from tqdm import tqdm
import requests

import torch
from torch.utils.data import random_split

from ultralytics import YOLO


def download_file(url, path, timeout=60):
    """Download ``url`` and write the response body to ``path``.

    Args:
        url: Address of the file to fetch.
        path: Destination file path; it is opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the destination so no error page is saved as data.
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)


The following data-preparation cells depend on the datasets I prepared and uploaded to Kaggle. You may have a look at the process [here for maize](https://www.kaggle.com/code/youssefelkilany/omdena-diseases-detection-maize-dataset), [here for tomato](https://www.kaggle.com/code/youssefelkilany/omdena-diseases-detection-tomato-dataset), [here for beans](https://www.kaggle.com/code/youssefelkilany/omdena-diseases-detection-beans-dataset), or [here for cassava](https://www.kaggle.com/code/youssefelkilany/omdena-diseases-detection-cassava-dataset). There were some issues with Kaggle while saving the data, so some parts of this section follow version 1 of the linked datasets, while the commented-out parts are for version 2. The only difference between the two is whether the data is arranged into the YOLO directory structure here in this notebook (version 1) or has already been arranged in the dataset notebooks (version 2).


In [None]:
# Fetch the tomato metadata CSV from the companion repo into the working directory.
dataset_csv_url = "https://raw.githubusercontent.com/OmdenaAI/KenyaChapter_EarlyDetectionofCropDiseases/refs/heads/main/Tomato_metadata.csv"
tomato_csv_path = "/kaggle/working/tomato.csv"
download_file(dataset_csv_url, tomato_csv_path)

# Alternative (kept for reference): use the dataset that already has the split folders.
# tomato_splits_path = '/kaggle/input/omdena-diseases-detection-tomato-dataset'
# %ls -1 {tomato_splits_path}
# Folder holding the source images, and the folder where the split tree is written.
tomato_data_path = "/kaggle/input/omdena-diseases-detection-datasets/tomato-images"
tomato_splits_path = "/kaggle/working/tomato"
# Count the entries in the source image folder.
%ls -1 {tomato_data_path} | wc -l


In [None]:
# Fetch the beans metadata CSV from the companion repo into the working directory.
dataset_csv_url = "https://raw.githubusercontent.com/OmdenaAI/KenyaChapter_EarlyDetectionofCropDiseases/refs/heads/main/Beans_metadata.csv"
beans_csv_path = "/kaggle/working/beans.csv"
download_file(dataset_csv_url, beans_csv_path)

# Alternative (kept for reference): use the dataset that already has the split folders.
# beans_splits_path = f'/kaggle/input/omdena-diseases-detection-beans-dataset'
# %ls -1 {beans_splits_path}
# Folder holding the source images, and the folder where the split tree is written.
beans_data_path = "/kaggle/input/omdena-diseases-detection-datasets/beans-images"
beans_splits_path = "/kaggle/working/beans"
# Count the entries in the source image folder.
%ls -1 {beans_data_path} | wc -l


In [None]:
# Fetch the maize metadata CSV from the companion repo into the working directory.
dataset_csv_url = "https://raw.githubusercontent.com/OmdenaAI/KenyaChapter_EarlyDetectionofCropDiseases/refs/heads/main/maize_filtered.csv"
maize_csv_path = "/kaggle/working/maize.csv"
download_file(dataset_csv_url, maize_csv_path)

# Alternative (kept for reference): use the dataset that already has the split folders.
# maize_splits_path = f'/kaggle/input/omdena-diseases-detection-maize-dataset'
# %ls -1 {maize_splits_path}
# Folder holding the source images, and the folder where the split tree is written.
maize_data_path = "/kaggle/input/omdena-diseases-detection-datasets/maize-images"
maize_splits_path = "/kaggle/working/maize"
# Count the entries in the source image folder.
%ls -1 {maize_data_path} | wc -l


## Prepare directory structure for the data


In [None]:
def get_valid_classes_indices(df, min_threshold=100):
    """Map each sufficiently populated disease class to its row index labels.

    Args:
        df: DataFrame with a ``disease`` column.
        min_threshold: A class is kept only when it has strictly more rows
            than this value.

    Returns:
        dict: ``{disease: [index labels of its rows]}``, in order of first
        appearance of each disease in ``df``.
    """
    # Gather the row labels of every class first, then drop the small classes.
    labels_by_class = {
        name: df.index[(df.disease == name).to_numpy()].tolist()
        for name in df.disease.unique()
    }
    return {
        name: labels
        for name, labels in labels_by_class.items()
        if len(labels) > min_threshold
    }


def split_dataset(dataset_classes_idx, val_percent=0.15, test_percent=0.1):
    """Split every class's indices into train / val / test subsets.

    The test share is carved out of the full class first; the validation share
    is then carved out of what remains, so ``val_percent`` is a fraction of the
    non-test data rather than of the whole class.

    Args:
        dataset_classes_idx: ``{class_name: [indices]}``.
        val_percent: Fraction of the non-test indices used for validation.
        test_percent: Fraction of each class held out for testing.

    Returns:
        dict: ``{class_name.lower(): {"train": ..., "val": ..., "test": ...}}``
        whose values are the subsets returned by ``random_split``.
    """

    def _seeded_split(items, held_out_fraction):
        # A fresh generator seeded with 42 on every call keeps each split
        # reproducible regardless of how many classes were processed before.
        return random_split(
            items,
            [1 - held_out_fraction, held_out_fraction],
            generator=torch.Generator().manual_seed(42),
        )

    per_class_splits = {}
    for class_name, class_indices in dataset_classes_idx.items():
        non_test_part, test_part = _seeded_split(class_indices, test_percent)
        train_part, val_part = _seeded_split(non_test_part, val_percent)
        per_class_splits[class_name.lower()] = {
            "train": train_part,
            "val": val_part,
            "test": test_part,
        }
    return per_class_splits


In [None]:
def prep_directory_structure(
    dataset_name,
    dataset_splits,
    dataset_df,
    dataset_input_path,
    dataset_output_path,
    splits_to_copy=("train", "val", "test"),
):
    """Copy images into a ``<output>/<split>/<class>/`` folder tree.

    Previously only the ``test`` split was copied (a hard-coded skip of every
    other split), so the ``train``/``val`` folders that the rest of the
    notebook reads were never created. All splits are now copied by default;
    pass ``splits_to_copy=("test",)`` to reproduce the old behaviour.

    Args:
        dataset_name: Name used in the progress-bar description.
        dataset_splits: ``{class_name: {split_name: indices}}``.
        dataset_df: DataFrame with a ``path`` column; only the basename of each
            path is used to locate the image.
        dataset_input_path: Folder that contains the source images.
        dataset_output_path: Root folder of the split tree to create.
        splits_to_copy: Names of the splits to write to disk.
    """
    for k, splits in tqdm(
        dataset_splits.items(), position=0, leave=False, desc=f"{dataset_name} classes"
    ):
        for s, indices in tqdm(splits.items(), position=0, desc=f"{k} splits"):
            if s not in splits_to_copy:
                continue
            class_split_path = f"{dataset_output_path}/{s}/{k.lower()}"
            os.makedirs(class_split_path, exist_ok=True)
            for idx in tqdm(indices, position=0, desc=f"{k} - {s} images"):
                # The CSV path may carry folders; the images live flat in
                # dataset_input_path, so only the file name is kept.
                img_fname = os.path.basename(dataset_df.iloc[idx].path)
                img_from_path = f"{dataset_input_path}/{img_fname}"
                img_to_path = f"{class_split_path}/{img_fname}"
                shutil.copy(img_from_path, img_to_path)


def get_dataset_df(dataset_csv_path):
    """Load a metadata CSV and normalise its ``disease`` labels.

    When the CSV has a ``class`` column, it is copied into ``disease``. Spaces
    in every disease label are then replaced with underscores.

    Args:
        dataset_csv_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the normalised ``disease`` column.
    """

    def _underscored(label):
        return label.replace(" ", "_")

    frame = pd.read_csv(dataset_csv_path)
    column_names = list(frame.columns)
    if "class" in column_names:
        frame["disease"] = frame["class"]
    frame.disease = frame.disease.apply(_underscored)
    return frame


def get_df_dataset_splits_prepare(
    dataset_name,
    dataset_csv_path,
    dataset_data_path,
    dataset_splits_path,
    prepare=False,
    *,
    preapare=None,
):
    """Load a dataset's metadata, split it per class, and optionally copy files.

    Args:
        dataset_name: Name used in progress-bar descriptions.
        dataset_csv_path: Path of the metadata CSV.
        dataset_data_path: Folder that contains the source images.
        dataset_splits_path: Root folder where the split tree is written.
        prepare: When true, also materialise the split folders on disk.
        preapare: Deprecated misspelling of ``prepare``. It is still accepted
            as a keyword so existing calls keep working.

    Returns:
        tuple: ``(dataset_df, dataset_splits)`` - exactly two values.
    """
    # The flag used to be spelled `preapare`, which made `prepare=True` calls
    # fail with a TypeError; honour either spelling.
    if preapare is not None:
        prepare = prepare or bool(preapare)

    dataset_df = get_dataset_df(dataset_csv_path)
    # Keep only classes with more than 100 images, then hold out 10% for test
    # and 15% of the remainder for validation.
    dataset_data = get_valid_classes_indices(dataset_df, 100)
    dataset_splits = split_dataset(dataset_data, 0.15, 0.1)

    if prepare:
        prep_directory_structure(
            dataset_name,
            dataset_splits,
            dataset_df,
            dataset_data_path,
            dataset_splits_path,
        )
    return dataset_df, dataset_splits


def get_yolo_dataset_splits(dataset_splits_path):
    """List the files of an on-disk ``<split>/<class>/`` folder tree.

    Args:
        dataset_splits_path: Root folder containing ``train``, ``val`` and
            ``test`` sub-folders, each holding one folder per class.

    Returns:
        dict: ``{"train" | "val" | "test": {class_name: [file names]}}``.
    """
    listing = {}
    for split_name in ("train", "val", "test"):
        split_dir = dataset_splits_path + "/" + split_name
        listing[split_name] = {
            class_dir: os.listdir(split_dir + "/" + class_dir)
            for class_dir in os.listdir(split_dir)
        }
    return listing


def get_df_yolo_dataset_splits(dataset_csv_path, dataset_splits_path):
    """Load the metadata CSV and the file listing of an existing split tree.

    Args:
        dataset_csv_path: Path of the metadata CSV.
        dataset_splits_path: Root folder of the ``<split>/<class>/`` tree.

    Returns:
        tuple: ``(dataset_df, dataset_splits)``.
    """
    metadata = get_dataset_df(dataset_csv_path)
    split_listing = get_yolo_dataset_splits(dataset_splits_path)
    return metadata, split_listing


In [None]:
# get_df_dataset_splits_prepare returns exactly two values (metadata dataframe,
# per-class splits), so unpack two names; unpacking three raised a ValueError.
# The final positional `True` is the "materialise the split folders" flag; it is
# passed by position so the call does not depend on the keyword's spelling.
tomato_df, tomato_splits = get_df_dataset_splits_prepare(
    "tomato", tomato_csv_path, tomato_data_path, tomato_splits_path, True
)
beans_df, beans_splits = get_df_dataset_splits_prepare(
    "beans", beans_csv_path, beans_data_path, beans_splits_path, True
)
maize_df, maize_splits = get_df_dataset_splits_prepare(
    "maize", maize_csv_path, maize_data_path, maize_splits_path, True
)


# Yolo model Training


In [None]:
def train_yolo(yolo_model_ckpt="yolo11n-cls.pt", dataset_splits_path=None, **train_overrides):
    """Train a YOLO classification model on a ``<split>/<class>/`` folder tree.

    Args:
        yolo_model_ckpt: Checkpoint name/path handed to ``YOLO(...)``.
        dataset_splits_path: Root folder of the dataset (required).
        **train_overrides: Extra keyword arguments forwarded to ``model.train``;
            they take precedence over the defaults below (e.g. ``epochs=30`` or
            ``device="cpu"``).

    Returns:
        Whatever ``model.train`` returns.

    Raises:
        ValueError: If ``dataset_splits_path`` is not given.
    """
    # An explicit check instead of `assert`, which is skipped under `python -O`.
    if dataset_splits_path is None:
        raise ValueError("dataset_splits_path is required to train a YOLO model")

    # for more about these settings, check yolo augmentations reference from here:
    # https://docs.ultralytics.com/modes/train/#augmentation-settings-and-hyperparameters

    augmentation_configs = {
        "fliplr": 0.5,
        "flipud": 0.5,
        "degrees": 90,
        "perspective": 1e-4,
    }

    train_configs = {
        "data": dataset_splits_path,
        "batch": 32,
        "epochs": 10,
        "imgsz": 320,
        "save": True,
        "seed": 123,
        "plots": True,
        "device": "0",
        "optimizer": "AdamW",
        "auto_augment": "augmix",
        **augmentation_configs,
        # Caller-supplied settings win over every default above.
        **train_overrides,
    }

    model = YOLO(yolo_model_ckpt)
    return model.train(**train_configs)


In [None]:
# train_yolo(dataset_splits_path=tomato_splits_path)
# train_yolo(dataset_splits_path=beans_splits_path)
# train_yolo(dataset_splits_path=maize_splits_path)


Archive the models' checkpoints for downloading.


In [None]:
# Archive each crop's training-run folder so it can be downloaded.
# NOTE(review): all three variables point at the same run folder, so the three
# archives will contain the same files unless that folder changes between the
# commands - confirm the intended per-crop run folders.
# NOTE(review): `tar -cvzf` writes a gzip-compressed tar archive even though the
# output files are named *.zip.
tomato_nano_model_path = "/kaggle/working/runs/classify/train"
!tar -cvzf tomato_nano_model.zip {tomato_nano_model_path}

beans_nano_model_path = "/kaggle/working/runs/classify/train"
!tar -cvzf beans_nano_model.zip {beans_nano_model_path}

maize_nano_model_path = "/kaggle/working/runs/classify/train"
!tar -cvzf maize_nano_model.zip {maize_nano_model_path}


# Yolo model testing


In [None]:
def get_test_dataset_labels_imgs(dataset_df, dataset_splits, dataset_splits_path):
    """Build parallel lists of ground-truth labels and test image paths.

    Args:
        dataset_df: DataFrame with a ``path`` column; rows are looked up by
            position with ``iloc``.
        dataset_splits: ``{class_name: {"test": row positions, ...}}``.
        dataset_splits_path: Root folder of the split tree; images are expected
            under ``<root>/test/<class_name.lower()>/<file name>``.

    Returns:
        tuple: ``(gt_labels, imgs)`` - one label and one path per test image,
        grouped by class in the iteration order of ``dataset_splits``.
    """
    gt_labels = []
    imgs = []
    for class_name, class_splits in dataset_splits.items():
        test_rows = class_splits["test"]
        # One label per test image of this class.
        gt_labels.extend([class_name] * len(test_rows))
        class_dir = f"{dataset_splits_path}/test/{class_name.lower()}"
        for row_pos in test_rows:
            file_name = os.path.basename(dataset_df.iloc[row_pos].path)
            imgs.append(f"{class_dir}/{file_name}")

    return gt_labels, imgs


def test_yolo_ckpt(
    model_ckpt,
    dataset_df,
    dataset_splits_path,
    dataset_splits,
    test_cnt=-1,
    verbose=False,
):
    """Return the top-1 accuracy of a YOLO checkpoint on the test split.

    Args:
        model_ckpt: Checkpoint path handed to ``YOLO(...)``.
        dataset_df: Metadata DataFrame (see ``get_test_dataset_labels_imgs``).
        dataset_splits_path: Root folder of the split tree.
        dataset_splits: ``{class_name: {"test": row positions, ...}}``.
        test_cnt: Maximum number of test images to evaluate; a negative value
            or ``None`` means "all of them".
        verbose: Forwarded to the model for logging.

    Returns:
        float: Fraction of evaluated images whose predicted class name equals
        the lower-cased ground-truth label.

    Raises:
        ValueError: If there is no test image to evaluate.
    """
    inference_configs = {"stream": True, "verbose": verbose, "device": 0}

    gt_labels, imgs = get_test_dataset_labels_imgs(
        dataset_df, dataset_splits, dataset_splits_path
    )
    # `imgs[:-1]` would silently drop the last image, so map the "all images"
    # sentinel to an open-ended slice instead.
    limit = None if test_cnt is None or test_cnt < 0 else test_cnt
    imgs = imgs[:limit]
    if not imgs:
        raise ValueError("no test images to evaluate")

    model = YOLO(model_ckpt, verbose=verbose)
    preds = model(imgs, **inference_configs)

    preds_labels = [model.names[pred.probs.top1] for pred in preds]
    # zip stops at the shorter list, so labels beyond `limit` are ignored.
    correct = sum(pred == gt.lower() for pred, gt in zip(preds_labels, gt_labels))
    return correct / len(preds_labels)


In [None]:
# Evaluate every crop's best checkpoint on its test split and print each
# accuracy. A bare expression is only displayed when it is the last one in a
# cell, so the tomato and beans results were previously computed and discarded.
for crop_name, run_dir, crop_df, crop_splits_path, crop_splits in (
    ("tomato", tomato_nano_model_path, tomato_df, tomato_splits_path, tomato_splits),
    ("beans", beans_nano_model_path, beans_df, beans_splits_path, beans_splits),
    ("maize", maize_nano_model_path, maize_df, maize_splits_path, maize_splits),
):
    yolo_ckpt = f"{run_dir}/weights/best.pt"
    crop_acc = test_yolo_ckpt(
        yolo_ckpt, crop_df, crop_splits_path, crop_splits, test_cnt=-1
    )
    print(f"{crop_name}: test accuracy = {crop_acc:.4f}")

# `yolo_ckpt` is left pointing at the maize checkpoint (the last entry), which
# is the checkpoint the TFLite export cell below loads.


# TFLite model exporting

Exporting the TFLite model in half-precision (float16) format decreased the `nano` model's TFLite size from \~6 MB (float32) to \~3.5 MB, which is good since we have 4 different models.


In [None]:
# Load the checkpoint that `yolo_ckpt` currently points at and export it to
# TFLite twice: once with default settings and once with half=True.
yolo_model = YOLO(yolo_ckpt)
tflit_model_f32 = yolo_model.export(format="tflite")
tflit_model_f16 = yolo_model.export(format="tflite", half=True)


In [None]:
# Folder holding the exported TFLite files (read by the test cell below); archive it.
# NOTE(review): `tar -cvzf` writes a gzip-compressed tar archive despite the *.zip name.
tflite_saved_model = "/kaggle/working/runs/classify/train/weights/best_saved_model"
!tar -cvzf maize_tflite_nano_model.zip {tflite_saved_model}


# TFLite model testing


In [None]:
def test_tflite_ckpt(
    model_ckpt,
    dataset_df,
    dataset_splits_path,
    dataset_splits,
    test_cnt=-1,
    verbose=False,
):
    """Return the top-1 accuracy of an exported TFLite model on the test split.

    Each image is resized to 320x320 before it is passed to the model.

    Args:
        model_ckpt: Path of the ``.tflite`` file handed to ``YOLO(...)``.
        dataset_df: Metadata DataFrame (see ``get_test_dataset_labels_imgs``).
        dataset_splits_path: Root folder of the split tree.
        dataset_splits: ``{class_name: {"test": row positions, ...}}``.
        test_cnt: Maximum number of test images to evaluate; a negative value
            or ``None`` means "all of them".
        verbose: Forwarded to the model for logging.

    Returns:
        float: Fraction of evaluated images whose predicted class name equals
        the lower-cased ground-truth label.

    Raises:
        ValueError: If there is no test image to evaluate.
    """

    def im_resize(img_path, dim):
        # The context manager closes the source file once the resized copy exists.
        with Image.open(img_path) as src:
            return src.resize((dim, dim))

    gt_labels, test_imgs = get_test_dataset_labels_imgs(
        dataset_df, dataset_splits, dataset_splits_path
    )
    # `test_imgs[:-1]` would silently drop the last image, so map the
    # "all images" sentinel to an open-ended slice instead.
    limit = None if test_cnt is None or test_cnt < 0 else test_cnt
    test_imgs = test_imgs[:limit]
    if not test_imgs:
        raise ValueError("no test images to evaluate")

    model = YOLO(model_ckpt, task="classify", verbose=verbose)
    preds_labels = []
    for fname in test_imgs:
        img = im_resize(fname, 320)
        pred = model(img, imgsz=320, verbose=verbose)
        preds_labels.append(model.names[pred[0].probs.top1])

    # zip stops at the shorter list, so labels beyond `limit` are ignored.
    correct = sum(pred == gt.lower() for pred, gt in zip(preds_labels, gt_labels))
    return correct / len(preds_labels)


# Paths of the two exported TFLite variants inside the saved-model folder.
tflite_ckpt_f32 = f"{tflite_saved_model}/best_float32.tflite"
tflite_ckpt_f16 = f"{tflite_saved_model}/best_float16.tflite"

# Print the maize test accuracy of the float32 model, then of the float16 model.
for tflite_ckpt in (tflite_ckpt_f32, tflite_ckpt_f16):
    tflite_acc = test_tflite_ckpt(
        tflite_ckpt, maize_df, maize_splits_path, maize_splits, verbose=False
    )
    print(tflite_acc)


# Train a "Router" model

#### The idea of this model is to train it on all the data we have, regardless of disease sub-class: the only important thing is to capture the pattern of each crop on its own, so that we can decide automatically which model to use to identify the disease for a specific crop.


In the next cell we create the directory structure required by the YOLO classification task guide. I moved all the data out of the YOLO-like directory structure that was used when each model was trained alone, into a directory named `all`, so that all images from all sub-classes end up in one class per crop: `tomato`, `beans`, or `maize`.


In [None]:
!cd /kaggle/working/all && mkdir -p train val test

!cd /kaggle/working/all/train && mkdir -p tomato beans maize
!mv /kaggle/working/tomato/train/* /kaggle/working/all/train/tomato
!mv /kaggle/working/beans/train/* /kaggle/working/all/train/beans
!mv /kaggle/working/maize/train/* /kaggle/working/all/train/maize

!cd /kaggle/working/all/val && mkdir -p tomato beans maize
!mv /kaggle/working/tomato/val/* /kaggle/working/all/val/tomato
!mv /kaggle/working/beans/val/* /kaggle/working/all/val/beans
!mv /kaggle/working/maize/val/* /kaggle/working/all/val/maize

!cd /kaggle/working/all/test && mkdir -p tomato beans maize
!mv /kaggle/working/tomato/test/* /kaggle/working/all/test/tomato
!mv /kaggle/working/beans/test/* /kaggle/working/all/test/beans
!mv /kaggle/working/maize/test/* /kaggle/working/all/test/maize


This cell moves the images from all sub-classes up to their parent; e.g. all images inside the beans sub-classes `bean_rust`, `angular_leaf_spot`, and `healthy` are moved to their parent `beans`, since they will all represent a single class, `beans`. This is applied to `train`, `val`, and `test`.


In [None]:
# TODO: update these commands to a better way of achieving this step

!find /kaggle/working/all/train/maize -maxdepth 2 -exec mv {} /kaggle/working/all/train/maize \;
!find /kaggle/working/all/train/tomato -maxdepth 2 -exec mv {} /kaggle/working/all/train/tomato \;
!find /kaggle/working/all/train/beans -maxdepth 2 -exec mv {} /kaggle/working/all/train/beans \;
!ls -1 /kaggle/working/all/train/maize | wc -l
!ls -1 /kaggle/working/all/train/tomato | wc -l
!ls -1 /kaggle/working/all/train/beans | wc -l

!rm -r /kaggle/working/all/train/maize/maize_lethal_necrosis
!rm -r /kaggle/working/all/train/maize/gray_leaf_spot
!rm -r /kaggle/working/all/train/maize/healthy
!rm -r /kaggle/working/all/train/maize/common_rust
!rm -r /kaggle/working/all/train/maize/northern_leaf_spot
!rm -r /kaggle/working/all/train/maize/maize_streak_virus
!rm -r /kaggle/working/all/train/tomato/leaf_mold
!rm -r /kaggle/working/all/train/tomato/late_blight
!rm -r /kaggle/working/all/train/tomato/healthy
!rm -r /kaggle/working/all/train/tomato/mosaic_virus
!rm -r /kaggle/working/all/train/tomato/early_blight
!rm -r /kaggle/working/all/train/tomato/yellow_leaf_virus
!rm -r /kaggle/working/all/train/tomato/target_spot
!rm -r /kaggle/working/all/train/tomato/septoria_leaf_spot
!rm -r /kaggle/working/all/train/beans/bean_rust
!rm -r /kaggle/working/all/train/beans/healthy
!rm -r /kaggle/working/all/train/beans/angular_leaf_spot


In [None]:
!find /kaggle/working/all/val/maize -maxdepth 2 -exec mv {} /kaggle/working/all/val/maize \;
!find /kaggle/working/all/val/tomato -maxdepth 2 -exec mv {} /kaggle/working/all/val/tomato \;
!find /kaggle/working/all/val/beans -maxdepth 2 -exec mv {} /kaggle/working/all/val/beans \;
!ls -1 /kaggle/working/all/val/maize | wc -l
!ls -1 /kaggle/working/all/val/tomato | wc -l
!ls -1 /kaggle/working/all/val/beans | wc -l

!rm -r /kaggle/working/all/val/maize/maize_lethal_necrosis
!rm -r /kaggle/working/all/val/maize/gray_leaf_spot
!rm -r /kaggle/working/all/val/maize/healthy
!rm -r /kaggle/working/all/val/maize/common_rust
!rm -r /kaggle/working/all/val/maize/northern_leaf_spot
!rm -r /kaggle/working/all/val/maize/maize_streak_virus
!rm -r /kaggle/working/all/val/tomato/leaf_mold
!rm -r /kaggle/working/all/val/tomato/late_blight
!rm -r /kaggle/working/all/val/tomato/healthy
!rm -r /kaggle/working/all/val/tomato/mosaic_virus
!rm -r /kaggle/working/all/val/tomato/early_blight
!rm -r /kaggle/working/all/val/tomato/yellow_leaf_virus
!rm -r /kaggle/working/all/val/tomato/target_spot
!rm -r /kaggle/working/all/val/tomato/septoria_leaf_spot
!rm -r /kaggle/working/all/val/beans/bean_rust
!rm -r /kaggle/working/all/val/beans/healthy
!rm -r /kaggle/working/all/val/beans/angular_leaf_spot


In [None]:
!find /kaggle/working/all/test/maize -maxdepth 2 -exec mv {} /kaggle/working/all/test/maize \;
!find /kaggle/working/all/test/tomato -maxdepth 2 -exec mv {} /kaggle/working/all/test/tomato \;
!find /kaggle/working/all/test/beans -maxdepth 2 -exec mv {} /kaggle/working/all/test/beans \;
!ls -1 /kaggle/working/all/test/maize | wc -l
!ls -1 /kaggle/working/all/test/tomato | wc -l
!ls -1 /kaggle/working/all/test/beans | wc -l

!rm -r /kaggle/working/all/test/maize/maize_lethal_necrosis
!rm -r /kaggle/working/all/test/maize/gray_leaf_spot
!rm -r /kaggle/working/all/test/maize/healthy
!rm -r /kaggle/working/all/test/maize/common_rust
!rm -r /kaggle/working/all/test/maize/northern_leaf_spot
!rm -r /kaggle/working/all/test/maize/maize_streak_virus
!rm -r /kaggle/working/all/test/tomato/leaf_mold
!rm -r /kaggle/working/all/test/tomato/late_blight
!rm -r /kaggle/working/all/test/tomato/healthy
!rm -r /kaggle/working/all/test/tomato/mosaic_virus
!rm -r /kaggle/working/all/test/tomato/early_blight
!rm -r /kaggle/working/all/test/tomato/yellow_leaf_virus
!rm -r /kaggle/working/all/test/tomato/target_spot
!rm -r /kaggle/working/all/test/tomato/septoria_leaf_spot
!rm -r /kaggle/working/all/test/beans/bean_rust
!rm -r /kaggle/working/all/test/beans/healthy
!rm -r /kaggle/working/all/test/beans/angular_leaf_spot


In [None]:
# Train the router model on the merged per-crop tree and print what train_yolo returns.
all_router_splits_path = "/kaggle/working/all"
print(train_yolo(dataset_splits_path=all_router_splits_path))


In [None]:
# Archive the router's training-run folder for download.
# NOTE(review): this is the same path used for the per-crop runs above - confirm
# it is the folder the router training actually wrote to.
# NOTE(review): `tar -cvzf` writes a gzip-compressed tar archive despite the *.zip name.
all_router_nano_model_path = "/kaggle/working/runs/classify/train"
!tar -cvzf all_router_nano_model.zip {all_router_nano_model_path}


In [None]:
# Point `yolo_ckpt` at the router's best weights; the export cell below loads it.
yolo_ckpt = f"{all_router_nano_model_path}/weights/best.pt"
# Disabled: `all_df`, `all_splits_path` and `all_splits` are not defined in this notebook.
# test_yolo_ckpt(yolo_ckpt, all_df, all_splits_path, all_splits, test_cnt=-1)


In [None]:
# Load the router checkpoint and export it to TFLite twice: once with default
# settings and once with half=True.
yolo_model = YOLO(yolo_ckpt)
tflit_model_f32 = yolo_model.export(format="tflite")
tflit_model_f16 = yolo_model.export(format="tflite", half=True)


In [None]:
# Folder holding the exported TFLite files (read by the test cell below); archive it.
# NOTE(review): `tar -cvzf` writes a gzip-compressed tar archive despite the *.zip name.
tflite_saved_model = "/kaggle/working/runs/classify/train/weights/best_saved_model"
!tar -cvzf all_tflite_nano_model.zip {tflite_saved_model}


In [None]:
def test_tflite_ckpt(
    model_ckpt, dataset_splits_path, dataset_splits, test_cnt=-1, verbose=False
):
    """Return the top-1 accuracy of a TFLite model on an on-disk test split.

    This variant works from a directory listing instead of a metadata
    DataFrame. NOTE: it reuses the name of the DataFrame-based
    ``test_tflite_ckpt`` defined earlier in the notebook and replaces it once
    this cell has run.

    Each image is resized to 320x320 before it is passed to the model.

    Args:
        model_ckpt: Path of the ``.tflite`` file handed to ``YOLO(...)``.
        dataset_splits_path: Root folder of the split tree; test images are
            read from ``<root>/test/<class>/<file name>``.
        dataset_splits: ``{"test": {class_name: [file names]}, ...}`` as
            returned by ``get_yolo_dataset_splits``.
        test_cnt: Maximum number of test images to evaluate; a negative value
            or ``None`` means "all of them".
        verbose: Forwarded to the model for logging.

    Returns:
        float: Fraction of evaluated images whose predicted class name equals
        the lower-cased ground-truth label.

    Raises:
        ValueError: If there is no test image to evaluate.
    """

    def im_resize(img_path, dim):
        # The context manager closes the source file once the resized copy exists.
        with Image.open(img_path) as src:
            return src.resize((dim, dim))

    # Build the labels and paths straight from the listing. The DataFrame-based
    # helper get_test_dataset_labels_imgs needs three arguments and a different
    # splits layout, so calling it here raised a TypeError.
    gt_labels, test_imgs = [], []
    for class_name, file_names in dataset_splits["test"].items():
        for file_name in file_names:
            gt_labels.append(class_name)
            test_imgs.append(f"{dataset_splits_path}/test/{class_name}/{file_name}")

    # `[:-1]` would silently drop the last image, so map the "all images"
    # sentinel to an open-ended slice instead.
    limit = None if test_cnt is None or test_cnt < 0 else test_cnt
    test_imgs = test_imgs[:limit]
    if not test_imgs:
        raise ValueError("no test images to evaluate")

    model = YOLO(model_ckpt, task="classify", verbose=verbose)
    preds_labels = []
    for fname in test_imgs:
        img = im_resize(fname, 320)
        pred = model(img, imgsz=320, verbose=verbose)
        preds_labels.append(model.names[pred[0].probs.top1])

    # zip stops at the shorter list, so labels beyond `limit` are ignored.
    correct = sum(pred == gt.lower() for pred, gt in zip(preds_labels, gt_labels))
    return correct / len(preds_labels)


# Paths of the two exported TFLite variants inside the saved-model folder.
tflite_ckpt_f32 = f"{tflite_saved_model}/best_float32.tflite"
tflite_ckpt_f16 = f"{tflite_saved_model}/best_float16.tflite"

# File listing of the merged router tree, keyed by split and then by crop.
all_router_splits = get_yolo_dataset_splits(all_router_splits_path)

# Print the router test accuracy of the float32 model, then of the float16 model.
for tflite_ckpt in (tflite_ckpt_f32, tflite_ckpt_f16):
    router_acc = test_tflite_ckpt(
        tflite_ckpt, all_router_splits_path, all_router_splits, verbose=False
    )
    print(router_acc)
