In [10]:
from ultralytics import YOLO
import torch
import os
import shutil
from pathlib import Path
import numpy as np
import random
import zipfile
import pandas as pd
import cv2

In [2]:
# Seed every random number generator this notebook relies on (Python's
# `random`, NumPy's global RNG and PyTorch) so repeated runs are comparable.
random_seed = 42
for seed_rng in (random.seed, np.random.seed):
    seed_rng(random_seed)
# Kept as the final expression so the cell still displays the torch Generator.
torch.manual_seed(random_seed)

<torch._C.Generator at 0x23c2aaddcf0>

# Load dataset from Kaggle

In [None]:
# this cell is for running in Kaggle
# load kaggle.json from Google Drive
# !gdown <item id>

In [None]:
# Make sure the Kaggle API credentials file is installed at
# <home>/.kaggle/kaggle.json. os.path.expanduser("~") resolves the home
# directory on Windows, Linux and macOS alike, so a single code path is used.
from sys import platform


def ensure_kaggle_json(source="./kaggle.json", target=None):
    """Install the Kaggle credentials file at ``target`` if it is missing.

    Args:
        source: Path of the ``kaggle.json`` file to copy from.
        target: Destination path. Defaults to ``~/.kaggle/kaggle.json``.

    Returns:
        True if ``target`` exists when the function returns, False if neither
        ``target`` nor ``source`` could be found (nothing is created then).
    """
    if target is None:
        target = os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json")

    if os.path.exists(target):
        print("kaggle.json exists")
        return True

    print("kaggle.json does not exist")
    if not os.path.isfile(source):
        # Report instead of raising: the credentials are only needed when the
        # dataset actually has to be downloaded.
        print(f"{source} was not found, so no credentials were installed")
        return False

    # The destination directory may not exist yet on any platform
    # (including Windows), so always create it before copying.
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    shutil.copy(source, target)
    if os.name == "posix":
        # Keep the API key readable by the current user only.
        os.chmod(target, 0o600)
    print("kaggle.json is copied to the directory")
    return True


# Human-readable platform label, printed for information only.
platform_labels = {"win32": "Windows", "linux": "Linux", "linux2": "Linux", "darwin": "macOS"}
print(platform_labels.get(platform, platform))

kaggle_json_path = os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json")
kaggle_json_ready = ensure_kaggle_json("./kaggle.json", kaggle_json_path)

In [None]:
# download and unzip the dataset
if(os.path.exists("./rice-image-dataset-for-object-detection")):
    print("The dataset is already downloaded")
else:
    !kaggle datasets download -d alikhalilit98/rice-image-dataset-for-object-detection
    with zipfile.ZipFile("rice-image-dataset-for-object-detection.zip", 'r') as zip_ref:
        zip_ref.extractall("./rice-image-dataset-for-rice-germ-detection")

In [4]:
# Where the raw Kaggle download lives, and where the train/val dataset
# assembled from it is written.
raw_dataset_path = './rice-image-dataset-for-rice-germ-detection'
dataset_path = './rice-germ-dataset'

# Create the dataset root plus an images/ and a labels/ folder for each split.
dataset_root = Path(dataset_path)
dataset_root.mkdir(parents=True, exist_ok=True)
for content_kind in ("images", "labels"):
    for split_name in ("train", "val"):
        (dataset_root / content_kind / split_name).mkdir(parents=True, exist_ok=True)

In [None]:
!gdown 1Gk73cVdNwL4PYF37XRBKa-lybbdE0k7O -O ./rice-germ-dataset/rice-germ-1050labels.zip

In [None]:
# Unpack the downloaded label archive directly into the dataset folder.
labels_archive = f"{dataset_path}/rice-germ-1050labels.zip"
with zipfile.ZipFile(labels_archive, 'r') as archive:
    archive.extractall(path=dataset_path)

# Splitting dataset

In [5]:
# Read the label table and order its rows by image file name; the original
# row index is kept. The last expression previews the first ten rows.
labels_csv = f"{dataset_path}/rice-germ-1050labels.csv"
df = pd.read_csv(labels_csv).sort_values(by=['image_name'])
df.head(10)

Unnamed: 0,label_name,bbox_x,bbox_y,bbox_width,bbox_height,image_name,image_width,image_height
0,Bad,155,93,50,57,Jasmine (1).jpg,250,250
1,Bad,156,106,49,64,Jasmine (10).jpg,250,250
2,Moderate,130,124,60,63,Jasmine (100).jpg,250,250
392,Good,64,62,48,44,Jasmine (1006).jpg,250,250
3,Moderate,112,49,61,59,Jasmine (101).jpg,250,250
393,Good,93,158,51,43,Jasmine (1010).jpg,250,250
4,Moderate,74,151,51,56,Jasmine (102).jpg,250,250
5,Good,157,96,46,52,Jasmine (103).jpg,250,250
6,Bad,88,150,61,60,Jasmine (104).jpg,250,250
7,Bad,152,94,60,61,Jasmine (105).jpg,250,250


In [6]:
from sklearn.model_selection import train_test_split

raw_labels_path = f"{dataset_path}/rice-germ-1050labels"
raw_images_path = f"{raw_dataset_path}/Rice_Image_Dataset_OD/Jasmine"

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting makes the seeded split reproducible across machines.
data_lists = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(raw_labels_path)
    if file_name.endswith(".txt")
)
print(data_lists[:10])

# Look the class of every sample up by name instead of relying on the CSV rows
# and the directory listing happening to be in the same order.
# NOTE(review): if an image has several rows in the CSV, the last row's label
# is the one used for stratification - confirm one row per image is expected.
label_by_sample = {
    os.path.splitext(image_name)[0]: label_name
    for image_name, label_name in zip(df["image_name"], df["label_name"])
}
unlabelled = [sample for sample in data_lists if sample not in label_by_sample]
if unlabelled:
    raise ValueError(
        f"{len(unlabelled)} label files have no row in the CSV, e.g. {unlabelled[:5]}"
    )
stratify_labels = [label_by_sample[sample] for sample in data_lists]

# 80/20 split that keeps the class proportions equal in both subsets.
train, val = train_test_split(
    data_lists,
    test_size=0.2,
    random_state=random_seed,
    stratify=stratify_labels,
)
print(f"train: {len(train)}, val: {len(val)}")

['Jasmine (1)', 'Jasmine (10)', 'Jasmine (100)', 'Jasmine (1006)', 'Jasmine (101)', 'Jasmine (1010)', 'Jasmine (102)', 'Jasmine (103)', 'Jasmine (104)', 'Jasmine (105)']
train: 840, val: 210


In [12]:
# Insert background images (all-black RGB, 128x128) with empty label files
# into each split; the number added is 10% of the split's sample count,
# rounded up.
def write_background_samples(split, count, image):
    """Write ``count`` background images and empty label files for ``split``.

    Files are named ``bg_<i>.jpg`` / ``bg_<i>.txt`` and are placed in
    ``{dataset_path}/images/<split>`` and ``{dataset_path}/labels/<split>``.

    Args:
        split: Name of the split folder, e.g. ``"train"`` or ``"val"``.
        count: Number of background samples to create.
        image: Image array passed to ``cv2.imwrite``.

    Raises:
        IOError: If ``cv2.imwrite`` reports that an image was not written.
    """
    for index in range(count):
        bg_name = f"bg_{index}"
        bg_path = f"{dataset_path}/images/{split}/{bg_name}.jpg"
        # cv2.imwrite signals failure through its return value, so check it
        # rather than continuing with a missing image.
        if not cv2.imwrite(bg_path, image):
            raise IOError(f"Could not write background image: {bg_path}")
        # An empty label file marks the image as containing no objects.
        with open(f"{dataset_path}/labels/{split}/{bg_name}.txt", "w"):
            pass


bg = np.zeros((128, 128, 3), dtype=np.uint8)
bg_train_num = int(np.ceil(len(train) * 0.1))
bg_val_num = int(np.ceil(len(val) * 0.1))

write_background_samples("train", bg_train_num, bg)
write_background_samples("val", bg_val_num, bg)

# These counts cover everything currently in the image folders, so they only
# include the real samples if those have already been copied there.
new_train_num = len(os.listdir(f"{dataset_path}/images/train"))
new_val_num = len(os.listdir(f"{dataset_path}/images/val"))
print(f"train: {new_train_num}, val: {new_val_num}")

train: 924, val: 231


In [13]:
# Copy every sample's label file and image into the folder of its split
# (train first, then val; label before image for each sample).
for split_name, sample_names in (("train", train), ("val", val)):
    for sample in sample_names:
        shutil.copy(
            f"{raw_labels_path}/{sample}.txt",
            f"{dataset_path}/labels/{split_name}/{sample}.txt",
        )
        shutil.copy(
            f"{raw_images_path}/{sample}.jpg",
            f"{dataset_path}/images/{split_name}/{sample}.jpg",
        )

# Create yaml file

In [None]:
import yaml

# Describe the dataset for training. Both the dataset root and the location of
# data.yaml are derived from dataset_path, so the file is written exactly where
# the training cell looks for it (f"{dataset_path}/data.yaml").
full_path = os.path.abspath(dataset_path)

yaml_data = dict(
    path=full_path,          # absolute dataset root
    train='images/train',    # relative to `path`
    val='images/val',        # relative to `path`
    # NOTE(review): these ids must match the class ids used inside the label
    # .txt files - confirm against the label export.
    names={0: 'Good', 1: 'Moderate', 2: 'Bad'},
)

# sort_keys=False keeps the keys in the order written above.
with open(f"{dataset_path}/data.yaml", "w") as f:
    yaml.dump(yaml_data, f, default_flow_style=False, sort_keys=False)

# Train model

In [None]:
# Training hyper-parameters, in order:
#   epochs - number of training epochs
#   batch  - batch size
#   imgsz  - image size used for training
epochs, batch, imgsz = 300, 32, 128

In [None]:
# Load the yolov8x.pt weights and train on the dataset described by data.yaml.
model = YOLO("yolov8x.pt")

# All training options are collected in one place for readability.
train_options = dict(
    data=f"{dataset_path}/data.yaml",
    epochs=epochs,
    batch=batch,
    imgsz=imgsz,
    device="0",
    project="rice-germ-detection",
    name="rice-germ-run",
    seed=random_seed,
)
trained_model = model.train(**train_options)