# Setting Up

In [13]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))
from configs import paths_helper

# Set up directoris for training a yolo model

# Images directories
from pathlib import Path
DATA_DIR = Path(paths_helper.DATA_DIR)
DATASET_DIR = DATA_DIR
IMAGES_DIR = DATASET_DIR / 'images'
TRAIN_IMAGES_DIR = IMAGES_DIR / 'train'
VAL_IMAGES_DIR = IMAGES_DIR / 'val'
TEST_IMAGES_DIR = IMAGES_DIR / 'test'

# Labels directories
LABELS_DIR = DATASET_DIR / 'labels'
TRAIN_LABELS_DIR = LABELS_DIR / 'train'
VAL_LABELS_DIR = LABELS_DIR / 'val'
TEST_LABELS_DIR = LABELS_DIR / 'test'

# Reading Data

In [None]:
import pandas as pd
# Load train and test files
train = pd.read_csv(DATA_DIR / 'Train.csv')
test = pd.read_csv(DATA_DIR / 'Test.csv')
ss = pd.read_csv(DATA_DIR / 'SampleSubmission.csv')

# Add an image_path column
train['image_path'] = [Path('images/' + x) for x in train.Image_ID]
test['image_path'] = [Path('images/' + x) for x in test.Image_ID]

# Map str classes to ints (label encoding targets)
class_mapper = {x:y for x,y in zip(sorted(train['class'].unique().tolist()), range(train['class'].nunique()))}
train['class_id'] = train['class'].map(class_mapper)

# Preview the head of the train set
train.head()

Unnamed: 0,Image_ID,confidence,class,ymin,xmin,ymax,xmax,image_path,class_id
0,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,194.649671,328.803454,208.10773,341.967928,images\id_11543h.jpg,5
1,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,149.632401,256.768914,162.910362,266.195724,images\id_11543h.jpg,5
2,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,234.046875,327.138158,252.712993,338.876645,images\id_11543h.jpg,5
3,id_11543h.jpg,1.0,Pepper_Bacterial_Spot,221.277138,340.411184,238.59375,354.651316,images\id_11543h.jpg,5
4,id_11ee1c.jpg,1.0,Pepper_Fusarium,2000.563598,989.588908,2184.252196,1401.748952,images\id_11ee1c.jpg,8


In [15]:
test.head()

Unnamed: 0,Image_ID,confidence,class,ymin,xmin,ymax,xmax,image_path
0,id_128pxx.jpg,,,,,,,images\id_128pxx.jpg
1,id_12jbci.jpg,,,,,,,images\id_12jbci.jpg
2,id_143s4o.jpg,,,,,,,images\id_143s4o.jpg
3,id_14tfmb.jpg,,,,,,,images\id_14tfmb.jpg
4,id_14tw4o.jpg,,,,,,,images\id_14tw4o.jpg


In [16]:
ss.head()

Unnamed: 0,Image_ID,class,confidence,ymin,xmin,ymax,xmax
0,id_128pxx.jpg,Corn_Cercospora_Leaf_Spot,0.5,100,100,100,100
1,id_128pxx.jpg,Corn_Common_Rust,0.5,100,100,100,100
2,id_128pxx.jpg,Corn_Healthy,0.5,100,100,100,100
3,id_128pxx.jpg,Corn_Northern_Leaf_Blight,0.5,100,100,100,100
4,id_128pxx.jpg,Corn_Streak,0.5,100,100,100,100


# Splitting Data

In [18]:
from sklearn.model_selection import train_test_split
# Split data into training and validation
train_unique_imgs_df = train.drop_duplicates(subset = ['Image_ID'], ignore_index = True)
X_train, X_val = train_test_split(train_unique_imgs_df, test_size = 0.25, stratify=train_unique_imgs_df['class'], random_state=42)

X_train = train[train.Image_ID.isin(X_train.Image_ID)]
X_val = train[train.Image_ID.isin(X_val.Image_ID)]

# Check shapes of training and validation data
X_train.shape, X_val.shape

((30777, 9), (10252, 9))

# Target Distribution (Imbalanced)

In [24]:
# Preview target distribution, seems there a class imbalance that needs to be handled

target_dis = pd.DataFrame(X_train['class'].value_counts(normalize = True))
target_dis['val'] = X_val['class'].value_counts(normalize = True)
target_dis

Unnamed: 0_level_0,proportion,val
class,Unnamed: 1_level_1,Unnamed: 2_level_1
Corn_Cercospora_Leaf_Spot,0.160444,0.156067
Tomato_Septoria,0.159047,0.154897
Tomato_Late_Blight,0.098905,0.085252
Corn_Streak,0.077201,0.078424
Tomato_Healthy,0.069045,0.072864
Pepper_Septoria,0.051922,0.067987
Pepper_Leaf_Mosaic,0.051662,0.051014
Tomato_Early_Blight,0.047763,0.04643
Pepper_Bacterial_Spot,0.047665,0.04643
Corn_Common_Rust,0.04029,0.043114


# Creating Directories

In [25]:
# Check if dirs exist, if they do, remove them, otherwise create them.
# This only needs to run once
import shutil
for DIR in [TRAIN_IMAGES_DIR,VAL_IMAGES_DIR, TEST_IMAGES_DIR, TRAIN_LABELS_DIR,VAL_LABELS_DIR,TEST_LABELS_DIR]:
  if DIR.exists():
    shutil.rmtree(DIR)
  DIR.mkdir(parents=True, exist_ok = True)

In [28]:
import shutil
from concurrent.futures import ThreadPoolExecutor
from tqdm.asyncio import tqdm
import asyncio
from pathlib import Path
import nest_asyncio

# Create a shared ThreadPoolExecutor
executor = ThreadPoolExecutor()

async def copy_image(img, dest_dir):
    """Copies a single image to the destination directory."""
    await asyncio.to_thread(shutil.copy, img, dest_dir / Path(img).name)

async def copy_images_with_progress(imgs, dest_dir):
    """Copies images with a progress bar asynchronously using ThreadPoolExecutor."""
    with tqdm(total=len(imgs), desc=f"Copying to {dest_dir.name}") as pbar:
        tasks = [copy_image(img, dest_dir) for img in imgs]
        for coro in asyncio.as_completed(tasks):
            await coro  # Wait for each task to complete
            pbar.update()

async def main():
    # Run all the copying tasks concurrently
    await asyncio.gather(
        copy_images_with_progress(X_train.image_path.unique(), TRAIN_IMAGES_DIR),
        copy_images_with_progress(X_val.image_path.unique(), VAL_IMAGES_DIR),
        copy_images_with_progress(test.image_path.unique(), TEST_IMAGES_DIR),
    )

# Allow nested use of asyncio.run()
nest_asyncio.apply()

# Run the main event loop
asyncio.run(main())

Copying to train:   0%|          | 0/3676 [00:00<?, ?it/s]
Copying to train:   0%|          | 0/3676 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'images\\id_517hum.jpg'

Copying to val:   0%|          | 0/1226 [00:01<?, ?it/s]
Copying to test:   0%|          | 0/2101 [00:01<?, ?it/s]


In [29]:
# Copy train, val and test images to their respective dirs
for img in tqdm(X_train.image_path.unique()):
  shutil.copy(img, TRAIN_IMAGES_DIR / img.parts[-1])

for img in tqdm(X_val.image_path.unique()):
  shutil.copy(img, VAL_IMAGES_DIR / img.parts[-1])

for img in tqdm(test.image_path.unique()):
  shutil.copy(img, TEST_IMAGES_DIR / img.parts[-1])

  0%|          | 0/3676 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'images\\id_11543h.jpg'