# Mount and Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from zipfile import ZipFile
# Archive with the raw dataset on Drive, and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/MV Project/crop & weed.zip'
extract_path = '/content/dataset'

# Unpack every member of the archive below extract_path.
with ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Raw Dataset

In [None]:
import os
import xml.etree.ElementTree as ET

# Folder that holds the XML annotation files.
annotation_dir = "/content/dataset/Ronin_OPEN_DB/annotations"  # path

# Running totals of annotated objects per class (exact, case-sensitive name match).
weed_count = 0
crop_count = 0

for entry in os.listdir(annotation_dir):
    # Anything that is not an XML annotation is ignored.
    if not entry.endswith(".xml"):
        continue

    try:
        annotation_root = ET.parse(os.path.join(annotation_dir, entry)).getroot()

        # Every <object> element contributes to the counter of its <name>.
        for node in annotation_root.findall("object"):
            label = node.find("name").text
            if label == "weed":
                weed_count += 1
            elif label == "crop":
                crop_count += 1

    except Exception as e:
        # Best-effort scan: report the problematic file and carry on with the next one.
        print(f"Error processing {entry}: {e}")

# Summary
print(f"Total 'weed' instances: {weed_count}")
print(f"Total 'crop' instances: {crop_count}")
print(f"Total objects: {weed_count + crop_count}")

Total 'weed' instances: 7442
Total 'crop' instances: 411
Total objects: 7853


In [None]:
# Folder that holds the XML annotation files.
annotation_dir = "/content/dataset/Ronin_OPEN_DB/annotations"  # path

# Object totals per class, and number of files containing each class at least once.
weed_count = 0
crop_count = 0
files_with_weed = 0
files_with_crop = 0

for entry in os.listdir(annotation_dir):
    # Anything that is not an XML annotation is ignored.
    if not entry.endswith(".xml"):
        continue

    try:
        annotation_root = ET.parse(os.path.join(annotation_dir, entry)).getroot()

        # Classes met in this file; only filled for the two classes of interest.
        seen = set()
        for node in annotation_root.findall("object"):
            label = node.find("name").text.lower()  # Case-insensitive
            if label == "weed":
                weed_count += 1
            elif label == "crop":
                crop_count += 1
            else:
                continue
            seen.add(label)

        # Per-file tallies are only updated once the whole file was read successfully.
        if "weed" in seen:
            files_with_weed += 1
        if "crop" in seen:
            files_with_crop += 1

    except Exception as e:
        # Best-effort scan: report the problematic file and carry on with the next one.
        print(f"Error processing {entry}: {e}")

# Summary
print(f"Total 'weed' instances: {weed_count}")
print(f"Total 'crop' instances: {crop_count}")
print(f"Total objects: {weed_count + crop_count}")
print(f"Files with at least one 'weed': {files_with_weed}")
print(f"Files with at least one 'crop': {files_with_crop}")
print(f"Total XML files processed: {sum(1 for f in os.listdir(annotation_dir) if f.endswith('.xml'))}")

Total 'weed' instances: 7442
Total 'crop' instances: 411
Total objects: 7853
Files with at least one 'weed': 1132
Files with at least one 'crop': 103
Total XML files processed: 1176


In [None]:
# Look for the first directory below start_path that holds both XML annotations and
# JPEG images; when there is none, list every directory that was visited.
start_path = "/content/dataset"

# Variable to track
found = False

# Walk through
for root, dirs, files in os.walk(start_path):
    # Extensions are compared in lower case so that files such as "IMG_5934.JPG"
    # are recognised as images too (a case-sensitive '.jpg' test misses them).
    xml_files = [f for f in files if f.lower().endswith('.xml')]
    jpg_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))]

    # find
    if xml_files and jpg_files:
        directory = os.path.abspath(root)
        print(f"First directory with both XML and JPG found: {directory}")
        print(f"Some XML files: {xml_files[:3]}")  # Show up to 3 XML files
        print(f"Some JPG files: {jpg_files[:3]}")  # Show up to 3 JPG files
        found = True
        break  # Stop

if not found:
    # Nothing matched: show what was visited so the real layout can be read off.
    print("\nDirectories searched:")
    for root, dirs, files in os.walk(start_path):
        print(f"Checked: {root}")
        if files:
            print(f"Files found: {files[:5]}")  # Show up to 5 files


Directories searched:
Checked: /content/dataset
Checked: /content/dataset/Ronin_OPEN_DB
Checked: /content/dataset/Ronin_OPEN_DB/annotations
Files found: ['32917.xml', '32556.xml', 'IMG_5965.xml', 'IMG_5976.xml', '32331.xml']
Checked: /content/dataset/Ronin_OPEN_DB/raw images
Files found: ['34294.jpg', 'IMG_5934.JPG', '32747.jpg', '32502.jpg', 'IMG_6119.JPG']


# Prepare Dataset

In [None]:
import os
import xml.etree.ElementTree as ET
import shutil
from pathlib import Path
# Dataset locations: images and annotations are sub-folders of base_path,
# the converted dataset is written to output_dir.
base_path = "/content/dataset/Ronin_OPEN_DB"
image_dir, annotation_dir = (
    os.path.join(base_path, sub) for sub in ("raw images", "annotations")
)
output_dir = "/content/yolo_format_dataset"

# Annotation class name -> integer class id written to the YOLO label files.
class_map = dict(weed=0, crop=1)

In [None]:
# Convert XML to YOLO format
def xml_to_yolo(xml_path, class_mapping=None):
    """Convert one XML annotation file into YOLO label text.

    Each ``<object>`` becomes a line ``"<class_id> <x_center> <y_center> <width> <height>"``
    with the four box values normalised by the ``<size>`` of the image.

    Args:
        xml_path: Path of the XML annotation file.
        class_mapping: Optional ``{class name: class id}`` dict. When omitted, the
            notebook-level ``class_map`` is used.

    Returns:
        The label lines joined with newlines; an empty string when no object is usable.

    Raises:
        ValueError: If the annotated image width or height is not positive.
    """
    if class_mapping is None:
        class_mapping = class_map

    root = ET.parse(xml_path).getroot()
    size = root.find('size')
    img_width = float(size.find('width').text)
    img_height = float(size.find('height').text)
    if img_width <= 0 or img_height <= 0:
        raise ValueError(f"Invalid image size {img_width}x{img_height} in {xml_path}")

    yolo_lines = []

    for obj in root.findall('object'):
        raw_name = obj.find('name').text or ""

        # Try the name as written first, then a trimmed lower-case variant, so that
        # "Weed" or " crop " map to the same ids as "weed" / "crop".
        class_id = class_mapping.get(raw_name)
        if class_id is None:
            class_id = class_mapping.get(raw_name.strip().lower())
        if class_id is None:
            print(f"Warning: Class '{raw_name}' not in class_map. Skipping object in {xml_path}")
            continue

        bbox = obj.find('bndbox')
        # Clamp the corners to the image: normalised YOLO values must stay within [0, 1].
        xmin = min(max(float(bbox.find('xmin').text), 0.0), img_width)
        ymin = min(max(float(bbox.find('ymin').text), 0.0), img_height)
        xmax = min(max(float(bbox.find('xmax').text), 0.0), img_width)
        ymax = min(max(float(bbox.find('ymax').text), 0.0), img_height)

        # A box without area carries no information for training.
        if xmax <= xmin or ymax <= ymin:
            print(f"Warning: Degenerate box for class '{raw_name}'. Skipping object in {xml_path}")
            continue

        x_center = (xmin + xmax) / 2 / img_width
        y_center = (ymin + ymax) / 2 / img_height
        width = (xmax - xmin) / img_width
        height = (ymax - ymin) / img_height

        yolo_lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")

    return "\n".join(yolo_lines)

In [None]:
import numpy as np
# Prepare
def prepare_stratified_dataset(image_dir, annotation_dir, output_dir,
                               train_ratio=0.9, val_ratio=0.05, seed=42):
    """Build a YOLO-format train/validation/test dataset, split per content group.

    Every JPEG in ``image_dir`` with a same-named ``.xml`` file in ``annotation_dir``
    is converted with ``xml_to_yolo`` and put into one of three groups (weed only,
    crop only, both). Each group is shuffled and split on its own, so every split
    receives images from every group. Images and label files are then written below
    ``output_dir`` together with a ``data.yaml``.

    Args:
        image_dir: Folder with the source images.
        annotation_dir: Folder with the XML annotations.
        output_dir: Root folder of the dataset to create.
        train_ratio: Fraction of each group used for training.
        val_ratio: Fraction of each group used for validation; the remainder is test.
        seed: Seed of the shuffle. A fixed seed makes the split reproducible, so
            re-running the cell rewrites the same files instead of scattering an
            image over several splits of an already populated ``output_dir``.

    Returns:
        Dict mapping ``'<split>_images'`` / ``'<split>_labels'`` to the created folders.

    Raises:
        ValueError: If the ratios are negative or add up to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError("train_ratio and val_ratio must be non-negative and sum to at most 1")

    rng = np.random.default_rng(seed)

    def create_dataset_structure():
        # Create the images/<split> and labels/<split> folders.
        dirs = {
            'train_images': os.path.join(output_dir, 'images/train'),
            'validation_images': os.path.join(output_dir, 'images/validation'),
            'test_images': os.path.join(output_dir, 'images/test'),
            'train_labels': os.path.join(output_dir, 'labels/train'),
            'validation_labels': os.path.join(output_dir, 'labels/validation'),
            'test_labels': os.path.join(output_dir, 'labels/test'),
        }
        for d in dirs.values():
            os.makedirs(d, exist_ok=True)
        return dirs

    # Group. Sorting removes the dependence on the directory listing order, which is
    # needed for the seeded shuffle to give the same split every time.
    image_files = sorted(f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg')))
    groups = {'weed_only': [], 'crop_only': [], 'both': []}
    missing_annotation = []
    no_usable_object = []

    for img_file in image_files:
        base = os.path.splitext(img_file)[0]
        ann_file = os.path.join(annotation_dir, base + '.xml')
        if not os.path.exists(ann_file):
            missing_annotation.append(img_file)
            continue

        yolo_txt = xml_to_yolo(ann_file)
        # The first token of every label line is the class id.
        classes = {line.split()[0] for line in yolo_txt.splitlines() if line.strip()}
        if classes == {'0'}:
            groups['weed_only'].append((img_file, yolo_txt))
        elif classes == {'1'}:
            groups['crop_only'].append((img_file, yolo_txt))
        elif classes == {'0', '1'}:
            groups['both'].append((img_file, yolo_txt))
        else:
            no_usable_object.append(img_file)

    # Images left out of the dataset are reported instead of vanishing silently.
    if missing_annotation:
        print(f"Skipped {len(missing_annotation)} image(s) without annotation file")
    if no_usable_object:
        print(f"Skipped {len(no_usable_object)} image(s) without usable objects")

    def stratified_split(data):
        # Shuffle a copy (the caller's list stays untouched) and cut it in three.
        shuffled = [data[i] for i in rng.permutation(len(data))]
        total = len(shuffled)
        train_end = int(total * train_ratio)
        val_end = train_end + int(total * val_ratio)
        return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

    # Apply splitting
    train_data, val_data, test_data = [], [], []
    for group in groups.values():
        tr, va, te = stratified_split(group)
        train_data += tr
        val_data += va
        test_data += te

    dirs = create_dataset_structure()

    def copy_data(data, img_dest, label_dest):
        # Copy each image and write its label text under the same base name.
        for img_file, yolo_txt in data:
            base = os.path.splitext(img_file)[0]
            shutil.copy(os.path.join(image_dir, img_file), os.path.join(img_dest, img_file))

            label_path = os.path.join(label_dest, base + '.txt')
            with open(label_path, 'w') as f:
                f.write(yolo_txt)

    copy_data(train_data, dirs['train_images'], dirs['train_labels'])
    copy_data(val_data, dirs['validation_images'], dirs['validation_labels'])
    copy_data(test_data, dirs['test_images'], dirs['test_labels'])

    # data.yaml
    data_yaml = f"""
path: {output_dir}
train: images/train
val: images/validation
test: images/test
names:
  0: weed
  1: crop
"""
    yaml_path = os.path.join(output_dir, "data.yaml")
    with open(yaml_path, 'w') as f:
        f.write(data_yaml.strip())

    return dirs

In [None]:
prepare_stratified_dataset(
    image_dir="/content/dataset/Ronin_OPEN_DB/raw images",
    annotation_dir="/content/dataset/Ronin_OPEN_DB/annotations",
    output_dir="/content/yolo_format_dataset"
)

{'train_images': '/content/yolo_format_dataset/images/train',
 'validation_images': '/content/yolo_format_dataset/images/validation',
 'test_images': '/content/yolo_format_dataset/images/test',
 'train_labels': '/content/yolo_format_dataset/labels/train',
 'validation_labels': '/content/yolo_format_dataset/labels/validation',
 'test_labels': '/content/yolo_format_dataset/labels/test'}

In [None]:
import zipfile
import os
import shutil

# Define
output_dir = "/content/yolo_format_dataset"
zip_path = "/content/yolo_format_dataset.zip"

# Create zip
def create_zip(source_dir=None, archive_path=None):
    """Zip a folder, keeping the folder itself as the top-level entry of the archive.

    Args:
        source_dir: Folder to archive. Defaults to the notebook-level ``output_dir``.
        archive_path: Path of the zip file to write. Defaults to the notebook-level
            ``zip_path``.

    Returns:
        The path of the archive that was written.
    """
    if source_dir is None:
        source_dir = output_dir
    if archive_path is None:
        archive_path = zip_path

    # normpath removes a trailing separator; with one, dirname() would return the
    # folder itself and the top-level folder name would be missing in the archive.
    source_dir = os.path.normpath(source_dir)
    parent_dir = os.path.dirname(source_dir)
    archive_abs = os.path.abspath(archive_path)

    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it lives inside source_dir.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                arcname = os.path.relpath(file_path, parent_dir)  # preserve the source folder name
                zipf.write(file_path, arcname)
    return archive_path


zip_file = create_zip()
print(f"Zip created: {zip_file}")

Zip created: /content/yolo_format_dataset.zip


In [None]:
import os

def check_yolo_pairs(image_dir, label_dir):
    """Print how well the images in ``image_dir`` and the labels in ``label_dir`` pair up.

    Files are matched by base name (file name without extension). The totals are
    printed first, followed by the unmatched images and labels, if any.

    Args:
        image_dir: Folder with ``.jpg`` / ``.jpeg`` / ``.png`` files (any letter case).
        label_dir: Folder with ``.txt`` label files.
    """
    image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    label_files = [f for f in os.listdir(label_dir) if f.endswith('.txt')]

    # Remember the real file names per base name so they can be reported verbatim.
    images_by_base = {}
    for f in image_files:
        images_by_base.setdefault(os.path.splitext(f)[0], []).append(f)

    image_basenames = set(images_by_base)
    label_basenames = set(os.path.splitext(f)[0] for f in label_files)

    missing_labels = image_basenames - label_basenames
    missing_images = label_basenames - image_basenames

    print(f"Total images: {len(image_files)}")
    print(f"Total labels: {len(label_files)}")
    print(f"Images with NO matching label: {len(missing_labels)}")
    print(f"Labels with NO matching image: {len(missing_images)}")

    if missing_labels:
        print("\nImages without labels:")
        for base in sorted(missing_labels):
            # Print the actual file name; the extension is not necessarily ".jpg".
            for f in sorted(images_by_base[base]):
                print(f)

    if missing_images:
        print("\nLabels without images:")
        for f in sorted(missing_images):
            print(f"{f}.txt")

# Run the checks
check_yolo_pairs('/content/yolo_format_dataset/images/train', '/content/yolo_format_dataset/labels/train')
check_yolo_pairs('/content/yolo_format_dataset/images/validation', '/content/yolo_format_dataset/labels/validation')
check_yolo_pairs('/content/yolo_format_dataset/images/test', '/content/yolo_format_dataset/labels/test')

Total images: 1057
Total labels: 1057
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 57
Total labels: 57
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 62
Total labels: 62
Images with NO matching label: 0
Labels with NO matching image: 0


In [None]:
def check_weed_and_crop(file_path):
    """Tell whether a YOLO label file lists both class 0 (weed) and class 1 (crop).

    Args:
        file_path: Path of the label file; the first token of each non-empty line
            is read as the integer class id.

    Returns:
        True when both ids occur in the file. False otherwise, including when the
        file cannot be read (the error is printed in that case).
    """
    try:
        with open(file_path, 'r') as handle:
            rows = handle.readlines()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return False

    # Class ids found in the file; blank lines do not contribute.
    present = {int(row.split()[0]) for row in rows if row.split()}
    return 0 in present and 1 in present

In [None]:
def check_three_folders_for_weed_and_crop(folder1_path, folder2_path, folder3_path):
    """Print, per folder and overall, how many label files contain both classes.

    Every ``.txt`` file of a folder is passed to ``check_weed_and_crop``; files for
    which it returns a true value count as "both", all others (one class only, no
    class, unreadable) count as "either ... or neither".

    Args:
        folder1_path, folder2_path, folder3_path: Label folders, reported as
            Folder 1, Folder 2 and Folder 3 in that order.
    """
    def process_folder(folder_path):
        # One result per label file of the folder.
        outcomes = [
            check_weed_and_crop(os.path.join(folder_path, name))
            for name in os.listdir(folder_path)
            if name.endswith('.txt')
        ]
        with_both = sum(1 for outcome in outcomes if outcome)
        return with_both, len(outcomes) - with_both, len(outcomes)

    grand_both = grand_either = grand_total = 0

    # Each folder is scanned and reported before the next one is touched.
    for number, folder in enumerate((folder1_path, folder2_path, folder3_path), start=1):
        print(f"\nProcessing Folder {number}: {folder}")
        both, either, total = process_folder(folder)
        print(f"Files with both 'weed' and 'crop': {both}")
        print(f"Files with either 'weed' or 'crop' or neither: {either}")
        print(f"Total files in Folder {number}: {total}")

        grand_both += both
        grand_either += either
        grand_total += total

    # Grand totals
    print(f"\nGrand Totals Across All Folders:")
    print(f"Total files with both 'weed' and 'crop': {grand_both}")
    print(f"Total files with either 'weed' or 'crop' or neither: {grand_either}")
    print(f"Total files processed: {grand_total}")

In [None]:
# usage
check_three_folders_for_weed_and_crop(
    '/content/yolo_format_dataset/labels/train',
    '/content/yolo_format_dataset/labels/validation',
    '/content/yolo_format_dataset/labels/test'
)


Processing Folder 1: /content/yolo_format_dataset/labels/train
Files with both 'weed' and 'crop': 53
Files with either 'weed' or 'crop' or neither: 1004
Total files in Folder 1: 1057

Processing Folder 2: /content/yolo_format_dataset/labels/validation
Files with both 'weed' and 'crop': 2
Files with either 'weed' or 'crop' or neither: 55
Total files in Folder 2: 57

Processing Folder 3: /content/yolo_format_dataset/labels/test
Files with both 'weed' and 'crop': 4
Files with either 'weed' or 'crop' or neither: 58
Total files in Folder 3: 62

Grand Totals Across All Folders:
Total files with both 'weed' and 'crop': 59
Total files with either 'weed' or 'crop' or neither: 1117
Total files processed: 1176


In [None]:
# Move to Google Drive
destination_path = "/content/drive/MyDrive/MV Project/"

# Move to an explicit target file rather than "into the folder": shutil.move raises
# when the folder already holds a file of that name, which makes a re-run of this
# cell fail. The folder is created first in case it does not exist yet.
os.makedirs(destination_path, exist_ok=True)
shutil.move(zip_file, os.path.join(destination_path, os.path.basename(zip_file)))

print(f"Zip file moved to Google Drive: {destination_path}")

Zip file moved to Google Drive: /content/drive/MyDrive/MV Project/


# Dataset Augmentation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from zipfile import ZipFile

# Unpack the YOLO-format archive from Drive into /content/.
with ZipFile('/content/drive/MyDrive/MV Project/yolo_format_dataset.zip', mode='r') as archive:
    archive.extractall(path='/content/')

In [None]:
import os

# Helper function to check contents of a label file
def check_weed_and_crop(file_path):
    """Classify a YOLO label file by the classes it contains.

    The class id is the first whitespace-separated token of each line. It is parsed
    as a number instead of being matched by prefix, so ids such as ``10`` are not
    mistaken for class 1, lines with leading blanks are handled, and ids written as
    ``0.0`` / ``1.0`` are accepted.

    Args:
        file_path: Path of the label file.

    Returns:
        ``"both"``, ``"weed"`` (only class 0), ``"crop"`` (only class 1) or
        ``"neither"``.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    class_ids = set()
    for line in lines:
        parts = line.split()
        if not parts:
            continue  # blank line
        try:
            value = float(parts[0])
        except ValueError:
            continue  # not a numeric class id
        if value.is_integer():
            class_ids.add(int(value))

    has_weed = 0 in class_ids
    has_crop = 1 in class_ids

    if has_weed and has_crop:
        return "both"
    elif has_weed:
        return "weed"
    elif has_crop:
        return "crop"
    else:
        return "neither"

In [None]:
# Minimal output version with folder-specific labels
def check_three_folders_for_weed_and_crop(folder1_path, folder2_path, folder3_path):
    """Print weed-only / crop-only / both file counts for three label folders.

    Every ``.txt`` file is classified with ``check_weed_and_crop``. Files reported
    as anything other than "weed", "crop" or "both" are not counted, so they are
    also missing from the totals.

    Args:
        folder1_path, folder2_path, folder3_path: Label folders, reported as
            Folder 1, Folder 2 and Folder 3 in that order.
    """
    def process_folder(folder_path):
        tally = {"weed": 0, "crop": 0, "both": 0}
        for name in os.listdir(folder_path):
            if not name.endswith('.txt'):
                continue
            kind = check_weed_and_crop(os.path.join(folder_path, name)).strip()
            if kind in tally:
                tally[kind] += 1
        return tally["weed"], tally["crop"], tally["both"], sum(tally.values())

    # Running sums in the order weed, crop, both, total.
    grand = [0, 0, 0, 0]

    for number, folder in enumerate((folder1_path, folder2_path, folder3_path), start=1):
        weed, crop, both, total = process_folder(folder)
        print(f"Folder {number}: Weed = {weed}, Crop = {crop}, Both = {both}, Total = {total}")
        grand = [acc + value for acc, value in zip(grand, (weed, crop, both, total))]

    # Grand totals
    print(f"Grand Total: Weed = {grand[0]}, Crop = {grand[1]}, Both = {grand[2]}, Total = {grand[3]}")

In [None]:
# usage
check_three_folders_for_weed_and_crop(
    '/content/yolo_format_dataset/labels/train',
    '/content/yolo_format_dataset/labels/validation',
    '/content/yolo_format_dataset/labels/test'
)

Folder 1: Weed = 965, Crop = 39, Both = 53, Total = 1057
Folder 2: Weed = 53, Crop = 2, Both = 2, Total = 57
Folder 3: Weed = 55, Crop = 3, Both = 4, Total = 62
Grand Total: Weed = 1073, Crop = 44, Both = 59, Total = 1176


In [None]:
!pip install -q albumentations opencv-python

In [None]:
import os
import cv2
import glob
import random
import shutil
import albumentations as A
from tqdm import tqdm

In [None]:
# Define paths
# BASE_PATH: root folder of the YOLO-format dataset; SPLITS: names of its split
# sub-folders; CLASS_NAMES: class id -> class name.
# NOTE(review): the augmentation cells visible below spell their paths out literally
# instead of using these constants — confirm whether they are still needed.
BASE_PATH = "/content/yolo_format_dataset"
SPLITS = ["train", "validation", "test"]
CLASS_NAMES = {0: 'weed', 1: 'crop'}

In [None]:
# Augmentation configs (except zoom)
# One pipeline shared by all augmentation calls: flips, 90-degree and free rotation
# (limit=30, presumably degrees — confirm against the albumentations docs),
# brightness/contrast and hue/saturation changes, motion blur and shear. Each step
# receives its own `p`, so a different subset of steps can fire on every call.
# Bounding boxes are passed in 'yolo' format and their classes travel in the
# `class_labels` field, which is why callers must supply `class_labels=...`.
AUGMENTATIONS = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.Rotate(limit=30, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.HueSaturationValue(p=0.5),
    A.MotionBlur(p=0.2),
    A.Affine(shear={"x": 20, "y": 20}, p=0.3)
], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

In [None]:
# Global counter for ignored images
ignored_images_counter = 0

def get_class_type(label_file):
    """Classify a YOLO label file as 'weed', 'crop' or 'both'.

    The class id is the first token of every line; ids written as floats
    ('0.0', '1.0') are accepted. A file is rejected as a whole when any line is
    empty, carries a non-numeric class, carries a class other than 0 or 1, or when
    the file holds no class at all.

    Args:
        label_file: Path of the label file.

    Returns:
        'weed' (only class 0), 'crop' (only class 1), 'both', or None when the file
        is rejected or cannot be read. Every rejection prints the reason and
        increments the module-level ``ignored_images_counter`` exactly once.
    """
    global ignored_images_counter
    try:
        with open(label_file, 'r') as f:
            lines = f.readlines()
            classes = set()

            for idx, line in enumerate(lines):
                parts = line.strip().split()
                if len(parts) < 1:
                    print(f"Malformed line in {label_file} at line {idx + 1}: '{line.strip()}'")
                    ignored_images_counter += 1
                    return None

                cls_str = parts[0]

                try:
                    cls = int(float(cls_str))  # handles '0.0', '1.0', etc.
                except (ValueError, OverflowError):
                    # Reject the file like any other malformed one. Carrying on would
                    # count the image as ignored while still using it (possibly more
                    # than once per file), and code that parses every field of every
                    # line as a number would fail on this line later.
                    print(f"Invalid class label '{cls_str}' in {label_file} at line {idx + 1}")
                    ignored_images_counter += 1
                    return None

                if cls not in [0, 1]:
                    print(f"Unexpected class '{cls}' in {label_file} at line {idx + 1}")
                    ignored_images_counter += 1
                    return None

                classes.add(cls)

            if classes == {1}:
                return 'crop'
            elif classes == {0}:
                return 'weed'
            elif classes == {0, 1}:
                return 'both'
            else:
                # Only reachable for a file without any label line.
                print(f"Unknown class combination in {label_file}: {classes}")
                ignored_images_counter += 1
                return None
    except Exception as e:
        print(f"Error reading {label_file}: {e}")
        ignored_images_counter += 1
        return None

In [None]:
def _clip_yolo_bbox(x, y, w, h):
    """Clip a normalised YOLO box (centre x, centre y, width, height) to the unit square.

    Returns the clipped ``[x, y, w, h]`` list, or None when no area is left.
    """
    x_min = max(x - w / 2.0, 0.0)
    y_min = max(y - h / 2.0, 0.0)
    x_max = min(x + w / 2.0, 1.0)
    y_max = min(y + h / 2.0, 1.0)
    if x_max <= x_min or y_max <= y_min:
        return None
    return [(x_min + x_max) / 2.0, (y_min + y_max) / 2.0, x_max - x_min, y_max - y_min]


def apply_augmentation(image_path, label_path, class_type, augment_times, image_dir, label_dir):
    """Write up to ``augment_times`` augmented copies of one image and its labels.

    The image and its YOLO boxes are passed through the notebook-level
    ``AUGMENTATIONS`` pipeline. Every successful result is saved as
    ``<name>_aug_<i>.jpg`` in ``image_dir`` with a matching ``<name>_aug_<i>.txt``
    in ``label_dir``.

    Args:
        image_path: Path of the source image.
        label_path: Path of its YOLO label file.
        class_type: Tag used in the printed messages (upper-cased).
        augment_times: Number of augmentation attempts.
        image_dir: Folder receiving the augmented images.
        label_dir: Folder receiving the augmented label files.

    Returns:
        Number of image/label pairs written (0 when the image is unreadable or has
        no valid box).
    """
    tag = class_type.upper()

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure with None rather than an exception.
        print(f"[{tag}] Could not read image: {image_path}")
        return 0

    # Load and parse original bounding boxes
    bboxes = []
    class_labels = []
    with open(label_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines
            cls, x, y, w, h = map(float, parts)
            # Clip the box as a whole: clamping centre and size independently can
            # still leave a box that sticks out of the image.
            clipped = _clip_yolo_bbox(x, y, w, h)
            if clipped is not None:
                bboxes.append(clipped)
                class_labels.append(int(cls))

    if len(bboxes) == 0:
        return 0  # skip images with no valid bboxes

    base_filename = os.path.splitext(os.path.basename(image_path))[0]
    saved = 0

    for i in range(augment_times):
        try:
            augmented = AUGMENTATIONS(image=image, bboxes=bboxes, class_labels=class_labels)
        except ValueError as err:
            print(f"[{tag}] Aug {i+1}/{augment_times} for {image_path} skipped: {err}")
            continue

        # Post-clip and validate bboxes
        kept = []
        for cls, bbox in zip(augmented['class_labels'], augmented['bboxes']):
            clipped = _clip_yolo_bbox(*[float(v) for v in bbox[:4]])
            if clipped is not None:
                # int(): the class id must be written as an integer even if the
                # pipeline hands it back as a float.
                kept.append((int(cls), clipped))

        if not kept:
            print(f"[{tag}] Aug {i+1}/{augment_times} for {image_path} skipped: no box left after augmentation")
            continue

        new_image_path = os.path.join(image_dir, f"{base_filename}_aug_{i}.jpg")
        new_label_path = os.path.join(label_dir, f"{base_filename}_aug_{i}.txt")

        # Save image; the label is only written when the image was, so pairs stay complete.
        if not cv2.imwrite(new_image_path, augmented['image']):
            print(f"[{tag}] Aug {i+1}/{augment_times} for {image_path} skipped: could not write {new_image_path}")
            continue

        # Save label
        with open(new_label_path, 'w') as out_f:
            for cls, bbox in kept:
                out_f.write(f"{cls} {' '.join(f'{v:.6f}' for v in bbox)}\n")

        saved += 1

    return saved

In [None]:
print("\n Augmenting TRAIN set")
train_img_dir = "/content/yolo_format_dataset/images/train"
train_lbl_dir = "/content/yolo_format_dataset/labels/train"

train_imgs = glob.glob(os.path.join(train_img_dir, "*.jpg")) + \
             glob.glob(os.path.join(train_img_dir, "*.JPG"))
for image_path in tqdm(train_imgs, desc="Train"):
    stem = os.path.basename(image_path).rsplit('.', 1)[0]

    # Files named "<stem>_aug_<n>" were written by an earlier run of this cell.
    # Skipping them keeps a re-run from augmenting augmented images again.
    _, marker, suffix = stem.rpartition("_aug_")
    if marker and suffix.isdigit():
        continue

    label_path = os.path.join(train_lbl_dir, stem + ".txt")
    if not os.path.exists(label_path):
        continue

    # Only images that contain nothing but crop objects are augmented (25 attempts each).
    class_type = get_class_type(label_path)
    if class_type == 'crop':
        apply_augmentation(image_path, label_path, class_type, 25, train_img_dir, train_lbl_dir)



 Augmenting TRAIN set


Train: 100%|██████████| 1057/1057 [00:21<00:00, 48.90it/s]


In [None]:
# NOTE(review): augmenting an evaluation split changes what its metrics measure —
# confirm that this is intended.
print("\n Augmenting VALIDATION set")
val_img_dir = "/content/yolo_format_dataset/images/validation"
val_lbl_dir = "/content/yolo_format_dataset/labels/validation"

val_imgs = glob.glob(os.path.join(val_img_dir, "*.jpg")) + \
           glob.glob(os.path.join(val_img_dir, "*.JPG"))
for image_path in tqdm(val_imgs, desc="Validation"):
    stem = os.path.basename(image_path).rsplit('.', 1)[0]

    # Files named "<stem>_aug_<n>" were written by an earlier run of this cell.
    # Skipping them keeps a re-run from augmenting augmented images again.
    _, marker, suffix = stem.rpartition("_aug_")
    if marker and suffix.isdigit():
        continue

    label_path = os.path.join(val_lbl_dir, stem + ".txt")
    if not os.path.exists(label_path):
        continue

    # Only images that contain nothing but crop objects are augmented (25 attempts each).
    class_type = get_class_type(label_path)
    if class_type == 'crop':
        apply_augmentation(image_path, label_path, class_type, 25, val_img_dir, val_lbl_dir)



 Augmenting VALIDATION set


Validation: 100%|██████████| 57/57 [00:00<00:00, 95.37it/s] 


In [None]:
# NOTE(review): augmenting an evaluation split changes what its metrics measure —
# confirm that this is intended.
print("\n Augmenting TEST set")
test_img_dir = "/content/yolo_format_dataset/images/test"
test_lbl_dir = "/content/yolo_format_dataset/labels/test"

test_imgs = glob.glob(os.path.join(test_img_dir, "*.jpg")) + \
            glob.glob(os.path.join(test_img_dir, "*.JPG"))
for image_path in tqdm(test_imgs, desc="Test"):
    stem = os.path.basename(image_path).rsplit('.', 1)[0]

    # Files named "<stem>_aug_<n>" were written by an earlier run of this cell.
    # Skipping them keeps a re-run from augmenting augmented images again.
    _, marker, suffix = stem.rpartition("_aug_")
    if marker and suffix.isdigit():
        continue

    label_path = os.path.join(test_lbl_dir, stem + ".txt")
    if not os.path.exists(label_path):
        continue

    # Only images that contain nothing but crop objects are augmented (25 attempts each).
    class_type = get_class_type(label_path)
    if class_type == 'crop':
        apply_augmentation(image_path, label_path, class_type, 25, test_img_dir, test_lbl_dir)


 Augmenting TEST set


Test: 100%|██████████| 62/62 [00:00<00:00, 90.90it/s] 


In [None]:
print(ignored_images_counter)

0


In [None]:
def check_yolo_pairs(image_dir, label_dir):
    """Print how well the images in ``image_dir`` and the labels in ``label_dir`` pair up.

    Files are matched by base name (file name without extension). The totals are
    printed first, followed by the unmatched images and labels, if any.

    Args:
        image_dir: Folder with ``.jpg`` / ``.jpeg`` / ``.png`` files (any letter case).
        label_dir: Folder with ``.txt`` label files.
    """
    image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    label_files = [f for f in os.listdir(label_dir) if f.endswith('.txt')]

    # Remember the real file names per base name so they can be reported verbatim.
    images_by_base = {}
    for f in image_files:
        images_by_base.setdefault(os.path.splitext(f)[0], []).append(f)

    image_basenames = set(images_by_base)
    label_basenames = set(os.path.splitext(f)[0] for f in label_files)

    missing_labels = image_basenames - label_basenames
    missing_images = label_basenames - image_basenames

    print(f"Total images: {len(image_files)}")
    print(f"Total labels: {len(label_files)}")
    print(f"Images with NO matching label: {len(missing_labels)}")
    print(f"Labels with NO matching image: {len(missing_images)}")

    if missing_labels:
        print("\nImages without labels:")
        for base in sorted(missing_labels):
            # Print the actual file name; the extension is not necessarily ".jpg".
            for f in sorted(images_by_base[base]):
                print(f)

    if missing_images:
        print("\nLabels without images:")
        for f in sorted(missing_images):
            print(f"{f}.txt")

# Run
check_yolo_pairs('/content/yolo_format_dataset/images/train', '/content/yolo_format_dataset/labels/train')
check_yolo_pairs('/content/yolo_format_dataset/images/validation', '/content/yolo_format_dataset/labels/validation')
check_yolo_pairs('/content/yolo_format_dataset/images/test', '/content/yolo_format_dataset/labels/test')

Total images: 2032
Total labels: 2032
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 107
Total labels: 107
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 137
Total labels: 137
Images with NO matching label: 0
Labels with NO matching image: 0


In [None]:
import os

# Check
def check_weed_and_crop(file_path):
    """Classify a YOLO label file by the classes it contains.

    The class id is the first whitespace-separated token of each line. It is parsed
    as a number instead of being matched by prefix, so ids such as ``10`` are not
    mistaken for class 1, lines with leading blanks are handled, and ids written as
    ``0.0`` / ``1.0`` are accepted.

    Args:
        file_path: Path of the label file.

    Returns:
        ``"both"``, ``"weed"`` (only class 0), ``"crop"`` (only class 1) or
        ``"neither"``.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    class_ids = set()
    for line in lines:
        parts = line.split()
        if not parts:
            continue  # blank line
        try:
            value = float(parts[0])
        except ValueError:
            continue  # not a numeric class id
        if value.is_integer():
            class_ids.add(int(value))

    has_weed = 0 in class_ids
    has_crop = 1 in class_ids

    if has_weed and has_crop:
        return "both"
    elif has_weed:
        return "weed"
    elif has_crop:
        return "crop"
    else:
        return "neither"

In [None]:
def check_three_folders_for_weed_and_crop(folder1_path, folder2_path, folder3_path):
    """Print weed-only / crop-only / both file counts for three label folders.

    Every ``.txt`` file is classified with ``check_weed_and_crop``. Files reported
    as anything other than "weed", "crop" or "both" are not counted, so they are
    also missing from the totals.

    Args:
        folder1_path, folder2_path, folder3_path: Label folders, reported as
            Folder 1, Folder 2 and Folder 3 in that order.
    """
    def process_folder(folder_path):
        tally = {"weed": 0, "crop": 0, "both": 0}
        for name in os.listdir(folder_path):
            if not name.endswith('.txt'):
                continue
            kind = check_weed_and_crop(os.path.join(folder_path, name)).strip()
            if kind in tally:
                tally[kind] += 1
        return tally["weed"], tally["crop"], tally["both"], sum(tally.values())

    # Running sums in the order weed, crop, both, total.
    grand = [0, 0, 0, 0]

    for number, folder in enumerate((folder1_path, folder2_path, folder3_path), start=1):
        weed, crop, both, total = process_folder(folder)
        print(f"Folder {number}: Weed = {weed}, Crop = {crop}, Both = {both}, Total = {total}")
        grand = [acc + value for acc, value in zip(grand, (weed, crop, both, total))]

    # Totals
    print(f"Grand Total: Weed = {grand[0]}, Crop = {grand[1]}, Both = {grand[2]}, Total = {grand[3]}")

In [None]:
# usage
check_three_folders_for_weed_and_crop(
    '/content/yolo_format_dataset/labels/train',
    '/content/yolo_format_dataset/labels/validation',
    '/content/yolo_format_dataset/labels/test'
)

Folder 1: Weed = 965, Crop = 1014, Both = 53, Total = 2032
Folder 2: Weed = 53, Crop = 52, Both = 2, Total = 107
Folder 3: Weed = 55, Crop = 78, Both = 4, Total = 137
Grand Total: Weed = 1073, Crop = 1144, Both = 59, Total = 2276


In [None]:
import zipfile
import os
import shutil

# Define
output_dir = "/content/yolo_format_dataset"
zip_path = "/content/yolo_augmented_dataset.zip"

# Create zip
def create_zip(source_dir=None, archive_path=None):
    """Zip a folder, keeping the folder itself as the top-level entry of the archive.

    Args:
        source_dir: Folder to archive. Defaults to the notebook-level ``output_dir``.
        archive_path: Path of the zip file to write. Defaults to the notebook-level
            ``zip_path``.

    Returns:
        The path of the archive that was written.
    """
    if source_dir is None:
        source_dir = output_dir
    if archive_path is None:
        archive_path = zip_path

    # normpath removes a trailing separator; with one, dirname() would return the
    # folder itself and the top-level folder name would be missing in the archive.
    source_dir = os.path.normpath(source_dir)
    parent_dir = os.path.dirname(source_dir)
    archive_abs = os.path.abspath(archive_path)

    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it lives inside source_dir.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                arcname = os.path.relpath(file_path, parent_dir)  # preserve the source folder name
                zipf.write(file_path, arcname)
    return archive_path


zip_file = create_zip()
print(f"Zip created: {zip_file}")

Zip created: /content/yolo_augmented_dataset.zip


In [None]:
# Move to Google Drive
destination_path = "/content/drive/MyDrive/MV Project/Augmentation"

# Create the folder first: if it is missing, shutil.move would rename the zip to a
# *file* called "Augmentation". Moving to an explicit target file also lets this
# cell be re-run, since moving into a folder that already holds a file of that
# name raises an error.
os.makedirs(destination_path, exist_ok=True)
shutil.move(zip_file, os.path.join(destination_path, os.path.basename(zip_file)))

print(f"Zip file moved to Google Drive: {destination_path}")


Zip file moved to Google Drive: /content/drive/MyDrive/MV Project/Augmentation


# Image Preprocessing


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from zipfile import ZipFile
# Augmented dataset archive on Drive, and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/MV Project/Augmentation/yolo_augmented_dataset.zip'
extract_path = '/content/new'

# Unpack every member of the archive below extract_path.
with ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

In [None]:
!pip install -q opencv-python
!pip install -q albumentations

In [None]:
import os
import cv2
import shutil
from zipfile import ZipFile
import numpy as np
from tqdm import tqdm
import zipfile

In [None]:
# Input and output
input_img_dir = "/content/new/yolo_format_dataset/images"
output_img_dir = "/content/yolo_format_dataset/images"

# Create
os.makedirs(output_img_dir, exist_ok=True)

# Subfolders
subfolders = ["train", "validation", "test"]

In [None]:
# Image processing function
def process_image(img, clip_limit=2.0, tile_grid_size=(8, 8)):
    """Boost local contrast with CLAHE on the HSV value channel, then sharpen.

    Args:
        img: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2HSV``), e.g. the result of ``cv2.imread``.
        clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``. The default
            keeps the value this notebook has always used.
        tile_grid_size: ``tileGridSize`` passed to ``cv2.createCLAHE``.

    Returns:
        The contrast-enhanced and sharpened image, converted back to BGR.
    """
    # Work in HSV so that only the brightness (V) channel is equalized and
    # hue/saturation are passed through unchanged.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)

    # CLAHE on the V channel
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    v_clahe = clahe.apply(v)

    # Merge and convert back to BGR
    hsv_clahe = cv2.merge((h, s, v_clahe))
    img_clahe = cv2.cvtColor(hsv_clahe, cv2.COLOR_HSV2BGR)

    # 3x3 sharpening kernel; its weights sum to 1 (5 - 4).
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])

    # ddepth=-1 keeps the output depth equal to the input depth.
    sharpened = cv2.filter2D(img_clahe, -1, kernel)

    return sharpened

In [None]:
source_root = "/content/new/yolo_format_dataset/images"
dest_root = "/content/yolo_format_dataset/images"

# File names are lower-cased before matching, so only lower-case suffixes
# are needed here.
image_extensions = ('.jpg', '.jpeg', '.png')

# Paths that could not be read or written. They are reported at the end
# instead of being dropped silently, so a short output folder is explained.
failed_paths = []

# Process every image of every split and write it under the same file name.
for subfolder in subfolders:
    src_folder = os.path.join(source_root, subfolder)
    dst_folder = os.path.join(dest_root, subfolder)
    os.makedirs(dst_folder, exist_ok=True)

    for filename in os.listdir(src_folder):
        if not filename.lower().endswith(image_extensions):
            continue

        src_path = os.path.join(src_folder, filename)
        dst_path = os.path.join(dst_folder, filename)

        # cv2.imread signals an unreadable file by returning None.
        img = cv2.imread(src_path)
        if img is None:
            failed_paths.append(src_path)
            continue

        # cv2.imwrite returns False when the file could not be written.
        if not cv2.imwrite(dst_path, process_image(img)):
            failed_paths.append(dst_path)

if failed_paths:
    print(f"Warning: {len(failed_paths)} image(s) could not be processed:")
    for failed_path in failed_paths:
        print(f"  {failed_path}")

In [None]:
# Define
# Image roots before and after preprocessing, the split folders to inspect,
# and the file suffixes treated as images.
# NOTE(review): count_images() lower-cases each file name before matching,
# so the '.JPG' entry can never match on its own.
original_root = "/content/new/yolo_format_dataset/images"
processed_root = "/content/yolo_format_dataset/images"
subfolders = ["train", "validation", "test"]
image_exts = ('.jpg', '.jpeg', '.png', '.JPG')

# Function to count
def count_images(root, name, subdirs=None, extensions=None):
    """Print how many image files each split folder under ``root`` contains.

    Args:
        root: Directory that holds one sub-directory per dataset split.
        name: Heading printed above the counts (e.g. "ORIGINAL").
        subdirs: Split folder names to inspect. Defaults to the notebook-level
            ``subfolders`` list so existing two-argument calls keep working.
        extensions: File suffixes counted as images (matched case-
            insensitively). Defaults to the notebook-level ``image_exts``.

    Returns:
        None. The report is printed.
    """
    subdirs = subfolders if subdirs is None else subdirs
    extensions = image_exts if extensions is None else extensions
    if isinstance(extensions, str):
        extensions = (extensions,)
    # str.endswith needs a tuple; file names are lower-cased before matching,
    # so the suffixes are lower-cased as well.
    extensions = tuple(ext.lower() for ext in extensions)

    print(f"\n{name} IMAGE COUNTS:")
    for sub in subdirs:
        subdir = os.path.join(root, sub)
        # isdir (not exists): a stray file carrying the split's name is
        # reported as a missing folder instead of crashing os.listdir.
        if not os.path.isdir(subdir):
            print(f"  {sub}: folder does not exist")
            continue
        count = sum(1 for f in os.listdir(subdir) if f.lower().endswith(extensions))
        print(f"  {sub}: {count} images")

# Print
# Report both trees; the per-split counts are printed one after the other so
# they can be compared after preprocessing.
count_images(original_root, "ORIGINAL")
count_images(processed_root, "PROCESSED")


ORIGINAL IMAGE COUNTS:
  train: 2032 images
  validation: 107 images
  test: 137 images

PROCESSED IMAGE COUNTS:
  train: 2032 images
  validation: 107 images
  test: 137 images


In [None]:
# Label tree and dataset config inside the extracted archive, plus the dataset
# root they are relocated into.
source_folder = "/content/new/yolo_format_dataset/labels"  # Replace with your actual folder
source_file = "/content/new/yolo_format_dataset/data.yaml"  # Replace with your actual file
destination_directory = "/content/yolo_format_dataset"

# Relocate the folder first, then the file; each keeps its own base name.
for _item in (source_folder, source_file):
    _target = os.path.join(destination_directory, os.path.basename(_item))
    shutil.move(_item, _target)

print(f"Moved '{source_folder}' to '{destination_directory}'")
print(f"Moved '{source_file}' to '{destination_directory}'")

Moved '/content/new/yolo_format_dataset/labels' to '/content/yolo_format_dataset'
Moved '/content/new/yolo_format_dataset/data.yaml' to '/content/yolo_format_dataset'


In [None]:
folder_to_delete = "/content/new"

# Remove the extracted tree if it is still present; otherwise just say so.
if not os.path.exists(folder_to_delete):
    print(f"Folder does not exist: {folder_to_delete}")
else:
    shutil.rmtree(folder_to_delete)
    print(f"Deleted folder: {folder_to_delete}")

Deleted folder: /content/new


In [None]:
def check_yolo_pairs(image_dir, label_dir):
    """Report images that lack a label file and label files that lack an image.

    Files are paired by base name (file name without its extension). Images
    are files ending in .jpg/.jpeg/.png in any letter case; labels are files
    ending in ``.txt``.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing the ``.txt`` label files.

    Returns:
        None. The summary and any unmatched file names are printed.
    """
    image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    label_files = [f for f in os.listdir(label_dir) if f.endswith('.txt')]

    # Base name -> actual file name, so an unmatched image is listed under its
    # real name instead of an assumed ".jpg" suffix.
    image_by_basename = {os.path.splitext(f)[0]: f for f in sorted(image_files)}
    label_basenames = set(os.path.splitext(f)[0] for f in label_files)

    missing_labels = set(image_by_basename) - label_basenames
    missing_images = label_basenames - set(image_by_basename)

    print(f"Total images: {len(image_files)}")
    print(f"Total labels: {len(label_files)}")
    print(f"Images with NO matching label: {len(missing_labels)}")
    print(f"Labels with NO matching image: {len(missing_images)}")

    if missing_labels:
        print("\nImages without labels:")
        for basename in sorted(missing_labels):
            print(image_by_basename[basename])

    if missing_images:
        print("\nLabels without images:")
        for basename in sorted(missing_images):
            print(f"{basename}.txt")

# Run the image/label pairing check once per dataset split
# (train, validation, test).
check_yolo_pairs('/content/yolo_format_dataset/images/train', '/content/yolo_format_dataset/labels/train')
check_yolo_pairs('/content/yolo_format_dataset/images/validation', '/content/yolo_format_dataset/labels/validation')
check_yolo_pairs('/content/yolo_format_dataset/images/test', '/content/yolo_format_dataset/labels/test')

Total images: 2032
Total labels: 2032
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 107
Total labels: 107
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 137
Total labels: 137
Images with NO matching label: 0
Labels with NO matching image: 0


In [None]:
# Paths read by create_zip() below as module-level globals:
# the dataset folder to archive and the zip file to write.
output_dir = "/content/yolo_format_dataset"
zip_path = "/content/yolo_enhanced.zip"

# Create zip
def create_zip(source_dir=None, archive_path=None):
    """Zip a directory tree, keeping the directory's own name as the archive root.

    Args:
        source_dir: Directory to archive. Defaults to the notebook-level
            ``output_dir`` so existing ``create_zip()`` calls keep working.
        archive_path: Zip file to write. Defaults to the notebook-level
            ``zip_path``.

    Returns:
        The path of the zip file that was written.
    """
    source_dir = output_dir if source_dir is None else source_dir
    archive_path = zip_path if archive_path is None else archive_path

    # abspath() drops a trailing separator and resolves bare relative names.
    # Without it dirname() can return the directory itself (losing the
    # top-level folder in the archive) or "" (which relpath() rejects).
    parent_dir = os.path.dirname(os.path.abspath(source_dir))

    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # Member names are relative to the parent so the archive
                # unpacks into a single folder named after source_dir.
                arcname = os.path.relpath(file_path, parent_dir)
                zipf.write(file_path, arcname)
    return archive_path

# Archive the dataset folder and print where the zip was written.
zip_file = create_zip()
print(f"Zip created: {zip_file}")

Zip created: /content/yolo_enhanced.zip


In [None]:
# Move to Google Drive
import os
import shutil
zip_path = "/content/yolo_enhanced.zip"
destination_path = "/content/drive/MyDrive/MV Project/Enhanced/"

# Create the Drive folder if it is missing; shutil.move() cannot place the
# zip inside a directory that does not exist yet.
os.makedirs(destination_path, exist_ok=True)

# Name the target file explicitly so that re-running the cell replaces an
# older copy instead of failing because the destination already exists.
shutil.move(zip_path, os.path.join(destination_path, os.path.basename(zip_path)))

print(f"Zip file moved to Google Drive: {destination_path}")

Zip file moved to Google Drive: /content/drive/MyDrive/MV Project/Enhanced/


In [None]:
folder_to_delete = "/content/yolo_format_dataset"

# Remove the local dataset tree if it is still present; otherwise just say so.
if not os.path.exists(folder_to_delete):
    print(f"Folder does not exist: {folder_to_delete}")
else:
    shutil.rmtree(folder_to_delete)
    print(f"Deleted folder: {folder_to_delete}")

Deleted folder: /content/yolo_format_dataset


In [None]:
# Mount Google Drive at /content/drive; the next cell reads the enhanced
# dataset archive from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from zipfile import ZipFile
# Archive stored on Drive and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/MV Project/Enhanced/yolo_enhanced.zip'
extract_path = '/content/'

# Unpack every archive member under extract_path.
with ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
# Image root of the re-extracted dataset, the split folders to inspect, and
# the file suffixes treated as images.
# NOTE(review): count_images() lower-cases each file name before matching,
# so the '.JPG' entry can never match on its own.
original_root = "/content/yolo_format_dataset/images"
subfolders = ["train", "validation", "test"]
image_exts = ('.jpg', '.jpeg', '.png', '.JPG')

# Function to count
def count_images(root, name, subdirs=None, extensions=None):
    """Print how many image files each split folder under ``root`` contains.

    Args:
        root: Directory that holds one sub-directory per dataset split.
        name: Heading printed above the counts (e.g. "ORIGINAL").
        subdirs: Split folder names to inspect. Defaults to the notebook-level
            ``subfolders`` list so existing two-argument calls keep working.
        extensions: File suffixes counted as images (matched case-
            insensitively). Defaults to the notebook-level ``image_exts``.

    Returns:
        None. The report is printed.
    """
    subdirs = subfolders if subdirs is None else subdirs
    extensions = image_exts if extensions is None else extensions
    if isinstance(extensions, str):
        extensions = (extensions,)
    # str.endswith needs a tuple; file names are lower-cased before matching,
    # so the suffixes are lower-cased as well.
    extensions = tuple(ext.lower() for ext in extensions)

    print(f"\n{name} IMAGE COUNTS:")
    for sub in subdirs:
        subdir = os.path.join(root, sub)
        # isdir (not exists): a stray file carrying the split's name is
        # reported as a missing folder instead of crashing os.listdir.
        if not os.path.isdir(subdir):
            print(f"  {sub}: folder does not exist")
            continue
        count = sum(1 for f in os.listdir(subdir) if f.lower().endswith(extensions))
        print(f"  {sub}: {count} images")

# Print the per-split image counts of the re-extracted dataset.
count_images(original_root, "ORIGINAL")


ORIGINAL IMAGE COUNTS:
  train: 2032 images
  validation: 107 images
  test: 137 images


In [None]:
def check_yolo_pairs(image_dir, label_dir):
    """Report images that lack a label file and label files that lack an image.

    Files are paired by base name (file name without its extension). Images
    are files ending in .jpg/.jpeg/.png in any letter case; labels are files
    ending in ``.txt``.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing the ``.txt`` label files.

    Returns:
        None. The summary and any unmatched file names are printed.
    """
    image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    label_files = [f for f in os.listdir(label_dir) if f.endswith('.txt')]

    # Base name -> actual file name, so an unmatched image is listed under its
    # real name instead of an assumed ".jpg" suffix.
    image_by_basename = {os.path.splitext(f)[0]: f for f in sorted(image_files)}
    label_basenames = set(os.path.splitext(f)[0] for f in label_files)

    missing_labels = set(image_by_basename) - label_basenames
    missing_images = label_basenames - set(image_by_basename)

    print(f"Total images: {len(image_files)}")
    print(f"Total labels: {len(label_files)}")
    print(f"Images with NO matching label: {len(missing_labels)}")
    print(f"Labels with NO matching image: {len(missing_images)}")

    if missing_labels:
        print("\nImages without labels:")
        for basename in sorted(missing_labels):
            print(image_by_basename[basename])

    if missing_images:
        print("\nLabels without images:")
        for basename in sorted(missing_images):
            print(f"{basename}.txt")

# Run the image/label pairing check once per dataset split
# (train, validation, test).
check_yolo_pairs('/content/yolo_format_dataset/images/train', '/content/yolo_format_dataset/labels/train')
check_yolo_pairs('/content/yolo_format_dataset/images/validation', '/content/yolo_format_dataset/labels/validation')
check_yolo_pairs('/content/yolo_format_dataset/images/test', '/content/yolo_format_dataset/labels/test')

Total images: 2032
Total labels: 2032
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 107
Total labels: 107
Images with NO matching label: 0
Labels with NO matching image: 0
Total images: 137
Total labels: 137
Images with NO matching label: 0
Labels with NO matching image: 0


In [None]:
import os

# Check
def check_weed_and_crop(file_path):
    """Classify a YOLO label file by the classes it annotates.

    The first whitespace-separated field of each non-empty line is read as
    the class id: ``0`` counts as weed and ``1`` as crop. Any other id, and
    any line whose first field is not numeric, is ignored.

    Args:
        file_path: Path of the ``.txt`` label file to inspect.

    Returns:
        "both", "weed", "crop", or "neither".
    """
    class_ids = set()
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line
            try:
                # Compare the whole id, not just its first character, so that
                # ids such as 10 are not mistaken for class 1. int(float(..))
                # also accepts ids serialized as "0.0".
                class_ids.add(int(float(fields[0])))
            except (ValueError, OverflowError):
                continue  # not a label row

    has_weed = 0 in class_ids
    has_crop = 1 in class_ids

    if has_weed and has_crop:
        return "both"
    elif has_weed:
        return "weed"
    elif has_crop:
        return "crop"
    else:
        return "neither"

In [None]:
def check_three_folders_for_weed_and_crop(folder1_path, folder2_path, folder3_path):
    """Print weed-only / crop-only / both counts for three label folders.

    Every ``.txt`` file in a folder is classified with
    ``check_weed_and_crop``. One summary line is printed per folder, in
    argument order, followed by a grand-total line. Files classified as
    anything other than weed, crop or both are left out of every count.

    Args:
        folder1_path: First label folder (reported as "Folder 1").
        folder2_path: Second label folder (reported as "Folder 2").
        folder3_path: Third label folder (reported as "Folder 3").
    """
    def tally(folder_path):
        # Per-category file counts for one folder, plus their sum.
        counts = {"weed": 0, "crop": 0, "both": 0}
        label_names = [entry for entry in os.listdir(folder_path) if entry.endswith('.txt')]
        for label_name in label_names:
            category = check_weed_and_crop(os.path.join(folder_path, label_name)).strip()
            if category in counts:
                counts[category] += 1
        return counts["weed"], counts["crop"], counts["both"], sum(counts.values())

    # Running sums in the order (weed, crop, both, total).
    running = [0, 0, 0, 0]
    for position, folder in enumerate((folder1_path, folder2_path, folder3_path), start=1):
        weed, crop, both, total = tally(folder)
        print(f"Folder {position}: Weed = {weed}, Crop = {crop}, Both = {both}, Total = {total}")
        running = [acc + value for acc, value in zip(running, (weed, crop, both, total))]

    total_weed, total_crop, total_both, grand_total = running
    print(f"Grand Total: Weed = {total_weed}, Crop = {total_crop}, Both = {total_both}, Total = {grand_total}")

In [None]:
# Summarise the label files of the train, validation and test splits
# (reported as Folder 1, 2 and 3 respectively).
check_three_folders_for_weed_and_crop(
    '/content/yolo_format_dataset/labels/train',
    '/content/yolo_format_dataset/labels/validation',
    '/content/yolo_format_dataset/labels/test'
)

Folder 1: Weed = 965, Crop = 1014, Both = 53, Total = 2032
Folder 2: Weed = 53, Crop = 52, Both = 2, Total = 107
Folder 3: Weed = 55, Crop = 78, Both = 4, Total = 137
Grand Total: Weed = 1073, Crop = 1144, Both = 59, Total = 2276
