In [30]:
import os
import random
import shutil

In [31]:
input_dir = "augmented/"
output_train_dir = "train/"
output_test_dir = "test/"

In [32]:
# Split percent (ex : 80% for trainig and 20% for test)
train_split =  0.8 # 80%

In [33]:
# Define the random seed for reproducibility
seed = 100 # change any number
random.seed(seed)

In [34]:
# get data classes
classes = os.listdir(input_dir)
print(classes)

['Potato___Early_blight_aug', 'Potato___healthy_aug', 'Potato___Late_blight_aug']


In [35]:
# Create output directories
for output_dir in [output_train_dir,output_test_dir]:
    os.makedirs(output_dir,exist_ok=True)
    for cls in classes:
        os.makedirs(os.path.join(output_dir,cls),exist_ok=True)

In [36]:
# count files
def count_file(directory):
    return sum([len(files) for r, d, files in os.walk(directory)])

In [37]:
# Count images in each class and total images
def count_images_in_classes(directory,classes):
    class_counts = {}
    total_count = 0
    
    for cls in classes:
        class_dir = os.path.join(directory,cls)
        count = count_file(class_dir)
        class_counts[cls] = count
        total_count += count
    return class_counts, total_count

In [38]:
# Count images in each class and total images before split
print("Number of images before splitting : ")
class_counts_before, total_images_before = count_images_in_classes(input_dir,classes)

for cls, count in class_counts_before.items():
    print(f"{cls}: {count} images")
print(f"Total iamges : {total_images_before}")

Number of images before splitting : 
Potato___Early_blight_aug: 3000 images
Potato___healthy_aug: 456 images
Potato___Late_blight_aug: 3000 images
Total iamges : 6456


In [39]:
def copy_files(file_paths, target_dir):
    for file_path in file_paths:
        os.makedirs(target_dir, exist_ok = True)
        shutil.copy(file_path, os.path.join(target_dir, os.path.basename(file_path)))
        

In [40]:
# Split the data
for cls in classes:
    class_input_dir = os.path.join(input_dir,cls)
    images = [os.path.join(class_input_dir,img) for img in os.listdir(class_input_dir)]
    
    # Shuffle images
    random.shuffle(images)
    
    # Split the images
    train_count = int(len(images)* train_split)
    train_images = images[:train_count]
    test_images = images[train_count:]
    
    # move training images
    print(f"Spliting train images....{cls}")
    copy_files(train_images, os.path.join(output_train_dir,cls))
    # move test images
    print(f"Spliting test images....{cls}")
    copy_files(test_images, os.path.join(output_test_dir, cls))
print("\nDone")

Spliting train images....Potato___Early_blight_aug
Spliting test images....Potato___Early_blight_aug
Spliting train images....Potato___healthy_aug
Spliting test images....Potato___healthy_aug
Spliting train images....Potato___Late_blight_aug
Spliting test images....Potato___Late_blight_aug

Done


In [41]:
# Count images in each class and total images after split
print("Number of images after splitting:\n")
print("Training set : ")

class_counts_train, total_images_train = count_images_in_classes(output_train_dir, classes)

for cls, count in class_counts_train.items():
    print(f"{cls} : {count} images")
print(f"Total training images : {total_images_train}")

print("\nTest set : ")
class_counts_test, total_images_test = count_images_in_classes(output_test_dir, classes)

for cls, count in class_counts_test.items():
    print(f"{cls} : {count} images")
print(f"Total test images : {total_images_test}")


print(f"\nTotal images : {total_images_train + total_images_test}")
print("\nData Splitting Complete")


Number of images after splitting:

Training set : 
Potato___Early_blight_aug : 2400 images
Potato___healthy_aug : 364 images
Potato___Late_blight_aug : 2400 images
Total training images : 5164

Test set : 
Potato___Early_blight_aug : 600 images
Potato___healthy_aug : 92 images
Potato___Late_blight_aug : 600 images
Total test images : 1292

Total images : 6456

Data Splitting Complete
