In [47]:
from pathlib import Path
import os
from PIL import Image
import shutil
import random
import matplotlib.pyplot as plt
import ipynbname
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img

In [48]:
print(ipynbname.name())

basic_augmentation


In [49]:
# Base data directory
base_dir = Path('/Users/tehreem/Desktop/Study/Projects/SDS-CP028-smart-leaf/submissions/team-members/tehreem-ansari/data')
source_dir = base_dir / 'ValidCrops'
target_dirs = {
    'train': base_dir /ipynbname.name() / 'train',
    'val': base_dir/ipynbname.name() / 'val',
    'test': base_dir/ipynbname.name() / 'test'
}

In [50]:
#First remove corrupt images before splitting your data into train, validation, and test sets.
#If a corrupt image ends up in any split, it can cause your training or evaluation code to crash or produce errors.
#If corrupt images are removed after splitting, some splits may lose more images than others, leading to imbalanced or unrepresentative datasets

#Remove Corrupt Images
def remove_corrupt_images(directory):
    corrupted_files = []
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            normalized_name = filename.strip().lower()
            if normalized_name.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.gif')):
                file_path = os.path.join(dirpath, filename)
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                except Exception as e:
                    corrupted_files.append(file_path)
                    print(f"Corrupt image found and deleted: {file_path} ({e})")
                    os.remove(file_path)
            else:
                print(f"Unknown file format ignoring: {filename}")
    print(f"Total corrupt images deleted: {len(corrupted_files)}")



remove_corrupt_images(source_dir)

Unknown file format ignoring: .DS_Store
Unknown file format ignoring: .DS_Store
Unknown file format ignoring: .DS_Store
Unknown file format ignoring: DOC-20231219-WA0001.pdf
Total corrupt images deleted: 0


In [51]:
# Create target directories for train, validation, and test splits if they don't exist
for split_dir in target_dirs.values():
    os.makedirs(split_dir, exist_ok=True)


"""
    Return a list of image file paths in the given directory.
    Only files with extensions .jpg, .jpeg, .png (case-insensitive) are included.
    """
def get_images(path: Path) -> list:
    return [file for file in path.iterdir() if file.is_file() and file.suffix.lower() in ['.jpg', '.jpeg', '.png']]



# Iterate over each crop category folder (e.g., corn, rice)
for crop_folder in source_dir.iterdir():
    if crop_folder.is_dir():
        # Iterate over each disease subfolder within the crop folder
        for disease_folder in crop_folder.iterdir():
            if disease_folder.is_dir():
                images = get_images(disease_folder)  # Get all images in this disease folder
                random.shuffle(images)  # Shuffle images randomly before splitting

                total = len(images)
                train_split = int(0.8 * total)  # 80% for training
                val_split = int(0.9 * total)    # Next 10% for validation (80% + 10%)

                # Split images into train, validation, and test sets
                train_images = images[:train_split]          # First 80%
                val_images = images[train_split:val_split]   # Next 10%
                test_images = images[val_split:]              # Remaining 10%

                # Organize splits in a dictionary for easy iteration
                split_data = {
                    'train': train_images,
                    'val': val_images,
                    'test': test_images
                }

                class_name = disease_folder.name  # Use disease folder name as class label

                # For each split, copy images to corresponding target directory/class folder
                for split, image_list in split_data.items():
                    class_dir = target_dirs[split] / class_name  # e.g., train/corn_early_blight
                    os.makedirs(class_dir, exist_ok=True)        # Create class folder if it doesn't exist
                    for image_path in image_list:
                        shutil.copy(image_path, class_dir / image_path.name)  # Copy image file

print("Data split and copied successfully.")

Data split and copied successfully.


In [52]:
# Counting from directory instead of dataset, this is fast
def count_images_in_train(train_dir):
    image_counts = {}
    # Traverse through all class directories in train
    for class_dir in train_dir.iterdir():
        if class_dir.is_dir():
            # Count the images in each class folder
            image_count = len([file for file in class_dir.iterdir() if file.is_file() and file.suffix.lower() in ['.jpg', '.jpeg', '.png']])
            image_counts[class_dir.name] = image_count
    
    return image_counts

In [53]:
# Get image count for each class in train set
class_counts = count_images_in_train(target_dirs['train'])

# Print the image count for each class
# for class_name, count in image_counts.items():
def print_count_of_classes(class_counts):
    plt.bar(class_counts.keys(), class_counts.values())
    plt.xticks(rotation=90)
    plt.title("Number of Images per Class")
    plt.show()
    
#print_count_of_classes(class_counts)

In [54]:
#Lists the classes and total images in each class
def list_class_count(class_counts):
    for class_name, count in class_counts.items():
        print(f"{class_name}: {count} images")

In [55]:
#Create the dataframes from directories
BATCH_SIZE = 32
IMG_SIZE = (224, 224)  # standard size for CNNs

train_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['train'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['val'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['test'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)

class_names = train_ds.class_names #Need to save class_names before doing normalization since dataset changes from _PrefetchDataset to MapDataset and mapdataset doesnt have class_names


Found 10414 files belonging to 14 classes.
Found 1301 files belonging to 14 classes.
Found 1309 files belonging to 14 classes.


In [56]:
list_class_count(count_images_in_train(target_dirs['train']))
print(train_ds)

Wheat___Brown_Rust: 721 images
Potato___Early_Blight: 800 images
Wheat___Healthy: 892 images
Potato___Late_Blight: 800 images
Wheat___Yellow_Rust: 739 images
Rice___Healthy: 1190 images
Corn___Northern_Leaf_Blight: 788 images
Rice___Brown_Spot: 490 images
Rice___Leaf_Blast: 781 images
Corn___Common_Rust: 953 images
Corn___Healthy: 929 images
Corn___Gray_Leaf_Spot: 410 images
Rice___Neck_Blast: 800 images
Potato___Healthy: 121 images
<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


In [57]:
# Normalize pixel values to [0,1]
normalization_layer = tf.keras.layers.Rescaling(1./255)

train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))

#Verifying if normalization has happened
for images, labels in train_ds.take(1):
    print("Pixel range:", tf.reduce_min(images).numpy(), "-", tf.reduce_max(images).numpy())
    print("Example pixel:", images[0, 0, 0].numpy())  # Top-left pixel of the first image

Pixel range: 0.0 - 1.0
Example pixel: [0.8742297 0.8742297 0.9840337]


In [58]:
# Define the data augmentation pipeline
datagen = ImageDataGenerator(
    rotation_range=30,           # Randomly rotate images up to 30 degrees
    width_shift_range=0.2,       # Randomly shift images horizontally by up to 20% of width
    height_shift_range=0.2,      # Randomly shift images vertically by up to 20% of height
    zoom_range=0.2,              # Randomly zoom in/out by up to 20%
    shear_range=0.2,             # Shear transformations up to 20%
    horizontal_flip=True,        # Randomly flip images horizontally
    fill_mode='nearest'          # Fill in new pixels after transformations with nearest pixel values
)

In [59]:
def augment_images(source_dir, save_dir, datagen, target_count, number_of_copy, keep_original=False):

    image_count = 0  # Counter for total images saved (original + augmented)
    
    image_files = [f for f in source_dir.iterdir()
               if f.is_file() and f.suffix.lower() in ['.jpg', '.jpeg', '.png']]
    
    for image_file in image_files:
        # Stop if we have reached the target number of images
        if image_count >= target_count:
            break

        # Optionally save the original image before augmentation
        if keep_original:
            dest_file = save_dir / image_file.name
            # Copy original image only if it doesn't already exist in save_dir
            if not dest_file.exists():
                shutil.copy(image_file, dest_file)
                image_count += 1  # Increment count for saved original image
                # Check again if target count reached after saving original
                if image_count >= target_count:
                    break

        # Load the image from disk
        img = load_img(image_file)
        # Convert the image to a numpy array
        x = img_to_array(img)
        # Reshape array to add batch dimension: (1, height, width, channels)
        x = x.reshape((1,) + x.shape)

        # Generate augmented images from this original image
        i = 0  # Counter for augmented images generated per original image
        #datagen.flow() method automatically saves each generated augmented image to the folder save_dir
        for batch in datagen.flow(
            x,
            batch_size=1,
            save_to_dir=save_dir,       # Directory to save augmented images
            save_prefix='aug',          # Prefix for saved filenames
            save_format='JPG'           # File format for saved images
        ):
            i += 1
            image_count += 1  # Increment total image count
            # Stop if reached target count or generated required copies for this image
            if image_count >= target_count: #or i >= number_of_copy:
                break


In [60]:
#Use augmentation to increase number of samples in Potato___Healthy 
'''
Original count:

Wheat___Brown_Rust: 721 images
Potato___Early_Blight: 800 images
Wheat___Healthy: 892 images
Potato___Late_Blight: 800 images
Wheat___Yellow_Rust: 739 images
Rice___Healthy: 1190 images
Corn___Northern_Leaf_Blight: 788 images
Rice___Brown_Spot: 490 images
Rice___Leaf_Blast: 781 images
Corn___Common_Rust: 953 images
Corn___Healthy: 929 images
Corn___Gray_Leaf_Spot: 410 images
Rice___Neck_Blast: 800 images
Potato___Healthy: 121 images
'''
potato_source_dir = target_dirs['train']/'Potato___Healthy'  # change to your dataset path
potato_target_count = 6 * len(list(potato_source_dir.glob('*.JPG')))  # 6x the dataset

augment_images(potato_source_dir, potato_source_dir, datagen, potato_target_count, 2, True)
list_class_count(count_images_in_train(target_dirs['train']))

Wheat___Brown_Rust: 721 images
Potato___Early_Blight: 800 images
Wheat___Healthy: 892 images
Potato___Late_Blight: 800 images
Wheat___Yellow_Rust: 739 images
Rice___Healthy: 1190 images
Corn___Northern_Leaf_Blight: 788 images
Rice___Brown_Spot: 490 images
Rice___Leaf_Blast: 781 images
Corn___Common_Rust: 953 images
Corn___Healthy: 929 images
Corn___Gray_Leaf_Spot: 410 images
Rice___Neck_Blast: 800 images
Potato___Healthy: 820 images


In [61]:
Corn___Gray_Leaf_Spot_source_dir = target_dirs['train']/'Corn___Gray_Leaf_Spot'  # change to your dataset path
Corn___Gray_Leaf_Spot_target_count = 1 * len(list(Corn___Gray_Leaf_Spot_source_dir.glob('*.JPG')))  # One set of aug, hence doubling the dataset
#Some of the images are in jpg and not JPG, hence they are not considered
augment_images(Corn___Gray_Leaf_Spot_source_dir, Corn___Gray_Leaf_Spot_source_dir, datagen, Corn___Gray_Leaf_Spot_target_count, 1, True)
list_class_count(count_images_in_train(target_dirs['train']))

Wheat___Brown_Rust: 721 images
Potato___Early_Blight: 800 images
Wheat___Healthy: 892 images
Potato___Late_Blight: 800 images
Wheat___Yellow_Rust: 739 images
Rice___Healthy: 1190 images
Corn___Northern_Leaf_Blight: 788 images
Rice___Brown_Spot: 490 images
Rice___Leaf_Blast: 781 images
Corn___Common_Rust: 953 images
Corn___Healthy: 929 images
Corn___Gray_Leaf_Spot: 760 images
Rice___Neck_Blast: 800 images
Potato___Healthy: 820 images


In [62]:
Rice___Brown_Spot_source_dir = target_dirs['train']/'Rice___Brown_Spot'  # change to your dataset path
original_count = len([f for f in Rice___Brown_Spot_source_dir.glob('*.jpg') 
                      if not f.name.startswith('aug')])  # one set of augmented images

augment_images(Rice___Brown_Spot_source_dir, Rice___Brown_Spot_source_dir, datagen, original_count, 1, True)
list_class_count(count_images_in_train(target_dirs['train']))

Wheat___Brown_Rust: 721 images
Potato___Early_Blight: 800 images
Wheat___Healthy: 892 images
Potato___Late_Blight: 800 images
Wheat___Yellow_Rust: 739 images
Rice___Healthy: 1190 images
Corn___Northern_Leaf_Blight: 788 images
Rice___Brown_Spot: 969 images
Rice___Leaf_Blast: 781 images
Corn___Common_Rust: 953 images
Corn___Healthy: 929 images
Corn___Gray_Leaf_Spot: 760 images
Rice___Neck_Blast: 800 images
Potato___Healthy: 820 images


In [63]:
#now the data is having more or less same samples.
#re reading the train_ds to have all the sampeles
train_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['train'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True
)

Found 11942 files belonging to 14 classes.


In [72]:
#Define the CNN
DROPOUT_RATE = 0.3

from tensorflow.keras import layers, models

num_classes = len(class_names)

model = models.Sequential([
    layers.InputLayer(shape=IMG_SIZE + (3,)),
    layers.Conv2D(32, kernel_size=3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dropout(DROPOUT_RATE), # and keep_prob=0.7
    layers.Dense(num_classes, activation='softmax')
])

model.summary()

In [85]:
#compile the model
from tensorflow.keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


In [86]:
#Set Callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(ipynbname.name()+'.keras', save_best_only=True)
]

In [88]:
#Train the model
#We have ~11900 training images, and a batch size of 32, then: ceil(~11900 / 32) = 374 
#Tensorflow computes: steps_per_epoch = math.ceil(total_training_samples / batch_size)

epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks
)

Epoch 1/20
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 313ms/step - accuracy: 0.4494 - loss: 1.7015 - val_accuracy: 0.1745 - val_loss: 2.5245
Epoch 2/20
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 351ms/step - accuracy: 0.3899 - loss: 1.8638 - val_accuracy: 0.1653 - val_loss: 2.5618
Epoch 3/20
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 342ms/step - accuracy: 0.4331 - loss: 1.6704 - val_accuracy: 0.1453 - val_loss: 2.8945
Epoch 4/20
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 347ms/step - accuracy: 0.5436 - loss: 1.2772 - val_accuracy: 0.0945 - val_loss: 5.6712
