In [2]:
import os
import shutil
import random
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img

In [3]:
# Base data directory
import ipynbname
base_dir = Path('/Users/tehreem/Desktop/Study/Projects/SDS-CP028-smart-leaf/submissions/team-members/tehreem-ansari/data')
source_dir = base_dir / 'ValidCrops'
target_dirs = {
    'train': base_dir /ipynbname.name() / 'train',
    'val': base_dir/ipynbname.name() / 'val',
    'test': base_dir/ipynbname.name() / 'test'
}

In [3]:
#Remove Corrupt Images
def remove_corrupt_images(directory):
    corrupted_files = []
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            normalized_name = filename.strip().lower()
            if normalized_name.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.gif')):
                file_path = os.path.join(dirpath, filename)
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                except Exception as e:
                    corrupted_files.append(file_path)
                    print(f"Corrupt image found and deleted: {file_path} ({e})")
                    os.remove(file_path)
            else:
                print(f"Unknown file format ignoring: {filename}")
    print(f"Total corrupt images deleted: {len(corrupted_files)}")



remove_corrupt_images(source_dir)

Unknown file format ignoring: .DS_Store
Unknown file format ignoring: .DS_Store
Unknown file format ignoring: .DS_Store
Unknown file format ignoring: DOC-20231219-WA0001.pdf
Total corrupt images deleted: 0


In [4]:
# Create target dirs
for split_dir in target_dirs.values():
    os.makedirs(split_dir, exist_ok=True)

# Function to get all image file paths
def get_images(path):
    return [file for file in path.iterdir() if file.is_file() and file.suffix.lower() in ['.jpg', '.jpeg', '.png']]

# Walk through each category (corn, rice, etc.)
for crop_folder in source_dir.iterdir():
    if crop_folder.is_dir():
        for disease_folder in crop_folder.iterdir():
            if disease_folder.is_dir():
                images = get_images(disease_folder)
                random.shuffle(images)

                # Split images
                total = len(images)
                train_end = int(0.8 * total)
                val_end = int(0.9 * total)

                split_data = {
                    'train': images[:train_end],
                    'val': images[train_end:val_end],
                    'test': images[val_end:]
                }

                # Copy images to new folders
                class_name = disease_folder.name
                for split, image_list in split_data.items():
                    class_dir = target_dirs[split] / class_name
                    os.makedirs(class_dir, exist_ok=True)
                    for image_path in image_list:
                        shutil.copy(image_path, class_dir / image_path.name)

print("Data split and copied successfully.")


Data split and copied successfully.


In [4]:
#Create the dataframes from directories
BATCH_SIZE = 32
IMG_SIZE = (224, 224)  # standard size for CNNs

train_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['train'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['val'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    directory=target_dirs['test'],
    labels="inferred",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)
class_names = train_ds.class_names

Found 12502 files belonging to 14 classes.
Found 2496 files belonging to 14 classes.
Found 2492 files belonging to 14 classes.


In [5]:
# Normalize pixel values to [0,1]
normalization_layer = tf.keras.layers.Rescaling(1./255)

train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))

#Verifying if normalization has happened
for images, labels in train_ds.take(1):
    print("Pixel range:", tf.reduce_min(images).numpy(), "-", tf.reduce_max(images).numpy())
    print("Example pixel:", images[0, 0, 0].numpy())  # Top-left pixel of the first image



Pixel range: 0.0 - 1.0
Example pixel: [0.52575034 0.5179072  0.57280916]


2025-05-07 11:59:00.752341: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
#Define the CNN
DROPOUT_RATE = 0.3

from tensorflow.keras import layers, models

#num_classes = len(class_names)

model = models.Sequential([
    layers.InputLayer(shape=IMG_SIZE + (3,)),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dropout(DROPOUT_RATE), # and keep_prob=0.7
    layers.Dense(14, activation='softmax')
])

model.summary()

In [7]:
#compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


In [8]:
#Set Callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
]

In [None]:
#Train the model
#We have 10,656 training images, and a batch size of 32, then: ceil(10656 / 32) = 333
#Tensorflow computes: steps_per_epoch = math.ceil(total_training_samples / batch_size)

epochs = 1
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks
)

In [None]:
#Plot Training History
plt.plot(history.history['accuracy'], label='train acc')
plt.plot(history.history['val_accuracy'], label='val acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#Evaluate Model on Validation Set
val_loss, val_acc = model.evaluate(val_ds)
print(f"Validation accuracy: {val_acc:.2f}")

In [None]:
#confusion matrix
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Get true labels and predictions
y_true = np.concatenate([y for x, y in val_ds], axis=0)
y_pred_probs = model.predict(val_ds)
y_pred = np.argmax(y_pred_probs, axis=1)

cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, xticklabels=class_names, yticklabels=class_names, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()