# In [None]:  (notebook cell marker left over from the Jupyter export)
# ✅ STEP 1: Install & Import Necessary Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import shutil
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import requests, zipfile, io
import numpy as np
from tensorflow.keras.preprocessing import image

# ✅ STEP 2: Download & Extract the Dataset
dataset_url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/hxsnvwty3r-1.zip"
# Base directory the zip archive is extracted into.
# The archive is expected to contain a top-level folder named "MangoLeafBD Dataset",
# so the per-class image folders end up in: extracted_base_dir/MangoLeafBD Dataset/
extracted_base_dir = "/content/extracted_dataset"
os.makedirs(extracted_base_dir, exist_ok=True)

# Root directory that holds one sub-folder per class after extraction
expected_dataset_folder_name = "MangoLeafBD Dataset"
initial_image_source_dir = os.path.join(extracted_base_dir, expected_dataset_folder_name)

# The source tree only counts as populated if at least one file is still inside it.
# Listing the root folder alone is not enough: the class sub-folders can exist while
# being empty (files moved out earlier, interrupted extraction), and the root would
# then look "non-empty" even though there is nothing left to split.
# os.walk yields nothing for a missing directory, so that case is covered as well.
source_has_files = any(filenames for _, _, filenames in os.walk(initial_image_source_dir))

if not source_has_files:
    print("Dataset not found or empty in source directory. Downloading and extracting dataset...")
    # Start from a clean base directory so leftovers of earlier attempts cannot mix in
    if os.path.exists(extracted_base_dir):
        shutil.rmtree(extracted_base_dir)
    os.makedirs(extracted_base_dir, exist_ok=True)

    # Time out instead of hanging forever on a stalled connection, and fail loudly on
    # an HTTP error rather than handing an error page to ZipFile.
    response = requests.get(dataset_url, timeout=60)
    response.raise_for_status()

    # The context manager closes the archive even if extraction raises
    with zipfile.ZipFile(io.BytesIO(response.content)) as dataset_zip:
        dataset_zip.extractall(extracted_base_dir)
    print("✅ Dataset downloaded and extracted")
else:
    print("Dataset already exists in source directory and is populated. Skipping download and extraction.")

# ✅ STEP 3: Organize Dataset into Train & Validation
train_dir = "/content/MangoLeafBD_Train"
val_dir = "/content/MangoLeafBD_Val"

class_names = ["Anthracnose", "Bacterial Canker", "Cutting Weevil", "Die Back", "Gall Midge", "Healthy", "Powdery Mildew", "Sooty Mould"]
val_split = 0.2
# File extensions (lower-case) that are treated as images; anything else is ignored
image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

# Always rebuild the split from scratch so train_dir and val_dir never contain
# leftovers from an earlier run.
print("Cleaning up previous train/validation directories for a fresh split...")
if os.path.exists(train_dir):
    shutil.rmtree(train_dir)
if os.path.exists(val_dir):
    shutil.rmtree(val_dir)

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

print("Splitting dataset into training and validation sets...")
for class_name in class_names:
    class_source_path = os.path.join(initial_image_source_dir, class_name)
    train_class_dir = os.path.join(train_dir, class_name)
    val_class_dir = os.path.join(val_dir, class_name)

    # Create both class folders up front, even if the class turns out to be empty
    os.makedirs(train_class_dir, exist_ok=True)
    os.makedirs(val_class_dir, exist_ok=True)

    try:
        # Keep image files only (drops .DS_Store and other stray files).
        # Sorted because os.listdir order is arbitrary; without a stable input order
        # the fixed random_state below would not give a reproducible split.
        images = sorted(
            img for img in os.listdir(class_source_path)
            if img.lower().endswith(image_extensions)
        )
    except FileNotFoundError:
        print(f"Error: Source directory not found for class: {class_name} at {class_source_path}. This class will be empty in split.")
        continue # Skip to the next class if directory is not found
    except OSError as e:
        print(f"An error occurred while listing files for class {class_name}: {e}. Skipping this class.")
        continue

    if not images:
        print(f"No valid images found for class: {class_name} in {class_source_path}. Skipping split for this class.")
        continue

    if len(images) < 2:
        # train_test_split cannot split a single sample (one side would be empty and
        # it raises ValueError), so a lone image goes to the training set.
        train_images, val_images = images, []
    else:
        train_images, val_images = train_test_split(images, test_size=val_split, random_state=42)

    # Copy (not move) the files: the train/val folders are wiped at the top of this
    # step, so moving would leave the source empty and a second run would lose every
    # image. Copying keeps the extracted dataset intact and makes this step re-runnable.
    for split_label, split_images, split_class_dir in (
        ("train", train_images, train_class_dir),
        ("val", val_images, val_class_dir),
    ):
        for img in split_images:
            src_path = os.path.join(class_source_path, img)
            dst_path = os.path.join(split_class_dir, img)
            if os.path.exists(src_path): # The file may have vanished since it was listed
                shutil.copy2(src_path, dst_path)
            else:
                print(f"Warning: Source file not found for copy to {split_label}: {src_path}")
print("✅ Dataset split into training and validation")


# ✅ STEP 4: Create Image Generators
batch_size = 32
image_size = (224, 224) # Target size every image is resized to when it is loaded

# Validation images are only rescaled (pixel values multiplied by 1/255), never
# augmented, so evaluation always sees the unmodified pictures.
val_datagen = ImageDataGenerator(rescale=1./255)

# Training images get the same rescaling plus random augmentation
# (rotation, shifts, shear, zoom, horizontal flips).
augmentation_options = {
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest', # How pixels uncovered by a transformation are filled in
}
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Both generators read from their directory with identical settings;
# 'categorical' yields one-hot encoded labels.
directory_flow_options = {
    'target_size': image_size,
    'batch_size': batch_size,
    'class_mode': 'categorical',
}
train_generator = train_datagen.flow_from_directory(train_dir, **directory_flow_options)
val_generator = val_datagen.flow_from_directory(val_dir, **directory_flow_options)

# Report how many images and classes each generator picked up
print(f"Found {train_generator.samples} training images belonging to {train_generator.num_classes} classes.")
print(f"Found {val_generator.samples} validation images belonging to {val_generator.num_classes} classes.")

# ✅ STEP 5: Build the Model
# Input shape of the network: the two image dimensions plus 3 colour channels
input_shape = (*image_size[:2], 3)

model = Sequential()

# First convolutional block; it also declares the input shape of the model
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
model.add(layers.MaxPooling2D((2, 2)))

# Three further conv + max-pool blocks with a growing number of filters
for filters in (64, 128, 256):
    model.add(layers.Conv2D(filters, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))

# Classifier head: flatten the feature maps, one hidden dense layer with dropout
# for regularization, then one softmax unit per class.
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(len(class_names), activation='softmax'))

# Categorical cross-entropy matches the one-hot labels; accuracy is tracked while training
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture overview
model.summary()

# ✅ STEP 6: Train the Model
epochs = 10 # You can increase to 20+ for better results
print(f"\nStarting model training for {epochs} epochs...")
# Training and evaluation both need at least one image in each split
if train_generator.samples == 0 or val_generator.samples == 0:
    print("Error: No samples found in one or both generators. Cannot train the model.")
else:
    # len(generator) is the number of batches in one pass over the data, including the
    # final partial batch. Using samples // batch_size instead would give 0 steps for a
    # split smaller than one batch and would silently drop the last partial batch.
    train_steps = len(train_generator)
    val_steps = len(val_generator)

    history = model.fit(
        train_generator,
        epochs=epochs,
        validation_data=val_generator,
        steps_per_epoch=train_steps,
        validation_steps=val_steps
    )
    print("✅ Model training finished.")

    # ✅ STEP 7: Evaluate and Visualize Results
    print("\nEvaluating model performance...")
    # evaluate() returns [loss, accuracy] because the model was compiled with one metric
    train_metrics = model.evaluate(train_generator, steps=train_steps)
    val_metrics = model.evaluate(val_generator, steps=val_steps)

    print("\n📊 Training Metrics:")
    print(f"Loss: {train_metrics[0]:.4f} | Accuracy: {train_metrics[1]*100:.2f}%")
    print("\n📊 Validation Metrics:")
    print(f"Loss: {val_metrics[0]:.4f} | Accuracy: {val_metrics[1]*100:.2f}%")

    # Plot the per-epoch training history: loss on the left, accuracy on the right
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title('Model Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Val Accuracy')
    plt.title('Model Accuracy Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()
    print("✅ Evaluation and visualization complete.")

    # ✅ STEP 8: Save the Model and Download
    model_filename = "mango_leaf_disease_model.h5"
    model.save(model_filename)
    print(f"✅ Model saved as {model_filename}")

    # The download only works inside Google Colab. Anywhere else the import fails and
    # the model simply stays in the current directory.
    try:
        from google.colab import files
        files.download(model_filename)
    except ImportError:
        print("\nNote: 'google.colab.files' not found. This script is likely not running in Google Colab.")
        print(f"The model '{model_filename}' is saved in the current directory.")
    except Exception as e:
        print(f"An error occurred during file download: {e}")
    else:
        # Runs only when the download call raised nothing. This clause used to print a
        # "training skipped" message, which was wrong here: training has already
        # finished, and the empty-generator case is reported by the `if` branch above.
        print(f"✅ Model '{model_filename}' downloaded to your local machine.")

# Optional: Cleanup the extracted and split datasets if no longer needed
# print("\nCleaning up temporary dataset directories...")
# shutil.rmtree(extracted_base_dir)
# shutil.rmtree(train_dir)
# shutil.rmtree(val_dir)
# print("✅ Cleanup complete.")
