In [None]:
!git clone https://github.com/spMohanty/PlantVillage-Dataset.git

Cloning into 'PlantVillage-Dataset'...
remote: Enumerating objects: 163235, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 163235 (delta 2), reused 1 (delta 0), pack-reused 163229 (from 1)[K
Receiving objects: 100% (163235/163235), 2.00 GiB | 32.46 MiB/s, done.
Resolving deltas: 100% (101/101), done.
Updating files: 100% (182401/182401), done.


In [None]:
# --- 1. Import Necessary Libraries ---
# TensorFlow is Google's powerful library for building and training AI models.
import tensorflow as tf
# These are specific tools from TensorFlow to help us with images and building models.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
# These are standard libraries for file operations and data handling.
import os
import json

# --- 2. Define the Path to Your Data ---
# Folder holding one sub-folder of colour images per class.
# The default is the location produced by cloning the dataset inside Google
# Colab. When running anywhere else, set the PLANTVILLAGE_DIR environment
# variable to the equivalent `raw/color` folder instead of editing this cell.
base_dir = os.environ.get('PLANTVILLAGE_DIR', '/content/PlantVillage-Dataset/raw/color')

# --- 3. Create the Data "Assembly Line" (ImageDataGenerator) ---
# Generators fetch images from the class folders, resize them, and hand them
# to the model batch by batch.
#
# Two generators are used on purpose:
#   * `image_generator` rescales AND randomly augments -> training only.
#   * `validation_image_generator` ONLY rescales       -> validation only.
# Feeding validation images through the augmenting generator would randomly
# rotate/shift/zoom/flip them too, so the validation score would be measured
# on distorted images and would change from epoch to epoch for reasons that
# have nothing to do with the model.
#
# Both generators must be given the SAME `validation_split`: that is what keeps
# the "training" and "validation" subsets complementary.
VALIDATION_SPLIT = 0.2   # 80% of the images train the model, 20% validate it.
IMAGE_SIZE = (224, 224)  # Must match the `input_shape` the base model is built with.
BATCH_SIZE = 32          # Number of images fed to the model per step.

# NOTE(review): `rescale=1./255` maps pixels from 0-255 to 0-1. The ImageNet
# weights of MobileNetV2 were trained with
# `tf.keras.applications.mobilenet_v2.preprocess_input` (pixels in -1..1).
# The 0-1 scaling is kept here so that any existing inference code that
# divides by 255 stays valid; switching is worth evaluating separately.
image_generator = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT,

    # === Data Augmentation (training images only) ===
    # Slightly altered copies teach the model to recognize a disease even if
    # the leaf is at a different angle, zoomed in, or flipped.
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation images are only rescaled, never augmented.
validation_image_generator = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

# Training set: folder names become the labels for the images.
train_generator = image_generator.flow_from_directory(
    base_dir,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training'       # The 80% training subset.
)

# Validation set: same folders, same split, but un-augmented images.
validation_generator = validation_image_generator.flow_from_directory(
    base_dir,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',    # The 20% validation subset.
    shuffle=False           # Fixed order: predictions line up with `.classes`.
)

# --- 4. Build the Model Using Transfer Learning ---
# Start from MobileNetV2 with its ImageNet weights, minus the original
# classification head (`include_top=False`), so it acts as a feature extractor.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Freeze the feature extractor: its pre-trained weights are not updated.
base_model.trainable = False

# The new classification head. Only these layers are trained on the plant images.
pooling_layer = GlobalAveragePooling2D()               # Collapses each feature map to one value.
hidden_layer = Dense(1024, activation='relu')          # Fully connected layer on the pooled features.
output_layer = Dense(                                  # One softmax output per class folder.
    train_generator.num_classes,
    activation='softmax',
)

# Wire the head onto the frozen base, layer by layer.
x = pooling_layer(base_model.output)
x = hidden_layer(x)
predictions = output_layer(x)

# The full network: image in, class probabilities out.
model = Model(inputs=base_model.input, outputs=predictions)

# --- 5. Compile and Train the Model ---
# 'compile' sets up the rules for training: the optimizer that updates the
# weights, the loss to minimize, and the metric reported during training.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

EPOCHS = 5  # Number of full passes over the training set.

# Training is long: the recorded run of this notebook logged about 0.4 s per
# step with 1358 steps per epoch, i.e. several minutes per epoch before
# validation. Without a checkpoint, interrupting the cell throws away every
# finished epoch. This callback writes the weights to disk at the end of each
# epoch whenever validation accuracy improves, so the best epoch survives an
# interruption. (Weights-only with a `.weights.h5` name is used because that
# form is accepted by both older and newer Keras releases.)
CHECKPOINT_PATH = 'disease_detector_best.weights.h5'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# 'fit' starts the actual training process.
print("Starting model training...")
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=validation_generator,
    callbacks=[checkpoint_callback],
)
print("Training complete.")

# --- 6. Save Your Trained Brain and its Answer Key ---
# Save the entire model (architecture + weights) to a single file.
model.save('disease_detector.h5')
print("Model saved to disease_detector.h5")

# The model outputs class numbers (0, 1, 2, ...). `class_indices` maps each
# class folder name to its number; it is saved alongside the model so the
# numbers can be translated back into disease names at prediction time.
# (`json` is already imported with the other libraries at the top of this cell.)
class_indices = train_generator.class_indices
with open('class_indices.json', 'w') as f:
    json.dump(class_indices, f)
print("Class indices (the 'answer key') saved to class_indices.json")



Found 43456 images belonging to 38 classes.
Found 10849 images belonging to 38 classes.
Starting model training...


  self._warn_if_super_not_called()


Epoch 1/5
[1m 445/1358[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m6:12[0m 408ms/step - accuracy: 0.7015 - loss: 1.1119

KeyboardInterrupt: 