# Tomato Disease Advisory System — Training Pipeline (v3)

**Fixes from v2:**
- ✅ Fixed preprocessing: `preprocess_input` instead of `rescale=1/255`
- ✅ Separate validation generator (no augmentation)
- ✅ Dynamic EfficientNet selection based on IMAGE_SIZE
- ✅ Stabilized LR: Phase 1 = 3e-4, Phase 2 = 1e-4
- ✅ Simplified classification head
- ✅ Pre-training sanity checks

**Runtime:** Set to **GPU** (Runtime > Change runtime type > T4 GPU)

## Step 1: Verify GPU

In [None]:
import tensorflow as tf

print('TensorFlow version:', tf.__version__)

# Query the visible GPUs once and reuse the result for the report and the guard
gpu_devices = tf.config.list_physical_devices('GPU')
print('GPU Available:', gpu_devices)

# Stop right here when the runtime has no GPU attached
assert len(gpu_devices) > 0, 'No GPU found! Change runtime to GPU.'

## Step 2: Clone Repository

In [None]:
import os

REPO_URL = 'https://github.com/ShubhamPawar-3333/Classification_of_tomato_plant_disease.git'
PROJECT_DIR = '/content/Classification_of_tomato_plant_disease'

if os.path.exists(PROJECT_DIR):
    %cd {PROJECT_DIR}
    !git pull origin master
else:
    !git clone {REPO_URL}
    %cd {PROJECT_DIR}

!ls -la

## Step 3: Install Dependencies

In [None]:
# Quietly (-q) install the extra packages. scikit-learn and seaborn are imported
# by the evaluation cells; the others are presumably required by the project's
# own `src` package (not verified from this notebook).
!pip install -q python-box ensure PyYAML mlflow scikit-learn seaborn

In [None]:
# Put the project's `src` folder at the front of the import path so the
# project package below can be imported
import sys
from pathlib import Path

sys.path.insert(0, os.path.join(PROJECT_DIR, 'src'))

from tomato_disease_advisor.utils import read_yaml

# Read the two YAML files that drive the rest of the notebook
project_root = Path(PROJECT_DIR)
config = read_yaml(project_root / 'config' / 'config.yaml')
params = read_yaml(project_root / 'params.yaml')
print('Config loaded successfully')

## Step 4: Download Dataset

In [None]:
import zipfile

DATA_DIR = os.path.join(PROJECT_DIR, 'artifacts', 'data_ingestion')
DATASET_DIR = os.path.join(DATA_DIR, 'dataset')
os.makedirs(DATASET_DIR, exist_ok=True)

if not os.path.exists(DATASET_DIR) or len(os.listdir(DATASET_DIR)) == 0:
    print('Downloading dataset from Kaggle...')
    try:
        !pip install -q opendatasets
        import opendatasets as od
        od.download('https://www.kaggle.com/datasets/arjuntejaswi/plant-village', DATASET_DIR)
    except Exception as e:
        print(f'Kaggle download failed: {e}')
        print('\nPlease upload dataset manually using the cell below.')
else:
    print(f'Dataset already exists at: {DATASET_DIR}')

In [None]:
# MANUAL UPLOAD OPTION: Uncomment and run if Kaggle download failed
# from google.colab import files
# uploaded = files.upload()
# zip_path = list(uploaded.keys())[0]
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(DATASET_DIR)
# print(f'Extracted to: {DATASET_DIR}')

In [None]:
# Filter only tomato classes from PlantVillage
import shutil

# Destination of the tomato-only copy. It lives INSIDE DATASET_DIR, so the
# search below must skip it; otherwise a re-run could pick our own output
# folder as the "raw" data root.
TOMATO_DIR = os.path.join(DATASET_DIR, 'tomato')

# Find the actual data directory: the first folder that directly contains
# sub-folders with 'Tomato' in their name
data_root = DATASET_DIR
for root, dirs, _files in os.walk(DATASET_DIR):
    # Pruning `dirs` in place stops os.walk from descending into TOMATO_DIR
    dirs[:] = [d for d in dirs if os.path.join(root, d) != TOMATO_DIR]
    if any('Tomato' in d for d in dirs):
        data_root = root
        break

print(f'Data root: {data_root}')

# List tomato classes (directories only, so stray files are never treated as classes)
tomato_classes = sorted(
    d for d in os.listdir(data_root)
    if 'Tomato' in d and os.path.isdir(os.path.join(data_root, d))
)
if not tomato_classes:
    # Fail early with a clear message instead of silently building an empty dataset
    raise FileNotFoundError(
        f'No tomato class folders found under {DATASET_DIR}. '
        'Download or upload the PlantVillage dataset before running this cell.'
    )

print(f'\nFound {len(tomato_classes)} tomato classes:')
total_images = 0
for cls in tomato_classes:
    count = len(os.listdir(os.path.join(data_root, cls)))
    total_images += count
    print(f'  {cls}: {count} images')

# Create filtered dataset with only tomato classes.
# The check is per class (not per TOMATO_DIR), so a run that was interrupted
# between classes, or that previously created an empty TOMATO_DIR, is completed
# on the next run. NOTE: a class folder interrupted mid-copy is still considered
# done; delete that folder manually to force a fresh copy.
os.makedirs(TOMATO_DIR, exist_ok=True)
copied_classes = 0
for cls in tomato_classes:
    src = os.path.join(data_root, cls)
    dst = os.path.join(TOMATO_DIR, cls)
    if not os.path.exists(dst):
        shutil.copytree(src, dst)
        copied_classes += 1

if copied_classes:
    print(f'\nFiltered dataset created at: {TOMATO_DIR}')
else:
    print(f'Filtered dataset already exists at: {TOMATO_DIR}')

print(f'\nTotal images: {total_images}')

## Step 5: Dynamic EfficientNet Selection

Backbone is chosen **automatically** based on `IMAGE_SIZE` in `params.yaml`:
- 224 → EfficientNetB0
- 240 → EfficientNetB1
- 260 → EfficientNetB2
- 300 → EfficientNetB3
- 380 → EfficientNetB4

In [None]:
IMAGE_SIZE = params.IMAGE_SIZE
NUM_CLASSES = config.model.classes

# Input resolution -> (display name, constructor) for every supported backbone
EFFICIENTNET_MAP = {
    size: (variant, getattr(tf.keras.applications, variant))
    for size, variant in (
        (224, 'EfficientNetB0'),
        (240, 'EfficientNetB1'),
        (260, 'EfficientNetB2'),
        (300, 'EfficientNetB3'),
        (380, 'EfficientNetB4'),
    )
}

assert IMAGE_SIZE in EFFICIENTNET_MAP, (
    f'IMAGE_SIZE={IMAGE_SIZE} not supported. Valid: {list(EFFICIENTNET_MAP.keys())}'
)

backbone_name, backbone_fn = EFFICIENTNET_MAP[IMAGE_SIZE]
print(f'[Backbone] Selected: {backbone_name} (IMAGE_SIZE={IMAGE_SIZE})')
print(f'[Backbone] Preprocessing: tf.keras.applications.efficientnet.preprocess_input')

# ImageNet-pretrained backbone without its original classification top
base_model = backbone_fn(
    weights='imagenet',
    include_top=False,
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
)

# Phase 1 trains the new head only, so the whole backbone starts frozen
base_model.trainable = False

# Classification head, applied in order: GAP -> Dropout -> Dense -> Output
head_layers = (
    tf.keras.layers.GlobalAveragePooling2D(name='global_avg_pool'),
    tf.keras.layers.Dropout(params.DROPOUT_RATE, name='dropout_1'),
    tf.keras.layers.Dense(params.DENSE_UNITS, activation='relu', name='dense_1'),
)
features = base_model.output
for head_layer in head_layers:
    features = head_layer(features)

output_layer = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax', name='predictions')
predictions = output_layer(features)

model = tf.keras.Model(
    inputs=base_model.input,
    outputs=predictions,
    name=f'{backbone_name}_TomatoDisease',
)

# Report model size; with the backbone frozen only the head weights are trainable
print(f'\nBase model layers: {len(base_model.layers)}')
print(f'Total params: {model.count_params():,}')
trainable = sum(
    tf.keras.backend.count_params(weight) for weight in model.trainable_weights
)
print(f'Trainable params: {trainable:,}')

## Step 6: Prepare Data Generators

**Key fixes:**
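- Uses `preprocess_input` (NOT `rescale=1/255`): Keras EfficientNet models contain their own input rescaling layers, so they expect raw `[0, 255]` pixels and `preprocess_input` is a pass-through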
- **Separate** validation generator with NO augmentation

In [None]:
from tensorflow.keras.applications.efficientnet import preprocess_input

keras_image = tf.keras.preprocessing.image
augmentation = params.AUGMENTATION

# Training pipeline: EfficientNet preprocessing plus random augmentation
train_datagen = keras_image.ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=params.VALIDATION_SPLIT,
    rotation_range=augmentation.rotation_range,
    width_shift_range=augmentation.width_shift_range,
    height_shift_range=augmentation.height_shift_range,
    horizontal_flip=augmentation.horizontal_flip,
    zoom_range=augmentation.zoom_range,
    fill_mode=augmentation.fill_mode,
)

# Validation pipeline: same preprocessing and split fraction, no augmentation
val_datagen = keras_image.ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=params.VALIDATION_SPLIT,
)

# Options shared by both directory iterators
flow_options = dict(
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=params.BATCH_SIZE,
    class_mode='categorical',
    seed=42,
)

train_gen = train_datagen.flow_from_directory(
    TOMATO_DIR, subset='training', shuffle=True, **flow_options
)

# shuffle=False keeps the validation order fixed
val_gen = val_datagen.flow_from_directory(
    TOMATO_DIR, subset='validation', shuffle=False, **flow_options
)

print(f'\nTraining samples: {train_gen.samples}')
print(f'Validation samples: {val_gen.samples}')
print(f'Classes: {list(train_gen.class_indices.keys())}')
print(f'Preprocessing: preprocess_input (NOT rescale=1/255)')
print(f'Validation augmentation: NONE (correct)')

# Pre-training sanity checks: supported resolution, and the configured class
# count must match what was found on disk
assert IMAGE_SIZE in (224, 240, 260, 300, 380)
assert NUM_CLASSES == train_gen.num_classes, (
    f'Class mismatch! Config has {NUM_CLASSES} but data has {train_gen.num_classes}'
)
print(f'\n\u2713 Sanity checks passed')

## Step 7: Phase 1 — Train Head Only

Base model frozen. LR = 3e-4 (stabilized for EfficientNet).
Up to 10 epochs (early stopping may end the phase sooner) give the head a strong starting point.

In [None]:
PHASE1_EPOCHS = 10
PHASE1_LR = 3e-4  # Stabilized (not 1e-3 which is too aggressive for EfficientNet)

phase1_optimizer = tf.keras.optimizers.Adam(learning_rate=PHASE1_LR)
model.compile(
    optimizer=phase1_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

head_param_count = sum(
    tf.keras.backend.count_params(weight) for weight in model.trainable_weights
)
print(f'Phase 1: Training head only ({PHASE1_EPOCHS} epochs, LR={PHASE1_LR})')
print(f'Trainable params: {head_param_count:,}')
print()

# Stop when val_loss has not improved for 5 epochs and roll back to the best weights
phase1_early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

history_phase1 = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=PHASE1_EPOCHS,
    callbacks=[phase1_early_stop],
)

# Figures below are those of the LAST epoch that ran
phase1_log = history_phase1.history
print(f'\nPhase 1 Results:')
print(f'  Train Accuracy: {phase1_log["accuracy"][-1]:.4f}')
print(f'  Val Accuracy:   {phase1_log["val_accuracy"][-1]:.4f}')

## Step 8: Phase 2 — Fine-Tune Top Layers

Unfreeze top 25% of base model. LR = 1e-4 to preserve pretrained weights.

In [None]:
PHASE2_EPOCHS = 20
PHASE2_LR = 1e-4
# Index of the first backbone layer to unfreeze (top 25% of the layers)
UNFREEZE_FROM = int(len(base_model.layers) * 0.75)

# Unfreeze the top of the backbone, with two exceptions that stay frozen:
#   * every layer below UNFREEZE_FROM (generic low-level features), and
#   * every BatchNormalization layer, wherever it sits. A trainable BN layer
#     would update its moving mean/variance on the new data and can quickly
#     wreck the pretrained features; a frozen BN layer runs in inference mode
#     and keeps its ImageNet statistics.
base_model.trainable = True
for index, layer in enumerate(base_model.layers):
    is_batch_norm = isinstance(layer, tf.keras.layers.BatchNormalization)
    if index < UNFREEZE_FROM or is_batch_norm:
        layer.trainable = False

unfrozen = sum(1 for layer in base_model.layers if layer.trainable)
print(f'Unfroze {unfrozen} / {len(base_model.layers)} base model layers')

# Changes to `trainable` only take effect after re-compiling
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=PHASE2_LR),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

trainable_count = sum(tf.keras.backend.count_params(w) for w in model.trainable_weights)
print(f'Trainable params now: {trainable_count:,}')
print(f'\nPhase 2: Fine-tuning ({PHASE2_EPOCHS} epochs, LR={PHASE2_LR})')

history_phase2 = model.fit(
    train_gen,
    epochs=PHASE2_EPOCHS,
    validation_data=val_gen,
    callbacks=[
        # Stop once val_loss stalls and roll back to the best weights seen
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=params.EARLY_STOPPING_PATIENCE,
            restore_best_weights=True,
            verbose=1
        ),
        # Shrink the learning rate when val_loss plateaus
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=params.REDUCE_LR_FACTOR,
            patience=params.REDUCE_LR_PATIENCE,
            min_lr=1e-7,
            verbose=1
        )
    ]
)

# "Train/Val Accuracy" are the last epoch that ran; "Best Val Acc" is the
# maximum over all Phase 2 epochs
print(f'\nPhase 2 Results:')
print(f'  Train Accuracy: {history_phase2.history["accuracy"][-1]:.4f}')
print(f'  Val Accuracy:   {history_phase2.history["val_accuracy"][-1]:.4f}')
print(f'  Best Val Acc:   {max(history_phase2.history["val_accuracy"]):.4f}')

## Step 9: Training History

In [None]:
import matplotlib.pyplot as plt

# Concatenate the per-epoch curves of both phases into one timeline per metric
acc, val_acc, loss, val_loss = (
    history_phase1.history[key] + history_phase2.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per metric: (axis, training curve, validation curve, metric label)
panels = (
    (axes[0], acc, val_acc, 'Accuracy'),
    (axes[1], loss, val_loss, 'Loss'),
)

for panel, train_curve, val_curve, metric in panels:
    panel.plot(train_curve, label=f'Train {metric}', linewidth=2)
    panel.plot(val_curve, label=f'Val {metric}', linewidth=2)
    # Dashed marker halfway between the last Phase 1 and first Phase 2 epoch
    phase_boundary = len(history_phase1.history[metric.lower()]) - 0.5
    panel.axvline(x=phase_boundary, color='r',
                  linestyle='--', alpha=0.7, label='Phase 1 \u2192 2')
    panel.set_title(f'Model {metric}', fontsize=14)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(metric)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_history.png', dpi=150)
plt.show()

print(f'\nFinal Best Val Accuracy: {max(val_acc):.4f}')

## Step 10: Evaluate

In [None]:
import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
import seaborn as sns

# Guard: the model's output width must equal the generator's class count
output_classes = model.output_shape[-1]
print(f'Generator class indices: {val_gen.class_indices}')
print(f'Model output shape: {model.output_shape}')
print(f'Model output classes: {output_classes}')
assert val_gen.num_classes == output_classes, 'Class count mismatch!'
print('\u2713 Evaluation sanity checks passed\n')

# Rewind the generator so prediction starts from its first batch
val_gen.reset()

# Predicted class = index of the highest softmax score per sample
predictions = model.predict(val_gen, verbose=1)
y_pred = np.argmax(predictions, axis=1)
# Labels as stored by the generator; val_gen was created with shuffle=False
y_true = val_gen.classes

# Display names: drop the 'Tomato' prefix variants and cap at 25 characters
class_names = list(val_gen.class_indices)
short_names = []
for full_name in class_names:
    without_prefix = full_name.replace('Tomato___', '').replace('Tomato_', '')
    short_names.append(without_prefix[:25])

print('Classification Report:')
print('=' * 60)
report_text = classification_report(y_true, y_pred, target_names=short_names)
print(report_text)

In [None]:
# Confusion matrix computed from the true and predicted class indices
cm = confusion_matrix(y_true, y_pred)

fig, ax = plt.subplots(figsize=(12, 10))

# Integer-annotated heatmap with the shortened class names on both axes
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=short_names,
    yticklabels=short_names,
)
sns.heatmap(cm, ax=ax, **heatmap_options)

ax.set_title('Confusion Matrix - Tomato Disease Classification', fontsize=14)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('True', fontsize=12)

# Tilt the x tick labels so long class names do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=150)
plt.show()

In [None]:
# Write the headline validation metrics to scores.json
import json

scores = {'accuracy': float(accuracy_score(y_true, y_pred))}

# The remaining metrics are all support-weighted averages over the classes
weighted_metrics = (
    ('f1', f1_score),
    ('precision', precision_score),
    ('recall', recall_score),
)
for prefix, metric_fn in weighted_metrics:
    scores[f'{prefix}_weighted'] = float(metric_fn(y_true, y_pred, average='weighted'))

with open('scores.json', 'w') as f:
    json.dump(scores, f, indent=4)

print('Scores:')
for metric_name, value in scores.items():
    print(f'  {metric_name}: {value:.4f}')

## Step 11: Save & Download Model

In [None]:
# Save the trained model under artifacts/training inside the project
MODEL_DIR = os.path.join(PROJECT_DIR, 'artifacts', 'training')
model_path = os.path.join(MODEL_DIR, 'model.keras')

os.makedirs(MODEL_DIR, exist_ok=True)
model.save(model_path)
print(f'Model saved to: {model_path}')

# Report the on-disk size in MiB
bytes_per_mb = 1024 * 1024
size_mb = os.path.getsize(model_path) / bytes_per_mb
print(f'Model size: {size_mb:.1f} MB')

In [None]:
from google.colab import files

# Download the model, the metrics file and both plots, in that order
artifacts_to_download = (
    model_path,
    'scores.json',
    'training_history.png',
    'confusion_matrix.png',
)
for artifact in artifacts_to_download:
    files.download(artifact)

In [None]:
# Option 2: Save to Google Drive (uncomment to use)
# from google.colab import drive
# drive.mount('/content/drive')
# DRIVE_DIR = '/content/drive/MyDrive/tomato-disease-model'
# os.makedirs(DRIVE_DIR, exist_ok=True)
# import shutil
# shutil.copy(model_path, os.path.join(DRIVE_DIR, 'model.keras'))
# shutil.copy('scores.json', os.path.join(DRIVE_DIR, 'scores.json'))
# shutil.copy('training_history.png', os.path.join(DRIVE_DIR, 'training_history.png'))
# shutil.copy('confusion_matrix.png', os.path.join(DRIVE_DIR, 'confusion_matrix.png'))
# print(f'Files saved to Google Drive: {DRIVE_DIR}')

## Done!

**On your local machine:**
1. Place `model.keras` in `artifacts/training/`
2. Place `scores.json` in project root
3. Place plots in `artifacts/evaluation/`
4. Commit and push