# Tomato Disease Advisory System - Training Pipeline

This notebook runs the full ML pipeline on Google Colab GPU:
1. Clone repository
2. Install dependencies
3. Download & extract dataset
4. Prepare EfficientNet-B4 base model
5. Train with augmentation
6. Evaluate and save metrics (`scores.json`, confusion matrix)
7. Save the trained model and download it (commit and push it from your local machine)

**Runtime:** Set to **GPU** (Runtime > Change runtime type > T4 GPU)

## Step 1: Verify GPU

In [None]:
import tensorflow as tf

print('TensorFlow version:', tf.__version__)

# Query the visible GPUs once and reuse the result for the report and the check.
gpu_devices = tf.config.list_physical_devices('GPU')
print('GPU Available:', gpu_devices)

# Fail fast when no GPU is attached to the runtime.
assert len(gpu_devices) > 0, 'No GPU found! Change runtime to GPU.'

## Step 2: Clone Repository

In [None]:
import os

REPO_URL = 'https://github.com/ShubhamPawar-3333/Classification_of_tomato_plant_disease.git'
PROJECT_DIR = '/content/Classification_of_tomato_plant_disease'

if os.path.exists(PROJECT_DIR):
    %cd {PROJECT_DIR}
    !git pull origin master
else:
    !git clone {REPO_URL}
    %cd {PROJECT_DIR}

!ls -la

## Step 3: Install Dependencies

In [None]:
!pip install -q python-box ensure PyYAML mlflow scikit-learn seaborn

In [None]:
# Add src to Python path
import sys

SRC_DIR = os.path.join(PROJECT_DIR, 'src')
# Only insert once so re-running the cell does not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Verify import works
from tomato_disease_advisor.config import ConfigurationManager
print('Imports successful!')

## Step 4: Download Dataset

Downloads the PlantVillage dataset from Kaggle (Kaggle API credentials are required); a later cell filters it down to the tomato classes.

In [None]:
import gdown
import zipfile

# PlantVillage Tomato Dataset (10 classes)
# You can replace this with your own dataset URL
DATASET_URL = 'https://www.kaggle.com/api/v1/datasets/download/arjuntejaswi/plant-village'

DATA_DIR = os.path.join(PROJECT_DIR, 'artifacts', 'data_ingestion')
DATASET_DIR = os.path.join(DATA_DIR, 'dataset')
os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.exists(DATASET_DIR) or len(os.listdir(DATASET_DIR)) == 0:
    print('Downloading dataset from Kaggle...')
    print('NOTE: If Kaggle download fails, upload dataset manually.')
    print('Alternative: Upload your dataset zip to Google Drive and use gdown.')
    
    # Option 1: Using opendatasets (requires Kaggle credentials)
    try:
        !pip install -q opendatasets
        import opendatasets as od
        od.download('https://www.kaggle.com/datasets/arjuntejaswi/plant-village', DATA_DIR)
    except Exception as e:
        print(f'Kaggle download failed: {e}')
        print('\nPlease upload dataset manually:')
        print('1. Download from: https://www.kaggle.com/datasets/arjuntejaswi/plant-village')
        print('2. Upload the zip file to Colab')
        print('3. Run the manual upload cell below')
else:
    print(f'Dataset already exists at: {DATASET_DIR}')
    print(f'Contents: {os.listdir(DATASET_DIR)}')

In [None]:
# MANUAL UPLOAD OPTION: Run this cell if Kaggle download failed
# Upload your dataset zip file when prompted
# To use it, uncomment the lines below. They rely on `zipfile` and `DATASET_DIR`
# from the download cell above, so run that cell first.

# from google.colab import files
# uploaded = files.upload()
# zip_path = list(uploaded.keys())[0]
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(DATASET_DIR)
# print(f'Extracted to: {DATASET_DIR}')

In [None]:
# Filter only tomato classes from PlantVillage
import shutil

# Fail with an actionable message instead of a bare listdir error further down.
if not os.path.isdir(DATASET_DIR):
    raise FileNotFoundError(
        f'Dataset directory not found: {DATASET_DIR}. '
        'Run the download cell or the manual upload cell first.'
    )

# Find the actual data directory: the first directory in the tree that directly
# contains at least one sub-directory with 'Tomato' in its name.
data_root = DATASET_DIR
for root, dirs, files_list in os.walk(DATASET_DIR):
    if any('Tomato' in d for d in dirs):
        data_root = root
        break

print(f'Data root: {data_root}')

# List tomato classes (directories only, so stray files are never treated as classes)
tomato_classes = sorted(
    d for d in os.listdir(data_root)
    if 'Tomato' in d and os.path.isdir(os.path.join(data_root, d))
)
if not tomato_classes:
    # Stop before creating an empty TOMATO_DIR that a re-run would mistake for a
    # finished filtered dataset.
    raise FileNotFoundError(
        f'No tomato class folders found under: {DATASET_DIR}. '
        'Check that the dataset was downloaded and extracted correctly.'
    )

print(f'\nFound {len(tomato_classes)} tomato classes:')
for cls in tomato_classes:
    count = len(os.listdir(os.path.join(data_root, cls)))
    print(f'  {cls}: {count} images')

# Create filtered dataset with only tomato classes.
# Each class is copied only if its destination is missing, so a re-run after an
# interrupted copy picks up the classes that were not copied yet.
TOMATO_DIR = os.path.join(DATASET_DIR, 'tomato')
os.makedirs(TOMATO_DIR, exist_ok=True)
newly_copied = 0
for cls in tomato_classes:
    src = os.path.join(data_root, cls)
    dst = os.path.join(TOMATO_DIR, cls)
    if not os.path.exists(dst):
        shutil.copytree(src, dst)
        newly_copied += 1

if newly_copied:
    print(f'\nFiltered dataset created at: {TOMATO_DIR}')
else:
    print(f'Filtered dataset already exists at: {TOMATO_DIR}')

# Count the entries of every class directory in the filtered dataset.
total = sum(
    len(os.listdir(os.path.join(TOMATO_DIR, d)))
    for d in os.listdir(TOMATO_DIR)
    if os.path.isdir(os.path.join(TOMATO_DIR, d))
)
print(f'\nTotal images: {total}')

## Step 5: Prepare Base Model (EfficientNet-B4)

In [None]:
from tomato_disease_advisor.utils import read_yaml
from pathlib import Path

# Load configs: both YAML files live inside the cloned project directory.
project_root = Path(PROJECT_DIR)
config = read_yaml(project_root / 'config' / 'config.yaml')
params = read_yaml(project_root / 'params.yaml')

print('Config loaded successfully')

# Echo the model settings that the following cells read from the config.
model_cfg = config.model
print(f'Model: {model_cfg.name}')
print(f'Input shape: {list(model_cfg.input_shape)}')
print(f'Classes: {model_cfg.classes}')

In [None]:
# Build the model
IMAGE_SIZE = params.IMAGE_SIZE
NUM_CLASSES = config.model.classes

# ImageNet-pretrained EfficientNetB4 backbone without its classification top
base_model = tf.keras.applications.EfficientNetB4(
    input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
    weights='imagenet',
    include_top=False,
)

# Freeze every backbone layer so that only the new head is trainable
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

# Classification head, applied in order on top of the backbone output
keras_layers = tf.keras.layers
head_layers = [
    keras_layers.GlobalAveragePooling2D(name='global_avg_pool'),
    keras_layers.BatchNormalization(name='bn_head'),
    keras_layers.Dropout(params.DROPOUT_RATE, name='dropout_1'),
    keras_layers.Dense(params.DENSE_UNITS, activation='relu', name='dense_1'),
    keras_layers.BatchNormalization(name='bn_dense'),
    keras_layers.Dropout(0.2, name='dropout_2'),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)
predictions = keras_layers.Dense(NUM_CLASSES, activation='softmax', name='predictions')(x)

model = tf.keras.Model(
    name='EfficientNetB4_TomatoDisease',
    inputs=base_model.input,
    outputs=predictions,
)

# categorical_crossentropy: the network ends in a softmax over NUM_CLASSES units
optimizer = tf.keras.optimizers.Adam(learning_rate=params.LEARNING_RATE)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Report the overall size and how much of it is actually trained
print(f'Total params: {model.count_params():,}')
trainable = sum(tf.keras.backend.count_params(w) for w in model.trainable_weights)
print(f'Trainable params: {trainable:,}')

## Step 6: Train the Model

In [None]:
# Data generators
#
# No `rescale` is applied: tf.keras EfficientNet models contain their own input
# preprocessing and expect raw pixel values in the [0, 255] range. Dividing by
# 255 here would feed the backbone inputs far outside the range it was trained on.
# NOTE(review): inference code that loads this model must feed [0, 255] pixels
# too. Confirm the project's prediction pipeline does not divide by 255.
augmentation = params.AUGMENTATION
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=augmentation.rotation_range,
    width_shift_range=augmentation.width_shift_range,
    height_shift_range=augmentation.height_shift_range,
    horizontal_flip=augmentation.horizontal_flip,
    zoom_range=augmentation.zoom_range,
    fill_mode=augmentation.fill_mode,
    validation_split=params.VALIDATION_SPLIT
)

# Validation images must not be augmented, otherwise validation metrics (and the
# evaluation below) are measured on randomly distorted images. This generator
# uses the same validation_split on the same directory so that its 'validation'
# subset is the complement of the 'training' subset above.
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    validation_split=params.VALIDATION_SPLIT
)

train_gen = train_datagen.flow_from_directory(
    TOMATO_DIR,
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=params.BATCH_SIZE,
    class_mode='categorical',
    subset='training',
    shuffle=True,
    seed=42
)

# shuffle=False keeps the batch order aligned with `val_gen.classes`, which the
# evaluation cell uses as ground truth.
val_gen = val_datagen.flow_from_directory(
    TOMATO_DIR,
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=params.BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False,
    seed=42
)

print(f'\nTraining samples: {train_gen.samples}')
print(f'Validation samples: {val_gen.samples}')
print(f'Classes: {list(train_gen.class_indices.keys())}')

In [None]:
# Callbacks
callbacks = [
    # Stop when val_loss stops improving and roll back to the best weights seen.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=params.EARLY_STOPPING_PATIENCE,
        restore_best_weights=True,
        verbose=1
    ),
    # Lower the learning rate when val_loss plateaus.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=params.REDUCE_LR_FACTOR,
        patience=params.REDUCE_LR_PATIENCE,
        min_lr=1e-7,
        verbose=1
    )
]

# Train
# Use len(generator), the number of batches including the final partial one,
# instead of samples // batch_size. The floor division dropped the last partial
# batch. For the unshuffled validation generator that meant the same trailing
# samples were never validated, and a validation set smaller than one batch
# produced zero validation steps.
history = model.fit(
    train_gen,
    epochs=params.EPOCHS,
    validation_data=val_gen,
    callbacks=callbacks,
    steps_per_epoch=len(train_gen),
    validation_steps=len(val_gen)
)

In [None]:
# Plot training history
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per metric: (history key, human-readable label)
panels = [
    ('accuracy', 'Accuracy'),
    ('loss', 'Loss'),
]
for panel_ax, (metric_key, metric_label) in zip(axes, panels):
    panel_ax.plot(history.history[metric_key], label=f'Train {metric_label}')
    panel_ax.plot(history.history[f'val_{metric_key}'], label=f'Val {metric_label}')
    panel_ax.set_title(f'Model {metric_label}')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(metric_label)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
# Saved to the current working directory; the download cell picks it up by name.
plt.savefig('training_history.png', dpi=150)
plt.show()

best_val_accuracy = max(history.history["val_accuracy"])
final_train_accuracy = history.history["accuracy"][-1]
print(f'\nBest Val Accuracy: {best_val_accuracy:.4f}')
print(f'Final Train Accuracy: {final_train_accuracy:.4f}')

## Step 7: Evaluate the Model

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Predictions: class probabilities for the validation generator, reduced to the
# index of the most probable class per sample.
predictions = model.predict(val_gen, verbose=1)
y_pred = np.argmax(predictions, axis=1)
# Ground-truth class indices as reported by the generator.
y_true = val_gen.classes

# Class names, plus shortened versions (prefix stripped, max 25 chars) for display
class_names = list(val_gen.class_indices.keys())
short_names = []
for name in class_names:
    stripped = name.replace('Tomato___', '').replace('Tomato_', '')
    short_names.append(stripped[:25])

# Classification report
print('Classification Report:')
print('=' * 60)
report = classification_report(y_true, y_pred, target_names=short_names)
print(report)

In [None]:
# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

fig, ax = plt.subplots(figsize=(12, 10))

# Annotated integer counts; both axes are labelled with the shortened class names.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=short_names,
    yticklabels=short_names,
    ax=ax,
)
sns.heatmap(cm, **heatmap_options)

ax.set_title('Confusion Matrix - Tomato Disease Classification', fontsize=14)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('True', fontsize=12)

# Slant the x tick labels so the class names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# Saved to the current working directory; the download cell picks it up by name.
plt.savefig('confusion_matrix.png', dpi=150)
plt.show()

In [None]:
# Save scores.json
import json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy first, then the support-weighted F1 / precision / recall.
scores = {'accuracy': float(accuracy_score(y_true, y_pred))}
weighted_metrics = (
    ('f1', f1_score),
    ('precision', precision_score),
    ('recall', recall_score),
)
for metric_prefix, metric_fn in weighted_metrics:
    scores[f'{metric_prefix}_weighted'] = float(metric_fn(y_true, y_pred, average='weighted'))

# Written to the current working directory.
with open('scores.json', 'w') as f:
    json.dump(scores, f, indent=4)

print('Scores:')
for k, v in scores.items():
    print(f'  {k}: {v:.4f}')

## Step 8: Save Model

In [None]:
# Save trained model
MODEL_DIR = os.path.join(PROJECT_DIR, 'artifacts', 'training')
os.makedirs(MODEL_DIR, exist_ok=True)

# HDF5 copy: this is the file the download cell sends to the local machine.
model_path = os.path.join(MODEL_DIR, 'model.h5')
model.save(model_path)
print(f'Model saved to: {model_path}')

# Also save as SavedModel format
savedmodel_path = os.path.join(MODEL_DIR, 'saved_model')
try:
    # Keras 2 (bundled with older TensorFlow releases) writes a SavedModel when
    # the path has no file extension.
    model.save(savedmodel_path)
except ValueError:
    # Keras 3 rejects extension-less paths in `save`; `export` is its way to
    # write a SavedModel directory (an inference-oriented artifact).
    model.export(savedmodel_path)
print(f'SavedModel saved to: {savedmodel_path}')

# Model size (of the HDF5 file)
size_mb = os.path.getsize(model_path) / (1024 * 1024)
print(f'Model size: {size_mb:.1f} MB')

## Step 9: Download Model to Local Machine

In [None]:
# Option 1: Download directly from Colab
from google.colab import files

# The trained model first, then the scores and the two figures written by the
# earlier cells (relative paths resolve against the current working directory).
artifact_paths = (
    model_path,
    'scores.json',
    'training_history.png',
    'confusion_matrix.png',
)
for artifact_path in artifact_paths:
    files.download(artifact_path)

In [None]:
# Option 2: Save to Google Drive
# Alternative to the browser download above: uncomment the lines below to copy
# the model, the scores and both figures into a Google Drive folder.
# from google.colab import drive
# drive.mount('/content/drive')
# 
# DRIVE_DIR = '/content/drive/MyDrive/tomato-disease-model'
# os.makedirs(DRIVE_DIR, exist_ok=True)
# 
# import shutil
# shutil.copy(model_path, os.path.join(DRIVE_DIR, 'model.h5'))
# shutil.copy('scores.json', os.path.join(DRIVE_DIR, 'scores.json'))
# shutil.copy('training_history.png', os.path.join(DRIVE_DIR, 'training_history.png'))
# shutil.copy('confusion_matrix.png', os.path.join(DRIVE_DIR, 'confusion_matrix.png'))
# print(f'Files saved to Google Drive: {DRIVE_DIR}')

## Done!

### Next Steps (on your local machine):
1. Place `model.h5` in `artifacts/training/model.h5`
2. Place `scores.json` in the project root
3. Place `confusion_matrix.png` in `artifacts/evaluation/`
4. Commit and push:
```bash
git add scores.json artifacts/
git commit -m 'feat: Add trained model and evaluation results'
git push origin master
```