In [None]:
# Imports
import json
import os
import sys
import time
from pathlib import Path

# Add src to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / 'src'))

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix
from tqdm.notebook import tqdm

# Local imports
from data.dataset import create_dataloaders
from models.cnn import create_custom_cnn
from models.efficientnet import create_efficientnet_b0
from utils.training import load_checkpoint, MetricsCalculator

# Report the framework version and whether a CUDA device is visible,
# so the recorded output documents the environment the numbers came from.
print(f"PyTorch version: {torch.__version__}")
_cuda_visible = torch.cuda.is_available()
print(f"CUDA available: {_cuda_visible}")
if _cuda_visible:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 1. Configuration

In [None]:
# Configuration
class Config:
    """Static settings for this evaluation run: paths, model shape, loader and device."""

    # --- Filesystem layout (all anchored at the parent of project_root) ---
    DATA_DIR = project_root.parent.joinpath('data', 'processed')
    CHECKPOINT_DIR = project_root.parent.joinpath('models_exported')
    RESULTS_DIR = project_root.parent.joinpath('results')

    # --- Checkpoint files to evaluate ---
    EFFICIENTNET_CKPT = CHECKPOINT_DIR.joinpath('efficientnet_best.pth')
    CNN_CKPT = CHECKPOINT_DIR.joinpath('custom_cnn_best.pth')

    # --- Model: number of output classes and the image size given to the loaders ---
    NUM_CLASSES = 38
    IMAGE_SIZE = 224

    # --- DataLoader settings ---
    BATCH_SIZE = 32
    NUM_WORKERS = 4

    # --- Compute device: GPU when one is visible, CPU otherwise ---
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

config = Config()

# The results folder must exist before any figure or table is written into it.
config.RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the settings that determine where data is read from and where output goes.
for _label, _value in (
    ("Data directory", config.DATA_DIR),
    ("Results directory", config.RESULTS_DIR),
    ("Device", config.DEVICE),
):
    print(f"{_label}: {_value}")

## 2. Load Test Data

In [None]:
# Create data loaders
# Build the loaders with the batch size, worker count and image size from Config.
dataloaders, class_names = create_dataloaders(
    data_dir=config.DATA_DIR,
    batch_size=config.BATCH_SIZE,
    num_workers=config.NUM_WORKERS,
    image_size=config.IMAGE_SIZE
)

# Only the held-out test split is used in this notebook.
test_loader = dataloaders['test']

# Both models are built with config.NUM_CLASSES outputs and the per-class tables
# pair metric arrays with class_names, so a mismatch here would either crash
# later or silently mislabel classes. Fail early with a clear message instead.
assert len(class_names) == config.NUM_CLASSES, (
    f"Dataset exposes {len(class_names)} classes but "
    f"config.NUM_CLASSES is {config.NUM_CLASSES}"
)

print(f"Number of classes: {len(class_names)}")
print(f"Test samples: {len(test_loader.dataset)}")
print(f"Test batches: {len(test_loader)}")

## 3. Load Models

In [None]:
# Load EfficientNet-B0
# Build the architecture without pretrained weights; trained weights are
# expected to come from the checkpoint below.
efficientnet = create_efficientnet_b0(num_classes=config.NUM_CLASSES, pretrained=False)

# load_checkpoint is handed the model, so it presumably restores the weights
# in place (TODO confirm); its return value is only used for the printout.
# NOTE: when the file is missing the notebook continues with whatever weights
# the factory produced, so the metrics below would not reflect a trained model.
if not config.EFFICIENTNET_CKPT.exists():
    print(f"Warning: EfficientNet checkpoint not found at {config.EFFICIENTNET_CKPT}")
else:
    ckpt = load_checkpoint(config.EFFICIENTNET_CKPT, efficientnet)
    print(f"Loaded EfficientNet-B0 from epoch {ckpt['epoch']}")
    print(f"  Val Accuracy: {ckpt['best_metric']:.4f}")

# Move to the evaluation device and switch to inference mode.
efficientnet = efficientnet.to(config.DEVICE)
efficientnet.eval()

print(f"EfficientNet-B0 parameters: {efficientnet.get_num_params():,}")

In [None]:
# Load Custom CNN
# Build the baseline CNN in its 'standard' variant.
custom_cnn = create_custom_cnn(num_classes=config.NUM_CLASSES, variant='standard')

# load_checkpoint is handed the model, so it presumably restores the weights
# in place (TODO confirm); its return value is only used for the printout.
# NOTE: when the file is missing the notebook continues with whatever weights
# the factory produced, so the metrics below would not reflect a trained model.
if not config.CNN_CKPT.exists():
    print(f"Warning: Custom CNN checkpoint not found at {config.CNN_CKPT}")
else:
    ckpt = load_checkpoint(config.CNN_CKPT, custom_cnn)
    print(f"Loaded Custom CNN from epoch {ckpt['epoch']}")
    print(f"  Val Accuracy: {ckpt['best_metric']:.4f}")

# Move to the evaluation device and switch to inference mode.
custom_cnn = custom_cnn.to(config.DEVICE)
custom_cnn.eval()

print(f"Custom CNN parameters: {custom_cnn.get_num_params():,}")

## 4. Evaluation Functions

In [None]:
def evaluate_model(model, test_loader, device, model_name="Model"):
    """
    Evaluate a model on the test set.

    Every batch from ``test_loader`` is run through ``model`` with gradients
    disabled. Only the forward pass is timed; moving the inputs to the device
    and post-processing the outputs are excluded from ``total_time``.

    Args:
        model: Module to evaluate; it is switched to eval mode first.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        device: Device the inputs are moved to. May be a string such as
            ``'cuda'`` / ``'cuda:0'`` / ``'cpu'`` or a ``torch.device``.
        model_name: Label shown in the progress bar.

    Returns:
        dict: Evaluation results with keys
            ``predictions`` (arg-max over dim 1 of the outputs),
            ``labels`` (ground truth as yielded by the loader),
            ``probabilities`` (softmax over dim 1 of the outputs),
            ``total_time`` (seconds spent in the forward passes) and
            ``samples_per_second`` (0.0 when nothing was evaluated).
    """
    model.eval()

    # Matches 'cuda', 'cuda:0' and torch.device('cuda', ...) alike; a plain
    # equality test against 'cuda' would skip synchronisation for the latter two.
    on_cuda = str(device).startswith('cuda')

    pred_batches = []
    label_batches = []
    prob_batches = []
    total_time = 0.0

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc=f"Evaluating {model_name}"):
            inputs = inputs.to(device)

            # CUDA work is queued asynchronously: drain anything still pending
            # before starting the clock, and wait for the forward pass to
            # finish before stopping it, so only the model call is measured.
            if on_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(inputs)
            if on_cuda:
                torch.cuda.synchronize()
            total_time += time.perf_counter() - start_time

            probs = torch.softmax(outputs, dim=1)
            preds = outputs.argmax(dim=1)

            # Keep one array per batch and concatenate once at the end rather
            # than extending Python lists one sample at a time.
            pred_batches.append(preds.cpu().numpy())
            label_batches.append(labels.cpu().numpy())
            prob_batches.append(probs.cpu().numpy())

    def _join(batches):
        # np.concatenate rejects an empty list, so handle an empty loader.
        return np.concatenate(batches) if batches else np.array([])

    predictions = _join(pred_batches)

    return {
        'predictions': predictions,
        'labels': _join(label_batches),
        'probabilities': _join(prob_batches),
        'total_time': total_time,
        # Guard against an empty loader, where no time was accumulated.
        'samples_per_second': len(predictions) / total_time if total_time > 0 else 0.0
    }


def get_model_size(model):
    """Return the combined byte size of the model's parameters and buffers, in MB."""
    total_bytes = 0
    # Learnable tensors first ...
    for tensor in model.parameters():
        total_bytes += tensor.nelement() * tensor.element_size()
    # ... then the registered buffers.
    for tensor in model.buffers():
        total_bytes += tensor.nelement() * tensor.element_size()
    # bytes -> KiB -> MiB
    return total_bytes / 1024 / 1024


print("Evaluation functions defined.")

## 5. Evaluate Both Models

In [None]:
# Evaluate EfficientNet-B0
# Run both models over the same test loader on the same device so that the
# accuracy and throughput figures are directly comparable.
efficientnet_results = evaluate_model(
    model=efficientnet,
    test_loader=test_loader,
    device=config.DEVICE,
    model_name="EfficientNet-B0",
)

cnn_results = evaluate_model(
    model=custom_cnn,
    test_loader=test_loader,
    device=config.DEVICE,
    model_name="Custom CNN",
)

print("\nEvaluation complete!")

## 6. Model Comparison Summary

In [None]:
# Calculate metrics
# One calculator is shared by both models.
metrics_calc = MetricsCalculator(num_classes=config.NUM_CLASSES)

efficientnet_metrics = metrics_calc.calculate(
    efficientnet_results['labels'], efficientnet_results['predictions']
)
cnn_metrics = metrics_calc.calculate(
    cnn_results['labels'], cnn_results['predictions']
)


def _summary_column(model, metrics, results):
    """Format one model's column of the comparison table, in 'Metric' row order."""
    return [
        f"{metrics['accuracy']:.4f}",
        f"{metrics['f1_macro']:.4f}",
        f"{metrics['f1_weighted']:.4f}",
        f"{model.get_num_params():,}",
        f"{get_model_size(model):.2f}",
        f"{results['samples_per_second']:.1f}",
    ]


# Row labels first, then one pre-formatted column per model.
comparison_data = {
    'Metric': [
        'Test Accuracy',
        'F1 Score (Macro)',
        'F1 Score (Weighted)',
        'Parameters',
        'Model Size (MB)',
        'Inference Speed (samples/sec)',
    ],
    'EfficientNet-B0': _summary_column(efficientnet, efficientnet_metrics, efficientnet_results),
    'Custom CNN': _summary_column(custom_cnn, cnn_metrics, cnn_results),
}

comparison_df = pd.DataFrame(comparison_data)

_rule = "=" * 70
print("\n" + _rule)
print("MODEL COMPARISON SUMMARY")
print(_rule)
print(comparison_df.to_string(index=False))

In [None]:
# Visualize comparison
# Three side-by-side panels: accuracy, macro F1 and inference throughput.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

models = ['EfficientNet-B0', 'Custom CNN']
colors = ['#2ecc71', '#e74c3c']


def _label_bars(ax, values, offset, fmt):
    """Write each bar's value, formatted with ``fmt``, ``offset`` above its top."""
    for idx, val in enumerate(values):
        ax.text(idx, val + offset, format(val, fmt), ha='center', fontweight='bold')


# Accuracy comparison
accuracies = [efficientnet_metrics['accuracy'], cnn_metrics['accuracy']]
axes[0].bar(models, accuracies, color=colors)
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Test Accuracy')
# Scores are bounded by 1.0; leave headroom so a label on a near-perfect bar
# stays inside the axes instead of running into the title.
axes[0].set_ylim(0, 1.1)
_label_bars(axes[0], accuracies, 0.02, '.4f')

# F1 Score comparison
f1_scores = [efficientnet_metrics['f1_macro'], cnn_metrics['f1_macro']]
axes[1].bar(models, f1_scores, color=colors)
axes[1].set_ylabel('F1 Score (Macro)')
axes[1].set_title('F1 Score Comparison')
axes[1].set_ylim(0, 1.1)
_label_bars(axes[1], f1_scores, 0.02, '.4f')

# Inference speed comparison (unbounded scale, so the label offset is relative)
speeds = [efficientnet_results['samples_per_second'], cnn_results['samples_per_second']]
axes[2].bar(models, speeds, color=colors)
axes[2].set_ylabel('Samples/Second')
axes[2].set_title('Inference Speed')
_label_bars(axes[2], speeds, max(speeds) * 0.02, '.1f')

plt.tight_layout()
plt.savefig(config.RESULTS_DIR / 'model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Confusion Matrices

In [None]:
def plot_confusion_matrix(labels, predictions, class_names, title, save_path=None):
    """Plot a row-normalized confusion matrix and return the raw counts.

    Args:
        labels: Ground-truth class indices.
        predictions: Predicted class indices.
        class_names: Tick labels, one per class. The matrix is built for the
            class indices ``0 .. len(class_names) - 1``, so labels and
            predictions are expected to be integer indices into this list.
        title: Figure title.
        save_path: Optional path; when given, the figure is also saved there.

    Returns:
        The un-normalized confusion matrix (rows: true, columns: predicted),
        of shape ``(len(class_names), len(class_names))``.
    """
    # Pin the label set so the matrix always has one row/column per entry of
    # class_names; otherwise a class missing from the data shrinks the matrix
    # and shifts every following tick label onto the wrong row.
    class_ids = np.arange(len(class_names))
    cm = confusion_matrix(labels, predictions, labels=class_ids)

    # Normalize each row by its support. Rows with no true samples stay at 0
    # rather than becoming NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    fig, ax = plt.subplots(figsize=(16, 14))
    sns.heatmap(
        cm_normalized,
        annot=False,  # Too many classes for annotations
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax
    )
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('True', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=90, fontsize=6)
    plt.setp(ax.get_yticklabels(), rotation=0, fontsize=6)

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.show()
    # Release the large figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return cm

In [None]:
# EfficientNet confusion matrix
# Draw, save and keep the raw confusion counts for EfficientNet-B0.
cm_efficientnet = plot_confusion_matrix(
    labels=efficientnet_results['labels'],
    predictions=efficientnet_results['predictions'],
    class_names=class_names,
    title='EfficientNet-B0 Confusion Matrix',
    save_path=config.RESULTS_DIR / 'efficientnet_confusion_matrix.png',
)

In [None]:
# Custom CNN confusion matrix
# Draw, save and keep the raw confusion counts for the Custom CNN.
cm_cnn = plot_confusion_matrix(
    labels=cnn_results['labels'],
    predictions=cnn_results['predictions'],
    class_names=class_names,
    title='Custom CNN Confusion Matrix',
    save_path=config.RESULTS_DIR / 'custom_cnn_confusion_matrix.png',
)

## 8. Per-Class Performance Analysis

In [None]:
# Create per-class comparison
# One row per class: both models' F1, their signed gap (positive means
# EfficientNet is ahead), ordered from EfficientNet's best class to its worst.
per_class_df = (
    pd.DataFrame({
        'Class': class_names,
        'EfficientNet F1': efficientnet_metrics['f1_per_class'],
        'Custom CNN F1': cnn_metrics['f1_per_class'],
    })
    .assign(Difference=lambda frame: frame['EfficientNet F1'] - frame['Custom CNN F1'])
    .sort_values('EfficientNet F1', ascending=False)
)

print("\nPer-Class F1 Scores:")
print(per_class_df.to_string(index=False))

In [None]:
# Classes where EfficientNet significantly outperforms CNN
# Classes where EfficientNet leads by more than 0.1 F1.
print("\nClasses where EfficientNet is significantly better (diff > 0.1):")
better_efficientnet = per_class_df[per_class_df['Difference'] > 0.1]
# Print "None" for an empty selection, matching the Custom CNN branch below,
# instead of pandas' "Empty DataFrame" dump.
print(better_efficientnet.to_string(index=False) if len(better_efficientnet) > 0 else "None")

# Classes where the Custom CNN leads by more than 0.05 F1.
print("\nClasses where Custom CNN is better (diff < -0.05):")
better_cnn = per_class_df[per_class_df['Difference'] < -0.05]
print(better_cnn.to_string(index=False) if len(better_cnn) > 0 else "None")

In [None]:
# Visualize per-class F1 comparison
# Grouped horizontal bars: one pair of bars per class.
fig, ax = plt.subplots(figsize=(16, 8))

x = np.arange(len(class_names))
width = 0.35

# Ascending order puts EfficientNet's strongest classes at the top of the chart
sorted_df = per_class_df.sort_values('EfficientNet F1', ascending=True)

# Shift each model's bars half a bar-height away from the shared tick position.
for _shift, _column, _legend, _color in (
    (-width / 2, 'EfficientNet F1', 'EfficientNet-B0', '#2ecc71'),
    (width / 2, 'Custom CNN F1', 'Custom CNN', '#e74c3c'),
):
    ax.barh(x + _shift, sorted_df[_column], width, label=_legend, color=_color)

ax.set_xlabel('F1 Score')
ax.set_ylabel('Class')
ax.set_title('Per-Class F1 Score Comparison')
ax.set_yticks(x)
ax.set_yticklabels(sorted_df['Class'], fontsize=6)
ax.legend()
ax.set_xlim(0, 1.1)

plt.tight_layout()
plt.savefig(config.RESULTS_DIR / 'per_class_f1_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Classification Reports

In [None]:
# EfficientNet classification report
print("="*70)
print("EfficientNet-B0 Classification Report")
print("="*70)
# Pass the full label set explicitly: with target_names alone, scikit-learn
# raises when a class never appears in the labels or predictions, because the
# number of names no longer matches the number of classes it inferred.
print(classification_report(
    efficientnet_results['labels'],
    efficientnet_results['predictions'],
    labels=np.arange(len(class_names)),
    target_names=class_names,
    digits=4
))

In [None]:
# Custom CNN classification report
print("="*70)
print("Custom CNN Classification Report")
print("="*70)
# Pass the full label set explicitly: with target_names alone, scikit-learn
# raises when a class never appears in the labels or predictions, because the
# number of names no longer matches the number of classes it inferred.
print(classification_report(
    cnn_results['labels'],
    cnn_results['predictions'],
    labels=np.arange(len(class_names)),
    target_names=class_names,
    digits=4
))

## 10. Save Results

In [None]:
# Save the two comparison tables as CSV
comparison_df.to_csv(config.RESULTS_DIR / 'model_comparison.csv', index=False)
per_class_df.to_csv(config.RESULTS_DIR / 'per_class_comparison.csv', index=False)

# Machine-readable summary. The metric values are wrapped in float() so that
# they are plain Python numbers when handed to json.dump.
results_summary = {
    'efficientnet': {
        'accuracy': float(efficientnet_metrics['accuracy']),
        'f1_macro': float(efficientnet_metrics['f1_macro']),
        'f1_weighted': float(efficientnet_metrics['f1_weighted']),
        'parameters': efficientnet.get_num_params(),
        'model_size_mb': get_model_size(efficientnet),
        'samples_per_second': efficientnet_results['samples_per_second']
    },
    'custom_cnn': {
        'accuracy': float(cnn_metrics['accuracy']),
        'f1_macro': float(cnn_metrics['f1_macro']),
        'f1_weighted': float(cnn_metrics['f1_weighted']),
        'parameters': custom_cnn.get_num_params(),
        'model_size_mb': get_model_size(custom_cnn),
        'samples_per_second': cnn_results['samples_per_second']
    }
}

with open(config.RESULTS_DIR / 'evaluation_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

# List every artifact this notebook writes into the results directory:
# the three files above plus the figures saved by the plotting cells.
print(f"Results saved to: {config.RESULTS_DIR}")
for _artifact in (
    'model_comparison.csv',
    'per_class_comparison.csv',
    'evaluation_results.json',
    'model_comparison.png',
    'per_class_f1_comparison.png',
    'efficientnet_confusion_matrix.png',
    'custom_cnn_confusion_matrix.png',
):
    print(f"  - {_artifact}")

## 11. Conclusions

In [None]:
print("\n" + "="*70)
print("EVALUATION SUMMARY")
print("="*70)

# Determine winner
efficientnet_acc = efficientnet_metrics['accuracy']
cnn_acc = cnn_metrics['accuracy']

print("\n1. ACCURACY:")
print(f"   - EfficientNet-B0: {efficientnet_acc:.4f}")
print(f"   - Custom CNN: {cnn_acc:.4f}")
# Gap between the two accuracies, scaled from a fraction to percent.
acc_diff = abs(efficientnet_acc - cnn_acc) * 100
if efficientnet_acc > cnn_acc:
    print(f"   -> EfficientNet-B0 wins by {acc_diff:.2f}%")
elif cnn_acc > efficientnet_acc:
    print(f"   -> Custom CNN wins by {acc_diff:.2f}%")
else:
    # Equal accuracy is a tie, not a Custom CNN win by 0.00%.
    print("   -> Tie: both models reach the same accuracy")

print("\n2. F1 SCORE (Macro):")
print(f"   - EfficientNet-B0: {efficientnet_metrics['f1_macro']:.4f}")
print(f"   - Custom CNN: {cnn_metrics['f1_macro']:.4f}")

print("\n3. MODEL EFFICIENCY:")
print(f"   - EfficientNet-B0: {efficientnet.get_num_params():,} params, {get_model_size(efficientnet):.2f} MB")
print(f"   - Custom CNN: {custom_cnn.get_num_params():,} params, {get_model_size(custom_cnn):.2f} MB")

print("\n4. INFERENCE SPEED:")
print(f"   - EfficientNet-B0: {efficientnet_results['samples_per_second']:.1f} samples/sec")
print(f"   - Custom CNN: {cnn_results['samples_per_second']:.1f} samples/sec")

print("\n" + "="*70)
print("KEY TAKEAWAYS:")
print("="*70)
print("""
1. Transfer learning with EfficientNet-B0 typically provides better accuracy
   due to pre-trained ImageNet features.

2. The Custom CNN serves as a baseline and demonstrates what can be achieved
   training from scratch on this dataset.

3. For production deployment, consider the trade-off between accuracy and
   model size/inference speed based on your requirements.

4. Classes with similar visual features (e.g., similar plant diseases) may
   have lower F1 scores due to confusion between them.
""")