# Cassava Leaf Disease Detection

#### Section 1 - Data Exploration

In [1]:
# Import Libraries

import numpy as np              # For numerical operations
import pandas as pd             # For data manipulation
import json                     # To read JSON files
import matplotlib.pyplot as plt # For creating plots
import seaborn as sns           # For beautiful visualizations
import os                       # For file operations

# Set style for plots to look professional
sns.set_style('whitegrid')

# TensorFlow and Keras for deep learning
import tensorflow as tf
from tensorflow import keras

# Set random seeds so results are reproducible
# This means if you run the code again, you get same results
#
# tf.keras.utils.set_random_seed seeds Python's built-in `random` module,
# NumPy, and TensorFlow in a single call. Seeding only NumPy and TensorFlow
# leaves Python's `random` unseeded, so any code path that draws from it
# would still vary between runs.
# NOTE(review): seeding alone does not force deterministic GPU kernels;
# consider tf.config.experimental.enable_op_determinism() if bit-exact
# reruns are required - confirm the performance cost is acceptable.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

print("✓ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

✓ All libraries imported successfully!
TensorFlow version: 2.20.0


In [2]:
# Configuration
# Instead of hardcoding values throughout the code, we put all settings here
# This makes it easy to change parameters in one place

class Config:
    """
    Configuration class containing all hyperparameters and settings.

    Think of this as the "control panel" for this entire project.
    Change values here instead of searching through code.

    Every setting is a class attribute, so it can be read from the class
    itself (``Config.BATCH_SIZE``) or from an instance (``Config().BATCH_SIZE``).
    """

    # ----- FILE PATHS -----
    # Where your data is located (CHANGE THESE to match your setup)
    TRAIN_CSV = 'train.csv'
    TRAIN_IMAGES_DIR = 'train_images'
    LABEL_MAP_JSON = 'label_num_to_disease_map.json'

    # ----- IMAGE SETTINGS -----
    IMG_HEIGHT = 224  # Resize all images to this height
    IMG_WIDTH = 224   # Resize all images to this width
    # Derived from the two values above (height first, then width) so that
    # editing IMG_HEIGHT / IMG_WIDTH can never leave IMG_SIZE out of sync.
    IMG_SIZE = (IMG_HEIGHT, IMG_WIDTH)

    # Why 224x224?
    # - Standard size for pre-trained models (ImageNet)
    # - Good balance between detail and computation speed
    # - Larger = more detail but slower, Smaller = faster but less detail

    # ----- DATASET SPLIT RATIOS -----
    TRAIN_SPLIT = 0.70  # 70% for training
    VAL_SPLIT = 0.15    # 15% for validation
    TEST_SPLIT = 0.15   # 15% for testing

    # Why this split?
    # - Training (70%): Needs most data to learn patterns
    # - Validation (15%): Monitor performance during training
    # - Test (15%): Final evaluation on completely unseen data

    # Fail at definition time if someone edits one ratio and forgets the
    # others. A tolerance is used because the ratios are floats.
    assert abs(TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT - 1.0) < 1e-9, \
        "TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT must sum to 1.0"

    # ----- MODEL SETTINGS -----
    NUM_CLASSES = 5  # We have 5 classes (4 diseases + healthy)
    BATCH_SIZE = 32  # Process 32 images at a time

    # Why batch_size = 32?
    # - Fits in most GPU memory
    # - Not too small (slow) or too large (less generalizable)
    # - Standard choice in deep learning

    # ----- RANDOM SEED -----
    SEED = 42  # For reproducibility

# Shared settings object referenced by the rest of the notebook
config = Config()

# Echo the key settings as a short summary, one item per output line
print(
    "✓ Configuration loaded!",
    f"  Image size: {config.IMG_SIZE}",
    f"  Number of classes: {config.NUM_CLASSES}",
    f"  Batch size: {config.BATCH_SIZE}",
    sep="\n",
)

✓ Configuration loaded!
  Image size: (224, 224)
  Number of classes: 5
  Batch size: 32


In [3]:
# Data Loading

def load_disease_labels(json_path=None):
    """
    Load the mapping of class numbers (0-4) to disease names.

    This function reads the JSON file, extracts the label mapping, prints
    it, and returns it.

    Parameters:
    -----------
    json_path : str or path-like, optional
        JSON file to read. When omitted, ``config.LABEL_MAP_JSON`` is used.

    Returns:
    --------
    label_map : dict
        The mapping as stored in the file. JSON object keys are always
        strings, so the keys are '0', '1', ... (NOT the integers 0, 1, ...).
        Use ``int(key)`` / ``str(label)`` when matching against numeric labels.

    Raises:
    -------
    FileNotFoundError
        If the JSON file does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.

    Example:
    --------
    If JSON contains: {"root": {"0": "Cassava Bacterial Blight", ...}}
    We extract: {"0": "Cassava Bacterial Blight", ...}
    """

    # Resolve the default lazily so the global config is only needed when
    # the caller does not supply a path.
    if json_path is None:
        json_path = config.LABEL_MAP_JSON

    print("\n" + "="*70)
    print("LOADING DISEASE LABELS")
    print("="*70)

    # Open and read the JSON file.
    # The encoding is explicit: without it, open() uses the platform's
    # locale encoding, which can mis-decode non-ASCII text in the file.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)  # Load JSON as Python dictionary

    # Extract the actual mapping from 'root' key.
    # Also accept a file whose top level is the mapping itself (no 'root'
    # wrapper) instead of failing with a KeyError.
    label_map = data['root'] if 'root' in data else data

    # Print the mapping so you can see what we loaded
    print("\nDisease Classes:")
    print("-" * 70)
    for class_num, disease_name in label_map.items():
        print(f"  Class {class_num}: {disease_name}")

    print("\n✓ Labels loaded successfully!")
    return label_map

In [4]:
def load_training_data(csv_path=None):
    """
    Load the CSV file containing image filenames and their labels.

    The CSV has two columns:
    - image_id: filename of the image (e.g., '1000015157.jpg')
    - label: class number (0-4)

    Besides loading, this prints the row count, the first five rows, and a
    per-column count of missing values. Missing values only produce a
    warning message; no rows are dropped or modified.

    Parameters:
    -----------
    csv_path : str or path-like, optional
        CSV file to read. When omitted, ``config.TRAIN_CSV`` is used.

    Returns:
    --------
    df : pandas DataFrame
        Contains all image filenames and labels

    Raises:
    -------
    FileNotFoundError
        If the CSV file does not exist.
    """

    # Resolve the default lazily so the global config is only needed when
    # the caller does not supply a path.
    if csv_path is None:
        csv_path = config.TRAIN_CSV

    print("\n" + "="*70)
    print("LOADING TRAINING DATA CSV")
    print("="*70)

    # Read CSV file into pandas DataFrame
    # A DataFrame is like an Excel spreadsheet in Python
    df = pd.read_csv(csv_path)

    # Show basic information
    print(f"\n✓ Loaded {len(df)} images")
    print("\nFirst 5 rows of data:")
    print(df.head())

    # Check for missing values (important data quality check)
    missing = df.isnull().sum()  # one count per column
    print("\nMissing values check:")
    print(missing)

    if missing.sum() == 0:
        print("✓ No missing values found!")
    else:
        print("⚠ WARNING: Missing values detected!")

    return df