# Exploratory Data Analysis: PlantVillage Dataset Overview

This notebook provides an initial exploration of the PlantVillage leaf disease dataset, including class distribution analysis and sample image visualization.

## 1. Import Required Libraries

In [None]:
import glob
import os
import random
import statistics

import cv2
import matplotlib.pyplot as plt

# Seed Python's global RNG so that random class/image selection is repeatable
# from one "Restart & Run All" to the next.
SEED = 42
random.seed(SEED)
print("Libraries imported successfully!")

## 2. Define Data Directory Path

In [None]:
# Root folder of the dataset, relative to the notebook's working directory.
DATA_DIR = "../data/raw/PlantVillage"

# Report the configured path and whether it is present on disk.
print(
    f"Data directory: {DATA_DIR}",
    f"Directory exists: {os.path.exists(DATA_DIR)}",
    sep="\n",
)

## 3. Analyze Class Distribution

In [None]:
# Every sub-directory of DATA_DIR is treated as one class; plain files at the
# top level are ignored. Names are sorted alphabetically.
class_folders = sorted(
    entry
    for entry in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, entry))
)

print(f"Total number of classes: {len(class_folders)}")
print("\nClass folders:")
for bullet in map("- {}".format, class_folders):
    print(bullet)

In [None]:
# Count images per class.
#
# Extensions are matched case-insensitively. The previous glob patterns
# ("*.jpg", "*.jpeg", "*.png") are case-sensitive on POSIX systems, so files
# named e.g. "leaf.JPG" were silently left out of the counts.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

class_counts = {}
for class_name in class_folders:
    class_path = os.path.join(DATA_DIR, class_name)
    class_counts[class_name] = sum(
        1
        for file_name in os.listdir(class_path)
        # Skip dot-files (glob's "*" never matched them either) and anything
        # that is not a regular file.
        if not file_name.startswith(".")
        and file_name.lower().endswith(IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(class_path, file_name))
    )

total_images = sum(class_counts.values())

print(f"Total number of images: {total_images:,}")
if class_folders:
    print(f"Average images per class: {total_images / len(class_folders):.1f}")
else:
    # Avoid a ZeroDivisionError when DATA_DIR contains no class folders.
    print("Average images per class: n/a (no class folders found)")

print("\nImages per class:")
# Largest classes first.
for class_name, count in sorted(class_counts.items(), key=lambda item: item[1], reverse=True):
    print(f"{class_name}: {count:,} images")

## 4. Visualize Class Distribution

In [None]:
# Create bar chart of class distribution.

# Sort classes by count (largest first) for better visualization.
sorted_classes = sorted(class_counts.items(), key=lambda item: item[1], reverse=True)
class_names = [name for name, _ in sorted_classes]
counts = [count for _, count in sorted_classes]

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(15, 8))
bar_positions = list(range(len(class_names)))
ax.bar(bar_positions, counts, color="skyblue", edgecolor="navy")
ax.set_xlabel("Classes")
ax.set_ylabel("Number of Images")
ax.set_title("PlantVillage Dataset: Images per Class")
ax.set_xticks(bar_positions)
ax.set_xticklabels(class_names, rotation=45, ha="right")
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()

# Print summary statistics.
print("\nDataset Summary:")
print(f"- Total classes: {len(class_folders)}")
print(f"- Total images: {total_images:,}")
if counts:
    print(f"- Min images per class: {min(counts):,}")
    print(f"- Max images per class: {max(counts):,}")
    # statistics.median averages the two middle values when the number of
    # classes is even; indexing sorted(counts)[len(counts)//2] returned the
    # upper-middle element instead, which is not the median.
    print(f"- Median images per class: {statistics.median(counts):,.1f}")
else:
    # min()/max()/median() all raise on an empty sequence.
    print("- No classes found; per-class statistics unavailable")

## 5. Display Sample Images

In [None]:
# Function to get random sample images
def get_random_samples(num_samples=9, data_dir=None, classes=None):
    """Pick one random image from each of up to ``num_samples`` random classes.

    Args:
        num_samples: Maximum number of classes (and therefore images) to
            sample. Capped at the number of available classes.
        data_dir: Directory containing one sub-folder per class. Defaults to
            the notebook-level ``DATA_DIR``.
        classes: Class folder names to sample from. Defaults to the
            notebook-level ``class_folders``.

    Returns:
        A list of ``(image_path, class_name)`` tuples. Classes that contain
        no image files are skipped, so the list may be shorter than
        ``num_samples``.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if classes is None:
        classes = class_folders
    classes = list(classes)

    # Compared against the lower-cased file name so that "leaf.JPG" is found
    # too; glob patterns such as "*.jpg" are case-sensitive on POSIX systems.
    image_extensions = (".jpg", ".jpeg", ".png")

    # Get random classes
    selected_classes = random.sample(classes, min(num_samples, len(classes)))

    samples = []
    for class_name in selected_classes:
        class_path = os.path.join(data_dir, class_name)
        # Directory listing order is arbitrary, so sort the candidates to make
        # the seeded random.choice below reproducible across machines.
        image_files = sorted(
            os.path.join(class_path, file_name)
            for file_name in os.listdir(class_path)
            if not file_name.startswith(".")
            and file_name.lower().endswith(image_extensions)
        )

        if image_files:
            # Select random image from this class
            samples.append((random.choice(image_files), class_name))

    return samples

# Get 9 random samples
sample_images = get_random_samples(9)

# Display 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle("Random Sample Images from PlantVillage Dataset", fontsize=16, y=0.95)

# Turn every axis off up front so grid cells without an image stay blank
# instead of showing an empty framed plot (fewer than 9 samples, or an
# unreadable file).
for ax in axes.flat:
    ax.axis("off")

displayed_count = 0
for ax, (image_path, class_name) in zip(axes.flat, sample_images):
    # cv2.imread signals failure by returning None rather than raising, which
    # would make cvtColor fail with an unhelpful error.
    img = cv2.imread(image_path)
    if img is None:
        print(f"Warning: could not read image, skipping: {image_path}")
        continue

    # OpenCV loads images as BGR; matplotlib expects RGB.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    ax.set_title(f"{class_name}\n{os.path.basename(image_path)}",
                 fontsize=10, wrap=True)
    displayed_count += 1

plt.tight_layout()
plt.show()

if sample_images and displayed_count == len(sample_images):
    print("Sample images displayed successfully!")
else:
    print(f"Displayed {displayed_count} of {len(sample_images)} sample images.")

## Summary

This notebook has provided an initial overview of the PlantVillage dataset:

- **Dataset Structure**: Analyzed the organization of classes and images
- **Class Distribution**: Visualized the distribution of images across different disease categories
- **Sample Images**: Displayed random samples to understand image quality and variety

**Next Steps**:
- Implement data preprocessing pipeline
- Create train/validation/test splits
- Develop baseline models
- Analyze image characteristics (size, color distribution, etc.)