# Data Exploration Notebook

This notebook helps you explore and understand your crop disease dataset.

## Steps:
1. Load and visualize dataset statistics
2. Display sample images from each class
3. Check data distribution
4. Verify data quality


In [None]:
import sys
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import os

# Put "<parent of the current working directory>/src" at the front of the
# import search path so modules stored there can be imported from this notebook.
sys.path.insert(0, str(Path().resolve().parent.joinpath("src")))


## 1. Dataset Statistics


In [None]:
# Update this path to your dataset.
# The code below walks a two-level layout: <data_dir>/<crop>/<disease>/<files>
data_dir = Path("../data/raw")

# File suffixes counted as images. Suffixes are lower-cased before the lookup,
# so "leaf.JPG" and "leaf.jpg" are both counted even on case-sensitive
# filesystems (a plain glob("*.jpg") would skip the upper-case one there).
IMAGE_EXTENSIONS = frozenset({".jpg", ".jpeg", ".png"})


def count_images(folder, extensions=IMAGE_EXTENSIONS):
    """Count the image files located directly inside *folder*.

    Args:
        folder: ``Path`` of the directory to scan (not searched recursively).
        extensions: Lower-case suffixes, including the leading dot, that
            qualify a file as an image.

    Returns:
        Number of regular files in *folder* whose lower-cased suffix is in
        *extensions*.
    """
    return sum(
        1
        for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in extensions
    )


def count_images_per_class(root, extensions=IMAGE_EXTENSIONS):
    """Count images for every ``crop/disease`` directory pair under *root*.

    Args:
        root: ``Path`` of the dataset root directory.
        extensions: Lower-case suffixes passed through to ``count_images``.

    Returns:
        Dict mapping ``"<crop>/<disease>"`` to its image count. Keys are
        inserted in sorted directory order so the result is reproducible.
        Non-directory entries at either level are ignored.
    """
    counts = {}
    # sorted(): iterdir() yields entries in arbitrary order.
    for crop_dir in sorted(root.iterdir()):
        if not crop_dir.is_dir():
            continue
        for disease_dir in sorted(crop_dir.iterdir()):
            if disease_dir.is_dir():
                class_name = f"{crop_dir.name}/{disease_dir.name}"
                counts[class_name] = count_images(disease_dir, extensions)
    return counts


# Count images per class
if data_dir.is_dir():
    class_counts = count_images_per_class(data_dir)
else:
    # Report a wrong/missing path explicitly instead of failing inside
    # iterdir(); the statistics below are then shown for an empty dataset.
    print(f"Warning: dataset directory not found: {data_dir.resolve()}")
    class_counts = {}

# Display statistics
df = pd.DataFrame(list(class_counts.items()), columns=["Class", "Count"])
print("Dataset Statistics:")
print(df)
print(f"\nTotal images: {df['Count'].sum()}")
print(f"Number of classes: {len(df)}")
