---
## ⚙️ 1. Setup and Configuration

Import all required libraries, suppress warnings, and set up utility paths.

In [2]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" filter also hides deprecation and runtime
# warnings emitted by the libraries imported below — consider narrowing it to
# the specific categories/modules that are actually noisy.
import warnings
warnings.filterwarnings("ignore")

# === Standard Library Imports ===
# NOTE(review): `os`, `defaultdict` and `Counter` are not referenced in the cells
# visible in this section; presumably they are intended for later QA cells.
import os
import sys
from pathlib import Path
from collections import defaultdict, Counter

# === Third-Party Library Imports ===
# NOTE(review): `pd`, `plt`, `cv2` and `JSON` are likewise unused in the visible
# cells — confirm they are still needed before sharing the notebook.
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from IPython.display import JSON

### 🔧 Add utility folder and import custom modules

In [4]:
# Make the project's utils/ folder importable.
# The parent of the current working directory is treated as the project root.
project_root = Path.cwd().parent
utils_path = project_root.joinpath('utils')

# Register the folder once only, so re-running this cell does not keep
# appending duplicate entries to sys.path.
_utils_entry = str(utils_path)
if _utils_entry not in sys.path:
    sys.path.append(_utils_entry)

# Project-local helper modules, resolved through the sys.path entry added above
import data_loader as dl
import qa

---

## 📁 2. Define Dataset and Output Paths

Set up paths to images, labels, config, and output directories.

In [6]:
# === Project paths ===
# The parent of the current working directory is treated as the project root;
# every location below is derived from it so the root is computed exactly once.
project_root = Path.cwd().parent
config_dir = project_root / 'config'

# Name of the dataset sub-folder shared by the dataset and config trees.
DATASET_NAME = "MPP"

# Define paths to dataset
IMAGE_DIR = project_root / "dataset" / DATASET_NAME / "images"
LABEL_DIR = project_root / "dataset" / DATASET_NAME / "labels"
# Dataset YAML config lives under config/<dataset>/ (reuses config_dir instead of
# rebuilding the same path by hand).
data_yaml = config_dir / DATASET_NAME / "data.yaml"

In [7]:
# === Output dirs ===
# Two output folders for this notebook, both located under the parent of the
# current working directory: visuals/2_data_qa and reports/2_data_qa.
SAVE_VIS_DIR = Path.cwd().parent.joinpath("visuals", "2_data_qa")
SAVE_FLE_DIR = Path.cwd().parent.joinpath("reports", "2_data_qa")

# Create both folders (and any missing parents); already-existing folders are
# left untouched, so the cell is safe to re-run.
for _out_dir in (SAVE_VIS_DIR, SAVE_FLE_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

### ✅ Path Checks

In [9]:
# === Path Existence Checks ===
# Human-readable label -> filesystem location for everything configured above.
paths_to_check = {
    "Image directory": IMAGE_DIR,
    "Label directory": LABEL_DIR,
    "Config file": data_yaml,
    "Utils path": utils_path,
    "Save visuals dir": SAVE_VIS_DIR,
    "Save reports dir": SAVE_FLE_DIR,
}

# Report-only check: a missing path is flagged in the printout, not raised.
print("\n🔎 Verifying directory and file paths...\n")
for name, path in paths_to_check.items():
    exists = path.exists()
    status = '✅ Exists' if exists else '❌ Missing'
    # Labels are left-aligned to 20 columns so the status column lines up.
    print(f"{name:<20} : {status} — {path}")


🔎 Verifying directory and file paths...

Image directory      : ✅ Exists — C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\dataset\MPP\images
Label directory      : ✅ Exists — C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\dataset\MPP\labels
Config file          : ✅ Exists — C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\config\MPP\data.yaml
Utils path           : ✅ Exists — C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\utils
Save visuals dir     : ✅ Exists — C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\visuals\2_data_qa
Save reports dir     : ✅ Exists — C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\reports\2_data_qa


---

## 📥 3. Load Dataset and Extract Metadata

Load image and label paths, then extract label data and metadata from the dataset.

In [11]:
# === Image files ===
# Ask the data-loader helper for the image files under IMAGE_DIR.
image_paths = dl.get_image_paths(IMAGE_DIR)
image_count = len(image_paths)
print(f"📸 Found {image_count} image(s) in {IMAGE_DIR}")

# === Label files ===
# Label paths are produced from the image list and LABEL_DIR
# (presumably one label path per image filename — helper not shown here).
label_paths = dl.get_label_paths(image_paths, LABEL_DIR)
label_count = len(label_paths)
print(f"🏷️ Matched {label_count} label(s) from image filenames in {LABEL_DIR}")

Found 36305 files for pattern *.jpg
📸 Found 36305 image(s) in C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\dataset\MPP\images


Generating label paths: 100%|██████████| 36305/36305 [00:04<00:00, 8369.72it/s] 

Generated 36305 label paths from 36305 images
🏷️ Matched 36305 label(s) from image filenames in C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\dataset\MPP\labels





In [12]:
# Build the dataset wrapper from the image folder, label folder and YAML config.
# Per the saved output, construction reads the YAML, rescans the images and
# loads every label file — the label loading step took about five minutes.
dataset = dl.YoloDataset(
    images_dir=IMAGE_DIR,
    labels_dir=LABEL_DIR,
    yaml_path=data_yaml,
)

Loaded YAML config from C:\Users\ADITHYA\OneDrive\Desktop\AgriVision\config\MPP\data.yaml: keys=['train', 'val', 'test', 'nc', 'names', 'roboflow']
Found 36305 files for pattern *.jpg


Generating label paths: 100%|██████████| 36305/36305 [00:00<00:00, 37191.35it/s]


Generated 36305 label paths from 36305 images


Loading label files: 100%|██████████| 36305/36305 [05:21<00:00, 112.82it/s]


Loaded labels from 36305/36305 files.


In [13]:
# Pull the dataset's main attributes into notebook-level names.
# Note: image_paths / label_paths are rebound here to the dataset's own lists.
image_paths, label_paths = dataset.image_paths, dataset.label_paths
all_labels = dataset.all_labels
class_names, num_classes = dataset.class_names, dataset.num_classes

In [14]:
# Plain-text summary of everything pulled from the dataset object.
banner = "=" * 30
print(banner)
print("📊 DATASET DIAGNOSTICS")
print(banner)

# --- Image / label path lists: the same three-line summary for each ---
for heading, path_list in (
    ("\n🔹 Image Paths", image_paths),
    ("\n🔹 Label Paths", label_paths),
):
    print(heading)
    print(f"Type         : {type(path_list)}")
    print(f"Count        : {len(path_list)}")
    print(f"Sample Paths : {path_list[:1]}")

# --- Parsed labels (the count is reported per image) ---
print("\n🔹 All Labels")
print(f"Type               : {type(all_labels)}")
print(f"Count (images)     : {len(all_labels)}")
print(f"First item type    : {type(all_labels[0]) if all_labels else 'None'}")

# Preview at most two rows of the first entry; nothing is printed when
# all_labels is empty.
if all_labels and isinstance(all_labels[0], list):
    print(f"First label sample : {all_labels[0][:2]}")
elif all_labels:
    print("First label sample : N/A (not a list)")

# --- Class names ---
print("\n🔹 Class Names")
print(f"Type         : {type(class_names)}")
print(f"Count        : {len(class_names)}")
print(f"Classes      : {class_names}")

# --- Declared class count, cross-checked against the number of names ---
print("\n🔹 Number of Classes")
print(f"Value        : {num_classes}")
print(f"Matches class_names? : {num_classes == len(class_names)}")

📊 DATASET DIAGNOSTICS

🔹 Image Paths
Type         : <class 'list'>
Count        : 36305
Sample Paths : ['C:\\Users\\ADITHYA\\OneDrive\\Desktop\\AgriVision\\dataset\\MPP\\images\\00001_Cassava_mosaic_jpg.rf.676253e0c2beef9c5e113197f2f78d0b.jpg']

🔹 Label Paths
Type         : <class 'list'>
Count        : 36305
Sample Paths : ['C:\\Users\\ADITHYA\\OneDrive\\Desktop\\AgriVision\\dataset\\MPP\\labels\\00001_Cassava_mosaic_jpg.rf.676253e0c2beef9c5e113197f2f78d0b.txt']

🔹 All Labels
Type               : <class 'list'>
Count (images)     : 36305
First item type    : <class 'list'>
First label sample : [[9, 0.5, 0.5, 1.0, 1.0]]

🔹 Class Names
Type         : <class 'list'>
Count        : 22
Classes      : ['Cashew anthracnose', 'Cashew gumosis', 'Cashew healthy', 'Cashew leaf miner', 'Cashew red rust', 'Cassava bacterial blight', 'Cassava brown spot', 'Cassava green mite', 'Cassava healthy', 'Cassava mosaic', 'Maize fall armyworm', 'Maize grasshoper', 'Maize healthy', 'Maize leaf beetle', 'Maiz

In [15]:
# Title-case every class name (first letter of each word upper-cased);
# the bare name on the last line displays the resulting list.
formatted_class_names = list(map(str.title, class_names))
formatted_class_names

['Cashew Anthracnose',
 'Cashew Gumosis',
 'Cashew Healthy',
 'Cashew Leaf Miner',
 'Cashew Red Rust',
 'Cassava Bacterial Blight',
 'Cassava Brown Spot',
 'Cassava Green Mite',
 'Cassava Healthy',
 'Cassava Mosaic',
 'Maize Fall Armyworm',
 'Maize Grasshoper',
 'Maize Healthy',
 'Maize Leaf Beetle',
 'Maize Leaf Blight',
 'Maize Leaf Spot',
 'Maize Streak Virus',
 'Tomato Healthy',
 'Tomato Leaf Blight',
 'Tomato Leaf Curl',
 'Tomato Septoria Leaf Spot',
 'Tomato Verticulium Wilt']

In [17]:
# Run the exact-duplicate check over all image paths; the saved output is a dict
# with an 'exact_duplicates' list, which is empty for this run.
# NOTE(review): the saved progress bar reports ~650k it/s over 36,305 items, which
# looks too fast for reading image files — confirm that the helper actually
# compares file contents rather than only paths/names.
qa.check_exact_duplicates(image_paths)

Exact duplicates: 100%|██████████| 36305/36305 [00:00<00:00, 657933.95it/s]


{'exact_duplicates': []}

In [None]:
# Run the perceptual-duplicate check over all image paths.
# NOTE(review): the saved output stops at 52% after ~1h38m (~2 it/s) and the cell
# has no execution count, so this run apparently never completed. Consider
# caching the result to disk or running on a sample so Restart & Run All stays
# feasible. The return value is not captured here — TODO confirm what it returns.
qa.check_perceptual_duplicates(image_paths)

Perceptual duplicates:  52%|█████▏    | 18742/36305 [1:38:18<2:16:59,  2.14it/s]

In [None]:
# Run the corrupt-image check over all image paths (presumably flags files that
# cannot be read/decoded — helper not shown here, verify against utils/qa.py).
# NOTE(review): no execution count or output is saved, so this cell has not been
# run yet; the result is also not assigned to a name for later use.
qa.check_corrupt_images(image_paths)

In [None]:
# TODO: placeholder cell — call the next `qa` check here. (A bare "qa." is a
# SyntaxError and would abort Restart & Run All.)

In [None]:
# TODO: placeholder cell — call the next `qa` check here. (A bare "qa." is a
# SyntaxError and would abort Restart & Run All.)

In [None]:
# TODO: placeholder cell — call the next `qa` check here. (A bare "qa." is a
# SyntaxError and would abort Restart & Run All.)

In [None]:
# TODO: placeholder cell — call the next `qa` check here. (A bare "qa." is a
# SyntaxError and would abort Restart & Run All.)