# Notebook 02 - Dataset Validation & Cleaning (YOLOv8 Format)

This notebook validates the Roboflow YOLOv8 dataset:
- checks missing labels/images
- checks empty or invalid label files
- validates bounding box ranges and class IDs
- checks corrupted images
- prints a clean report for training readiness


In [3]:
import os
# Show what is currently present under /content.
content_entries=os.listdir("/content")
print(content_entries)


['.config', 'sample_data']


In [4]:
!pip install -q roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="ATUHTf9DF0DoAlA3Tfeg")
project = rf.workspace("nivu").project("indian-license-plate-knte7")
version = project.version(1)
dataset = version.download("yolov8")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/91.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.8/91.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in Indian-License-Plate-1 to yolov8:: 100%|██████████| 43537/43537 [00:00<00:00, 54881.25it/s]





Extracting Dataset Version Zip to Indian-License-Plate-1 in yolov8:: 100%|██████████| 3312/3312 [00:00<00:00, 7347.48it/s]


In [5]:
import os
# List /content again; the downloaded dataset folder is expected to appear here.
entries_after_download=os.listdir("/content")
print(entries_after_download)


['.config', 'Indian-License-Plate-1', 'sample_data']


In [6]:
# Root directory of the downloaded dataset, taken from the object returned by
# the Roboflow download call; every later cell builds its paths from this value.
DATASET_DIR = dataset.location
# Echo the resolved path so the file checks below can be traced back to it.
print("DATASET_DIR =", DATASET_DIR)


DATASET_DIR = /content/Indian-License-Plate-1


In [7]:
import os

# Entries the export is expected to contain, given relative to DATASET_DIR.
expected=[
    "train/images","train/labels",
    "valid/images","valid/labels",
    "test/images","test/labels",
    "data.yaml"
]

# One line per entry: a check mark when it exists on disk, a cross otherwise.
for rel_path in expected:
    present=os.path.exists(os.path.join(DATASET_DIR,rel_path))
    marker="✅" if present else "❌"
    print(marker,rel_path)


✅ train/images
✅ train/labels
✅ valid/images
✅ valid/labels
✅ test/images
✅ test/labels
✅ data.yaml


In [8]:
import glob,os

def cnt(path,exts):
    """Count the entries directly inside ``path`` whose names end in one of ``exts``.

    Args:
        path: Directory to scan (not recursive). A directory that does not
            exist yields a count of 0.
        exts: Iterable of extensions without the leading dot,
            e.g. ``["jpg", "jpeg", "png"]``.

    Returns:
        int: Number of matches summed over all extensions.
    """
    # Escape the directory part so glob metacharacters in the path itself
    # ("[", "*", "?") are matched literally instead of being read as a pattern.
    # os.fspath keeps path-like objects working, as os.path.join accepted them.
    safe_dir=glob.escape(os.fspath(path))
    return sum(len(glob.glob(os.path.join(safe_dir,f"*.{e}"))) for e in exts)

# Report image and label file counts for each dataset split.
for s in ("train","valid","test"):
    split_root=os.path.join(DATASET_DIR,s)
    imgs=cnt(os.path.join(split_root,"images"),["jpg","jpeg","png"])
    labs=cnt(os.path.join(split_root,"labels"),["txt"])
    print(f"{s.upper():5} | images={imgs} | labels={labs}")


TRAIN | images=1156 | labels=1156
VALID | images=330 | labels=330
TEST  | images=164 | labels=164


In [9]:
# Collect (split, image_path) for every image that has no same-named .txt file
# in the split's labels folder.
missing=[]
for s in ("train","valid","test"):
    img_dir=os.path.join(DATASET_DIR,s,"images")
    lab_dir=os.path.join(DATASET_DIR,s,"labels")
    # The extension order (jpg, png, jpeg) fixes the order of entries in `missing`.
    for pattern in ("*.jpg","*.png","*.jpeg"):
        for ip in glob.glob(os.path.join(img_dir,pattern)):
            stem=os.path.splitext(os.path.basename(ip))[0]
            if not os.path.exists(os.path.join(lab_dir,stem+".txt")):
                missing.append((s,ip))

print("Missing labels =",len(missing))
# Preview the first few offenders as the cell's displayed value.
missing[:5]


Missing labels = 0


[]

In [10]:
# Gather (split, label_path) for every label file that is exactly 0 bytes long.
empty=[
    (s,lp)
    for s in ("train","valid","test")
    for lp in glob.glob(os.path.join(DATASET_DIR,s,"labels","*.txt"))
    if os.path.getsize(lp)==0
]

print("Empty label files =",len(empty))
# Preview the first few entries as the cell's displayed value.
empty[:5]


Empty label files = 9


[('train',
  '/content/Indian-License-Plate-1/train/labels/bcdc8b42-5dd4-48e2-8104-706440524bfe___hqdefault0-jpg_jpeg.rf.78a86559da937c2cdd5f75f4e66bda8a.txt'),
 ('train',
  '/content/Indian-License-Plate-1/train/labels/video9_260_jpg.rf.2cd015208d61a04ce848ba92688aab2a.txt'),
 ('train',
  '/content/Indian-License-Plate-1/train/labels/AP29_jpg.rf.90a4a3cfa9f951f5c7189e1728eff478.txt'),
 ('train',
  '/content/Indian-License-Plate-1/train/labels/car-wbs-TS08ER1643_00000_png.rf.f72336a35ea4cd444dfe15e401f72a06.txt'),
 ('train',
  '/content/Indian-License-Plate-1/train/labels/2e23964b-ce59-4bcf-8c35-98f3a096a748___design-ind-number-plates-vijayanagar-bangalore-number-plate-dealers-2etqybj-jpg_jpeg.rf.9914607834288b607abe1c22e87526a6.txt')]

In [11]:
import os,glob

# Bookkeeping for the summary printed at the end of the cell.
deleted_imgs=0
deleted_lbls=0
not_found_imgs=[]

# NOTE(review): this removes files from the downloaded dataset in place.
# YOLO-style trainers commonly treat images with an empty label file as
# background (negative) samples - confirm that dropping them is intended.
for s,lp in empty:
    # Remove the empty label file (skipped when it is already gone).
    if os.path.exists(lp):
        os.remove(lp)
        deleted_lbls+=1

    # Look for the image sharing the label's base name; the first extension
    # that exists is removed and the remaining ones are not tried.
    stem=os.path.splitext(os.path.basename(lp))[0]
    images_root=os.path.join(DATASET_DIR,s,"images")
    for ext in ("jpg","jpeg","png"):
        candidate=os.path.join(images_root,stem+"."+ext)
        if os.path.exists(candidate):
            os.remove(candidate)
            deleted_imgs+=1
            break
    else:
        # No extension matched: remember the label that had no image.
        not_found_imgs.append((s,stem))

print("Deleted empty labels:",deleted_lbls)
print("Deleted corresponding images:",deleted_imgs)
print("Images not found for some labels:",len(not_found_imgs))
not_found_imgs[:5]


Deleted empty labels: 9
Deleted corresponding images: 9
Images not found for some labels: 0


[]

In [12]:
# Scan the label folders again for 0-byte files; run after the deletion cell,
# this is expected to report none.
empty=[]
for s in ("train","valid","test"):
    label_pattern=os.path.join(DATASET_DIR,s,"labels","*.txt")
    empty.extend((s,lp) for lp in glob.glob(label_pattern) if os.path.getsize(lp)==0)

print("Empty label files =",len(empty))
# Preview the first few entries as the cell's displayed value.
empty[:5]


Empty label files = 0


[]

In [13]:
def cnt(path,exts):
    """Return how many entries directly inside ``path`` end in one of ``exts``.

    NOTE: this re-declares the ``cnt`` helper from the earlier counting cell;
    once this cell runs, this definition shadows the earlier one.

    Args:
        path: Directory to scan (not recursive). A missing directory counts as 0.
        exts: Iterable of extensions without the leading dot, e.g. ``["txt"]``.

    Returns:
        int: Number of matches summed over all extensions.
    """
    # Escape the directory so glob metacharacters in the path ("[", "*", "?")
    # are taken literally; os.fspath keeps path-like arguments working.
    escaped_dir=glob.escape(os.fspath(path))
    total=0
    for e in exts:
        total+=len(glob.glob(os.path.join(escaped_dir,f"*.{e}")))
    return total

# Print the per-split image and label counts again, using the same line format
# as the first count.
for s in ("train","valid","test"):
    images_path=os.path.join(DATASET_DIR,s,"images")
    labels_path=os.path.join(DATASET_DIR,s,"labels")
    imgs,labs=cnt(images_path,["jpg","jpeg","png"]),cnt(labels_path,["txt"])
    print(f"{s.upper():5} | images={imgs} | labels={labs}")


TRAIN | images=1149 | labels=1149
VALID | images=328 | labels=328
TEST  | images=164 | labels=164


In [14]:
# Final status line for the notebook.
# NOTE(review): the cells visible in this notebook cover folder structure,
# image/label pairing and empty label files only; the bounding-box range,
# class-ID and corrupted-image checks listed in the intro are not visible here -
# confirm they run elsewhere before relying on "Ready for training".
print("✅ Dataset cleaned: empty-label samples removed. Ready for training.")


✅ Dataset cleaned: empty-label samples removed. Ready for training.
