In [5]:
from pathlib import Path
import re
from tqdm import tqdm

In [6]:
# Loose pre-check for a YOLO label row: one or more leading digits followed by
# whitespace. It only looks at the start of the line; the cells visible in this
# notebook validate rows with is_valid_yolo_line() and do not use this pattern.
valid_yolo_line = re.compile(
    r'^\d+\s+'
)

In [11]:
# --- Configuration ---
# Paths are assembled from individual components instead of a backslash-joined
# raw string. On Windows the result is identical (dataset\raw\...), while on
# POSIX a literal "dataset\raw\labels" would be treated as ONE file name in the
# current directory rather than a nested directory.
raw_labels_dir = Path("dataset") / "raw" / "labels"
# Cleaned labels live next to the raw ones, under the same parent directory.
cleaned_labels_dir = raw_labels_dir.with_name("clean_labels")
# Create the output directory (and any missing parents); a no-op if it exists.
cleaned_labels_dir.mkdir(parents=True, exist_ok=True)

In [12]:
# --- Valid YOLO line pattern ---
def is_valid_yolo_line(line: str) -> bool:
    """Return True if ``line`` is a well-formed YOLO annotation row.

    A row is accepted when it consists of exactly five whitespace-separated
    fields: an integer class id followed by four finite floating-point values.

    Args:
        line: One line of a label file; surrounding whitespace and the
            trailing newline are ignored.

    Returns:
        True when the line has the expected five-field shape, False otherwise
        (including blank lines, wrong field counts, non-numeric tokens,
        ``nan``/``inf`` values, and non-string input).
    """
    try:
        # split() with no argument already discards leading/trailing whitespace.
        fields = line.split()
        if len(fields) != 5:
            return False
        int(fields[0])  # class_id must parse as an integer
        coords = [float(token) for token in fields[1:]]
    except (AttributeError, TypeError, ValueError):
        # Only parsing/type failures mean "invalid line". A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False

    # float() happily parses "nan" and "inf"; such values are not usable box
    # coordinates. The chained comparison is False for nan and for +/-inf.
    infinity = float("inf")
    return all(-infinity < value < infinity for value in coords)

In [13]:
# --- Markers to exclude ---
# Any line containing one of these substrings is dropped during cleaning.

# Delimiters that Git writes into a file with an unresolved merge conflict.
conflict_markers = [
    '<<<<<<<',
    '=======',
    '>>>>>>>',
]

# Fragments of a Git LFS pointer file (checked out in place of the real content).
git_lfs_markers = [
    'version https://git-lfs.github.com',
    'oid sha256',
    'size ',
]

In [14]:
# --- Cleaning Function ---
def clean_yolo_label_file(src_path: Path, dst_path: Path) -> bool:
    """Copy the valid YOLO rows of ``src_path`` into ``dst_path``.

    Lines containing a merge-conflict or Git LFS marker, and lines rejected by
    ``is_valid_yolo_line``, are discarded. Surviving lines are written verbatim.

    Args:
        src_path: Label file to read.
        dst_path: Location of the cleaned label file.

    Returns:
        True if at least one valid line was found and written to ``dst_path``.
        False if no valid line survived (``dst_path`` is then not written or
        modified) or if reading/writing failed; failures are printed, not raised.
    """
    try:
        # Build the combined marker list once instead of once per line.
        excluded_markers = conflict_markers + git_lfs_markers

        # 'utf-8-sig' transparently strips a leading byte-order mark. With plain
        # 'utf-8' the BOM stays glued to the first token, int() fails on it and
        # the first annotation of the file is silently lost.
        with open(src_path, 'r', encoding='utf-8-sig') as f:
            cleaned = [
                line
                for line in f
                if not any(marker in line for marker in excluded_markers)
                and is_valid_yolo_line(line)
            ]

        if cleaned:
            with open(dst_path, 'w', encoding='utf-8') as f:
                f.writelines(cleaned)
            return True
    except Exception as e:
        # Best-effort batch job: report the failing file and keep going.
        print(f"❌ Error cleaning {src_path}: {e}")
    return False

In [15]:
# --- Process all .txt files ---
# Collect the label files up front so tqdm knows the total for its progress bar.
label_files = [*raw_labels_dir.glob('*.txt')]
fixed_count = 0

for file in tqdm(label_files, desc="Cleaning raw/labels"):
    # Each cleaned file keeps its original name inside the output directory;
    # only files for which cleaning reported success are counted.
    if clean_yolo_label_file(file, cleaned_labels_dir / file.name):
        fixed_count = fixed_count + 1

print(f"\nDone. Cleaned and saved {fixed_count} files to: {cleaned_labels_dir}")

Cleaning raw/labels: 100%|█████████████████████████████████████████████████████| 10853/10853 [00:03<00:00, 2854.99it/s]



Done. Cleaned and saved 10687 files to: dataset\raw\clean_labels
