In [1]:
import os
import xml.etree.ElementTree as ET
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
def count_processed_data(processed_dir: str):
    """Count images, bounding boxes and per-class boxes of a processed dataset.

    The dataset root must contain an ``images`` folder and an ``annotations``
    folder. An image is counted only when ``annotations/<image stem>.xml``
    exists and can be read completely. Every ``<object>`` element found in
    that file is one bounding box, and the text of its ``<name>`` child
    (surrounding whitespace removed) is the class label.

    Args:
        processed_dir: Path of the processed dataset root.

    Returns:
        A dict with the keys ``total_images`` (int), ``total_boxes`` (int),
        ``boxes_per_class`` (label -> number of boxes) and
        ``boxes_per_image`` (box count -> number of images having that many
        boxes). Both mappings are ``defaultdict(int)`` instances.

    Raises:
        FileNotFoundError: If ``images`` or ``annotations`` is missing.
    """
    image_dir = Path(processed_dir) / "images"
    xml_dir = Path(processed_dir) / "annotations"

    if not image_dir.exists() or not xml_dir.exists():
        raise FileNotFoundError("Thư mục processed không hợp lệ!")

    stats = {
        "total_images": 0,
        "total_boxes": 0,
        "boxes_per_class": defaultdict(int),
        "boxes_per_image": defaultdict(int)
    }

    # Sub-directories inside ``images`` are not images, so only files are kept.
    image_paths = [path for path in image_dir.glob("*") if path.is_file()]

    for img_path in tqdm(image_paths, desc="Đang đếm"):
        xml_path = xml_dir / f"{img_path.stem}.xml"

        # Images without an annotation file are silently ignored.
        if not xml_path.exists():
            continue

        # Read every label BEFORE touching ``stats``: a file that fails half
        # way through must not leave partial counts behind, otherwise
        # ``total_boxes`` and the sum of ``boxes_per_class`` drift apart.
        try:
            class_names = []
            for box in ET.parse(xml_path).findall(".//object"):
                class_name = (box.findtext("name") or "").strip()
                if not class_name:
                    raise ValueError("<object> thiếu thẻ <name>")
                class_names.append(class_name)
        except Exception as e:
            # Best effort: report the broken annotation and move on.
            print(f"Lỗi khi xử lý {xml_path.name}: {e}")
            continue

        num_boxes = len(class_names)
        stats["total_images"] += 1
        stats["total_boxes"] += num_boxes
        stats["boxes_per_image"][num_boxes] += 1
        for class_name in class_names:
            stats["boxes_per_class"][class_name] += 1

    return stats

def print_stats(stats: dict, top_n: int = 10):
    """Print a human-readable report of dataset statistics.

    Args:
        stats: Dict with the keys ``total_images``, ``total_boxes``,
            ``boxes_per_image`` (box count -> number of images) and
            ``boxes_per_class`` (label -> number of boxes).
        top_n: How many of the most frequent classes to list. Defaults to 10.

    Returns:
        None. The report is written to standard output.
    """
    total_images = stats["total_images"]
    total_boxes = stats["total_boxes"]

    # Every ratio uses the same zero guard, so a stats dict whose totals are 0
    # cannot raise ZeroDivisionError in the percentage columns.
    image_denominator = max(1, total_images)
    box_denominator = max(1, total_boxes)
    separator = "=" * 50

    print("\n" + separator)
    print("THỐNG KÊ SAU TIỀN XỬ LÝ")
    print(separator)
    print(f"Tổng số ảnh hợp lệ: {total_images}")
    print(f"Tổng số bounding boxes: {total_boxes}")
    print(f"Trung bình box/ảnh: {total_boxes / image_denominator:.2f}")

    print("\nPhân phối số box/ảnh:")
    for num_boxes, count in sorted(stats["boxes_per_image"].items()):
        print(f"- {num_boxes} box: {count} ảnh ({count / image_denominator:.1%})")

    print(f"\nSố box theo class (top {top_n}):")
    # Most frequent class first; ties keep the mapping's iteration order.
    ranked = sorted(
        stats["boxes_per_class"].items(),
        key=lambda item: item[1],
        reverse=True,
    )
    for class_name, count in ranked[:top_n]:
        print(f"- {class_name}: {count} box ({count / box_denominator:.1%})")
    print(separator)

if __name__ == "__main__":
    # Machine-specific root of the processed dataset; it must hold the
    # "images" and "annotations" sub-folders read by count_processed_data.
    processed_folder = r"D:\xla v1\processed_data"
    stats = count_processed_data(processed_dir=processed_folder)
    print_stats(stats=stats)

Đang đếm: 100%|██████████| 21427/21427 [00:02<00:00, 7432.16it/s]


THỐNG KÊ SAU TIỀN XỬ LÝ
Tổng số ảnh hợp lệ: 21427
Tổng số bounding boxes: 34191
Trung bình box/ảnh: 1.60

Phân phối số box/ảnh:
- 1 box: 12191 ảnh (56.9%)
- 2 box: 6418 ảnh (30.0%)
- 3 box: 2194 ảnh (10.2%)
- 4 box: 544 ảnh (2.5%)
- 5 box: 76 ảnh (0.4%)
- 6 box: 2 ảnh (0.0%)
- 7 box: 2 ảnh (0.0%)

Số box theo class (top 10):
- regulatory--maximum-speed-limit: 2592 box (7.6%)
- regulatory--shared-path-pedestrians-and-bicycles: 1254 box (3.7%)
- regulatory--yield: 892 box (2.6%)
- information--parking: 886 box (2.6%)
- complementary--chevron-left: 828 box (2.4%)
- regulatory--stop: 592 box (1.7%)
- complementary--distance: 474 box (1.4%)
- regulatory--keep-right: 456 box (1.3%)
- regulatory--no-goods-vehicles: 442 box (1.3%)





In [2]:
import os
import xml.etree.ElementTree as ET
import json

def extract_labels_from_xml(xml_dir):
    """Collect the distinct class labels found in a folder of XML annotations.

    Every file in ``xml_dir`` whose name ends with ``.xml`` (case-insensitive)
    is parsed, and the stripped text of the ``<name>`` child of each
    ``<object>`` element directly under the root is collected.

    Files that cannot be read or parsed are reported on standard output and
    skipped, so one broken annotation does not abort the whole scan. Objects
    with a missing or blank ``<name>`` contribute no label.

    Args:
        xml_dir: Directory containing the XML annotation files.

    Returns:
        The distinct labels as an alphabetically sorted list of strings.
    """
    labels = set()

    for filename in os.listdir(xml_dir):
        if not filename.lower().endswith('.xml'):
            continue

        xml_path = os.path.join(xml_dir, filename)
        # A directory that happens to be named "*.xml" is not an annotation.
        if not os.path.isfile(xml_path):
            continue

        try:
            root = ET.parse(xml_path).getroot()
        except (ET.ParseError, OSError) as e:
            print(f"Lỗi khi xử lý {filename}: {e}")
            continue

        for obj in root.findall('object'):
            # findtext returns None when <name> is absent and '' when it is
            # empty; both collapse to an empty label that is ignored.
            label = (obj.findtext('name') or '').strip()
            if label:
                labels.add(label)

    return sorted(labels)

def create_class_mapping(labels):
    """Build label <-> index lookup tables with indices starting at 1.

    ``labels`` is consumed exactly once, so one-shot iterables (generators,
    iterators) work as well as lists. Repeated labels are collapsed to their
    first occurrence so that both tables always describe the same classes.

    Args:
        labels: Iterable of class label strings, in the desired index order.

    Returns:
        A ``(class_to_idx, idx_to_class)`` tuple. ``class_to_idx`` maps each
        label to an int index; ``idx_to_class`` maps the index, as a string,
        back to the label.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_labels = list(dict.fromkeys(labels))

    # Index 0 is left unused — presumably reserved for a background class;
    # confirm against the training code.
    class_to_idx = {label: idx for idx, label in enumerate(unique_labels, start=1)}
    idx_to_class = {str(idx): label for label, idx in class_to_idx.items()}
    return class_to_idx, idx_to_class

def save_mapping_to_json(class_to_idx, idx_to_class, output_file):
    """Write the class mapping to a UTF-8 JSON file.

    The file holds three keys: ``class_to_idx``, ``idx_to_class`` and
    ``num_classes`` (the number of entries in ``class_to_idx``). Missing
    parent directories of ``output_file`` are created first.

    Args:
        class_to_idx: Mapping from label to index.
        idx_to_class: Mapping from index to label.
        output_file: Destination path of the JSON file.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    mapping = {
        "class_to_idx": class_to_idx,
        "idx_to_class": idx_to_class,
        "num_classes": len(class_to_idx)
    }

    # open(..., 'w') does not create directories; a bare file name has an
    # empty dirname and needs nothing created.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII labels readable in the file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)

    print(f"Đã lưu file mapping tại: {output_file}")

if __name__ == "__main__":
    # Directory holding the XML annotation files; replace with your own path.
    # NOTE(review): this is not the processed_data folder used by the counting
    # cell — confirm the mapping is meant to be built from this directory.
    xml_directory = r"C:\Users\Admin\Downloads\archive\xmls"
    # Destination of the exported JSON mapping file.
    output_json = r"D:\xla v1\model_output\class_mapping\class_mapping_V1.json"

    labels = extract_labels_from_xml(xml_dir=xml_directory)
    class_to_idx, idx_to_class = create_class_mapping(labels=labels)
    save_mapping_to_json(
        class_to_idx=class_to_idx,
        idx_to_class=idx_to_class,
        output_file=output_json,
    )


Đã lưu file mapping tại: D:\xla v1\model_output\class_mapping\class_mapping_V1.json
