In [30]:
import os
import json
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split
import yaml
import warnings
warnings.filterwarnings('ignore')

# Cấu hình plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [17]:
df = pd.read_csv('../data/images/annotation.csv')

print("📊 THÔNG TIN DATASET GỐC")
print("=" * 60)
print(f"Tổng số annotations: {len(df):,}")
print(f"Tổng số ảnh unique: {df['file_name'].nunique():,}")
print(f"Tổng số classes: {df['category_id'].nunique()}")
print("\n📋 Columns trong dataset:")
print(df.columns.tolist())
print("\n🔍 Sample data (5 dòng đầu):")
df.head()

📊 THÔNG TIN DATASET GỐC
Tổng số annotations: 11,000
Tổng số ảnh unique: 4,500
Tổng số classes: 7

📋 Columns trong dataset:
['area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'box_id', 'file_name', 'height', 'width', 'street_id', 'supercategory']

🔍 Sample data (5 dòng đầu):


Unnamed: 0,area,iscrowd,image_id,bbox,category_id,box_id,file_name,height,width,street_id,supercategory
0,342,0,3,"[880, 333, 19, 18]",2,0,3.png,626,1622,3,Cấm dừng và đỗ
1,63,0,5,"[768, 480, 9, 7]",2,2,5.png,626,1622,4,Cấm dừng và đỗ
2,56,0,16,"[733, 352, 7, 8]",2,4,16.png,626,1622,3,Cấm dừng và đỗ
3,5400,0,17,"[1024, 160, 75, 72]",2,5,17.png,626,1622,8,Cấm dừng và đỗ
4,1927,0,18,"[1138, 295, 47, 41]",2,6,18.png,626,1622,3,Cấm dừng và đỗ


In [15]:
with open('../configs/zalo.yaml', 'r') as f:
    config = yaml.safe_load(f)

class_names = config['names']
print(f"Số lượng classes: {len(class_names)}")
print(f"Classes: {class_names}")

Số lượng classes: 7


In [12]:
train_images = glob.glob('../data/images/train/*.jpg') + glob.glob('../data/images/train/*.png')
val_images = glob.glob('../data/images/val/*.jpg') + glob.glob('../data/images/val/*.png')
test_images = glob.glob('../data/images/test/*.jpg') + glob.glob('../data/images/test/*.png')

print(f"📊 THỐNG KÊ DATASET")
print(f"{'='*50}")
print(f"Train images: {len(train_images)}")
# đang thiếu val images trong data
print(f"Val images: {len(val_images)}")
print(f"Test images: {len(test_images)}")
print(f"Total: {len(train_images) + len(val_images) + len(test_images)}")

📊 THỐNG KÊ DATASET
Train images: 4500
Val images: 0
Test images: 586
Total: 5086


In [18]:
print("\n📈 THỐNG KÊ THEO CLASS")
print("=" * 60)
class_stats = df.groupby('supercategory').agg({
    'box_id': 'count',
    'area': ['mean', 'std', 'min', 'max']
}).round(2)
class_stats.columns = ['Count', 'Mean_Area', 'Std_Area', 'Min_Area', 'Max_Area']
print(class_stats)


📈 THỐNG KÊ THEO CLASS
                 Count  Mean_Area  Std_Area  Min_Area  Max_Area
supercategory                                                  
Cấm còn lại       1787    1069.71   2547.62         9     45024
Cấm dừng và đỗ    2221    1368.00   3135.65         9     54526
Cấm ngược chiều   1416     538.40   1428.91         4     15006
Cấm rẽ             556    1705.58   3451.42        24     24402
Giới hạn tốc độ    949     828.65   1552.66         9     22363
Hiệu lệnh         1022     696.79   1928.02         6     27300
Nguy hiểm         3049    1164.60   2960.21         9     47736


In [19]:
print("\n📐 THỐNG KÊ KÍCH THƯỚC ẢNH")
print("=" * 60)
img_sizes = df[['file_name', 'width', 'height']].drop_duplicates()
print(f"Unique image sizes: {img_sizes[['width', 'height']].drop_duplicates().values.tolist()}")
print(f"Chiều rộng: {img_sizes['width'].unique()}")
print(f"Chiều cao: {img_sizes['height'].unique()}")


📐 THỐNG KÊ KÍCH THƯỚC ẢNH
Unique image sizes: [[1622, 626]]
Chiều rộng: [1622]
Chiều cao: [626]


In [32]:
# Lấy danh sách ảnh unique
image_files = df['file_name'].unique()
print(f"📂 Total unique images: {len(image_files)}")

# Split: 70% train, 20% val, 10% test
train_imgs, temp_imgs = train_test_split(
    image_files, test_size=0.3, random_state=42, shuffle=True
)
val_imgs, test_imgs = train_test_split(
    temp_imgs, test_size=0.33, random_state=42, shuffle=True
)

print(f"\n📊 DATASET SPLIT")
print("=" * 60)
print(f"🟦 Train: {len(train_imgs):,} images ({len(train_imgs)/len(image_files)*100:.1f}%)")
print(f"🟨 Val:   {len(val_imgs):,} images ({len(val_imgs)/len(image_files)*100:.1f}%)")
print(f"🟥 Test:  {len(test_imgs):,} images ({len(test_imgs)/len(image_files)*100:.1f}%)")

# Tạo dictionary để mapping
split_dict = {}
for img in train_imgs:
    split_dict[img] = 'train'
for img in val_imgs:
    split_dict[img] = 'val'
for img in test_imgs:
    split_dict[img] = 'test'

df['split'] = df['file_name'].map(split_dict)

📂 Total unique images: 4500

📊 DATASET SPLIT
🟦 Train: 3,150 images (70.0%)
🟨 Val:   904 images (20.1%)
🟥 Test:  446 images (9.9%)


In [38]:
def convert_bbox_to_yolo(bbox_str, img_width, img_height):
    """
    Convert COCO bbox [x_min, y_min, width, height] 
    to YOLO format [x_center, y_center, width, height] (normalized)
    """
    bbox = json.loads(bbox_str)
    x_min, y_min, bbox_width, bbox_height = bbox
    
    # Calculate center
    x_center = (x_min + bbox_width / 2) / img_width
    y_center = (y_min + bbox_height / 2) / img_height
    
    # Normalize width and height
    norm_width = bbox_width / img_width
    norm_height = bbox_height / img_height
    
    return x_center, y_center, norm_width, norm_height

# Tạo thư mục output
for split in ['train', 'val', 'test']:
    Path(f'../data/images/{split}').mkdir(parents=True, exist_ok=True)
    Path(f'../data/labels/{split}').mkdir(parents=True, exist_ok=True)

print("🔄 Converting annotations to YOLO format...")
print("=" * 60)

conversion_stats = {'train': 0, 'val': 0, 'test': 0}

for split in ['train', 'val', 'test']:
    split_df = df[df['split'] == split]
    
    for image_file, group in split_df.groupby('file_name'):
        img_width = group.iloc[0]['width']
        img_height = group.iloc[0]['height']
        
        # Tạo file .txt label
        label_file = Path(f'../data/labels/{split}') / f"{Path(image_file).stem}.txt"
        
        with open(label_file, 'w') as f:
            for _, row in group.iterrows():
                # Sử dụng category_id gốc từ dataset
                class_id = row['category_id']
                x_c, y_c, w, h = convert_bbox_to_yolo(
                    row['bbox'], img_width, img_height
                )
                f.write(f"{class_id} {x_c:.6f} {y_c:.6f} {w:.6f} {h:.6f}\n")
        
        conversion_stats[split] += 1

print("✅ Conversion completed!")
for split, count in conversion_stats.items():
    print(f"   {split.capitalize()}: {count} label files created")

# Verify một file mẫu
sample_label = Path('../data/labels/train').glob('*.txt').__next__()
print(f"\n📝 Sample label file ({sample_label.name}):")
print(sample_label.read_text()[:200] + "...")

🔄 Converting annotations to YOLO format...
✅ Conversion completed!
   Train: 3150 label files created
   Val: 904 label files created
   Test: 446 label files created

📝 Sample label file (10002.txt):
6 0.435265 0.555911 0.008631 0.022364
6 0.435573 0.555911 0.006782 0.019169
...


In [None]:
config = {
    'path': './data',
    'train': 'images/train',
    'val': 'images/val',
    'test': 'images/test',
    'nc': num_classes,
    'names': class_names  # YOLO format: {0: 'warning', 1: 'no parking/waiting', ...}
}

config_path = Path('../configs/zalo.yaml')
config_path.parent.mkdir(parents=True, exist_ok=True)

with open(config_path, 'w') as f:
    yaml.dump(config, f, default_flow_style=False, sort_keys=False)

print("✅ Config file created: configs/zalo.yaml")
print("\n📄 Config content:")
print(yaml.dump(config, default_flow_style=False))


✅ Config file created: configs/zalo.yaml

📄 Config content:
names:
  1: Cấm ngược chiều
  2: Cấm dừng và đỗ
  3: Cấm rẽ
  4: Giới hạn tốc độ
  5: Cấm còn lại
  6: Nguy hiểm
  7: Hiệu lệnh
nc: 7
path: ./data
test: images/test
train: images/train
val: images/val



In [43]:
split_class_stats = df.groupby(['split', 'class_name_en']).size().unstack(fill_value=0)

# Reorder columns by YOLO class ID
ordered_cols = [class_names[i] for i in sorted(class_names.keys())]
split_class_stats = split_class_stats[ordered_cols]

print("📊 CLASS DISTRIBUTION BY SPLIT")
print("=" * 80)
print(split_class_stats)
print("\n📈 Total per split:")
print(split_class_stats.sum(axis=1))
print("\n📈 Total per class:")
print(split_class_stats.sum(axis=0))

KeyError: 'class_name_en'