In [None]:
# Cell 1: Import Libraries
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import yaml

# Plot configuration
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so the notebook also runs on older installs.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)
sns.set_palette("husl")

# Cell 2: Load Config
# Read the dataset config explicitly as UTF-8: the platform default encoding
# (e.g. cp1252 on Windows) can fail on or garble non-ASCII class names.
with open('../configs/zalo.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# 'names' holds the class names; it is indexed by integer class id below
class_names = config['names']
print(f"Số lượng classes: {len(class_names)}")
print(f"Classes: {class_names}")

# Cell 3: Count images per split
# Same suffixes that analyze_bbox_sizes probes when pairing labels with images,
# so '.jpeg' files are no longer missing from the counts.
IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')


def list_split_images(split_dir):
    """Return the paths of all image files directly inside ``split_dir``.

    Files are matched by the suffixes in ``IMAGE_EXTENSIONS`` and grouped in
    that order. A missing directory simply yields an empty list.
    """
    return [
        path
        for ext in IMAGE_EXTENSIONS
        for path in glob.glob(f'{split_dir}/*{ext}')
    ]


train_images = list_split_images('../data/images/train')
val_images = list_split_images('../data/images/val')
test_images = list_split_images('../data/images/test')

print(f"📊 THỐNG KÊ DATASET")
print(f"{'='*50}")
print(f"Train images: {len(train_images)}")
print(f"Val images: {len(val_images)}")
print(f"Test images: {len(test_images)}")
print(f"Total: {len(train_images) + len(val_images) + len(test_images)}")

# Cell 4: Class distribution analysis
def count_class_distribution(label_dir, num_classes=None):
    """Count annotated instances per class in a directory of label files.

    Every non-blank line of each ``*.txt`` file in ``label_dir`` is treated as
    one annotation whose first whitespace-separated token is the integer
    class id.

    Args:
        label_dir: Directory containing the ``*.txt`` label files.
        num_classes: Number of known classes; ids ``0 .. num_classes - 1``
            are counted. Defaults to ``len(class_names)`` (the module-level
            list loaded from the config).

    Returns:
        dict mapping each class id in ``range(num_classes)`` to its instance
        count (0 for classes that never appear).

    Raises:
        ValueError: if the first token of a non-blank line is not an integer.
    """
    import warnings

    if num_classes is None:
        num_classes = len(class_names)
    class_counts = {class_id: 0 for class_id in range(num_classes)}

    for label_file in glob.glob(f'{label_dir}/*.txt'):
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.split()
                if not parts:
                    # Blank line: indexing parts[0] used to raise IndexError
                    continue

                class_id = int(parts[0])
                if class_id not in class_counts:
                    # Report ids outside the configured range instead of
                    # aborting the whole analysis with a bare KeyError
                    warnings.warn(
                        f"{label_file}: class id {class_id} is outside "
                        f"0..{num_classes - 1}; annotation skipped",
                        stacklevel=2,
                    )
                    continue

                class_counts[class_id] += 1

    return class_counts

# Tally instances per class for the train and val splits
train_counts = count_class_distribution('../data/labels/train')
val_counts = count_class_distribution('../data/labels/val')

# Assemble one row per class id, with both splits side by side
class_ids = range(len(class_names))
df_train = pd.DataFrame({
    'Class': [class_names[cid] for cid in class_ids],
    'Train': [train_counts[cid] for cid in class_ids],
    'Val': [val_counts[cid] for cid in class_ids],
})

print("\n📈 PHÂN BỐ CLASSES:")
print(df_train)

# Cell 5: Plot the class distribution
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One bar chart per split: (axis, DataFrame column, fill color, edge color, title)
split_panels = [
    (axes[0], 'Train', 'skyblue', 'navy', 'Class Distribution - Training Set'),
    (axes[1], 'Val', 'lightcoral', 'darkred', 'Class Distribution - Validation Set'),
]
for ax, column, fill_color, edge_color, title in split_panels:
    ax.bar(df_train['Class'], df_train[column], color=fill_color, edgecolor=edge_color)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Class Name', fontsize=12)
    ax.set_ylabel('Number of Instances', fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories; make sure ../docs exists first
os.makedirs('../docs', exist_ok=True)
plt.savefig('../docs/class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Đã lưu: docs/class_distribution.png")

# Cell 6: Bounding-box size analysis
def analyze_bbox_sizes(label_dir, img_dir):
    """Collect pixel widths, heights and areas of all annotated boxes.

    For every ``*.txt`` file in ``label_dir`` the image with the same stem is
    looked up in ``img_dir`` (trying ``.jpg``, ``.png``, ``.jpeg`` in that
    order). Label files without a matching image are skipped. Each label line
    with at least five whitespace-separated fields is read as
    ``class x_center y_center width height``; ``width`` and ``height`` are
    treated as fractions of the image size and scaled to pixels.

    Args:
        label_dir: Directory containing the ``*.txt`` label files.
        img_dir: Directory containing the corresponding images.

    Returns:
        Tuple ``(bbox_widths, bbox_heights, bbox_areas)`` of three parallel
        lists of floats, one entry per annotation.

    Raises:
        ValueError: if one of the first five fields of a line is not numeric.
    """
    bbox_widths = []
    bbox_heights = []
    bbox_areas = []

    for label_file in glob.glob(f'{label_dir}/*.txt'):
        # Find the image that shares the label file's stem
        img_name = Path(label_file).stem
        candidates = (f'{img_dir}/{img_name}{ext}' for ext in ('.jpg', '.png', '.jpeg'))
        img_path = next((path for path in candidates if os.path.exists(path)), None)
        if img_path is None:
            continue

        # Only the dimensions are needed; the context manager closes the file
        # handle right away instead of leaking one per image.
        with Image.open(img_path) as img:
            img_w, img_h = img.size

        # Read the annotations
        with open(label_file, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 5:
                    continue
                # All five fields are still parsed so malformed numbers fail
                # loudly; only width and height are used.
                _, _, _, width, height = map(float, parts[:5])

                # Convert normalized size to pixels
                bbox_w = width * img_w
                bbox_h = height * img_h

                bbox_widths.append(bbox_w)
                bbox_heights.append(bbox_h)
                bbox_areas.append(bbox_w * bbox_h)

    return bbox_widths, bbox_heights, bbox_areas

# Measure every training bbox, then summarise each dimension as mean / std
train_widths, train_heights, train_areas = analyze_bbox_sizes(
    '../data/labels/train',
    '../data/images/train',
)

width_mean, width_std = np.mean(train_widths), np.std(train_widths)
height_mean, height_std = np.mean(train_heights), np.std(train_heights)
area_mean, area_std = np.mean(train_areas), np.std(train_areas)

print(f"\n📏 THỐNG KÊ KÍCH THƯỚC BBOX (Training):")
print(f"{'='*50}")
print(f"Width  - Mean: {width_mean:.1f}px, Std: {width_std:.1f}px")
print(f"Height - Mean: {height_mean:.1f}px, Std: {height_std:.1f}px")
print(f"Area   - Mean: {area_mean:.1f}px², Std: {area_std:.1f}px²")

# Cell 7: Plot the bbox size distributions
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Histogram panels: (axis, values, bar color, title, x-axis label, unit suffix)
hist_panels = [
    (axes[0, 0], train_widths, 'skyblue', 'Bbox Width Distribution', 'Width (pixels)', 'px'),
    (axes[0, 1], train_heights, 'lightcoral', 'Bbox Height Distribution', 'Height (pixels)', 'px'),
    (axes[1, 0], train_areas, 'lightgreen', 'Bbox Area Distribution', 'Area (pixels²)', 'px²'),
]
for ax, values, bar_color, title, x_label, unit in hist_panels:
    # Computed once; reused for the marker line and its legend label
    mean_value = np.mean(values)
    ax.hist(values, bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    ax.axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.1f}{unit}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(alpha=0.3)

# Scatter: Width vs Height
scatter_ax = axes[1, 1]
scatter_ax.scatter(train_widths, train_heights, alpha=0.3, s=10, color='purple')
scatter_ax.set_title('Bbox Width vs Height', fontsize=12, fontweight='bold')
scatter_ax.set_xlabel('Width (pixels)')
scatter_ax.set_ylabel('Height (pixels)')
scatter_ax.grid(alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories; make sure ../docs exists first
os.makedirs('../docs', exist_ok=True)
plt.savefig('../docs/bbox_size_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Đã lưu: docs/bbox_size_distribution.png")

# Cell 8: Summary Report
print("\n" + "="*60)
print("📋 TỔNG KẾT EDA")
print("="*60)
print(f"✅ Dataset đã được phân tích thành công!")
print(f"✅ Tổng số classes: {len(class_names)}")
# Include the test split so this figure matches the "Total" printed in Cell 3
print(f"✅ Tổng số ảnh: {len(train_images) + len(val_images) + len(test_images)}")
print(f"✅ Đã tạo 2 biểu đồ phân tích trong thư mục docs/")
print(f"\n💡 Gợi ý cho training:")

# Imbalance ratio between the most and least frequent class that actually
# occurs; skipped when no class has any instance (min() of an empty list
# would raise ValueError).
nonzero_counts = [count for count in train_counts.values() if count > 0]
if nonzero_counts and max(nonzero_counts) / min(nonzero_counts) > 5:
    print("⚠️  Dataset có class imbalance! Cân nhắc dùng weighted loss")

# Only judge box size when boxes were found; np.mean([]) is NaN and emits a
# RuntimeWarning.
if train_areas and np.mean(train_areas) < 1000:
    print("⚠️  Bbox nhỏ! Nên dùng SAHI cho inference")
print("="*60)


: 