Khai phá và phân tích dữ liệu

In [2]:
import os
import random
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from collections import Counter
import glob
import pandas as pd
from roboflow import Roboflow
import json
import numpy as np

Phân tích bộ dữ liệu gốc

In [None]:
# Download the original dataset from Roboflow.
#
# The API key is a secret and must not be stored in the notebook. It is read
# from the ROBOFLOW_API_KEY environment variable instead.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked/rotated in the Roboflow account.

from roboflow import Roboflow

roboflow_api_key = os.environ.get("ROBOFLOW_API_KEY")
if not roboflow_api_key:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell."
    )

rf = Roboflow(api_key=roboflow_api_key)
project = rf.workspace("yololam").project("sh17_original")
version = project.version(1)
dataset = version.download("yolov8")

In [None]:
# Dataset path
# Rename this so it matches the folder the dataset was downloaded into.

BASE_PATH = "./SH17_Original-1"
# Quick sanity check: show what the dataset folder contains.
print(os.listdir(BASE_PATH))

In [52]:
# Locations inside the original dataset folder (all relative to BASE_PATH).
# Images and YOLO label files of the training split:
IMAGES_PATH = os.path.join(BASE_PATH, "train/images")
LABELS_PATH = os.path.join(BASE_PATH, "train/labels")
# Paths to split listing files; they are not read by any cell visible here
# (presumably one file name per line — TODO confirm they exist in the download).
TRAIN_FILES = os.path.join(BASE_PATH, "train_files.txt")
VAL_FILES = os.path.join(BASE_PATH, "val_files.txt")

In [53]:
def list_files_in_folder(folder_path, extension):
    """Return the sorted paths of all files in a folder with a given extension.

    Args:
        folder_path: Directory to search (not recursive). It is taken
            literally: glob metacharacters such as ``[``, ``*`` or ``?`` in
            the directory name are escaped and do not act as wildcards.
        extension: File extension without the leading dot, e.g. ``'txt'``.

    Returns:
        A sorted list of matching paths; empty if nothing matches.
    """
    # Escape only the folder part so that a directory like "run[1]" is matched
    # by name instead of being interpreted as a character class.
    pattern = os.path.join(glob.escape(folder_path), f"*.{extension}")
    return sorted(glob.glob(pattern))

def load_metadata(metadata_path):
    """Load a JSON metadata file and return the decoded object.

    Args:
        metadata_path: Path to the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # locale encoding (which breaks non-ASCII text on some systems).
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_label(label_path):
    """Read a YOLO label file and split every annotation line into fields.

    Args:
        label_path: Path to the ``.txt`` label file.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        whitespace-separated string fields of that line (the first field is
        the class id). Blank or whitespace-only lines are skipped, so an
        empty file yields ``[]``.
    """
    with open(label_path, 'r') as f:
        # str.split() with no argument already ignores surrounding whitespace;
        # dropping empty results keeps callers from indexing into [].
        return [fields for fields in (line.split() for line in f) if fields]

In [54]:
# Tần suất xuất hiện của các lớp trong tập dữ liệu

def analyze_class_distribution(labels_folder):
    """Summarise how often each class occurs in a folder of YOLO label files.

    Every ``.txt`` file in ``labels_folder`` is parsed. For each class id two
    numbers are collected: the total number of annotations carrying that id,
    and the number of label files in which the id appears at least once.

    Returns:
        A DataFrame sorted by ``Class_ID`` (as integers, fresh 0..n-1 index)
        with the columns ``Class_ID``, ``Count Instances`` and ``Count Images``.
    """
    instances_per_class = Counter()
    images_per_class = Counter()

    for label_path in list_files_in_folder(labels_folder, 'txt'):
        ids_in_file = [fields[0] for fields in load_label(label_path)]
        # Every annotation counts as one instance ...
        instances_per_class.update(ids_in_file)
        # ... but a file contributes at most once per class to the image count.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        images_per_class.update(dict.fromkeys(ids_in_file))

    def _counter_to_frame(counter, count_column):
        # One row per class id; ids are read from the files as strings, so
        # convert them to integers before sorting.
        frame = pd.DataFrame(counter.items(), columns=['Class_ID', count_column])
        frame['Class_ID'] = frame['Class_ID'].astype(int)
        return frame.sort_values('Class_ID')

    instance_frame = _counter_to_frame(instances_per_class, 'Count Instances')
    image_frame = _counter_to_frame(images_per_class, 'Count Images')

    # Join both views on the class id, then order and re-index the result.
    merged = instance_frame.merge(image_frame, on='Class_ID', how='outer')
    return merged.sort_values('Class_ID').reset_index(drop=True)

In [None]:
# Per-class instance and image counts for the training labels of the original
# dataset; the last expression previews the first rows.
class_df = analyze_class_distribution(LABELS_PATH)
class_df.head()

In [56]:
# Vẽ sơ đồ phân phối lớp

def plot_class_distribution(df, class_names, figsize=(12, 6)):
    """Draw a bar chart of per-class counts, annotated with each class's share.

    Args:
        df: DataFrame with a ``Class_ID`` column and a numeric ``Count``
            column. The frame is left untouched; derived columns are added to
            a copy.
        class_names: Sequence (or mapping) indexed by integer class id that
            yields the label shown on the x axis. Ids without an entry are
            shown as the id itself.
        figsize: ``(width, height)`` of the figure in inches.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    def _display_name(class_id):
        # Translate a numeric class id into its human-readable name. Label
        # files may contain ids the supplied names do not cover; show the raw
        # id in that case instead of aborting the whole plot.
        idx = int(class_id)
        try:
            return class_names[idx]
        except (IndexError, KeyError):
            return str(idx)

    total = df['Count'].sum()

    # Work on a copy (assign returns a new frame) so the caller's DataFrame
    # does not silently gain extra columns; bars are ordered largest first.
    plot_df = df.assign(
        Percentage=((df['Count'] / total) * 100).round(1),
        Class_Name=df['Class_ID'].map(_display_name),
    ).sort_values('Count', ascending=False)

    # Create the plot
    fig, ax = plt.subplots(figsize=figsize)
    colors = sns.color_palette("husl", len(plot_df))
    # Passing `palette` without `hue` is deprecated in recent seaborn releases;
    # colouring by the x variable with dodge=False gives the same one-colour-
    # per-bar result across versions.
    sns.barplot(
        x='Class_Name',
        y='Count',
        hue='Class_Name',
        data=plot_df,
        palette=colors,
        dodge=False,
        ax=ax,
    )
    # The hue legend would only repeat the x tick labels.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Add percentage labels slightly above the bars (offset scales with total)
    for i, row in enumerate(plot_df.itertuples()):
        ax.text(
            i,
            row.Count + total * 0.003,
            f"{row.Percentage}%",
            ha='center',
            fontsize=10,
            color='black'
        )

    # Add labels and title
    ax.set_xlabel("Categories", fontsize=12)
    ax.set_ylabel("Instances", fontsize=12)
    ax.set_title("Class Distribution with Percentages", fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
# Display names for the classes; plot_class_distribution looks a name up by
# integer class id, so position i must be the name of class id i
# (presumably the same order as the dataset's class list — verify).
class_names = ['ear', 'ear-mufs', 'face', 'face-guard', 'face-mask', 'foot', 'glasses', 
               'gloves', 'hands', 'head', 'helmet', 'medical-suit', 'no-gloves', 'no-helmet', 
               'no-safety-vest', 'person', 'safety-suit', 'safety-vest', 'shoes', 'tool']
# The plot function expects a 'Count' column; plot the per-instance counts.
plot_class_distribution(class_df.rename(columns={'Count Instances': 'Count'}), class_names)

In [50]:
# Hiển thị một ảnh bất kỳ kèm bounding box

def visualize_annotations(image_path, label_path, title=''):
    """Display an image once, with all of its YOLO bounding boxes drawn on it.

    Args:
        image_path: Path to the image file.
        label_path: Path to the matching YOLO label file. Each annotation is
            read as ``class_id x_center y_center width height`` with the four
            box values relative to the image size.
        title: Optional figure title.

    Raises:
        FileNotFoundError: If the image cannot be read.
        ValueError: If an annotation does not have exactly four box values.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Image size does not change while drawing; read it once.
    h, w = img.shape[:2]

    for annotation in load_label(label_path):
        if not annotation:
            # Blank line in the label file.
            continue

        class_id = int(annotation[0])
        x_center, y_center, width, height = map(float, annotation[1:])

        # Convert YOLO format (relative centre/size) to pixel corner coordinates
        xmin = int((x_center - width / 2) * w)
        ymin = int((y_center - height / 2) * h)
        xmax = int((x_center + width / 2) * w)
        ymax = int((y_center + height / 2) * h)

        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2)

        # Keep the class id readable even when the box touches the top edge.
        text_y = max(ymin - 10, 15)
        cv2.putText(img, str(class_id), (xmin, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # Render a single figure after every box has been drawn (also when the
    # label file contains no annotations at all).
    plt.figure(figsize=(8, 8))
    plt.imshow(img)
    plt.title(title)
    plt.axis('off')
    plt.show()

In [None]:
# One hand-picked training image and the label file with the same base name.
sample_image = f'{IMAGES_PATH}/pexels-photo-824300_jpeg.rf.25eff5b732dcf80b0d854bf8756dc774.jpg'
sample_label = f'{LABELS_PATH}/pexels-photo-824300_jpeg.rf.25eff5b732dcf80b0d854bf8756dc774.txt'
visualize_annotations(sample_image, sample_label)

Phân tích bộ dữ liệu sau khi được xử lý

In [None]:
# Download the processed dataset from Roboflow.
#
# The API key is a secret and must not be stored in the notebook. It is read
# from the ROBOFLOW_API_KEY environment variable instead.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked/rotated in the Roboflow account.

from roboflow import Roboflow

roboflow_api_key = os.environ.get("ROBOFLOW_API_KEY")
if not roboflow_api_key:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell."
    )

rf = Roboflow(api_key=roboflow_api_key)
project = rf.workspace("yololam").project("sh17_violence")
version = project.version(9)
dataset = version.download("yolov8")

In [None]:
# Dataset path of the processed dataset.
# Rename this so it matches the folder the dataset was downloaded into.

BASE_PATH_2 = "./SH17_Violence.v9-dataset-original-1.yolov8"
# List the processed dataset's folder (not the original dataset's BASE_PATH).
print(os.listdir(BASE_PATH_2))

In [None]:
# Locations inside the processed dataset folder. These must be built from
# BASE_PATH_2; using BASE_PATH would silently re-analyse the original dataset.
IMAGES_PATH_2 = os.path.join(BASE_PATH_2, "train/images")
LABELS_PATH_2 = os.path.join(BASE_PATH_2, "train/labels")
TRAIN_FILES_2 = os.path.join(BASE_PATH_2, "train_files.txt")
VAL_FILES_2 = os.path.join(BASE_PATH_2, "val_files.txt")

In [None]:
# Per-class instance and image counts for the labels under LABELS_PATH_2;
# the last expression previews the first rows.
class_df_2 = analyze_class_distribution(LABELS_PATH_2)
class_df_2.head()

In [None]:
# Display names indexed by class id. This repeats the list used for the
# original dataset (assumes the processed dataset keeps the same classes in
# the same order — TODO confirm against its class list).
class_names = ['ear', 'ear-mufs', 'face', 'face-guard', 'face-mask', 'foot', 'glasses', 
               'gloves', 'hands', 'head', 'helmet', 'medical-suit', 'no-gloves', 'no-helmet', 
               'no-safety-vest', 'person', 'safety-suit', 'safety-vest', 'shoes', 'tool']
# The plot function expects a 'Count' column; plot the per-instance counts.
plot_class_distribution(class_df_2.rename(columns={'Count Instances': 'Count'}), class_names)

Phân tích mô hình được huấn luyện

In [1]:
import cv2
import matplotlib.pyplot as plt
import os
from ultralytics import YOLO
from IPython.display import Image
import random

In [3]:
def visualize_yolo_result(results, image_path, class_names=None, conf_threshold=0.5):
    """Draw the detections of a prediction result on its image and show it.

    Args:
        results: Prediction output; only ``results[0]`` is used (its ``boxes``
            and, when ``class_names`` is not given, its ``names``).
        image_path: Path of the image the prediction was run on.
        class_names: Optional sequence or mapping from class id to name.
            Defaults to ``results[0].names``. Ids without an entry are
            labelled with the numeric id.
        conf_threshold: Detections with a confidence below this value are
            not drawn.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    detections = results[0].boxes  # boxes of the first result

    if class_names is None:
        # Use the model's id -> name mapping as-is. Flattening it into a list
        # would shift the names if the ids were not contiguous from 0.
        class_names = results[0].names

    def _class_label(class_id):
        # Fall back to the numeric id when no name is known for it.
        try:
            return str(class_names[class_id])
        except (IndexError, KeyError):
            return str(class_id)

    # One random colour per class, created the first time the class is drawn.
    box_color_map = {}

    for box in detections:
        conf = float(box.conf[0])
        if conf < conf_threshold:
            continue

        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        class_id = int(box.cls[0])

        if class_id not in box_color_map:
            box_color_map[class_id] = tuple(random.randint(0, 255) for _ in range(3))
        color = box_color_map[class_id]

        label = f"{_class_label(class_id)}: {conf*100:.1f}%"

        # Draw the box
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

        # Write the label on a filled background. Normally it sits just above
        # the box; for boxes at the top edge it is moved down so it stays
        # inside the image.
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        label_bottom = max(y1, h + 8)
        cv2.rectangle(img, (x1, label_bottom - h - 8), (x1 + w, label_bottom), color, -1)
        cv2.putText(img, label, (x1, label_bottom - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    (255, 255, 255), 1, cv2.LINE_AA)

    # OpenCV images are BGR; convert to RGB for matplotlib
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(10, 8))
    plt.imshow(img_rgb)
    plt.axis('off')
    plt.title("YOLO Detection Visualization")
    plt.show()

In [4]:
# Predict on an image
# Path (relative to the notebook's working directory) of the image used below.

test_image_path = 'test_dataset/background-check-2739233_1280.jpg'

In [21]:
# Load the model from the weights file ./test.pt (presumably the trained checkpoint — confirm).
model = YOLO('./test.pt')

In [None]:
# Run the model on the test image with a confidence cut-off of 0.6 and an IoU
# setting of 0.5. save=True / save_txt=True presumably make the library write
# the annotated image and a text file of the detections to disk — TODO confirm
# the output location.
results = model.predict(source=test_image_path, conf=0.6, iou=0.5, show=False, save=True, save_txt=True)

# A single image was passed, so only the first result is inspected.
result = results[0]

# Show the class indices:
print(result.boxes.cls)

# NOTE(review): this rebinds the notebook-level `class_names` (a list in the
# earlier cells) to the model's id -> name lookup; re-running earlier plotting
# cells after this one will see the new value.
class_names = result.names
class_ids = result.boxes.cls.int().tolist()
labels = [class_names[i] for i in class_ids]
print(labels)

# Draw the detections on the image. The function's own default threshold (0.5)
# is used here, which is below the 0.6 passed to predict() above.
visualize_yolo_result(results, test_image_path)