<a href="https://colab.research.google.com/github/Onedory/yolov5-art-analyze/blob/main/%EB%9E%9C%EB%8D%A4%ED%8F%AC%EB%A0%88%EC%8A%A4%ED%8A%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 데이터 전처리: 병렬 처리 및 GPU 사용

In [None]:
import torch
import cv2
import numpy as np
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the custom YOLOv5 checkpoint through torch.hub and place it on the GPU
# when CUDA is available; otherwise it stays on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = torch.hub.load(
    'ultralytics/yolov5',
    'custom',
    path='/content/drive/MyDrive/yolov5_checkpoints/yolov5_results8/weights/best.pt',
    trust_repo=True,
)
model = model.to(device)
# Switch to evaluation mode before the model is used for inference.
model.eval()

def extract_object_features(image_path):
    """Run the module-level detector on one image and return its detections.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list with one entry per detection, each of the form
        ``[x1, y1, width, height, confidence, class_id]`` where ``width`` and
        ``height`` are derived from the box corners and ``class_id`` is
        converted to ``int``. An empty list is returned (after printing a
        message) when the path does not exist or OpenCV cannot decode the file.
    """
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        return []

    bgr_image = cv2.imread(image_path)
    if bgr_image is None:
        print(f"Failed to load image: {image_path}")
        return []

    # OpenCV loads images as BGR; convert to RGB before handing to the model.
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # Run detection and pull the rows of results.xyxy[0] back to host memory
    # (presumably the detections for the single input image).
    detections = model(rgb_image).xyxy[0].cpu().numpy()

    # Turn each (x1, y1, x2, y2, confidence, class) row into a feature row
    # with the box expressed as top-left corner plus width/height.
    return [
        [left, top, right - left, bottom - top, score, int(label)]
        for left, top, right, bottom, score, label in (row[:6] for row in detections)
    ]

# Locate the training images and stop immediately if the directory is missing.
image_dir = '/content/drive/MyDrive/Data/train/images'
if not os.path.exists(image_dir):
    raise FileNotFoundError(f"Directory not found: {image_dir}")

# Build the full path of every entry whose name ends in '.jpg'. If listing the
# directory fails, report the error and carry on with an empty list.
try:
    image_files = [
        os.path.join(image_dir, entry)
        for entry in os.listdir(image_dir)
        if entry.endswith('.jpg')
    ]
except OSError as e:
    print(f"Error accessing directory {image_dir}: {e}")
    image_files = []

# Worker function that gathers the feature rows for a single image
def collect_features(image_file):
    """Extract features for one image without letting any error escape.

    Args:
        image_file: Path of the image to process.

    Returns:
        Whatever ``extract_object_features`` returns for the image, or an
        empty list if it raises; the error is printed rather than propagated
        so one bad image does not abort the whole run.
    """
    try:
        return extract_object_features(image_file)
    except Exception as e:
        print(f"Error processing {image_file}: {e}")
    return []

# Collect feature rows for every image with a pool of worker threads.
#
# Results are consumed in submission order (executor.map) over a sorted file
# list, so the row order of `features_list` depends neither on which thread
# finishes first nor on the order os.listdir happened to return. The later
# train/test split uses a fixed random_state, which only gives a repeatable
# split when the rows arrive in a stable order.
#
# NOTE(review): every worker ends up calling the same module-level `model`;
# confirm the YOLOv5 hub model tolerates concurrent calls from several threads.
features_list = []
if image_files:
    with ThreadPoolExecutor(max_workers=8) as executor:  # tune max_workers for the runtime
        ordered_results = executor.map(collect_features, sorted(image_files))
        for result in tqdm(ordered_results, total=len(image_files), desc="Extracting object features"):
            if result:  # skip images that produced no detections
                features_list.extend(result)

# Convert the collected rows into a 2-D array with six columns:
# [x1, y1, width, height, confidence, class_id].
if features_list:
    features_array = np.array(features_list)
    print(f"Extracted features array shape: {features_array.shape}")
else:
    # Keep the same column layout when nothing was extracted so the array is
    # still two-dimensional; its size is 0, as before.
    features_array = np.empty((0, 6))
    print("No features extracted.")


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-7-1 Python-3.10.12 torch-2.3.0+cu121 CUDA:0 (Tesla T4, 15102MiB)

Fusing layers... 
Model summary: 157 layers, 7139581 parameters, 0 gradients, 16.2 GFLOPs
Adding AutoShape... 
Extracting object features: 100%|██████████| 44799/44799 [1:09:00<00:00, 10.82it/s]


Extracted features array shape: (1195974, 6)


# 2. 랜덤 포레스트 모델 학습 및 모델 저장

In [None]:
pip install tqdm



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
from tqdm import tqdm

if features_array.size > 0:
    # Split the feature table into inputs and target. Each row is
    # [x1, y1, width, height, confidence, class_id]; the last column is the label.
    # NOTE(review): the label is the class id emitted by the detector, not a
    # ground-truth annotation - confirm this is the intended target.
    X = features_array[:, :-1]
    # The class ids were stored in a float array; cast them back to integers so
    # the model's classes and the report read 0, 1, 2, ... rather than 0.0, 1.0, ...
    y = features_array[:, -1].astype(int)

    # Hold out 20% of the rows for evaluation, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # n_jobs=-1 builds the trees on all available cores; the fitted forest is
    # still determined by random_state.
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

    # fit() is a single call, so a one-step progress bar adds nothing.
    print("Training RandomForestClassifier...")
    rf_clf.fit(X_train, y_train)

    # Predict the whole test set in one batched call instead of invoking
    # predict() once per row, which dominated the runtime.
    print("Predicting...")
    y_pred = rf_clf.predict(X_test)

    # Evaluate on the held-out rows.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Persist the fitted model to Drive.
    model_path = '/content/drive/MyDrive/random_forest_best_model.joblib'
    joblib.dump(rf_clf, model_path)
    print(f"Model saved to {model_path}")
else:
    print("No data available for training.")


Training RandomForestClassifier...


100%|██████████| 1/1 [10:00<00:00, 600.31s/it]


Predicting...


100%|██████████| 239195/239195 [17:30<00:00, 227.66it/s]


Accuracy: 0.5155
Classification Report:
              precision    recall  f1-score   support

         0.0       0.55      0.40      0.46      2365
         1.0       0.62      0.48      0.54      2314
         2.0       0.49      0.44      0.47      2471
         3.0       0.29      0.14      0.19      2560
         4.0       0.32      0.20      0.24      4199
         5.0       0.51      0.48      0.49      2351
         6.0       0.39      0.36      0.38      2463
         7.0       0.49      0.48      0.49      3629
         8.0       0.48      0.42      0.45      2748
         9.0       0.51      0.46      0.48      2328
        10.0       0.60      0.61      0.61      3142
        11.0       0.56      0.49      0.53      2770
        13.0       0.56      0.56      0.56      9809
        14.0       0.64      0.71      0.67      2230
        15.0       0.62      0.59      0.61      2413
        16.0       0.61      0.58      0.59      2545
        17.0       0.49      0.44      0.

In [None]:
# Mount Google Drive at /content/drive. The checkpoint, image directory and
# saved-model paths used elsewhere in this notebook all live under this mount
# point, so on a fresh runtime this cell has to be run before those cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
