<a href="https://colab.research.google.com/github/Pongaaa/ComputerVision/blob/main/baitap3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the Colab session can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Define the dataset paths (training and test image folders on the mounted Drive)
train_path = '/content/drive/My Drive/HoaVietNam/train'
test_path = '/content/drive/My Drive/HoaVietNam/test'

In [None]:
import os
from itertools import product

import cv2
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Extract a flattened, normalized 3-D HSV color histogram (16x16x16 bins by default)
def extract_histogram(image_path, bins=(16, 16, 16)):
    """Compute a normalized HSV color histogram for the image at ``image_path``.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.
        bins: Number of histogram bins for the H, S and V channels. The
            default ``(16, 16, 16)`` yields a 4096-element vector.

    Returns:
        A 1-D array with ``bins[0] * bins[1] * bins[2]`` elements. When the
        image cannot be loaded, a warning is printed and an all-zero vector of
        that same length is returned.
    """
    image = cv2.imread(image_path)
    if image is None or image.size == 0:
        print(f"Warning: Could not load image at {image_path}")
        # Size the placeholder from `bins` so it always matches the length of a
        # real feature vector instead of relying on a separately hard-coded 4096.
        return np.zeros(int(np.prod(bins)))

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Histogram ranges: hue over [0, 180), saturation and value over [0, 256).
    hist = cv2.calcHist([hsv], [0, 1, 2], None, list(bins), [0, 180, 0, 256, 0, 256])
    # Normalize in place (source and destination are the same array).
    cv2.normalize(hist, hist)
    return hist.flatten()

In [None]:
# Build the feature matrix and label vector from a folder of per-class image directories
def extract_features_from_folder(folder_path):
    """Extract one histogram feature vector per image found under ``folder_path``.

    Every immediate sub-directory of ``folder_path`` is treated as one class,
    and its name is used as the label of each image inside it. Only files
    ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are read.

    Args:
        folder_path: Directory whose immediate sub-directories are the classes.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the vectors returned
        by ``extract_histogram`` and ``y`` holds the matching class names.
        Both are empty arrays when the folder is missing or not a directory,
        has no class sub-directories, or yields no usable image.
    """
    # isdir (rather than exists) also rejects a path pointing at a regular file,
    # which would otherwise make os.listdir raise below.
    if not os.path.isdir(folder_path):
        print(f"Folder {folder_path} does not exist or is not a directory")
        return np.array([]), np.array([])

    class_labels = sorted(
        d for d in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, d))
    )
    if not class_labels:
        print(f"No valid class directories found in {folder_path}")
        return np.array([]), np.array([])

    X, y = [], []
    for label in tqdm(class_labels, desc="Processing classes"):
        class_path = os.path.join(folder_path, label)
        # os.listdir returns entries in arbitrary order; sort them so the sample
        # order (and anything downstream that depends on it) is reproducible.
        image_names = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )

        for image_name in image_names:
            features = extract_histogram(os.path.join(class_path, image_name))
            # An all-zero vector is how extract_histogram reports an unreadable image.
            if np.any(features):
                X.append(features)
                y.append(label)

    if not X:
        print(f"No valid features extracted from {folder_path}")
        return np.array([]), np.array([])

    return np.array(X), np.array(y)

In [None]:
# Keep only the hyper-parameter combinations whose penalty/solver pair is compatible
def filter_valid_params(param_grid):
    """Expand ``param_grid`` and drop incompatible penalty/solver pairs.

    Args:
        param_grid: Mapping with the keys ``'penalty'`` and ``'solver'``, each
            holding a list of candidate values. Any other key (for example
            ``'C'``) is optional and is expanded as an extra grid dimension.

    Returns:
        A list of dicts, one per retained combination, each mapping every key
        of ``param_grid`` to a single value. Keys appear as ``'penalty'``,
        ``'solver'``, then the remaining keys in their original order.
        Solvers not listed in the table below are passed through unfiltered.
    """
    # Penalties each LogisticRegression solver accepts.
    # NOTE: 'elasticnet' additionally needs an l1_ratio at fit time; that is not checked here.
    supported_penalties = {
        'lbfgs': ('l2', None),
        'newton-cg': ('l2', None),
        'newton-cholesky': ('l2', None),
        'sag': ('l2', None),
        'liblinear': ('l1', 'l2'),
        'saga': ('l1', 'l2', 'elasticnet', None),
    }

    # Everything besides penalty/solver is optional, so a grid without 'C' works too.
    extra_keys = [key for key in param_grid if key not in ('penalty', 'solver')]
    extra_values = [param_grid[key] for key in extra_keys]

    valid_combinations = []
    for penalty, solver, *extras in product(
        param_grid['penalty'], param_grid['solver'], *extra_values
    ):
        allowed = supported_penalties.get(solver)
        if allowed is not None and penalty not in allowed:
            continue
        combination = {'penalty': penalty, 'solver': solver}
        combination.update(zip(extra_keys, extras))
        valid_combinations.append(combination)

    return valid_combinations

In [None]:
# Extract features and labels for the training and test folders
print("Extracting training features...")
X_train, y_train = extract_features_from_folder(train_path)
print("Extracting test features...")
X_test, y_test = extract_features_from_folder(test_path)

Extracting training features...


Processing classes: 100%|██████████| 5/5 [00:01<00:00,  2.55it/s]


Extracting test features...


Processing classes: 100%|██████████| 5/5 [00:00<00:00,  8.32it/s]


In [None]:
# Convert the class labels to numeric codes.
# The encoder is fitted on the training labels only and then reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Sanity-check the extracted data: sample counts and the label set of each split
train_labels = np.unique(y_train)
test_labels = np.unique(y_test)
print("Training samples:", len(X_train))
print("Test samples:", len(X_test))
print("Unique labels in training set:", len(train_labels), train_labels)
print("Unique labels in test set:", len(test_labels), test_labels)

Training samples: 150
Test samples: 50
Unique labels in training set: 5 ['Cuc' 'Dao' 'Lan' 'Mai' 'Tho']
Unique labels in test set: 5 ['Cuc' 'Dao' 'Lan' 'Mai' 'Tho']


In [None]:
# Inspect how many training samples each encoded class has
unique, counts = np.unique(y_train_encoded, return_counts=True)
class_distribution = {cls: n for cls, n in zip(unique, counts)}
print("Class distribution:", class_distribution)
# Flag classes too small to be spread reliably across the cross-validation folds.
if (counts < 5).any():
    print("Warning: Some classes have too few samples for reliable CV")

Class distribution: {0: 30, 1: 30, 2: 30, 3: 30, 4: 30}


In [None]:
# Hyper-parameters to tune.
# LogisticRegression accepts only certain penalty/solver pairs, so the grid is a list of
# sub-grids that contain compatible pairs only. A plain Cartesian product of every penalty
# with every solver makes the incompatible fits fail and get a NaN score.
param_grid = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2', None]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'solver': ['saga'], 'penalty': ['l1', 'l2', None]},
    # elasticnet is supported by saga only and needs an explicit l1_ratio
    # (0.5 = equal mix of L1 and L2).
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
]

# Train Logistic Regression with GridSearchCV, scored by macro-averaged F1 over 3 CV folds
logreg = LogisticRegression(max_iter=5000, random_state=42)
grid_search = GridSearchCV(
    logreg,
    param_grid,
    scoring=make_scorer(f1_score, average='macro', zero_division=0),
    cv=3,
    n_jobs=-1,
)

print("Fitting model...")
grid_search.fit(X_train, y_train_encoded)

Fitting model...


27 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [None]:
# Print the best hyper-parameters and their cross-validation score
print("Best parameters:", grid_search.best_params_)
print("Best macro-F1 score:", grid_search.best_score_)

# Evaluate on the test set with the same macro-averaged F1 metric
y_pred = grid_search.predict(X_test)
test_f1 = f1_score(y_test_encoded, y_pred, average='macro', zero_division=0)
print("Macro-F1 score on test set:", test_f1)


Best parameters: {'penalty': 'l2', 'solver': 'sag'}
Best macro-F1 score: 0.7615725161155232
Macro-F1 score on test set: 0.860228832951945
