In [1]:
import os
from pathlib import Path

import numpy as np
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.transform import resize
from skimage.feature import hog

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import joblib

In [2]:
def extract_hog_features(path: Path) -> np.ndarray:
    """Read an image from disk and return its HOG descriptor.

    The image is converted to grayscale when it has a channel axis, resized
    to the module-level ``IMAGE_SIZE`` and handed to ``skimage.feature.hog``
    with the module-level ``HOG_PARAMS``.

    Args:
        path: Location of the image file to read.

    Returns:
        The array produced by ``hog`` for the resized grayscale image.
    """
    img = imread(path)
    if img.ndim == 3:
        if img.shape[-1] == 4:
            # rgb2gray only accepts 3-channel input in current scikit-image
            # releases; discard the alpha channel so RGBA files are processed
            # instead of raising (and being skipped by the caller).
            img = img[..., :3]
        gray = rgb2gray(img)
    else:
        # Already single-channel: use as is.
        gray = img
    gray = resize(gray, IMAGE_SIZE, anti_aliasing=True)
    return hog(gray, **HOG_PARAMS)

def load_split(split: str):
    """Build the HOG feature matrix and label vector for one dataset split.

    For every class name in ``LABELS`` the files matching ``*.*`` under
    ``DATA_DIR / split / <label>`` are passed through
    ``extract_hog_features``. Files that fail to load are reported on stdout
    and skipped.

    Args:
        split: Name of the split sub-directory, e.g. ``'Train'``.

    Returns:
        A tuple ``(X, y)``: ``X`` stacks one feature vector per row and ``y``
        is an array with the matching label strings.

    Raises:
        ValueError: If not a single image of the split could be loaded.
    """
    X, y = [], []
    for label in LABELS:
        folder = DATA_DIR / split / label
        # glob() yields entries in filesystem order; sorting keeps the row
        # order (and anything seeded downstream) stable across runs.
        img_paths = sorted(folder.glob('*.*'))
        for img_path in tqdm(img_paths, desc=f'Loading {split}/{label}'):
            try:
                X.append(extract_hog_features(img_path))
                y.append(label)
            except Exception as e:
                # Best effort: one unreadable file must not abort the load.
                print(f'-- error reading {img_path}: {e}')
    if not X:
        # Fail with an actionable message instead of the opaque error
        # np.vstack raises on an empty list (e.g. wrong DATA_DIR or split).
        raise ValueError(
            f'no images could be loaded for split {split!r} under {DATA_DIR / split}'
        )
    return np.vstack(X), np.array(y)

def objective(trial):
    """Optuna objective: validation accuracy of an incrementally grown forest.

    Hyper-parameters are sampled from ``trial``; the forest is then fitted in
    stages of 50, 100, 150 and 200 trees. After each stage the accuracy on
    the validation data is reported to the trial so the study's pruner can
    stop the trial early.

    Reads the notebook globals ``X_train``, ``y_train_enc``, ``X_val`` and
    ``y_val_enc``.

    Args:
        trial: The Optuna trial to sample parameters from and report to.

    Returns:
        Validation accuracy measured after the last stage (200 trees).

    Raises:
        optuna.TrialPruned: When ``trial.should_prune()`` is true after a
            stage has been reported.
    """
    # Search space, sampled in a fixed order.
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # None is left out of the choices to keep training time down.
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    forest = RandomForestClassifier(
        n_estimators=10,  # placeholder; raised stage by stage below
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        warm_start=True,  # lets repeated fit() calls grow the same forest
        n_jobs=1,
    )

    # Incremental training with a pruning check after every stage.
    for n_trees in (50, 100, 150, 200):
        forest.set_params(n_estimators=n_trees)
        forest.fit(X_train, y_train_enc)
        val_accuracy = accuracy_score(y_val_enc, forest.predict(X_val))
        trial.report(val_accuracy, n_trees)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Keep the fully grown model on the trial so it can be fetched later.
    trial.set_user_attr("model", forest)
    return val_accuracy

In [3]:
# Dataset root; splits and class folders are resolved relative to it.
DATA_DIR = Path('/kaggle/input') / 'face-mask-12k-images-dataset' / 'Face Mask Dataset'

# Class folder names, also used as the label strings.
LABELS = ['WithMask', 'WithoutMask']

# Target size every image is resized to before feature extraction.
IMAGE_SIZE = (128,) * 2

# Keyword arguments forwarded to skimage.feature.hog.
HOG_PARAMS = dict(
    orientations=9,
    pixels_per_cell=(6, 6),
    cells_per_block=(3, 3),
    block_norm='L2-Hys',
)

In [4]:
# Extract HOG features for the training and validation splits.
X_train, y_train = load_split('Train')
X_val, y_val = load_split('Validation')

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

Loading Train/WithMask: 100%|██████████| 5000/5000 [02:00<00:00, 41.59it/s]
Loading Train/WithoutMask: 100%|██████████| 5000/5000 [01:39<00:00, 50.18it/s]
Loading Validation/WithMask: 100%|██████████| 400/400 [00:08<00:00, 47.06it/s]
Loading Validation/WithoutMask: 100%|██████████| 400/400 [00:08<00:00, 47.38it/s]


In [5]:
# Maximise validation accuracy. The median pruner only starts pruning after
# 5 completed trials and skips the first reported step of each trial.
# The sampler is seeded so the suggested hyper-parameters do not change from
# run to run; note that Optuna only guarantees reproducible results when the
# trials are also executed sequentially.
study = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=1, interval_steps=1),
    sampler=TPESampler(multivariate=True, seed=42)
)

[I 2025-05-15 14:40:53,494] A new study created in memory with name: no-name-2a97e06c-259e-4473-9e1b-67578ae4728a


In [6]:
# Run the hyper-parameter search: 50 trials, up to 4 running concurrently.
study.optimize(
    func=objective,
    n_trials=50,
    n_jobs=4,
)

# --- Results ---
print("Best validation accuracy:", study.best_value)
print("Best parameters:", study.best_params)

[I 2025-05-15 14:41:32,692] Trial 2 finished with value: 0.98 and parameters: {'max_depth': 19, 'min_samples_split': 14, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 2 with value: 0.98.
[I 2025-05-15 14:47:17,209] Trial 4 finished with value: 0.985 and parameters: {'max_depth': 10, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.985.
[I 2025-05-15 14:47:49,688] Trial 3 finished with value: 0.98125 and parameters: {'max_depth': 47, 'min_samples_split': 12, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.985.
[I 2025-05-15 14:47:55,520] Trial 0 finished with value: 0.98125 and parameters: {'max_depth': 19, 'min_samples_split': 14, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.985.
[I 2025-05-15 14:48:18,512] Trial 1 finished with value: 0.98375 and parameters: {'max_depth': 42, 'min_samples_split': 12, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best 

Best validation accuracy: 0.98875
Best parameters: {'max_depth': 26, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': 'sqrt'}


In [7]:
# The best trial carries its fitted forest in the "model" user attribute.
best_clf = study.best_trial.user_attrs["model"]

# Featurise the held-out split and encode its labels with the encoder that
# was fitted on the training labels.
X_test, y_test = load_split('Test')
y_test_enc = le.transform(y_test)
y_pred = best_clf.predict(X_test)

# Overall accuracy followed by the per-class breakdown.
print("\nTest Accuracy:", accuracy_score(y_true=y_test_enc, y_pred=y_pred))
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_test_enc,
        y_pred=y_pred,
        target_names=le.classes_,
    )
)

Loading Test/WithMask: 100%|██████████| 483/483 [00:11<00:00, 41.61it/s]
Loading Test/WithoutMask: 100%|██████████| 509/509 [00:10<00:00, 47.03it/s]



Test Accuracy: 0.9818548387096774

Classification Report:
              precision    recall  f1-score   support

    WithMask       0.97      0.99      0.98       483
 WithoutMask       0.99      0.97      0.98       509

    accuracy                           0.98       992
   macro avg       0.98      0.98      0.98       992
weighted avg       0.98      0.98      0.98       992



In [8]:
# Persist the classifier together with everything needed to reuse it. The
# model only accepts HOG vectors built with the same image size and HOG
# settings, so those are stored alongside the model and the label encoder.
joblib.dump(
    {
        'model': best_clf,
        'label_encoder': le,
        'image_size': IMAGE_SIZE,
        'hog_params': HOG_PARAMS,
    },
    'hog_rf_face_mask_model_optuna_direct.joblib',
)
print('\nModel và encoder đã được lưu vào hog_rf_face_mask_model_optuna_direct.joblib')


Model và encoder đã được lưu vào hog_rf_face_mask_model_optuna_direct.joblib
