In [1]:
import os
from pathlib import Path

import numpy as np
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.transform import resize
from skimage.feature import hog

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import joblib

In [2]:
def extract_hog_features(path: Path) -> np.ndarray:
    """Read one image file and return its HOG descriptor.

    The image is reduced to a single grayscale channel, resized to the
    module-level ``IMAGE_SIZE`` and passed to ``skimage.feature.hog`` with the
    module-level ``HOG_PARAMS``.

    Parameters
    ----------
    path : Path
        Location of the image file to read.

    Returns
    -------
    np.ndarray
        Whatever ``hog(gray, **HOG_PARAMS)`` returns (the flattened descriptor
        as long as ``HOG_PARAMS`` does not override ``feature_vector`` or
        ``visualize``).

    Raises
    ------
    Exception
        Any error raised by ``imread``, ``rgb2gray``, ``resize`` or ``hog``
        for unreadable or unsupported files is propagated to the caller.
    """
    img = imread(path)

    # Normalise the channel layout before the grayscale conversion: rgb2gray
    # expects exactly three colour channels, so images carrying an alpha
    # channel (RGBA or gray+alpha) would otherwise fail and be skipped upstream.
    if img.ndim == 3:
        n_channels = img.shape[-1]
        if n_channels == 4:
            # RGBA: alpha is discarded (not composited) and the RGB planes kept.
            img = img[..., :3]
        elif n_channels in (1, 2):
            # Single gray plane, optionally followed by alpha: keep the gray plane.
            img = img[..., 0]

    gray = rgb2gray(img) if img.ndim == 3 else img
    gray = resize(gray, IMAGE_SIZE, anti_aliasing=True)
    return hog(gray, **HOG_PARAMS)

def load_split(split: str):
    """Load every image of one dataset split and build its feature matrix.

    For each class name in ``LABELS`` the folder ``DATA_DIR / split / label``
    is scanned for files matching ``*.*``; each file is turned into a HOG
    vector with ``extract_hog_features``. Files that fail to load are reported
    on stdout and skipped (best effort).

    Parameters
    ----------
    split : str
        Name of the split sub-folder under ``DATA_DIR``.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` stacks one feature vector per row, ``y`` is an array
        holding the matching label string for each row.

    Raises
    ------
    ValueError
        If not a single image of the split could be loaded (for example the
        folder is missing or empty).
    """
    X, y = [], []
    for label in LABELS:
        folder = DATA_DIR / split / label
        # glob() yields paths in an arbitrary, filesystem-dependent order;
        # sorting makes the row order (and anything seeded downstream) repeatable.
        img_paths = sorted(folder.glob('*.*'))
        for img_path in tqdm(img_paths, desc=f'Loading {split}/{label}'):
            try:
                X.append(extract_hog_features(img_path))
                y.append(label)
            except Exception as e:
                # Deliberate best effort: one bad file must not abort the load.
                print(f'-- error reading {img_path}: {e}')

    if not X:
        # np.vstack([]) would fail with an unrelated-looking message; say what happened.
        raise ValueError(
            f'No images could be loaded for split {split!r} under {DATA_DIR}'
        )
    return np.vstack(X), np.array(y)

def objective(trial):
    """Optuna objective: validation accuracy of a step-wise grown random forest.

    Hyperparameters are sampled from ``trial``; the forest is then fitted on the
    module-level ``X_train`` / ``y_train_enc`` with 50, 100, 150 and 200 trees
    in turn. After every stage the accuracy on ``X_val`` / ``y_val_enc`` is
    reported to the trial so the study's pruner can stop it early.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample parameters and to report intermediate values.

    Returns
    -------
    float
        Validation accuracy measured after the last stage (200 trees).

    Raises
    ------
    optuna.TrialPruned
        When ``trial.should_prune()`` is true after an intermediate report.
    """
    # Search space. warm_start is enabled so each later fit() extends the
    # ensemble that was already built instead of starting over.
    rf_kwargs = dict(
        n_estimators=10,  # placeholder; overridden at every stage below
        max_depth=trial.suggest_int('max_depth', 5, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        # None is left out of the choices to keep training time down.
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        random_state=42,
        warm_start=True,
        n_jobs=1,
    )
    forest = RandomForestClassifier(**rf_kwargs)

    # Incremental training with a pruning check after each stage.
    val_acc = None
    for n_trees in (50, 100, 150, 200):
        forest.set_params(n_estimators=n_trees)
        forest.fit(X_train, y_train_enc)
        val_pred = forest.predict(X_val)
        val_acc = accuracy_score(y_val_enc, val_pred)
        trial.report(val_acc, n_trees)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Keep the fully trained model on the trial so it can be fetched later.
    trial.set_user_attr("model", forest)
    return val_acc

In [3]:
# Root folder of the dataset; split and class sub-folders are appended to it.
DATA_DIR = Path('/kaggle/input/face-mask-12k-images-dataset/Face Mask Dataset')

# Class sub-folder names, also used as the label strings.
LABELS = ['WithMask', 'WithoutMask']

# Target size every image is resized to before HOG extraction.
IMAGE_SIZE = (128, 128)

# Keyword arguments forwarded unchanged to skimage.feature.hog.
HOG_PARAMS = dict(
    orientations=9,
    pixels_per_cell=(8, 8),
    cells_per_block=(2, 2),
    block_norm='L2-Hys',
)

In [4]:
# Extract HOG features for the training and validation splits.
X_train, y_train = load_split('Train')
X_val, y_val = load_split('Validation')

# Fit the label encoder on the training labels only, then map the string
# labels of both splits to integer codes with that same encoder.
le = LabelEncoder().fit(y_train)
y_train_enc, y_val_enc = le.transform(y_train), le.transform(y_val)

Loading Train/WithMask: 100%|██████████| 5000/5000 [01:45<00:00, 47.49it/s]
Loading Train/WithoutMask: 100%|██████████| 5000/5000 [01:32<00:00, 54.26it/s]
Loading Validation/WithMask: 100%|██████████| 400/400 [00:07<00:00, 50.27it/s]
Loading Validation/WithoutMask: 100%|██████████| 400/400 [00:07<00:00, 53.30it/s]


In [5]:
# Maximise the value returned by the objective (validation accuracy).
# - MedianPruner: no pruning during the first 5 trials; afterwards a trial is
#   checked at every reported step.
# - TPESampler: seeded so the sequence of suggested hyperparameters is
#   repeatable. Running trials in parallel (n_jobs > 1) can still change the
#   order in which results reach the sampler, so exact reproduction is only
#   expected for sequential runs.
study = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=1, interval_steps=1),
    sampler=TPESampler(multivariate=True, seed=42)
)

[I 2025-05-15 14:39:56,882] A new study created in memory with name: no-name-f929dfed-471c-4376-87f0-903224c5b9da


In [6]:
# Run 50 trials of `objective`, with up to 4 trials executing concurrently (n_jobs=4).
study.optimize(objective, n_trials=50, n_jobs=4)

# --- Results ---
# Best objective value found by the study and the sampled parameters that produced it.
print("Best validation accuracy:", study.best_value)
print("Best parameters:", study.best_params)

[I 2025-05-15 14:40:23,137] Trial 1 finished with value: 0.975 and parameters: {'max_depth': 9, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 1 with value: 0.975.
[I 2025-05-15 14:40:29,377] Trial 0 finished with value: 0.98 and parameters: {'max_depth': 15, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 0 with value: 0.98.
[I 2025-05-15 14:40:29,633] Trial 3 finished with value: 0.97625 and parameters: {'max_depth': 29, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 0.98.
[I 2025-05-15 14:41:08,139] Trial 6 finished with value: 0.97625 and parameters: {'max_depth': 40, 'min_samples_split': 3, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 0.98.
[I 2025-05-15 14:41:40,875] Trial 7 finished with value: 0.97625 and parameters: {'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial

Best validation accuracy: 0.9875
Best parameters: {'max_depth': 13, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt'}


In [7]:
# Fetch the fitted classifier that objective() attached to the best trial
# under the "model" user attribute.
best_clf = study.best_trial.user_attrs["model"]

# Featurise the 'Test' split and encode its labels with the encoder that was
# fitted on the training labels, then predict with the selected model.
X_test, y_test = load_split('Test')
y_test_enc     = le.transform(y_test)
y_pred         = best_clf.predict(X_test)

# Overall accuracy plus per-class precision/recall/F1, with the original
# label strings shown as class names.
print("\nTest Accuracy:", accuracy_score(y_test_enc, y_pred))
print("\nClassification Report:")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

Loading Test/WithMask: 100%|██████████| 483/483 [00:09<00:00, 51.15it/s]
Loading Test/WithoutMask: 100%|██████████| 509/509 [00:08<00:00, 59.26it/s]



Test Accuracy: 0.9858870967741935

Classification Report:
              precision    recall  f1-score   support

    WithMask       0.98      1.00      0.99       483
 WithoutMask       1.00      0.98      0.99       509

    accuracy                           0.99       992
   macro avg       0.99      0.99      0.99       992
weighted avg       0.99      0.99      0.99       992



In [8]:
# Persist the selected model together with its label encoder in one file, so
# predicted integer codes can be mapped back to label strings after loading.
# NOTE(review): joblib files are pickle-based — only load this artifact from a
# trusted source.
joblib.dump({'model': best_clf, 'label_encoder': le},
            'hog_rf_face_mask_model_optuna_direct.joblib')
# Confirmation message (Vietnamese): the model and encoder were saved to the file above.
print('\nModel và encoder đã được lưu vào hog_rf_face_mask_model_optuna_direct.joblib')


Model và encoder đã được lưu vào hog_rf_face_mask_model_optuna_direct.joblib
