In [1]:
!pip install optuna



In [2]:
# -*- coding: utf-8 -*-
import os
from pathlib import Path
import numpy as np
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.transform import resize
from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from joblib import Parallel, delayed
from tqdm import tqdm

# --- Configuration ---
# Root of the dataset; the loader expects <split>/<label>/ sub-folders below it.
DATA_DIR = Path('/kaggle/input/face-mask-12k-images-dataset/Face Mask Dataset')
# Class folder names, also used as the string labels.
LABELS = ['WithMask', 'WithoutMask']
# Every image is resized to this shape before HOG extraction.
IMAGE_SIZE = (128, 128)
# Keyword arguments forwarded verbatim to skimage.feature.hog.
HOG_PARAMS = dict(
    orientations=9,
    pixels_per_cell=(6, 6),
    cells_per_block=(3, 3),
    block_norm='L2-Hys',
)
# Directory that holds the cached feature matrices (created on first run).
CACHE_DIR = Path('cache')
CACHE_DIR.mkdir(exist_ok=True)

def extract_hog(path):
    """Read one image and return its HOG descriptor.

    The image is reduced to a single grayscale channel, resized to
    ``IMAGE_SIZE`` and passed to ``skimage.feature.hog`` with ``HOG_PARAMS``.

    Parameters
    ----------
    path : str or pathlib.Path
        Image file to read.

    Returns
    -------
    numpy.ndarray
        The feature array produced by ``hog`` for the resized image.
    """
    img = imread(path)
    if img.ndim == 3:
        if img.shape[-1] in (2, 4):
            # rgb2gray only accepts 3-channel input, so strip the trailing
            # alpha channel of RGBA / gray+alpha images before converting.
            img = img[..., :-1]
        # A single remaining channel is already grayscale.
        gray = img[..., 0] if img.shape[-1] == 1 else rgb2gray(img)
    else:
        gray = img
    gray = resize(gray, IMAGE_SIZE, anti_aliasing=True)
    return hog(gray, **HOG_PARAMS)

def load_features(split: str, n_jobs: int = 8):
    """Return HOG features and string labels for one dataset split.

    Features are computed once and cached in ``CACHE_DIR``. The cache file
    name includes a short hash of ``IMAGE_SIZE`` and ``HOG_PARAMS`` so that
    features computed with a different configuration are never reused.

    Parameters
    ----------
    split : str
        Sub-folder of ``DATA_DIR`` to read (e.g. ``'Train'``).
    n_jobs : int, default 8
        Number of joblib workers used for feature extraction.

    Returns
    -------
    X : numpy.ndarray
        One row of HOG features per image.
    y : numpy.ndarray
        The label (an entry of ``LABELS``) for each row of ``X``.

    Raises
    ------
    FileNotFoundError
        If no image file is found for the requested split.
    """
    import hashlib

    # Key the cache on the feature configuration to avoid stale features.
    config_repr = repr((IMAGE_SIZE, sorted(HOG_PARAMS.items())))
    config_tag = hashlib.sha256(config_repr.encode('utf-8')).hexdigest()[:8]
    cache_file = CACHE_DIR / f'{split}_hog_{config_tag}.npz'
    if cache_file.exists():
        # The context manager closes the underlying zip file handle.
        with np.load(cache_file) as data:
            return data['X'], data['y']

    img_paths, labels = [], []
    for lbl in LABELS:
        folder = DATA_DIR / split / lbl
        # glob order is filesystem-dependent; sort so row order is stable.
        for p in sorted(folder.glob('*.*')):
            if p.is_file():
                img_paths.append(p)
                labels.append(lbl)

    if not img_paths:
        # Fail with a clear message instead of an opaque np.vstack error.
        raise FileNotFoundError(f'No images found under {DATA_DIR / split}')

    # parallel extract
    X = Parallel(n_jobs=n_jobs)(
        delayed(extract_hog)(p) for p in tqdm(img_paths, desc=f'HOG {split}')
    )
    X = np.vstack(X)
    y = np.array(labels)
    np.savez_compressed(cache_file, X=X, y=y)
    return X, y

# --- Load / cache features ---
# Splits are loaded in the order Train, Validation, Test.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_features(split_name) for split_name in ('Train', 'Validation', 'Test')
)

# --- Encode labels once ---
# The encoder is fitted on the training labels only and reused for the
# validation and test labels.
le = LabelEncoder()
le.fit(y_train)
y_train_enc, y_val_enc, y_test_enc = (
    le.transform(split_labels) for split_labels in (y_train, y_val, y_test)
)

# --- Optuna objective: returns accuracy only ---
def objective(trial):
    """Train an SVC with trial-suggested hyperparameters and score it.

    The classifier is fitted on the training features and the returned value
    is its accuracy on the validation features.
    """
    c_value = trial.suggest_float('C', 1e-2, 1e2, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    # probability is disabled to speed up training
    model = SVC(
        C=c_value,
        kernel=kernel,
        gamma=gamma,
        random_state=42,
        probability=False,
    )
    model.fit(X_train, y_train_enc)
    return accuracy_score(y_val_enc, model.predict(X_val))

# --- Hyperparameter search (maximise validation accuracy) ---
# NOTE: objective() never reports intermediate values, so the MedianPruner
# has nothing to prune on; it is kept only to preserve the original setup.
study = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=1),
    # Seed the sampler so the suggested hyperparameters are repeatable,
    # matching the fixed random_state used for the classifiers. With
    # n_jobs > 1 trials may still complete in a different order between runs.
    sampler=TPESampler(multivariate=True, seed=42),
)
study.optimize(objective, n_trials=30, n_jobs=4)

print("→ Best validation accuracy:", study.best_value)
print("→ Best params:", study.best_params)

# --- Refit on Train+Validation ---
X_combined = np.vstack([X_train, X_val])
y_combined = np.concatenate([y_train_enc, y_val_enc])

best_clf = SVC(**study.best_params, probability=False, random_state=42)
best_clf.fit(X_combined, y_combined)

# --- Evaluate on Test ---
y_pred_test = best_clf.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test_enc, y_pred_test))
print(classification_report(y_test_enc, y_pred_test, target_names=le.classes_))

# --- Save model and encoder ---
joblib.dump({'model': best_clf, 'label_encoder': le},
            'hog_svm_optuna_fast.joblib')
print("\nSaved to hog_svm_optuna_fast.joblib")


HOG Train: 100%|██████████| 10000/10000 [01:15<00:00, 132.05it/s]
HOG Validation: 100%|██████████| 800/800 [00:05<00:00, 153.28it/s]
HOG Test: 100%|██████████| 992/992 [00:07<00:00, 140.18it/s]
[I 2025-05-18 13:52:09,325] A new study created in memory with name: no-name-1e85b0eb-56cb-4571-86e5-d3e27c650d06
[I 2025-05-18 14:27:31,370] Trial 0 finished with value: 0.99 and parameters: {'C': 24.955124154853273, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.99.
[I 2025-05-18 14:52:37,543] Trial 1 finished with value: 0.9875 and parameters: {'C': 0.3271419437166026, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.99.
[I 2025-05-18 15:22:09,316] Trial 5 finished with value: 0.99 and parameters: {'C': 21.747270588252817, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 0 with value: 0.99.
[I 2025-05-18 15:25:10,569] Trial 4 finished with value: 0.99 and parameters: {'C': 2.0738916172193034, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value

→ Best validation accuracy: 0.99
→ Best params: {'C': 24.955124154853273, 'kernel': 'rbf', 'gamma': 'auto'}

Test Accuracy: 0.9848790322580645
              precision    recall  f1-score   support

    WithMask       0.97      1.00      0.98       483
 WithoutMask       1.00      0.97      0.99       509

    accuracy                           0.98       992
   macro avg       0.98      0.99      0.98       992
weighted avg       0.99      0.98      0.98       992


Saved to hog_svm_optuna_fast.joblib
