In [1]:
import os
import numpy as np
from PIL import Image
import joblib
import matplotlib.pyplot as plt

from skimage.feature import local_binary_pattern
from skimage import img_as_ubyte

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import optuna
from tqdm import tqdm
from pathlib import Path

import joblib

In [2]:
# ====== Configuration ======
# Dataset root; load_split reads images from DATA_DIR / <split> / <label>.
DATA_DIR = Path('/kaggle/input/face-mask-12k-images-dataset/Face Mask Dataset')
# Names of the split sub-folders and of the class sub-folders inside each split.
SPLITS = ['Train', 'Validation', 'Test']
LABELS = ['WithMask', 'WithoutMask']
# NOTE(review): IMAGE_SIZE is not referenced by the visible cells — confirm whether
# images were meant to be resized before feature extraction.
IMAGE_SIZE = (128, 128)

# LBP settings: the number of sampling points is derived from the radius.
LBP_RADIUS, LBP_METHOD = 1, 'uniform'
LBP_POINTS = LBP_RADIUS * 8

In [3]:
# === LBP histogram bin count ===
def get_lbp_bins(method, n_points=None):
    """Return how many histogram bins are needed to cover every LBP code.

    The result is used both as the bin count and as the upper bound of the
    histogram range, so it must be strictly greater than the largest code the
    chosen method can emit.

    Args:
        method: LBP method name ('default', 'ror', 'uniform' or 'nri_uniform').
        n_points: number of circular sampling points; defaults to the
            notebook-level LBP_POINTS when omitted.

    Returns:
        int: number of bins for the LBP histogram.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    if n_points is None:
        n_points = LBP_POINTS

    # 'ror' picks the minimum over all bit rotations of the pattern, so its
    # codes still span 0 .. 2**P - 1 (e.g. the all-ones pattern stays 2**P - 1).
    # It therefore needs the same range as 'default'; a smaller range would
    # silently drop most codes from the histogram.
    if method in ('default', 'ror'):
        return 2 ** n_points
    if method == 'uniform':
        return n_points + 2
    if method == 'nri_uniform':
        return n_points * (n_points - 1) + 3
    raise ValueError(f"Unsupported LBP method: {method}")

In [4]:
# === LBP feature extraction ===
def extract_lbp_features(image, method):
    """Compute the LBP code map of `image` and its normalised histogram.

    Args:
        image: array passed unchanged to `local_binary_pattern`
            (callers in this notebook pass a grayscale image array).
        method: LBP method name, forwarded to `local_binary_pattern` and
            used to pick the histogram bin count.

    Returns:
        tuple: (hist, lbp) where `hist` is the float histogram of LBP codes
        scaled to sum to (approximately) one, and `lbp` is the raw code map.
    """
    code_map = local_binary_pattern(image, LBP_POINTS, LBP_RADIUS, method)
    bin_count = get_lbp_bins(method)
    counts = np.histogram(code_map.ravel(), bins=bin_count, range=(0, bin_count))[0]
    counts = counts.astype("float")
    # The small epsilon keeps the division defined when the histogram is empty.
    normalised = counts / (counts.sum() + 1e-7)
    return normalised, code_map

In [5]:
def load_split(split: str, method='default'):
    """Load one dataset split and turn every image into an LBP histogram.

    Images are read from DATA_DIR / split / <label> for each label in LABELS.
    Files that cannot be read or processed are reported and skipped.

    Args:
        split: name of the split sub-folder (e.g. 'Train').
        method: LBP method forwarded to `extract_lbp_features`.

    Returns:
        tuple: (X, y) where X stacks one histogram row per loaded image and
        y is an array with the matching label string for each row.

    Raises:
        ValueError: if not a single image of the split could be loaded.
    """
    X, y = [], []
    for label in LABELS:
        folder = DATA_DIR / split / label
        # glob() yields entries in filesystem order; sorting fixes the row
        # order so that downstream training is repeatable between runs.
        img_paths = sorted(folder.glob('*.*'))
        for img_path in tqdm(img_paths, desc=f'Loading {split}/{label}'):
            try:
                # The context manager closes the underlying file promptly.
                with Image.open(img_path) as image:
                    image_np = np.array(image.convert('L'))  # convert to grayscale
                hist, _ = extract_lbp_features(image_np, method=method)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f'-- error reading {img_path}: {e}')
                continue
            X.append(hist)
            y.append(label)

    if not X:
        # np.vstack([]) would fail with an unhelpful message; name the folder.
        raise ValueError(f'No images could be loaded from {DATA_DIR / split}')
    return np.vstack(X), np.array(y)


def objective(trial):
    """Optuna objective: fit a random forest and score it on the validation set.

    Reads the notebook-level X_train / y_train_enc for fitting and
    X_val / y_val_enc for scoring.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.

    Returns:
        float: validation accuracy of the fitted classifier.
    """
    # Random-forest search space (suggestions are requested in a fixed order).
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train_enc)
    val_accuracy = accuracy_score(y_val_enc, forest.predict(X_val))

    # Attach the fitted model to the trial under the "model" user attribute.
    # NOTE(review): every trial keeps its own fitted forest alive this way.
    trial.set_user_attr("model", forest)
    return val_accuracy

In [6]:
# Extract LBP histograms for the training and validation splits.
X_train, y_train = load_split(split='Train', method=LBP_METHOD)
X_val, y_val = load_split(split='Validation', method=LBP_METHOD)

# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc, y_val_enc = le.transform(y_train), le.transform(y_val)

Loading Train/WithMask: 100%|██████████| 5000/5000 [01:13<00:00, 67.77it/s]
Loading Train/WithoutMask: 100%|██████████| 5000/5000 [00:34<00:00, 144.22it/s]
Loading Validation/WithMask: 100%|██████████| 400/400 [00:05<00:00, 71.90it/s]
Loading Validation/WithoutMask: 100%|██████████| 400/400 [00:02<00:00, 141.01it/s]


In [7]:
# Seed the sampler explicitly: an unseeded sampler makes every
# Restart & Run All explore a different set of hyper-parameters, so the
# reported best score and parameters could not be reproduced.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best validation accuracy:", study.best_value)
print("Best parameters:", study.best_params)

[I 2025-05-16 09:52:34,883] A new study created in memory with name: no-name-7b649c56-1085-4adc-a00e-ce8c49b0ef3b
[I 2025-05-16 09:52:37,697] Trial 0 finished with value: 0.95375 and parameters: {'n_estimators': 262, 'max_depth': 11, 'min_samples_split': 18, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.95375.
[I 2025-05-16 09:52:41,253] Trial 1 finished with value: 0.96125 and parameters: {'n_estimators': 256, 'max_depth': 33, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.96125.
[I 2025-05-16 09:52:42,469] Trial 2 finished with value: 0.95 and parameters: {'n_estimators': 104, 'max_depth': 49, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 1 with value: 0.96125.
[I 2025-05-16 09:52:44,946] Trial 3 finished with value: 0.95125 and parameters: {'n_estimators': 216, 'max_depth': 25, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best i

Best validation accuracy: 0.965
Best parameters: {'n_estimators': 137, 'max_depth': 25, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}


In [8]:
# Retrieve the fitted classifier stored on the best trial.
best_clf = study.best_trial.user_attrs["model"]

# Build test features and encode labels with the encoder fitted on training data.
X_test, y_test = load_split(split='Test', method=LBP_METHOD)
y_test_enc = le.transform(y_test)
y_pred = best_clf.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test_enc, y_pred))
print(
    "\nClassification Report:",
    classification_report(y_test_enc, y_pred, target_names=le.classes_),
    sep="\n",
)

Loading Test/WithMask: 100%|██████████| 483/483 [00:06<00:00, 69.18it/s]
Loading Test/WithoutMask: 100%|██████████| 509/509 [00:03<00:00, 130.76it/s]



Test Accuracy: 0.96875

Classification Report:
              precision    recall  f1-score   support

    WithMask       0.99      0.94      0.97       483
 WithoutMask       0.95      0.99      0.97       509

    accuracy                           0.97       992
   macro avg       0.97      0.97      0.97       992
weighted avg       0.97      0.97      0.97       992



In [9]:
# ====== Save model and encoder ======
# Build the file name once so the path written to disk and the path reported
# to the user cannot drift apart (the message previously named a different file).
MODEL_PATH = f'lbp_rf_face_mask_model_optuna_{LBP_METHOD}.joblib'
joblib.dump({'model': best_clf, 'label_encoder': le}, MODEL_PATH)
print(f'\nModel và encoder đã được lưu vào {MODEL_PATH}')


Model và encoder đã được lưu vào lbp_rf_face_mask_model_optuna.joblib
