In [1]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from skimage.feature import local_binary_pattern
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import joblib
import optuna
from sklearn.model_selection import cross_val_score
import pandas as pd
from tqdm import tqdm

# Fixed LBP configuration shared by every feature extraction call:
# sampling radius, number of neighbour points, and the skimage LBP variant.
LBP_PARAMS = dict(
    radius=1,
    n_points=8,
    method='default',
)

def extract_lbp_features(image):
    """Return a normalised LBP histogram for ``image``.

    The LBP codes are computed with the fixed settings in ``LBP_PARAMS`` and
    binned into ``n_points + 2`` bins for the ``'uniform'`` method, or
    ``2 ** n_points`` bins for any other method.

    Args:
        image: Image array passed straight to ``local_binary_pattern``.

    Returns:
        1-D float array of bin counts divided by their sum (plus a small
        epsilon, so an all-zero histogram does not divide by zero).
    """
    n_points = LBP_PARAMS['n_points']
    method = LBP_PARAMS['method']

    codes = local_binary_pattern(
        image,
        P=n_points,
        R=LBP_PARAMS['radius'],
        method=method,
    )

    n_bins = n_points + 2 if method == 'uniform' else 2 ** n_points

    counts, _ = np.histogram(codes.ravel(), bins=n_bins, range=(0, n_bins))
    counts = counts.astype('float')
    return counts / (counts.sum() + 1e-6)

def load_split(split_name, img_size=(128, 128), base_dir=None):
    """Load one dataset split and turn every image into an LBP feature vector.

    Images are read from ``<base_dir>/<split_name>/<class_name>/`` for the two
    classes ``'WithMask'`` and ``'WithoutMask'``. Each image is converted to
    grayscale, resized to ``img_size`` and passed to ``extract_lbp_features``.
    Files without a ``.png``/``.jpg``/``.jpeg`` extension are ignored, and
    files that fail to load are reported and skipped.

    Args:
        split_name: Name of the split sub-directory (e.g. ``'Train'``).
        img_size: ``(width, height)`` every image is resized to.
        base_dir: Dataset root directory. When ``None`` (the default) the
            module-level ``BASE_DIR`` is used, which keeps existing callers
            working while allowing the function to be used without that
            global being defined.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the feature vectors,
        ``y`` holds the matching class-name strings.
    """
    root = BASE_DIR if base_dir is None else base_dir
    split_dir = os.path.join(root, split_name)

    X = []
    y = []

    for class_name in ['WithMask', 'WithoutMask']:
        class_dir = os.path.join(split_dir, class_name)
        print(f"Loading {split_name}/{class_name}: ", end='')

        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and therefore downstream training) reproducible.
        for fname in tqdm(sorted(os.listdir(class_dir))):
            if not fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            fpath = os.path.join(class_dir, fname)
            try:
                with Image.open(fpath) as im:
                    gray = im.convert('L').resize(img_size)
                    img = np.array(gray, dtype=np.uint8)

                # Extract the LBP feature vector
                features = extract_lbp_features(img)
            except Exception as e:
                # Deliberate best-effort: one unreadable file must not abort
                # the whole load, so report it and move on.
                print(f"\n⚠️ Skip corrupted file {fpath}: {e}")
                continue

            X.append(features)
            y.append(class_name)

    return np.array(X), np.array(y)

def _suggest_svc_params(trial):
    """Sample one SVC hyper-parameter set from the search space.

    Returns a dict of plain values (str/int/float) that can be passed to
    ``SVC(**params)``. ``probability`` is intentionally not included; the
    callers decide whether to enable it.
    """
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])

    params = {
        'kernel': kernel,
        'C': trial.suggest_categorical('C', [1, 5, 10]),
        'random_state': 42,
    }

    def _parse_gamma(choice):
        # Categorical choices are strings; numeric ones become floats.
        return choice if choice in ('scale', 'auto') else float(choice)

    if kernel == 'linear':
        # The linear kernel ignores gamma; keep a fixed value.
        params['gamma'] = 'scale'
    elif kernel == 'poly':
        params['gamma'] = _parse_gamma(
            trial.suggest_categorical(
                'gamma_poly', ['0.01', '0.1', 'scale', '1', 'auto']
            )
        )
        params['degree'] = trial.suggest_int('degree', 3, 15)
    else:  # rbf
        params['gamma'] = _parse_gamma(
            trial.suggest_categorical(
                'gamma_rbf', ['0.01', '0.1', 'scale', '1', '10', 'auto']
            )
        )

    return params


def objective(trial):
    """Optuna objective: validation accuracy of an SVC with sampled params.

    Trains on the module-level ``X_train``/``y_train`` and scores on
    ``X_val``/``y_val``. Only the resolved hyper-parameters are stored on the
    trial (under the ``"svc_params"`` user attribute); the fitted model is
    discarded so that a long study does not keep every model in memory.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        Accuracy on the validation set.
    """
    params = _suggest_svc_params(trial)
    trial.set_user_attr("svc_params", params)

    # Only predict() is needed to score a trial, so skip the probability
    # calibration here; it is enabled once for the final model instead.
    clf = SVC(probability=False, **params)
    clf.fit(X_train, y_train)

    # Score on the validation set
    return accuracy_score(y_val, clf.predict(X_val))


def optimize_parameters(n_trials=1000):
    """Run the hyper-parameter search, evaluate and save the best SVC.

    After the study finishes, the best trial's hyper-parameters are used to
    fit one final model (with ``probability=True``) on the training set. That
    model is evaluated on the module-level ``X_test``/``y_test`` and dumped to
    ``'best_svc_model.joblib'``.

    Args:
        n_trials: Number of Optuna trials to run (default 1000).
    """
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    print("\nBest validation accuracy:", study.best_value)
    print("Best parameters:", study.best_params)

    # Rebuild the best model from its stored hyper-parameters. random_state is
    # fixed in the params, so the refit is reproducible.
    best_params = study.best_trial.user_attrs["svc_params"]
    best_model = SVC(probability=True, **best_params)
    best_model.fit(X_train, y_train)

    # Evaluate on the test set
    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=['WithMask', 'WithoutMask'])
    print("\nTest set classification report:")
    print(report)

    # Persist the best model
    joblib.dump(best_model, 'best_svc_model.joblib')
    print("\nModel đã được lưu vào 'best_svc_model.joblib'")

if __name__ == '__main__':
    # Dataset root; load_split reads this module-level name.
    BASE_DIR = '/kaggle/input/face-mask-12k-images-dataset/Face Mask Dataset'

    # The three splits are kept as module-level arrays because objective()
    # and optimize_parameters() read them by name.
    print("Loading Train set...")
    X_train, y_train = load_split('Train')

    print("\nLoading Validation set...")
    X_val, y_val = load_split('Validation')

    print("\nLoading Test set...")
    X_test, y_test = load_split('Test')

    # Report how many samples each split produced.
    split_sizes = (
        ("\nTrain", y_train),
        ("Validation", y_val),
        ("Test", y_test),
    )
    for title, split_labels in split_sizes:
        print(f"{title} samples: {len(split_labels)}")

    print("\nOptimizing SVC parameters...")
    optimize_parameters()


Loading Train set...
Loading Train/WithMask: 

100%|██████████| 5000/5000 [01:02<00:00, 80.38it/s]


Loading Train/WithoutMask: 

100%|██████████| 5000/5000 [00:55<00:00, 90.15it/s]



Loading Validation set...
Loading Validation/WithMask: 

100%|██████████| 400/400 [00:05<00:00, 78.61it/s]


Loading Validation/WithoutMask: 

100%|██████████| 400/400 [00:04<00:00, 93.95it/s]



Loading Test set...
Loading Test/WithMask: 

100%|██████████| 483/483 [00:06<00:00, 79.14it/s]


Loading Test/WithoutMask: 

100%|██████████| 509/509 [00:05<00:00, 87.07it/s]
[I 2025-05-17 17:45:09,861] A new study created in memory with name: no-name-01ffee3d-12fa-49db-abc6-f97939ad99e4



Train samples: 10000
Validation samples: 800
Test samples: 992

Optimizing SVC parameters...


[I 2025-05-17 17:47:07,523] Trial 0 finished with value: 0.575 and parameters: {'kernel': 'poly', 'C': 5, 'gamma_poly': '0.01', 'degree': 10}. Best is trial 0 with value: 0.575.
[I 2025-05-17 17:47:56,167] Trial 1 finished with value: 0.83 and parameters: {'kernel': 'linear', 'C': 10}. Best is trial 1 with value: 0.83.
[I 2025-05-17 17:48:55,916] Trial 2 finished with value: 0.80875 and parameters: {'kernel': 'linear', 'C': 1}. Best is trial 1 with value: 0.83.
[I 2025-05-17 17:50:18,145] Trial 3 finished with value: 0.80875 and parameters: {'kernel': 'rbf', 'C': 5, 'gamma_rbf': '0.1'}. Best is trial 1 with value: 0.83.
[I 2025-05-17 17:52:26,382] Trial 4 finished with value: 0.5025 and parameters: {'kernel': 'poly', 'C': 1, 'gamma_poly': '0.01', 'degree': 4}. Best is trial 1 with value: 0.83.
[I 2025-05-17 17:53:00,010] Trial 5 finished with value: 0.94875 and parameters: {'kernel': 'rbf', 'C': 10, 'gamma_rbf': '10'}. Best is trial 5 with value: 0.94875.
[I 2025-05-17 17:53:34,929] Tr


Best validation accuracy: 0.96375
Best parameters: {'kernel': 'poly', 'C': 10, 'gamma_poly': 'scale', 'degree': 5}

Test set classification report:
              precision    recall  f1-score   support

    WithMask       0.98      0.95      0.97       483
 WithoutMask       0.96      0.98      0.97       509

    accuracy                           0.97       992
   macro avg       0.97      0.97      0.97       992
weighted avg       0.97      0.97      0.97       992


Model đã được lưu vào 'best_svc_model.joblib'
