In [None]:
import cv2
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
def preprocess_image(path, target_size=(128, 128)):
    """Load an image file and return it as a normalized grayscale array.

    Args:
        path: Location of the image file; converted with ``str()`` before
            being handed to OpenCV.
        target_size: Output size forwarded to ``cv2.resize`` as ``dsize``,
            i.e. ``(width, height)``.

    Returns:
        A ``float32`` array holding the resized grayscale pixels divided by
        255. Per OpenCV's ``dsize`` convention its shape is
        ``(height, width)``.

    Raises:
        ValueError: If OpenCV cannot decode the file (missing, unreadable or
            not an image).
    """
    # 1. Read the image. cv2.imread signals failure by returning None rather
    #    than raising, so check explicitly to get an error naming the file.
    img = cv2.imread(str(path))
    if img is None:
        raise ValueError(f"Cannot read image (missing or not decodable): {path}")
    # 2. Convert from OpenCV's BGR channel order to single-channel grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # 3. Resize to the requested (width, height)
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
    # 4. Normalize pixel values by 255 and store them as float32
    img = img.astype('float32') / 255.0
    return img

In [None]:
# Root folder of the raw LFW images; the name of each image's parent folder is
# later used as its class label.
data_root = Path('../data/raw/lfw_funneled/')
# Sort the recursive glob: directory iteration order depends on the filesystem,
# and a seeded split is only reproducible when its input order is stable.
data_path = sorted(data_root.rglob('*.jpg'))

In [None]:
def split_data(data_path: list[Path], random_state: int = 42):
    """Split image paths into train / validation / test subsets (80 / 10 / 10).

    Args:
        data_path: All image paths to distribute.
        random_state: Seed forwarded to both ``train_test_split`` calls so the
            partition is repeatable.

    Returns:
        A ``(train_path, val_path, test_path)`` tuple of path lists.
    """
    train_path, holdout_path = train_test_split(
        data_path, train_size=0.8, test_size=0.2, random_state=random_state
    )
    # Fractions are relative to the collection being split, not to the full
    # dataset: halving the 20% hold-out gives 10% validation and 10% test.
    # (Using 0.1 / 0.1 here would keep only a fifth of the hold-out.)
    val_path, test_path = train_test_split(
        holdout_path, train_size=0.5, test_size=0.5, random_state=random_state
    )

    return train_path, val_path, test_path

In [None]:
# Carve the discovered image paths into the three subsets used downstream.
train_paths, val_paths, test_paths = split_data(data_path)
# Sanity output: total number of images, then the type of one split element.
print(len(data_path), type(train_paths[0]), sep="\n")

In [None]:
def prepare_data_list(image_paths: list[Path], target_size = (128, 128), label_to_idx=None):
    """Preprocess images into arrays and encode their folder names as labels.

    The label of an image is the name of its parent directory.

    Args:
        image_paths: Image files to load.
        target_size: Size forwarded to ``preprocess_image``.
        label_to_idx: Optional ``{label name: integer index}`` mapping to use.
            When omitted, a mapping is derived from the sorted unique labels
            found in ``image_paths``. Pass the same mapping for every split
            so that label indices agree across splits.

    Returns:
        ``(X, Y, label_to_idx)`` where ``X`` stacks the preprocessed images
        with a trailing channel axis of size 1, ``Y`` holds the integer label
        of each image in ``X``, and ``label_to_idx`` is the mapping used.
        Images that fail to load, or whose label is missing from the mapping,
        are reported and skipped.
    """
    if label_to_idx is None:
        unique_labels = sorted({p.parent.name for p in image_paths})
        label_to_idx = {name: i for i, name in enumerate(unique_labels)}

    print(f"Đang xử lý {len(image_paths)} ảnh từ {len(label_to_idx)} lớp...")

    X = []
    Y = []
    # Loop for reading and preprocessing images
    for p in tqdm(image_paths, desc="Đang xử lý ảnh", unit="Ảnh"):
        try:
            # Resolve the label before touching X so that a failure can never
            # leave X and Y with different lengths.
            label = label_to_idx[p.parent.name]
            img = preprocess_image(str(p), target_size)

            # Append a channel axis: (H, W) -> (H, W, 1). This does not assume
            # the image's row/column order matches target_size, which a
            # reshape to (target_size[0], target_size[1], 1) would.
            X.append(img[..., np.newaxis])
            Y.append(label)
        except Exception as e:
            # Best effort: report the problematic file and continue.
            print(f"Lỗi tại ảnh {p}: {e}")
    return np.array(X), np.array(Y), label_to_idx



In [None]:
# prepare_data_list numbers the labels it finds in the paths it is given, so
# without correction the same integer would denote different people in the
# train, validation and test sets. Build one mapping over the whole dataset
# and re-express every split's labels in it.
mapping = {
    name: idx
    for idx, name in enumerate(sorted({p.parent.name for p in data_path}))
}


def _to_global_labels(y_local, local_mapping, global_mapping):
    """Translate indices of `local_mapping` into indices of `global_mapping`."""
    idx_to_name = {idx: name for name, idx in local_mapping.items()}
    return np.array(
        [global_mapping[idx_to_name[int(i)]] for i in y_local], dtype=np.int64
    )


x_train, y_train, train_mapping = prepare_data_list(train_paths)
x_val, y_val, val_mapping = prepare_data_list(val_paths)
x_test, y_test, test_mapping = prepare_data_list(test_paths)

y_train = _to_global_labels(y_train, train_mapping, mapping)
y_val = _to_global_labels(y_val, val_mapping, mapping)
y_test = _to_global_labels(y_test, test_mapping, mapping)

In [None]:
# Folder that receives the preprocessed splits; create it (and any missing
# parent folders) up front, tolerating an already existing directory.
save_dir = Path('../data/processed/')
save_dir.mkdir(exist_ok=True, parents=True)

def save_split_data(name, save_dir, x, y, mapping=None):
    """Write one dataset split to ``<save_dir>/<name>.npz`` (compressed).

    Args:
        name: Split name, used as the file stem.
        save_dir: Destination directory (``str`` or ``Path``); created if it
            does not exist yet.
        x: Array stored under the key ``'X'``.
        y: Array stored under the key ``'Y'``.
        mapping: Optional object stored under the key ``'mapping'``; omitted
            from the archive when ``None``.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    file_path = target_dir / f'{name}.npz'

    data_to_save = {'X': x, 'Y': y}
    if mapping is not None:
        # A dict is written as a pickled 0-d object array: readers need
        # np.load(..., allow_pickle=True) and `.item()` to get it back.
        data_to_save['mapping'] = mapping

    np.savez_compressed(file_path, **data_to_save)

# Persist every split; only the training archive carries the label mapping.
for _split_name, _features, _labels, _label_map in (
    ("train", x_train, y_train, mapping),
    ("test", x_test, y_test, None),
    ("val", x_val, y_val, None),
):
    save_split_data(_split_name, save_dir, _features, _labels, _label_map)

In [None]:
# Peek at the array names stored in the test archive. Only the names are read
# here, so pickle support is not requested, and the context manager closes the
# file handle that a bare np.load() would leave open.
with np.load('../data/processed/test.npz') as test_load:
    print(test_load.files)

In [None]:
# Reload the persisted splits from disk.
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code. It is needed here only because 'mapping' is a dict stored as an
# object array; use it solely on archives written by this notebook.
with np.load('../data/processed/train.npz', allow_pickle=True) as data:
    x_train = data['X']
    y_train = data['Y']
    # .item() unwraps the 0-d object array back into the original dict.
    mapping = data['mapping'].item()
# The test archive is saved without a mapping and holds only the numeric X / Y
# arrays, so pickle stays disabled.
with np.load('../data/processed/test.npz') as data:
    x_test = data['X']
    y_test = data['Y']

In [None]:
# Display the training label array that was read back from train.npz.
y_train

In [None]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

def train_svm_with_pca(X_train, Y_train, X_test, Y_test, n_components=150):
    """Fit PCA followed by a linear SVM and print train / test accuracy.

    Args:
        X_train: Training samples; each sample is flattened to one feature
            vector, so any per-sample shape is accepted.
        Y_train: Labels for ``X_train``.
        X_test: Test samples, flattened the same way.
        Y_test: Labels for ``X_test``.
        n_components: Number of principal components to keep. An integer
            larger than ``min(n_samples, n_features)`` of the training data is
            reduced to that bound, with a printed notice.

    Returns:
        ``(model, pca)``: the fitted ``SVC`` and the fitted ``PCA``.
    """
    # perf_counter is monotonic, so the measured duration is unaffected by
    # system clock adjustments.
    start_time = time.perf_counter()

    # 1. Flatten every sample into a single feature vector
    X_train_flat = X_train.reshape(len(X_train), -1)
    X_test_flat = X_test.reshape(len(X_test), -1)

    # 2. PCA on train only. PCA cannot keep more components than
    #    min(n_samples, n_features), so clamp an oversized integer request
    #    rather than failing on small datasets.
    max_components = min(X_train_flat.shape)
    if isinstance(n_components, (int, np.integer)) and n_components > max_components:
        print(
            f"n_components={n_components} exceeds min(n_samples, n_features)="
            f"{max_components}; using {max_components}"
        )
        n_components = max_components
    pca = PCA(n_components=n_components, whiten=True, random_state=42)
    pca.fit(X_train_flat)

    # 3. Project both sets with the components learned from the training data
    X_train_pca = pca.transform(X_train_flat)
    X_test_pca = pca.transform(X_test_flat)

    # 4. Train the SVM (gamma is not passed: a linear kernel does not use it)
    model = SVC(kernel='linear', C=1.0, class_weight='balanced')
    model.fit(X_train_pca, Y_train)

    # 5. Evaluate; the reported time covers flattening, PCA and SVM fitting
    end_time = time.perf_counter()

    print(f"Training time: {end_time - start_time:.2f} second")
    # Predictions on both sets
    y_train_pred = model.predict(X_train_pca)
    y_test_pred = model.predict(X_test_pca)

    train_acc = accuracy_score(Y_train, y_train_pred)
    test_acc = accuracy_score(Y_test, y_test_pred)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")

    return model, pca

In [None]:
# Train the PCA + SVM pipeline on the reloaded splits, keeping both fitted
# objects for later use.
model, pca = train_svm_with_pca(
    X_train=x_train, Y_train=y_train, X_test=x_test, Y_test=y_test
)