In [20]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

def extract_sift_features(image):
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(gray_image, None)
    return descriptors

def create_visual_vocabulary(descriptor_list, num_clusters):
    all_descriptors = np.vstack(descriptor_list)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(all_descriptors)
    visual_vocabulary = kmeans.cluster_centers_
    return kmeans, visual_vocabulary

def compute_histogram(descriptors, kmeans, num_clusters):
    histogram = np.zeros(num_clusters)
    cluster_assignments = kmeans.predict(descriptors)
    for cluster in cluster_assignments:
        histogram[cluster] += 1
    histogram /= np.sum(histogram)  
    return histogram

def train_with_split(feature_histograms, labels, test_size=0.5):
    X_train, X_test, y_train, y_test = train_test_split(feature_histograms, labels, test_size=test_size, random_state=42)
    classifier = svm.SVC(kernel='linear')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("50% train / 50% test ACC:", accuracy)
    return classifier, accuracy

def main(images, labels, num_clusters):
    descriptor_list = []
    valid_labels = []  
    for i, image in enumerate(images):
        descriptors = extract_sift_features(image)
        if descriptors is not None and len(descriptors) > 0:  
            descriptor_list.append(descriptors)
            valid_labels.append(labels[i])

    kmeans, visual_vocabulary = create_visual_vocabulary(descriptor_list, num_clusters)
    feature_histograms = np.array([compute_histogram(descriptors, kmeans, num_clusters) for descriptors in descriptor_list])
    
    classifier, accuracy = train_with_split(feature_histograms, valid_labels, test_size=0.5)
    return classifier

def load_images_from_root_folder(root_folder_path, target_size=(224, 224)):
    images = []
    labels = []
    for domain_folder in os.listdir(root_folder_path):
        domain_path = os.path.join(root_folder_path, domain_folder)
        if os.path.isdir(domain_path):  
            for category_folder in os.listdir(domain_path):
                category_path = os.path.join(domain_path, category_folder)
                if os.path.isdir(category_path):  
                    for filename in os.listdir(category_path):
                        img_path = os.path.join(category_path, filename)
                        img = cv2.imread(img_path)
                        if img is not None:
                            img_resized = cv2.resize(img, target_size)
                            images.append(img_resized)
                            labels.append(category_folder)
    return images, labels

root_folder_path = "/Users/yanzhu/Documents/Office31"
images, labels = load_images_from_root_folder(root_folder_path)

cluster_options = [50, 100, 150, 200]
for num_clusters in cluster_options:
    print(f"\nTesting num_clusters = {num_clusters}:")
    classifier = main(images, labels, num_clusters=num_clusters)


Testing num_clusters = 50:
50% train / 50% test ACC: 0.07918050941306755

Testing num_clusters = 100:
50% train / 50% test ACC: 0.05260243632336656

Testing num_clusters = 150:
50% train / 50% test ACC: 0.04928017718715393

Testing num_clusters = 200:
50% train / 50% test ACC: 0.044296788482835
