In [1]:
import numpy as np
import cv2
import os
from skimage import feature

def resize_image(image, size=(128, 128)):
    """Rescale ``image`` to a fixed size.

    Args:
        image: Image array accepted by ``cv2.resize``.
        size: Target size, forwarded unchanged as the second positional
            argument (``dsize``) of ``cv2.resize``, which OpenCV interprets
            as ``(width, height)``.

    Returns:
        The resized image produced by ``cv2.resize`` with its default
        interpolation.
    """
    resized = cv2.resize(image, size)
    return resized

def extract_color_histogram(image, bins=(8, 8, 8)):
    """Compute a normalized, flattened 3-D HSV color histogram.

    Args:
        image: Color image; it is converted with ``cv2.COLOR_BGR2HSV``, so it
            is expected to be in BGR channel order.
        bins: Number of histogram bins for the H, S and V channels.

    Returns:
        1-D array holding the histogram after ``cv2.normalize`` (called with
        its default settings) and ``flatten()``.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Histogram over all three channels: hue spans [0, 180), while
    # saturation and value span [0, 256).
    channels = [0, 1, 2]
    value_ranges = [0, 180, 0, 256, 0, 256]
    raw_hist = cv2.calcHist([hsv_image], channels, None, bins, value_ranges)

    # The same array is passed as source and destination, as in cv2's
    # in-place normalization idiom; the returned array is what gets flattened.
    normalized = cv2.normalize(raw_hist, raw_hist)
    return normalized.flatten()

def extract_edges(image, low_threshold=100, high_threshold=200):
    """Detect edges in a color image with the Canny detector.

    Args:
        image: Color image; it is converted with ``cv2.COLOR_BGR2GRAY``, so it
            is expected to be in BGR channel order.
        low_threshold: First (lower) hysteresis threshold passed to
            ``cv2.Canny``. Defaults to 100, the value previously hard-coded.
        high_threshold: Second (upper) hysteresis threshold passed to
            ``cv2.Canny``. Defaults to 200, the value previously hard-coded.

    Returns:
        The edge map returned by ``cv2.Canny`` for the grayscale image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.Canny(gray, low_threshold, high_threshold)

def extract_features(image):
    """Build the feature vector for a single image.

    The image is first resized with ``resize_image`` (default size); the
    color histogram and the edge map are then both computed from that
    resized image.

    Args:
        image: Color image as accepted by ``resize_image``.

    Returns:
        1-D array: the color histogram followed by the flattened edge map,
        joined with ``np.hstack``.
    """
    resized = resize_image(image)
    histogram_part = extract_color_histogram(resized)
    edge_part = extract_edges(resized).flatten()
    return np.hstack([histogram_part, edge_part])

def extract_and_store_features_in_batches(folder, batch_size=100):
    """Extract features for every readable image under ``folder``, in batches.

    ``folder`` is scanned for sub-directories; each sub-directory name is used
    as the label of the images it contains. Entries of ``folder`` that are not
    directories are ignored, and files that ``cv2.imread`` cannot decode
    (it returns ``None`` for them) are skipped. Nothing is written to disk
    here; the batches are only accumulated in memory and returned.

    Sub-directories and files are visited in sorted name order so that the
    resulting sample order is reproducible (``os.listdir`` itself returns
    entries in arbitrary order).

    Args:
        folder: Path of the dataset root directory.
        batch_size: Maximum number of images per batch; must be >= 1.

    Returns:
        Tuple ``(features, labels)`` of two equally long lists. Each element
        of ``features`` is an array stacking up to ``batch_size`` feature
        vectors, and the matching element of ``labels`` is an array of the
        corresponding sub-directory names. Both lists are empty when no image
        could be read.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    features = []
    labels = []
    batch_features = []
    batch_labels = []

    for subfolder in sorted(os.listdir(folder)):
        subfolder_path = os.path.join(folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        for filename in sorted(os.listdir(subfolder_path)):
            img = cv2.imread(os.path.join(subfolder_path, filename))
            if img is None:
                # Not a decodable image (or unreadable): skip it, best effort.
                continue
            batch_features.append(extract_features(img))
            batch_labels.append(subfolder)
            if len(batch_features) == batch_size:
                # Current batch is full: freeze it and start a new one.
                features.append(np.array(batch_features))
                labels.append(np.array(batch_labels))
                batch_features = []
                batch_labels = []

    # Keep the final, partially filled batch.
    if batch_features:
        features.append(np.array(batch_features))
        labels.append(np.array(batch_labels))
    return features, labels

# Root of the dataset passed to extract_and_store_features_in_batches.
# Set the DATASET_FOLDER environment variable to point the notebook at another
# location; the original author's local path is kept as the fallback so
# existing runs behave as before.
dataset_folder = os.environ.get(
    'DATASET_FOLDER',
    r'C:\Users\tw93\OneDrive\Desktop\masters_project\dataset',
)

# Runs the full extraction over the dataset; all batches are held in memory.
features_batches, labels_batches = extract_and_store_features_in_batches(dataset_folder)


In [2]:
import numpy as np

def save_batches(features_batches, labels_batches, base_filename='features_batch'):
    """Write each (features, labels) batch pair to its own pair of ``.npy`` files.

    Batch number ``i`` (starting at 0) is saved as
    ``{base_filename}_features_{i}.npy`` and ``{base_filename}_labels_{i}.npy``.

    Args:
        features_batches: Iterable of feature arrays; each one is cast to
            ``float32`` before being saved.
        labels_batches: Iterable of label arrays, paired positionally with
            ``features_batches`` via ``zip``.
        base_filename: Path prefix shared by all files that are written.
    """
    paired_batches = zip(features_batches, labels_batches)
    for i, pair in enumerate(paired_batches):
        batch_x, batch_y = pair
        features_path = f'{base_filename}_features_{i}.npy'
        labels_path = f'{base_filename}_labels_{i}.npy'
        np.save(features_path, batch_x.astype(np.float32))
        np.save(labels_path, batch_y)

# Write every in-memory batch to disk as a pair of .npy files (features and
# labels), using save_batches' default base filename.
save_batches(features_batches, labels_batches)



In [3]:
import numpy as np
from glob import glob

def load_and_concatenate_batches(base_filename='features_batch'):
    """Load all saved batches and merge them into single arrays.

    Looks for ``{base_filename}_features_*.npy`` and
    ``{base_filename}_labels_*.npy`` files and loads them in ascending
    *numeric* batch order (0, 1, 2, ..., 10, 11), so the rows come back in the
    order in which the batches were written rather than in lexicographic
    file-name order (where batch 10 would precede batch 2).

    Args:
        base_filename: Path prefix that the batch files were saved with.

    Returns:
        Tuple ``(features, labels)``: the feature batches stacked with
        ``np.vstack`` and the label batches joined with ``np.hstack``.

    Raises:
        ValueError: If no feature batch file is found, or if the feature and
            label files do not pair up one-to-one by batch suffix.
    """
    def batch_suffix(path):
        # Text between the last underscore and the ".npy" extension.
        return path[:-len('.npy')].rsplit('_', 1)[-1]

    def batch_order(path):
        suffix = batch_suffix(path)
        try:
            # Numeric suffixes sort by value so that 10 comes after 9.
            return (0, int(suffix), suffix)
        except ValueError:
            # Anything non-numeric goes last, in plain string order.
            return (1, 0, suffix)

    features_files = sorted(glob(f'{base_filename}_features_*.npy'), key=batch_order)
    labels_files = sorted(glob(f'{base_filename}_labels_*.npy'), key=batch_order)

    if not features_files:
        raise ValueError(
            f'no feature batch files found matching {base_filename}_features_*.npy'
        )

    feature_suffixes = [batch_suffix(path) for path in features_files]
    label_suffixes = [batch_suffix(path) for path in labels_files]
    if feature_suffixes != label_suffixes:
        # zip() would silently drop or mis-pair batches here; fail loudly instead.
        raise ValueError(
            'feature and label batch files do not pair up: '
            f'feature batches {feature_suffixes}, label batches {label_suffixes}'
        )

    features = np.vstack([np.load(path) for path in features_files])
    labels = np.hstack([np.load(path) for path in labels_files])
    return features, labels

# Reload every saved batch (default base filename) and merge them into one
# feature array and one label array.
features, labels = load_and_concatenate_batches()
