In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import hog
from pathlib import Path
from skimage import feature
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

from sklearn.preprocessing import LabelEncoder

In [None]:
# --- Dataset locations ---
# Everything lives under dataset/malaria; the training and testing splits are
# sibling folders beneath that root.
root_path = os.path.join('dataset', 'malaria')
train_base_path, test_base_path = (
    os.path.join(root_path, split_dir) for split_dir in ('training_ds', 'testing_ds')
)

# Image resolutions to process (one 'resized_<size>' folder per entry).
# To process 256x256 images as well, use [128, 256].
image_sizes = [128]

# Extracted feature files are written here; create the folder if it is missing.
FEATURES_DIR = os.path.join(root_path, 'extracted_features')
os.makedirs(FEATURES_DIR, exist_ok=True)

print("Root Path:", root_path)
print("Train Base Path:", train_base_path)
print("Test Base Path:", test_base_path)

Root Path: dataset\malaria
Train Base Path: dataset\malaria\training_ds
Test Base Path: dataset\malaria\testing_ds


### Feature Extraction

In [17]:

# --- Feature extractors ---
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Compute a flattened, normalized 3-D HSV histogram of an image.

    Args:
        image: An already-loaded image array (not a file path). It is
            converted with ``cv2.COLOR_BGR2HSV``, so BGR channel order is
            expected.
        bins: Number of histogram bins for the H, S and V channels.

    Returns:
        The histogram flattened to a 1-D array.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channel_indices = [0, 1, 2]
    # Value ranges handed to calcHist: hue 0-180, saturation and value 0-256.
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], channel_indices, None, bins, channel_ranges)
    # Normalize in place using cv2.normalize's default settings.
    cv2.normalize(histogram, histogram)
    return histogram.flatten()

def extract_hog_features(image, pixels_per_cell=(16, 16), cells_per_block=(2, 2)):
    """Compute a HOG descriptor for an image.

    Args:
        image: An already-loaded image array (not a file path), converted to
            grayscale with ``cv2.COLOR_BGR2GRAY``. No resizing is done here,
            so the image must already have the intended size.
        pixels_per_cell: Cell size forwarded to ``skimage.feature.hog``.
        cells_per_block: Block size forwarded to ``skimage.feature.hog``.

    Returns:
        The feature vector returned by ``skimage.feature.hog``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    hog_options = {
        'pixels_per_cell': pixels_per_cell,
        'cells_per_block': cells_per_block,
        'visualize': False,  # only the descriptor is needed, not the HOG image
    }
    return feature.hog(grayscale, **hog_options)

def extract_lbp_features(image, numPoints=24, radius=8, eps=1e-7):
    """Compute a normalized histogram of uniform Local Binary Patterns.

    Args:
        image: An already-loaded image array, converted to grayscale with
            ``cv2.COLOR_BGR2GRAY``.
        numPoints: Number of neighbour points passed to
            ``local_binary_pattern``.
        radius: Radius passed to ``local_binary_pattern``.
        eps: Small constant added to the histogram total so the division
            never hits zero.

    Returns:
        A float array of ``numPoints + 2`` bin frequencies.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    pattern_map = feature.local_binary_pattern(grayscale, numPoints, radius, method="uniform")

    # numPoints + 3 integer edges -> numPoints + 2 unit-width bins.
    bin_edges = np.arange(0, numPoints + 3)
    counts, _ = np.histogram(pattern_map.ravel(), bins=bin_edges)

    counts = counts.astype("float")
    return counts / (counts.sum() + eps)

# --- Registry of feature extractors ---
# Maps a short feature name to the function that computes it; the color
# histogram is registered under the key 'HIST'.
# NOTE: the variable name is misspelled ("extrators"). It is kept as-is because
# the extraction loop iterates over this exact name.
feature_extrators = dict(
    HOG=extract_hog_features,
    LBP=extract_lbp_features,
    HIST=extract_color_histogram,
)

In [18]:
def load_data_from_directory(directory_path):
    """Collect image paths and labels from a one-folder-per-class directory.

    Each immediate sub-directory of ``directory_path`` is treated as a class;
    its name becomes the label of every ``.png``/``.jpg``/``.jpeg`` file
    (case-insensitive) directly inside it. Files sitting directly in
    ``directory_path`` and non-image files are ignored.

    Args:
        directory_path: Directory containing one sub-directory per class.

    Returns:
        A DataFrame with columns ``image_path`` and ``label``. The columns are
        present even when no images are found, so callers can always index
        ``df['label']``. Rows are ordered by class name, then by file name.

    Raises:
        OSError: If ``directory_path`` cannot be listed (e.g. it is missing).
    """
    print(f"Loading image paths from: {directory_path}")
    image_extensions = ('.png', '.jpg', '.jpeg')
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything derived from it) reproducible across runs/machines.
    for label in sorted(os.listdir(directory_path)):
        class_dir = os.path.join(directory_path, label)
        if not os.path.isdir(class_dir):
            continue
        for filename in sorted(os.listdir(class_dir)):
            if filename.lower().endswith(image_extensions):
                records.append({
                    'image_path': os.path.join(class_dir, filename),
                    'label': label,
                })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=['image_path', 'label'])


# Helper: load every image listed in a DataFrame and run one feature extractor on it
def process_and_extract(df, extractor_fn):
    """Run a feature extractor over every image listed in a DataFrame.

    Each row's ``image_path`` is read with ``cv2.imread`` and handed to
    ``extractor_fn``. Rows whose image cannot be read, or whose processing
    raises, are reported with a printed message and left out of the result.

    Args:
        df: DataFrame with ``image_path`` and ``label`` columns.
        extractor_fn: Callable taking a loaded image and returning its
            feature vector.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays built from the rows
        that were processed successfully.
    """
    collected_features, collected_labels = [], []

    progress = tqdm(df.iterrows(), total=len(df), desc="Extracting features")
    for _, record in progress:
        try:
            loaded = cv2.imread(record['image_path'])

            if loaded is not None:
                # Extract first, then record the label, so both lists only
                # grow for images that were actually processed.
                collected_features.append(extractor_fn(loaded))
                collected_labels.append(record['label'])
            else:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Warning: Could not read image file, skipping: {record['image_path']}")

        except Exception as err:
            # Best-effort: report the failure and move on to the next image.
            print(f"Error processing file {record['image_path']}. Error: {err}. Skipping.")

    return np.array(collected_features), np.array(collected_labels)

In [19]:
# For every configured image size, extract each registered feature type for the
# train and test splits and cache the result as one joblib file per
# (size, feature) pair inside FEATURES_DIR.
for size in image_sizes:
    print(f"\nProcessing {size}x{size} images...")
    train_df = load_data_from_directory(os.path.join(train_base_path, f'resized_{size}'))
    test_df = load_data_from_directory(os.path.join(test_base_path, f'resized_{size}'))

    # Fit the encoder on the training labels only; the same fitted encoder is
    # then used for both splits and stored alongside the features.
    le = LabelEncoder()
    le.fit(train_df['label'])

    for name, func in feature_extrators.items():
        output_path = os.path.join(FEATURES_DIR, f"{size}_{name}_features.pkl")

        # Resumable: a finished feature file is never recomputed.
        if os.path.exists(output_path):
            print(f"Features for {size}_{name} already exist. Skipping.")
            continue

        print(f"--- Extracting {name} features for {size}x{size} images ---")
        X_train, y_train_labels = process_and_extract(train_df, func)
        X_test, y_test_labels = process_and_extract(test_df, func)

        # Encode labels
        y_train_encoded = le.transform(y_train_labels)
        y_test_encoded = le.transform(y_test_labels)

        # Save everything needed for training in one file
        data_to_save = {
            'X_train': X_train, 'y_train': y_train_encoded,
            'X_test': X_test, 'y_test': y_test_encoded,
            'label_encoder': le
        }

        print(f"Saving features to {output_path}...")
        # Write to a temporary file and rename it into place. If the run is
        # interrupted mid-write, no truncated file is left at output_path for
        # the "already exist" check above to mistake for a finished result.
        tmp_path = output_path + ".tmp"
        joblib.dump(data_to_save, tmp_path)
        os.replace(tmp_path, output_path)


Processing 128x128 images...
Loading image paths from: dataset\malaria\training_ds\resized_128
Loading image paths from: dataset\malaria\testing_ds\resized_128
Features for 128_HOG already exist. Skipping.
Features for 128_LBP already exist. Skipping.
Features for 128_HIST already exist. Skipping.
