In [1]:
import cv2
import os
import glob
import numpy as np
from tqdm import tqdm # Used to display a progress bar
from sklearn.model_selection import train_test_split


ModuleNotFoundError: No module named 'cv2'

In [None]:
# Paths and target size used by the preprocessing cells below.
# TODO: confirm DATA_DIR points at the dataset root; it is a relative path, so it
# is resolved against the notebook's current working directory.
DATA_DIR = 'cell_images' 
# Default `target_size` that preprocess_image hands to cv2.resize.
TARGET_SIZE = (128, 128) 



In [None]:
# Class names in label order: Parasitized -> 0, Uninfected -> 1
CATEGORIES = ['Parasitized', 'Uninfected']

# Accumulators for the processed image arrays (X) and their labels (y);
# two separate, initially empty lists.
all_processed_images, all_labels = [], []


def preprocess_image(image_path, target_size=TARGET_SIZE):
    """Load one image from disk, resize it, convert it to RGB and scale it.

    Args:
        image_path: Path of the image file, passed straight to ``cv2.imread``.
        target_size: Size tuple handed to ``cv2.resize`` (OpenCV interprets it
            as ``(width, height)``). Defaults to the value ``TARGET_SIZE`` had
            when this function was defined.

    Returns:
        A ``float32`` NumPy array holding the resized RGB image divided by
        255 (values in [0.0, 1.0] for 8-bit input), or ``None`` when
        ``cv2.imread`` could not load the file.
    """
    # OpenCV loads images in BGR channel order and returns None on failure.
    img = cv2.imread(image_path)
    if img is None:
        return None

    # Every image must share one spatial size before it can be stacked into a
    # single array for the CNN.
    img_resized = cv2.resize(img, target_size)

    # Most deep learning models expect RGB channel order.
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)

    # Cast before dividing: integer_array / 255.0 would promote to float64,
    # doubling the memory needed to hold the whole dataset in RAM for no
    # benefit to training. float32 keeps the scaled pixels at half the size.
    return img_rgb.astype(np.float32) / 255.0



In [None]:
## Loop Through and Process all Images

print(f" Starting Image Preprocessing for {len(CATEGORIES)} Categories ")

# Start from empty accumulators so that re-running this cell rebuilds the
# dataset instead of appending a second copy to lists left over from an
# earlier run.
all_processed_images = []
all_labels = []
skipped_paths = []  # files preprocess_image could not load

# The label of a category is its position in CATEGORIES
# (0 for Parasitized, 1 for Uninfected).
for current_label, category in enumerate(CATEGORIES):
    path = os.path.join(DATA_DIR, category)
    # Get all image paths (assuming .png format). glob returns them in a
    # filesystem-dependent order, so sort to keep the dataset order, and
    # therefore the seeded train/test split, reproducible.
    image_paths = sorted(glob.glob(os.path.join(path, '*.png')))

    print(f"\nProcessing {len(image_paths)} images in category: {category}")

    # Use tqdm to show a progress bar for the long loading process
    for image_path in tqdm(image_paths, desc=f"Cleaning {category}"):
        processed_img = preprocess_image(image_path)

        if processed_img is None:
            # Unreadable file: remember it instead of dropping it silently.
            skipped_paths.append(image_path)
            continue

        # Keep image and label in lockstep so index i of one matches the other.
        all_processed_images.append(processed_img)
        all_labels.append(current_label)

# Only report when something was actually skipped.
if skipped_paths:
    print(f"\nSkipped {len(skipped_paths)} unreadable image(s)")




 Starting Image Preprocessing for 2 Categories 

Processing 13779 images in category: Parasitized


Cleaning Parasitized: 100%|██████████| 13779/13779 [03:18<00:00, 69.57it/s]



Processing 13779 images in category: Uninfected


Cleaning Uninfected: 100%|██████████| 13779/13779 [03:04<00:00, 74.58it/s]


In [None]:
## 2. Final Conversion to NumPy Arrays

print("\nFinalizing Data Arrays")

# Fail early with a clear message: an empty list would otherwise be reported
# as "SUCCESS ... 0" here and only blow up later inside train_test_split.
if not all_processed_images:
    raise ValueError(
        f"No images were loaded from {DATA_DIR!r}; "
        "check the dataset path and the expected .png files."
    )

# Convert lists to NumPy arrays, which are mandatory for CNN input.
# Stacking as float32 keeps the in-memory dataset at half the size float64
# would need.
X_images = np.array(all_processed_images, dtype=np.float32)
y_labels = np.array(all_labels)

print(f"SUCCESS: Total images processed: {len(X_images)}")
# The final shape is (Number_of_Images, Height, Width, Color_Channels)
print(f"Final Image Data Shape (X_images): {X_images.shape}")
print(f"Final Label Data Shape (y_labels): {y_labels.shape}")




Finalizing Data Arrays
SUCCESS: Total images processed: 27558
Final Image Data Shape (X_images): (27558, 128, 128, 3)
Final Label Data Shape (y_labels): (27558,)


In [None]:

# Aliases for the full feature array and label vector that feed the split.
features_img, target_labels = X_images, y_labels

# Hold out 20% of the samples for testing. Stratifying on the labels keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    features_img,
    target_labels,
    stratify=target_labels,
    test_size=0.2,
    random_state=42,
)

# Report the resulting sizes and shapes, one item per line.
print(
    "\n Image Data Splitting Completed ",
    f"Total Images: {len(X_images)}",
    f"Training Images (80%): {len(X_train_img)}",
    f"Testing Images (20%): {len(X_test_img)}",
    f"Training Features Shape: {X_train_img.shape}",
    f"Testing Labels Shape: {y_test_img.shape}",
    sep="\n",
)


 Image Data Splitting Completed 
Total Images: 27558
Training Images (80%): 22046
Testing Images (20%): 5512
Training Features Shape: (22046, 128, 128, 3)
Testing Labels Shape: (5512,)


In [None]:
# Write each split to its own .npy file in the current working directory:
# features (X_*) first, then targets (y_*).
for npy_path, split_array in (
    ('X_train_img.npy', X_train_img),
    ('X_test_img.npy', X_test_img),
    ('y_train_img.npy', y_train_img),
    ('y_test_img.npy', y_test_img),
):
    np.save(npy_path, split_array)