# Training a Modified LeNet CNN for Breast Cancer Image Classification and Deploying It on an FPGA Target Using HLS4ML
## Vaggelis Ananiadis, Supervisor: Prof. Karakonstantis G.
#### ECE-284 - Processor Design

## Read the dataset, collect image paths and labels from the directory structure, and preprocess the images
#### Training dataset was taken from: 
https://www.kaggle.com/datasets/aryashah2k/breast-ultrasound-images-dataset

In [None]:
import os
import numpy as np
import cv2
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Config
# --- Paths -----------------------------------------------------------------
DATASET_PATH = 'dataset_grayscale'  # input root: one sub-directory per class
OUTPUT_PATH = 'training'            # destination for the generated .npy files

# --- Image geometry --------------------------------------------------------
NUM_CHANNELS = 1        # single-channel (grayscale); not referenced in this cell
IMAGE_SIZE = (32, 32)   # size every image is resized to before training

# --- Dataset partitioning --------------------------------------------------
# Both fractions are expressed relative to the full dataset; whatever is left
# over (1 - VAL_SIZE - TEST_SIZE) becomes the training set.
VAL_SIZE = 0.15
TEST_SIZE = 0.15

# --- Augmentation ----------------------------------------------------------
AUGMENT_FACTOR = 1  # How many extra augmented versions to generate per image

# Make sure the destination exists up front so the final np.save calls
# cannot fail on a missing directory.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Load images and labels
#
# Every sub-directory of DATASET_PATH is treated as one class; the integer
# label of a class is its position in the alphabetically sorted list of
# sub-directory names.
images = []
labels = []

# Only directories count as classes. Filtering here (rather than skipping
# inside the loop) keeps label indices contiguous and keeps
# len(class_names) equal to the real number of classes even when stray files
# (e.g. .DS_Store, a README) sit next to the class folders.
class_names = sorted(
    entry for entry in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, entry))
)
if not class_names:
    raise ValueError(f"No class sub-directories found in '{DATASET_PATH}'")

for label_index, class_name in enumerate(class_names):
    class_path = os.path.join(DATASET_PATH, class_name)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore the seeded train/val/test split) reproducible.
    # NOTE(review): the upstream Kaggle dataset ships segmentation masks
    # ("*_mask.png") beside the images - confirm DATASET_PATH excludes them.
    for file_name in sorted(os.listdir(class_path)):
        file_path = os.path.join(class_path, file_name)
        img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Not a decodable image (or unreadable): skip it.
            continue
        img = cv2.resize(img, IMAGE_SIZE)
        img = img.astype(np.float32) / 255.0  # scale pixel values to [0, 1]
        img = np.expand_dims(img, axis=-1)    # add trailing channel axis
        images.append(img)
        labels.append(label_index)

if not images:
    raise ValueError(f"No readable images found under '{DATASET_PATH}'")

# Convert to arrays
X = np.array(images)
y = np.array(labels)

# Stratified split into train/val/test
#
# Step 1: carve off a hold-out pool (validation + test) from the full dataset.
holdout_fraction = VAL_SIZE + TEST_SIZE
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    stratify=y,
    random_state=42,
)

# Step 2: divide the hold-out pool between validation and test. val_ratio is
# the validation share of the pool, so the test share is its complement.
val_ratio = VAL_SIZE / holdout_fraction
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=1 - val_ratio,
    stratify=y_temp,
    random_state=42,
)

# Data augmentation setup (only for training)
#
# Validation and test data are left untouched so they keep reflecting the
# original image distribution.
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

augmented_X = []
augmented_y = []

# Generate AUGMENT_FACTOR random variants of every training image; each
# variant inherits the label of the image it was derived from.
# NOTE(review): no seed is passed to flow(), so the augmented samples differ
# from run to run.
for x, label in zip(X_train, y_train):
    # flow() expects a batch, so wrap the single image in a batch of one.
    gen = datagen.flow(np.expand_dims(x, axis=0), batch_size=1)
    for _ in range(AUGMENT_FACTOR):
        augmented_X.append(next(gen)[0])
        augmented_y.append(label)

# Combine original + augmented training data
if augmented_X:
    X_train_aug = np.concatenate([X_train, np.array(augmented_X)], axis=0)
    y_train_aug = np.concatenate([y_train, np.array(augmented_y)], axis=0)
else:
    # AUGMENT_FACTOR == 0 (or an empty training set): an empty list becomes a
    # 1-D float array that cannot be concatenated with the 4-D image array,
    # so fall back to the un-augmented training data.
    X_train_aug = X_train
    y_train_aug = y_train

# One-hot encode labels
n_classes = len(class_names)
y_train_aug = to_categorical(y_train_aug, num_classes=n_classes)
y_val = to_categorical(y_val, num_classes=n_classes)
y_test = to_categorical(y_test, num_classes=n_classes)

# Save datasets
# Output filename -> array; insertion order is the order files are written.
arrays_to_save = {
    'X_train_32.npy': X_train_aug,
    'X_val_32.npy': X_val,
    'X_test_32.npy': X_test,
    'y_train.npy': y_train_aug,
    'y_val.npy': y_val,
    'y_test.npy': y_test,
    'classes.npy': np.array(class_names),
}
for npy_name, npy_array in arrays_to_save.items():
    np.save(os.path.join(OUTPUT_PATH, npy_name), npy_array)

# Summary of what was written
print("Data saved to", OUTPUT_PATH)
print(f"Train (augmented): {X_train_aug.shape}, Val: {X_val.shape}, Test: {X_test.shape}")