In [1]:
import os
import cv2
import numpy as np
import time
import random

In [2]:
# Default dataset root; NOTE: reassigned in the loader-setup cell further down.
DATA_DIR = 'data'
# Maps each class sub-directory name to its integer label.
CLASSES = {'cats': 0, 'dogs': 1}
# Default batch size; NOTE: also reassigned in the loader-setup cell further down.
BATCH_SIZE = 64
# Target size passed to cv2.resize for every loaded image.
IMAGE_SIZE = (128, 128)

In [3]:
def traditional_data_loader(data_dir, batch_size, classes=None, image_size=None):
    """A naive, single-threaded data loader.

    Infinite generator that, for every epoch, shuffles the file list and
    yields full batches only (a trailing partial batch is dropped).

    Because this is a generator, nothing below runs (including the
    validation and the debug prints) until the first batch is requested.

    Args:
        data_dir: Directory containing one sub-directory per class.
        batch_size: Number of images per yielded batch (must be >= 1).
        classes: Optional mapping ``{sub-directory name: integer label}``.
            Defaults to the module-level ``CLASSES``.
        image_size: Optional size tuple passed to ``cv2.resize``.
            Defaults to the module-level ``IMAGE_SIZE``.

    Yields:
        ``(images, labels)`` where ``images`` is a float32 array scaled to
        [0, 1] holding ``batch_size`` augmented images and ``labels`` is an
        array with the matching integer labels.

    Raises:
        ValueError: If ``batch_size`` is < 1, if fewer than ``batch_size``
            files were found (no full batch could ever be produced), or if
            an image file cannot be read/decoded.
    """
    if classes is None:
        classes = CLASSES
    if image_size is None:
        image_size = IMAGE_SIZE
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    filepaths = []
    labels = []
    for cls, label_idx in classes.items():
        class_dir = os.path.join(data_dir, cls)
        for filename in os.listdir(class_dir):
            path = os.path.join(class_dir, filename)
            # Nested directories cannot be decoded as images; skip them.
            if not os.path.isfile(path):
                continue
            filepaths.append(path)
            labels.append(label_idx)
    print(labels[:10])  # Debug: show first 10 labels
    print(filepaths[:10])  # Debug: show first 10 file paths

    # Without this guard every batch would be skipped as "partial" and the
    # epoch loop below would spin forever without yielding anything.
    if len(filepaths) < batch_size:
        raise ValueError(
            f"Found {len(filepaths)} file(s) under {data_dir!r}, "
            f"need at least batch_size={batch_size} to form a batch"
        )

    while True:
        # Shuffle data each "epoch"
        indices = np.random.permutation(len(filepaths))
        for i in range(0, len(filepaths), batch_size):
            batch_indices = indices[i:i + batch_size]
            if len(batch_indices) != batch_size:
                # Drop the trailing partial batch.
                continue

            batch_images = []
            batch_labels = []

            for idx in batch_indices:
                # 1. Read file from disk (I/O block)
                image = cv2.imread(filepaths[idx])
                if image is None:
                    # cv2.imread signals failure by returning None rather
                    # than raising; fail with the offending path instead of
                    # an opaque assertion inside cv2.resize.
                    raise ValueError(
                        f"Could not read or decode image: {filepaths[idx]}"
                    )
                image = cv2.resize(image, image_size)

                # 2. Manual Data Augmentation (CPU-bound)
                if random.random() > 0.5:
                    image = cv2.flip(image, 1)  # Horizontal flip

                # Adjust brightness by a random factor in [0.8, 1.2]
                brightness_factor = 1.0 + random.uniform(-0.2, 0.2)
                image = np.clip(image * brightness_factor, 0, 255)

                batch_images.append(image)
                batch_labels.append(labels[idx])

            # 3. Stack and yield (Finally ready for GPU)
            yield np.array(batch_images, dtype=np.float32) / 255.0, np.array(batch_labels)


In [4]:
# Dataset root. Set the CATS_DOGS_ROOT environment variable to point at your own
# copy of the dataset; the fallback is the original author's local checkout.
root_path = os.environ.get(
    "CATS_DOGS_ROOT",
    "/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered",
)
# NOTE: DATA_DIR and BATCH_SIZE intentionally override the defaults from the
# configuration cell for this benchmark run.
DATA_DIR = os.path.join(root_path, "train")
BATCH_SIZE = 32

print(f"Data directory: {DATA_DIR}")
# Creating the generator is lazy: no files are listed or read until the first
# batch is requested.
loader = traditional_data_loader(DATA_DIR, BATCH_SIZE)


Data directory: /Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train


In [5]:
# Display the kernel's current working directory (sanity check for path setup).
os.getcwd()

'/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/data_pre'

In [6]:
# Benchmark: time how long the naive loader takes to produce NUM_BATCHES batches.
NUM_BATCHES = 500

print("Starting Traditional Loader benchmark...")
# perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
start_time = time.perf_counter()
# Count from 1 so the loop stops after exactly NUM_BATCHES batches were loaded
# (a zero-based index with `i >= 500` would fetch and time a 501st batch).
for i, (images, labels) in enumerate(loader, start=1):
    if i >= NUM_BATCHES:
        break
end_time = time.perf_counter()

print(f"Traditional Loader: Processed {NUM_BATCHES} batches in {end_time - start_time:.4f} seconds.")


Starting Traditional Loader benchmark...
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/cats/cat.952.jpg', '/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/cats/cat.946.jpg', '/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/cats/cat.2042.jpg', '/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/cats/cat.2056.jpg', '/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/cats/cat.6.jpg', '/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/cats/cat.749.jpg', '/Users/tharhtet/Documents/github/ML-in-Prod-batch-3/6_deep_learning/tf_best_practices/cats_and_dogs_filtered/train/c