# Module 1, Task 1: Comparison of Memory-Based and Generator-Based Data Loading

**Objective:** To understand and compare two primary methods of loading data: loading the entire dataset into memory versus using a data generator that loads data in batches.

In [None]:
# Install necessary libraries
!pip install tensorflow tensorflow-datasets numpy matplotlib memory_profiler

### Setup
First, let's import the libraries and load the EuroSAT (RGB) dataset along with its metadata. We'll use `tensorflow_datasets` for convenience.

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time
import matplotlib.pyplot as plt
from memory_profiler import memory_usage

# Open the EuroSAT RGB dataset. `with_info=True` additionally returns the
# dataset metadata, and `as_supervised=True` makes every element an
# (image, label) pair.
ds, ds_info = tfds.load(
    'eurosat/rgb',
    as_supervised=True,
    with_info=True,
)
# The 'train' split is the one used by every experiment below.
train_ds = ds['train']

# Summarise the dataset from its metadata (no images are read here).
print(f"Total number of images in the dataset: {ds_info.splits['train'].num_examples}")
print(f"Image shape: {ds_info.features['image'].shape}")
print(f"Number of classes: {ds_info.features['label'].num_classes}")

### Method 1: Memory-Based Data Loading

Here, we load the entire dataset into RAM. This is feasible for small datasets but quickly becomes a bottleneck for large datasets, as it can exhaust the available system memory.

In [None]:
def load_into_memory():
    """Load the entire EuroSAT train split into NumPy arrays held in memory.

    Returns:
        tuple: ``(images, labels)`` where ``images`` stacks every image along a
        new leading axis and ``labels`` is the matching 1-D array of labels.

    Raises:
        ValueError: If the dataset yields no examples (``np.stack`` cannot
            stack an empty sequence).
    """
    # `tfds.as_numpy` on a tf.data.Dataset yields one (image, label) pair per
    # example, so it cannot be unpacked directly into two arrays. Collect the
    # examples first, then combine them into single arrays.
    image_list = []
    label_list = []
    for image, label in tfds.as_numpy(train_ds):
        image_list.append(image)
        label_list.append(label)

    images = np.stack(image_list)
    labels = np.asarray(label_list)

    print(f"Loaded {len(images)} images and {len(labels)} labels into memory.")
    print(f"Images array shape: {images.shape}")
    print(f"Labels array shape: {labels.shape}")
    return images, labels

print("Measuring memory usage for memory-based loading...")
# `memory_usage` calls the given function while sampling memory consumption;
# with `max_usage=True` only the highest sampled value is returned.
mem_based_peak_mem = memory_usage(
    proc=load_into_memory,
    max_usage=True,
)

print(f"\nPeak memory usage for memory-based loading: {mem_based_peak_mem:.2f} MiB")

### Method 2: Generator-Based Data Loading

A generator is an iterable that yields data one batch at a time. It only loads the data for the current batch into memory, making it highly memory-efficient. TensorFlow's `tf.data.Dataset` API is a powerful, generator-based system for building efficient input pipelines.

In [None]:
def generator_based_loading(batch_size=32):
    """Iterate over the train split one batch at a time, generator-style.

    Args:
        batch_size: Number of examples per batch. Defaults to 32, the value
            this demo originally hard-coded, so calling the function with no
            arguments behaves as before.

    Returns:
        None. Progress and the number of processed batches are printed.
    """
    # A tf.data.Dataset is consumed lazily by iteration; here it is only
    # configured to group examples into batches.
    batched_ds = train_ds.batch(batch_size)

    print(f"Processing dataset with a batch size of {batch_size}...")
    num_batches = 0
    # The batch contents are intentionally unused: this demo only walks the
    # pipeline to observe its memory footprint.
    for _images, _labels in batched_ds:
        # In a real training loop, you would perform an operation here.
        # For this demo, we'll just simulate work with a small delay.
        time.sleep(0.01)
        num_batches += 1

    print(f"Successfully processed {num_batches} batches.")

print("\nMeasuring memory usage for generator-based loading...")
# Same measurement as for the memory-based loader: run the function under
# `memory_usage` and keep only the highest sampled value.
gen_based_peak_mem = memory_usage(
    proc=generator_based_loading,
    max_usage=True,
)

print(f"\nPeak memory usage for generator-based loading: {gen_based_peak_mem:.2f} MiB")

### Comparison and Conclusion

Let's visualize the difference in peak memory consumption.

In [None]:
# Pair each loading strategy with the peak memory measured for it above.
peak_by_method = {
    'Memory-Based': mem_based_peak_mem,
    'Generator-Based': gen_based_peak_mem,
}
methods = list(peak_by_method.keys())
memory_values = list(peak_by_method.values())

# Draw one bar per strategy using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(methods, memory_values, color=['orangered', 'dodgerblue'])
ax.set_ylabel('Peak Memory Usage (MiB)')
ax.set_title('Memory Usage Comparison: Memory-Based vs. Generator-Based Loading')
# Annotate each bar with its measured value.
ax.bar_label(bars, fmt='%.2f MiB')
plt.show()

# Print the take-away messages, one per line.
conclusion_lines = (
    "Conclusion:",
    "- **Memory-Based Loading:** Consumes a large amount of RAM because the entire dataset is loaded at once. This approach is not scalable and will fail for datasets larger than the available RAM.",
    "- **Generator-Based Loading:** Exhibits significantly lower and constant memory usage. It only holds a small batch of data in memory at any given time, making it the standard and most efficient method for deep learning, especially with large datasets.",
)
for line in conclusion_lines:
    print(line)