In [2]:
from src.utils.system_check import check_gpu, check_system_resources
check_gpu()
check_system_resources()

TensorFlow version: 2.18.0

Physical devices:
  CPU: /physical_device:CPU:0

No GPUs found. Running on CPU

CPU Resources:
  Physical CPU cores: 12
  Logical CPU cores: 12

Memory Resources:
  Total Memory: 36.00 GB
  Available Memory: 15.78 GB

TensorFlow Threading:
  Inter-op parallelism threads: 0
  Intra-op parallelism threads: 0


12

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
from ray.train.tensorflow import TensorflowTrainer

import time
import psutil
import numpy as np

import ray
from ray import train

## Get the MNIST Data
We will use this section to get the MNIST data and parallelize when necessary.

In [3]:
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

## Setup Model
Declare the model

In [4]:
def build_model():
    model = models.Sequential([
        layers.Input(shape=(28, 28, 1)),  # Replace Conv2D input_shape with Input layer
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

## Train the Model
Here we will train the model and benchmark the performance and time taken.

In [5]:
def train_func(config={}):
    # Get the MNIST data
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
    
    # Normalize and reshape images
    train_images = train_images.reshape((60000, 28, 28, 1)).astype('float32') / 255
    test_images = test_images.reshape((10000, 28, 28, 1)).astype('float32') / 255
    
    # Build and compile the model
    model = build_model()
    
    # Create the CPU monitor callback
    class CPUMonitor(tf.keras.callbacks.Callback):
        def on_epoch_begin(self, epoch, logs=None):
            self.epoch_start_time = time.time()
            self.cpu_percent = psutil.cpu_percent()
            print(f"\nEpoch {epoch+1} starting CPU usage: {self.cpu_percent}%")
            
        def on_epoch_end(self, epoch, logs=None):
            epoch_time = time.time() - self.epoch_start_time
            cpu_percent = psutil.cpu_percent()
            print(f"Epoch {epoch+1} ending CPU usage: {cpu_percent}%")
            print(f"Epoch time: {epoch_time:.2f}s")
    
    # Train the model
    start_time = time.time()
    history = model.fit(
        train_images, train_labels,
        epochs=5,
        validation_data=(test_images, test_labels),
        callbacks=[CPUMonitor()]
    )
    end_time = time.time()
    print(f"Training time: {end_time - start_time:.2f} seconds")
    
    return history.history

In [7]:
train_func(config={})


Epoch 1 starting CPU usage: 18.5%
Epoch 1/5
[1m1874/1875[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.8925 - loss: 0.3395Epoch 1 ending CPU usage: 48.1%
Epoch time: 9.35s
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8925 - loss: 0.3393 - val_accuracy: 0.9853 - val_loss: 0.0490

Epoch 2 starting CPU usage: 0.0%
Epoch 2/5
[1m1870/1875[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.9857 - loss: 0.0453Epoch 2 ending CPU usage: 47.9%
Epoch time: 9.36s
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9857 - loss: 0.0453 - val_accuracy: 0.9837 - val_loss: 0.0523

Epoch 3 starting CPU usage: 0.0%
Epoch 3/5
[1m1873/1875[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.9895 - loss: 0.0306Epoch 3 ending CPU usage: 48.4%
Epoch time: 10.80s
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - 

{'accuracy': [0.9553666710853577,
  0.9859166741371155,
  0.9894166588783264,
  0.9919499754905701,
  0.9933833479881287],
 'loss': [0.14468730986118317,
  0.044151224195957184,
  0.03144754841923714,
  0.02391563355922699,
  0.02003360725939274],
 'val_accuracy': [0.9853000044822693,
  0.9836999773979187,
  0.988099992275238,
  0.9886000156402588,
  0.9919000267982483],
 'val_loss': [0.04897346720099449,
  0.05234222486615181,
  0.03661230579018593,
  0.03670158609747887,
  0.027031652629375458]}

In [12]:
from ray.train import RunConfig, ScalingConfig

trainer = TensorflowTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=11, use_gpu=False),
    run_config=RunConfig(name="mnist_training")
)

results = trainer.fit()

2025-02-09 21:32:20,288	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-02-09 21:32:20 (running for 00:00:00.12)
Using FIFO scheduling algorithm.
Logical resource usage: 12.0/12 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-02-09_21-28-39_270929_69376/artifacts/2025-02-09_21-32-20/mnist_training/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-02-09 21:32:25 (running for 00:00:05.12)
Using FIFO scheduling algorithm.
Logical resource usage: 12.0/12 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-02-09_21-28-39_270929_69376/artifacts/2025-02-09_21-32-20/mnist_training/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-02-09 21:32:30 (running for 00:00:10.21)
Using FIFO scheduling algorithm.
Logical resource usage: 12.0/12 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-02-09_21-28-39_270929_69376/artifacts/2025-02-09_21-32-20/mnist_training/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-02-09 21:32:35 (runn

2025-02-09 21:36:59,142	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/scott.mckean/ray_results/mnist_training' in 0.0032s.
2025-02-09 21:36:59,144	INFO tune.py:1041 -- Total run time: 278.86 seconds (278.85 seconds for the tuning loop).


Trial TensorflowTrainer_04156_00000 completed. Last result: 
== Status ==
Current time: 2025-02-09 21:36:59 (running for 00:04:38.85)
Using FIFO scheduling algorithm.
Logical resource usage: 12.0/12 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-02-09_21-28-39_270929_69376/artifacts/2025-02-09_21-32-20/mnist_training/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)




In [11]:
trainer

<TensorflowTrainer scaling_config=ScalingConfig(num_workers=4) run_config=RunConfig(name='mnist_training', storage_path='/Users/scott.mckean/ray_results', verbose=1)>