# Ray & TensorFlow

In [0]:
from src.utils.system_check import check_gpu, check_system_resources
from src.ray.mnist_tensorflow import get_mnist_data, build_mnist_model, CPUMonitor

## Single Driver Training


In [0]:
import time

# Single-driver run: train the MNIST model in this process and time it.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

(train_images, train_labels), (test_images, test_labels) = get_mnist_data()

model = build_mnist_model()

# The test split is passed as the validation set.
# CPUMonitor is a project callback from src.ray.mnist_tensorflow; presumably
# it records CPU usage during training -- confirm against its definition.
history = model.fit(
    train_images,
    train_labels,
    epochs=5,
    validation_data=(test_images, test_labels),
    callbacks=[CPUMonitor()],
)

end_time = time.perf_counter()
# NOTE: the interval covers data loading and model construction as well as
# the fit() call itself.
training_time = end_time - start_time

print(f"Training time: {training_time} seconds")

In [0]:
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster, MAX_NUM_WORKER_NODES
# Start a Ray cluster through ray.util.spark. min_worker_nodes and
# max_worker_nodes are both 1, so exactly one worker node is requested.
# collect_log_to_path names the location Ray node logs are collected into.
setup_ray_cluster(
    collect_log_to_path="/dbfs/Users/scott.mckean@databricks.com/ray_collected_logs",
    max_worker_nodes=1,
    min_worker_nodes=1,
)

In [0]:
import ray

# Initialise the Ray driver connection. Calling ray.init() a second time in
# the same process raises, which happens whenever this notebook cell is
# re-run, so only initialise when no connection exists yet.
# NOTE(review): presumably this attaches to the cluster started by
# setup_ray_cluster() in the previous cell -- confirm for your environment.
if not ray.is_initialized():
    ray.init()

# Last expression of the cell: shows the resources Ray reports for the cluster.
ray.cluster_resources()

In [0]:
import json
import os
import mlflow
import tempfile
import numpy as np
import tensorflow as tf

from ray.air.integrations.keras import ReportCheckpointCallback
from ray.train import Result, RunConfig, ScalingConfig, Checkpoint
from ray.train.tensorflow import TensorflowTrainer
from ray import train

import time
import psutil
import tensorflow as tf
from tensorflow.keras import layers, models

def train_func(config=None):
    """Train a small convolutional network on MNIST.

    Intended to be used as the ``train_loop_per_worker`` of a Ray
    ``TensorflowTrainer``.

    Args:
        config: Optional dict of hyperparameters. Only the ``"epochs"`` key is
            read (defaults to 5). ``None`` is treated as an empty dict.

    Returns:
        The ``history.history`` dict produced by ``model.fit`` (per-epoch
        loss/accuracy for the training and validation data).
    """
    # A mutable ``{}`` default would be shared between calls; normalise None.
    config = {} if config is None else config
    epochs = config.get("epochs", 5)

    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
    # Add a trailing channel axis and rescale by 1/255. Using -1 lets NumPy
    # infer the sample count instead of hard-coding 60000 / 10000.
    train_images = train_images.reshape((-1, 28, 28, 1)).astype('float32') / 255
    test_images = test_images.reshape((-1, 28, 28, 1)).astype('float32') / 255

    model = models.Sequential([
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ])

    # Labels are integer class ids, hence the sparse cross-entropy loss.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    history = model.fit(
        train_images,
        train_labels,
        epochs=epochs,
        validation_data=(test_images, test_labels),
    )

    # NOTE(review): Ray Train may not surface this return value through
    # trainer.fit(); ReportCheckpointCallback is imported at file level but
    # unused -- confirm whether metrics should be reported through it instead.
    return history.history

# Wrap train_func in a Ray TensorflowTrainer that runs it on a single worker
# with use_gpu=True.
# NOTE(review): the setup_ray_cluster() call in this notebook passes no
# GPU-related options -- confirm the worker node actually exposes a GPU.
trainer = TensorflowTrainer(
    scaling_config=ScalingConfig(use_gpu=True, num_workers=1),
    train_loop_per_worker=train_func,
)

In [0]:
# Run the training job defined by `trainer` and keep whatever fit() returns
# (presumably a ray.train.Result -- confirm for the installed Ray version).
results = trainer.fit()