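This notebook loads the CPPE-5 images into a Ray Dataset, preprocesses them in parallel, converts the result into a TensorFlow dataset, and trains an object-detection model on it.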

In [0]:
import ray
import tensorflow as tf
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

In [0]:
# Uncomment and run to tear down a previously started Ray-on-Spark cluster,
# e.g. before re-running the setup cell below with different resources.
# shutdown_ray_cluster()

In [0]:
# Start a Ray cluster on top of Spark: at most 2 worker nodes with 16 CPUs each
# and a head node with 4 CPUs. No GPUs are requested on any node.
setup_ray_cluster(
    num_cpus_per_worker=16,
    num_gpus_per_worker=0,
    max_worker_nodes=2,
    num_cpus_head_node=4,
    num_gpus_head_node=0,
)

# ignore_reinit_error=True lets this cell be re-run when Ray is already initialised.
# NOTE(review): presumably this attaches to the cluster started above — confirm.
ray.init(ignore_reinit_error=True)

In [0]:
# Sanity check: display the total resources Ray reports for the cluster.
ray.cluster_resources()

In [0]:
from pathlib import Path
# Notebook configuration. The names image_paths, batch_size, img_shape and
# MAX_OBJECTS are kept unchanged for the cells below.
DATA_DIR = Path('/Volumes/shm/default/cppe5/')  # directory holding the PNG images
NUM_IMAGES = 100  # size of the image subset used in this notebook

# Sort before slicing: Path.glob yields entries in arbitrary (filesystem) order,
# so slicing the unsorted listing could select a different subset on each run.
image_paths = sorted(str(x) for x in DATA_DIR.glob('*.png'))[:NUM_IMAGES]
batch_size = 32
img_shape = (224, 224)  # presumably the target (height, width); not referenced in the visible cells
MAX_OBJECTS = 30  # presumably the per-image cap on annotated objects — confirm against preprocess.py

In [0]:
from preprocess import load_and_preprocess, prepare_tf_batch

In [0]:
# Apply preprocessing in parallel
# Pipeline: image paths -> per-item load_and_preprocess -> batched prepare_tf_batch
# -> TensorFlow dataset with "images" as features and "bboxes"/"classes" as labels.
# NOTE(review): assumes prepare_tf_batch emits exactly these three columns — confirm in preprocess.py.
tf_ds = (
  ray.data.from_items(image_paths)
  .map(load_and_preprocess)
  .map_batches(prepare_tf_batch, batch_size=batch_size)
  .to_tf(
    feature_columns=["images"],
    label_columns=["bboxes", "classes"],
    batch_size=batch_size,
    drop_last=True  # discard a trailing batch smaller than batch_size
  )
  .cache()  # tf.data cache: elements are stored after the first full pass and reused afterwards
)

In [0]:
# Inspect the structure, shapes and dtypes of the elements the TF dataset yields.
tf_ds.element_spec

In [0]:
# Pull a single batch and report the shape of each tensor it carries.
# batch[0] holds the feature dict, batch[1] the label dict.
for batch in tf_ds.take(1):
    images, bboxes, categories = (
        batch[0]['images'],
        batch[1]['bboxes'],
        batch[1]['classes'],
    )
    print(
        f"Input shape: {images.shape}",
        f"Bounding boxes shape: {bboxes.shape}",
        f"Categories shape: {categories.shape}",
        sep="\n",
    )

In [0]:
from model import build_object_detection_model, masked_mse, masked_sparse_categorical_crossentropy, process_predictions

# Build the detector for 5 object classes. MAX_OBJECTS (defined in the config
# cell) replaces the duplicated literal so the model's per-image object capacity
# cannot drift from the notebook configuration.
model = build_object_detection_model(num_classes=5, max_objects=MAX_OBJECTS)

In [0]:
# Single-process smoke test: train for one epoch on the cached TF dataset.
# NOTE(review): relies on build_object_detection_model returning an already-compiled model — confirm in model.py.
model.fit(tf_ds, epochs=1)

In [0]:
import tensorflow as tf

def train_func(config):
    """Fit a small CNN classifier on random data under a distributed strategy.

    Demonstration only: one batch of uniformly random 128x128x3 inputs and
    random integer labels stands in for a real dataset.

    Args:
        config: Mapping providing "num_classes", "batch_size" and "epochs".

    Returns:
        Whatever ``model.summary()`` returns. Keras prints the summary and
        returns ``None``, so callers receive ``None``.
    """
    keras = tf.keras
    layers = keras.layers

    # MultiWorkerMirroredStrategy performs synchronous training across multiple workers.
    mirrored = tf.distribute.MultiWorkerMirroredStrategy()

    with mirrored.scope():
        # Small convolutional classifier, layers listed input to output.
        layer_stack = [
            layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
            layers.MaxPooling2D((2, 2)),
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(config["num_classes"], activation='softmax'),
        ]
        net = keras.Sequential(layer_stack)
        net.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )

        # Synthetic stand-in data: a single batch of random images and labels.
        features = tf.random.uniform((config["batch_size"], 128, 128, 3))
        targets = tf.random.uniform(
            (config["batch_size"],),
            maxval=config["num_classes"],
            dtype=tf.int32,
        )

        net.fit(
            features,
            targets,
            epochs=config["epochs"],
            batch_size=config["batch_size"],
        )

    # summary() prints the architecture; its return value is passed through unchanged.
    summary_result = net.summary()
    return summary_result


Work in progress: distributed training with Ray Train's TensorflowTrainer

In [0]:
import ray
from ray.train.tensorflow import TensorflowTrainer
from ray.train import ScalingConfig

def train_func(config):
    """Per-worker training loop intended for Ray Train's ``TensorflowTrainer``.

    The trainer must be given a Ray Dataset under the ``"train"`` key. Each
    worker converts its shard into a TensorFlow dataset with the same
    feature/label layout as the single-node pipeline in this notebook
    (feature ``images``; labels ``bboxes`` and ``classes``).

    Metrics and checkpoints are reported to Ray Train at the end of every
    epoch through ``ReportCheckpointCallback``. This replaces the earlier
    calls to checkpoint helpers that Ray Train does not provide.

    Args:
        config: Training configuration with the keys
            ``num_classes``, ``input_shape``, ``max_objects`` (forwarded to
            ``build_object_detection_model``), ``epochs`` (number of passes
            over the shard) and, optionally, ``batch_size`` (per-worker batch
            size, default 32).

    Returns:
        None.

    NOTE(review): assumes ``build_object_detection_model`` accepts
    ``input_shape`` and returns a compiled model — confirm in model.py.
    """
    import tensorflow as tf
    from ray import train
    from ray.train.tensorflow import prepare_dataset_shard
    from ray.train.tensorflow.keras import ReportCheckpointCallback

    batch_size = config.get("batch_size", 32)

    # Model variables must be created inside the strategy scope so that
    # gradients are synchronised across the training workers.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        model = build_object_detection_model(
            num_classes=config["num_classes"],
            input_shape=config["input_shape"],
            max_objects=config["max_objects"],
        )

    # get_dataset_shard returns a Ray data iterator, not a tf.data.Dataset;
    # it has to be converted with to_tf before Keras can consume it.
    dataset_shard = train.get_dataset_shard("train")

    for _ in range(config["epochs"]):
        # Re-create the TF dataset each epoch so every pass streams the shard afresh.
        # prepare_dataset_shard turns off tf.data auto-sharding: Ray has already
        # split the data between workers.
        tf_dataset = prepare_dataset_shard(
            dataset_shard.to_tf(
                feature_columns=["images"],
                label_columns=["bboxes", "classes"],
                batch_size=batch_size,
            )
        )
        model.fit(tf_dataset, callbacks=[ReportCheckpointCallback()])

In [0]:
# Configure training parameters
# NOTE(review): left commented out — work in progress. Before enabling, confirm:
#   - `datasets` is expected to hold Ray Datasets (per Ray Train's API), whereas
#     `tf_ds` is the TensorFlow dataset produced by `.to_tf(...).cache()` above.
#   - The values below differ from the single-node run earlier in this notebook
#     (num_classes=5, max_objects=30, img_shape=(224, 224)), and "input_shape"
#     is written channels-first — verify what build_object_detection_model expects.
# trainer = TensorflowTrainer(
#     train_func,
#     train_loop_config={
#         "num_classes": 10,  # Adjust based on your dataset
#         "input_shape": (3, 224, 224),
#         "max_objects": 50,
#         "epochs": 3,
#         "batch_size": 32
#     },
#     scaling_config=ScalingConfig(
#         num_workers=2,  # Number of training workers (CPU-only: use_gpu=False)
#         use_gpu=False,
#         resources_per_worker={"GPU": 0, "CPU":4}
#     ),
#     datasets={"train": tf_ds}
# )

# # Start distributed training
# result = trainer.fit()

# # Get the best checkpoint
# best_checkpoint = result.checkpoint