Imports necessary libraries: ray, torch, torch.nn, torchvision, torch.utils.data, and Ray Train-specific modules.

In [8]:
!pip install -q ray[default] torch torchvision tqdm ray[train]

In [4]:
from cmlextensions.ray_cluster import RayCluster

# Set up the Ray Cluster using CMLExtensions
cluster = RayCluster(num_workers=3, worker_cpu=4, worker_memory=8, worker_nvidia_gpu=1, head_cpu=4, head_memory=8, head_nvidia_gpu=1)
cluster.init()

Starting ray head...
Starting 3 ray workers...

--------------------
Ray cluster started
--------------------

The Ray dashboard is running at
https://nke47n3zmba7mpvh.goes-ocp-cml.apps.field-team-ocp-01.kcloud.cloudera.com/

To connect to this Ray cluster from this CML Session,
use the following Python code:
  import ray
  ray.init(address='ray://10.254.12.125:10001')



In [6]:
  import ray
  ray.init(address='ray://10.254.12.125:10001')

2025-04-04 08:00:12,943	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


0,1
Python version:,3.10.13
Ray version:,2.44.1
Dashboard:,http://127.0.0.1:8090


In [5]:
import os
from typing import Dict

import torch
from filelock import FileLock
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import Normalize, ToTensor
from tqdm import tqdm

import ray.train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

2025-04-04 07:59:58,173	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-04-04 07:59:59,095	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [7]:
def get_dataloaders(batch_size):
    # Transform to normalize the input images
    transform = transforms.Compose([ToTensor(), Normalize((0.28604,), (0.32025,))])

    with FileLock(os.path.expanduser("~/data.lock")):
        # Download training data from open datasets
        training_data = datasets.FashionMNIST(
            root="~/data",
            train=True,
            download=True,
            transform=transform,
        )

        # Download test data from open datasets
        test_data = datasets.FashionMNIST(
            root="~/data",
            train=False,
            download=True,
            transform=transform,
        )

    # Create data loaders
    train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    return train_dataloader, test_dataloader


# Model Definition
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_func_per_worker(config: Dict):
    lr = config["lr"]
    epochs = config["epochs"]
    batch_size = config["batch_size_per_worker"]

    # Get dataloaders inside the worker training function
    train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size)

    # [1] Prepare Dataloader for distributed training
    # Shard the datasets among workers and move batches to the correct device
    # =======================================================================
    train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()

    # [2] Prepare and wrap your model with DistributedDataParallel
    # Move the model to the correct GPU/CPU device
    # ============================================================
    model = ray.train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # Model training loop
    for epoch in range(epochs):
        if ray.train.get_context().get_world_size() > 1:
            # Required for the distributed sampler to shuffle properly across epochs.
            train_dataloader.sampler.set_epoch(epoch)

        model.train()
        for X, y in tqdm(train_dataloader, desc=f"Train Epoch {epoch}"):
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        test_loss, num_correct, num_total = 0, 0, 0
        with torch.no_grad():
            for X, y in tqdm(test_dataloader, desc=f"Test Epoch {epoch}"):
                pred = model(X)
                loss = loss_fn(pred, y)

                test_loss += loss.item()
                num_total += y.shape[0]
                num_correct += (pred.argmax(1) == y).sum().item()

        test_loss /= len(test_dataloader)
        accuracy = num_correct / num_total

        # [3] Report metrics to Ray Train
        # ===============================
        ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy})


def train_fashion_mnist(num_workers=2, use_gpu=False):
    global_batch_size = 32

    train_config = {
        "lr": 1e-3,
        "epochs": 10,
        "batch_size_per_worker": global_batch_size // num_workers,
    }

    # Configure computation resources
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        scaling_config=scaling_config,
    )

    # [4] Start distributed training
    # Run `train_func_per_worker` on all workers
    # =============================================
    result = trainer.fit()
    print(f"Training result: {result}")


In [9]:
# Connect to the existing Ray cluster
train_fashion_mnist(num_workers=3, use_gpu=True)

[36m(TunerInternal pid=5392)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m View detailed results here: /home/cdsw/ray_results/TorchTrainer_2025-04-04_09-08-58
[36m(TunerInternal pid=5392)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-04-04_07-58-21_269898_174/artifacts/2025-04-04_09-10-40/TorchTrainer_2025-04-04_09-08-58/driver_artifacts`


[36m(TrainTrainable pid=524, ip=10.254.5.92)[0m Trainable.setup took 10.083 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training started with configuration:
[36m(TunerInternal pid=5392)[0m ╭─────────────────────────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training config                                 │
[36m(TunerInternal pid=5392)[0m ├─────────────────────────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ train_loop_config/batch_size_per_worker      10 │
[36m(TunerInternal pid=5392)[0m │ train_loop_config/epochs                     10 │
[36m(TunerInternal pid=5392)[0m │ train_loop_config/lr                      0.001 │
[36m(TunerInternal pid=5392)[0m ╰─────────────────────────────────────────────────╯


[36m(RayTrainWorker pid=609, ip=10.254.5.92)[0m Setting up process group for: env:// [rank=0, world_size=3]
[36m(TorchTrainer pid=524, ip=10.254.5.92)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=524, ip=10.254.5.92)[0m - (node_id=2016aa8d81bf705b555d3e38fcd3418265fc395aa3cac9605675967c, ip=10.254.5.92, pid=609) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=524, ip=10.254.5.92)[0m - (node_id=8792d6a98db40c93ce9d013e5ff47bf7f8b36560afae053a2297343b, ip=10.254.7.86, pid=484) world_rank=1, local_rank=0, node_rank=1
[36m(TorchTrainer pid=524, ip=10.254.5.92)[0m - (node_id=517abdb83fdf3f9c595e10c94caef6af39bb85d020eac5ef97fcf5b4, ip=10.254.6.97, pid=559) world_rank=2, local_rank=0, node_rank=2
[36m(RayTrainWorker pid=609, ip=10.254.5.92)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=609, ip=10.254.5.92)[0m Wrapping provided model in DistributedDataParallel.
[36m(RayTrainWorker pid=559, ip=10.254.6.97)[0m Moving model to device: 

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 1 at 2025-04-04 09:15:00. Total running time: 4min 17s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      136.538 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          136.538 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          1 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.81314 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.49233 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 1:   2%|▏         | 34/2000 [00:00<00:12, 157.46it/s]
Train Epoch 1:   1%|          | 14/2000 [00:00<00:14, 139.47it/s]
Train Epoch 1:   2%|▎         | 50/2000 [00:00<00:13, 146.88it/s]
Train Epoch 1:   2%|▎         | 50/2000 [00:00<00:12, 157.47it/s]
Train Epoch 1:   2%|▏         | 30/2000 [00:00<00:13, 150.00it/s]
Train Epoch 1:   3%|▎         | 67/2000 [00:00<00:12, 153.01it/s]
Train Epoch 1:   3%|▎         | 66/2000 [00:00<00:12, 157.99it/s]
Train Epoch 1:   2%|▏         | 46/2000 [00:00<00:12, 153.49it/s]
Train Epoch 1:   4%|▍         | 82/2000 [00:00<00:12, 154.38it/s]
Train Epoch 1:   3%|▎         | 62/2000 [00:00<00:13, 148.76it/s]
Train Epoch 1:   4%|▍         | 83/2000 [00:00<00:12, 150.80it/s]
Train Epoch 1:   5%|▍         | 98/2000 [00:00<00:12, 151.84it/s]
Train Epoch 1:   4%|▍         | 78/2000 [00:00<00:12, 150.89it/s]
Train Epoch 1:   5%|▍         | 99/2000 [00:00<00:12, 150.40it/s]
Train Epoch 1:   5%|▍         | 94/2000 [00:00<00:12, 152.09it/s]
Train Epoc

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 2 at 2025-04-04 09:15:14. Total running time: 4min 32s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      14.7048 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          151.242 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          2 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.83953 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.43498 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Test Epoch 1: 100%|██████████| 334/334 [00:00<00:00, 336.16it/s]
Train Epoch 2:   0%|          | 0/2000 [00:00<?, ?it/s]
Train Epoch 2:   1%|          | 17/2000 [00:00<00:12, 160.77it/s]
Train Epoch 2:   1%|          | 21/2000 [00:00<00:09, 209.38it/s]
Train Epoch 2:   1%|          | 13/2000 [00:00<00:15, 128.69it/s]
Train Epoch 2:   2%|▏         | 34/2000 [00:00<00:11, 165.93it/s]
Train Epoch 2:   2%|▏         | 44/2000 [00:00<00:08, 218.61it/s]
Train Epoch 2:   1%|▏         | 27/2000 [00:00<00:14, 133.87it/s]
Train Epoch 2:   3%|▎         | 51/2000 [00:00<00:11, 166.88it/s]
Train Epoch 2:   3%|▎         | 66/2000 [00:00<00:10, 176.77it/s]
Train Epoch 2:   2%|▏         | 42/2000 [00:00<00:14, 135.97it/s]
Train Epoch 2:   3%|▎         | 68/2000 [00:00<00:11, 167.22it/s]
Train Epoch 2:   3%|▎         | 56/2000 [00:00<00:14, 132.96it/s]
Train Epoch 2:   4%|▍         | 85/2000 [00:00<00:11, 167.19it/s]
Train Epoch 2:   4%|▍         | 85/2000 [00:00<00:12, 156.64it/s]
Train Epoch 2:   4%|▎

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 3 at 2025-04-04 09:15:29. Total running time: 4min 46s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      14.4571 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          165.699 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          3 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.85303 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.41076 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 3:   2%|▏         | 32/2000 [00:00<00:12, 158.46it/s]
Train Epoch 3:   1%|          | 16/2000 [00:00<00:12, 157.06it/s]
Train Epoch 3:   2%|▏         | 35/2000 [00:00<00:11, 171.40it/s]
Train Epoch 3:   2%|▏         | 49/2000 [00:00<00:12, 159.89it/s]
Train Epoch 3:   2%|▏         | 34/2000 [00:00<00:11, 165.43it/s]
Train Epoch 3:   3%|▎         | 53/2000 [00:00<00:11, 175.18it/s]
Train Epoch 3:   3%|▎         | 52/2000 [00:00<00:11, 167.71it/s]
Train Epoch 3:   4%|▎         | 71/2000 [00:00<00:10, 175.45it/s]
Train Epoch 3:   3%|▎         | 66/2000 [00:00<00:12, 160.12it/s]
Train Epoch 3:   4%|▎         | 70/2000 [00:00<00:11, 169.34it/s]
Train Epoch 3:   4%|▍         | 89/2000 [00:00<00:10, 175.73it/s]
Train Epoch 3:   4%|▍         | 83/2000 [00:00<00:12, 159.26it/s]
Train Epoch 3:   4%|▍         | 88/2000 [00:00<00:11, 169.84it/s]
Train Epoch 3:   5%|▌         | 100/2000 [00:00<00:11, 160.36it/s]
Train Epoch 3:   5%|▌         | 105/2000 [00:00<00:12, 156.54it/s]
Train Ep

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 4 at 2025-04-04 09:15:42. Total running time: 5min 0s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      13.4537 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          179.153 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          4 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.85873 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.39019 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 4:   2%|▏         | 34/2000 [00:00<00:14, 133.36it/s]
Train Epoch 4:   1%|          | 15/2000 [00:00<00:13, 145.26it/s]
Train Epoch 4:   2%|▏         | 34/2000 [00:00<00:14, 138.15it/s]
Train Epoch 4:   2%|▎         | 50/2000 [00:00<00:13, 143.52it/s]
Train Epoch 4:   2%|▏         | 31/2000 [00:00<00:12, 153.36it/s]
Train Epoch 4:   2%|▎         | 50/2000 [00:00<00:13, 146.20it/s]
Train Epoch 4:   3%|▎         | 66/2000 [00:00<00:13, 148.33it/s]
Train Epoch 4:   2%|▏         | 47/2000 [00:00<00:12, 154.24it/s]
Train Epoch 4:   3%|▎         | 66/2000 [00:00<00:12, 150.02it/s]
Train Epoch 4:   4%|▍         | 82/2000 [00:00<00:12, 150.01it/s]
Train Epoch 4:   3%|▎         | 64/2000 [00:00<00:12, 158.81it/s]
Train Epoch 4:   4%|▍         | 82/2000 [00:00<00:12, 151.32it/s]
Train Epoch 4:   5%|▍         | 99/2000 [00:00<00:12, 156.66it/s]
Train Epoch 4:   5%|▍         | 99/2000 [00:00<00:12, 154.59it/s]
Train Epoch 4:   4%|▍         | 82/2000 [00:00<00:11, 165.49it/s]
Train Epoc

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 5 at 2025-04-04 09:15:56. Total running time: 5min 13s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      13.5312 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          192.684 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          5 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.86623 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.37706 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 5:   1%|          | 16/2000 [00:00<00:12, 158.77it/s]
Train Epoch 5:   1%|          | 16/2000 [00:00<00:12, 159.31it/s]
Train Epoch 5:   1%|          | 15/2000 [00:00<00:14, 141.33it/s]
Train Epoch 5:   2%|▏         | 34/2000 [00:00<00:11, 169.22it/s]
Train Epoch 5:   2%|▏         | 34/2000 [00:00<00:11, 166.19it/s]
Train Epoch 5:   2%|▏         | 31/2000 [00:00<00:13, 149.20it/s]
Train Epoch 5:   3%|▎         | 52/2000 [00:00<00:11, 173.14it/s]
Train Epoch 5:   3%|▎         | 52/2000 [00:00<00:11, 168.10it/s]
Train Epoch 5:   2%|▏         | 47/2000 [00:00<00:12, 151.40it/s]
Train Epoch 5:   4%|▎         | 70/2000 [00:00<00:11, 174.10it/s]
Train Epoch 5:   4%|▎         | 70/2000 [00:00<00:11, 168.88it/s]
Train Epoch 5:   3%|▎         | 63/2000 [00:00<00:12, 151.89it/s]
Train Epoch 5:   4%|▍         | 88/2000 [00:00<00:10, 175.36it/s]
Train Epoch 5:   4%|▍         | 87/2000 [00:00<00:11, 168.97it/s]
Train Epoch 5:   4%|▍         | 79/2000 [00:00<00:12, 152.17it/s]
Train Epoc

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 6 at 2025-04-04 09:16:09. Total running time: 5min 26s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      12.9628 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          205.647 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          6 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.86083 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.37167 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 6:   2%|▏         | 31/2000 [00:00<00:12, 153.30it/s]
Train Epoch 6:   2%|▏         | 34/2000 [00:00<00:11, 164.94it/s]
Train Epoch 6:   2%|▏         | 35/2000 [00:00<00:11, 170.04it/s]
Train Epoch 6:   2%|▏         | 48/2000 [00:00<00:12, 157.28it/s]
Train Epoch 6:   3%|▎         | 52/2000 [00:00<00:11, 167.95it/s]
Train Epoch 6:   3%|▎         | 53/2000 [00:00<00:11, 172.99it/s]
Train Epoch 6:   3%|▎         | 64/2000 [00:00<00:12, 155.51it/s]
Train Epoch 6:   3%|▎         | 69/2000 [00:00<00:11, 167.58it/s]
Train Epoch 6:   4%|▎         | 71/2000 [00:00<00:10, 175.70it/s]
Train Epoch 6:   4%|▍         | 81/2000 [00:00<00:12, 158.14it/s]
Train Epoch 6:   4%|▍         | 86/2000 [00:00<00:11, 168.23it/s]
Train Epoch 6:   4%|▍         | 89/2000 [00:00<00:10, 175.84it/s]
Train Epoch 6:   5%|▍         | 98/2000 [00:00<00:11, 159.59it/s]
Train Epoch 6:   6%|▌         | 114/2000 [00:00<00:11, 159.30it/s]
Train Epoch 6:   5%|▌         | 103/2000 [00:00<00:12, 146.09it/s]
Train Ep

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 7 at 2025-04-04 09:16:22. Total running time: 5min 39s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      13.0597 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          218.707 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          7 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.87582 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.35735 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 7:   1%|          | 17/2000 [00:00<00:11, 165.28it/s]
Train Epoch 7:   1%|          | 16/2000 [00:00<00:12, 156.12it/s]
Train Epoch 7:   1%|          | 14/2000 [00:00<00:14, 136.36it/s]
Train Epoch 7:   2%|▏         | 33/2000 [00:00<00:12, 163.23it/s]
Train Epoch 7:   2%|▏         | 34/2000 [00:00<00:11, 166.98it/s]
Train Epoch 7:   1%|▏         | 28/2000 [00:00<00:14, 135.12it/s]
Train Epoch 7:   2%|▎         | 50/2000 [00:00<00:11, 163.76it/s]
Train Epoch 7:   3%|▎         | 52/2000 [00:00<00:11, 169.57it/s]
Train Epoch 7:   2%|▏         | 48/2000 [00:00<00:12, 161.14it/s]
Train Epoch 7:   3%|▎         | 67/2000 [00:00<00:11, 162.66it/s]
Train Epoch 7:   4%|▎         | 70/2000 [00:00<00:11, 171.51it/s]
Train Epoch 7:   3%|▎         | 67/2000 [00:00<00:11, 171.01it/s]
Train Epoch 7:   4%|▍         | 86/2000 [00:00<00:11, 169.87it/s]
Train Epoch 7:   4%|▍         | 88/2000 [00:00<00:11, 172.40it/s]
Train Epoch 7:   4%|▍         | 86/2000 [00:00<00:10, 175.08it/s]
Train Epoc

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 8 at 2025-04-04 09:16:35. Total running time: 5min 52s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      13.0898 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          231.796 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          8 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.87313 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.35531 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 8:   2%|▏         | 34/2000 [00:00<00:13, 146.62it/s]
Train Epoch 8:   2%|▏         | 34/2000 [00:00<00:14, 140.22it/s]
Train Epoch 8:   1%|          | 15/2000 [00:00<00:13, 145.44it/s]
Train Epoch 8:   2%|▎         | 50/2000 [00:00<00:12, 151.56it/s]
Train Epoch 8:   2%|▎         | 50/2000 [00:00<00:13, 147.64it/s]
Train Epoch 8:   2%|▏         | 31/2000 [00:00<00:12, 152.53it/s]
Train Epoch 8:   3%|▎         | 66/2000 [00:00<00:12, 153.35it/s]
Train Epoch 8:   3%|▎         | 66/2000 [00:00<00:12, 151.05it/s]
Train Epoch 8:   2%|▏         | 47/2000 [00:00<00:12, 154.11it/s]
Train Epoch 8:   4%|▍         | 82/2000 [00:00<00:12, 154.69it/s]
Train Epoch 8:   4%|▍         | 82/2000 [00:00<00:12, 153.32it/s]
Train Epoch 8:   3%|▎         | 63/2000 [00:00<00:12, 154.86it/s]
Train Epoch 8:   5%|▍         | 98/2000 [00:00<00:12, 155.63it/s]
Train Epoch 8:   5%|▍         | 98/2000 [00:00<00:12, 152.30it/s]
Train Epoch 8:   4%|▍         | 79/2000 [00:00<00:12, 156.24it/s]
Train Epoc

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 9 at 2025-04-04 09:16:48. Total running time: 6min 6s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      13.2409 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          245.037 │
[36m(TunerInternal pid=5392)[0m │ training_iteration          9 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.87343 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.35364 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯


Train Epoch 9:   2%|▏         | 33/2000 [00:00<00:12, 157.92it/s]
Train Epoch 9:   2%|▏         | 35/2000 [00:00<00:11, 170.10it/s]
Train Epoch 9:   2%|▏         | 34/2000 [00:00<00:11, 166.07it/s]
Train Epoch 9:   3%|▎         | 53/2000 [00:00<00:11, 173.20it/s]
Train Epoch 9:   3%|▎         | 52/2000 [00:00<00:11, 168.19it/s]
Train Epoch 9:   2%|▎         | 50/2000 [00:00<00:12, 160.11it/s]
Train Epoch 9:   4%|▎         | 71/2000 [00:00<00:11, 175.25it/s]
Train Epoch 9:   3%|▎         | 69/2000 [00:00<00:11, 168.77it/s]
Train Epoch 9:   3%|▎         | 67/2000 [00:00<00:11, 161.55it/s]
Train Epoch 9:   4%|▍         | 84/2000 [00:00<00:11, 163.29it/s]
Train Epoch 9:   4%|▍         | 89/2000 [00:00<00:10, 176.34it/s]
Train Epoch 9:   4%|▍         | 87/2000 [00:00<00:11, 169.82it/s]
Train Epoch 9:   5%|▌         | 104/2000 [00:00<00:12, 148.01it/s]
Train Epoch 9:   5%|▌         | 101/2000 [00:00<00:11, 162.59it/s]
Train Epoch 9:   5%|▌         | 107/2000 [00:00<00:14, 133.88it/s]
Train E

[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training finished iteration 10 at 2025-04-04 09:17:01. Total running time: 6min 19s
[36m(TunerInternal pid=5392)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=5392)[0m │ Training result               │
[36m(TunerInternal pid=5392)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=5392)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=5392)[0m │ time_this_iter_s      13.0292 │
[36m(TunerInternal pid=5392)[0m │ time_total_s          258.067 │
[36m(TunerInternal pid=5392)[0m │ training_iteration         10 │
[36m(TunerInternal pid=5392)[0m │ accuracy              0.87283 │
[36m(TunerInternal pid=5392)[0m │ loss                  0.34522 │
[36m(TunerInternal pid=5392)[0m ╰───────────────────────────────╯
[36m(TunerInternal pid=5392)[0m 
[36m(TunerInternal pid=5392)[0m Training completed after 10 iterations at 2025-04-04 09:17:03. Total running time: 6min 20s


[36m(TunerInternal pid=5392)[0m Wrote the latest version of all result files and experiment state to '/home/cdsw/ray_results/TorchTrainer_2025-04-04_09-08-58' in 1.0291s.


[36m(TunerInternal pid=5392)[0m 
Training result: Result(
  metrics={'loss': 0.3452150629258932, 'accuracy': 0.8728254349130175},
  path='/home/cdsw/ray_results/TorchTrainer_2025-04-04_09-08-58/TorchTrainer_ae4a4_00000_0_2025-04-04_09-10-42',
  filesystem='local',
  checkpoint=None
)


In [3]:
cluster.terminate()