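Install the required libraries — Ray (with the `default` and `train` extras), PyTorch, torchvision, tqdm, psutil, and the NVML Python bindings. The imports themselves (`ray`, `torch`, `torch.nn`, `torchvision`, `torch.utils.data`, and the Ray Train-specific modules) happen in later cells.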

In [5]:
# Use %pip (not !pip) so the packages land in the environment of the running kernel.
# The extras are quoted so the shell cannot glob-expand the square brackets.
# tensorboard is listed because torch.utils.tensorboard (imported below) requires it.
%pip install -q "ray[default,train]" torch torchvision tqdm psutil nvidia-ml-py3 tensorboard

In [None]:
# Upgrade ipywidgets in the kernel's environment.
# NOTE(review): presumably needed for widget/progress-bar rendering in the notebook UI — confirm it is still required.
%pip install -U -q ipywidgets

In [43]:
from cmlextensions.ray_cluster import RayCluster

# Start a Ray cluster through CMLExtensions: one head node plus three workers.
# Every node requests 4 CPUs, 8 units of memory and 1 NVIDIA GPU.
# NOTE(review): the memory unit is presumably GB — confirm against the CMLExtensions docs.
cluster = RayCluster(
    num_workers=3,
    worker_cpu=4,
    worker_memory=8,
    worker_nvidia_gpu=1,
    head_cpu=4,
    head_memory=8,
    head_nvidia_gpu=1,
)
cluster.init()

Starting ray head...
Starting 3 ray workers...

--------------------
Ray cluster started
--------------------

The Ray dashboard is running at
https://734nix0zkpcgc133.goes-ocp-cml.apps.field-team-ocp-01.kcloud.cloudera.com/

To connect to this Ray cluster from this CML Session,
use the following Python code:
  import ray
  ray.init(address='ray://10.254.5.113:10001')



In [48]:
# Connect this session to the Ray cluster started above and persist the
# connection details so other sessions/processes can find the cluster.
# NOTE: get_client_url()/get_dashboard_url() may not exist in every
# CMLExtensions version.
import json

import ray

ray_address = cluster.get_client_url()
dashboard_url = cluster.get_dashboard_url()

# Connect as the *default* Ray client. Passing allow_multiple=True would create
# a secondary connection that is not registered as the default one, so later
# calls such as TorchTrainer.fit() would not be routed to this cluster on a
# fresh kernel. The is_initialized() guard makes the cell safe to re-run.
if not ray.is_initialized():
    ray.init(address=ray_address)

# Save both addresses to a JSON file
connection_info = {
    "ray_address": ray_address,
    "dashboard_url": dashboard_url,
}

with open("ray_address.json", "w") as f:
    json.dump(connection_info, f)

print(f"✅ Connected to Ray cluster at {ray_address}")
print(f"📊 Dashboard URL: {dashboard_url}")
print("📄 Connection info saved to ray_address.json")


2025-04-07 12:44:47,065	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver


Dashboard URL : https://734nix0zkpcgc133.goes-ocp-cml.apps.field-team-ocp-01.kcloud.cloudera.com/
RAY ADDRESS : ray://10.254.5.113:10001
✅ Connected to Ray cluster at ray://10.254.5.113:10001
📊 Dashboard URL: https://734nix0zkpcgc133.goes-ocp-cml.apps.field-team-ocp-01.kcloud.cloudera.com/
📄 Connection info saved to ray_address.json


In [49]:
import os
from typing import Dict
import psutil
import torch
from filelock import FileLock
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import Normalize, ToTensor
from tqdm import tqdm
from ray import train, tune
from ray.tune import Tuner
from ray.train import ScalingConfig, get_context
from ray.train.torch import TorchTrainer
from ray.util import get_node_ip_address
from torch.utils.tensorboard import SummaryWriter
import hashlib
import pynvml

In [52]:
def get_dataloaders(batch_size):
    """Build the FashionMNIST train and test ``DataLoader`` objects.

    The datasets are downloaded to ``~/data`` (if not already present) while
    holding a file lock on ``~/data.lock``, so concurrent callers sharing the
    same home directory do not download at the same time.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple. Only the training
        loader shuffles its data.
    """
    # Convert images to tensors, then normalize them with fixed mean/std values.
    normalize_pipeline = transforms.Compose(
        [ToTensor(), Normalize((0.28604,), (0.32025,))]
    )

    lock_path = os.path.expanduser("~/data.lock")
    with FileLock(lock_path):
        # The training split is created first, then the test split.
        training_data, test_data = (
            datasets.FashionMNIST(
                root="~/data",
                train=is_train_split,
                download=True,
                transform=normalize_pipeline,
            )
            for is_train_split in (True, False)
        )

    shuffled_train_loader = DataLoader(
        training_data, batch_size=batch_size, shuffle=True
    )
    ordered_test_loader = DataLoader(test_data, batch_size=batch_size)
    return shuffled_train_loader, ordered_test_loader


# Model Definition
class NeuralNetwork(nn.Module):
    """Small fully connected classifier for 28x28 single-channel images.

    The input is flattened and passed through two hidden layers of 512 units
    (ReLU + 25% dropout each) followed by a linear layer producing 10 raw
    class scores.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            # No activation after the last layer: the loss used for training
            # (nn.CrossEntropyLoss) expects unnormalized logits, and a trailing
            # ReLU would clamp every negative score to zero. The layer indices
            # of the Linear modules (0, 3, 6) are unchanged, so existing
            # state_dicts still load.
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Return the raw (unnormalized) class logits for a batch ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def _open_nvml_handle(device):
    """Return an NVML handle for the GPU behind ``device``, or ``None``.

    ``None`` is returned when ``device`` is not a CUDA device, when NVML cannot
    be initialised, or when the physical GPU index cannot be determined. In
    those cases GPU utilization is simply not recorded.
    """
    if device.type != "cuda":
        return None

    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError as exc:
        print(f"[GPU LOG] NVML unavailable ({exc}); GPU utilization will not be recorded")
        return None

    try:
        visible = [
            token.strip()
            for token in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
            if token.strip()
        ]
        # CUDA device ordinals index into CUDA_VISIBLE_DEVICES, whereas NVML
        # is addressed by the physical index listed there. Use the entry that
        # belongs to this worker's device rather than always the first one.
        ordinal = device.index or 0
        physical_index = int(visible[ordinal]) if ordinal < len(visible) else ordinal
        return pynvml.nvmlDeviceGetHandleByIndex(physical_index)
    except (pynvml.NVMLError, ValueError) as exc:
        # ValueError: CUDA_VISIBLE_DEVICES holds non-numeric entries (e.g. UUIDs).
        pynvml.nvmlShutdown()
        print(f"[GPU LOG] Could not resolve NVML device ({exc}); GPU utilization will not be recorded")
        return None


def train_func_per_worker(config: Dict):
    """Training loop executed by every Ray Train worker.

    Trains ``NeuralNetwork`` on FashionMNIST, evaluates after each epoch,
    writes metrics to a per-rank TensorBoard directory and reports them to
    Ray Train.

    Args:
        config: Training configuration with the keys
            ``"lr"`` (learning rate), ``"epochs"`` (number of epochs) and
            ``"batch_size_per_worker"``. The optional key
            ``"gpu_log_every_n_batches"`` (default 100) controls how often the
            per-batch GPU statistics are printed; they are written to
            TensorBoard on every batch regardless.
    """
    # Worker identity, used in log lines and reported metrics.
    context = ray.train.get_context()
    rank = context.get_world_rank()
    world_size = context.get_world_size()
    node_ip = get_node_ip_address()
    node_id = ray.get_runtime_context().node_id.hex()
    print(f"[Worker Init] Running on Node ID: {node_id}, IP: {node_ip} worker rank : {rank}")

    # Device assigned to this worker by Ray Train (CPU when no GPU is assigned).
    device = ray.train.torch.get_device()
    use_cuda = device.type == "cuda"
    nvml_handle = _open_nvml_handle(device)
    print(f"[DEBUG] Worker running on node {node_id} using device {device}")

    # One TensorBoard log directory per worker rank.
    ray_result_root = os.environ.get("RAY_RESULT_DIR", "/home/cdsw/ray_results")
    experiment_name = "tb_logs"
    writer_logdir = os.path.join(ray_result_root, experiment_name, f"worker_{rank}")
    os.makedirs(writer_logdir, exist_ok=True)
    writer = SummaryWriter(log_dir=writer_logdir)
    print(f"[TensorBoard] Logging to: {writer_logdir}")

    lr = config["lr"]
    epochs = config["epochs"]
    batch_size = config["batch_size_per_worker"]
    log_every = max(1, int(config.get("gpu_log_every_n_batches", 100)))

    def model_checksum(model):
        """MD5 over all parameter bytes; equal across ranks when weights are in sync."""
        digest = hashlib.md5()
        for p in model.parameters():
            digest.update(p.detach().cpu().numpy().tobytes())
        return digest.hexdigest()

    # Defaults keep the report below well-defined on CPU-only workers and when
    # NVML is unavailable.
    gpu_mem_allocated = 0.0
    gpu_utilization = None

    try:
        # Get dataloaders inside the worker training function
        train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size)

        # [1] Prepare Dataloader for distributed training
        # Shard the datasets among workers and move batches to the correct device
        # =======================================================================
        train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader)
        test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader)

        # [2] Prepare and wrap your model with DistributedDataParallel
        # Move the model to the correct GPU/CPU device
        # ============================================================
        model = ray.train.torch.prepare_model(NeuralNetwork())

        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

        steps_per_epoch = len(train_dataloader)

        for epoch in range(epochs):
            if world_size > 1:
                # Required for the distributed sampler to shuffle properly across epochs.
                train_dataloader.sampler.set_epoch(epoch)

            model.train()
            for batch_idx, (X, y) in enumerate(
                tqdm(train_dataloader, desc=f"[Rank {rank}] Train Epoch {epoch}")
            ):
                pred = model(X)
                loss = loss_fn(pred, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # --- Per-batch GPU logging (skipped entirely on CPU) ---
                if not use_cuda:
                    continue

                global_step = epoch * steps_per_epoch + batch_idx
                gpu_mem_allocated = torch.cuda.memory_allocated(device) / 1024**2  # MB
                gpu_mem_reserved = torch.cuda.memory_reserved(device) / 1024**2  # MB
                writer.add_scalar("Train/GPU/Memory Allocated (MB)", gpu_mem_allocated, global_step)
                writer.add_scalar("Train/GPU/Memory Reserved (MB)", gpu_mem_reserved, global_step)

                if nvml_handle is not None:
                    gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(nvml_handle).gpu  # %
                    writer.add_scalar("Train/GPU/Utilization (%)", gpu_utilization, global_step)

                # Printing every batch floods the notebook output; throttle it.
                if batch_idx % log_every == 0:
                    print(f"[GPU LOG] Epoch {epoch}, Batch {batch_idx} | Mem: {gpu_mem_allocated:.2f}MB, Util: {gpu_utilization}%  global_step : {global_step}")

            model.eval()
            test_loss, num_correct, num_total = 0, 0, 0
            with torch.no_grad():
                for X, y in tqdm(test_dataloader, desc=f"[Rank {rank}] Test Epoch {epoch}"):
                    pred = model(X)
                    loss = loss_fn(pred, y)

                    test_loss += loss.item()
                    num_total += y.shape[0]
                    num_correct += (pred.argmax(1) == y).sum().item()

            test_loss /= len(test_dataloader)
            accuracy = num_correct / num_total
            checksum = model_checksum(model)

            # Log metrics to TensorBoard
            writer.add_scalar("Loss/test", test_loss, epoch)
            writer.add_scalar("Accuracy/test", accuracy, epoch)
            writer.add_scalar("ModelChecksum", int(checksum[:8], 16), epoch)  # Log first 8 hex chars as int
            writer.add_text("Node Info", f"Epoch {epoch} - Rank: {rank}, Node: {node_id}", epoch)

            # Log sync info (just once per epoch per rank)
            print(f"[ NodeId {node_id} Rank {rank}] Epoch {epoch} | Loss: {test_loss:.4f}, Acc: {accuracy:.4f}, Model Checksum: {checksum}")

            # [3] Report metrics to Ray Train
            # ===============================
            ray.train.report(metrics={
                "loss": test_loss,
                "accuracy": accuracy,
                "epoch": epoch,
                "node_id": node_id,
                "rank": rank,
                "model_checksum": checksum,
                "gpu_utilization": gpu_utilization,
                "gpu_mem_alloc": gpu_mem_allocated,
            })
    finally:
        # Always flush TensorBoard events and release NVML, even if training fails.
        writer.close()
        if nvml_handle is not None:
            pynvml.nvmlShutdown()

def train_fashion_mnist(num_workers=2, use_gpu=False, global_batch_size=32, epochs=10, lr=1e-3):
    """Run distributed FashionMNIST training with Ray Train's ``TorchTrainer``.

    Args:
        num_workers: Number of Ray Train workers (must be >= 1).
        use_gpu: Whether each worker should be scheduled with a GPU.
        global_batch_size: Batch size across all workers. Each worker uses
            ``global_batch_size // num_workers`` samples per batch.
        epochs: Number of training epochs.
        lr: Learning rate passed to the per-worker training loop.

    Returns:
        The result object returned by ``TorchTrainer.fit()``.

    Raises:
        ValueError: If ``num_workers`` is less than 1, or if the resulting
            per-worker batch size would be smaller than 1.
    """
    # Validate up front: a zero worker count would otherwise surface as a
    # ZeroDivisionError, and too many workers as a zero batch size.
    if num_workers < 1:
        raise ValueError(f"num_workers must be >= 1, got {num_workers}")

    batch_size_per_worker = global_batch_size // num_workers
    if batch_size_per_worker < 1:
        raise ValueError(
            f"global_batch_size={global_batch_size} is too small for "
            f"num_workers={num_workers}: per-worker batch size would be {batch_size_per_worker}"
        )

    train_config = {
        "lr": lr,
        "epochs": epochs,
        "batch_size_per_worker": batch_size_per_worker,
    }

    # Configure computation resources
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        scaling_config=scaling_config,
    )

    # [4] Start distributed training
    # Run `train_func_per_worker` on all workers
    # =============================================
    result = trainer.fit()
    print(f"Training result: {result}")
    return result
 

In [53]:
# Launch distributed training with 3 GPU workers via Ray Train's TorchTrainer
train_fashion_mnist(num_workers=3, use_gpu=True)

[36m(TunerInternal pid=767)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[36m(TunerInternal pid=767)[0m 
[36m(TunerInternal pid=767)[0m View detailed results here: /home/cdsw/ray_results/TorchTrainer_2025-04-07_12-47-50
[36m(TunerInternal pid=767)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-04-07_12-40-53_203395_177/artifacts/2025-04-07_12-48-49/TorchTrainer_2025-04-07_12-47-50/driver_artifacts`
[36m(TunerInternal pid=767)[0m 
[36m(TunerInternal pid=767)[0m Training started with configuration:
[36m(TunerInternal pid=767)[0m ╭─────────────────────────────────────────────────╮
[36m(TunerInternal pid=767)[0m │ Training config                                 │
[36m(TunerInternal pid=767)[0m ├─────────────────────────────────────────────────┤
[36m(TunerInternal pid=767)[0m │ train_loop_config/batch_size_per_worker      10 │
[36m(TunerInternal pid=767)[0m │ train_loop_config/epochs                     10 │
[36m(TunerInternal pid=767)[0m │ train_loop_config/lr                      0.001 │
[

[36m(TrainTrainable pid=520, ip=10.254.7.94)[0m Trainable.setup took 12.573 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m Setting up process group for: env:// [rank=0, world_size=3]
[36m(TorchTrainer pid=520, ip=10.254.7.94)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=520, ip=10.254.7.94)[0m - (node_id=f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791, ip=10.254.7.94, pid=604) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=520, ip=10.254.7.94)[0m - (node_id=d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83243d, ip=10.254.12.140, pid=398) world_rank=1, local_rank=0, node_rank=1
[36m(TorchTrainer pid=520, ip=10.254.7.94)[0m - (node_id=8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191, ip=10.254.6.117, pid=401) world_rank=2, local_rank=0, node_rank=2
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m Use 

[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Worker Init] Running on Node ID: 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191, IP: 10.254.6.117 worker rank : 2
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [DEBUG] Worker running on node 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 using device cuda (GPU ID: 0)
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Worker Init] Running on Node ID: f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791, IP: 10.254.7.94 worker rank : 0
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [DEBUG] Worker running on node f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791 using device cuda (GPU ID: 0)
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [Worker Init] Running on Node ID: d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83243d, IP: 10.254.12.140 worker rank : 1
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [DEBUG] Worker running on node d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83

[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m Wrapping provided model in DistributedDataParallel.
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m Wrapping provided model in DistributedDataParallel.
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m Wrapping provided model in DistributedDataParallel.
[Rank 1] Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
[Rank 2] Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
[Rank 0] Train Epoch 0:   0%|          | 0/2000 [00:00<?, ?it/s]
[Rank 0] Train Epoch 0:   0%|          | 1/2000 [00:04<2:15:27,  4.07s/it]
[Rank 1] Train Epoch 0:   0%|          | 1/2000 [00:04<2:18:37,  4.16s/it]
[Rank 2] Train Epoch 0:   0%|          | 1/2000 [00:04<2:18:24,  4.15s/it]
[Rank 0] Train Epoch 0:  

[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 0 | Mem: 26.53MB, Util: 100%  global_step : 0
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 0 | Mem: 26.53MB, Util: 22%  global_step : 0
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1 | Mem: 26.53MB, Util: 22%  global_step : 1
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 2 | Mem: 26.53MB, Util: 22%  global_step : 2
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 0 | Mem: 26.53MB, Util: 0%  global_step : 0
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 1 | Mem: 26.53MB, Util: 0%  global_step : 1
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 2 | Mem: 26.53MB, Util: 0%  global_step : 2
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 1 | Mem: 26.53MB, Util: 100%  global_step : 1
[36m(RayTrainWorker pid=604, ip

[Rank 0] Train Epoch 0:   0%|          | 4/2000 [00:04<23:14,  1.43it/s]
[Rank 2] Train Epoch 0:   0%|          | 4/2000 [00:04<27:43,  1.20it/s]  
[Rank 1] Train Epoch 0:   0%|          | 4/2000 [00:04<28:38,  1.16it/s]  


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 3 | Mem: 26.53MB, Util: 100%  global_step : 3
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 3 | Mem: 26.53MB, Util: 0%  global_step : 3
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 3 | Mem: 26.53MB, Util: 22%  global_step : 3


[Rank 1] Train Epoch 0:   0%|          | 5/2000 [00:04<21:39,  1.53it/s]
[Rank 2] Train Epoch 0:   0%|          | 6/2000 [00:04<17:19,  1.92it/s]
[Rank 0] Train Epoch 0:   0%|          | 5/2000 [00:04<18:52,  1.76it/s]
[Rank 2] Train Epoch 0:   0%|          | 8/2000 [00:04<11:33,  2.87it/s]
[Rank 0] Train Epoch 0:   0%|          | 7/2000 [00:04<11:08,  2.98it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 4 | Mem: 26.53MB, Util: 14%  global_step : 4
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 5 | Mem: 26.53MB, Util: 14%  global_step : 5
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 4 | Mem: 26.53MB, Util: 4%  global_step : 4
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 5 | Mem: 26.53MB, Util: 14%  global_step : 5
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 4 | Mem: 26.53MB, Util: 96%  global_step : 4
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 6 | Mem: 26.53MB, Util: 3%  global_step : 6
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 6 | Mem: 26.53MB, Util: 47%  global_step : 6
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 7 | Mem: 26.53MB, Util: 47%  global_step : 7
[36m(RayTrainWorker pid=604, ip

[Rank 1] Train Epoch 0:   0%|          | 8/2000 [00:04<10:53,  3.05it/s]
[Rank 0] Train Epoch 0:   0%|          | 8/2000 [00:04<09:13,  3.60it/s]
[Rank 1] Train Epoch 0:   0%|          | 10/2000 [00:04<08:09,  4.06it/s]
[Rank 2] Train Epoch 0:   0%|          | 10/2000 [00:04<08:29,  3.90it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 7 | Mem: 26.53MB, Util: 3%  global_step : 7
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 8 | Mem: 26.53MB, Util: 66%  global_step : 8
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 7 | Mem: 26.53MB, Util: 37%  global_step : 7
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 9 | Mem: 26.53MB, Util: 66%  global_step : 9
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 10 | Mem: 26.53MB, Util: 66%  global_step : 10
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 8 | Mem: 26.53MB, Util: 98%  global_step : 8
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 9 | Mem: 26.53MB, Util: 98%  global_step : 9
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 10 | Mem: 26.53MB, Util: 98%  global_step : 10
[36m(RayTrainWorker pid=4

[Rank 1] Train Epoch 0:   1%|          | 13/2000 [00:05<05:13,  6.33it/s]
[Rank 2] Train Epoch 0:   1%|          | 16/2000 [00:04<03:44,  8.85it/s]
[Rank 0] Train Epoch 0:   1%|          | 13/2000 [00:04<03:50,  8.61it/s]
[Rank 2] Train Epoch 0:   1%|          | 21/2000 [00:05<02:28, 13.36it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 11 | Mem: 26.53MB, Util: 66%  global_step : 11
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 12 | Mem: 26.53MB, Util: 66%  global_step : 12
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 13 | Mem: 26.53MB, Util: 98%  global_step : 13
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 14 | Mem: 26.53MB, Util: 98%  global_step : 14
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 15 | Mem: 26.53MB, Util: 97%  global_step : 15
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 16 | Mem: 26.53MB, Util: 97%  global_step : 16
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 17 | Mem: 26.53MB, Util: 97%  global_step : 17
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 12 | Mem: 26.53MB, Util: 37%  global_step : 12
[36m(RayTrainW

[Rank 2] Train Epoch 0:   1%|▏         | 25/2000 [00:05<01:57, 16.76it/s]
[Rank 0] Train Epoch 0:   1%|          | 15/2000 [00:05<03:47,  8.74it/s]
[Rank 1] Train Epoch 0:   1%|          | 15/2000 [00:05<05:06,  6.48it/s]
[Rank 2] Train Epoch 0:   2%|▏         | 30/2000 [00:05<01:29, 21.94it/s]
[Rank 0] Train Epoch 0:   1%|          | 18/2000 [00:05<02:48, 11.73it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 21 | Mem: 26.53MB, Util: 97%  global_step : 21
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 22 | Mem: 26.53MB, Util: 100%  global_step : 22
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 23 | Mem: 26.53MB, Util: 100%  global_step : 23
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 24 | Mem: 26.53MB, Util: 100%  global_step : 24
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 25 | Mem: 26.53MB, Util: 100%  global_step : 25
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 26 | Mem: 26.53MB, Util: 100%  global_step : 26
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 14 | Mem: 26.53MB, Util: 15%  global_step : 14
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 15 | Mem: 26.53MB, Util: 29%  global_step : 15
[36m(RayTrai

[Rank 0] Train Epoch 0:   1%|          | 20/2000 [00:05<02:46, 11.87it/s]
[Rank 1] Train Epoch 0:   1%|          | 18/2000 [00:05<03:46,  8.76it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 20 | Mem: 26.53MB, Util: 100%  global_step : 20
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 18 | Mem: 26.53MB, Util: 3%  global_step : 18
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 19 | Mem: 26.53MB, Util: 0%  global_step : 19
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 32 | Mem: 26.53MB, Util: 100%  global_step : 32
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 33 | Mem: 26.53MB, Util: 100%  global_step : 33


[Rank 1] Train Epoch 0:   1%|          | 20/2000 [00:09<20:50,  1.58it/s]
[Rank 2] Train Epoch 0:   2%|▏         | 34/2000 [00:09<11:22,  2.88it/s]
[Rank 0] Train Epoch 0:   1%|          | 22/2000 [00:09<20:14,  1.63it/s]
[Rank 1] Train Epoch 0:   1%|          | 23/2000 [00:09<13:44,  2.40it/s]
[Rank 0] Train Epoch 0:   1%|          | 24/2000 [00:09<15:23,  2.14it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 21 | Mem: 26.53MB, Util: 100%  global_step : 21
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 20 | Mem: 26.53MB, Util: 0%  global_step : 20
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 21 | Mem: 26.53MB, Util: 0%  global_step : 21
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 22 | Mem: 26.53MB, Util: 4%  global_step : 22
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 34 | Mem: 26.53MB, Util: 100%  global_step : 34
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 35 | Mem: 26.53MB, Util: 100%  global_step : 35
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 22 | Mem: 26.53MB, Util: 100%  global_step : 22
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 23 | Mem: 26.53MB, Util: 100%  global_step : 23
[36m(RayTrain

[Rank 2] Train Epoch 0:   2%|▏         | 37/2000 [00:09<08:59,  3.64it/s]
[Rank 1] Train Epoch 0:   1%|▏         | 25/2000 [00:09<10:44,  3.06it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 24 | Mem: 26.53MB, Util: 14%  global_step : 24
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 24 | Mem: 26.53MB, Util: 100%  global_step : 24
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 25 | Mem: 26.53MB, Util: 14%  global_step : 25
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 38 | Mem: 26.53MB, Util: 100%  global_step : 38
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 25 | Mem: 26.53MB, Util: 0%  global_step : 25
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 26 | Mem: 26.53MB, Util: 0%  global_step : 26
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 26 | Mem: 26.53MB, Util: 0%  global_step : 26
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 27 | Mem: 26.53MB, Util: 0%  global_step : 27
[36m(RayTrainWor

[Rank 0] Train Epoch 0:   1%|▏         | 26/2000 [00:10<16:01,  2.05it/s]
[Rank 1] Train Epoch 0:   1%|▏         | 27/2000 [00:10<12:22,  2.66it/s]
[Rank 2] Train Epoch 0:   2%|▏         | 40/2000 [00:10<09:45,  3.35it/s]
[Rank 0] Train Epoch 0:   1%|▏         | 28/2000 [00:10<11:57,  2.75it/s]
[Rank 1] Train Epoch 0:   1%|▏         | 29/2000 [00:10<09:33,  3.44it/s]
[Rank 2] Train Epoch 0:   2%|▏         | 42/2000 [00:10<08:16,  3.95it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 28 | Mem: 26.53MB, Util: 0%  global_step : 28
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 41 | Mem: 26.53MB, Util: 100%  global_step : 41
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 28 | Mem: 26.53MB, Util: 0%  global_step : 28
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 42 | Mem: 26.53MB, Util: 100%  global_step : 42
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 29 | Mem: 26.53MB, Util: 59%  global_step : 29


[Rank 1] Train Epoch 0:   2%|▏         | 31/2000 [00:13<18:37,  1.76it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 30/2000 [00:13<20:38,  1.59it/s]
[Rank 2] Train Epoch 0:   2%|▏         | 44/2000 [00:13<15:27,  2.11it/s]
[Rank 1] Train Epoch 0:   2%|▏         | 36/2000 [00:13<09:37,  3.40it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 31/2000 [00:13<17:45,  1.85it/s]
[Rank 2] Train Epoch 0:   2%|▏         | 48/2000 [00:13<09:53,  3.29it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 30 | Mem: 26.53MB, Util: 0%  global_step : 30
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 31 | Mem: 26.53MB, Util: 100%  global_step : 31
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 32 | Mem: 26.53MB, Util: 100%  global_step : 32
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 29 | Mem: 26.53MB, Util: 10%  global_step : 29
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 43 | Mem: 26.53MB, Util: 100%  global_step : 43
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 44 | Mem: 26.53MB, Util: 100%  global_step : 44
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 45 | Mem: 26.53MB, Util: 100%  global_step : 45
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 46 | Mem: 26.53MB, Util: 100%  global_step : 46
[36m(Ray

[Rank 1] Train Epoch 0:   2%|▏         | 38/2000 [00:13<07:50,  4.17it/s]
[Rank 2] Train Epoch 0:   2%|▎         | 50/2000 [00:13<08:12,  3.96it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 35/2000 [00:13<09:42,  3.37it/s]
[Rank 2] Train Epoch 0:   3%|▎         | 52/2000 [00:13<06:41,  4.85it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 37 | Mem: 26.53MB, Util: 100%  global_step : 37
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 38 | Mem: 26.53MB, Util: 100%  global_step : 38
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 32 | Mem: 26.53MB, Util: 4%  global_step : 32
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 33 | Mem: 26.53MB, Util: 4%  global_step : 33
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 49 | Mem: 26.53MB, Util: 100%  global_step : 49
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 39 | Mem: 26.53MB, Util: 100%  global_step : 39
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 34 | Mem: 26.53MB, Util: 4%  global_step : 34
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 35 | Mem: 26.53MB, Util: 4%  global_step : 35
[36m(RayTrainWor

[Rank 1] Train Epoch 0:   2%|▏         | 41/2000 [00:13<06:00,  5.43it/s]
[Rank 2] Train Epoch 0:   3%|▎         | 57/2000 [00:13<03:56,  8.21it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 37/2000 [00:13<07:49,  4.18it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 40 | Mem: 26.53MB, Util: 100%  global_step : 40
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 41 | Mem: 26.53MB, Util: 100%  global_step : 41
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 52 | Mem: 26.53MB, Util: 100%  global_step : 52
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 53 | Mem: 26.53MB, Util: 100%  global_step : 53
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 54 | Mem: 26.53MB, Util: 100%  global_step : 54
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 55 | Mem: 26.53MB, Util: 100%  global_step : 55
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 56 | Mem: 26.53MB, Util: 100%  global_step : 56
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 36 | Mem: 26.53MB, Util: 4%  global_step : 36
[36m(Ray

[Rank 1] Train Epoch 0:   2%|▏         | 43/2000 [00:14<05:06,  6.38it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 39/2000 [00:14<06:12,  5.26it/s]
[Rank 2] Train Epoch 0:   3%|▎         | 61/2000 [00:14<02:58, 10.86it/s]
[Rank 2] Train Epoch 0:   3%|▎         | 64/2000 [00:14<02:40, 12.10it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 42 | Mem: 26.53MB, Util: 100%  global_step : 42
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 43 | Mem: 26.53MB, Util: 100%  global_step : 43
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 38 | Mem: 26.53MB, Util: 16%  global_step : 38
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 60 | Mem: 26.53MB, Util: 100%  global_step : 60
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 61 | Mem: 26.53MB, Util: 100%  global_step : 61
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 62 | Mem: 26.53MB, Util: 100%  global_step : 62
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 63 | Mem: 26.53MB, Util: 100%  global_step : 63


[Rank 1] Train Epoch 0:   2%|▏         | 45/2000 [00:14<04:43,  6.90it/s]
[Rank 1] Train Epoch 0:   2%|▏         | 47/2000 [00:14<04:15,  7.64it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 44 | Mem: 26.53MB, Util: 100%  global_step : 44
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 45 | Mem: 26.53MB, Util: 100%  global_step : 45
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 39 | Mem: 26.53MB, Util: 7%  global_step : 39
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 64 | Mem: 26.53MB, Util: 100%  global_step : 64
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 46 | Mem: 26.53MB, Util: 100%  global_step : 46
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 47 | Mem: 26.53MB, Util: 100%  global_step : 47
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 40 | Mem: 26.53MB, Util: 7%  global_step : 40
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 41 | Mem: 26.53MB, Util: 3%  global_step : 41
[36m(RayTrai

[Rank 0] Train Epoch 0:   2%|▏         | 41/2000 [00:14<06:30,  5.02it/s]
[Rank 2] Train Epoch 0:   3%|▎         | 67/2000 [00:14<02:45, 11.65it/s]
[Rank 1] Train Epoch 0:   2%|▏         | 49/2000 [00:14<03:39,  8.87it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 44/2000 [00:15<07:07,  4.57it/s]
[Rank 1] Train Epoch 0:   3%|▎         | 52/2000 [00:15<05:06,  6.35it/s]
[Rank 1] Train Epoch 0:   3%|▎         | 56/2000 [00:15<03:20,  9.69it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 43 | Mem: 26.53MB, Util: 3%  global_step : 43
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 68 | Mem: 26.53MB, Util: 100%  global_step : 68
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 51 | Mem: 26.53MB, Util: 100%  global_step : 51
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 52 | Mem: 26.53MB, Util: 100%  global_step : 52
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 53 | Mem: 26.53MB, Util: 100%  global_step : 53
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 54 | Mem: 26.53MB, Util: 100%  global_step : 54
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 55 | Mem: 26.53MB, Util: 100%  global_step : 55
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 56 | Mem: 26.53MB, Util: 100%  global_step : 56
[36m

[Rank 0] Train Epoch 0:   2%|▏         | 45/2000 [00:15<06:39,  4.90it/s]
[Rank 2] Train Epoch 0:   4%|▎         | 70/2000 [00:15<04:34,  7.03it/s]
[Rank 1] Train Epoch 0:   3%|▎         | 59/2000 [00:15<02:40, 12.11it/s]
[Rank 0] Train Epoch 0:   2%|▏         | 46/2000 [00:15<06:20,  5.13it/s]
[Rank 1] Train Epoch 0:   3%|▎         | 63/2000 [00:15<02:09, 14.97it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 44 | Mem: 26.53MB, Util: 0%  global_step : 44
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 69 | Mem: 26.53MB, Util: 100%  global_step : 69
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 58 | Mem: 26.53MB, Util: 100%  global_step : 58
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 59 | Mem: 26.53MB, Util: 100%  global_step : 59
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 60 | Mem: 26.53MB, Util: 100%  global_step : 60
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 61 | Mem: 26.53MB, Util: 100%  global_step : 61
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 45 | Mem: 26.53MB, Util: 0%  global_step : 45
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 70 | Mem: 26.53MB, Util: 100%  global_step : 70
[36m(RayT

[Rank 0] Train Epoch 0:   2%|▏         | 48/2000 [00:15<04:54,  6.62it/s]
[Rank 2] Train Epoch 0:   4%|▎         | 72/2000 [00:15<04:26,  7.23it/s]
[Rank 1] Train Epoch 0:   3%|▎         | 66/2000 [00:15<01:58, 16.30it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 65 | Mem: 26.53MB, Util: 100%  global_step : 65
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 49 | Mem: 26.53MB, Util: 7%  global_step : 49
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 50 | Mem: 26.53MB, Util: 3%  global_step : 50
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 74 | Mem: 26.53MB, Util: 100%  global_step : 74
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 66 | Mem: 26.53MB, Util: 100%  global_step : 66
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 67 | Mem: 26.53MB, Util: 100%  global_step : 67
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 68 | Mem: 26.53MB, Util: 100%  global_step : 68
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 69 | Mem: 26.53MB, Util: 100%  global_step : 69
[36m(Ray

[Rank 0] Train Epoch 0:   2%|▎         | 50/2000 [00:15<04:09,  7.82it/s]
[Rank 2] Train Epoch 0:   4%|▍         | 75/2000 [00:15<03:41,  8.69it/s]
[Rank 1] Train Epoch 0:   3%|▎         | 69/2000 [00:15<01:51, 17.26it/s]
[Rank 0] Train Epoch 0:   3%|▎         | 52/2000 [00:15<03:31,  9.21it/s]
[Rank 2] Train Epoch 0:   4%|▍         | 77/2000 [00:15<03:20,  9.60it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 53 | Mem: 26.53MB, Util: 9%  global_step : 53
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 78 | Mem: 26.53MB, Util: 100%  global_step : 78
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 70 | Mem: 26.53MB, Util: 100%  global_step : 70
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 71 | Mem: 26.53MB, Util: 100%  global_step : 71
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 72 | Mem: 26.53MB, Util: 100%  global_step : 72


[Rank 1] Train Epoch 0:   4%|▎         | 72/2000 [00:17<06:46,  4.75it/s]
[Rank 0] Train Epoch 0:   3%|▎         | 55/2000 [00:17<09:36,  3.37it/s]
[Rank 2] Train Epoch 0:   4%|▍         | 80/2000 [00:17<08:17,  3.86it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 54 | Mem: 26.53MB, Util: 9%  global_step : 54
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 79 | Mem: 26.53MB, Util: 100%  global_step : 79
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 73 | Mem: 26.53MB, Util: 100%  global_step : 73
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 74 | Mem: 26.53MB, Util: 100%  global_step : 74
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 75 | Mem: 26.53MB, Util: 100%  global_step : 75
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 55 | Mem: 26.53MB, Util: 0%  global_step : 55
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 80 | Mem: 26.53MB, Util: 100%  global_step : 80
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 76 | Mem: 26.53MB, Util: 100%  global_step : 76


[Rank 1] Train Epoch 0:   4%|▍         | 77/2000 [00:17<04:33,  7.04it/s]
[Rank 0] Train Epoch 0:   3%|▎         | 57/2000 [00:17<08:02,  4.02it/s]
[Rank 1] Train Epoch 0:   4%|▍         | 80/2000 [00:18<03:42,  8.64it/s]
[Rank 2] Train Epoch 0:   4%|▍         | 82/2000 [00:17<07:14,  4.41it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 56 | Mem: 26.53MB, Util: 0%  global_step : 56
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 77 | Mem: 26.53MB, Util: 100%  global_step : 77
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 78 | Mem: 26.53MB, Util: 100%  global_step : 78
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 79 | Mem: 26.53MB, Util: 100%  global_step : 79
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 57 | Mem: 26.53MB, Util: 8%  global_step : 57
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 58 | Mem: 26.53MB, Util: 8%  global_step : 58
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 81 | Mem: 26.53MB, Util: 100%  global_step : 81
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 82 | Mem: 26.53MB, Util: 100%  global_step : 82
[36m(RayTrain

[Rank 0] Train Epoch 0:   3%|▎         | 60/2000 [00:18<06:07,  5.28it/s]
[Rank 2] Train Epoch 0:   4%|▍         | 85/2000 [00:18<05:45,  5.55it/s]
[Rank 0] Train Epoch 0:   3%|▎         | 62/2000 [00:18<04:58,  6.50it/s]
[Rank 2] Train Epoch 0:   4%|▍         | 87/2000 [00:18<04:46,  6.68it/s]
[Rank 1] Train Epoch 0:   4%|▍         | 82/2000 [00:18<04:07,  7.74it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 59 | Mem: 26.53MB, Util: 8%  global_step : 59
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 60 | Mem: 26.53MB, Util: 10%  global_step : 60
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 84 | Mem: 26.53MB, Util: 100%  global_step : 84
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 85 | Mem: 26.53MB, Util: 100%  global_step : 85
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 80 | Mem: 26.53MB, Util: 100%  global_step : 80
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 61 | Mem: 26.53MB, Util: 10%  global_step : 61
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 62 | Mem: 26.53MB, Util: 4%  global_step : 62
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 63 | Mem: 26.53MB, Util: 4%  global_step : 63
[36m(RayTrainWorker

[Rank 0] Train Epoch 0:   3%|▎         | 66/2000 [00:18<03:24,  9.46it/s]
[Rank 2] Train Epoch 0:   5%|▍         | 91/2000 [00:18<03:20,  9.52it/s]
[Rank 1] Train Epoch 0:   4%|▍         | 85/2000 [00:18<03:31,  9.07it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 66 | Mem: 26.53MB, Util: 4%  global_step : 66
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 91 | Mem: 26.53MB, Util: 100%  global_step : 91
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 87 | Mem: 26.53MB, Util: 100%  global_step : 87


[Rank 1] Train Epoch 0:   4%|▍         | 88/2000 [00:18<03:12,  9.91it/s]
[Rank 0] Train Epoch 0:   3%|▎         | 68/2000 [00:18<03:57,  8.13it/s]
[Rank 2] Train Epoch 0:   5%|▍         | 93/2000 [00:18<03:52,  8.20it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 67 | Mem: 26.53MB, Util: 16%  global_step : 67
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 92 | Mem: 26.53MB, Util: 100%  global_step : 92


[Rank 1] Train Epoch 0:   4%|▍         | 90/2000 [00:19<03:38,  8.73it/s]
[Rank 0] Train Epoch 0:   4%|▎         | 70/2000 [00:19<04:17,  7.50it/s]
[Rank 2] Train Epoch 0:   5%|▍         | 95/2000 [00:19<04:12,  7.56it/s]
[Rank 0] Train Epoch 0:   4%|▎         | 73/2000 [00:19<03:13,  9.98it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 88 | Mem: 26.53MB, Util: 100%  global_step : 88
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 89 | Mem: 26.53MB, Util: 100%  global_step : 89
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 68 | Mem: 26.53MB, Util: 3%  global_step : 68
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 69 | Mem: 26.53MB, Util: 3%  global_step : 69
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 93 | Mem: 26.53MB, Util: 100%  global_step : 93
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 94 | Mem: 26.53MB, Util: 100%  global_step : 94
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 90 | Mem: 26.53MB, Util: 100%  global_step : 90
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 91 | Mem: 26.53MB, Util: 100%  global_step : 91
[36m(RayT

[Rank 0] Train Epoch 0:   4%|▍         | 76/2000 [00:19<02:32, 12.61it/s]
[Rank 2] Train Epoch 0:   5%|▍         | 98/2000 [00:19<03:10,  9.99it/s]
[Rank 1] Train Epoch 0:   5%|▍         | 93/2000 [00:19<03:31,  9.03it/s]
[Rank 2] Train Epoch 0:   5%|▌         | 100/2000 [00:19<02:54, 10.88it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 97 | Mem: 26.53MB, Util: 100%  global_step : 97
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 98 | Mem: 26.53MB, Util: 100%  global_step : 98
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 73 | Mem: 26.53MB, Util: 3%  global_step : 73
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 74 | Mem: 26.53MB, Util: 3%  global_step : 74
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 75 | Mem: 26.53MB, Util: 3%  global_step : 75
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 92 | Mem: 26.53MB, Util: 100%  global_step : 92
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 99 | Mem: 26.53MB, Util: 100%  global_step : 99
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 93 | Mem: 26.53MB, Util: 100%  global_step : 93
[36m(RayTrainW

[Rank 1] Train Epoch 0:   5%|▍         | 95/2000 [00:20<08:04,  3.94it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 94 | Mem: 26.53MB, Util: 100%  global_step : 94


[Rank 1] Train Epoch 0:   5%|▍         | 96/2000 [00:25<25:56,  1.22it/s]
[Rank 0] Train Epoch 0:   4%|▍         | 78/2000 [00:25<23:55,  1.34it/s]
[Rank 0] Train Epoch 0:   4%|▍         | 81/2000 [00:25<16:07,  1.98it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 95 | Mem: 26.53MB, Util: 100%  global_step : 95
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 96 | Mem: 26.53MB, Util: 100%  global_step : 96
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 97 | Mem: 26.53MB, Util: 100%  global_step : 97
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 77 | Mem: 26.53MB, Util: 0%  global_step : 77
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 78 | Mem: 26.53MB, Util: 0%  global_step : 78
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 79 | Mem: 26.53MB, Util: 0%  global_step : 79
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 98 | Mem: 26.53MB, Util: 100%  global_step : 98
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 99 | Mem: 26.53MB, Util: 100%  global_step : 99
[36m(RayTra

[Rank 0] Train Epoch 0:   4%|▍         | 86/2000 [00:25<09:10,  3.47it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 84 | Mem: 26.53MB, Util: 0%  global_step : 84
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 85 | Mem: 26.53MB, Util: 0%  global_step : 85
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 86 | Mem: 26.53MB, Util: 0%  global_step : 86
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 87 | Mem: 26.53MB, Util: 9%  global_step : 87
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 88 | Mem: 26.53MB, Util: 9%  global_step : 88
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 89 | Mem: 26.53MB, Util: 9%  global_step : 89
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 90 | Mem: 26.53MB, Util: 9%  global_step : 90
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 91 | Mem: 26.53MB, Util: 0%  global_step : 91
[36m(RayTrainWorker pid=604, ip

[Rank 0] Train Epoch 0:   4%|▍         | 89/2000 [00:25<07:52,  4.04it/s]
[Rank 0] Train Epoch 0:   5%|▍         | 95/2000 [00:25<04:37,  6.86it/s]
[Rank 1] Train Epoch 0:   5%|▌         | 101/2000 [00:25<15:24,  2.05it/s]
[Rank 2] Train Epoch 0:   5%|▌         | 102/2000 [00:25<28:52,  1.10it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 93 | Mem: 26.53MB, Util: 0%  global_step : 93
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 94 | Mem: 26.53MB, Util: 0%  global_step : 94
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 95 | Mem: 26.53MB, Util: 0%  global_step : 95
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 96 | Mem: 26.53MB, Util: 0%  global_step : 96
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 97 | Mem: 26.53MB, Util: 0%  global_step : 97
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 98 | Mem: 26.53MB, Util: 0%  global_step : 98
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 99 | Mem: 26.53MB, Util: 0%  global_step : 99
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 100 | Mem: 26.53MB, Util: 100%  global_step : 100
[36m(RayTrainWorker pid=3

[Rank 0] Train Epoch 0:   5%|▌         | 102/2000 [00:25<02:51, 11.05it/s]
[Rank 1] Train Epoch 0:   5%|▌         | 108/2000 [00:26<07:51,  4.01it/s]
[Rank 2] Train Epoch 0:   6%|▌         | 116/2000 [00:26<08:06,  3.87it/s]
[Rank 0] Train Epoch 0:   5%|▌         | 108/2000 [00:26<02:05, 15.08it/s]
[Rank 2] Train Epoch 0:   6%|▋         | 130/2000 [00:26<04:04,  7.65it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 107 | Mem: 26.53MB, Util: 34%  global_step : 107
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 123 | Mem: 26.53MB, Util: 100%  global_step : 123
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 124 | Mem: 26.53MB, Util: 100%  global_step : 124
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 125 | Mem: 26.53MB, Util: 100%  global_step : 125
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 126 | Mem: 26.53MB, Util: 100%  global_step : 126
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 127 | Mem: 26.53MB, Util: 100%  global_step : 127
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 128 | Mem: 26.53MB, Util: 100%  global_step : 128
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 129 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 1] Train Epoch 0:   6%|▌         | 111/2000 [00:28<11:46,  2.68it/s]
[Rank 0] Train Epoch 0:   6%|▌         | 113/2000 [00:28<05:46,  5.44it/s]
[Rank 1] Train Epoch 0:   6%|▌         | 113/2000 [00:28<09:56,  3.16it/s]
[Rank 2] Train Epoch 0:   7%|▋         | 138/2000 [00:28<05:33,  5.59it/s]
[Rank 0] Train Epoch 0:   6%|▌         | 117/2000 [00:28<04:32,  6.91it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 114 | Mem: 26.53MB, Util: 0%  global_step : 114
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 115 | Mem: 26.53MB, Util: 0%  global_step : 115
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 116 | Mem: 26.53MB, Util: 0%  global_step : 116
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 114 | Mem: 26.53MB, Util: 100%  global_step : 114
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 139 | Mem: 26.53MB, Util: 100%  global_step : 139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 117 | Mem: 26.53MB, Util: 0%  global_step : 117
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 118 | Mem: 26.53MB, Util: 49%  global_step : 118
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 115 | Mem: 26.53MB, Util: 100%  global_step : 115
[36

[Rank 1] Train Epoch 0:   6%|▌         | 115/2000 [00:28<08:11,  3.83it/s]
[Rank 0] Train Epoch 0:   6%|▌         | 121/2000 [00:28<03:42,  8.46it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 119 | Mem: 26.53MB, Util: 49%  global_step : 119
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 120 | Mem: 26.53MB, Util: 49%  global_step : 120
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 117 | Mem: 26.53MB, Util: 22%  global_step : 117
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 118 | Mem: 26.53MB, Util: 22%  global_step : 118
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 142 | Mem: 26.53MB, Util: 100%  global_step : 142
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 143 | Mem: 26.53MB, Util: 100%  global_step : 143
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 121 | Mem: 26.53MB, Util: 49%  global_step : 121
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 122 | Mem: 26.53MB, Util: 100%  global_step : 122

[Rank 1] Train Epoch 0:   6%|▌         | 118/2000 [00:28<06:07,  5.12it/s]
[Rank 2] Train Epoch 0:   7%|▋         | 144/2000 [00:28<04:36,  6.72it/s]
[Rank 1] Train Epoch 0:   6%|▌         | 120/2000 [00:28<05:13,  5.99it/s]
[Rank 0] Train Epoch 0:   6%|▋         | 125/2000 [00:28<03:07, 10.00it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 123 | Mem: 26.53MB, Util: 100%  global_step : 123
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 124 | Mem: 26.53MB, Util: 100%  global_step : 124
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 125 | Mem: 26.53MB, Util: 100%  global_step : 125
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 126 | Mem: 26.53MB, Util: 100%  global_step : 126
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 127 | Mem: 26.53MB, Util: 100%  global_step : 127
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 121 | Mem: 26.53MB, Util: 22%  global_step : 121
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 122 | Mem: 26.53MB, Util: 22%  global_step : 122
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 123 | Mem: 26.53MB, Util: 22%  global_step : 1

[Rank 1] Train Epoch 0:   6%|▌         | 124/2000 [00:28<03:31,  8.88it/s]
[Rank 2] Train Epoch 0:   7%|▋         | 149/2000 [00:28<03:48,  8.08it/s]
[Rank 0] Train Epoch 0:   6%|▋         | 129/2000 [00:29<02:28, 12.63it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 130 | Mem: 26.53MB, Util: 100%  global_step : 130
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 131 | Mem: 26.53MB, Util: 100%  global_step : 131
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 125 | Mem: 26.53MB, Util: 13%  global_step : 125
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 126 | Mem: 26.53MB, Util: 15%  global_step : 126
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 127 | Mem: 26.53MB, Util: 15%  global_step : 127
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 150 | Mem: 26.53MB, Util: 100%  global_step : 150
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 151 | Mem: 26.53MB, Util: 100%  global_step : 151
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 152 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 1] Train Epoch 0:   6%|▋         | 127/2000 [00:29<03:04, 10.17it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 153/2000 [00:29<03:25,  8.99it/s]
[Rank 0] Train Epoch 0:   7%|▋         | 133/2000 [00:29<02:09, 14.37it/s]
[Rank 0] Train Epoch 0:   7%|▋         | 139/2000 [00:29<01:33, 19.90it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 138 | Mem: 26.53MB, Util: 100%  global_step : 138
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 139 | Mem: 26.53MB, Util: 100%  global_step : 139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 140 | Mem: 26.53MB, Util: 100%  global_step : 140
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 141 | Mem: 26.53MB, Util: 100%  global_step : 141
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 142 | Mem: 26.53MB, Util: 100%  global_step : 142
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 143 | Mem: 26.53MB, Util: 100%  global_step : 143


[Rank 0] Train Epoch 0:   7%|▋         | 143/2000 [00:29<01:23, 22.13it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 144 | Mem: 26.53MB, Util: 100%  global_step : 144


[Rank 0] Train Epoch 0:   7%|▋         | 147/2000 [00:30<02:14, 13.76it/s]
[Rank 1] Train Epoch 0:   6%|▋         | 130/2000 [00:29<04:43,  6.61it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 157/2000 [00:30<04:05,  7.50it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 145 | Mem: 26.53MB, Util: 100%  global_step : 145
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 146 | Mem: 26.53MB, Util: 100%  global_step : 146
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 129 | Mem: 26.53MB, Util: 15%  global_step : 129
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 130 | Mem: 26.53MB, Util: 0%  global_step : 130
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 154 | Mem: 26.53MB, Util: 100%  global_step : 154
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 155 | Mem: 26.53MB, Util: 100%  global_step : 155
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 147 | Mem: 26.53MB, Util: 100%  global_step : 147
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 148 | Mem: 26.53MB, Util: 100%  global_step : 1

[Rank 0] Train Epoch 0:   8%|▊         | 151/2000 [00:30<01:58, 15.60it/s]
[Rank 1] Train Epoch 0:   7%|▋         | 133/2000 [00:30<03:54,  7.96it/s]
[Rank 0] Train Epoch 0:   8%|▊         | 155/2000 [00:30<01:39, 18.53it/s]
[Rank 1] Train Epoch 0:   7%|▋         | 136/2000 [00:30<03:03, 10.15it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 160/2000 [00:30<03:40,  8.33it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 150 | Mem: 26.53MB, Util: 100%  global_step : 150
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 151 | Mem: 26.53MB, Util: 100%  global_step : 151
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 152 | Mem: 26.53MB, Util: 100%  global_step : 152
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 132 | Mem: 26.53MB, Util: 0%  global_step : 132
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 133 | Mem: 26.53MB, Util: 3%  global_step : 133
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 157 | Mem: 26.53MB, Util: 100%  global_step : 157
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 158 | Mem: 26.53MB, Util: 100%  global_step : 158
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 153 | Mem: 26.53MB, Util: 100%  global_step : 15

[Rank 1] Train Epoch 0:   7%|▋         | 138/2000 [00:30<03:11,  9.73it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 163/2000 [00:30<03:30,  8.73it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 138 | Mem: 26.53MB, Util: 10%  global_step : 138
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 163 | Mem: 26.53MB, Util: 100%  global_step : 163
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 157 | Mem: 26.53MB, Util: 100%  global_step : 157
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 158 | Mem: 26.53MB, Util: 100%  global_step : 158
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 139 | Mem: 26.53MB, Util: 10%  global_step : 139
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 140 | Mem: 26.53MB, Util: 10%  global_step : 140
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 164 | Mem: 26.53MB, Util: 100%  global_step : 164
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 165 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 0] Train Epoch 0:   8%|▊         | 158/2000 [00:30<02:11, 14.01it/s]
[Rank 1] Train Epoch 0:   7%|▋         | 140/2000 [00:30<02:56, 10.57it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 165/2000 [00:30<03:15,  9.38it/s]
[Rank 0] Train Epoch 0:   8%|▊         | 161/2000 [00:30<01:55, 15.93it/s]
[Rank 1] Train Epoch 0:   7%|▋         | 142/2000 [00:30<02:36, 11.86it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 167/2000 [00:30<02:56, 10.37it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 159 | Mem: 26.53MB, Util: 100%  global_step : 159
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 160 | Mem: 26.53MB, Util: 100%  global_step : 160
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 141 | Mem: 26.53MB, Util: 7%  global_step : 141
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 142 | Mem: 26.53MB, Util: 7%  global_step : 142
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 166 | Mem: 26.53MB, Util: 100%  global_step : 166
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 167 | Mem: 26.53MB, Util: 100%  global_step : 167
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 161 | Mem: 26.53MB, Util: 100%  global_step : 161
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 162 | Mem: 26.53MB, Util: 100%  global_step : 16

[Rank 0] Train Epoch 0:   8%|▊         | 164/2000 [00:30<01:42, 17.95it/s]
[Rank 1] Train Epoch 0:   7%|▋         | 145/2000 [00:30<02:06, 14.66it/s]
[Rank 2] Train Epoch 0:   8%|▊         | 170/2000 [00:30<02:24, 12.66it/s]
[Rank 1] Train Epoch 0:   7%|▋         | 148/2000 [00:32<08:00,  3.85it/s]
[Rank 2] Train Epoch 0:   9%|▊         | 173/2000 [00:32<07:28,  4.07it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 147 | Mem: 26.53MB, Util: 13%  global_step : 147
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 165 | Mem: 26.53MB, Util: 100%  global_step : 165
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 172 | Mem: 26.53MB, Util: 100%  global_step : 172
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 148 | Mem: 26.53MB, Util: 0%  global_step : 148
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 166 | Mem: 26.53MB, Util: 100%  global_step : 166
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 173 | Mem: 26.53MB, Util: 100%  global_step : 173
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 149 | Mem: 26.53MB, Util: 0%  global_step : 149
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 150 | Mem: 26.53MB, Util: 0%  global_step : 1

[Rank 0] Train Epoch 0:   8%|▊         | 167/2000 [00:33<07:32,  4.06it/s]
[Rank 1] Train Epoch 0:   8%|▊         | 150/2000 [00:33<07:33,  4.07it/s]
[Rank 0] Train Epoch 0:   8%|▊         | 169/2000 [00:33<06:18,  4.84it/s]
[Rank 2] Train Epoch 0:   9%|▉         | 175/2000 [00:33<07:10,  4.24it/s]
[Rank 1] Train Epoch 0:   8%|▊         | 153/2000 [00:33<05:23,  5.70it/s]
[Rank 0] Train Epoch 0:   9%|▊         | 172/2000 [00:33<04:46,  6.38it/s]
[Rank 2] Train Epoch 0:   9%|▉         | 178/2000 [00:33<05:15,  5.78it/s]
[Rank 1] Train Epoch 0:   8%|▊         | 159/2000 [00:33<02:59, 10.23it/s]
[Rank 2] Train Epoch 0:   9%|▉         | 184/2000 [00:33<02:59, 10.13it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 152 | Mem: 26.53MB, Util: 0%  global_step : 152
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 153 | Mem: 26.53MB, Util: 7%  global_step : 153
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 154 | Mem: 26.53MB, Util: 7%  global_step : 154
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 155 | Mem: 26.53MB, Util: 7%  global_step : 155
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 156 | Mem: 26.53MB, Util: 7%  global_step : 156
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 169 | Mem: 26.53MB, Util: 100%  global_step : 169
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 170 | Mem: 26.53MB, Util: 100%  global_step : 170
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 171 | Mem: 26.53MB, Util: 100%  global_step : 171


[Rank 0] Train Epoch 0:   9%|▊         | 174/2000 [00:33<04:12,  7.24it/s]
[Rank 1] Train Epoch 0:   8%|▊         | 162/2000 [00:33<02:42, 11.33it/s]
[Rank 2] Train Epoch 0:   9%|▉         | 187/2000 [00:33<02:42, 11.18it/s]
[Rank 1] Train Epoch 0:   8%|▊         | 167/2000 [00:33<01:59, 15.39it/s]
[Rank 0] Train Epoch 0:   9%|▉         | 176/2000 [00:33<03:51,  7.86it/s]
[Rank 2] Train Epoch 0:  10%|▉         | 192/2000 [00:33<01:59, 15.14it/s]
[Rank 1] Train Epoch 0:   8%|▊         | 170/2000 [00:33<01:47, 16.96it/s]
[Rank 2] Train Epoch 0:  10%|▉         | 195/2000 [00:33<01:48, 16.71it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 165 | Mem: 26.53MB, Util: 11%  global_step : 165
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 166 | Mem: 26.53MB, Util: 11%  global_step : 166
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 167 | Mem: 26.53MB, Util: 11%  global_step : 167
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 168 | Mem: 26.53MB, Util: 11%  global_step : 168
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 175 | Mem: 26.53MB, Util: 100%  global_step : 175
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 190 | Mem: 26.53MB, Util: 100%  global_step : 190
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 191 | Mem: 26.53MB, Util: 100%  global_step : 191
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 192 | Mem: 26.53MB, Util: 100%  global_step

[Rank 1] Train Epoch 0:   9%|▊         | 173/2000 [00:34<01:50, 16.47it/s]
[Rank 0] Train Epoch 0:   9%|▉         | 178/2000 [00:34<04:08,  7.34it/s]
[Rank 2] Train Epoch 0:  10%|▉         | 198/2000 [00:34<01:50, 16.38it/s]
[Rank 1] Train Epoch 0:   9%|▉         | 179/2000 [00:34<01:17, 23.59it/s]
[Rank 0] Train Epoch 0:   9%|▉         | 184/2000 [00:34<02:14, 13.49it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 176 | Mem: 26.53MB, Util: 6%  global_step : 176
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 177 | Mem: 26.53MB, Util: 6%  global_step : 177
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 178 | Mem: 26.53MB, Util: 6%  global_step : 178
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 179 | Mem: 26.53MB, Util: 6%  global_step : 179
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 182 | Mem: 26.53MB, Util: 100%  global_step : 182
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 183 | Mem: 26.53MB, Util: 100%  global_step : 183
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 184 | Mem: 26.53MB, Util: 100%  global_step : 184
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 185 | Mem: 26.53MB, Util: 100%  global_step : 185


[Rank 1] Train Epoch 0:   9%|▉         | 183/2000 [00:34<01:08, 26.67it/s]
[Rank 0] Train Epoch 0:   9%|▉         | 188/2000 [00:34<01:45, 17.19it/s]
[Rank 1] Train Epoch 0:   9%|▉         | 187/2000 [00:34<01:05, 27.88it/s]
[Rank 0] Train Epoch 0:  10%|▉         | 192/2000 [00:34<01:27, 20.67it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 185 | Mem: 26.53MB, Util: 28%  global_step : 185
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 186 | Mem: 26.53MB, Util: 28%  global_step : 186
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 191 | Mem: 26.53MB, Util: 100%  global_step : 191
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 192 | Mem: 26.53MB, Util: 100%  global_step : 192
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 187 | Mem: 26.53MB, Util: 28%  global_step : 187


[Rank 1] Train Epoch 0:  10%|▉         | 191/2000 [00:34<01:51, 16.28it/s]
[Rank 0] Train Epoch 0:  10%|▉         | 196/2000 [00:34<02:11, 13.73it/s]
[Rank 1] Train Epoch 0:  10%|▉         | 196/2000 [00:35<01:27, 20.69it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 188 | Mem: 26.53MB, Util: 22%  global_step : 188
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 189 | Mem: 26.53MB, Util: 4%  global_step : 189
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 190 | Mem: 26.53MB, Util: 0%  global_step : 190
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 191 | Mem: 26.53MB, Util: 0%  global_step : 191
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 192 | Mem: 26.53MB, Util: 0%  global_step : 192
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 193 | Mem: 26.53MB, Util: 100%  global_step : 193
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 194 | Mem: 26.53MB, Util: 100%  global_step : 194
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 195 | Mem: 26.53MB, Util: 100%  global_step : 195

[Rank 2] Train Epoch 0:  10%|█         | 201/2000 [00:35<04:01,  7.45it/s]
[Rank 0] Train Epoch 0:  10%|█         | 201/2000 [00:35<01:49, 16.40it/s]
[Rank 1] Train Epoch 0:  10%|█         | 202/2000 [00:35<01:12, 24.75it/s]
[Rank 2] Train Epoch 0:  11%|█         | 215/2000 [00:35<01:34, 18.97it/s]
[Rank 0] Train Epoch 0:  10%|█         | 206/2000 [00:35<01:28, 20.38it/s]
[Rank 1] Train Epoch 0:  10%|█         | 206/2000 [00:35<01:07, 26.69it/s]
[Rank 2] Train Epoch 0:  11%|█▏        | 229/2000 [00:35<00:54, 32.28it/s]
[Rank 0] Train Epoch 0:  10%|█         | 209/2000 [00:35<01:32, 19.39it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 204 | Mem: 26.53MB, Util: 23%  global_step : 204
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 205 | Mem: 26.53MB, Util: 22%  global_step : 205
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 206 | Mem: 26.53MB, Util: 22%  global_step : 206
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 207 | Mem: 26.53MB, Util: 22%  global_step : 207
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 208 | Mem: 26.53MB, Util: 22%  global_step : 208
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 209 | Mem: 26.53MB, Util: 22%  global_step : 209
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 225 | Mem: 26.53MB, Util: 99%  global_step : 225
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 226 | Mem: 26.53MB, Util: 99%  global_step 

[Rank 1] Train Epoch 0:  11%|█         | 211/2000 [00:35<01:06, 26.80it/s]
[Rank 0] Train Epoch 0:  11%|█         | 212/2000 [00:35<01:25, 20.98it/s]
[Rank 2] Train Epoch 0:  12%|█▏        | 237/2000 [00:35<00:53, 32.94it/s]
[Rank 1] Train Epoch 0:  11%|█         | 215/2000 [00:35<01:05, 27.19it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 210 | Mem: 26.53MB, Util: 22%  global_step : 210
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 211 | Mem: 26.53MB, Util: 22%  global_step : 211
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 212 | Mem: 26.53MB, Util: 66%  global_step : 212
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 210 | Mem: 26.53MB, Util: 96%  global_step : 210
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 211 | Mem: 26.53MB, Util: 96%  global_step : 211
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 212 | Mem: 26.53MB, Util: 96%  global_step : 212
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 234 | Mem: 26.53MB, Util: 100%  global_step : 234
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 235 | Mem: 26.53MB, Util: 100%  global_step : 23

[Rank 1] Train Epoch 0:  11%|█         | 220/2000 [00:35<00:57, 30.94it/s]
[Rank 0] Train Epoch 0:  11%|█         | 215/2000 [00:35<01:45, 17.00it/s]
[Rank 1] Train Epoch 0:  11%|█         | 224/2000 [00:35<00:58, 30.42it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 217 | Mem: 26.53MB, Util: 66%  global_step : 217
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 218 | Mem: 26.53MB, Util: 66%  global_step : 218
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 219 | Mem: 26.53MB, Util: 48%  global_step : 219
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 220 | Mem: 26.53MB, Util: 48%  global_step : 220
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 214 | Mem: 26.53MB, Util: 24%  global_step : 214
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 221 | Mem: 26.53MB, Util: 48%  global_step : 221
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 222 | Mem: 26.53MB, Util: 48%  global_step : 222
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 223 | Mem: 26.53MB, Util: 48%  global_step 

[Rank 0] Train Epoch 0:  11%|█         | 218/2000 [00:43<21:15,  1.40it/s]
[Rank 0] Train Epoch 0:  11%|█         | 220/2000 [00:43<17:13,  1.72it/s]
[Rank 1] Train Epoch 0:  11%|█▏        | 228/2000 [00:43<16:05,  1.83it/s]
[Rank 2] Train Epoch 0:  12%|█▏        | 244/2000 [00:43<09:11,  3.18it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 217 | Mem: 26.53MB, Util: 19%  global_step : 217
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 225 | Mem: 26.53MB, Util: 100%  global_step : 225
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 242 | Mem: 26.53MB, Util: 100%  global_step : 242
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 218 | Mem: 26.53MB, Util: 0%  global_step : 218
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 219 | Mem: 26.53MB, Util: 0%  global_step : 219
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 226 | Mem: 26.53MB, Util: 100%  global_step : 226
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 227 | Mem: 26.53MB, Util: 100%  global_step : 227
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 228 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 0] Train Epoch 0:  11%|█         | 222/2000 [00:43<13:40,  2.17it/s]
[Rank 1] Train Epoch 0:  12%|█▏        | 234/2000 [00:43<10:14,  2.87it/s]
[Rank 2] Train Epoch 0:  12%|█▏        | 249/2000 [00:43<07:27,  3.91it/s]
[Rank 1] Train Epoch 0:  12%|█▏        | 241/2000 [00:43<06:28,  4.53it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 220 | Mem: 26.53MB, Util: 0%  global_step : 220
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 221 | Mem: 26.53MB, Util: 8%  global_step : 221
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 232 | Mem: 26.53MB, Util: 100%  global_step : 232
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 233 | Mem: 26.53MB, Util: 100%  global_step : 233
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 234 | Mem: 26.53MB, Util: 100%  global_step : 234
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 235 | Mem: 26.53MB, Util: 100%  global_step : 235
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 236 | Mem: 26.53MB, Util: 100%  global_step : 236
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 237 | Mem: 26.53MB, Util: 100%  global_ste

[Rank 0] Train Epoch 0:  11%|█▏        | 226/2000 [00:43<08:48,  3.35it/s]
[Rank 2] Train Epoch 0:  13%|█▎        | 254/2000 [00:43<05:59,  4.86it/s]
[Rank 1] Train Epoch 0:  12%|█▏        | 248/2000 [00:43<04:20,  6.74it/s]
[Rank 0] Train Epoch 0:  12%|█▏        | 230/2000 [00:43<05:56,  4.96it/s]
[Rank 1] Train Epoch 0:  13%|█▎        | 254/2000 [00:43<03:10,  9.15it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 225 | Mem: 26.53MB, Util: 8%  global_step : 225
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 226 | Mem: 26.53MB, Util: 19%  global_step : 226
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 227 | Mem: 26.53MB, Util: 19%  global_step : 227
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 250 | Mem: 26.53MB, Util: 100%  global_step : 250
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 251 | Mem: 26.53MB, Util: 100%  global_step : 251
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 252 | Mem: 26.53MB, Util: 100%  global_step : 252
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 253 | Mem: 26.53MB, Util: 100%  global_step : 253
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 245 | Mem: 26.53MB, Util: 100%  global_step : 24

[Rank 0] Train Epoch 0:  12%|█▏        | 233/2000 [00:43<04:38,  6.35it/s]
[Rank 2] Train Epoch 0:  13%|█▎        | 258/2000 [00:43<04:59,  5.82it/s]
[Rank 1] Train Epoch 0:  13%|█▎        | 259/2000 [00:43<02:35, 11.23it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 231 | Mem: 26.53MB, Util: 19%  global_step : 231
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 232 | Mem: 26.53MB, Util: 13%  global_step : 232
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 256 | Mem: 26.53MB, Util: 100%  global_step : 256
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 257 | Mem: 26.53MB, Util: 100%  global_step : 257
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 257 | Mem: 26.53MB, Util: 100%  global_step : 257
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 233 | Mem: 26.53MB, Util: 13%  global_step : 233
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 234 | Mem: 26.53MB, Util: 13%  global_step : 234
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 258 | Mem: 26.53MB, Util: 100%  global_step : 25

[Rank 0] Train Epoch 0:  12%|█▏        | 236/2000 [00:44<03:40,  8.01it/s]
[Rank 2] Train Epoch 0:  13%|█▎        | 262/2000 [00:44<04:08,  6.99it/s]
[Rank 1] Train Epoch 0:  13%|█▎        | 264/2000 [00:44<02:08, 13.52it/s]
[Rank 0] Train Epoch 0:  12%|█▏        | 239/2000 [00:44<02:54, 10.07it/s]
[Rank 2] Train Epoch 0:  13%|█▎        | 266/2000 [00:44<03:18,  8.75it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 235 | Mem: 26.53MB, Util: 13%  global_step : 235
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 236 | Mem: 26.53MB, Util: 13%  global_step : 236
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 237 | Mem: 26.53MB, Util: 14%  global_step : 237
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 261 | Mem: 26.53MB, Util: 100%  global_step : 261
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 262 | Mem: 26.53MB, Util: 100%  global_step : 262
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 263 | Mem: 26.53MB, Util: 100%  global_step : 263
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 264 | Mem: 26.53MB, Util: 100%  global_step : 264
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 261 | Mem: 26.53MB, Util: 100%  global_step : 2

[Rank 0] Train Epoch 0:  12%|█▏        | 242/2000 [00:44<02:22, 12.36it/s]
[Rank 1] Train Epoch 0:  13%|█▎        | 268/2000 [00:44<01:51, 15.51it/s]
[Rank 2] Train Epoch 0:  14%|█▎        | 270/2000 [00:44<02:44, 10.53it/s]
[Rank 0] Train Epoch 0:  12%|█▏        | 245/2000 [00:44<01:58, 14.81it/s]
[Rank 1] Train Epoch 0:  14%|█▎        | 272/2000 [00:44<01:42, 16.79it/s]
[Rank 2] Train Epoch 0:  14%|█▎        | 273/2000 [00:44<02:22, 12.16it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 241 | Mem: 26.53MB, Util: 14%  global_step : 241
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 242 | Mem: 26.53MB, Util: 22%  global_step : 242
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 243 | Mem: 26.53MB, Util: 22%  global_step : 243
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 267 | Mem: 26.53MB, Util: 100%  global_step : 267
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 268 | Mem: 26.53MB, Util: 100%  global_step : 268
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 269 | Mem: 26.53MB, Util: 100%  global_step : 269
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 268 | Mem: 26.53MB, Util: 100%  global_step : 268
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 269 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 0] Train Epoch 0:  12%|█▏        | 248/2000 [00:44<01:43, 17.01it/s]
[Rank 1] Train Epoch 0:  14%|█▍        | 278/2000 [00:44<01:26, 19.83it/s]
[Rank 2] Train Epoch 0:  14%|█▍        | 278/2000 [00:44<01:55, 14.88it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 246 | Mem: 26.53MB, Util: 22%  global_step : 246
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 247 | Mem: 26.53MB, Util: 16%  global_step : 247
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 248 | Mem: 26.53MB, Util: 16%  global_step : 248
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 249 | Mem: 26.53MB, Util: 16%  global_step : 249
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 250 | Mem: 26.53MB, Util: 16%  global_step : 250
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 274 | Mem: 26.53MB, Util: 100%  global_step : 274
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 275 | Mem: 26.53MB, Util: 100%  global_step : 275
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 276 | Mem: 26.53MB, Util: 100%  global_step : 276

[Rank 0] Train Epoch 0:  13%|█▎        | 253/2000 [00:44<01:26, 20.11it/s]
[Rank 1] Train Epoch 0:  14%|█▍        | 282/2000 [00:44<01:15, 22.67it/s]
[Rank 2] Train Epoch 0:  14%|█▍        | 282/2000 [00:44<01:35, 18.02it/s]
[Rank 0] Train Epoch 0:  13%|█▎        | 257/2000 [00:44<01:13, 23.70it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 252 | Mem: 26.53MB, Util: 16%  global_step : 252
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 253 | Mem: 26.53MB, Util: 22%  global_step : 253
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 254 | Mem: 26.53MB, Util: 22%  global_step : 254
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 255 | Mem: 26.53MB, Util: 22%  global_step : 255
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 279 | Mem: 26.53MB, Util: 100%  global_step : 279
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 280 | Mem: 26.53MB, Util: 100%  global_step : 280
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 281 | Mem: 26.53MB, Util: 100%  global_step : 281
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 282 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 2] Train Epoch 0:  14%|█▍        | 286/2000 [00:45<02:01, 14.07it/s]
[Rank 1] Train Epoch 0:  14%|█▍        | 286/2000 [00:45<01:45, 16.26it/s]
[Rank 0] Train Epoch 0:  13%|█▎        | 261/2000 [00:45<01:50, 15.71it/s]
[Rank 0] Train Epoch 0:  13%|█▎        | 264/2000 [00:45<01:43, 16.74it/s]
[Rank 1] Train Epoch 0:  14%|█▍        | 289/2000 [00:45<01:40, 17.07it/s]
[Rank 2] Train Epoch 0:  14%|█▍        | 289/2000 [00:45<01:52, 15.21it/s]
[Rank 0] Train Epoch 0:  13%|█▎        | 269/2000 [00:45<01:28, 19.46it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 287 | Mem: 26.53MB, Util: 100%  global_step : 287
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 288 | Mem: 26.53MB, Util: 100%  global_step : 288
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 289 | Mem: 26.53MB, Util: 100%  global_step : 289
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 290 | Mem: 26.53MB, Util: 100%  global_step : 290
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 291 | Mem: 26.53MB, Util: 100%  global_step : 291
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 287 | Mem: 26.53MB, Util: 100%  global_step : 287
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 288 | Mem: 26.53MB, Util: 100%  global_step : 288
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 289 | Mem: 26.53MB, Util: 100%  globa

[Rank 1] Train Epoch 0:  15%|█▍        | 294/2000 [00:45<01:27, 19.41it/s]
[Rank 2] Train Epoch 0:  15%|█▍        | 294/2000 [00:45<01:35, 17.92it/s]
[Rank 0] Train Epoch 0:  14%|█▎        | 272/2000 [00:45<01:28, 19.55it/s]
[Rank 1] Train Epoch 0:  15%|█▍        | 297/2000 [00:45<01:27, 19.49it/s]
[Rank 2] Train Epoch 0:  15%|█▍        | 297/2000 [00:45<01:32, 18.33it/s]
[Rank 0] Train Epoch 0:  14%|█▍        | 277/2000 [00:45<01:09, 24.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 293 | Mem: 26.53MB, Util: 100%  global_step : 293
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 294 | Mem: 26.53MB, Util: 100%  global_step : 294
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 293 | Mem: 26.53MB, Util: 100%  global_step : 293
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 294 | Mem: 26.53MB, Util: 100%  global_step : 294
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 269 | Mem: 26.53MB, Util: 19%  global_step : 269
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 270 | Mem: 26.53MB, Util: 12%  global_step : 270
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 271 | Mem: 26.53MB, Util: 12%  global_step : 271
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 295 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 0] Train Epoch 0:  14%|█▍        | 283/2000 [00:45<00:55, 31.12it/s]
[Rank 0] Train Epoch 0:  14%|█▍        | 289/2000 [00:46<00:48, 35.23it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 288 | Mem: 26.53MB, Util: 22%  global_step : 288
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 289 | Mem: 26.53MB, Util: 33%  global_step : 289
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 290 | Mem: 26.53MB, Util: 0%  global_step : 290
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 291 | Mem: 26.53MB, Util: 0%  global_step : 291
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 292 | Mem: 26.53MB, Util: 0%  global_step : 292
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 293 | Mem: 26.53MB, Util: 0%  global_step : 293
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 294 | Mem: 26.53MB, Util: 0%  global_step : 294
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 295 | Mem: 26.53MB, Util: 0%  global_step : 295
[36m(RayTrain

[Rank 0] Train Epoch 0:  15%|█▍        | 293/2000 [00:47<03:22,  8.42it/s]
[Rank 0] Train Epoch 0:  15%|█▌        | 301/2000 [00:47<02:08, 13.18it/s]
[Rank 1] Train Epoch 0:  15%|█▌        | 301/2000 [00:47<05:14,  5.40it/s]
[Rank 2] Train Epoch 0:  15%|█▌        | 301/2000 [00:47<05:23,  5.25it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 300 | Mem: 26.53MB, Util: 100%  global_step : 300
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 301 | Mem: 26.53MB, Util: 100%  global_step : 301
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 302 | Mem: 26.53MB, Util: 100%  global_step : 302
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 303 | Mem: 26.53MB, Util: 100%  global_step : 303
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 304 | Mem: 26.53MB, Util: 100%  global_step : 304
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 305 | Mem: 26.53MB, Util: 99%  global_step : 305
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 300 | Mem: 26.53MB, Util: 100%  global_step : 300
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 301 | Mem: 26.53MB, Util: 100%  globa

[Rank 1] Train Epoch 0:  15%|█▌        | 307/2000 [00:47<03:26,  8.20it/s]
[Rank 2] Train Epoch 0:  16%|█▌        | 315/2000 [00:47<02:13, 12.62it/s]
[Rank 0] Train Epoch 0:  15%|█▌        | 305/2000 [00:47<01:59, 14.23it/s]
[Rank 1] Train Epoch 0:  16%|█▌        | 314/2000 [00:47<02:13, 12.60it/s]
[Rank 2] Train Epoch 0:  16%|█▋        | 329/2000 [00:47<01:18, 21.25it/s]
[Rank 0] Train Epoch 0:  15%|█▌        | 309/2000 [00:48<01:42, 16.54it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 310 | Mem: 26.53MB, Util: 99%  global_step : 310
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 311 | Mem: 26.53MB, Util: 99%  global_step : 311
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 312 | Mem: 26.53MB, Util: 99%  global_step : 312
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 313 | Mem: 26.53MB, Util: 99%  global_step : 313
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 314 | Mem: 26.53MB, Util: 99%  global_step : 314
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 327 | Mem: 26.53MB, Util: 98%  global_step : 327
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 328 | Mem: 26.53MB, Util: 98%  global_step : 328
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 329 | Mem: 26.53MB, Util: 98%  global_step :

[Rank 0] Train Epoch 0:  16%|█▌        | 313/2000 [00:48<01:28, 19.13it/s]
[Rank 1] Train Epoch 0:  16%|█▌        | 318/2000 [00:48<01:56, 14.41it/s]
[Rank 2] Train Epoch 0:  17%|█▋        | 336/2000 [00:48<01:08, 24.29it/s]
[Rank 1] Train Epoch 0:  16%|█▌        | 322/2000 [00:48<01:48, 15.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 317 | Mem: 26.53MB, Util: 100%  global_step : 317
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 318 | Mem: 26.53MB, Util: 100%  global_step : 318
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 319 | Mem: 26.53MB, Util: 100%  global_step : 319
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 335 | Mem: 26.53MB, Util: 100%  global_step : 335
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 336 | Mem: 26.53MB, Util: 100%  global_step : 336
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 337 | Mem: 26.53MB, Util: 100%  global_step : 337
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 313 | Mem: 26.53MB, Util: 20%  global_step : 313
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 314 | Mem: 26.53MB, Util: 20%  global_step

[Rank 0] Train Epoch 0:  16%|█▌        | 317/2000 [00:48<01:38, 17.17it/s]
[Rank 1] Train Epoch 0:  16%|█▋        | 325/2000 [00:48<01:48, 15.37it/s]
[Rank 2] Train Epoch 0:  17%|█▋        | 342/2000 [00:48<01:17, 21.41it/s]
[Rank 0] Train Epoch 0:  16%|█▌        | 323/2000 [00:48<01:12, 23.05it/s]
[Rank 2] Train Epoch 0:  17%|█▋        | 348/2000 [00:48<01:04, 25.47it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 325 | Mem: 26.53MB, Util: 100%  global_step : 325
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 326 | Mem: 26.53MB, Util: 100%  global_step : 326
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 327 | Mem: 26.53MB, Util: 100%  global_step : 327
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 328 | Mem: 26.53MB, Util: 100%  global_step : 328
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 329 | Mem: 26.53MB, Util: 100%  global_step : 329
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 330 | Mem: 26.53MB, Util: 100%  global_step : 330
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 343 | Mem: 26.53MB, Util: 100%  global_step : 343
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 344 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 0:  17%|█▋        | 332/2000 [00:48<01:14, 22.25it/s]
[Rank 0] Train Epoch 0:  16%|█▋        | 327/2000 [00:48<01:10, 23.87it/s]
[Rank 1] Train Epoch 0:  17%|█▋        | 336/2000 [00:48<01:10, 23.45it/s]
[Rank 2] Train Epoch 0:  18%|█▊        | 354/2000 [00:48<01:01, 26.95it/s]
[Rank 0] Train Epoch 0:  17%|█▋        | 333/2000 [00:48<00:55, 29.95it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 333 | Mem: 26.53MB, Util: 100%  global_step : 333
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 334 | Mem: 26.53MB, Util: 100%  global_step : 334
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 335 | Mem: 26.53MB, Util: 100%  global_step : 335
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 336 | Mem: 26.53MB, Util: 100%  global_step : 336
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 337 | Mem: 26.53MB, Util: 100%  global_step : 337
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 338 | Mem: 26.53MB, Util: 100%  global_step : 338
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 351 | Mem: 26.53MB, Util: 100%  global_step : 351
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 352 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 0:  17%|█▋        | 340/2000 [00:48<01:06, 24.87it/s]
[Rank 2] Train Epoch 0:  18%|█▊        | 359/2000 [00:48<00:58, 28.22it/s]
[Rank 0] Train Epoch 0:  17%|█▋        | 337/2000 [00:49<01:02, 26.79it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 341 | Mem: 26.53MB, Util: 100%  global_step : 341
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 342 | Mem: 26.53MB, Util: 100%  global_step : 342
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 358 | Mem: 26.53MB, Util: 100%  global_step : 358
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 359 | Mem: 26.53MB, Util: 100%  global_step : 359
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 360 | Mem: 26.53MB, Util: 100%  global_step : 360
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 336 | Mem: 26.53MB, Util: 24%  global_step : 336
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 337 | Mem: 26.53MB, Util: 16%  global_step : 337
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 338 | Mem: 26.53MB, Util: 16%  global_step : 

[Rank 1] Train Epoch 0:  17%|█▋        | 344/2000 [00:49<01:07, 24.40it/s]
[Rank 2] Train Epoch 0:  18%|█▊        | 364/2000 [00:49<00:58, 27.98it/s]
[Rank 0] Train Epoch 0:  17%|█▋        | 341/2000 [00:49<00:57, 28.99it/s]
[Rank 1] Train Epoch 0:  17%|█▋        | 349/2000 [00:49<00:56, 29.16it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 365 | Mem: 26.53MB, Util: 100%  global_step : 365
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 366 | Mem: 26.53MB, Util: 100%  global_step : 366
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 347 | Mem: 26.53MB, Util: 100%  global_step : 347
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 348 | Mem: 26.53MB, Util: 100%  global_step : 348
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 342 | Mem: 26.53MB, Util: 16%  global_step : 342
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 343 | Mem: 26.53MB, Util: 0%  global_step : 343
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 349 | Mem: 26.53MB, Util: 100%  global_step : 349
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 367 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 2] Train Epoch 0:  18%|█▊        | 368/2000 [00:49<01:40, 16.25it/s]
[Rank 0] Train Epoch 0:  17%|█▋        | 345/2000 [00:49<01:53, 14.54it/s]
[Rank 1] Train Epoch 0:  18%|█▊        | 353/2000 [00:49<01:51, 14.82it/s]
[Rank 0] Train Epoch 0:  17%|█▋        | 349/2000 [00:49<01:38, 16.84it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 346 | Mem: 26.53MB, Util: 0%  global_step : 346
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 347 | Mem: 26.53MB, Util: 0%  global_step : 347
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 348 | Mem: 26.53MB, Util: 0%  global_step : 348
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 353 | Mem: 26.53MB, Util: 100%  global_step : 353
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 354 | Mem: 26.53MB, Util: 100%  global_step : 354
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 371 | Mem: 26.53MB, Util: 100%  global_step : 371
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 372 | Mem: 26.53MB, Util: 100%  global_step : 372
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 373 | Mem: 26.53MB, Util: 100%  global_step : 373

[Rank 2] Train Epoch 0:  19%|█▊        | 372/2000 [00:49<01:30, 17.94it/s]
[Rank 1] Train Epoch 0:  18%|█▊        | 356/2000 [00:49<01:45, 15.63it/s]
[Rank 0] Train Epoch 0:  18%|█▊        | 352/2000 [00:50<01:30, 18.12it/s]
[Rank 2] Train Epoch 0:  19%|█▉        | 375/2000 [00:49<01:24, 19.21it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 376 | Mem: 26.53MB, Util: 100%  global_step : 376
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 358 | Mem: 26.53MB, Util: 100%  global_step : 358
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 352 | Mem: 26.53MB, Util: 13%  global_step : 352
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 353 | Mem: 26.53MB, Util: 16%  global_step : 353
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 377 | Mem: 26.53MB, Util: 100%  global_step : 377
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 378 | Mem: 26.53MB, Util: 100%  global_step : 378
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 359 | Mem: 26.53MB, Util: 100%  global_step : 359
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 354 | Mem: 26.53MB, Util: 16%  global_step : 

[Rank 1] Train Epoch 0:  18%|█▊        | 359/2000 [00:50<01:46, 15.34it/s]
[Rank 2] Train Epoch 0:  19%|█▉        | 378/2000 [00:50<01:30, 18.01it/s]
[Rank 0] Train Epoch 0:  18%|█▊        | 355/2000 [00:50<01:43, 15.88it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 379 | Mem: 26.53MB, Util: 100%  global_step : 379
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 360 | Mem: 26.53MB, Util: 100%  global_step : 360
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 361 | Mem: 26.53MB, Util: 100%  global_step : 361
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 355 | Mem: 26.53MB, Util: 10%  global_step : 355
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 356 | Mem: 26.53MB, Util: 10%  global_step : 356
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 357 | Mem: 26.53MB, Util: 10%  global_step : 357
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 362 | Mem: 26.53MB, Util: 100%  global_step : 362
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 363 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 1] Train Epoch 0:  18%|█▊        | 362/2000 [00:50<01:48, 15.17it/s]
[Rank 0] Train Epoch 0:  18%|█▊        | 358/2000 [00:50<01:38, 16.71it/s]
[Rank 2] Train Epoch 0:  19%|█▉        | 381/2000 [00:50<01:34, 17.10it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 358 | Mem: 26.53MB, Util: 10%  global_step : 358
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 382 | Mem: 26.53MB, Util: 100%  global_step : 382
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 383 | Mem: 26.53MB, Util: 100%  global_step : 383
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 359 | Mem: 26.53MB, Util: 7%  global_step : 359
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 360 | Mem: 26.53MB, Util: 7%  global_step : 360
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 364 | Mem: 26.53MB, Util: 100%  global_step : 364
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 365 | Mem: 26.53MB, Util: 100%  global_step : 365
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 384 | Mem: 26.53MB, Util: 100%  global_step : 38

[Rank 2] Train Epoch 0:  19%|█▉        | 384/2000 [00:50<01:36, 16.78it/s]
[Rank 0] Train Epoch 0:  18%|█▊        | 361/2000 [00:50<01:44, 15.74it/s]
[Rank 1] Train Epoch 0:  18%|█▊        | 365/2000 [00:50<01:57, 13.92it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 366 | Mem: 26.53MB, Util: 100%  global_step : 366
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 367 | Mem: 26.53MB, Util: 100%  global_step : 367
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 361 | Mem: 26.53MB, Util: 7%  global_step : 361
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 362 | Mem: 26.53MB, Util: 7%  global_step : 362
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 385 | Mem: 26.53MB, Util: 100%  global_step : 385
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 386 | Mem: 26.53MB, Util: 100%  global_step : 386
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 387 | Mem: 26.53MB, Util: 100%  global_step : 387
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 368 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 2] Train Epoch 0:  19%|█▉        | 386/2000 [00:50<01:36, 16.81it/s]
[Rank 1] Train Epoch 0:  18%|█▊        | 369/2000 [00:50<01:43, 15.74it/s]
[Rank 0] Train Epoch 0:  18%|█▊        | 364/2000 [00:50<01:37, 16.75it/s]
[Rank 2] Train Epoch 0:  19%|█▉        | 389/2000 [00:50<01:32, 17.51it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 390 | Mem: 26.53MB, Util: 100%  global_step : 390


[Rank 1] Train Epoch 0:  19%|█▊        | 372/2000 [00:54<09:14,  2.94it/s]
[Rank 2] Train Epoch 0:  20%|█▉        | 392/2000 [00:54<09:54,  2.70it/s]
[Rank 0] Train Epoch 0:  18%|█▊        | 367/2000 [00:54<09:44,  2.79it/s]
[Rank 0] Train Epoch 0:  19%|█▉        | 380/2000 [00:54<03:37,  7.45it/s]
[Rank 1] Train Epoch 0:  19%|█▊        | 374/2000 [00:54<07:42,  3.51it/s]
[Rank 2] Train Epoch 0:  20%|█▉        | 399/2000 [00:54<05:09,  5.17it/s]
[Rank 0] Train Epoch 0:  20%|█▉        | 396/2000 [00:54<01:46, 15.02it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 371 | Mem: 26.53MB, Util: 100%  global_step : 371
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 372 | Mem: 26.53MB, Util: 100%  global_step : 372
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 391 | Mem: 26.53MB, Util: 100%  global_step : 391
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 392 | Mem: 26.53MB, Util: 100%  global_step : 392
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 393 | Mem: 26.53MB, Util: 100%  global_step : 393
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 394 | Mem: 26.53MB, Util: 100%  global_step : 394
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 395 | Mem: 26.53MB, Util: 100%  global_step : 395
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 396 | Mem: 26.53MB, Util: 100%  global_s

[Rank 1] Train Epoch 0:  19%|█▉        | 378/2000 [00:54<06:26,  4.20it/s]
[Rank 1] Train Epoch 0:  19%|█▉        | 384/2000 [00:54<03:47,  7.11it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 377 | Mem: 26.53MB, Util: 100%  global_step : 377
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 378 | Mem: 26.53MB, Util: 0%  global_step : 378
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 379 | Mem: 26.53MB, Util: 0%  global_step : 379
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 380 | Mem: 26.53MB, Util: 0%  global_step : 380
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 381 | Mem: 26.53MB, Util: 0%  global_step : 381
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 382 | Mem: 26.53MB, Util: 9%  global_step : 382
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 383 | Mem: 26.53MB, Util: 9%  global_step : 383
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 384 | Mem: 26.53MB, Util: 9%  global_step : 38

[Rank 1] Train Epoch 0:  19%|█▉        | 389/2000 [00:55<02:41, 10.00it/s]
[Rank 1] Train Epoch 0:  20%|█▉        | 393/2000 [00:55<02:06, 12.67it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 386 | Mem: 26.53MB, Util: 9%  global_step : 386
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 387 | Mem: 26.53MB, Util: 9%  global_step : 387
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 388 | Mem: 26.53MB, Util: 9%  global_step : 388
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 389 | Mem: 26.53MB, Util: 9%  global_step : 389
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 390 | Mem: 26.53MB, Util: 9%  global_step : 390
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 391 | Mem: 26.53MB, Util: 31%  global_step : 391
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 392 | Mem: 26.53MB, Util: 31%  global_step : 392
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 393 | Mem: 26.53MB, Util: 31%  global_step : 3

[Rank 1] Train Epoch 0:  20%|█▉        | 398/2000 [00:55<01:35, 16.75it/s]
[Rank 2] Train Epoch 0:  20%|██        | 402/2000 [00:55<06:13,  4.27it/s]
[Rank 1] Train Epoch 0:  20%|██        | 402/2000 [00:55<01:21, 19.66it/s]
[Rank 0] Train Epoch 0:  20%|██        | 405/2000 [00:55<02:10, 12.24it/s]
[Rank 2] Train Epoch 0:  21%|██        | 417/2000 [00:55<02:25, 10.87it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 396 | Mem: 26.53MB, Util: 31%  global_step : 396
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 397 | Mem: 26.53MB, Util: 31%  global_step : 397
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 398 | Mem: 26.53MB, Util: 31%  global_step : 398
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 400 | Mem: 26.53MB, Util: 100%  global_step : 400
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 401 | Mem: 26.53MB, Util: 100%  global_step : 401
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 402 | Mem: 26.53MB, Util: 100%  global_step : 402
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 403 | Mem: 26.53MB, Util: 100%  global_step : 403
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 404 | Mem: 26.53MB, Util: 100%  global_ste

[Rank 0] Train Epoch 0:  21%|██        | 420/2000 [00:55<01:19, 19.77it/s]
[Rank 2] Train Epoch 0:  22%|██▏       | 432/2000 [00:55<01:21, 19.27it/s]
[Rank 1] Train Epoch 0:  20%|██        | 408/2000 [00:55<01:05, 24.23it/s]
[Rank 0] Train Epoch 0:  22%|██▏       | 434/2000 [00:55<00:55, 28.01it/s]
[Rank 1] Train Epoch 0:  21%|██        | 412/2000 [00:55<00:59, 26.90it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 411 | Mem: 26.53MB, Util: 28%  global_step : 411
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 412 | Mem: 26.53MB, Util: 28%  global_step : 412
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 413 | Mem: 26.53MB, Util: 28%  global_step : 413
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 414 | Mem: 26.53MB, Util: 28%  global_step : 414
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 436 | Mem: 26.53MB, Util: 100%  global_step : 436
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 437 | Mem: 26.53MB, Util: 100%  global_step : 437
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 438 | Mem: 26.53MB, Util: 100%  global_step : 438
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 439 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 2] Train Epoch 0:  22%|██▏       | 441/2000 [00:55<01:08, 22.68it/s]
[Rank 1] Train Epoch 0:  21%|██        | 416/2000 [00:55<00:55, 28.45it/s]
[Rank 0] Train Epoch 0:  22%|██▏       | 444/2000 [00:55<00:50, 30.67it/s]
[Rank 1] Train Epoch 0:  21%|██        | 420/2000 [00:55<00:53, 29.54it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 419 | Mem: 26.53MB, Util: 23%  global_step : 419
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 420 | Mem: 26.53MB, Util: 23%  global_step : 420
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 421 | Mem: 26.53MB, Util: 23%  global_step : 421
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 444 | Mem: 26.53MB, Util: 100%  global_step : 444
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 445 | Mem: 26.53MB, Util: 100%  global_step : 445
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 446 | Mem: 26.53MB, Util: 100%  global_step : 446
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 447 | Mem: 26.53MB, Util: 100%  global_step : 447
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 448 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 2] Train Epoch 0:  22%|██▏       | 448/2000 [00:55<01:02, 24.89it/s]
[Rank 1] Train Epoch 0:  21%|██▏       | 425/2000 [00:56<00:47, 33.31it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 425 | Mem: 26.53MB, Util: 22%  global_step : 425
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 450 | Mem: 26.53MB, Util: 100%  global_step : 450
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 450 | Mem: 26.53MB, Util: 100%  global_step : 450
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 426 | Mem: 26.53MB, Util: 0%  global_step : 426
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 427 | Mem: 26.53MB, Util: 0%  global_step : 427
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 451 | Mem: 26.53MB, Util: 100%  global_step : 451
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 452 | Mem: 26.53MB, Util: 100%  global_step : 452
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 451 | Mem: 26.53MB, Util: 100%  global_step : 4

[Rank 0] Train Epoch 0:  23%|██▎       | 452/2000 [00:58<02:37,  9.81it/s]
[Rank 2] Train Epoch 0:  23%|██▎       | 454/2000 [00:58<03:17,  7.81it/s]
[Rank 1] Train Epoch 0:  21%|██▏       | 429/2000 [00:58<05:05,  5.14it/s]
[Rank 2] Train Epoch 0:  23%|██▎       | 459/2000 [00:58<02:43,  9.43it/s]
[Rank 1] Train Epoch 0:  22%|██▏       | 433/2000 [00:58<03:50,  6.80it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 432 | Mem: 26.53MB, Util: 10%  global_step : 432
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 433 | Mem: 26.53MB, Util: 10%  global_step : 433
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 434 | Mem: 26.53MB, Util: 10%  global_step : 434
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 435 | Mem: 26.53MB, Util: 10%  global_step : 435
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 436 | Mem: 26.53MB, Util: 10%  global_step : 436
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 457 | Mem: 26.53MB, Util: 100%  global_step : 457
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 458 | Mem: 26.53MB, Util: 100%  global_step : 458
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 459 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 0] Train Epoch 0:  23%|██▎       | 458/2000 [00:58<02:13, 11.53it/s]
[Rank 2] Train Epoch 0:  23%|██▎       | 466/2000 [00:58<02:01, 12.64it/s]
[Rank 1] Train Epoch 0:  22%|██▏       | 439/2000 [00:58<02:33, 10.19it/s]
[Rank 0] Train Epoch 0:  23%|██▎       | 464/2000 [00:58<01:49, 14.03it/s]
[Rank 1] Train Epoch 0:  22%|██▏       | 443/2000 [00:58<02:04, 12.50it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 441 | Mem: 26.53MB, Util: 10%  global_step : 441
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 442 | Mem: 26.53MB, Util: 30%  global_step : 442
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 443 | Mem: 26.53MB, Util: 30%  global_step : 443
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 444 | Mem: 26.53MB, Util: 30%  global_step : 444
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 445 | Mem: 26.53MB, Util: 30%  global_step : 445
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 466 | Mem: 26.53MB, Util: 100%  global_step : 466
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 467 | Mem: 26.53MB, Util: 100%  global_step : 467
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 468 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 0] Train Epoch 0:  24%|██▎       | 470/2000 [00:58<01:32, 16.46it/s]
[Rank 2] Train Epoch 0:  24%|██▎       | 471/2000 [00:58<01:43, 14.77it/s]
[Rank 1] Train Epoch 0:  22%|██▏       | 448/2000 [00:59<01:41, 15.30it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 448 | Mem: 26.53MB, Util: 23%  global_step : 448
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 449 | Mem: 26.53MB, Util: 23%  global_step : 449
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 450 | Mem: 26.53MB, Util: 23%  global_step : 450
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 473 | Mem: 26.53MB, Util: 100%  global_step : 473
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 474 | Mem: 26.53MB, Util: 100%  global_step : 474
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 475 | Mem: 26.53MB, Util: 100%  global_step : 475
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 475 | Mem: 26.53MB, Util: 100%  global_step : 475
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 476 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 0] Train Epoch 0:  24%|██▍       | 475/2000 [00:59<01:23, 18.26it/s]
[Rank 2] Train Epoch 0:  24%|██▍       | 476/2000 [00:59<01:31, 16.68it/s]
[Rank 1] Train Epoch 0:  23%|██▎       | 452/2000 [00:59<01:31, 16.96it/s]
[Rank 2] Train Epoch 0:  24%|██▍       | 480/2000 [00:59<01:25, 17.82it/s]
[Rank 1] Train Epoch 0:  23%|██▎       | 456/2000 [00:59<01:17, 19.96it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 453 | Mem: 26.53MB, Util: 12%  global_step : 453
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 454 | Mem: 26.53MB, Util: 12%  global_step : 454
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 455 | Mem: 26.53MB, Util: 12%  global_step : 455
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 456 | Mem: 26.53MB, Util: 12%  global_step : 456
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 478 | Mem: 26.53MB, Util: 100%  global_step : 478
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 479 | Mem: 26.53MB, Util: 100%  global_step : 479
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 480 | Mem: 26.53MB, Util: 100%  global_step : 480
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 481 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 0] Train Epoch 0:  24%|██▍       | 480/2000 [00:59<01:18, 19.45it/s]
[Rank 2] Train Epoch 0:  24%|██▍       | 484/2000 [00:59<01:13, 20.54it/s]
[Rank 1] Train Epoch 0:  23%|██▎       | 460/2000 [00:59<01:06, 23.05it/s]
[Rank 0] Train Epoch 0:  24%|██▍       | 484/2000 [00:59<01:09, 21.87it/s]
[Rank 2] Train Epoch 0:  24%|██▍       | 489/2000 [00:59<01:01, 24.54it/s]
[Rank 1] Train Epoch 0:  23%|██▎       | 465/2000 [00:59<00:55, 27.46it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 461 | Mem: 26.53MB, Util: 18%  global_step : 461
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 462 | Mem: 26.53MB, Util: 18%  global_step : 462
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 463 | Mem: 26.53MB, Util: 18%  global_step : 463
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 464 | Mem: 26.53MB, Util: 18%  global_step : 464
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 465 | Mem: 26.53MB, Util: 18%  global_step : 465
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 466 | Mem: 26.53MB, Util: 18%  global_step : 466
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 486 | Mem: 26.53MB, Util: 100%  global_step : 486
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 487 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 0] Train Epoch 0:  24%|██▍       | 489/2000 [00:59<00:59, 25.47it/s]
[Rank 2] Train Epoch 0:  25%|██▍       | 495/2000 [00:59<00:50, 29.97it/s]
[Rank 1] Train Epoch 0:  24%|██▎       | 471/2000 [00:59<00:44, 34.04it/s]
[Rank 0] Train Epoch 0:  25%|██▍       | 495/2000 [00:59<00:49, 30.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 471 | Mem: 26.53MB, Util: 29%  global_step : 471
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 472 | Mem: 26.53MB, Util: 29%  global_step : 472
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 473 | Mem: 26.53MB, Util: 29%  global_step : 473
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 474 | Mem: 26.53MB, Util: 29%  global_step : 474
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 496 | Mem: 26.53MB, Util: 100%  global_step : 496
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 497 | Mem: 26.53MB, Util: 100%  global_step : 497
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 498 | Mem: 26.53MB, Util: 100%  global_step : 498
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 499 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 0] Train Epoch 0:  25%|██▌       | 500/2000 [00:59<00:48, 30.95it/s]
[Rank 2] Train Epoch 0:  25%|██▌       | 500/2000 [00:59<00:49, 30.51it/s]
[Rank 1] Train Epoch 0:  24%|██▍       | 476/2000 [00:59<00:44, 34.56it/s]
[Rank 1] Train Epoch 0:  24%|██▍       | 481/2000 [00:59<00:40, 37.29it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 480 | Mem: 26.53MB, Util: 24%  global_step : 480
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 481 | Mem: 26.53MB, Util: 24%  global_step : 481
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 482 | Mem: 26.53MB, Util: 24%  global_step : 482
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 483 | Mem: 26.53MB, Util: 24%  global_step : 483
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 484 | Mem: 26.53MB, Util: 24%  global_step : 484
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 485 | Mem: 26.53MB, Util: 24%  global_step : 485
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 486 | Mem: 26.53MB, Util: 24%  global_step : 486
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 487 | Mem: 26.53MB, Util: 24%  global_ste

[Rank 1] Train Epoch 0:  24%|██▍       | 487/2000 [00:59<00:35, 42.68it/s]
[Rank 1] Train Epoch 0:  25%|██▍       | 492/2000 [01:00<00:40, 37.55it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 489 | Mem: 26.53MB, Util: 31%  global_step : 489
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 490 | Mem: 26.53MB, Util: 31%  global_step : 490
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 491 | Mem: 26.53MB, Util: 31%  global_step : 491
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 492 | Mem: 26.53MB, Util: 31%  global_step : 492
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 493 | Mem: 26.53MB, Util: 31%  global_step : 493
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 494 | Mem: 26.53MB, Util: 31%  global_step : 494
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 495 | Mem: 26.53MB, Util: 31%  global_step : 495
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 496 | Mem: 26.53MB, Util: 19%  global_ste

[Rank 1] Train Epoch 0:  25%|██▍       | 498/2000 [01:00<00:36, 41.55it/s]
[Rank 2] Train Epoch 0:  25%|██▌       | 504/2000 [01:00<01:26, 17.22it/s]
[Rank 1] Train Epoch 0:  25%|██▌       | 504/2000 [01:00<00:33, 45.22it/s]
[Rank 0] Train Epoch 0:  25%|██▌       | 505/2000 [01:00<01:20, 18.51it/s]
[Rank 2] Train Epoch 0:  26%|██▌       | 518/2000 [01:00<00:44, 33.38it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 499 | Mem: 26.53MB, Util: 19%  global_step : 499
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 500 | Mem: 26.53MB, Util: 19%  global_step : 500
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 501 | Mem: 26.53MB, Util: 19%  global_step : 501
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 502 | Mem: 26.53MB, Util: 19%  global_step : 502
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 503 | Mem: 26.53MB, Util: 19%  global_step : 503
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 504 | Mem: 26.53MB, Util: 19%  global_step : 504
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 500 | Mem: 26.53MB, Util: 100%  global_step : 500
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 501 | Mem: 26.53MB, Util: 100%  global_step 

[Rank 0] Train Epoch 0:  26%|██▌       | 520/2000 [01:00<00:41, 35.55it/s]
[Rank 2] Train Epoch 0:  27%|██▋       | 533/2000 [01:00<00:28, 51.41it/s]
[Rank 1] Train Epoch 0:  25%|██▌       | 509/2000 [01:00<00:34, 43.03it/s]
[Rank 0] Train Epoch 0:  27%|██▋       | 535/2000 [01:00<00:29, 50.48it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 508 | Mem: 26.53MB, Util: 33%  global_step : 508
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 509 | Mem: 26.53MB, Util: 33%  global_step : 509
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 531 | Mem: 26.53MB, Util: 100%  global_step : 531
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 532 | Mem: 26.53MB, Util: 100%  global_step : 532
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 533 | Mem: 26.53MB, Util: 100%  global_step : 533
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 534 | Mem: 26.53MB, Util: 99%  global_step : 534
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 535 | Mem: 26.53MB, Util: 100%  global_step : 535
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 536 | Mem: 26.53MB, Util: 100%  global_step : 5

[Rank 1] Train Epoch 0:  26%|██▌       | 514/2000 [01:05<07:12,  3.44it/s]
[Rank 1] Train Epoch 0:  26%|██▌       | 519/2000 [01:05<05:15,  4.70it/s]
[Rank 0] Train Epoch 0:  27%|██▋       | 543/2000 [01:05<04:04,  5.96it/s]
[Rank 2] Train Epoch 0:  27%|██▋       | 542/2000 [01:05<04:04,  5.96it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 541 | Mem: 26.53MB, Util: 100%  global_step : 541
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 542 | Mem: 26.53MB, Util: 100%  global_step : 542
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 543 | Mem: 26.53MB, Util: 100%  global_step : 543
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 544 | Mem: 26.53MB, Util: 100%  global_step : 544
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 545 | Mem: 26.53MB, Util: 100%  global_step : 545
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 546 | Mem: 26.53MB, Util: 100%  global_step : 546
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 547 | Mem: 26.53MB, Util: 100%  global_step : 547
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 519 | Mem: 26.53MB, Util: 15%  global_ste

[Rank 0] Train Epoch 0:  28%|██▊       | 551/2000 [01:05<03:04,  7.86it/s]
[Rank 2] Train Epoch 0:  28%|██▊       | 550/2000 [01:05<03:04,  7.86it/s]
[Rank 1] Train Epoch 0:  26%|██▋       | 527/2000 [01:05<03:19,  7.38it/s]
[Rank 2] Train Epoch 0:  28%|██▊       | 557/2000 [01:05<02:25,  9.89it/s]
[Rank 1] Train Epoch 0:  27%|██▋       | 533/2000 [01:05<02:27,  9.96it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 553 | Mem: 26.53MB, Util: 100%  global_step : 553
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 554 | Mem: 26.53MB, Util: 100%  global_step : 554
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 555 | Mem: 26.53MB, Util: 100%  global_step : 555
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 556 | Mem: 26.53MB, Util: 100%  global_step : 556
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 557 | Mem: 26.53MB, Util: 100%  global_step : 557
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 558 | Mem: 26.53MB, Util: 100%  global_step : 558
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 559 | Mem: 26.53MB, Util: 100%  global_step : 559
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 530 | Mem: 26.53MB, Util: 31%  global_ste

[Rank 0] Train Epoch 0:  28%|██▊       | 558/2000 [01:05<02:25,  9.90it/s]
[Rank 2] Train Epoch 0:  28%|██▊       | 564/2000 [01:05<01:54, 12.58it/s]
[Rank 1] Train Epoch 0:  27%|██▋       | 539/2000 [01:05<01:51, 13.15it/s]
[Rank 0] Train Epoch 0:  28%|██▊       | 564/2000 [01:05<01:57, 12.17it/s]
[Rank 2] Train Epoch 0:  29%|██▊       | 571/2000 [01:05<01:28, 16.13it/s]
[Rank 1] Train Epoch 0:  27%|██▋       | 546/2000 [01:05<01:21, 17.85it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 564 | Mem: 26.53MB, Util: 100%  global_step : 564
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 565 | Mem: 26.53MB, Util: 100%  global_step : 565
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 566 | Mem: 26.53MB, Util: 100%  global_step : 566
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 567 | Mem: 26.53MB, Util: 100%  global_step : 567
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 568 | Mem: 26.53MB, Util: 100%  global_step : 568
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 569 | Mem: 26.53MB, Util: 100%  global_step : 569
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 570 | Mem: 26.53MB, Util: 100%  global_step : 570
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 542 | Mem: 26.53MB, Util: 34%  global_ste

[Rank 0] Train Epoch 0:  29%|██▊       | 571/2000 [01:05<01:30, 15.77it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 575 | Mem: 26.53MB, Util: 100%  global_step : 575


[Rank 1] Train Epoch 0:  28%|██▊       | 552/2000 [01:07<02:37,  9.20it/s]
[Rank 0] Train Epoch 0:  29%|██▉       | 577/2000 [01:07<02:33,  9.24it/s]
[Rank 2] Train Epoch 0:  29%|██▉       | 577/2000 [01:07<02:31,  9.41it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 551 | Mem: 26.53MB, Util: 34%  global_step : 551
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 576 | Mem: 26.53MB, Util: 100%  global_step : 576
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 576 | Mem: 26.53MB, Util: 100%  global_step : 576
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 577 | Mem: 26.53MB, Util: 100%  global_step : 577
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 552 | Mem: 26.53MB, Util: 0%  global_step : 552
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 553 | Mem: 26.53MB, Util: 0%  global_step : 553
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 554 | Mem: 26.53MB, Util: 0%  global_step : 554
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 555 | Mem: 26.53MB, Util: 0%  global_step : 5

[Rank 1] Train Epoch 0:  28%|██▊       | 556/2000 [01:09<04:51,  4.95it/s]
[Rank 2] Train Epoch 0:  29%|██▉       | 582/2000 [01:09<04:20,  5.44it/s]
[Rank 0] Train Epoch 0:  29%|██▉       | 582/2000 [01:09<04:24,  5.36it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 582 | Mem: 26.53MB, Util: 100%  global_step : 582


[Rank 1] Train Epoch 0:  28%|██▊       | 559/2000 [01:10<05:17,  4.54it/s]
[Rank 0] Train Epoch 0:  29%|██▉       | 585/2000 [01:10<04:47,  4.93it/s]
[Rank 2] Train Epoch 0:  29%|██▉       | 586/2000 [01:10<04:29,  5.24it/s]
[Rank 1] Train Epoch 0:  28%|██▊       | 565/2000 [01:10<03:34,  6.70it/s]
[Rank 0] Train Epoch 0:  30%|██▉       | 591/2000 [01:10<03:24,  6.90it/s]
[Rank 2] Train Epoch 0:  30%|██▉       | 591/2000 [01:10<03:25,  6.86it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 558 | Mem: 26.53MB, Util: 0%  global_step : 558
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 559 | Mem: 26.53MB, Util: 0%  global_step : 559
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 583 | Mem: 26.53MB, Util: 100%  global_step : 583
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 584 | Mem: 26.53MB, Util: 100%  global_step : 584
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 585 | Mem: 26.53MB, Util: 100%  global_step : 585
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 586 | Mem: 26.53MB, Util: 100%  global_step : 586
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m 
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 583 | Mem: 26.53MB, Util: 100%  global_step : 583
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Bat

[Rank 0] Train Epoch 0:  30%|██▉       | 594/2000 [01:10<02:54,  8.04it/s]
[Rank 2] Train Epoch 0:  30%|██▉       | 594/2000 [01:10<02:55,  8.01it/s]
[Rank 1] Train Epoch 0:  28%|██▊       | 569/2000 [01:10<02:54,  8.21it/s]
[Rank 2] Train Epoch 0:  30%|██▉       | 599/2000 [01:10<02:10, 10.71it/s]
[Rank 1] Train Epoch 0:  29%|██▊       | 574/2000 [01:10<02:10, 10.93it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 572 | Mem: 26.53MB, Util: 28%  global_step : 572
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 573 | Mem: 26.53MB, Util: 28%  global_step : 573
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 574 | Mem: 26.53MB, Util: 28%  global_step : 574
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 598 | Mem: 26.53MB, Util: 100%  global_step : 598
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 599 | Mem: 26.53MB, Util: 100%  global_step : 599
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 599 | Mem: 26.53MB, Util: 100%  global_step : 599
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 575 | Mem: 26.53MB, Util: 22%  global_step : 575
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 576 | Mem: 26.53MB, Util: 22%  global_step 

[Rank 0] Train Epoch 0:  30%|██▉       | 599/2000 [01:10<02:10, 10.74it/s]
[Rank 1] Train Epoch 0:  29%|██▉       | 578/2000 [01:10<01:50, 12.90it/s]
[Rank 1] Train Epoch 0:  29%|██▉       | 582/2000 [01:11<01:30, 15.65it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 578 | Mem: 26.53MB, Util: 22%  global_step : 578
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 579 | Mem: 26.53MB, Util: 22%  global_step : 579
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 580 | Mem: 26.53MB, Util: 22%  global_step : 580
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 581 | Mem: 26.53MB, Util: 22%  global_step : 581
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 582 | Mem: 26.53MB, Util: 20%  global_step : 582
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 583 | Mem: 26.53MB, Util: 0%  global_step : 583
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 584 | Mem: 26.53MB, Util: 0%  global_step : 584
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 585 | Mem: 26.53MB, Util: 4%  global_step :

[Rank 1] Train Epoch 0:  29%|██▉       | 586/2000 [01:17<12:26,  1.89it/s]
[Rank 1] Train Epoch 0:  29%|██▉       | 589/2000 [01:18<09:44,  2.41it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 588 | Mem: 26.53MB, Util: 4%  global_step : 588
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 589 | Mem: 26.53MB, Util: 4%  global_step : 589
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 590 | Mem: 26.53MB, Util: 4%  global_step : 590
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 591 | Mem: 26.53MB, Util: 4%  global_step : 591
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 592 | Mem: 26.53MB, Util: 12%  global_step : 592
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 593 | Mem: 26.53MB, Util: 12%  global_step : 593
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 594 | Mem: 26.53MB, Util: 12%  global_step : 594
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 595 | Mem: 26.53MB, Util: 12%  global_step : 

[Rank 1] Train Epoch 0:  30%|██▉       | 594/2000 [01:18<06:28,  3.62it/s]
[Rank 1] Train Epoch 0:  30%|██▉       | 597/2000 [01:18<05:12,  4.50it/s]
[Rank 2] Train Epoch 0:  30%|███       | 603/2000 [01:18<13:09,  1.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 596 | Mem: 26.53MB, Util: 12%  global_step : 596
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 597 | Mem: 26.53MB, Util: 24%  global_step : 597
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 600 | Mem: 26.53MB, Util: 100%  global_step : 600
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 601 | Mem: 26.53MB, Util: 100%  global_step : 601
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 602 | Mem: 26.53MB, Util: 100%  global_step : 602
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 598 | Mem: 26.53MB, Util: 24%  global_step : 598
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 599 | Mem: 26.53MB, Util: 24%  global_step : 599
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 600 | Mem: 26.53MB, Util: 24%  global_ste

[Rank 0] Train Epoch 0:  30%|███       | 603/2000 [01:18<13:07,  1.77it/s]
[Rank 1] Train Epoch 0:  30%|███       | 603/2000 [01:18<03:19,  7.00it/s]
[Rank 2] Train Epoch 0:  31%|███       | 617/2000 [01:18<05:44,  4.01it/s]
[Rank 0] Train Epoch 0:  31%|███       | 618/2000 [01:18<05:30,  4.18it/s]
[Rank 2] Train Epoch 0:  32%|███▏      | 631/2000 [01:18<03:12,  7.10it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 610 | Mem: 26.53MB, Util: 97%  global_step : 610
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 611 | Mem: 26.53MB, Util: 97%  global_step : 611
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 612 | Mem: 26.53MB, Util: 97%  global_step : 612
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 613 | Mem: 26.53MB, Util: 97%  global_step : 613
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 614 | Mem: 26.53MB, Util: 97%  global_step : 614
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 615 | Mem: 26.53MB, Util: 97%  global_step : 615
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 616 | Mem: 26.53MB, Util: 97%  global_step : 616
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 617 | Mem: 26.53MB, Util: 97%  global_step : 617
[36m(Ra

[Rank 0] Train Epoch 0:  32%|███▏      | 632/2000 [01:18<03:12,  7.11it/s]
[Rank 1] Train Epoch 0:  30%|███       | 607/2000 [01:18<02:45,  8.43it/s]
[Rank 1] Train Epoch 0:  31%|███       | 612/2000 [01:18<02:00, 11.54it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 631 | Mem: 26.53MB, Util: 100%  global_step : 631
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 632 | Mem: 26.53MB, Util: 100%  global_step : 632
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 633 | Mem: 26.53MB, Util: 100%  global_step : 633
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 606 | Mem: 26.53MB, Util: 24%  global_step : 606
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 607 | Mem: 26.53MB, Util: 18%  global_step : 607
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 608 | Mem: 26.53MB, Util: 18%  global_step : 608
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 609 | Mem: 26.53MB, Util: 18%  global_step : 609
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 610 | Mem: 26.53MB, Util: 12%  global_step :

[Rank 0] Train Epoch 0:  32%|███▏      | 639/2000 [01:18<02:34,  8.80it/s]
[Rank 1] Train Epoch 0:  31%|███       | 616/2000 [01:18<01:40, 13.81it/s]
[Rank 2] Train Epoch 0:  32%|███▏      | 640/2000 [01:18<02:30,  9.07it/s]
[Rank 0] Train Epoch 0:  32%|███▏      | 645/2000 [01:19<02:08, 10.54it/s]
[Rank 1] Train Epoch 0:  31%|███       | 620/2000 [01:19<01:24, 16.29it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 637 | Mem: 26.53MB, Util: 100%  global_step : 637
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 638 | Mem: 26.53MB, Util: 100%  global_step : 638
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 639 | Mem: 26.53MB, Util: 100%  global_step : 639
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 640 | Mem: 26.53MB, Util: 100%  global_step : 640
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 614 | Mem: 26.53MB, Util: 12%  global_step : 614
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 615 | Mem: 26.53MB, Util: 12%  global_step : 615
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 616 | Mem: 26.53MB, Util: 12%  global_step : 616
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 639 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 2] Train Epoch 0:  32%|███▏      | 647/2000 [01:19<02:03, 10.98it/s]
[Rank 0] Train Epoch 0:  32%|███▎      | 650/2000 [01:19<01:49, 12.31it/s]
[Rank 1] Train Epoch 0:  31%|███       | 624/2000 [01:19<01:13, 18.70it/s]
[Rank 2] Train Epoch 0:  33%|███▎      | 653/2000 [01:19<01:40, 13.41it/s]
[Rank 1] Train Epoch 0:  31%|███▏      | 629/2000 [01:19<01:25, 16.12it/s]
[Rank 0] Train Epoch 0:  33%|███▎      | 655/2000 [01:19<01:49, 12.25it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 628 | Mem: 26.53MB, Util: 22%  global_step : 628
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 653 | Mem: 26.53MB, Util: 100%  global_step : 653
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 654 | Mem: 26.53MB, Util: 100%  global_step : 654
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 653 | Mem: 26.53MB, Util: 100%  global_step : 653
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 654 | Mem: 26.53MB, Util: 100%  global_step : 654
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 629 | Mem: 26.53MB, Util: 0%  global_step : 629
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 630 | Mem: 26.53MB, Util: 0%  global_step : 630
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 655 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 0] Train Epoch 0:  33%|███▎      | 659/2000 [01:19<01:41, 13.23it/s]
[Rank 1] Train Epoch 0:  32%|███▏      | 632/2000 [01:19<01:28, 15.43it/s]
[Rank 2] Train Epoch 0:  33%|███▎      | 659/2000 [01:19<01:47, 12.46it/s]
[Rank 2] Train Epoch 0:  33%|███▎      | 664/2000 [01:20<01:35, 14.05it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 663 | Mem: 26.53MB, Util: 100%  global_step : 663
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 663 | Mem: 26.53MB, Util: 100%  global_step : 663
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 664 | Mem: 26.53MB, Util: 100%  global_step : 664
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 665 | Mem: 26.53MB, Util: 100%  global_step : 665
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 638 | Mem: 26.53MB, Util: 15%  global_step : 638
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 639 | Mem: 26.53MB, Util: 15%  global_step : 639
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 640 | Mem: 26.53MB, Util: 15%  global_step : 640
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 641 | Mem: 26.53MB, Util: 15%  global_step :

[Rank 0] Train Epoch 0:  33%|███▎      | 664/2000 [01:20<01:28, 15.15it/s]
[Rank 1] Train Epoch 0:  32%|███▏      | 639/2000 [01:20<01:08, 19.74it/s]
[Rank 0] Train Epoch 0:  33%|███▎      | 668/2000 [01:20<01:15, 17.62it/s]
[Rank 1] Train Epoch 0:  32%|███▏      | 643/2000 [01:20<01:00, 22.30it/s]
[Rank 2] Train Epoch 0:  33%|███▎      | 668/2000 [01:20<01:22, 16.11it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 667 | Mem: 26.53MB, Util: 100%  global_step : 667
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 668 | Mem: 26.53MB, Util: 100%  global_step : 668
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 669 | Mem: 26.53MB, Util: 100%  global_step : 669
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 670 | Mem: 26.53MB, Util: 100%  global_step : 670
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 669 | Mem: 26.53MB, Util: 100%  global_step : 669
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 670 | Mem: 26.53MB, Util: 100%  global_step : 670
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 671 | Mem: 26.53MB, Util: 100%  global_step : 671
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 672 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 0] Train Epoch 0:  34%|███▎      | 672/2000 [01:20<01:06, 19.91it/s]
[Rank 1] Train Epoch 0:  32%|███▏      | 647/2000 [01:20<00:55, 24.45it/s]
[Rank 2] Train Epoch 0:  34%|███▎      | 672/2000 [01:20<01:13, 18.19it/s]
[Rank 1] Train Epoch 0:  33%|███▎      | 653/2000 [01:20<00:45, 29.35it/s]
[Rank 2] Train Epoch 0:  34%|███▍      | 678/2000 [01:20<00:58, 22.57it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 676 | Mem: 26.53MB, Util: 100%  global_step : 676
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 677 | Mem: 26.53MB, Util: 100%  global_step : 677
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 678 | Mem: 26.53MB, Util: 100%  global_step : 678
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 677 | Mem: 26.53MB, Util: 100%  global_step : 677
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 678 | Mem: 26.53MB, Util: 100%  global_step : 678
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 679 | Mem: 26.53MB, Util: 100%  global_step : 679
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 680 | Mem: 26.53MB, Util: 100%  global_step : 680
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 654 | Mem: 26.53MB, Util: 17%  global_step : 

[Rank 0] Train Epoch 0:  34%|███▍      | 678/2000 [01:20<00:53, 24.73it/s]
[Rank 1] Train Epoch 0:  33%|███▎      | 657/2000 [01:20<00:44, 29.99it/s]
[Rank 2] Train Epoch 0:  34%|███▍      | 682/2000 [01:20<00:54, 24.24it/s]
[Rank 0] Train Epoch 0:  34%|███▍      | 682/2000 [01:20<00:50, 26.20it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 682 | Mem: 26.53MB, Util: 100%  global_step : 682
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 683 | Mem: 26.53MB, Util: 100%  global_step : 683
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 684 | Mem: 26.53MB, Util: 100%  global_step : 684
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 684 | Mem: 26.53MB, Util: 100%  global_step : 684
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 685 | Mem: 26.53MB, Util: 100%  global_step : 685
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 686 | Mem: 26.53MB, Util: 100%  global_step : 686
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 687 | Mem: 26.53MB, Util: 100%  global_step : 687
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 659 | Mem: 26.53MB, Util: 26%  global_step : 

[Rank 0] Train Epoch 0:  34%|███▍      | 686/2000 [01:20<00:48, 26.87it/s]
[Rank 1] Train Epoch 0:  33%|███▎      | 661/2000 [01:20<00:45, 29.44it/s]
[Rank 2] Train Epoch 0:  34%|███▍      | 686/2000 [01:20<00:51, 25.32it/s]
[Rank 0] Train Epoch 0:  35%|███▍      | 692/2000 [01:20<00:39, 33.29it/s]
[Rank 1] Train Epoch 0:  33%|███▎      | 667/2000 [01:20<00:36, 36.05it/s]
[Rank 2] Train Epoch 0:  35%|███▍      | 692/2000 [01:20<00:41, 31.51it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 691 | Mem: 26.53MB, Util: 100%  global_step : 691
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 692 | Mem: 26.53MB, Util: 100%  global_step : 692
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 693 | Mem: 26.53MB, Util: 100%  global_step : 693
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 694 | Mem: 26.53MB, Util: 100%  global_step : 694
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 693 | Mem: 26.53MB, Util: 100%  global_step : 693
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 694 | Mem: 26.53MB, Util: 100%  global_step : 694
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 695 | Mem: 26.53MB, Util: 100%  global_step : 695
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 696 | Mem: 26.53MB, Util: 100%  global_step : 

[Rank 0] Train Epoch 0:  35%|███▍      | 697/2000 [01:20<00:36, 35.59it/s]
[Rank 1] Train Epoch 0:  34%|███▎      | 671/2000 [01:20<00:36, 36.26it/s]
[Rank 2] Train Epoch 0:  35%|███▍      | 697/2000 [01:20<00:38, 34.12it/s]
[Rank 1] Train Epoch 0:  34%|███▍      | 678/2000 [01:20<00:29, 44.49it/s]
[Rank 1] Train Epoch 0:  34%|███▍      | 683/2000 [01:21<01:20, 16.28it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 682 | Mem: 26.53MB, Util: 29%  global_step : 682


[Rank 1] Train Epoch 0:  34%|███▍      | 687/2000 [01:23<03:09,  6.92it/s]
[Rank 1] Train Epoch 0:  35%|███▍      | 692/2000 [01:23<02:20,  9.30it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 683 | Mem: 26.53MB, Util: 0%  global_step : 683
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 684 | Mem: 26.53MB, Util: 0%  global_step : 684
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 685 | Mem: 26.53MB, Util: 0%  global_step : 685
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 686 | Mem: 26.53MB, Util: 0%  global_step : 686
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 687 | Mem: 26.53MB, Util: 0%  global_step : 687
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 688 | Mem: 26.53MB, Util: 0%  global_step : 688
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 689 | Mem: 26.53MB, Util: 0%  global_step : 689
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 690 | Mem: 26.53MB, Util: 0%  global_step : 690


[Rank 2] Train Epoch 0:  35%|███▌      | 702/2000 [01:23<03:55,  5.51it/s]
[Rank 0] Train Epoch 0:  35%|███▌      | 702/2000 [01:23<03:56,  5.48it/s]
[Rank 1] Train Epoch 0:  35%|███▍      | 699/2000 [01:23<01:34, 13.78it/s]
[Rank 2] Train Epoch 0:  36%|███▌      | 716/2000 [01:23<01:51, 11.50it/s]
[Rank 0] Train Epoch 0:  36%|███▌      | 717/2000 [01:23<01:47, 11.97it/s]
[Rank 1] Train Epoch 0:  35%|███▌      | 704/2000 [01:23<01:16, 16.91it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 702 | Mem: 26.53MB, Util: 24%  global_step : 702
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 703 | Mem: 26.53MB, Util: 24%  global_step : 703
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 704 | Mem: 26.53MB, Util: 24%  global_step : 704
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 705 | Mem: 26.53MB, Util: 29%  global_step : 705
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 706 | Mem: 26.53MB, Util: 29%  global_step : 706
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 716 | Mem: 26.53MB, Util: 100%  global_step : 716
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 717 | Mem: 26.53MB, Util: 100%  global_step : 717
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 718 | Mem: 26.53MB, Util: 100%  global_ste

[Rank 2] Train Epoch 0:  36%|███▋      | 730/2000 [01:23<01:06, 19.17it/s]
[Rank 0] Train Epoch 0:  37%|███▋      | 732/2000 [01:23<01:02, 20.27it/s]
[Rank 1] Train Epoch 0:  35%|███▌      | 709/2000 [01:23<01:07, 19.24it/s]
[Rank 1] Train Epoch 0:  36%|███▌      | 714/2000 [01:23<00:55, 23.25it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 709 | Mem: 26.53MB, Util: 29%  global_step : 709
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 710 | Mem: 26.53MB, Util: 29%  global_step : 710
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 711 | Mem: 26.53MB, Util: 20%  global_step : 711
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 712 | Mem: 26.53MB, Util: 20%  global_step : 712
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 713 | Mem: 26.53MB, Util: 20%  global_step : 713
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 714 | Mem: 26.53MB, Util: 20%  global_step : 714
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 735 | Mem: 26.53MB, Util: 99%  global_step : 735
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 736 | Mem: 26.53MB, Util: 100%  global_step

[Rank 2] Train Epoch 0:  37%|███▋      | 739/2000 [01:23<00:54, 23.15it/s]
[Rank 0] Train Epoch 0:  37%|███▋      | 741/2000 [01:24<00:53, 23.63it/s]
[Rank 1] Train Epoch 0:  36%|███▌      | 721/2000 [01:24<00:42, 30.28it/s]
[Rank 2] Train Epoch 0:  37%|███▋      | 747/2000 [01:24<00:44, 27.91it/s]
[Rank 0] Train Epoch 0:  37%|███▋      | 749/2000 [01:24<00:44, 28.06it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 721 | Mem: 26.53MB, Util: 20%  global_step : 721
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 722 | Mem: 26.53MB, Util: 20%  global_step : 722
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 723 | Mem: 26.53MB, Util: 32%  global_step : 723
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 724 | Mem: 26.53MB, Util: 32%  global_step : 724
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 725 | Mem: 26.53MB, Util: 32%  global_step : 725
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 747 | Mem: 26.53MB, Util: 100%  global_step : 747
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 748 | Mem: 26.53MB, Util: 100%  global_step : 748
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 749 | Mem: 26.53MB, Util: 100%  global_ste

[Rank 1] Train Epoch 0:  36%|███▋      | 727/2000 [01:24<00:37, 34.40it/s]
[Rank 2] Train Epoch 0:  38%|███▊      | 755/2000 [01:24<00:38, 32.08it/s]
[Rank 0] Train Epoch 0:  38%|███▊      | 757/2000 [01:24<00:38, 32.26it/s]
[Rank 1] Train Epoch 0:  37%|███▋      | 734/2000 [01:24<00:31, 39.68it/s]
[Rank 2] Train Epoch 0:  38%|███▊      | 762/2000 [01:24<00:35, 34.78it/s]
[Rank 0] Train Epoch 0:  38%|███▊      | 764/2000 [01:24<00:36, 34.30it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 733 | Mem: 26.53MB, Util: 32%  global_step : 733
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 734 | Mem: 26.53MB, Util: 32%  global_step : 734
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 735 | Mem: 26.53MB, Util: 32%  global_step : 735
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 736 | Mem: 26.53MB, Util: 32%  global_step : 736
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 758 | Mem: 26.53MB, Util: 100%  global_step : 758
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 759 | Mem: 26.53MB, Util: 100%  global_step : 759
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 760 | Mem: 26.53MB, Util: 100%  global_step : 760
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 761 | Mem: 26.53MB, Util: 100%  global_ste

[Rank 1] Train Epoch 0:  37%|███▋      | 740/2000 [01:24<00:30, 40.79it/s]
[Rank 1] Train Epoch 0:  37%|███▋      | 745/2000 [01:24<00:31, 39.66it/s]
[Rank 2] Train Epoch 0:  38%|███▊      | 768/2000 [01:24<00:35, 34.91it/s]
[Rank 0] Train Epoch 0:  38%|███▊      | 770/2000 [01:24<00:34, 35.39it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 740 | Mem: 26.53MB, Util: 32%  global_step : 740
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 741 | Mem: 26.53MB, Util: 28%  global_step : 741
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 742 | Mem: 26.53MB, Util: 28%  global_step : 742
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 743 | Mem: 26.53MB, Util: 28%  global_step : 743
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 744 | Mem: 26.53MB, Util: 28%  global_step : 744
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 765 | Mem: 26.53MB, Util: 100%  global_step : 765
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 766 | Mem: 26.53MB, Util: 100%  global_step : 766
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 767 | Mem: 26.53MB, Util: 100%  global_ste

[Rank 1] Train Epoch 0:  38%|███▊      | 750/2000 [01:25<01:18, 16.02it/s]
[Rank 2] Train Epoch 0:  39%|███▊      | 774/2000 [01:25<01:09, 17.70it/s]
[Rank 0] Train Epoch 0:  39%|███▉      | 776/2000 [01:25<01:08, 17.87it/s]
[Rank 1] Train Epoch 0:  38%|███▊      | 757/2000 [01:25<00:56, 21.96it/s]
[Rank 2] Train Epoch 0:  39%|███▉      | 782/2000 [01:25<00:52, 23.33it/s]
[Rank 0] Train Epoch 0:  39%|███▉      | 783/2000 [01:25<00:53, 22.85it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 780 | Mem: 26.53MB, Util: 100%  global_step : 780
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 781 | Mem: 26.53MB, Util: 100%  global_step : 781
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 782 | Mem: 26.53MB, Util: 100%  global_step : 782
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 783 | Mem: 26.53MB, Util: 100%  global_step : 783
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 784 | Mem: 26.53MB, Util: 100%  global_step : 784
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 785 | Mem: 26.53MB, Util: 100%  global_step : 785
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 757 | Mem: 26.53MB, Util: 21%  global_step : 757
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 758 | Mem: 26.53MB, Util: 21%  global_step : 75

[Rank 1] Train Epoch 0:  38%|███▊      | 762/2000 [01:25<00:48, 25.75it/s]
[Rank 2] Train Epoch 0:  39%|███▉      | 787/2000 [01:25<00:45, 26.50it/s]
[Rank 0] Train Epoch 0:  39%|███▉      | 788/2000 [01:25<00:46, 25.90it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 790 | Mem: 26.53MB, Util: 100%  global_step : 790
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 791 | Mem: 26.53MB, Util: 100%  global_step : 791
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 766 | Mem: 26.53MB, Util: 27%  global_step : 766
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 767 | Mem: 26.53MB, Util: 27%  global_step : 767
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 791 | Mem: 26.53MB, Util: 100%  global_step : 791
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 792 | Mem: 26.53MB, Util: 100%  global_step : 792
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 793 | Mem: 26.53MB, Util: 100%  global_step : 793
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 792 | Mem: 26.53MB, Util: 100%  global_step :

[Rank 1] Train Epoch 0:  38%|███▊      | 767/2000 [01:25<00:45, 26.91it/s]
[Rank 2] Train Epoch 0:  40%|███▉      | 792/2000 [01:25<00:44, 27.38it/s]
[Rank 0] Train Epoch 0:  40%|███▉      | 793/2000 [01:25<00:46, 26.11it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 795 | Mem: 26.53MB, Util: 100%  global_step : 795
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 796 | Mem: 26.53MB, Util: 100%  global_step : 796
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 771 | Mem: 26.53MB, Util: 17%  global_step : 771
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 772 | Mem: 26.53MB, Util: 17%  global_step : 772
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 796 | Mem: 26.53MB, Util: 100%  global_step : 796
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 797 | Mem: 26.53MB, Util: 100%  global_step : 797
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 797 | Mem: 26.53MB, Util: 100%  global_step : 797


[Rank 1] Train Epoch 0:  39%|███▊      | 772/2000 [01:26<00:48, 25.37it/s]
[Rank 2] Train Epoch 0:  40%|███▉      | 797/2000 [01:26<00:46, 25.78it/s]
[Rank 0] Train Epoch 0:  40%|███▉      | 798/2000 [01:26<00:46, 25.97it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 773 | Mem: 26.53MB, Util: 17%  global_step : 773
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 0, Batch 798 | Mem: 26.53MB, Util: 100%  global_step : 798
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 0, Batch 798 | Mem: 26.53MB, Util: 100%  global_step : 798


[Rank 1] Train Epoch 0:  39%|███▉      | 776/2000 [01:35<12:28,  1.64it/s]
[Rank 1] Train Epoch 0:  40%|███▉      | 790/2000 [01:35<05:38,  3.57it/s]
[Rank 0] Train Epoch 0:  40%|████      | 802/2000 [01:35<11:47,  1.69it/s]
[Rank 2] Train Epoch 0:  40%|████      | 801/2000 [01:35<11:40,  1.71it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 774 | Mem: 26.53MB, Util: 0%  global_step : 774
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 775 | Mem: 26.53MB, Util: 0%  global_step : 775
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 776 | Mem: 26.53MB, Util: 0%  global_step : 776
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 777 | Mem: 26.53MB, Util: 0%  global_step : 777
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 778 | Mem: 26.53MB, Util: 0%  global_step : 778
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 779 | Mem: 26.53MB, Util: 0%  global_step : 779
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 780 | Mem: 26.53MB, Util: 0%  global_step : 780
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 781 | Mem: 26.53MB, Util: 0%  global_step : 781


[Rank 1] Train Epoch 0:  40%|████      | 805/2000 [01:35<03:05,  6.44it/s]
[Rank 0] Train Epoch 0:  41%|████      | 817/2000 [01:35<05:17,  3.73it/s]
[Rank 2] Train Epoch 0:  41%|████      | 815/2000 [01:35<05:28,  3.61it/s]
[Rank 1] Train Epoch 0:  41%|████      | 819/2000 [01:35<01:57, 10.06it/s]
[Rank 0] Train Epoch 0:  42%|████▏     | 831/2000 [01:35<03:03,  6.35it/s]
[Rank 2] Train Epoch 0:  41%|████▏     | 829/2000 [01:35<03:07,  6.23it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 801 | Mem: 26.53MB, Util: 0%  global_step : 801
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 802 | Mem: 26.53MB, Util: 57%  global_step : 802
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 803 | Mem: 26.53MB, Util: 57%  global_step : 803
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 804 | Mem: 26.53MB, Util: 57%  global_step : 804
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 805 | Mem: 26.53MB, Util: 57%  global_step : 805
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 806 | Mem: 26.53MB, Util: 57%  global_step : 806
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 807 | Mem: 26.53MB, Util: 57%  global_step : 807
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 808 | Mem: 26.53MB, Util: 57%  global_step

[Rank 1] Train Epoch 0:  42%|████▏     | 833/2000 [01:35<01:18, 14.91it/s]
[Rank 0] Train Epoch 0:  42%|████▏     | 845/2000 [01:36<01:56,  9.93it/s]
[Rank 2] Train Epoch 0:  42%|████▏     | 843/2000 [01:36<01:58,  9.80it/s]
[Rank 1] Train Epoch 0:  42%|████▏     | 848/2000 [01:36<00:53, 21.73it/s]
[Rank 0] Train Epoch 0:  43%|████▎     | 859/2000 [01:36<01:17, 14.71it/s]
[Rank 2] Train Epoch 0:  43%|████▎     | 857/2000 [01:36<01:18, 14.57it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 829 | Mem: 26.53MB, Util: 57%  global_step : 829
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 830 | Mem: 26.53MB, Util: 57%  global_step : 830
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 831 | Mem: 26.53MB, Util: 96%  global_step : 831
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 832 | Mem: 26.53MB, Util: 96%  global_step : 832
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 833 | Mem: 26.53MB, Util: 96%  global_step : 833
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 834 | Mem: 26.53MB, Util: 96%  global_step : 834
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 835 | Mem: 26.53MB, Util: 96%  global_step : 835
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 836 | Mem: 26.53MB, Util: 96%  global_ste

[Rank 1] Train Epoch 0:  43%|████▎     | 863/2000 [01:36<00:37, 30.28it/s]
[Rank 0] Train Epoch 0:  44%|████▎     | 874/2000 [01:36<00:52, 21.38it/s]
[Rank 2] Train Epoch 0:  44%|████▎     | 871/2000 [01:36<00:54, 20.79it/s]
[Rank 1] Train Epoch 0:  44%|████▍     | 877/2000 [01:36<00:28, 39.83it/s]
[Rank 0] Train Epoch 0:  44%|████▍     | 888/2000 [01:36<00:38, 29.25it/s]
[Rank 2] Train Epoch 0:  44%|████▍     | 885/2000 [01:36<00:38, 28.66it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 859 | Mem: 26.53MB, Util: 96%  global_step : 859
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 860 | Mem: 26.53MB, Util: 96%  global_step : 860
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 861 | Mem: 26.53MB, Util: 96%  global_step : 861
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 862 | Mem: 26.53MB, Util: 96%  global_step : 862
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 863 | Mem: 26.53MB, Util: 100%  global_step : 863
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 864 | Mem: 26.53MB, Util: 100%  global_step : 864
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 865 | Mem: 26.53MB, Util: 100%  global_step : 865
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 866 | Mem: 26.53MB, Util: 100%  global

[Rank 1] Train Epoch 0:  45%|████▍     | 892/2000 [01:36<00:21, 51.98it/s]
[Rank 0] Train Epoch 0:  45%|████▌     | 901/2000 [01:36<00:29, 37.66it/s]
[Rank 2] Train Epoch 0:  45%|████▍     | 899/2000 [01:36<00:28, 38.18it/s]
[Rank 1] Train Epoch 0:  45%|████▌     | 906/2000 [01:36<00:17, 61.80it/s]
[Rank 2] Train Epoch 0:  46%|████▌     | 913/2000 [01:36<00:22, 49.15it/s]
[Rank 0] Train Epoch 0:  46%|████▌     | 916/2000 [01:36<00:21, 49.55it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 889 | Mem: 26.53MB, Util: 100%  global_step : 889
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 890 | Mem: 26.53MB, Util: 100%  global_step : 890
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 891 | Mem: 26.53MB, Util: 100%  global_step : 891
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 892 | Mem: 26.53MB, Util: 100%  global_step : 892
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 893 | Mem: 26.53MB, Util: 100%  global_step : 893
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 894 | Mem: 26.53MB, Util: 100%  global_step : 894
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 895 | Mem: 26.53MB, Util: 100%  global_step : 895
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 896 | Mem: 26.53MB, Util: 100%  gl

[Rank 1] Train Epoch 0:  46%|████▌     | 922/2000 [01:36<00:14, 76.77it/s]
[Rank 2] Train Epoch 0:  46%|████▋     | 927/2000 [01:36<00:17, 61.18it/s]
[Rank 0] Train Epoch 0:  47%|████▋     | 931/2000 [01:36<00:17, 62.62it/s]
[Rank 1] Train Epoch 0:  47%|████▋     | 938/2000 [01:36<00:11, 91.35it/s]
[Rank 2] Train Epoch 0:  47%|████▋     | 941/2000 [01:36<00:14, 73.55it/s]
[Rank 0] Train Epoch 0:  47%|████▋     | 946/2000 [01:36<00:13, 75.86it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 915 | Mem: 26.53MB, Util: 100%  global_step : 915
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 916 | Mem: 26.53MB, Util: 100%  global_step : 916
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 917 | Mem: 26.53MB, Util: 100%  global_step : 917
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 918 | Mem: 26.53MB, Util: 98%  global_step : 918
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 919 | Mem: 26.53MB, Util: 98%  global_step : 919
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 920 | Mem: 26.53MB, Util: 98%  global_step : 920
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 921 | Mem: 26.53MB, Util: 98%  global_step : 921
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 922 | Mem: 26.53MB, Util: 98%  global_

[Rank 1] Train Epoch 0:  48%|████▊     | 954/2000 [01:36<00:09, 104.62it/s]
[Rank 2] Train Epoch 0:  48%|████▊     | 955/2000 [01:36<00:12, 85.64it/s]
[Rank 0] Train Epoch 0:  48%|████▊     | 960/2000 [01:36<00:11, 87.61it/s]
[Rank 1] Train Epoch 0:  48%|████▊     | 970/2000 [01:36<00:08, 115.47it/s]
[Rank 2] Train Epoch 0:  48%|████▊     | 969/2000 [01:36<00:10, 96.47it/s]
[Rank 0] Train Epoch 0:  49%|████▉     | 975/2000 [01:37<00:10, 99.43it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 947 | Mem: 26.53MB, Util: 100%  global_step : 947
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 948 | Mem: 26.53MB, Util: 100%  global_step : 948
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 949 | Mem: 26.53MB, Util: 100%  global_step : 949
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 950 | Mem: 26.53MB, Util: 100%  global_step : 950
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 951 | Mem: 26.53MB, Util: 100%  global_step : 951
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 952 | Mem: 26.53MB, Util: 100%  global_step : 952
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 953 | Mem: 26.53MB, Util: 100%  global_step : 953
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 954 | Mem: 26.53MB, Util: 100%  gl

[Rank 1] Train Epoch 0:  49%|████▉     | 985/2000 [01:37<00:08, 122.51it/s]
[Rank 2] Train Epoch 0:  49%|████▉     | 983/2000 [01:37<00:09, 105.96it/s]
[Rank 0] Train Epoch 0:  50%|████▉     | 990/2000 [01:37<00:09, 109.92it/s]
[Rank 1] Train Epoch 0:  50%|█████     | 1000/2000 [01:37<00:07, 128.15it/s]
[Rank 2] Train Epoch 0:  50%|████▉     | 997/2000 [01:37<00:08, 113.21it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 978 | Mem: 26.53MB, Util: 100%  global_step : 978
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 979 | Mem: 26.53MB, Util: 100%  global_step : 979
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 980 | Mem: 26.53MB, Util: 100%  global_step : 980
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 981 | Mem: 26.53MB, Util: 100%  global_step : 981
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 982 | Mem: 26.53MB, Util: 100%  global_step : 982
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 983 | Mem: 26.53MB, Util: 100%  global_step : 983
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 984 | Mem: 26.53MB, Util: 100%  global_step : 984
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 985 | Mem: 26.53MB, Util: 100%  gl

[Rank 2] Train Epoch 0:  51%|█████     | 1011/2000 [01:37<00:08, 119.01it/s]
[Rank 0] Train Epoch 0:  50%|█████     | 1004/2000 [01:37<00:09, 109.79it/s]
[Rank 1] Train Epoch 0:  51%|█████     | 1015/2000 [01:37<00:08, 113.64it/s]
[Rank 2] Train Epoch 0:  51%|█████▏    | 1025/2000 [01:37<00:07, 123.73it/s]
[Rank 0] Train Epoch 0:  51%|█████     | 1019/2000 [01:37<00:08, 118.65it/s]
[Rank 1] Train Epoch 0:  52%|█████▏    | 1030/2000 [01:37<00:08, 120.44it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1029 | Mem: 26.53MB, Util: 97%  global_step : 1029
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1030 | Mem: 26.53MB, Util: 97%  global_step : 1030
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1031 | Mem: 26.53MB, Util: 97%  global_step : 1031
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1032 | Mem: 26.53MB, Util: 97%  global_step : 1032
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1033 | Mem: 26.53MB, Util: 97%  global_step : 1033
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1034 | Mem: 26.53MB, Util: 97%  global_step : 1034
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1035 | Mem: 26.53MB, Util: 97%  global_step : 1035
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1036 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  52%|█████▏    | 1039/2000 [01:37<00:07, 127.34it/s]
[Rank 0] Train Epoch 0:  52%|█████▏    | 1034/2000 [01:37<00:07, 125.25it/s]
[Rank 1] Train Epoch 0:  52%|█████▏    | 1045/2000 [01:37<00:07, 126.44it/s]
[Rank 2] Train Epoch 0:  53%|█████▎    | 1053/2000 [01:37<00:07, 129.13it/s]
[Rank 0] Train Epoch 0:  52%|█████▏    | 1049/2000 [01:37<00:07, 130.42it/s]
[Rank 1] Train Epoch 0:  53%|█████▎    | 1061/2000 [01:37<00:07, 133.32it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1060 | Mem: 26.53MB, Util: 100%  global_step : 1060
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1061 | Mem: 26.53MB, Util: 100%  global_step : 1061
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1062 | Mem: 26.53MB, Util: 100%  global_step : 1062
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1063 | Mem: 26.53MB, Util: 100%  global_step : 1063
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1064 | Mem: 26.53MB, Util: 100%  global_step : 1064
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1065 | Mem: 26.53MB, Util: 100%  global_step : 1065
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1066 | Mem: 26.53MB, Util: 100%  global_step : 1066
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1067 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  53%|█████▎    | 1067/2000 [01:37<00:07, 131.04it/s]
[Rank 0] Train Epoch 0:  53%|█████▎    | 1064/2000 [01:37<00:06, 133.90it/s]
[Rank 1] Train Epoch 0:  54%|█████▍    | 1076/2000 [01:37<00:06, 136.37it/s]
[Rank 2] Train Epoch 0:  54%|█████▍    | 1081/2000 [01:37<00:06, 133.25it/s]
[Rank 0] Train Epoch 0:  54%|█████▍    | 1079/2000 [01:37<00:06, 136.43it/s]
[Rank 1] Train Epoch 0:  55%|█████▍    | 1091/2000 [01:37<00:06, 139.07it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1090 | Mem: 26.53MB, Util: 100%  global_step : 1090
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1091 | Mem: 26.53MB, Util: 100%  global_step : 1091
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1092 | Mem: 26.53MB, Util: 100%  global_step : 1092
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1093 | Mem: 26.53MB, Util: 100%  global_step : 1093
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1094 | Mem: 26.53MB, Util: 100%  global_step : 1094
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1095 | Mem: 26.53MB, Util: 100%  global_step : 1095
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1096 | Mem: 26.53MB, Util: 100%  global_step : 1096
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1097 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  55%|█████▍    | 1096/2000 [01:37<00:06, 136.05it/s]
[Rank 0] Train Epoch 0:  55%|█████▍    | 1094/2000 [01:37<00:06, 138.02it/s]
[Rank 1] Train Epoch 0:  55%|█████▌    | 1106/2000 [01:37<00:07, 125.81it/s]
[Rank 2] Train Epoch 0:  56%|█████▌    | 1111/2000 [01:38<00:06, 137.71it/s]
[Rank 0] Train Epoch 0:  55%|█████▌    | 1109/2000 [01:38<00:06, 128.30it/s]
[Rank 1] Train Epoch 0:  56%|█████▌    | 1121/2000 [01:38<00:06, 131.15it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1114 | Mem: 26.53MB, Util: 100%  global_step : 1114
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1115 | Mem: 26.53MB, Util: 100%  global_step : 1115
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1116 | Mem: 26.53MB, Util: 100%  global_step : 1116
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1117 | Mem: 26.53MB, Util: 100%  global_step : 1117
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1118 | Mem: 26.53MB, Util: 100%  global_step : 1118
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1119 | Mem: 26.53MB, Util: 100%  global_step : 1119
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1120 | Mem: 26.53MB, Util: 100%  global_step : 1120
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1121 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  56%|█████▋    | 1125/2000 [01:38<00:06, 136.71it/s]
[Rank 0] Train Epoch 0:  56%|█████▌    | 1124/2000 [01:38<00:06, 132.53it/s]
[Rank 1] Train Epoch 0:  57%|█████▋    | 1135/2000 [01:38<00:06, 133.27it/s]
[Rank 0] Train Epoch 0:  57%|█████▋    | 1139/2000 [01:38<00:06, 135.38it/s]
[Rank 2] Train Epoch 0:  57%|█████▋    | 1142/2000 [01:38<00:05, 144.17it/s]
[Rank 1] Train Epoch 0:  57%|█████▋    | 1149/2000 [01:38<00:06, 134.58it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1143 | Mem: 26.53MB, Util: 88%  global_step : 1143
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1144 | Mem: 26.53MB, Util: 88%  global_step : 1144
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1145 | Mem: 26.53MB, Util: 88%  global_step : 1145
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1146 | Mem: 26.53MB, Util: 88%  global_step : 1146
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1147 | Mem: 26.53MB, Util: 88%  global_step : 1147
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1148 | Mem: 26.53MB, Util: 88%  global_step : 1148
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1149 | Mem: 26.53MB, Util: 88%  global_step : 1149
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1150 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  58%|█████▊    | 1159/2000 [01:38<00:05, 149.51it/s]
[Rank 0] Train Epoch 0:  58%|█████▊    | 1153/2000 [01:38<00:06, 134.63it/s]
[Rank 1] Train Epoch 0:  58%|█████▊    | 1163/2000 [01:38<00:06, 135.50it/s]
[Rank 0] Train Epoch 0:  58%|█████▊    | 1168/2000 [01:38<00:06, 136.43it/s]
[Rank 2] Train Epoch 0:  59%|█████▉    | 1175/2000 [01:38<00:05, 145.13it/s]
[Rank 1] Train Epoch 0:  59%|█████▉    | 1177/2000 [01:38<00:06, 136.20it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1172 | Mem: 26.53MB, Util: 75%  global_step : 1172
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1173 | Mem: 26.53MB, Util: 75%  global_step : 1173
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1174 | Mem: 26.53MB, Util: 75%  global_step : 1174
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1175 | Mem: 26.53MB, Util: 75%  global_step : 1175
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1176 | Mem: 26.53MB, Util: 75%  global_step : 1176
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1177 | Mem: 26.53MB, Util: 75%  global_step : 1177
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1178 | Mem: 26.53MB, Util: 75%  global_step : 1178
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1179 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  60%|█████▉    | 1190/2000 [01:38<00:05, 142.19it/s]
[Rank 0] Train Epoch 0:  59%|█████▉    | 1183/2000 [01:38<00:05, 137.79it/s]
[Rank 1] Train Epoch 0:  60%|█████▉    | 1191/2000 [01:38<00:05, 136.61it/s]
[Rank 0] Train Epoch 0:  60%|█████▉    | 1198/2000 [01:38<00:05, 138.64it/s]
[Rank 1] Train Epoch 0:  60%|██████    | 1205/2000 [01:38<00:05, 136.93it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1201 | Mem: 26.53MB, Util: 69%  global_step : 1201
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1202 | Mem: 26.53MB, Util: 69%  global_step : 1202
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1203 | Mem: 26.53MB, Util: 69%  global_step : 1203
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1204 | Mem: 26.53MB, Util: 69%  global_step : 1204
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1205 | Mem: 26.53MB, Util: 69%  global_step : 1205
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1206 | Mem: 26.53MB, Util: 69%  global_step : 1206
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1207 | Mem: 26.53MB, Util: 69%  global_step : 1207
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1208 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  60%|██████    | 1205/2000 [01:38<00:06, 131.50it/s]
[Rank 0] Train Epoch 0:  61%|██████    | 1212/2000 [01:38<00:05, 137.19it/s]
[Rank 1] Train Epoch 0:  61%|██████    | 1219/2000 [01:38<00:05, 137.06it/s]
[Rank 2] Train Epoch 0:  61%|██████    | 1220/2000 [01:38<00:05, 134.92it/s]
[Rank 0] Train Epoch 0:  61%|██████▏   | 1226/2000 [01:38<00:05, 137.73it/s]
[Rank 2] Train Epoch 0:  62%|██████▏   | 1238/2000 [01:38<00:05, 146.91it/s]
[Rank 1] Train Epoch 0:  62%|██████▏   | 1233/2000 [01:38<00:05, 137.01it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1230 | Mem: 26.53MB, Util: 71%  global_step : 1230
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1231 | Mem: 26.53MB, Util: 71%  global_step : 1231
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1232 | Mem: 26.53MB, Util: 71%  global_step : 1232
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1233 | Mem: 26.53MB, Util: 71%  global_step : 1233
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1234 | Mem: 26.53MB, Util: 71%  global_step : 1234
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1235 | Mem: 26.53MB, Util: 71%  global_step : 1235
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1236 | Mem: 26.53MB, Util: 71%  global_step : 1236
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1237 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  63%|██████▎   | 1253/2000 [01:38<00:05, 147.32it/s]
[Rank 0] Train Epoch 0:  62%|██████▏   | 1240/2000 [01:38<00:05, 138.03it/s]
[Rank 1] Train Epoch 0:  62%|██████▏   | 1247/2000 [01:39<00:05, 137.23it/s]
[Rank 0] Train Epoch 0:  63%|██████▎   | 1255/2000 [01:39<00:05, 139.24it/s]
[Rank 2] Train Epoch 0:  64%|██████▎   | 1270/2000 [01:39<00:04, 151.48it/s]
[Rank 1] Train Epoch 0:  63%|██████▎   | 1261/2000 [01:39<00:05, 137.05it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1259 | Mem: 26.53MB, Util: 70%  global_step : 1259
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1260 | Mem: 26.53MB, Util: 70%  global_step : 1260
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1261 | Mem: 26.53MB, Util: 70%  global_step : 1261
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1262 | Mem: 26.53MB, Util: 70%  global_step : 1262
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1263 | Mem: 26.53MB, Util: 70%  global_step : 1263
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1264 | Mem: 26.53MB, Util: 70%  global_step : 1264
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1265 | Mem: 26.53MB, Util: 70%  global_step : 1265
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1266 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  64%|██████▍   | 1286/2000 [01:39<00:04, 150.97it/s]
[Rank 0] Train Epoch 0:  64%|██████▎   | 1270/2000 [01:39<00:05, 140.22it/s]
[Rank 1] Train Epoch 0:  64%|██████▍   | 1275/2000 [01:39<00:05, 136.97it/s]
[Rank 0] Train Epoch 0:  64%|██████▍   | 1285/2000 [01:39<00:05, 140.42it/s]
[Rank 1] Train Epoch 0:  64%|██████▍   | 1289/2000 [01:39<00:05, 136.76it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1288 | Mem: 26.53MB, Util: 69%  global_step : 1288
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1289 | Mem: 26.53MB, Util: 69%  global_step : 1289
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1290 | Mem: 26.53MB, Util: 69%  global_step : 1290
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1291 | Mem: 26.53MB, Util: 69%  global_step : 1291
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1292 | Mem: 26.53MB, Util: 69%  global_step : 1292
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1293 | Mem: 26.53MB, Util: 69%  global_step : 1293
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1294 | Mem: 26.53MB, Util: 69%  global_step : 1294
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1295 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  65%|██████▌   | 1302/2000 [01:39<00:05, 122.44it/s]
[Rank 0] Train Epoch 0:  65%|██████▌   | 1300/2000 [01:39<00:05, 139.35it/s]
[Rank 1] Train Epoch 0:  65%|██████▌   | 1304/2000 [01:39<00:05, 137.68it/s]
[Rank 2] Train Epoch 0:  66%|██████▌   | 1316/2000 [01:39<00:05, 126.60it/s]
[Rank 0] Train Epoch 0:  66%|██████▌   | 1314/2000 [01:39<00:05, 133.64it/s]
[Rank 1] Train Epoch 0:  66%|██████▌   | 1318/2000 [01:39<00:05, 133.93it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1316 | Mem: 26.53MB, Util: 69%  global_step : 1316
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1317 | Mem: 26.53MB, Util: 69%  global_step : 1317
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1318 | Mem: 26.53MB, Util: 69%  global_step : 1318
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1319 | Mem: 26.53MB, Util: 69%  global_step : 1319
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1320 | Mem: 26.53MB, Util: 69%  global_step : 1320
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1321 | Mem: 26.53MB, Util: 69%  global_step : 1321
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1322 | Mem: 26.53MB, Util: 69%  global_step : 1322
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1323 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  66%|██████▋   | 1330/2000 [01:39<00:05, 129.73it/s]
[Rank 0] Train Epoch 0:  66%|██████▋   | 1329/2000 [01:39<00:04, 135.69it/s]
[Rank 1] Train Epoch 0:  67%|██████▋   | 1332/2000 [01:39<00:05, 133.15it/s]
[Rank 2] Train Epoch 0:  67%|██████▋   | 1344/2000 [01:39<00:04, 131.69it/s]
[Rank 0] Train Epoch 0:  67%|██████▋   | 1343/2000 [01:39<00:04, 136.71it/s]
[Rank 1] Train Epoch 0:  67%|██████▋   | 1347/2000 [01:39<00:04, 136.26it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1344 | Mem: 26.53MB, Util: 78%  global_step : 1344
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1345 | Mem: 26.53MB, Util: 78%  global_step : 1345
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1346 | Mem: 26.53MB, Util: 78%  global_step : 1346
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1347 | Mem: 26.53MB, Util: 78%  global_step : 1347
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1348 | Mem: 26.53MB, Util: 78%  global_step : 1348
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1349 | Mem: 26.53MB, Util: 78%  global_step : 1349
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1350 | Mem: 26.53MB, Util: 78%  global_step : 1350
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1351 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  68%|██████▊   | 1358/2000 [01:39<00:04, 133.03it/s]
[Rank 0] Train Epoch 0:  68%|██████▊   | 1357/2000 [01:39<00:04, 137.49it/s]
[Rank 1] Train Epoch 0:  68%|██████▊   | 1362/2000 [01:39<00:04, 139.19it/s]
[Rank 2] Train Epoch 0:  69%|██████▊   | 1372/2000 [01:39<00:04, 134.72it/s]
[Rank 0] Train Epoch 0:  69%|██████▊   | 1371/2000 [01:39<00:04, 137.60it/s]
[Rank 1] Train Epoch 0:  69%|██████▉   | 1377/2000 [01:39<00:04, 140.94it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1375 | Mem: 26.53MB, Util: 69%  global_step : 1375
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1376 | Mem: 26.53MB, Util: 69%  global_step : 1376
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1377 | Mem: 26.53MB, Util: 69%  global_step : 1377
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1378 | Mem: 26.53MB, Util: 69%  global_step : 1378
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1379 | Mem: 26.53MB, Util: 69%  global_step : 1379
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1380 | Mem: 26.53MB, Util: 69%  global_step : 1380
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1381 | Mem: 26.53MB, Util: 69%  global_step : 1381
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1382 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  69%|██████▉   | 1386/2000 [01:40<00:04, 135.55it/s]
[Rank 0] Train Epoch 0:  69%|██████▉   | 1386/2000 [01:40<00:04, 138.67it/s]
[Rank 1] Train Epoch 0:  70%|██████▉   | 1392/2000 [01:40<00:04, 140.69it/s]
[Rank 2] Train Epoch 0:  70%|███████   | 1400/2000 [01:40<00:04, 136.68it/s]
[Rank 0] Train Epoch 0:  70%|███████   | 1400/2000 [01:40<00:04, 138.80it/s]
[Rank 1] Train Epoch 0:  70%|███████   | 1407/2000 [01:40<00:04, 139.55it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1404 | Mem: 26.53MB, Util: 76%  global_step : 1404
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1405 | Mem: 26.53MB, Util: 76%  global_step : 1405
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1406 | Mem: 26.53MB, Util: 76%  global_step : 1406
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1407 | Mem: 26.53MB, Util: 76%  global_step : 1407
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1408 | Mem: 26.53MB, Util: 76%  global_step : 1408
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1409 | Mem: 26.53MB, Util: 76%  global_step : 1409
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1410 | Mem: 26.53MB, Util: 76%  global_step : 1410
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1411 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  71%|███████   | 1414/2000 [01:40<00:04, 137.61it/s]
[Rank 0] Train Epoch 0:  71%|███████   | 1414/2000 [01:40<00:04, 135.06it/s]
[Rank 1] Train Epoch 0:  71%|███████   | 1422/2000 [01:40<00:04, 139.77it/s]
[Rank 2] Train Epoch 0:  71%|███████▏  | 1429/2000 [01:40<00:04, 138.42it/s]
[Rank 0] Train Epoch 0:  71%|███████▏  | 1429/2000 [01:40<00:04, 137.85it/s]
[Rank 1] Train Epoch 0:  72%|███████▏  | 1437/2000 [01:40<00:04, 140.36it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1433 | Mem: 26.53MB, Util: 96%  global_step : 1433
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1434 | Mem: 26.53MB, Util: 96%  global_step : 1434
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1435 | Mem: 26.53MB, Util: 96%  global_step : 1435
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1436 | Mem: 26.53MB, Util: 96%  global_step : 1436
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1437 | Mem: 26.53MB, Util: 96%  global_step : 1437
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1438 | Mem: 26.53MB, Util: 96%  global_step : 1438
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1439 | Mem: 26.53MB, Util: 96%  global_step : 1439
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1440 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  72%|███████▏  | 1443/2000 [01:40<00:04, 137.74it/s]
[Rank 0] Train Epoch 0:  72%|███████▏  | 1443/2000 [01:40<00:04, 137.94it/s]
[Rank 1] Train Epoch 0:  73%|███████▎  | 1452/2000 [01:40<00:03, 139.49it/s]
[Rank 2] Train Epoch 0:  73%|███████▎  | 1457/2000 [01:40<00:03, 137.34it/s]
[Rank 0] Train Epoch 0:  73%|███████▎  | 1458/2000 [01:40<00:03, 138.56it/s]
[Rank 1] Train Epoch 0:  73%|███████▎  | 1467/2000 [01:40<00:03, 140.00it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1463 | Mem: 26.53MB, Util: 96%  global_step : 1463
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1464 | Mem: 26.53MB, Util: 96%  global_step : 1464
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1465 | Mem: 26.53MB, Util: 96%  global_step : 1465
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1466 | Mem: 26.53MB, Util: 96%  global_step : 1466
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1467 | Mem: 26.53MB, Util: 96%  global_step : 1467
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1468 | Mem: 26.53MB, Util: 96%  global_step : 1468
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1469 | Mem: 26.53MB, Util: 96%  global_step : 1469
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1470 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  74%|███████▎  | 1471/2000 [01:40<00:03, 137.02it/s]
[Rank 0] Train Epoch 0:  74%|███████▎  | 1473/2000 [01:40<00:03, 139.82it/s]
[Rank 1] Train Epoch 0:  74%|███████▍  | 1482/2000 [01:40<00:03, 139.57it/s]
[Rank 2] Train Epoch 0:  74%|███████▍  | 1485/2000 [01:40<00:03, 136.35it/s]
[Rank 0] Train Epoch 0:  74%|███████▍  | 1488/2000 [01:40<00:03, 141.47it/s]
[Rank 1] Train Epoch 0:  75%|███████▍  | 1496/2000 [01:40<00:03, 139.23it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1492 | Mem: 26.53MB, Util: 100%  global_step : 1492
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1493 | Mem: 26.53MB, Util: 100%  global_step : 1493
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1494 | Mem: 26.53MB, Util: 100%  global_step : 1494
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1495 | Mem: 26.53MB, Util: 100%  global_step : 1495
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1496 | Mem: 26.53MB, Util: 100%  global_step : 1496
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1497 | Mem: 26.53MB, Util: 100%  global_step : 1497
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1498 | Mem: 26.53MB, Util: 100%  global_step : 1498
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1499 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  75%|███████▍  | 1499/2000 [01:40<00:03, 136.68it/s]
[Rank 0] Train Epoch 0:  75%|███████▌  | 1503/2000 [01:40<00:03, 133.21it/s]
[Rank 1] Train Epoch 0:  76%|███████▌  | 1510/2000 [01:40<00:03, 134.05it/s]
[Rank 2] Train Epoch 0:  76%|███████▌  | 1513/2000 [01:40<00:03, 136.30it/s]
[Rank 0] Train Epoch 0:  76%|███████▌  | 1518/2000 [01:40<00:03, 136.02it/s]
[Rank 1] Train Epoch 0:  76%|███████▌  | 1524/2000 [01:41<00:03, 134.22it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1519 | Mem: 26.53MB, Util: 100%  global_step : 1519
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1520 | Mem: 26.53MB, Util: 100%  global_step : 1520
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1521 | Mem: 26.53MB, Util: 100%  global_step : 1521
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1522 | Mem: 26.53MB, Util: 100%  global_step : 1522
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1523 | Mem: 26.53MB, Util: 98%  global_step : 1523
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1524 | Mem: 26.53MB, Util: 98%  global_step : 1524
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1525 | Mem: 26.53MB, Util: 98%  global_step : 1525
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1526 | Mem: 26.53MB, Ut

[Rank 2] Train Epoch 0:  76%|███████▋  | 1527/2000 [01:41<00:03, 136.28it/s]
[Rank 0] Train Epoch 0:  77%|███████▋  | 1533/2000 [01:41<00:03, 137.39it/s]
[Rank 1] Train Epoch 0:  77%|███████▋  | 1538/2000 [01:41<00:03, 135.83it/s]
[Rank 2] Train Epoch 0:  77%|███████▋  | 1541/2000 [01:41<00:03, 136.63it/s]
[Rank 0] Train Epoch 0:  77%|███████▋  | 1548/2000 [01:41<00:03, 138.00it/s]
[Rank 1] Train Epoch 0:  78%|███████▊  | 1552/2000 [01:41<00:03, 136.15it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1547 | Mem: 26.53MB, Util: 98%  global_step : 1547
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1548 | Mem: 26.53MB, Util: 98%  global_step : 1548
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1549 | Mem: 26.53MB, Util: 98%  global_step : 1549
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1550 | Mem: 26.53MB, Util: 98%  global_step : 1550
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1551 | Mem: 26.53MB, Util: 98%  global_step : 1551
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1552 | Mem: 26.53MB, Util: 93%  global_step : 1552
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1553 | Mem: 26.53MB, Util: 93%  global_step : 1553
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1554 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  78%|███████▊  | 1555/2000 [01:41<00:03, 134.31it/s]
[Rank 0] Train Epoch 0:  78%|███████▊  | 1562/2000 [01:41<00:03, 138.09it/s]
[Rank 1] Train Epoch 0:  78%|███████▊  | 1567/2000 [01:41<00:03, 137.54it/s]
[Rank 2] Train Epoch 0:  78%|███████▊  | 1569/2000 [01:41<00:03, 135.28it/s]
[Rank 0] Train Epoch 0:  79%|███████▉  | 1576/2000 [01:41<00:03, 137.14it/s]
[Rank 1] Train Epoch 0:  79%|███████▉  | 1581/2000 [01:41<00:03, 137.81it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1577 | Mem: 26.53MB, Util: 93%  global_step : 1577
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1578 | Mem: 26.53MB, Util: 93%  global_step : 1578
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1579 | Mem: 26.53MB, Util: 93%  global_step : 1579
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1580 | Mem: 26.53MB, Util: 93%  global_step : 1580
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1581 | Mem: 26.53MB, Util: 99%  global_step : 1581
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1582 | Mem: 26.53MB, Util: 99%  global_step : 1582
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1583 | Mem: 26.53MB, Util: 99%  global_step : 1583
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1584 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  79%|███████▉  | 1584/2000 [01:41<00:03, 138.54it/s]
[Rank 0] Train Epoch 0:  80%|███████▉  | 1590/2000 [01:41<00:02, 137.81it/s]
[Rank 1] Train Epoch 0:  80%|███████▉  | 1595/2000 [01:41<00:02, 136.02it/s]
[Rank 2] Train Epoch 0:  80%|███████▉  | 1598/2000 [01:41<00:02, 137.78it/s]
[Rank 0] Train Epoch 0:  80%|████████  | 1604/2000 [01:41<00:02, 133.42it/s]
[Rank 1] Train Epoch 0:  80%|████████  | 1609/2000 [01:41<00:02, 136.49it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1605 | Mem: 26.53MB, Util: 99%  global_step : 1605
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1606 | Mem: 26.53MB, Util: 99%  global_step : 1606
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1607 | Mem: 26.53MB, Util: 99%  global_step : 1607
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1608 | Mem: 26.53MB, Util: 98%  global_step : 1608
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1609 | Mem: 26.53MB, Util: 98%  global_step : 1609
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1610 | Mem: 26.53MB, Util: 98%  global_step : 1610
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1611 | Mem: 26.53MB, Util: 98%  global_step : 1611
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1612 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  81%|████████  | 1612/2000 [01:41<00:02, 137.91it/s]
[Rank 0] Train Epoch 0:  81%|████████  | 1619/2000 [01:41<00:02, 136.37it/s]
[Rank 1] Train Epoch 0:  81%|████████  | 1624/2000 [01:41<00:02, 137.77it/s]
[Rank 2] Train Epoch 0:  81%|████████▏ | 1626/2000 [01:41<00:02, 137.57it/s]
[Rank 0] Train Epoch 0:  82%|████████▏ | 1634/2000 [01:41<00:02, 138.84it/s]
[Rank 1] Train Epoch 0:  82%|████████▏ | 1639/2000 [01:41<00:02, 139.00it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1634 | Mem: 26.53MB, Util: 98%  global_step : 1634
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1635 | Mem: 26.53MB, Util: 98%  global_step : 1635
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1636 | Mem: 26.53MB, Util: 98%  global_step : 1636
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1637 | Mem: 26.53MB, Util: 98%  global_step : 1637
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1638 | Mem: 26.53MB, Util: 98%  global_step : 1638
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1639 | Mem: 26.53MB, Util: 98%  global_step : 1639
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1640 | Mem: 26.53MB, Util: 98%  global_step : 1640
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1641 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  82%|████████▏ | 1640/2000 [01:41<00:02, 137.32it/s]
[Rank 0] Train Epoch 0:  82%|████████▏ | 1648/2000 [01:41<00:02, 136.46it/s]
[Rank 1] Train Epoch 0:  83%|████████▎ | 1654/2000 [01:41<00:02, 139.60it/s]
[Rank 2] Train Epoch 0:  83%|████████▎ | 1654/2000 [01:41<00:02, 137.22it/s]
[Rank 0] Train Epoch 0:  83%|████████▎ | 1663/2000 [01:42<00:02, 138.47it/s]
[Rank 1] Train Epoch 0:  83%|████████▎ | 1669/2000 [01:42<00:02, 140.32it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1664 | Mem: 26.53MB, Util: 98%  global_step : 1664
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1665 | Mem: 26.53MB, Util: 98%  global_step : 1665
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1666 | Mem: 26.53MB, Util: 98%  global_step : 1666
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1667 | Mem: 26.53MB, Util: 98%  global_step : 1667
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1668 | Mem: 26.53MB, Util: 98%  global_step : 1668
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1669 | Mem: 26.53MB, Util: 100%  global_step : 1669
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1670 | Mem: 26.53MB, Util: 100%  global_step : 1670
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1671 | Mem: 26.53MB, Util

[Rank 2] Train Epoch 0:  83%|████████▎ | 1668/2000 [01:42<00:02, 136.93it/s]
[Rank 0] Train Epoch 0:  84%|████████▍ | 1677/2000 [01:42<00:02, 136.29it/s]
[Rank 1] Train Epoch 0:  84%|████████▍ | 1684/2000 [01:42<00:02, 140.42it/s]
[Rank 2] Train Epoch 0:  84%|████████▍ | 1682/2000 [01:42<00:02, 136.94it/s]
[Rank 0] Train Epoch 0:  85%|████████▍ | 1692/2000 [01:42<00:02, 137.97it/s]
[Rank 1] Train Epoch 0:  85%|████████▍ | 1699/2000 [01:42<00:02, 141.66it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1694 | Mem: 26.53MB, Util: 100%  global_step : 1694
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1695 | Mem: 26.53MB, Util: 100%  global_step : 1695
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1696 | Mem: 26.53MB, Util: 100%  global_step : 1696
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1697 | Mem: 26.53MB, Util: 100%  global_step : 1697
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1698 | Mem: 26.53MB, Util: 100%  global_step : 1698
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1699 | Mem: 26.53MB, Util: 100%  global_step : 1699
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1700 | Mem: 26.53MB, Util: 100%  global_step : 1700
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1701 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  85%|████████▍ | 1696/2000 [01:42<00:02, 137.00it/s]
[Rank 0] Train Epoch 0:  85%|████████▌ | 1706/2000 [01:42<00:02, 133.93it/s]
[Rank 1] Train Epoch 0:  86%|████████▌ | 1714/2000 [01:42<00:02, 132.29it/s]
[Rank 2] Train Epoch 0:  86%|████████▌ | 1710/2000 [01:42<00:02, 137.26it/s]
[Rank 0] Train Epoch 0:  86%|████████▌ | 1721/2000 [01:42<00:02, 136.66it/s]
[Rank 1] Train Epoch 0:  86%|████████▋ | 1729/2000 [01:42<00:02, 135.05it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1720 | Mem: 26.53MB, Util: 100%  global_step : 1720
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1721 | Mem: 26.53MB, Util: 100%  global_step : 1721
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1722 | Mem: 26.53MB, Util: 100%  global_step : 1722
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1723 | Mem: 26.53MB, Util: 100%  global_step : 1723
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1724 | Mem: 26.53MB, Util: 96%  global_step : 1724
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1725 | Mem: 26.53MB, Util: 96%  global_step : 1725
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1726 | Mem: 26.53MB, Util: 96%  global_step : 1726
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1727 | Mem: 26.53MB, Ut

[Rank 2] Train Epoch 0:  86%|████████▌ | 1724/2000 [01:42<00:02, 137.05it/s]
[Rank 0] Train Epoch 0:  87%|████████▋ | 1735/2000 [01:42<00:01, 137.32it/s]
[Rank 1] Train Epoch 0:  87%|████████▋ | 1744/2000 [01:42<00:01, 137.34it/s]
[Rank 2] Train Epoch 0:  87%|████████▋ | 1738/2000 [01:42<00:01, 136.83it/s]
[Rank 0] Train Epoch 0:  87%|████████▋ | 1749/2000 [01:42<00:01, 137.55it/s]
[Rank 1] Train Epoch 0:  88%|████████▊ | 1759/2000 [01:42<00:01, 138.87it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1750 | Mem: 26.53MB, Util: 96%  global_step : 1750
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1751 | Mem: 26.53MB, Util: 96%  global_step : 1751
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1752 | Mem: 26.53MB, Util: 96%  global_step : 1752
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1753 | Mem: 26.53MB, Util: 100%  global_step : 1753
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1754 | Mem: 26.53MB, Util: 100%  global_step : 1754
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1755 | Mem: 26.53MB, Util: 100%  global_step : 1755
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1756 | Mem: 26.53MB, Util: 100%  global_step : 1756
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1757 | Mem: 26.53MB, Ut

[Rank 2] Train Epoch 0:  88%|████████▊ | 1752/2000 [01:42<00:01, 136.79it/s]
[Rank 0] Train Epoch 0:  88%|████████▊ | 1764/2000 [01:42<00:01, 139.53it/s]
[Rank 1] Train Epoch 0:  89%|████████▊ | 1774/2000 [01:42<00:01, 139.75it/s]
[Rank 2] Train Epoch 0:  88%|████████▊ | 1766/2000 [01:42<00:01, 136.62it/s]
[Rank 0] Train Epoch 0:  89%|████████▉ | 1778/2000 [01:42<00:01, 139.41it/s]
[Rank 1] Train Epoch 0:  89%|████████▉ | 1789/2000 [01:42<00:01, 139.94it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1780 | Mem: 26.53MB, Util: 100%  global_step : 1780
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1781 | Mem: 26.53MB, Util: 100%  global_step : 1781
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1782 | Mem: 26.53MB, Util: 100%  global_step : 1782
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1783 | Mem: 26.53MB, Util: 100%  global_step : 1783
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1784 | Mem: 26.53MB, Util: 100%  global_step : 1784
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1785 | Mem: 26.53MB, Util: 100%  global_step : 1785
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1786 | Mem: 26.53MB, Util: 100%  global_step : 1786
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1787 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  89%|████████▉ | 1780/2000 [01:42<00:01, 136.01it/s]
[Rank 0] Train Epoch 0:  90%|████████▉ | 1792/2000 [01:42<00:01, 138.61it/s]
[Rank 1] Train Epoch 0:  90%|█████████ | 1804/2000 [01:43<00:01, 130.15it/s]
[Rank 2] Train Epoch 0:  90%|████████▉ | 1794/2000 [01:42<00:01, 136.53it/s]
[Rank 0] Train Epoch 0:  90%|█████████ | 1806/2000 [01:43<00:01, 131.98it/s]
[Rank 1] Train Epoch 0:  91%|█████████ | 1819/2000 [01:43<00:01, 132.68it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1806 | Mem: 26.53MB, Util: 100%  global_step : 1806
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1807 | Mem: 26.53MB, Util: 100%  global_step : 1807
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1808 | Mem: 26.53MB, Util: 100%  global_step : 1808
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1809 | Mem: 26.53MB, Util: 100%  global_step : 1809
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1810 | Mem: 26.53MB, Util: 100%  global_step : 1810
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1811 | Mem: 26.53MB, Util: 100%  global_step : 1811
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1812 | Mem: 26.53MB, Util: 100%  global_step : 1812
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1813 | Mem: 26.53MB,

[Rank 2] Train Epoch 0:  90%|█████████ | 1808/2000 [01:43<00:01, 136.58it/s]
[Rank 0] Train Epoch 0:  91%|█████████ | 1821/2000 [01:43<00:01, 135.10it/s]
[Rank 1] Train Epoch 0:  92%|█████████▏| 1834/2000 [01:43<00:01, 135.92it/s]
[Rank 2] Train Epoch 0:  91%|█████████ | 1822/2000 [01:43<00:01, 136.19it/s]
[Rank 0] Train Epoch 0:  92%|█████████▏| 1836/2000 [01:43<00:01, 137.17it/s]
[Rank 1] Train Epoch 0:  92%|█████████▏| 1849/2000 [01:43<00:01, 137.18it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1835 | Mem: 26.53MB, Util: 92%  global_step : 1835
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1836 | Mem: 26.53MB, Util: 92%  global_step : 1836
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1837 | Mem: 26.53MB, Util: 92%  global_step : 1837
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1838 | Mem: 26.53MB, Util: 92%  global_step : 1838
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1839 | Mem: 26.53MB, Util: 92%  global_step : 1839
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1840 | Mem: 26.53MB, Util: 92%  global_step : 1840
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1841 | Mem: 26.53MB, Util: 92%  global_step : 1841
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1842 | Mem: 26.53MB, Util: 

[Rank 2] Train Epoch 0:  92%|█████████▏| 1836/2000 [01:43<00:01, 136.03it/s]
[Rank 0] Train Epoch 0:  93%|█████████▎| 1851/2000 [01:43<00:01, 139.06it/s]
[Rank 1] Train Epoch 0:  93%|█████████▎| 1864/2000 [01:43<00:00, 138.20it/s]
[Rank 2] Train Epoch 0:  92%|█████████▎| 1850/2000 [01:43<00:01, 132.90it/s]
[Rank 0] Train Epoch 0:  93%|█████████▎| 1866/2000 [01:43<00:00, 139.74it/s]
[Rank 1] Train Epoch 0:  94%|█████████▍| 1879/2000 [01:43<00:00, 139.90it/s]
[Rank 2] Train Epoch 0:  93%|█████████▎| 1865/2000 [01:43<00:00, 137.43it/s]
[Rank 0] Train Epoch 0:  94%|█████████▍| 1881/2000 [01:43<00:00, 139.96it/s]
[Rank 1] Train Epoch 0:  95%|█████████▍| 1894/2000 [01:43<00:00, 141.47it/s]
[Rank 2] Train Epoch 0:  94%|█████████▍| 1880/2000 [01:43<00:00, 141.01it/s]
[Rank 2] Train Epoch 0:  95%|█████████▍| 1895/2000 [01:43<00:00, 143.22it/s]
[Rank 0] Train Epoch 0:  95%|█████████▍| 1896/2000 [01:43<00:00, 140.79it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1865 | Mem: 26.53MB, Util: 100%  global_step : 1865
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1866 | Mem: 26.53MB, Util: 100%  global_step : 1866
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1867 | Mem: 26.53MB, Util: 100%  global_step : 1867
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1868 | Mem: 26.53MB, Util: 100%  global_step : 1868
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1869 | Mem: 26.53MB, Util: 100%  global_step : 1869
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1870 | Mem: 26.53MB, Util: 100%  global_step : 1870
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1871 | Mem: 26.53MB, Util: 100%  global_step : 1871
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1872 | Mem: 26.53MB,

[Rank 1] Train Epoch 0:  95%|█████████▌| 1909/2000 [01:43<00:00, 139.02it/s]
[Rank 2] Train Epoch 0:  96%|█████████▌| 1910/2000 [01:43<00:00, 144.71it/s]
[Rank 0] Train Epoch 0:  96%|█████████▌| 1911/2000 [01:43<00:00, 138.20it/s]
[Rank 1] Train Epoch 0:  96%|█████████▌| 1923/2000 [01:43<00:00, 138.08it/s]
[Rank 2] Train Epoch 0:  96%|█████████▋| 1925/2000 [01:43<00:00, 145.29it/s]
[Rank 0] Train Epoch 0:  96%|█████████▋| 1926/2000 [01:43<00:00, 140.60it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1895 | Mem: 26.53MB, Util: 100%  global_step : 1895
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1896 | Mem: 26.53MB, Util: 100%  global_step : 1896
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1897 | Mem: 26.53MB, Util: 100%  global_step : 1897
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1898 | Mem: 26.53MB, Util: 100%  global_step : 1898
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1899 | Mem: 26.53MB, Util: 100%  global_step : 1899
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1900 | Mem: 26.53MB, Util: 100%  global_step : 1900
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1901 | Mem: 26.53MB, Util: 100%  global_step : 1901
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1902 | Mem: 26.53MB,

[Rank 1] Train Epoch 0:  97%|█████████▋| 1938/2000 [01:44<00:00, 140.58it/s]
[Rank 2] Train Epoch 0:  97%|█████████▋| 1940/2000 [01:44<00:00, 145.14it/s]
[Rank 0] Train Epoch 0:  97%|█████████▋| 1941/2000 [01:44<00:00, 141.77it/s]
[Rank 1] Train Epoch 0:  98%|█████████▊| 1953/2000 [01:44<00:00, 142.25it/s]
[Rank 2] Train Epoch 0:  98%|█████████▊| 1955/2000 [01:44<00:00, 145.41it/s]
[Rank 0] Train Epoch 0:  98%|█████████▊| 1956/2000 [01:44<00:00, 141.92it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1923 | Mem: 26.53MB, Util: 97%  global_step : 1923
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1924 | Mem: 26.53MB, Util: 97%  global_step : 1924
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1925 | Mem: 26.53MB, Util: 97%  global_step : 1925
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1926 | Mem: 26.53MB, Util: 97%  global_step : 1926
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1927 | Mem: 26.53MB, Util: 97%  global_step : 1927
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1928 | Mem: 26.53MB, Util: 97%  global_step : 1928
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1929 | Mem: 26.53MB, Util: 97%  global_step : 1929
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1930 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 0:  98%|█████████▊| 1968/2000 [01:44<00:00, 143.49it/s]
[Rank 2] Train Epoch 0:  98%|█████████▊| 1970/2000 [01:44<00:00, 146.07it/s]
[Rank 0] Train Epoch 0:  99%|█████████▊| 1971/2000 [01:44<00:00, 139.00it/s]
[Rank 1] Train Epoch 0:  99%|█████████▉| 1983/2000 [01:44<00:00, 144.14it/s]
[Rank 2] Train Epoch 0:  99%|█████████▉| 1985/2000 [01:44<00:00, 147.12it/s]
[Rank 0] Train Epoch 0:  99%|█████████▉| 1986/2000 [01:44<00:00, 140.64it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1954 | Mem: 26.53MB, Util: 75%  global_step : 1954
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1955 | Mem: 26.53MB, Util: 75%  global_step : 1955
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1956 | Mem: 26.53MB, Util: 75%  global_step : 1956
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1957 | Mem: 26.53MB, Util: 75%  global_step : 1957
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1958 | Mem: 26.53MB, Util: 75%  global_step : 1958
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1959 | Mem: 26.53MB, Util: 75%  global_step : 1959
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1960 | Mem: 26.53MB, Util: 75%  global_step : 1960
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1961 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 0: 100%|██████████| 2000/2000 [01:44<00:00, 19.15it/s] 
[Rank 1] Test Epoch 0:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Train Epoch 0: 100%|██████████| 2000/2000 [01:44<00:00, 19.15it/s] 
[Rank 2] Test Epoch 0:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Train Epoch 0: 100%|██████████| 2000/2000 [01:44<00:00, 19.15it/s] 
[Rank 0] Test Epoch 0:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1985 | Mem: 26.53MB, Util: 70%  global_step : 1985
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1986 | Mem: 26.53MB, Util: 70%  global_step : 1986
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1987 | Mem: 26.53MB, Util: 70%  global_step : 1987
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1988 | Mem: 26.53MB, Util: 70%  global_step : 1988
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1989 | Mem: 26.53MB, Util: 70%  global_step : 1989
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1990 | Mem: 26.53MB, Util: 70%  global_step : 1990
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1991 | Mem: 26.53MB, Util: 70%  global_step : 1991
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 0, Batch 1992 | Mem: 26.53MB, Util: 

[Rank 0] Test Epoch 0:   0%|          | 1/334 [00:03<18:10,  3.28s/it]
[Rank 2] Test Epoch 0:   0%|          | 1/334 [00:03<18:18,  3.30s/it]
[Rank 0] Test Epoch 0:  10%|▉         | 33/334 [00:03<00:22, 13.66it/s]
[Rank 1] Test Epoch 0:   0%|          | 1/334 [00:03<18:48,  3.39s/it]
[Rank 2] Test Epoch 0:  10%|█         | 35/334 [00:03<00:20, 14.38it/s]
[Rank 0] Test Epoch 0:  20%|██        | 67/334 [00:03<00:08, 32.02it/s]
[Rank 1] Test Epoch 0:  10%|█         | 34/334 [00:03<00:22, 13.62it/s]
[Rank 2] Test Epoch 0:  21%|██▏       | 71/334 [00:03<00:07, 33.67it/s]
[Rank 0] Test Epoch 0:  30%|███       | 101/334 [00:03<00:04, 54.85it/s]
[Rank 1] Test Epoch 0:  20%|██        | 68/334 [00:03<00:08, 31.43it/s]
[Rank 2] Test Epoch 0:  32%|███▏      | 107/334 [00:03<00:03, 57.72it/s]
[Rank 0] Test Epoch 0:  40%|███▉      | 133/334 [00:03<00:02, 80.09it/s]
[Rank 1] Test Epoch 0:  31%|███       | 104/334 [00:03<00:04, 55.09it/s]
[Rank 2] Test Epoch 0:  43%|████▎     | 143/334 [00:03<00:02, 8

[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Rank 2] Epoch 0 | Loss: 0.4636, Acc: 0.8287, Model Checksum: 690e2497c91de056e01854674d55f752
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [ NodeId 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 Rank 2] Epoch 0 | Loss: 0.4636, Acc: 0.8287, Model Checksum: 690e2497c91de056e01854674d55f752
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 0 | Mem: 26.53MB, Util: 6%  global_step : 2000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1 | Mem: 26.53MB, Util: 6%  global_step : 2001
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 2 | Mem: 26.53MB, Util: 6%  global_step : 2002
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 3 | Mem: 26.53MB, Util: 6%  global_step : 2003
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Rank 0] Epoch 0 | Loss: 0.4891, Acc: 0.8146, Model Checksum: 690e2497c91de056e01854674d55f752
[36m(

[Rank 0] Test Epoch 0: 100%|██████████| 334/334 [00:04<00:00, 78.38it/s] 
[Rank 0] Train Epoch 1:   0%|          | 0/2000 [00:00<?, ?it/s]
[Rank 1] Test Epoch 0: 100%|██████████| 334/334 [00:04<00:00, 76.73it/s] 
[Rank 1] Train Epoch 1:   0%|          | 0/2000 [00:00<?, ?it/s]
[Rank 2] Train Epoch 1:   1%|          | 14/2000 [00:00<00:14, 139.79it/s]
[Rank 0] Train Epoch 1:   1%|          | 14/2000 [00:00<00:14, 137.60it/s]
[Rank 1] Train Epoch 1:   1%|          | 13/2000 [00:00<00:15, 129.51it/s]
[Rank 2] Train Epoch 1:   1%|▏         | 29/2000 [00:00<00:13, 142.70it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 19 | Mem: 26.53MB, Util: 6%  global_step : 2019
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 20 | Mem: 26.53MB, Util: 6%  global_step : 2020
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 21 | Mem: 26.53MB, Util: 6%  global_step : 2021
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 22 | Mem: 26.53MB, Util: 6%  global_step : 2022
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 23 | Mem: 26.53MB, Util: 6%  global_step : 2023
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 24 | Mem: 26.53MB, Util: 6%  global_step : 2024
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 25 | Mem: 26.53MB, Util: 6%  global_step : 2025
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 26 | Mem: 26.53MB, Util: 6%  global_step : 2026
[36m(Ra

[Rank 0] Train Epoch 1:   1%|▏         | 29/2000 [00:00<00:14, 139.28it/s]
[Rank 1] Train Epoch 1:   1%|▏         | 28/2000 [00:00<00:14, 136.56it/s]
[Rank 2] Train Epoch 1:   2%|▏         | 44/2000 [00:00<00:14, 136.37it/s]
[Rank 0] Train Epoch 1:   2%|▏         | 44/2000 [00:00<00:14, 139.62it/s]
[Rank 1] Train Epoch 1:   2%|▏         | 43/2000 [00:00<00:13, 140.55it/s]
[Rank 2] Train Epoch 1:   3%|▎         | 59/2000 [00:00<00:13, 140.08it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 29 | Mem: 26.53MB, Util: 6%  global_step : 2029
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 30 | Mem: 26.53MB, Util: 6%  global_step : 2030
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 31 | Mem: 26.53MB, Util: 6%  global_step : 2031
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 32 | Mem: 26.53MB, Util: 6%  global_step : 2032
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 33 | Mem: 26.53MB, Util: 6%  global_step : 2033
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 34 | Mem: 26.53MB, Util: 6%  global_step : 2034
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 35 | Mem: 26.53MB, Util: 6%  global_step : 2035
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 36 | Mem: 26.53MB, Util: 62%  global_step : 2036

[Rank 0] Train Epoch 1:   3%|▎         | 58/2000 [00:00<00:13, 139.48it/s]
[Rank 1] Train Epoch 1:   3%|▎         | 58/2000 [00:00<00:13, 143.14it/s]
[Rank 2] Train Epoch 1:   4%|▎         | 74/2000 [00:00<00:13, 141.93it/s]
[Rank 0] Train Epoch 1:   4%|▎         | 72/2000 [00:00<00:13, 139.28it/s]
[Rank 1] Train Epoch 1:   4%|▎         | 73/2000 [00:00<00:13, 144.78it/s]
[Rank 2] Train Epoch 1:   4%|▍         | 89/2000 [00:00<00:13, 143.40it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 60 | Mem: 26.53MB, Util: 62%  global_step : 2060
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 61 | Mem: 26.53MB, Util: 62%  global_step : 2061
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 62 | Mem: 26.53MB, Util: 62%  global_step : 2062
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 63 | Mem: 26.53MB, Util: 62%  global_step : 2063
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 64 | Mem: 26.53MB, Util: 62%  global_step : 2064
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 65 | Mem: 26.53MB, Util: 62%  global_step : 2065
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 66 | Mem: 26.53MB, Util: 62%  global_step : 2066
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 67 | Mem: 26.53MB, Util: 71%  global_step

[Rank 0] Train Epoch 1:   4%|▍         | 87/2000 [00:00<00:13, 139.84it/s]
[Rank 1] Train Epoch 1:   4%|▍         | 88/2000 [00:00<00:13, 145.77it/s]
[Rank 1] Train Epoch 1:   5%|▌         | 103/2000 [00:00<00:13, 145.86it/s]
[Rank 2] Train Epoch 1:   5%|▌         | 104/2000 [00:00<00:18, 105.22it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 91 | Mem: 26.53MB, Util: 71%  global_step : 2091
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 92 | Mem: 26.53MB, Util: 71%  global_step : 2092
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 93 | Mem: 26.53MB, Util: 71%  global_step : 2093
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 94 | Mem: 26.53MB, Util: 71%  global_step : 2094
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 95 | Mem: 26.53MB, Util: 71%  global_step : 2095
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 96 | Mem: 26.53MB, Util: 71%  global_step : 2096
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 97 | Mem: 26.53MB, Util: 71%  global_step : 2097
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 98 | Mem: 26.53MB, Util: 71%  global_step

[Rank 0] Train Epoch 1:   5%|▌         | 101/2000 [00:00<00:16, 117.11it/s]
[Rank 1] Train Epoch 1:   6%|▌         | 118/2000 [00:00<00:12, 145.17it/s]
[Rank 2] Train Epoch 1:   6%|▌         | 119/2000 [00:00<00:16, 115.73it/s]
[Rank 0] Train Epoch 1:   6%|▌         | 116/2000 [00:00<00:15, 124.32it/s]
[Rank 1] Train Epoch 1:   7%|▋         | 133/2000 [00:00<00:12, 144.67it/s]
[Rank 2] Train Epoch 1:   7%|▋         | 134/2000 [00:01<00:15, 123.90it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 121 | Mem: 26.53MB, Util: 79%  global_step : 2121
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 122 | Mem: 26.53MB, Util: 79%  global_step : 2122
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 123 | Mem: 26.53MB, Util: 79%  global_step : 2123
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 124 | Mem: 26.53MB, Util: 79%  global_step : 2124
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 125 | Mem: 26.53MB, Util: 79%  global_step : 2125
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 126 | Mem: 26.53MB, Util: 79%  global_step : 2126
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 127 | Mem: 26.53MB, Util: 79%  global_step : 2127
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 128 | Mem: 26.53MB, Util: 79%  glo

[Rank 0] Train Epoch 1:   7%|▋         | 131/2000 [00:00<00:14, 129.53it/s]
[Rank 1] Train Epoch 1:   7%|▋         | 148/2000 [00:01<00:12, 144.66it/s]
[Rank 2] Train Epoch 1:   7%|▋         | 148/2000 [00:01<00:14, 127.88it/s]
[Rank 0] Train Epoch 1:   7%|▋         | 146/2000 [00:01<00:13, 133.27it/s]
[Rank 1] Train Epoch 1:   8%|▊         | 163/2000 [00:01<00:12, 145.08it/s]
[Rank 2] Train Epoch 1:   8%|▊         | 163/2000 [00:01<00:13, 133.20it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 151 | Mem: 26.53MB, Util: 79%  global_step : 2151
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 152 | Mem: 26.53MB, Util: 79%  global_step : 2152
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 153 | Mem: 26.53MB, Util: 79%  global_step : 2153
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 154 | Mem: 26.53MB, Util: 79%  global_step : 2154
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 155 | Mem: 26.53MB, Util: 79%  global_step : 2155
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 156 | Mem: 26.53MB, Util: 79%  global_step : 2156
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 157 | Mem: 26.53MB, Util: 79%  global_step : 2157
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 158 | Mem: 26.53MB, Util: 79%  glo

[Rank 0] Train Epoch 1:   8%|▊         | 161/2000 [00:01<00:13, 136.26it/s]
[Rank 1] Train Epoch 1:   9%|▉         | 178/2000 [00:01<00:12, 146.21it/s]
[Rank 2] Train Epoch 1:   9%|▉         | 178/2000 [00:01<00:13, 137.31it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 182 | Mem: 26.53MB, Util: 100%  global_step : 2182
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 183 | Mem: 26.53MB, Util: 100%  global_step : 2183
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 184 | Mem: 26.53MB, Util: 100%  global_step : 2184
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 185 | Mem: 26.53MB, Util: 100%  global_step : 2185
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 186 | Mem: 26.53MB, Util: 100%  global_step : 2186
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 187 | Mem: 26.53MB, Util: 100%  global_step : 2187
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 188 | Mem: 26.53MB, Util: 100%  global_step : 2188
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 189 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 1:  10%|▉         | 193/2000 [00:01<00:30, 59.81it/s] 
[Rank 2] Train Epoch 1:  10%|▉         | 193/2000 [00:01<00:30, 58.52it/s] 
[Rank 0] Train Epoch 1:   9%|▉         | 175/2000 [00:01<00:37, 48.49it/s] 
[Rank 0] Train Epoch 1:  10%|▉         | 190/2000 [00:02<00:29, 60.77it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 186 | Mem: 26.53MB, Util: 0%  global_step : 2186
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 187 | Mem: 26.53MB, Util: 0%  global_step : 2187
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 188 | Mem: 26.53MB, Util: 0%  global_step : 2188
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 189 | Mem: 26.53MB, Util: 0%  global_step : 2189
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 190 | Mem: 26.53MB, Util: 0%  global_step : 2190
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 191 | Mem: 26.53MB, Util: 0%  global_step : 2191
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 192 | Mem: 26.53MB, Util: 0%  global_step : 2192
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 193 | Mem: 26.53MB, Util: 0%  global_step : 2193
[36m(Ra

[Rank 1] Train Epoch 1:  10%|█         | 204/2000 [00:02<00:32, 55.32it/s]
[Rank 2] Train Epoch 1:  10%|█         | 204/2000 [00:02<00:33, 54.17it/s]
[Rank 0] Train Epoch 1:  10%|█         | 205/2000 [00:02<00:24, 73.73it/s]
[Rank 1] Train Epoch 1:  11%|█         | 219/2000 [00:02<00:25, 68.89it/s]
[Rank 2] Train Epoch 1:  11%|█         | 217/2000 [00:02<00:27, 64.97it/s]
[Rank 0] Train Epoch 1:  11%|█         | 219/2000 [00:02<00:20, 85.16it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 215 | Mem: 26.53MB, Util: 43%  global_step : 2215
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 216 | Mem: 26.53MB, Util: 43%  global_step : 2216
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 217 | Mem: 26.53MB, Util: 43%  global_step : 2217
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 218 | Mem: 26.53MB, Util: 43%  global_step : 2218
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 219 | Mem: 26.53MB, Util: 43%  global_step : 2219
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 220 | Mem: 26.53MB, Util: 43%  global_step : 2220
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 221 | Mem: 26.53MB, Util: 43%  global_step : 2221
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 222 | Mem: 26.53MB, Util: 43%  global_step : 2222


[Rank 1] Train Epoch 1:  12%|█▏        | 234/2000 [00:02<00:21, 82.34it/s]
[Rank 2] Train Epoch 1:  12%|█▏        | 231/2000 [00:02<00:22, 77.07it/s]
[Rank 0] Train Epoch 1:  12%|█▏        | 233/2000 [00:02<00:18, 95.70it/s]
[Rank 1] Train Epoch 1:  12%|█▏        | 249/2000 [00:02<00:18, 94.66it/s]
[Rank 2] Train Epoch 1:  12%|█▏        | 246/2000 [00:02<00:19, 90.79it/s]
[Rank 0] Train Epoch 1:  12%|█▏        | 248/2000 [00:02<00:16, 106.28it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 245 | Mem: 26.53MB, Util: 81%  global_step : 2245
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 246 | Mem: 26.53MB, Util: 81%  global_step : 2246
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 247 | Mem: 26.53MB, Util: 81%  global_step : 2247
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 248 | Mem: 26.53MB, Util: 81%  global_step : 2248
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 249 | Mem: 26.53MB, Util: 81%  global_step : 2249
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 250 | Mem: 26.53MB, Util: 81%  global_step : 2250
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 251 | Mem: 26.53MB, Util: 100%  global_step : 2251
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 252 | Mem: 26.53MB, Util: 100%  global_step : 225

[Rank 1] Train Epoch 1:  13%|█▎        | 264/2000 [00:02<00:16, 105.69it/s]
[Rank 2] Train Epoch 1:  13%|█▎        | 261/2000 [00:02<00:16, 103.09it/s]
[Rank 0] Train Epoch 1:  13%|█▎        | 262/2000 [00:02<00:15, 112.82it/s]
[Rank 1] Train Epoch 1:  14%|█▍        | 278/2000 [00:02<00:15, 110.81it/s]
[Rank 2] Train Epoch 1:  14%|█▍        | 276/2000 [00:02<00:15, 113.47it/s]
[Rank 0] Train Epoch 1:  14%|█▍        | 277/2000 [00:02<00:14, 120.20it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 274 | Mem: 26.53MB, Util: 100%  global_step : 2274
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 275 | Mem: 26.53MB, Util: 100%  global_step : 2275
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 276 | Mem: 26.53MB, Util: 100%  global_step : 2276
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 277 | Mem: 26.53MB, Util: 100%  global_step : 2277
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 278 | Mem: 26.53MB, Util: 100%  global_step : 2278
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 279 | Mem: 26.53MB, Util: 100%  global_step : 2279
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 280 | Mem: 26.53MB, Util: 100%  global_step : 2280
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 281 | Mem: 26.53MB, Util: 100%  global_step

[Rank 1] Train Epoch 1:  15%|█▍        | 292/2000 [00:02<00:14, 116.94it/s]
[Rank 2] Train Epoch 1:  15%|█▍        | 291/2000 [00:02<00:14, 121.85it/s]
[Rank 0] Train Epoch 1:  15%|█▍        | 291/2000 [00:02<00:13, 125.24it/s]
[Rank 1] Train Epoch 1:  15%|█▌        | 306/2000 [00:02<00:13, 121.08it/s]
[Rank 2] Train Epoch 1:  15%|█▌        | 305/2000 [00:02<00:13, 125.43it/s]
[Rank 0] Train Epoch 1:  15%|█▌        | 306/2000 [00:02<00:13, 129.81it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 304 | Mem: 26.53MB, Util: 99%  global_step : 2304
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 305 | Mem: 26.53MB, Util: 99%  global_step : 2305
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 306 | Mem: 26.53MB, Util: 99%  global_step : 2306
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 307 | Mem: 26.53MB, Util: 99%  global_step : 2307
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 308 | Mem: 26.53MB, Util: 99%  global_step : 2308
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 309 | Mem: 26.53MB, Util: 72%  global_step : 2309
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 310 | Mem: 26.53MB, Util: 72%  global_step : 2310
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 311 | Mem: 26.53MB, Util: 72%  global_step : 2311


[Rank 2] Train Epoch 1:  16%|█▌        | 320/2000 [00:03<00:12, 131.34it/s]
[Rank 0] Train Epoch 1:  16%|█▌        | 321/2000 [00:02<00:12, 134.86it/s]
[Rank 1] Train Epoch 1:  16%|█▌        | 320/2000 [00:02<00:13, 123.65it/s]
[Rank 2] Train Epoch 1:  17%|█▋        | 335/2000 [00:03<00:12, 135.46it/s]
[Rank 0] Train Epoch 1:  17%|█▋        | 336/2000 [00:03<00:12, 137.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 334 | Mem: 26.53MB, Util: 72%  global_step : 2334
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 335 | Mem: 26.53MB, Util: 72%  global_step : 2335
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 336 | Mem: 26.53MB, Util: 72%  global_step : 2336
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 337 | Mem: 26.53MB, Util: 72%  global_step : 2337
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 338 | Mem: 26.53MB, Util: 72%  global_step : 2338
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 339 | Mem: 26.53MB, Util: 72%  global_step : 2339
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 340 | Mem: 26.53MB, Util: 72%  global_step : 2340
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 341 | Mem: 26.53MB, Util: 98%  global_step : 2341


[Rank 1] Train Epoch 1:  17%|█▋        | 335/2000 [00:03<00:12, 128.63it/s]
[Rank 2] Train Epoch 1:  18%|█▊        | 350/2000 [00:03<00:11, 138.66it/s]
[Rank 0] Train Epoch 1:  18%|█▊        | 351/2000 [00:03<00:11, 137.99it/s]
[Rank 1] Train Epoch 1:  18%|█▊        | 350/2000 [00:03<00:12, 132.87it/s]
[Rank 2] Train Epoch 1:  18%|█▊        | 365/2000 [00:03<00:11, 141.28it/s]
[Rank 0] Train Epoch 1:  18%|█▊        | 366/2000 [00:03<00:11, 138.81it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 364 | Mem: 26.53MB, Util: 98%  global_step : 2364
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 365 | Mem: 26.53MB, Util: 98%  global_step : 2365
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 366 | Mem: 26.53MB, Util: 98%  global_step : 2366
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 367 | Mem: 26.53MB, Util: 98%  global_step : 2367
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 368 | Mem: 26.53MB, Util: 100%  global_step : 2368
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 369 | Mem: 26.53MB, Util: 100%  global_step : 2369
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 370 | Mem: 26.53MB, Util: 100%  global_step : 2370
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 371 | Mem: 26.53MB, Util: 100%  global_step : 2

[Rank 1] Train Epoch 1:  18%|█▊        | 366/2000 [00:03<00:11, 138.01it/s]
[Rank 2] Train Epoch 1:  19%|█▉        | 380/2000 [00:03<00:11, 143.26it/s]
[Rank 0] Train Epoch 1:  19%|█▉        | 381/2000 [00:03<00:11, 139.18it/s]
[Rank 1] Train Epoch 1:  19%|█▉        | 381/2000 [00:03<00:11, 139.62it/s]
[Rank 2] Train Epoch 1:  20%|█▉        | 395/2000 [00:03<00:11, 144.09it/s]
[Rank 0] Train Epoch 1:  20%|█▉        | 396/2000 [00:03<00:11, 136.77it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 394 | Mem: 26.53MB, Util: 100%  global_step : 2394
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 395 | Mem: 26.53MB, Util: 100%  global_step : 2395
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 396 | Mem: 26.53MB, Util: 100%  global_step : 2396
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 397 | Mem: 26.53MB, Util: 100%  global_step : 2397
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 398 | Mem: 26.53MB, Util: 100%  global_step : 2398
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 399 | Mem: 26.53MB, Util: 100%  global_step : 2399
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 400 | Mem: 26.53MB, Util: 100%  global_step : 2400
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 401 | Mem: 26.53MB, Util: 100%  global_step

[Rank 1] Train Epoch 1:  20%|█▉        | 396/2000 [00:03<00:11, 140.57it/s]
[Rank 2] Train Epoch 1:  20%|██        | 410/2000 [00:03<00:12, 132.44it/s]
[Rank 0] Train Epoch 1:  20%|██        | 410/2000 [00:03<00:11, 136.58it/s]
[Rank 1] Train Epoch 1:  21%|██        | 411/2000 [00:03<00:11, 138.34it/s]
[Rank 2] Train Epoch 1:  21%|██▏       | 425/2000 [00:03<00:11, 136.44it/s]
[Rank 0] Train Epoch 1:  21%|██▏       | 425/2000 [00:03<00:11, 137.73it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 422 | Mem: 26.53MB, Util: 100%  global_step : 2422
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 423 | Mem: 26.53MB, Util: 100%  global_step : 2423
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 424 | Mem: 26.53MB, Util: 100%  global_step : 2424
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 425 | Mem: 26.53MB, Util: 81%  global_step : 2425
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 426 | Mem: 26.53MB, Util: 81%  global_step : 2426
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 427 | Mem: 26.53MB, Util: 81%  global_step : 2427
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 428 | Mem: 26.53MB, Util: 81%  global_step : 2428
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 429 | Mem: 26.53MB, Util: 81%  global_step : 24

[Rank 1] Train Epoch 1:  21%|██▏       | 425/2000 [00:03<00:11, 137.60it/s]
[Rank 2] Train Epoch 1:  22%|██▏       | 440/2000 [00:03<00:11, 139.17it/s]
[Rank 0] Train Epoch 1:  22%|██▏       | 440/2000 [00:03<00:11, 139.03it/s]
[Rank 1] Train Epoch 1:  22%|██▏       | 440/2000 [00:03<00:11, 139.35it/s]
[Rank 2] Train Epoch 1:  23%|██▎       | 455/2000 [00:03<00:10, 141.25it/s]
[Rank 0] Train Epoch 1:  23%|██▎       | 455/2000 [00:03<00:11, 140.35it/s]
[Rank 1] Train Epoch 1:  23%|██▎       | 455/2000 [00:03<00:11, 140.39it/s]
[Rank 2] Train Epoch 1:  24%|██▎       | 470/2000 [00:04<00:10, 142.25it/s]
[Rank 0] Train Epoch 1:  24%|██▎       | 470/2000 [00:04<00:10, 142.69it/s]
[Rank 1] Train Epoch 1:  24%|██▎       | 471/2000 [00:03<00:10, 144.25it/s]
[Rank 2] Train Epoch 1:  24%|██▍       | 485/2000 [00:04<00:10, 144.25it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 452 | Mem: 26.53MB, Util: 81%  global_step : 2452
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 453 | Mem: 26.53MB, Util: 81%  global_step : 2453
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 454 | Mem: 26.53MB, Util: 81%  global_step : 2454
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 455 | Mem: 26.53MB, Util: 95%  global_step : 2455
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 456 | Mem: 26.53MB, Util: 95%  global_step : 2456
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 457 | Mem: 26.53MB, Util: 95%  global_step : 2457
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 458 | Mem: 26.53MB, Util: 95%  global_step : 2458
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 459 | Mem: 26.53MB, Util: 95%  global_step : 2459


[Rank 0] Train Epoch 1:  24%|██▍       | 486/2000 [00:04<00:10, 146.01it/s]
[Rank 1] Train Epoch 1:  24%|██▍       | 486/2000 [00:04<00:10, 145.86it/s]
[Rank 2] Train Epoch 1:  25%|██▌       | 500/2000 [00:04<00:10, 144.93it/s]
[Rank 0] Train Epoch 1:  25%|██▌       | 501/2000 [00:04<00:10, 146.62it/s]
[Rank 1] Train Epoch 1:  25%|██▌       | 501/2000 [00:04<00:10, 146.39it/s]
[Rank 2] Train Epoch 1:  26%|██▌       | 515/2000 [00:04<00:10, 142.23it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 483 | Mem: 26.53MB, Util: 90%  global_step : 2483
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 484 | Mem: 26.53MB, Util: 90%  global_step : 2484
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 485 | Mem: 26.53MB, Util: 90%  global_step : 2485
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 486 | Mem: 26.53MB, Util: 90%  global_step : 2486
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 487 | Mem: 26.53MB, Util: 90%  global_step : 2487
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 488 | Mem: 26.53MB, Util: 90%  global_step : 2488
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 489 | Mem: 26.53MB, Util: 90%  global_step : 2489
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 490 | Mem: 26.53MB, Util: 90%  global_step : 2490


[Rank 0] Train Epoch 1:  26%|██▌       | 516/2000 [00:04<00:10, 146.95it/s]
[Rank 1] Train Epoch 1:  26%|██▌       | 516/2000 [00:04<00:10, 146.49it/s]
[Rank 2] Train Epoch 1:  26%|██▋       | 530/2000 [00:04<00:10, 144.06it/s]
[Rank 0] Train Epoch 1:  27%|██▋       | 531/2000 [00:04<00:09, 147.18it/s]
[Rank 1] Train Epoch 1:  27%|██▋       | 531/2000 [00:04<00:10, 146.76it/s]
[Rank 0] Train Epoch 1:  27%|██▋       | 546/2000 [00:04<00:09, 147.83it/s]
[Rank 2] Train Epoch 1:  27%|██▋       | 545/2000 [00:04<00:10, 144.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 515 | Mem: 26.53MB, Util: 85%  global_step : 2515
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 516 | Mem: 26.53MB, Util: 85%  global_step : 2516
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 517 | Mem: 26.53MB, Util: 85%  global_step : 2517
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 518 | Mem: 26.53MB, Util: 85%  global_step : 2518
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 519 | Mem: 26.53MB, Util: 85%  global_step : 2519
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 520 | Mem: 26.53MB, Util: 85%  global_step : 2520
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 521 | Mem: 26.53MB, Util: 85%  global_step : 2521
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 522 | Mem: 26.53MB, Util: 85%  global_step : 2522


[Rank 1] Train Epoch 1:  27%|██▋       | 546/2000 [00:04<00:09, 146.67it/s]
[Rank 0] Train Epoch 1:  28%|██▊       | 562/2000 [00:04<00:09, 149.90it/s]
[Rank 1] Train Epoch 1:  28%|██▊       | 561/2000 [00:04<00:09, 146.55it/s]
[Rank 2] Train Epoch 1:  28%|██▊       | 560/2000 [00:04<00:09, 145.18it/s]
[Rank 0] Train Epoch 1:  29%|██▉       | 578/2000 [00:04<00:09, 150.59it/s]
[Rank 1] Train Epoch 1:  29%|██▉       | 576/2000 [00:04<00:09, 146.25it/s]
[Rank 2] Train Epoch 1:  29%|██▉       | 575/2000 [00:04<00:09, 145.14it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 545 | Mem: 26.53MB, Util: 71%  global_step : 2545
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 546 | Mem: 26.53MB, Util: 71%  global_step : 2546
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 547 | Mem: 26.53MB, Util: 70%  global_step : 2547
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 548 | Mem: 26.53MB, Util: 70%  global_step : 2548
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 549 | Mem: 26.53MB, Util: 70%  global_step : 2549
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 550 | Mem: 26.53MB, Util: 70%  global_step : 2550
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 551 | Mem: 26.53MB, Util: 70%  global_step : 2551
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 552 | Mem: 26.53MB, Util: 70%  glo

[Rank 1] Train Epoch 1:  30%|██▉       | 591/2000 [00:04<00:09, 146.41it/s]
[Rank 0] Train Epoch 1:  30%|██▉       | 594/2000 [00:04<00:09, 150.61it/s]
[Rank 2] Train Epoch 1:  30%|██▉       | 590/2000 [00:04<00:09, 146.02it/s]
[Rank 1] Train Epoch 1:  30%|███       | 606/2000 [00:04<00:09, 146.15it/s]
[Rank 2] Train Epoch 1:  30%|███       | 605/2000 [00:05<00:09, 145.96it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 576 | Mem: 26.53MB, Util: 70%  global_step : 2576
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 577 | Mem: 26.53MB, Util: 70%  global_step : 2577
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 578 | Mem: 26.53MB, Util: 70%  global_step : 2578
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 579 | Mem: 26.53MB, Util: 70%  global_step : 2579
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 580 | Mem: 26.53MB, Util: 70%  global_step : 2580
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 581 | Mem: 26.53MB, Util: 70%  global_step : 2581
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 582 | Mem: 26.53MB, Util: 70%  global_step : 2582
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 583 | Mem: 26.53MB, Util: 70%  glo

[Rank 1] Train Epoch 1:  31%|███       | 621/2000 [00:04<00:09, 144.94it/s]
[Rank 0] Train Epoch 1:  30%|███       | 610/2000 [00:05<00:09, 140.29it/s]
[Rank 2] Train Epoch 1:  31%|███       | 620/2000 [00:05<00:09, 145.86it/s]
[Rank 0] Train Epoch 1:  31%|███▏      | 625/2000 [00:05<00:09, 139.12it/s]
[Rank 2] Train Epoch 1:  32%|███▏      | 635/2000 [00:05<00:09, 145.81it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 606 | Mem: 26.53MB, Util: 71%  global_step : 2606
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 607 | Mem: 26.53MB, Util: 71%  global_step : 2607
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 608 | Mem: 26.53MB, Util: 71%  global_step : 2608
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 609 | Mem: 26.53MB, Util: 71%  global_step : 2609
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 610 | Mem: 26.53MB, Util: 71%  global_step : 2610
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 611 | Mem: 26.53MB, Util: 71%  global_step : 2611
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 612 | Mem: 26.53MB, Util: 71%  global_step : 2612
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 613 | Mem: 26.53MB, Util: 71%  glo

[Rank 1] Train Epoch 1:  32%|███▏      | 636/2000 [00:05<00:09, 137.00it/s]
[Rank 0] Train Epoch 1:  32%|███▏      | 640/2000 [00:05<00:09, 137.42it/s]
[Rank 2] Train Epoch 1:  32%|███▎      | 650/2000 [00:05<00:09, 144.47it/s]
[Rank 1] Train Epoch 1:  33%|███▎      | 651/2000 [00:05<00:09, 138.83it/s]
[Rank 0] Train Epoch 1:  33%|███▎      | 654/2000 [00:05<00:09, 136.13it/s]
[Rank 2] Train Epoch 1:  33%|███▎      | 665/2000 [00:05<00:09, 145.04it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 633 | Mem: 26.53MB, Util: 71%  global_step : 2633
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 634 | Mem: 26.53MB, Util: 71%  global_step : 2634
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 635 | Mem: 26.53MB, Util: 71%  global_step : 2635
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 636 | Mem: 26.53MB, Util: 71%  global_step : 2636
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 637 | Mem: 26.53MB, Util: 87%  global_step : 2637
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 638 | Mem: 26.53MB, Util: 87%  global_step : 2638
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 639 | Mem: 26.53MB, Util: 87%  global_step : 2639
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 640 | Mem: 26.53MB, Util: 87%  glo

[Rank 1] Train Epoch 1:  33%|███▎      | 666/2000 [00:05<00:09, 139.91it/s]
[Rank 0] Train Epoch 1:  33%|███▎      | 668/2000 [00:05<00:09, 135.56it/s]
[Rank 2] Train Epoch 1:  34%|███▍      | 680/2000 [00:05<00:09, 145.02it/s]
[Rank 1] Train Epoch 1:  34%|███▍      | 681/2000 [00:05<00:09, 136.43it/s]
[Rank 0] Train Epoch 1:  34%|███▍      | 682/2000 [00:05<00:10, 129.81it/s]
[Rank 2] Train Epoch 1:  35%|███▍      | 695/2000 [00:05<00:08, 145.74it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 663 | Mem: 26.53MB, Util: 74%  global_step : 2663
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 664 | Mem: 26.53MB, Util: 74%  global_step : 2664
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 665 | Mem: 26.53MB, Util: 74%  global_step : 2665
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 666 | Mem: 26.53MB, Util: 74%  global_step : 2666
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 667 | Mem: 26.53MB, Util: 74%  global_step : 2667
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 668 | Mem: 26.53MB, Util: 74%  global_step : 2668
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 669 | Mem: 26.53MB, Util: 74%  global_step : 2669
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 670 | Mem: 26.53MB, Util: 74%  glo

[Rank 1] Train Epoch 1:  35%|███▍      | 695/2000 [00:05<00:09, 135.36it/s]
[Rank 0] Train Epoch 1:  35%|███▍      | 697/2000 [00:05<00:09, 134.54it/s]
[Rank 1] Train Epoch 1:  35%|███▌      | 709/2000 [00:05<00:09, 131.99it/s]
[Rank 2] Train Epoch 1:  36%|███▌      | 710/2000 [00:05<00:10, 124.19it/s]
[Rank 0] Train Epoch 1:  36%|███▌      | 711/2000 [00:05<00:09, 134.40it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 691 | Mem: 26.53MB, Util: 100%  global_step : 2691
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 692 | Mem: 26.53MB, Util: 100%  global_step : 2692
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 693 | Mem: 26.53MB, Util: 100%  global_step : 2693
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 694 | Mem: 26.53MB, Util: 100%  global_step : 2694
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 695 | Mem: 26.53MB, Util: 100%  global_step : 2695
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 696 | Mem: 26.53MB, Util: 100%  global_step : 2696
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 697 | Mem: 26.53MB, Util: 100%  global_step : 2697
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 698 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 1:  36%|███▌      | 723/2000 [00:05<00:09, 129.78it/s]
[Rank 2] Train Epoch 1:  36%|███▋      | 725/2000 [00:05<00:09, 130.48it/s]
[Rank 0] Train Epoch 1:  36%|███▋      | 725/2000 [00:05<00:09, 134.97it/s]
[Rank 1] Train Epoch 1:  37%|███▋      | 737/2000 [00:05<00:09, 130.26it/s]
[Rank 2] Train Epoch 1:  37%|███▋      | 740/2000 [00:05<00:09, 134.64it/s]
[Rank 0] Train Epoch 1:  37%|███▋      | 739/2000 [00:05<00:09, 133.61it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 717 | Mem: 26.53MB, Util: 97%  global_step : 2717
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 718 | Mem: 26.53MB, Util: 97%  global_step : 2718
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 719 | Mem: 26.53MB, Util: 97%  global_step : 2719
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 720 | Mem: 26.53MB, Util: 97%  global_step : 2720
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 721 | Mem: 26.53MB, Util: 97%  global_step : 2721
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 722 | Mem: 26.53MB, Util: 97%  global_step : 2722
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 723 | Mem: 26.53MB, Util: 97%  global_step : 2723
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 724 | Mem: 26.53MB, Util: 97%  glo

[Rank 1] Train Epoch 1:  38%|███▊      | 751/2000 [00:05<00:09, 130.39it/s]
[Rank 2] Train Epoch 1:  38%|███▊      | 755/2000 [00:06<00:08, 138.36it/s]
[Rank 0] Train Epoch 1:  38%|███▊      | 753/2000 [00:06<00:09, 132.03it/s]
[Rank 1] Train Epoch 1:  38%|███▊      | 765/2000 [00:06<00:09, 128.35it/s]
[Rank 2] Train Epoch 1:  38%|███▊      | 770/2000 [00:06<00:08, 140.93it/s]
[Rank 0] Train Epoch 1:  38%|███▊      | 767/2000 [00:06<00:09, 132.84it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 744 | Mem: 26.53MB, Util: 64%  global_step : 2744
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 745 | Mem: 26.53MB, Util: 64%  global_step : 2745
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 746 | Mem: 26.53MB, Util: 64%  global_step : 2746
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 747 | Mem: 26.53MB, Util: 64%  global_step : 2747
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 748 | Mem: 26.53MB, Util: 64%  global_step : 2748
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 749 | Mem: 26.53MB, Util: 64%  global_step : 2749
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 750 | Mem: 26.53MB, Util: 64%  global_step : 2750
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 751 | Mem: 26.53MB, Util: 64%  glo

[Rank 1] Train Epoch 1:  39%|███▉      | 778/2000 [00:06<00:09, 128.76it/s]
[Rank 2] Train Epoch 1:  39%|███▉      | 785/2000 [00:06<00:08, 138.95it/s]
[Rank 0] Train Epoch 1:  39%|███▉      | 781/2000 [00:06<00:09, 132.49it/s]
[Rank 1] Train Epoch 1:  40%|███▉      | 791/2000 [00:06<00:09, 128.43it/s]
[Rank 2] Train Epoch 1:  40%|████      | 800/2000 [00:06<00:08, 140.99it/s]
[Rank 0] Train Epoch 1:  40%|███▉      | 795/2000 [00:06<00:09, 132.98it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 771 | Mem: 26.53MB, Util: 63%  global_step : 2771
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 772 | Mem: 26.53MB, Util: 63%  global_step : 2772
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 773 | Mem: 26.53MB, Util: 63%  global_step : 2773
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 774 | Mem: 26.53MB, Util: 63%  global_step : 2774
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 775 | Mem: 26.53MB, Util: 63%  global_step : 2775
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 776 | Mem: 26.53MB, Util: 63%  global_step : 2776
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 777 | Mem: 26.53MB, Util: 63%  global_step : 2777
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 778 | Mem: 26.53MB, Util: 63%  glo

[Rank 1] Train Epoch 1:  40%|████      | 804/2000 [00:06<00:09, 128.55it/s]
[Rank 0] Train Epoch 1:  40%|████      | 809/2000 [00:06<00:09, 124.68it/s]
[Rank 1] Train Epoch 1:  41%|████      | 818/2000 [00:06<00:09, 129.31it/s]
[Rank 2] Train Epoch 1:  41%|████      | 815/2000 [00:06<00:10, 109.55it/s]
[Rank 0] Train Epoch 1:  41%|████      | 823/2000 [00:06<00:09, 127.48it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 797 | Mem: 26.53MB, Util: 62%  global_step : 2797
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 798 | Mem: 26.53MB, Util: 62%  global_step : 2798
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 799 | Mem: 26.53MB, Util: 62%  global_step : 2799
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 800 | Mem: 26.53MB, Util: 62%  global_step : 2800
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 801 | Mem: 26.53MB, Util: 62%  global_step : 2801
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 802 | Mem: 26.53MB, Util: 62%  global_step : 2802
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 803 | Mem: 26.53MB, Util: 62%  global_step : 2803
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 804 | Mem: 26.53MB, Util: 62%  glo

[Rank 1] Train Epoch 1:  42%|████▏     | 832/2000 [00:06<00:08, 131.82it/s]
[Rank 2] Train Epoch 1:  41%|████▏     | 829/2000 [00:06<00:10, 116.21it/s]
[Rank 0] Train Epoch 1:  42%|████▏     | 837/2000 [00:06<00:08, 129.75it/s]
[Rank 1] Train Epoch 1:  42%|████▏     | 846/2000 [00:06<00:08, 130.67it/s]
[Rank 2] Train Epoch 1:  42%|████▏     | 843/2000 [00:06<00:09, 121.38it/s]
[Rank 0] Train Epoch 1:  43%|████▎     | 851/2000 [00:06<00:08, 130.98it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 825 | Mem: 26.53MB, Util: 62%  global_step : 2825
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 826 | Mem: 26.53MB, Util: 62%  global_step : 2826
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 827 | Mem: 26.53MB, Util: 62%  global_step : 2827
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 828 | Mem: 26.53MB, Util: 62%  global_step : 2828
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 829 | Mem: 26.53MB, Util: 62%  global_step : 2829
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 830 | Mem: 26.53MB, Util: 62%  global_step : 2830
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 831 | Mem: 26.53MB, Util: 62%  global_step : 2831
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 832 | Mem: 26.53MB, Util: 62%  glo

[Rank 1] Train Epoch 1:  43%|████▎     | 860/2000 [00:06<00:08, 130.42it/s]
[Rank 2] Train Epoch 1:  43%|████▎     | 857/2000 [00:06<00:09, 125.22it/s]
[Rank 0] Train Epoch 1:  43%|████▎     | 865/2000 [00:06<00:08, 131.23it/s]
[Rank 1] Train Epoch 1:  44%|████▎     | 874/2000 [00:06<00:09, 124.87it/s]
[Rank 2] Train Epoch 1:  44%|████▎     | 871/2000 [00:07<00:08, 127.87it/s]
[Rank 0] Train Epoch 1:  44%|████▍     | 879/2000 [00:07<00:08, 132.82it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 852 | Mem: 26.53MB, Util: 88%  global_step : 2852
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 853 | Mem: 26.53MB, Util: 88%  global_step : 2853
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 854 | Mem: 26.53MB, Util: 88%  global_step : 2854
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 855 | Mem: 26.53MB, Util: 88%  global_step : 2855
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 856 | Mem: 26.53MB, Util: 88%  global_step : 2856
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 857 | Mem: 26.53MB, Util: 88%  global_step : 2857
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 858 | Mem: 26.53MB, Util: 88%  global_step : 2858
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 859 | Mem: 26.53MB, Util: 88%  glo

[Rank 1] Train Epoch 1:  44%|████▍     | 889/2000 [00:07<00:08, 131.37it/s]
[Rank 2] Train Epoch 1:  44%|████▍     | 885/2000 [00:07<00:08, 129.78it/s]
[Rank 0] Train Epoch 1:  45%|████▍     | 893/2000 [00:07<00:08, 132.94it/s]
[Rank 1] Train Epoch 1:  45%|████▌     | 904/2000 [00:07<00:08, 135.74it/s]
[Rank 2] Train Epoch 1:  45%|████▍     | 899/2000 [00:07<00:08, 130.90it/s]
[Rank 0] Train Epoch 1:  45%|████▌     | 907/2000 [00:07<00:08, 128.74it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 878 | Mem: 26.53MB, Util: 73%  global_step : 2878
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 879 | Mem: 26.53MB, Util: 73%  global_step : 2879
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 880 | Mem: 26.53MB, Util: 73%  global_step : 2880
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 881 | Mem: 26.53MB, Util: 73%  global_step : 2881
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 882 | Mem: 26.53MB, Util: 73%  global_step : 2882
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 883 | Mem: 26.53MB, Util: 73%  global_step : 2883
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 884 | Mem: 26.53MB, Util: 73%  global_step : 2884
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 885 | Mem: 26.53MB, Util: 73%  glo

[Rank 1] Train Epoch 1:  46%|████▌     | 918/2000 [00:07<00:07, 136.81it/s]
[Rank 2] Train Epoch 1:  46%|████▌     | 913/2000 [00:07<00:08, 129.74it/s]
[Rank 0] Train Epoch 1:  46%|████▌     | 922/2000 [00:07<00:08, 132.80it/s]
[Rank 1] Train Epoch 1:  47%|████▋     | 933/2000 [00:07<00:07, 138.85it/s]
[Rank 2] Train Epoch 1:  46%|████▋     | 927/2000 [00:07<00:08, 131.01it/s]
[Rank 0] Train Epoch 1:  47%|████▋     | 937/2000 [00:07<00:07, 135.36it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 908 | Mem: 26.53MB, Util: 58%  global_step : 2908
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 909 | Mem: 26.53MB, Util: 58%  global_step : 2909
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 910 | Mem: 26.53MB, Util: 58%  global_step : 2910
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 911 | Mem: 26.53MB, Util: 58%  global_step : 2911
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 912 | Mem: 26.53MB, Util: 58%  global_step : 2912
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 913 | Mem: 26.53MB, Util: 58%  global_step : 2913
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 914 | Mem: 26.53MB, Util: 58%  global_step : 2914
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 915 | Mem: 26.53MB, Util: 58%  glo

[Rank 1] Train Epoch 1:  47%|████▋     | 948/2000 [00:07<00:07, 140.19it/s]
[Rank 2] Train Epoch 1:  47%|████▋     | 941/2000 [00:07<00:08, 131.84it/s]
[Rank 0] Train Epoch 1:  48%|████▊     | 951/2000 [00:07<00:07, 136.47it/s]
[Rank 2] Train Epoch 1:  48%|████▊     | 955/2000 [00:07<00:07, 133.05it/s]
[Rank 1] Train Epoch 1:  48%|████▊     | 963/2000 [00:07<00:07, 140.60it/s]
[Rank 0] Train Epoch 1:  48%|████▊     | 966/2000 [00:07<00:07, 137.93it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 938 | Mem: 26.53MB, Util: 80%  global_step : 2938
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 939 | Mem: 26.53MB, Util: 80%  global_step : 2939
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 940 | Mem: 26.53MB, Util: 80%  global_step : 2940
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 941 | Mem: 26.53MB, Util: 80%  global_step : 2941
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 942 | Mem: 26.53MB, Util: 80%  global_step : 2942
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 943 | Mem: 26.53MB, Util: 80%  global_step : 2943
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 944 | Mem: 26.53MB, Util: 80%  global_step : 2944
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 945 | Mem: 26.53MB, Util: 80%  glo

[Rank 2] Train Epoch 1:  48%|████▊     | 969/2000 [00:07<00:07, 134.42it/s]
[Rank 1] Train Epoch 1:  49%|████▉     | 978/2000 [00:07<00:07, 141.09it/s]
[Rank 0] Train Epoch 1:  49%|████▉     | 981/2000 [00:07<00:07, 139.01it/s]
[Rank 2] Train Epoch 1:  49%|████▉     | 983/2000 [00:07<00:07, 134.77it/s]
[Rank 1] Train Epoch 1:  50%|████▉     | 993/2000 [00:07<00:07, 142.81it/s]
[Rank 0] Train Epoch 1:  50%|████▉     | 996/2000 [00:07<00:07, 140.09it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 964 | Mem: 26.53MB, Util: 65%  global_step : 2964
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 965 | Mem: 26.53MB, Util: 65%  global_step : 2965
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 966 | Mem: 26.53MB, Util: 65%  global_step : 2966
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 967 | Mem: 26.53MB, Util: 65%  global_step : 2967
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 968 | Mem: 26.53MB, Util: 65%  global_step : 2968
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 969 | Mem: 26.53MB, Util: 65%  global_step : 2969
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 970 | Mem: 26.53MB, Util: 65%  global_step : 2970
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 971 | Mem: 26.53MB, Util: 65%  global_step

[Rank 2] Train Epoch 1:  50%|████▉     | 997/2000 [00:07<00:07, 134.87it/s]
[Rank 1] Train Epoch 1:  50%|█████     | 1008/2000 [00:07<00:07, 128.68it/s]
[Rank 2] Train Epoch 1:  51%|█████     | 1011/2000 [00:08<00:07, 135.39it/s]
[Rank 0] Train Epoch 1:  51%|█████     | 1011/2000 [00:08<00:07, 130.63it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 998 | Mem: 26.53MB, Util: 100%  global_step : 2998
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 999 | Mem: 26.53MB, Util: 100%  global_step : 2999
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1000 | Mem: 26.53MB, Util: 100%  global_step : 3000
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1001 | Mem: 26.53MB, Util: 100%  global_step : 3001
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1002 | Mem: 26.53MB, Util: 100%  global_step : 3002
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1003 | Mem: 26.53MB, Util: 100%  global_step : 3003
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1004 | Mem: 26.53MB, Util: 100%  global_step : 3004
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1005 | Mem: 26.53MB, U

[Rank 1] Train Epoch 1:  51%|█████     | 1023/2000 [00:08<00:07, 133.36it/s]
[Rank 2] Train Epoch 1:  51%|█████▏    | 1025/2000 [00:08<00:07, 136.09it/s]
[Rank 0] Train Epoch 1:  51%|█████▏    | 1025/2000 [00:08<00:07, 132.80it/s]
[Rank 1] Train Epoch 1:  52%|█████▏    | 1038/2000 [00:08<00:07, 135.78it/s]
[Rank 2] Train Epoch 1:  52%|█████▏    | 1039/2000 [00:08<00:07, 135.68it/s]
[Rank 0] Train Epoch 1:  52%|█████▏    | 1043/2000 [00:08<00:06, 145.14it/s]
[Rank 0] Train Epoch 1:  53%|█████▎    | 1061/2000 [00:08<00:06, 154.18it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1022 | Mem: 26.53MB, Util: 100%  global_step : 3022
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1023 | Mem: 26.53MB, Util: 100%  global_step : 3023
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1024 | Mem: 26.53MB, Util: 100%  global_step : 3024
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1025 | Mem: 26.53MB, Util: 100%  global_step : 3025
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1026 | Mem: 26.53MB, Util: 100%  global_step : 3026
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1027 | Mem: 26.53MB, Util: 100%  global_step : 3027
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1028 | Mem: 26.53MB, Util: 100%  global_step : 3028
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 1, Batch 1029 | Mem: 26.53MB,

[Rank 2] Train Epoch 1:  53%|█████▎    | 1053/2000 [00:08<00:06, 135.67it/s]
[Rank 1] Train Epoch 1:  53%|█████▎    | 1053/2000 [00:08<00:06, 137.29it/s]
[Rank 2] Train Epoch 1:  53%|█████▎    | 1067/2000 [00:08<00:06, 135.12it/s]
[Rank 1] Train Epoch 1:  53%|█████▎    | 1068/2000 [00:08<00:06, 138.50it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1049 | Mem: 26.53MB, Util: 65%  global_step : 3049
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1050 | Mem: 26.53MB, Util: 65%  global_step : 3050
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1051 | Mem: 26.53MB, Util: 65%  global_step : 3051
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1052 | Mem: 26.53MB, Util: 65%  global_step : 3052
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1053 | Mem: 26.53MB, Util: 65%  global_step : 3053
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1054 | Mem: 26.53MB, Util: 65%  global_step : 3054
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1055 | Mem: 26.53MB, Util: 65%  global_step : 3055
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1056 | Mem: 26.53MB, Util: 65%  glo

[Rank 2] Train Epoch 1:  54%|█████▍    | 1081/2000 [00:08<00:06, 135.42it/s]
[Rank 1] Train Epoch 1:  54%|█████▍    | 1083/2000 [00:08<00:06, 140.26it/s]
[Rank 2] Train Epoch 1:  55%|█████▍    | 1095/2000 [00:08<00:06, 136.72it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1078 | Mem: 26.53MB, Util: 66%  global_step : 3078
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1079 | Mem: 26.53MB, Util: 66%  global_step : 3079
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1080 | Mem: 26.53MB, Util: 66%  global_step : 3080
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1081 | Mem: 26.53MB, Util: 66%  global_step : 3081
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1082 | Mem: 26.53MB, Util: 66%  global_step : 3082
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1083 | Mem: 26.53MB, Util: 66%  global_step : 3083
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1084 | Mem: 26.53MB, Util: 66%  global_step : 3084
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 1, Batch 1085 | Mem: 26.53MB, Util: 66%  glo

[Rank 1] Train Epoch 1:  55%|█████▍    | 1098/2000 [00:09<00:25, 35.87it/s] 
[Rank 0] Train Epoch 1:  54%|█████▍    | 1077/2000 [00:09<00:27, 33.75it/s] 
[Rank 0] Train Epoch 1:  55%|█████▍    | 1092/2000 [00:09<00:21, 43.08it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1088 | Mem: 26.53MB, Util: 3%  global_step : 3088
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1089 | Mem: 26.53MB, Util: 3%  global_step : 3089
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1090 | Mem: 26.53MB, Util: 3%  global_step : 3090
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1091 | Mem: 26.53MB, Util: 3%  global_step : 3091
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1092 | Mem: 26.53MB, Util: 3%  global_step : 3092
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1093 | Mem: 26.53MB, Util: 3%  global_step : 3093
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1094 | Mem: 26.53MB, Util: 3%  global_step : 3094
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1095 | Mem: 26.53MB, Util: 3%  global_step : 3095


[Rank 1] Train Epoch 1:  55%|█████▌    | 1109/2000 [00:09<00:23, 37.62it/s]
[Rank 2] Train Epoch 1:  55%|█████▌    | 1109/2000 [00:09<00:28, 30.94it/s] 
[Rank 0] Train Epoch 1:  55%|█████▌    | 1107/2000 [00:09<00:16, 54.10it/s]
[Rank 1] Train Epoch 1:  56%|█████▌    | 1123/2000 [00:09<00:18, 48.02it/s]
[Rank 2] Train Epoch 1:  56%|█████▌    | 1123/2000 [00:10<00:21, 40.25it/s]
[Rank 0] Train Epoch 1:  56%|█████▌    | 1124/2000 [00:10<00:12, 68.67it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1120 | Mem: 26.53MB, Util: 70%  global_step : 3120
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1121 | Mem: 26.53MB, Util: 70%  global_step : 3121
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1122 | Mem: 26.53MB, Util: 70%  global_step : 3122
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1123 | Mem: 26.53MB, Util: 70%  global_step : 3123
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1124 | Mem: 26.53MB, Util: 70%  global_step : 3124
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1125 | Mem: 26.53MB, Util: 70%  global_step : 3125
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1126 | Mem: 26.53MB, Util: 70%  global_step : 3126
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1127 | Mem: 26.53MB, Util: 70%  global_step

[Rank 1] Train Epoch 1:  57%|█████▋    | 1137/2000 [00:10<00:14, 59.61it/s]
[Rank 2] Train Epoch 1:  57%|█████▋    | 1137/2000 [00:10<00:16, 51.13it/s]
[Rank 0] Train Epoch 1:  57%|█████▋    | 1138/2000 [00:10<00:11, 77.99it/s]
[Rank 1] Train Epoch 1:  58%|█████▊    | 1151/2000 [00:10<00:11, 71.65it/s]
[Rank 2] Train Epoch 1:  58%|█████▊    | 1151/2000 [00:10<00:13, 62.79it/s]
[Rank 0] Train Epoch 1:  58%|█████▊    | 1152/2000 [00:10<00:09, 88.53it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1148 | Mem: 26.53MB, Util: 99%  global_step : 3148
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1149 | Mem: 26.53MB, Util: 99%  global_step : 3149
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1150 | Mem: 26.53MB, Util: 99%  global_step : 3150
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1151 | Mem: 26.53MB, Util: 99%  global_step : 3151
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1152 | Mem: 26.53MB, Util: 99%  global_step : 3152
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1153 | Mem: 26.53MB, Util: 99%  global_step : 3153
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1154 | Mem: 26.53MB, Util: 99%  global_step : 3154
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1155 | Mem: 26.53MB, Util: 99%  global_step

[Rank 2] Train Epoch 1:  58%|█████▊    | 1165/2000 [00:10<00:11, 75.18it/s]
[Rank 1] Train Epoch 1:  58%|█████▊    | 1164/2000 [00:10<00:10, 81.66it/s]
[Rank 0] Train Epoch 1:  58%|█████▊    | 1166/2000 [00:10<00:08, 98.47it/s]
[Rank 2] Train Epoch 1:  59%|█████▉    | 1180/2000 [00:10<00:09, 88.39it/s]
[Rank 1] Train Epoch 1:  59%|█████▉    | 1177/2000 [00:10<00:08, 91.46it/s]
[Rank 0] Train Epoch 1:  59%|█████▉    | 1181/2000 [00:10<00:07, 109.31it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1177 | Mem: 26.53MB, Util: 100%  global_step : 3177
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1178 | Mem: 26.53MB, Util: 100%  global_step : 3178
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1179 | Mem: 26.53MB, Util: 100%  global_step : 3179
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1180 | Mem: 26.53MB, Util: 100%  global_step : 3180
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1181 | Mem: 26.53MB, Util: 100%  global_step : 3181
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1182 | Mem: 26.53MB, Util: 100%  global_step : 3182
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1183 | Mem: 26.53MB, Util: 100%  global_step : 3183
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1184 | Mem: 26.53MB, Util: 100%  glo

[Rank 2] Train Epoch 1:  60%|█████▉    | 1194/2000 [00:10<00:08, 98.97it/s]
[Rank 1] Train Epoch 1:  60%|█████▉    | 1192/2000 [00:10<00:07, 104.50it/s]
[Rank 0] Train Epoch 1:  60%|█████▉    | 1195/2000 [00:10<00:06, 116.22it/s]
[Rank 2] Train Epoch 1:  60%|██████    | 1208/2000 [00:10<00:07, 104.13it/s]
[Rank 1] Train Epoch 1:  60%|██████    | 1208/2000 [00:10<00:06, 116.44it/s]
[Rank 0] Train Epoch 1:  60%|██████    | 1209/2000 [00:10<00:07, 112.75it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1202 | Mem: 26.53MB, Util: 100%  global_step : 3202
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1203 | Mem: 26.53MB, Util: 100%  global_step : 3203
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1204 | Mem: 26.53MB, Util: 100%  global_step : 3204
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1205 | Mem: 26.53MB, Util: 100%  global_step : 3205
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1206 | Mem: 26.53MB, Util: 100%  global_step : 3206
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1207 | Mem: 26.53MB, Util: 100%  global_step : 3207
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1208 | Mem: 26.53MB, Util: 100%  global_step : 3208
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1209 | Mem: 26.53MB, Util: 100%  glo

[Rank 2] Train Epoch 1:  61%|██████    | 1223/2000 [00:10<00:06, 113.29it/s]
[Rank 1] Train Epoch 1:  61%|██████    | 1224/2000 [00:10<00:06, 126.02it/s]
[Rank 0] Train Epoch 1:  61%|██████    | 1224/2000 [00:10<00:06, 120.23it/s]
[Rank 1] Train Epoch 1:  62%|██████▏   | 1240/2000 [00:10<00:05, 132.84it/s]
[Rank 2] Train Epoch 1:  62%|██████▏   | 1238/2000 [00:10<00:06, 120.54it/s]
[Rank 0] Train Epoch 1:  62%|██████▏   | 1238/2000 [00:10<00:06, 125.14it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1231 | Mem: 26.53MB, Util: 89%  global_step : 3231
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1232 | Mem: 26.53MB, Util: 89%  global_step : 3232
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1233 | Mem: 26.53MB, Util: 89%  global_step : 3233
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1234 | Mem: 26.53MB, Util: 89%  global_step : 3234
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1235 | Mem: 26.53MB, Util: 89%  global_step : 3235
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1236 | Mem: 26.53MB, Util: 89%  global_step : 3236
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1237 | Mem: 26.53MB, Util: 89%  global_step : 3237
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1238 | Mem: 26.53MB, Util: 89%  global_step

[Rank 2] Train Epoch 1:  63%|██████▎   | 1252/2000 [00:11<00:05, 125.61it/s]
[Rank 1] Train Epoch 1:  63%|██████▎   | 1256/2000 [00:10<00:05, 138.07it/s]
[Rank 0] Train Epoch 1:  63%|██████▎   | 1252/2000 [00:10<00:05, 128.86it/s]
[Rank 2] Train Epoch 1:  63%|██████▎   | 1266/2000 [00:11<00:05, 129.50it/s]
[Rank 1] Train Epoch 1:  64%|██████▎   | 1272/2000 [00:10<00:05, 141.84it/s]
[Rank 0] Train Epoch 1:  63%|██████▎   | 1267/2000 [00:11<00:05, 132.59it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1261 | Mem: 26.53MB, Util: 68%  global_step : 3261
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1262 | Mem: 26.53MB, Util: 68%  global_step : 3262
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1263 | Mem: 26.53MB, Util: 68%  global_step : 3263
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1264 | Mem: 26.53MB, Util: 68%  global_step : 3264
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1265 | Mem: 26.53MB, Util: 68%  global_step : 3265
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1266 | Mem: 26.53MB, Util: 68%  global_step : 3266
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1267 | Mem: 26.53MB, Util: 68%  global_step : 3267
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1268 | Mem: 26.53MB, Util: 68%  global_step

[Rank 2] Train Epoch 1:  64%|██████▍   | 1281/2000 [00:11<00:05, 132.77it/s]
[Rank 1] Train Epoch 1:  64%|██████▍   | 1288/2000 [00:11<00:04, 145.67it/s]
[Rank 0] Train Epoch 1:  64%|██████▍   | 1282/2000 [00:11<00:05, 135.79it/s]
[Rank 2] Train Epoch 1:  65%|██████▍   | 1296/2000 [00:11<00:05, 135.70it/s]
[Rank 0] Train Epoch 1:  65%|██████▍   | 1297/2000 [00:11<00:05, 137.59it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1291 | Mem: 26.53MB, Util: 68%  global_step : 3291
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1292 | Mem: 26.53MB, Util: 68%  global_step : 3292
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1293 | Mem: 26.53MB, Util: 68%  global_step : 3293
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1294 | Mem: 26.53MB, Util: 68%  global_step : 3294
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1295 | Mem: 26.53MB, Util: 68%  global_step : 3295
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1296 | Mem: 26.53MB, Util: 68%  global_step : 3296
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1297 | Mem: 26.53MB, Util: 68%  global_step : 3297
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1298 | Mem: 26.53MB, Util: 68%  global_step

[Rank 2] Train Epoch 1:  66%|██████▌   | 1311/2000 [00:11<00:05, 137.27it/s]
[Rank 1] Train Epoch 1:  65%|██████▌   | 1304/2000 [00:11<00:05, 127.96it/s]
[Rank 0] Train Epoch 1:  66%|██████▌   | 1312/2000 [00:11<00:04, 139.14it/s]
[Rank 2] Train Epoch 1:  66%|██████▋   | 1326/2000 [00:11<00:04, 137.86it/s]
[Rank 1] Train Epoch 1:  66%|██████▌   | 1320/2000 [00:11<00:05, 134.01it/s]
[Rank 0] Train Epoch 1:  66%|██████▋   | 1327/2000 [00:11<00:04, 139.98it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1321 | Mem: 26.53MB, Util: 68%  global_step : 3321
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1322 | Mem: 26.53MB, Util: 68%  global_step : 3322
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1323 | Mem: 26.53MB, Util: 68%  global_step : 3323
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1324 | Mem: 26.53MB, Util: 68%  global_step : 3324
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1325 | Mem: 26.53MB, Util: 68%  global_step : 3325
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1326 | Mem: 26.53MB, Util: 68%  global_step : 3326
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1327 | Mem: 26.53MB, Util: 68%  global_step : 3327
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1328 | Mem: 26.53MB, Util: 68%  global_step

[Rank 2] Train Epoch 1:  67%|██████▋   | 1341/2000 [00:11<00:04, 138.48it/s]
[Rank 1] Train Epoch 1:  67%|██████▋   | 1335/2000 [00:11<00:04, 137.95it/s]
[Rank 0] Train Epoch 1:  67%|██████▋   | 1342/2000 [00:11<00:04, 139.86it/s]
[Rank 2] Train Epoch 1:  68%|██████▊   | 1355/2000 [00:11<00:04, 135.18it/s]
[Rank 1] Train Epoch 1:  68%|██████▊   | 1351/2000 [00:11<00:04, 142.27it/s]
[Rank 0] Train Epoch 1:  68%|██████▊   | 1357/2000 [00:11<00:04, 140.71it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1350 | Mem: 26.53MB, Util: 78%  global_step : 3350
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1351 | Mem: 26.53MB, Util: 78%  global_step : 3351
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1352 | Mem: 26.53MB, Util: 78%  global_step : 3352
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1353 | Mem: 26.53MB, Util: 78%  global_step : 3353
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1354 | Mem: 26.53MB, Util: 78%  global_step : 3354
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1355 | Mem: 26.53MB, Util: 78%  global_step : 3355
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1356 | Mem: 26.53MB, Util: 78%  global_step : 3356
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1357 | Mem: 26.53MB, Util: 78%  global_step

[Rank 2] Train Epoch 1:  68%|██████▊   | 1370/2000 [00:11<00:04, 137.57it/s]
[Rank 1] Train Epoch 1:  68%|██████▊   | 1366/2000 [00:11<00:04, 143.08it/s]
[Rank 0] Train Epoch 1:  69%|██████▊   | 1372/2000 [00:11<00:04, 142.00it/s]
[Rank 2] Train Epoch 1:  69%|██████▉   | 1385/2000 [00:11<00:04, 140.12it/s]
[Rank 1] Train Epoch 1:  69%|██████▉   | 1381/2000 [00:11<00:04, 142.52it/s]
[Rank 0] Train Epoch 1:  69%|██████▉   | 1387/2000 [00:11<00:04, 141.96it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1380 | Mem: 26.53MB, Util: 93%  global_step : 3380
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1381 | Mem: 26.53MB, Util: 93%  global_step : 3381
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1382 | Mem: 26.53MB, Util: 93%  global_step : 3382
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1383 | Mem: 26.53MB, Util: 93%  global_step : 3383
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1384 | Mem: 26.53MB, Util: 93%  global_step : 3384
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1385 | Mem: 26.53MB, Util: 93%  global_step : 3385
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1386 | Mem: 26.53MB, Util: 93%  global_step : 3386
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1387 | Mem: 26.53MB, Util: 93%  global_step

[Rank 2] Train Epoch 1:  70%|███████   | 1400/2000 [00:12<00:04, 142.88it/s]
[Rank 1] Train Epoch 1:  70%|██████▉   | 1396/2000 [00:11<00:04, 141.11it/s]
[Rank 0] Train Epoch 1:  70%|███████   | 1402/2000 [00:12<00:04, 139.76it/s]
[Rank 2] Train Epoch 1:  71%|███████   | 1415/2000 [00:12<00:04, 142.55it/s]
[Rank 1] Train Epoch 1:  71%|███████   | 1411/2000 [00:12<00:04, 132.55it/s]
[Rank 0] Train Epoch 1:  71%|███████   | 1417/2000 [00:12<00:04, 140.37it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1409 | Mem: 26.53MB, Util: 100%  global_step : 3409
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1410 | Mem: 26.53MB, Util: 100%  global_step : 3410
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1411 | Mem: 26.53MB, Util: 100%  global_step : 3411
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1412 | Mem: 26.53MB, Util: 100%  global_step : 3412
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1413 | Mem: 26.53MB, Util: 100%  global_step : 3413
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1414 | Mem: 26.53MB, Util: 100%  global_step : 3414
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1415 | Mem: 26.53MB, Util: 100%  global_step : 3415
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1416 | Mem: 26.53MB, Util: 100%  glo

[Rank 2] Train Epoch 1:  72%|███████▏  | 1430/2000 [00:12<00:03, 143.95it/s]
[Rank 1] Train Epoch 1:  71%|███████▏  | 1426/2000 [00:12<00:04, 135.64it/s]
[Rank 0] Train Epoch 1:  72%|███████▏  | 1432/2000 [00:12<00:04, 140.39it/s]
[Rank 2] Train Epoch 1:  72%|███████▏  | 1445/2000 [00:12<00:03, 144.82it/s]
[Rank 1] Train Epoch 1:  72%|███████▏  | 1441/2000 [00:12<00:04, 138.03it/s]
[Rank 0] Train Epoch 1:  72%|███████▏  | 1447/2000 [00:12<00:03, 141.15it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1439 | Mem: 26.53MB, Util: 96%  global_step : 3439
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1440 | Mem: 26.53MB, Util: 96%  global_step : 3440
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1441 | Mem: 26.53MB, Util: 96%  global_step : 3441
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1442 | Mem: 26.53MB, Util: 96%  global_step : 3442
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1443 | Mem: 26.53MB, Util: 96%  global_step : 3443
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1444 | Mem: 26.53MB, Util: 96%  global_step : 3444
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1445 | Mem: 26.53MB, Util: 96%  global_step : 3445
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1446 | Mem: 26.53MB, Util: 96%  global_step

[Rank 2] Train Epoch 1:  73%|███████▎  | 1460/2000 [00:12<00:03, 143.67it/s]
[Rank 1] Train Epoch 1:  73%|███████▎  | 1456/2000 [00:12<00:03, 138.52it/s]
[Rank 0] Train Epoch 1:  73%|███████▎  | 1462/2000 [00:12<00:03, 139.78it/s]
[Rank 2] Train Epoch 1:  74%|███████▍  | 1475/2000 [00:12<00:03, 143.83it/s]
[Rank 1] Train Epoch 1:  74%|███████▎  | 1470/2000 [00:12<00:03, 137.59it/s]
[Rank 0] Train Epoch 1:  74%|███████▍  | 1476/2000 [00:12<00:03, 139.67it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1468 | Mem: 26.53MB, Util: 78%  global_step : 3468
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1469 | Mem: 26.53MB, Util: 78%  global_step : 3469
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1470 | Mem: 26.53MB, Util: 78%  global_step : 3470
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1471 | Mem: 26.53MB, Util: 78%  global_step : 3471
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1472 | Mem: 26.53MB, Util: 78%  global_step : 3472
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1473 | Mem: 26.53MB, Util: 78%  global_step : 3473
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1474 | Mem: 26.53MB, Util: 78%  global_step : 3474
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1475 | Mem: 26.53MB, Util: 78%  global_step

[Rank 2] Train Epoch 1:  74%|███████▍  | 1490/2000 [00:12<00:03, 143.28it/s]
[Rank 1] Train Epoch 1:  74%|███████▍  | 1485/2000 [00:12<00:03, 139.12it/s]
[Rank 0] Train Epoch 1:  74%|███████▍  | 1490/2000 [00:12<00:03, 138.16it/s]
[Rank 1] Train Epoch 1:  75%|███████▌  | 1500/2000 [00:12<00:03, 139.91it/s]
[Rank 0] Train Epoch 1:  75%|███████▌  | 1504/2000 [00:12<00:03, 134.14it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1497 | Mem: 26.53MB, Util: 67%  global_step : 3497
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1498 | Mem: 26.53MB, Util: 67%  global_step : 3498
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1499 | Mem: 26.53MB, Util: 67%  global_step : 3499
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1500 | Mem: 26.53MB, Util: 67%  global_step : 3500
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1501 | Mem: 26.53MB, Util: 67%  global_step : 3501
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1502 | Mem: 26.53MB, Util: 67%  global_step : 3502
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1503 | Mem: 26.53MB, Util: 67%  global_step : 3503
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1504 | Mem: 26.53MB, Util: 67%  global_step

[Rank 2] Train Epoch 1:  75%|███████▌  | 1505/2000 [00:12<00:03, 132.07it/s]
[Rank 2] Train Epoch 1:  76%|███████▌  | 1520/2000 [00:12<00:03, 136.03it/s]
[Rank 1] Train Epoch 1:  76%|███████▌  | 1515/2000 [00:12<00:03, 134.73it/s]
[Rank 0] Train Epoch 1:  76%|███████▌  | 1518/2000 [00:12<00:03, 132.82it/s]
[Rank 2] Train Epoch 1:  77%|███████▋  | 1535/2000 [00:13<00:03, 139.07it/s]
[Rank 1] Train Epoch 1:  76%|███████▋  | 1530/2000 [00:12<00:03, 136.85it/s]
[Rank 0] Train Epoch 1:  77%|███████▋  | 1532/2000 [00:12<00:03, 132.15it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1524 | Mem: 26.53MB, Util: 67%  global_step : 3524
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1525 | Mem: 26.53MB, Util: 67%  global_step : 3525
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1526 | Mem: 26.53MB, Util: 67%  global_step : 3526
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1527 | Mem: 26.53MB, Util: 67%  global_step : 3527
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1528 | Mem: 26.53MB, Util: 67%  global_step : 3528
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1529 | Mem: 26.53MB, Util: 67%  global_step : 3529
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1530 | Mem: 26.53MB, Util: 67%  global_step : 3530
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1531 | Mem: 26.53MB, Util: 67%  global_step

[Rank 2] Train Epoch 1:  78%|███████▊  | 1550/2000 [00:13<00:03, 141.37it/s]
[Rank 1] Train Epoch 1:  77%|███████▋  | 1545/2000 [00:12<00:03, 138.78it/s]
[Rank 0] Train Epoch 1:  77%|███████▋  | 1546/2000 [00:13<00:03, 131.45it/s]
[Rank 2] Train Epoch 1:  78%|███████▊  | 1565/2000 [00:13<00:03, 142.37it/s]
[Rank 1] Train Epoch 1:  78%|███████▊  | 1559/2000 [00:13<00:03, 136.47it/s]
[Rank 0] Train Epoch 1:  78%|███████▊  | 1560/2000 [00:13<00:03, 130.70it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1551 | Mem: 26.53MB, Util: 63%  global_step : 3551
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1552 | Mem: 26.53MB, Util: 63%  global_step : 3552
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1553 | Mem: 26.53MB, Util: 63%  global_step : 3553
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1554 | Mem: 26.53MB, Util: 63%  global_step : 3554
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1555 | Mem: 26.53MB, Util: 63%  global_step : 3555
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1556 | Mem: 26.53MB, Util: 63%  global_step : 3556
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1557 | Mem: 26.53MB, Util: 63%  global_step : 3557
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1558 | Mem: 26.53MB, Util: 63%  global_step

[Rank 2] Train Epoch 1:  79%|███████▉  | 1580/2000 [00:13<00:02, 143.74it/s]
[Rank 1] Train Epoch 1:  79%|███████▊  | 1574/2000 [00:13<00:03, 137.58it/s]
[Rank 0] Train Epoch 1:  79%|███████▊  | 1574/2000 [00:13<00:03, 130.13it/s]
[Rank 2] Train Epoch 1:  80%|███████▉  | 1595/2000 [00:13<00:02, 144.47it/s]
[Rank 1] Train Epoch 1:  79%|███████▉  | 1589/2000 [00:13<00:02, 138.95it/s]
[Rank 0] Train Epoch 1:  79%|███████▉  | 1588/2000 [00:13<00:03, 129.05it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1578 | Mem: 26.53MB, Util: 63%  global_step : 3578
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1579 | Mem: 26.53MB, Util: 63%  global_step : 3579
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1580 | Mem: 26.53MB, Util: 63%  global_step : 3580
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1581 | Mem: 26.53MB, Util: 63%  global_step : 3581
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1582 | Mem: 26.53MB, Util: 63%  global_step : 3582
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1583 | Mem: 26.53MB, Util: 63%  global_step : 3583
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1584 | Mem: 26.53MB, Util: 63%  global_step : 3584
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1585 | Mem: 26.53MB, Util: 62%  global_step

[Rank 0] Train Epoch 1:  80%|████████  | 1601/2000 [00:13<00:03, 129.16it/s]
[Rank 2] Train Epoch 1:  80%|████████  | 1610/2000 [00:13<00:03, 114.27it/s]
[Rank 1] Train Epoch 1:  80%|████████  | 1603/2000 [00:13<00:03, 118.02it/s]
[Rank 0] Train Epoch 1:  81%|████████  | 1615/2000 [00:13<00:02, 129.56it/s]
[Rank 2] Train Epoch 1:  81%|████████▏ | 1625/2000 [00:13<00:03, 122.07it/s]
[Rank 1] Train Epoch 1:  81%|████████  | 1618/2000 [00:13<00:03, 124.54it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1605 | Mem: 26.53MB, Util: 62%  global_step : 3605
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1606 | Mem: 26.53MB, Util: 62%  global_step : 3606
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1607 | Mem: 26.53MB, Util: 62%  global_step : 3607
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1608 | Mem: 26.53MB, Util: 62%  global_step : 3608
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1609 | Mem: 26.53MB, Util: 62%  global_step : 3609
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1610 | Mem: 26.53MB, Util: 62%  global_step : 3610
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1611 | Mem: 26.53MB, Util: 62%  global_step : 3611
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1612 | Mem: 26.53MB, Util: 62%  global_step

[Rank 0] Train Epoch 1:  81%|████████▏ | 1629/2000 [00:13<00:02, 130.22it/s]
[Rank 2] Train Epoch 1:  82%|████████▏ | 1640/2000 [00:13<00:02, 128.49it/s]
[Rank 1] Train Epoch 1:  82%|████████▏ | 1633/2000 [00:13<00:02, 129.62it/s]
[Rank 0] Train Epoch 1:  82%|████████▏ | 1643/2000 [00:13<00:02, 130.05it/s]
[Rank 2] Train Epoch 1:  83%|████████▎ | 1655/2000 [00:13<00:02, 133.50it/s]
[Rank 1] Train Epoch 1:  82%|████████▏ | 1648/2000 [00:13<00:02, 133.70it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1633 | Mem: 26.53MB, Util: 63%  global_step : 3633
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1634 | Mem: 26.53MB, Util: 63%  global_step : 3634
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1635 | Mem: 26.53MB, Util: 63%  global_step : 3635
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1636 | Mem: 26.53MB, Util: 63%  global_step : 3636
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1637 | Mem: 26.53MB, Util: 63%  global_step : 3637
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1638 | Mem: 26.53MB, Util: 63%  global_step : 3638
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1639 | Mem: 26.53MB, Util: 63%  global_step : 3639
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1640 | Mem: 26.53MB, Util: 63%  global_step

[Rank 0] Train Epoch 1:  83%|████████▎ | 1657/2000 [00:13<00:02, 128.03it/s]
[Rank 2] Train Epoch 1:  84%|████████▎ | 1670/2000 [00:14<00:02, 136.94it/s]
[Rank 1] Train Epoch 1:  83%|████████▎ | 1663/2000 [00:13<00:02, 136.27it/s]
[Rank 0] Train Epoch 1:  84%|████████▎ | 1671/2000 [00:14<00:02, 130.61it/s]
[Rank 2] Train Epoch 1:  84%|████████▍ | 1685/2000 [00:14<00:02, 139.40it/s]
[Rank 1] Train Epoch 1:  84%|████████▍ | 1678/2000 [00:13<00:02, 137.60it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1660 | Mem: 26.53MB, Util: 63%  global_step : 3660
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1661 | Mem: 26.53MB, Util: 63%  global_step : 3661
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1662 | Mem: 26.53MB, Util: 63%  global_step : 3662
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1663 | Mem: 26.53MB, Util: 63%  global_step : 3663
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1664 | Mem: 26.53MB, Util: 63%  global_step : 3664
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1665 | Mem: 26.53MB, Util: 63%  global_step : 3665
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1666 | Mem: 26.53MB, Util: 63%  global_step : 3666
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1667 | Mem: 26.53MB, Util: 63%  global_step

[Rank 0] Train Epoch 1:  84%|████████▍ | 1685/2000 [00:14<00:02, 133.21it/s]
[Rank 2] Train Epoch 1:  85%|████████▌ | 1700/2000 [00:14<00:02, 141.09it/s]
[Rank 1] Train Epoch 1:  85%|████████▍ | 1693/2000 [00:14<00:02, 139.87it/s]
[Rank 0] Train Epoch 1:  85%|████████▍ | 1699/2000 [00:14<00:02, 134.43it/s]
[Rank 1] Train Epoch 1:  85%|████████▌ | 1708/2000 [00:14<00:02, 122.50it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1689 | Mem: 26.53MB, Util: 65%  global_step : 3689
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1690 | Mem: 26.53MB, Util: 65%  global_step : 3690
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1691 | Mem: 26.53MB, Util: 65%  global_step : 3691
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1692 | Mem: 26.53MB, Util: 65%  global_step : 3692
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1693 | Mem: 26.53MB, Util: 65%  global_step : 3693
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1694 | Mem: 26.53MB, Util: 65%  global_step : 3694
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1695 | Mem: 26.53MB, Util: 65%  global_step : 3695
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1696 | Mem: 26.53MB, Util: 65%  global_step

[Rank 0] Train Epoch 1:  86%|████████▌ | 1713/2000 [00:14<00:02, 135.68it/s]
[Rank 2] Train Epoch 1:  86%|████████▌ | 1715/2000 [00:14<00:02, 119.46it/s]
[Rank 1] Train Epoch 1:  86%|████████▌ | 1724/2000 [00:14<00:02, 130.14it/s]
[Rank 0] Train Epoch 1:  86%|████████▋ | 1728/2000 [00:14<00:01, 137.01it/s]
[Rank 2] Train Epoch 1:  86%|████████▋ | 1730/2000 [00:14<00:02, 126.00it/s]
[Rank 1] Train Epoch 1:  87%|████████▋ | 1740/2000 [00:14<00:01, 136.32it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1718 | Mem: 26.53MB, Util: 68%  global_step : 3718
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1719 | Mem: 26.53MB, Util: 68%  global_step : 3719
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1720 | Mem: 26.53MB, Util: 68%  global_step : 3720
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1721 | Mem: 26.53MB, Util: 68%  global_step : 3721
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1722 | Mem: 26.53MB, Util: 68%  global_step : 3722
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1723 | Mem: 26.53MB, Util: 68%  global_step : 3723
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1724 | Mem: 26.53MB, Util: 68%  global_step : 3724
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1725 | Mem: 26.53MB, Util: 68%  global_step

[Rank 0] Train Epoch 1:  87%|████████▋ | 1742/2000 [00:14<00:01, 136.96it/s]
[Rank 2] Train Epoch 1:  87%|████████▋ | 1745/2000 [00:14<00:01, 131.45it/s]
[Rank 1] Train Epoch 1:  88%|████████▊ | 1755/2000 [00:14<00:01, 139.92it/s]
[Rank 0] Train Epoch 1:  88%|████████▊ | 1756/2000 [00:14<00:01, 137.27it/s]
[Rank 2] Train Epoch 1:  88%|████████▊ | 1760/2000 [00:14<00:01, 135.37it/s]
[Rank 1] Train Epoch 1:  89%|████████▊ | 1771/2000 [00:14<00:01, 144.85it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1747 | Mem: 26.53MB, Util: 70%  global_step : 3747
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1748 | Mem: 26.53MB, Util: 70%  global_step : 3748
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1749 | Mem: 26.53MB, Util: 70%  global_step : 3749
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1750 | Mem: 26.53MB, Util: 70%  global_step : 3750
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1751 | Mem: 26.53MB, Util: 70%  global_step : 3751
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1752 | Mem: 26.53MB, Util: 70%  global_step : 3752
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1753 | Mem: 26.53MB, Util: 70%  global_step : 3753
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1754 | Mem: 26.53MB, Util: 70%  global_step

[Rank 0] Train Epoch 1:  89%|████████▊ | 1771/2000 [00:14<00:01, 138.69it/s]
[Rank 2] Train Epoch 1:  89%|████████▉ | 1775/2000 [00:14<00:01, 138.27it/s]
[Rank 1] Train Epoch 1:  89%|████████▉ | 1786/2000 [00:14<00:01, 143.13it/s]
[Rank 0] Train Epoch 1:  89%|████████▉ | 1786/2000 [00:14<00:01, 139.67it/s]
[Rank 2] Train Epoch 1:  90%|████████▉ | 1790/2000 [00:14<00:01, 140.85it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1777 | Mem: 26.53MB, Util: 68%  global_step : 3777
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1778 | Mem: 26.53MB, Util: 68%  global_step : 3778
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1779 | Mem: 26.53MB, Util: 68%  global_step : 3779
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1780 | Mem: 26.53MB, Util: 68%  global_step : 3780
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1781 | Mem: 26.53MB, Util: 68%  global_step : 3781
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1782 | Mem: 26.53MB, Util: 68%  global_step : 3782
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1783 | Mem: 26.53MB, Util: 68%  global_step : 3783
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1784 | Mem: 26.53MB, Util: 68%  global_step

[Rank 0] Train Epoch 1:  90%|█████████ | 1801/2000 [00:14<00:01, 140.11it/s]
[Rank 2] Train Epoch 1:  90%|█████████ | 1805/2000 [00:15<00:01, 131.31it/s]
[Rank 1] Train Epoch 1:  90%|█████████ | 1801/2000 [00:14<00:01, 128.11it/s]
[Rank 0] Train Epoch 1:  91%|█████████ | 1816/2000 [00:15<00:01, 139.72it/s]
[Rank 2] Train Epoch 1:  91%|█████████ | 1820/2000 [00:15<00:01, 135.51it/s]
[Rank 1] Train Epoch 1:  91%|█████████ | 1816/2000 [00:15<00:01, 131.65it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1806 | Mem: 26.53MB, Util: 70%  global_step : 3806
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1807 | Mem: 26.53MB, Util: 70%  global_step : 3807
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1808 | Mem: 26.53MB, Util: 70%  global_step : 3808
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1809 | Mem: 26.53MB, Util: 70%  global_step : 3809
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1810 | Mem: 26.53MB, Util: 70%  global_step : 3810
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1811 | Mem: 26.53MB, Util: 70%  global_step : 3811
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1812 | Mem: 26.53MB, Util: 70%  global_step : 3812
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1813 | Mem: 26.53MB, Util: 70%  global_step

[Rank 0] Train Epoch 1:  92%|█████████▏| 1831/2000 [00:15<00:01, 140.48it/s]
[Rank 2] Train Epoch 1:  92%|█████████▏| 1835/2000 [00:15<00:01, 139.03it/s]
[Rank 1] Train Epoch 1:  92%|█████████▏| 1831/2000 [00:15<00:01, 135.15it/s]
[Rank 0] Train Epoch 1:  92%|█████████▏| 1846/2000 [00:15<00:01, 140.33it/s]
[Rank 2] Train Epoch 1:  92%|█████████▎| 1850/2000 [00:15<00:01, 141.02it/s]
[Rank 1] Train Epoch 1:  92%|█████████▏| 1846/2000 [00:15<00:01, 137.15it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1836 | Mem: 26.53MB, Util: 70%  global_step : 3836
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1837 | Mem: 26.53MB, Util: 70%  global_step : 3837
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1838 | Mem: 26.53MB, Util: 70%  global_step : 3838
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1839 | Mem: 26.53MB, Util: 70%  global_step : 3839
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1840 | Mem: 26.53MB, Util: 70%  global_step : 3840
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1841 | Mem: 26.53MB, Util: 70%  global_step : 3841
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1842 | Mem: 26.53MB, Util: 70%  global_step : 3842
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1843 | Mem: 26.53MB, Util: 70%  global_step

[Rank 0] Train Epoch 1:  93%|█████████▎| 1861/2000 [00:15<00:00, 140.72it/s]
[Rank 2] Train Epoch 1:  93%|█████████▎| 1865/2000 [00:15<00:00, 142.85it/s]
[Rank 1] Train Epoch 1:  93%|█████████▎| 1861/2000 [00:15<00:01, 138.27it/s]
[Rank 0] Train Epoch 1:  94%|█████████▍| 1876/2000 [00:15<00:00, 140.77it/s]
[Rank 2] Train Epoch 1:  94%|█████████▍| 1880/2000 [00:15<00:00, 143.20it/s]
[Rank 1] Train Epoch 1:  94%|█████████▍| 1876/2000 [00:15<00:00, 139.57it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1865 | Mem: 26.53MB, Util: 70%  global_step : 3865
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1866 | Mem: 26.53MB, Util: 70%  global_step : 3866
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1867 | Mem: 26.53MB, Util: 70%  global_step : 3867
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1868 | Mem: 26.53MB, Util: 70%  global_step : 3868
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1869 | Mem: 26.53MB, Util: 70%  global_step : 3869
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1870 | Mem: 26.53MB, Util: 70%  global_step : 3870
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1871 | Mem: 26.53MB, Util: 70%  global_step : 3871
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1872 | Mem: 26.53MB, Util: 70%  global_step

[Rank 0] Train Epoch 1:  95%|█████████▍| 1891/2000 [00:15<00:00, 140.66it/s]
[Rank 2] Train Epoch 1:  95%|█████████▍| 1895/2000 [00:15<00:00, 144.68it/s]
[Rank 1] Train Epoch 1:  95%|█████████▍| 1891/2000 [00:15<00:00, 139.61it/s]
[Rank 0] Train Epoch 1:  95%|█████████▌| 1906/2000 [00:15<00:00, 140.76it/s]
[Rank 2] Train Epoch 1:  96%|█████████▌| 1910/2000 [00:15<00:00, 134.31it/s]
[Rank 1] Train Epoch 1:  95%|█████████▌| 1906/2000 [00:15<00:00, 138.70it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1895 | Mem: 26.53MB, Util: 69%  global_step : 3895
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1896 | Mem: 26.53MB, Util: 69%  global_step : 3896
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1897 | Mem: 26.53MB, Util: 69%  global_step : 3897
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1898 | Mem: 26.53MB, Util: 69%  global_step : 3898
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1899 | Mem: 26.53MB, Util: 69%  global_step : 3899
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1900 | Mem: 26.53MB, Util: 69%  global_step : 3900
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1901 | Mem: 26.53MB, Util: 69%  global_step : 3901
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1902 | Mem: 26.53MB, Util: 69%  global_step

[Rank 0] Train Epoch 1:  96%|█████████▌| 1921/2000 [00:15<00:00, 141.28it/s]
[Rank 2] Train Epoch 1:  96%|█████████▋| 1925/2000 [00:15<00:00, 137.87it/s]
[Rank 1] Train Epoch 1:  96%|█████████▌| 1920/2000 [00:15<00:00, 136.40it/s]
[Rank 0] Train Epoch 1:  97%|█████████▋| 1936/2000 [00:15<00:00, 141.33it/s]
[Rank 2] Train Epoch 1:  97%|█████████▋| 1940/2000 [00:16<00:00, 140.12it/s]
[Rank 1] Train Epoch 1:  97%|█████████▋| 1934/2000 [00:15<00:00, 136.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1925 | Mem: 26.53MB, Util: 69%  global_step : 3925
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1926 | Mem: 26.53MB, Util: 69%  global_step : 3926
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1927 | Mem: 26.53MB, Util: 69%  global_step : 3927
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1928 | Mem: 26.53MB, Util: 69%  global_step : 3928
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1929 | Mem: 26.53MB, Util: 69%  global_step : 3929
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1930 | Mem: 26.53MB, Util: 69%  global_step : 3930
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1931 | Mem: 26.53MB, Util: 69%  global_step : 3931
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1932 | Mem: 26.53MB, Util: 69%  global_step

[Rank 0] Train Epoch 1:  98%|█████████▊| 1951/2000 [00:16<00:00, 141.41it/s]
[Rank 1] Train Epoch 1:  97%|█████████▋| 1949/2000 [00:15<00:00, 139.50it/s]
[Rank 2] Train Epoch 1:  98%|█████████▊| 1956/2000 [00:16<00:00, 143.30it/s]
[Rank 0] Train Epoch 1:  98%|█████████▊| 1966/2000 [00:16<00:00, 141.15it/s]
[Rank 1] Train Epoch 1:  98%|█████████▊| 1964/2000 [00:16<00:00, 141.79it/s]
[Rank 2] Train Epoch 1:  99%|█████████▊| 1971/2000 [00:16<00:00, 144.49it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1955 | Mem: 26.53MB, Util: 79%  global_step : 3955
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1956 | Mem: 26.53MB, Util: 79%  global_step : 3956
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1957 | Mem: 26.53MB, Util: 79%  global_step : 3957
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1958 | Mem: 26.53MB, Util: 79%  global_step : 3958
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1959 | Mem: 26.53MB, Util: 79%  global_step : 3959
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1960 | Mem: 26.53MB, Util: 79%  global_step : 3960
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1961 | Mem: 26.53MB, Util: 79%  global_step : 3961
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1962 | Mem: 26.53MB, Util: 79%  global_step

[Rank 0] Train Epoch 1:  99%|█████████▉| 1981/2000 [00:16<00:00, 141.91it/s]
[Rank 1] Train Epoch 1:  99%|█████████▉| 1979/2000 [00:16<00:00, 142.75it/s]
[Rank 2] Train Epoch 1:  99%|█████████▉| 1986/2000 [00:16<00:00, 145.43it/s]
[Rank 0] Train Epoch 1: 100%|██████████| 2000/2000 [00:16<00:00, 121.97it/s]
[Rank 0] Test Epoch 1:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Train Epoch 1: 100%|██████████| 2000/2000 [00:16<00:00, 122.58it/s]
[Rank 1] Test Epoch 1:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Train Epoch 1: 100%|██████████| 2000/2000 [00:16<00:00, 121.85it/s]
[Rank 2] Test Epoch 1:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1985 | Mem: 26.53MB, Util: 100%  global_step : 3985
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1986 | Mem: 26.53MB, Util: 100%  global_step : 3986
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1987 | Mem: 26.53MB, Util: 100%  global_step : 3987
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1988 | Mem: 26.53MB, Util: 100%  global_step : 3988
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1989 | Mem: 26.53MB, Util: 100%  global_step : 3989
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1990 | Mem: 26.53MB, Util: 100%  global_step : 3990
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1991 | Mem: 26.53MB, Util: 100%  global_step : 3991
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 1, Batch 1992 | Mem: 26.53MB, Util: 100%  glo

[Rank 0] Test Epoch 1:  10%|▉         | 33/334 [00:00<00:00, 323.42it/s]
[Rank 1] Test Epoch 1:  10%|▉         | 33/334 [00:00<00:00, 325.72it/s]
[Rank 2] Test Epoch 1:   8%|▊         | 27/334 [00:00<00:01, 263.57it/s]
[Rank 0] Test Epoch 1:  20%|██        | 68/334 [00:00<00:00, 334.92it/s]
[Rank 1] Test Epoch 1:  20%|██        | 68/334 [00:00<00:00, 336.25it/s]
[Rank 2] Test Epoch 1:  19%|█▉        | 63/334 [00:00<00:00, 317.05it/s]
[Rank 0] Test Epoch 1:  31%|███       | 103/334 [00:00<00:00, 339.54it/s]
[Rank 1] Test Epoch 1:  31%|███       | 103/334 [00:00<00:00, 341.73it/s]
[Rank 2] Test Epoch 1:  30%|██▉       | 100/334 [00:00<00:00, 337.36it/s]
[Rank 0] Test Epoch 1:  41%|████▏     | 138/334 [00:00<00:00, 340.18it/s]
[Rank 1] Test Epoch 1:  41%|████▏     | 138/334 [00:00<00:00, 343.17it/s]
[Rank 2] Test Epoch 1:  41%|████      | 136/334 [00:00<00:00, 346.06it/s]
[Rank 0] Test Epoch 1:  52%|█████▏    | 173/334 [00:00<00:00, 341.30it/s]
[Rank 1] Test Epoch 1:  52%|█████▏    | 173/

[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Rank 2] Epoch 1 | Loss: 0.4074, Acc: 0.8506, Model Checksum: d9e10dcb2c913c7b895364930c0ed3b7
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [ NodeId 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 Rank 2] Epoch 1 | Loss: 0.4074, Acc: 0.8506, Model Checksum: d9e10dcb2c913c7b895364930c0ed3b7
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 4000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1 | Mem: 26.53MB, Util: 3%  global_step : 4001
[36m(TunerInternal pid=767)[0m 
[36m(TunerInternal pid=767)[0m Training finished iteration 2 at 2025-04-07 12:55:46. Total running time: 6min 55s
[36m(TunerInternal pid=767)[0m ╭────────────────────────────────────────────╮
[36m(TunerInternal pid=767)[0m │ Training result                            │
[36m(TunerInternal pid=767)[0m ├────────────────────────────────────────────┤
[36m(T

[Rank 0] Train Epoch 2:   1%|          | 13/2000 [00:00<00:15, 126.79it/s]
[Rank 1] Train Epoch 2:   1%|          | 12/2000 [00:00<00:16, 117.14it/s]
[Rank 2] Train Epoch 2:   1%|▏         | 29/2000 [00:00<00:13, 142.19it/s]
[Rank 0] Train Epoch 2:   1%|▏         | 26/2000 [00:00<00:15, 127.44it/s]
[Rank 1] Train Epoch 2:   1%|▏         | 28/2000 [00:00<00:14, 139.68it/s]
[Rank 2] Train Epoch 2:   2%|▏         | 44/2000 [00:00<00:13, 144.78it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 18 | Mem: 26.53MB, Util: 3%  global_step : 4018
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 19 | Mem: 26.53MB, Util: 34%  global_step : 4019
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 20 | Mem: 26.53MB, Util: 34%  global_step : 4020
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 21 | Mem: 26.53MB, Util: 34%  global_step : 4021
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 22 | Mem: 26.53MB, Util: 34%  global_step : 4022
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 23 | Mem: 26.53MB, Util: 34%  global_step : 4023
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 24 | Mem: 26.53MB, Util: 34%  global_step : 4024
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 25 | Mem: 26.53MB, Util: 34%  global_step : 4025


[Rank 0] Train Epoch 2:   2%|▏         | 41/2000 [00:00<00:14, 134.79it/s]
[Rank 1] Train Epoch 2:   2%|▏         | 43/2000 [00:00<00:13, 143.64it/s]
[Rank 2] Train Epoch 2:   3%|▎         | 59/2000 [00:00<00:13, 145.01it/s]
[Rank 0] Train Epoch 2:   3%|▎         | 55/2000 [00:00<00:14, 130.63it/s]
[Rank 1] Train Epoch 2:   3%|▎         | 58/2000 [00:00<00:13, 144.89it/s]
[Rank 2] Train Epoch 2:   4%|▎         | 74/2000 [00:00<00:13, 144.91it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 48 | Mem: 26.53MB, Util: 34%  global_step : 4048
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 49 | Mem: 26.53MB, Util: 34%  global_step : 4049
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 50 | Mem: 26.53MB, Util: 34%  global_step : 4050
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 51 | Mem: 26.53MB, Util: 34%  global_step : 4051
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 52 | Mem: 26.53MB, Util: 100%  global_step : 4052
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 53 | Mem: 26.53MB, Util: 100%  global_step : 4053
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 54 | Mem: 26.53MB, Util: 100%  global_step : 4054
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 55 | Mem: 26.53MB, Util: 100%  global_step : 4

[Rank 0] Train Epoch 2:   3%|▎         | 69/2000 [00:00<00:14, 131.28it/s]
[Rank 1] Train Epoch 2:   4%|▎         | 73/2000 [00:00<00:13, 144.79it/s]
[Rank 2] Train Epoch 2:   4%|▍         | 89/2000 [00:00<00:13, 145.80it/s]
[Rank 0] Train Epoch 2:   4%|▍         | 83/2000 [00:00<00:14, 131.91it/s]
[Rank 1] Train Epoch 2:   4%|▍         | 88/2000 [00:00<00:13, 144.66it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 79 | Mem: 26.53MB, Util: 100%  global_step : 4079
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 80 | Mem: 26.53MB, Util: 100%  global_step : 4080
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 81 | Mem: 26.53MB, Util: 100%  global_step : 4081
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 82 | Mem: 26.53MB, Util: 100%  global_step : 4082
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 83 | Mem: 26.53MB, Util: 100%  global_step : 4083
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 84 | Mem: 26.53MB, Util: 100%  global_step : 4084
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 85 | Mem: 26.53MB, Util: 100%  global_step : 4085
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 86 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 2:   5%|▍         | 97/2000 [00:00<00:14, 129.53it/s]
[Rank 1] Train Epoch 2:   5%|▌         | 103/2000 [00:00<00:15, 120.14it/s]
[Rank 2] Train Epoch 2:   5%|▌         | 104/2000 [00:00<00:18, 104.22it/s]
[Rank 0] Train Epoch 2:   6%|▌         | 112/2000 [00:00<00:14, 134.03it/s]
[Rank 1] Train Epoch 2:   6%|▌         | 118/2000 [00:00<00:14, 125.96it/s]
[Rank 2] Train Epoch 2:   6%|▌         | 119/2000 [00:00<00:16, 115.04it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 100 | Mem: 26.53MB, Util: 100%  global_step : 4100
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 101 | Mem: 26.53MB, Util: 100%  global_step : 4101
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 102 | Mem: 26.53MB, Util: 100%  global_step : 4102
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 103 | Mem: 26.53MB, Util: 100%  global_step : 4103
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 104 | Mem: 26.53MB, Util: 100%  global_step : 4104
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 105 | Mem: 26.53MB, Util: 100%  global_step : 4105
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 103 | Mem: 26.53MB, Util: 62%  global_step : 4103
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 104 | Mem: 26.53MB, Util: 62%  global_

[Rank 0] Train Epoch 2:   6%|▋         | 127/2000 [00:00<00:13, 136.91it/s]
[Rank 1] Train Epoch 2:   7%|▋         | 133/2000 [00:00<00:14, 131.11it/s]
[Rank 2] Train Epoch 2:   7%|▋         | 134/2000 [00:01<00:15, 123.43it/s]
[Rank 0] Train Epoch 2:   7%|▋         | 142/2000 [00:01<00:13, 138.96it/s]
[Rank 1] Train Epoch 2:   7%|▋         | 148/2000 [00:01<00:13, 133.52it/s]
[Rank 2] Train Epoch 2:   8%|▊         | 150/2000 [00:01<00:14, 130.68it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 122 | Mem: 26.53MB, Util: 92%  global_step : 4122
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 123 | Mem: 26.53MB, Util: 92%  global_step : 4123
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 124 | Mem: 26.53MB, Util: 92%  global_step : 4124
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 125 | Mem: 26.53MB, Util: 92%  global_step : 4125
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 126 | Mem: 26.53MB, Util: 92%  global_step : 4126
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 127 | Mem: 26.53MB, Util: 92%  global_step : 4127
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 128 | Mem: 26.53MB, Util: 92%  global_step : 4128
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 129 | Mem: 26.53MB, Util: 92%  global_step

[Rank 0] Train Epoch 2:   8%|▊         | 157/2000 [00:01<00:13, 140.53it/s]
[Rank 2] Train Epoch 2:   8%|▊         | 165/2000 [00:01<00:13, 134.74it/s]
[Rank 1] Train Epoch 2:   8%|▊         | 163/2000 [00:01<00:13, 135.93it/s]
[Rank 0] Train Epoch 2:   9%|▊         | 172/2000 [00:01<00:12, 141.03it/s]
[Rank 2] Train Epoch 2:   9%|▉         | 180/2000 [00:01<00:13, 138.31it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 153 | Mem: 26.53MB, Util: 98%  global_step : 4153
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 154 | Mem: 26.53MB, Util: 98%  global_step : 4154
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 155 | Mem: 26.53MB, Util: 98%  global_step : 4155
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 156 | Mem: 26.53MB, Util: 98%  global_step : 4156
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 157 | Mem: 26.53MB, Util: 98%  global_step : 4157
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 158 | Mem: 26.53MB, Util: 98%  global_step : 4158
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 159 | Mem: 26.53MB, Util: 98%  global_step : 4159
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 160 | Mem: 26.53MB, Util: 98%  global_step

[Rank 1] Train Epoch 2:   9%|▉         | 177/2000 [00:01<00:13, 135.99it/s]
[Rank 0] Train Epoch 2:   9%|▉         | 187/2000 [00:01<00:12, 141.62it/s]
[Rank 2] Train Epoch 2:  10%|▉         | 195/2000 [00:01<00:12, 140.33it/s]
[Rank 1] Train Epoch 2:  10%|▉         | 192/2000 [00:01<00:13, 138.40it/s]
[Rank 0] Train Epoch 2:  10%|█         | 202/2000 [00:01<00:13, 138.08it/s]
[Rank 2] Train Epoch 2:  10%|█         | 210/2000 [00:01<00:13, 132.32it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 183 | Mem: 26.53MB, Util: 100%  global_step : 4183
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 184 | Mem: 26.53MB, Util: 100%  global_step : 4184
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 185 | Mem: 26.53MB, Util: 100%  global_step : 4185
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 186 | Mem: 26.53MB, Util: 100%  global_step : 4186
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 187 | Mem: 26.53MB, Util: 100%  global_step : 4187
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 188 | Mem: 26.53MB, Util: 100%  global_step : 4188
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 189 | Mem: 26.53MB, Util: 100%  global_step : 4189
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 190 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  10%|█         | 207/2000 [00:01<00:12, 139.28it/s]
[Rank 0] Train Epoch 2:  11%|█         | 217/2000 [00:01<00:12, 139.33it/s]
[Rank 2] Train Epoch 2:  11%|█▏        | 225/2000 [00:01<00:13, 136.18it/s]
[Rank 1] Train Epoch 2:  11%|█         | 222/2000 [00:01<00:12, 140.12it/s]
[Rank 0] Train Epoch 2:  12%|█▏        | 232/2000 [00:01<00:12, 140.25it/s]
[Rank 2] Train Epoch 2:  12%|█▏        | 240/2000 [00:01<00:12, 139.24it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 210 | Mem: 26.53MB, Util: 100%  global_step : 4210
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 211 | Mem: 26.53MB, Util: 100%  global_step : 4211
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 212 | Mem: 26.53MB, Util: 100%  global_step : 4212
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 213 | Mem: 26.53MB, Util: 100%  global_step : 4213
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 214 | Mem: 26.53MB, Util: 100%  global_step : 4214
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 215 | Mem: 26.53MB, Util: 100%  global_step : 4215
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 216 | Mem: 26.53MB, Util: 100%  global_step : 4216
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 217 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  12%|█▏        | 237/2000 [00:01<00:12, 140.90it/s]
[Rank 0] Train Epoch 2:  12%|█▏        | 247/2000 [00:01<00:12, 140.72it/s]
[Rank 2] Train Epoch 2:  13%|█▎        | 255/2000 [00:01<00:12, 140.79it/s]
[Rank 1] Train Epoch 2:  13%|█▎        | 252/2000 [00:01<00:12, 141.81it/s]
[Rank 0] Train Epoch 2:  13%|█▎        | 262/2000 [00:01<00:12, 141.67it/s]
[Rank 2] Train Epoch 2:  14%|█▎        | 270/2000 [00:02<00:12, 142.70it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 241 | Mem: 26.53MB, Util: 97%  global_step : 4241
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 242 | Mem: 26.53MB, Util: 97%  global_step : 4242
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 243 | Mem: 26.53MB, Util: 97%  global_step : 4243
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 244 | Mem: 26.53MB, Util: 97%  global_step : 4244
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 245 | Mem: 26.53MB, Util: 97%  global_step : 4245
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 246 | Mem: 26.53MB, Util: 97%  global_step : 4246
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 247 | Mem: 26.53MB, Util: 97%  global_step : 4247
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 248 | Mem: 26.53MB, Util: 97%  global_step

[Rank 1] Train Epoch 2:  13%|█▎        | 267/2000 [00:01<00:12, 141.78it/s]
[Rank 0] Train Epoch 2:  14%|█▍        | 277/2000 [00:02<00:12, 142.08it/s]
[Rank 2] Train Epoch 2:  14%|█▍        | 285/2000 [00:02<00:11, 144.62it/s]
[Rank 1] Train Epoch 2:  14%|█▍        | 282/2000 [00:02<00:12, 142.61it/s]
[Rank 0] Train Epoch 2:  15%|█▍        | 292/2000 [00:02<00:11, 142.74it/s]
[Rank 2] Train Epoch 2:  15%|█▌        | 300/2000 [00:02<00:11, 145.19it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 271 | Mem: 26.53MB, Util: 100%  global_step : 4271
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 272 | Mem: 26.53MB, Util: 100%  global_step : 4272
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 273 | Mem: 26.53MB, Util: 100%  global_step : 4273
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 274 | Mem: 26.53MB, Util: 100%  global_step : 4274
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 275 | Mem: 26.53MB, Util: 100%  global_step : 4275
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 276 | Mem: 26.53MB, Util: 100%  global_step : 4276
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 277 | Mem: 26.53MB, Util: 100%  global_step : 4277
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 278 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  15%|█▍        | 297/2000 [00:02<00:12, 140.08it/s]
[Rank 0] Train Epoch 2:  15%|█▌        | 307/2000 [00:02<00:12, 138.90it/s]
[Rank 1] Train Epoch 2:  16%|█▌        | 312/2000 [00:02<00:12, 139.05it/s]
[Rank 0] Train Epoch 2:  16%|█▌        | 322/2000 [00:02<00:12, 139.71it/s]
[Rank 2] Train Epoch 2:  16%|█▌        | 315/2000 [00:02<00:12, 134.96it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 300 | Mem: 26.53MB, Util: 100%  global_step : 4300
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 301 | Mem: 26.53MB, Util: 100%  global_step : 4301
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 302 | Mem: 26.53MB, Util: 100%  global_step : 4302
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 303 | Mem: 26.53MB, Util: 100%  global_step : 4303
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 304 | Mem: 26.53MB, Util: 100%  global_step : 4304
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 305 | Mem: 26.53MB, Util: 100%  global_step : 4305
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 306 | Mem: 26.53MB, Util: 100%  global_step : 4306
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 307 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  16%|█▋        | 326/2000 [00:02<00:12, 138.61it/s]
[Rank 0] Train Epoch 2:  17%|█▋        | 337/2000 [00:02<00:11, 140.71it/s]
[Rank 2] Train Epoch 2:  16%|█▋        | 330/2000 [00:02<00:12, 138.00it/s]
[Rank 1] Train Epoch 2:  17%|█▋        | 340/2000 [00:02<00:12, 138.19it/s]
[Rank 0] Train Epoch 2:  18%|█▊        | 352/2000 [00:02<00:11, 138.39it/s]
[Rank 2] Train Epoch 2:  17%|█▋        | 345/2000 [00:02<00:11, 141.11it/s]
[Rank 2] Train Epoch 2:  18%|█▊        | 360/2000 [00:02<00:11, 143.29it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 329 | Mem: 26.53MB, Util: 98%  global_step : 4329
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 330 | Mem: 26.53MB, Util: 98%  global_step : 4330
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 331 | Mem: 26.53MB, Util: 98%  global_step : 4331
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 332 | Mem: 26.53MB, Util: 98%  global_step : 4332
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 333 | Mem: 26.53MB, Util: 98%  global_step : 4333
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 334 | Mem: 26.53MB, Util: 98%  global_step : 4334
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 335 | Mem: 26.53MB, Util: 98%  global_step : 4335
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 336 | Mem: 26.53MB, Util: 98%  global_step

[Rank 1] Train Epoch 2:  18%|█▊        | 354/2000 [00:02<00:11, 137.43it/s]
[Rank 0] Train Epoch 2:  18%|█▊        | 366/2000 [00:02<00:11, 137.39it/s]
[Rank 2] Train Epoch 2:  19%|█▉        | 375/2000 [00:02<00:11, 145.10it/s]
[Rank 1] Train Epoch 2:  18%|█▊        | 368/2000 [00:02<00:11, 136.52it/s]
[Rank 0] Train Epoch 2:  19%|█▉        | 381/2000 [00:02<00:11, 138.38it/s]
[Rank 2] Train Epoch 2:  20%|█▉        | 390/2000 [00:02<00:11, 146.31it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 360 | Mem: 26.53MB, Util: 100%  global_step : 4360
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 361 | Mem: 26.53MB, Util: 100%  global_step : 4361
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 362 | Mem: 26.53MB, Util: 100%  global_step : 4362
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 363 | Mem: 26.53MB, Util: 100%  global_step : 4363
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 364 | Mem: 26.53MB, Util: 100%  global_step : 4364
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 365 | Mem: 26.53MB, Util: 100%  global_step : 4365
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 366 | Mem: 26.53MB, Util: 100%  global_step : 4366
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 367 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  19%|█▉        | 382/2000 [00:02<00:11, 136.30it/s]
[Rank 0] Train Epoch 2:  20%|█▉        | 395/2000 [00:02<00:11, 138.43it/s]
[Rank 1] Train Epoch 2:  20%|█▉        | 396/2000 [00:02<00:11, 136.10it/s]
[Rank 0] Train Epoch 2:  20%|██        | 409/2000 [00:02<00:12, 132.31it/s]
[Rank 2] Train Epoch 2:  20%|██        | 405/2000 [00:03<00:12, 124.31it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 391 | Mem: 26.53MB, Util: 100%  global_step : 4391
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 392 | Mem: 26.53MB, Util: 100%  global_step : 4392
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 393 | Mem: 26.53MB, Util: 100%  global_step : 4393
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 394 | Mem: 26.53MB, Util: 100%  global_step : 4394
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 395 | Mem: 26.53MB, Util: 100%  global_step : 4395
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 396 | Mem: 26.53MB, Util: 100%  global_step : 4396
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 397 | Mem: 26.53MB, Util: 100%  global_step : 4397
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 398 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  20%|██        | 410/2000 [00:02<00:11, 135.27it/s]
[Rank 0] Train Epoch 2:  21%|██        | 423/2000 [00:03<00:11, 132.85it/s]
[Rank 2] Train Epoch 2:  21%|██        | 420/2000 [00:03<00:12, 130.37it/s]
[Rank 1] Train Epoch 2:  21%|██        | 424/2000 [00:03<00:11, 134.60it/s]
[Rank 0] Train Epoch 2:  22%|██▏       | 437/2000 [00:03<00:11, 133.18it/s]
[Rank 2] Train Epoch 2:  22%|██▏       | 435/2000 [00:03<00:11, 134.90it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 413 | Mem: 26.53MB, Util: 100%  global_step : 4413
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 414 | Mem: 26.53MB, Util: 100%  global_step : 4414
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 415 | Mem: 26.53MB, Util: 100%  global_step : 4415
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 416 | Mem: 26.53MB, Util: 100%  global_step : 4416
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 417 | Mem: 26.53MB, Util: 100%  global_step : 4417
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 418 | Mem: 26.53MB, Util: 100%  global_step : 4418
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 419 | Mem: 26.53MB, Util: 100%  global_step : 4419
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 420 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  22%|██▏       | 438/2000 [00:03<00:11, 134.42it/s]
[Rank 0] Train Epoch 2:  23%|██▎       | 451/2000 [00:03<00:11, 131.55it/s]
[Rank 2] Train Epoch 2:  22%|██▎       | 450/2000 [00:03<00:11, 138.88it/s]
[Rank 1] Train Epoch 2:  23%|██▎       | 452/2000 [00:03<00:11, 135.15it/s]
[Rank 0] Train Epoch 2:  23%|██▎       | 465/2000 [00:03<00:11, 133.34it/s]
[Rank 2] Train Epoch 2:  23%|██▎       | 465/2000 [00:03<00:10, 141.04it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 444 | Mem: 26.53MB, Util: 97%  global_step : 4444
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 445 | Mem: 26.53MB, Util: 97%  global_step : 4445
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 446 | Mem: 26.53MB, Util: 97%  global_step : 4446
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 447 | Mem: 26.53MB, Util: 97%  global_step : 4447
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 448 | Mem: 26.53MB, Util: 97%  global_step : 4448
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 449 | Mem: 26.53MB, Util: 97%  global_step : 4449
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 450 | Mem: 26.53MB, Util: 97%  global_step : 4450
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 451 | Mem: 26.53MB, Util: 97%  global_step

[Rank 1] Train Epoch 2:  23%|██▎       | 466/2000 [00:03<00:11, 135.36it/s]
[Rank 0] Train Epoch 2:  24%|██▍       | 480/2000 [00:03<00:11, 135.81it/s]
[Rank 2] Train Epoch 2:  24%|██▍       | 480/2000 [00:03<00:10, 142.91it/s]
[Rank 1] Train Epoch 2:  24%|██▍       | 480/2000 [00:03<00:11, 135.39it/s]
[Rank 0] Train Epoch 2:  25%|██▍       | 495/2000 [00:03<00:10, 137.46it/s]
[Rank 2] Train Epoch 2:  25%|██▍       | 496/2000 [00:03<00:10, 145.16it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 475 | Mem: 26.53MB, Util: 100%  global_step : 4475
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 476 | Mem: 26.53MB, Util: 100%  global_step : 4476
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 477 | Mem: 26.53MB, Util: 100%  global_step : 4477
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 478 | Mem: 26.53MB, Util: 100%  global_step : 4478
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 479 | Mem: 26.53MB, Util: 100%  global_step : 4479
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 480 | Mem: 26.53MB, Util: 100%  global_step : 4480
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 481 | Mem: 26.53MB, Util: 100%  global_step : 4481
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 482 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  25%|██▍       | 494/2000 [00:03<00:11, 135.86it/s]
[Rank 2] Train Epoch 2:  26%|██▌       | 511/2000 [00:03<00:12, 123.51it/s]
[Rank 1] Train Epoch 2:  25%|██▌       | 508/2000 [00:03<00:11, 135.58it/s]
[Rank 0] Train Epoch 2:  25%|██▌       | 509/2000 [00:03<00:11, 131.31it/s]
[Rank 2] Train Epoch 2:  26%|██▋       | 526/2000 [00:03<00:11, 130.22it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 500 | Mem: 26.53MB, Util: 100%  global_step : 4500
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 501 | Mem: 26.53MB, Util: 100%  global_step : 4501
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 502 | Mem: 26.53MB, Util: 100%  global_step : 4502
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 503 | Mem: 26.53MB, Util: 100%  global_step : 4503
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 504 | Mem: 26.53MB, Util: 100%  global_step : 4504
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 505 | Mem: 26.53MB, Util: 100%  global_step : 4505
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 506 | Mem: 26.53MB, Util: 100%  global_step : 4506
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 507 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  26%|██▌       | 522/2000 [00:03<00:10, 135.27it/s]
[Rank 0] Train Epoch 2:  26%|██▌       | 523/2000 [00:03<00:11, 131.18it/s]
[Rank 2] Train Epoch 2:  27%|██▋       | 541/2000 [00:03<00:10, 135.03it/s]
[Rank 1] Train Epoch 2:  27%|██▋       | 536/2000 [00:03<00:10, 136.29it/s]
[Rank 0] Train Epoch 2:  27%|██▋       | 537/2000 [00:03<00:11, 127.24it/s]
[Rank 2] Train Epoch 2:  28%|██▊       | 556/2000 [00:04<00:10, 138.03it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 527 | Mem: 26.53MB, Util: 100%  global_step : 4527
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 528 | Mem: 26.53MB, Util: 100%  global_step : 4528
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 529 | Mem: 26.53MB, Util: 100%  global_step : 4529
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 530 | Mem: 26.53MB, Util: 100%  global_step : 4530
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 531 | Mem: 26.53MB, Util: 100%  global_step : 4531
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 532 | Mem: 26.53MB, Util: 100%  global_step : 4532
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 533 | Mem: 26.53MB, Util: 100%  global_step : 4533
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 534 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  28%|██▊       | 550/2000 [00:04<00:10, 136.42it/s]
[Rank 0] Train Epoch 2:  28%|██▊       | 552/2000 [00:04<00:10, 132.61it/s]
[Rank 2] Train Epoch 2:  29%|██▊       | 571/2000 [00:04<00:10, 140.17it/s]
[Rank 1] Train Epoch 2:  28%|██▊       | 564/2000 [00:04<00:10, 136.56it/s]
[Rank 0] Train Epoch 2:  28%|██▊       | 567/2000 [00:04<00:10, 136.95it/s]
[Rank 2] Train Epoch 2:  29%|██▉       | 586/2000 [00:04<00:09, 141.48it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 549 | Mem: 26.53MB, Util: 64%  global_step : 4549
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 550 | Mem: 26.53MB, Util: 64%  global_step : 4550
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 551 | Mem: 26.53MB, Util: 64%  global_step : 4551
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 552 | Mem: 26.53MB, Util: 64%  global_step : 4552
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 553 | Mem: 26.53MB, Util: 64%  global_step : 4553
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 554 | Mem: 26.53MB, Util: 64%  global_step : 4554
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 555 | Mem: 26.53MB, Util: 64%  global_step : 4555
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 556 | Mem: 26.53MB, Util: 64%  global_step : 4556


[Rank 1] Train Epoch 2:  29%|██▉       | 578/2000 [00:04<00:10, 136.61it/s]
[Rank 0] Train Epoch 2:  29%|██▉       | 581/2000 [00:04<00:10, 136.81it/s]
[Rank 1] Train Epoch 2:  30%|██▉       | 592/2000 [00:04<00:10, 131.46it/s]
[Rank 0] Train Epoch 2:  30%|██▉       | 595/2000 [00:04<00:10, 135.21it/s]
[Rank 2] Train Epoch 2:  30%|███       | 601/2000 [00:04<00:11, 118.90it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 579 | Mem: 26.53MB, Util: 70%  global_step : 4579
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 580 | Mem: 26.53MB, Util: 70%  global_step : 4580
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 581 | Mem: 26.53MB, Util: 70%  global_step : 4581
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 582 | Mem: 26.53MB, Util: 70%  global_step : 4582
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 583 | Mem: 26.53MB, Util: 70%  global_step : 4583
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 584 | Mem: 26.53MB, Util: 70%  global_step : 4584
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 585 | Mem: 26.53MB, Util: 70%  global_step : 4585
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 586 | Mem: 26.53MB, Util: 70%  global_step : 4586


[Rank 1] Train Epoch 2:  30%|███       | 606/2000 [00:04<00:10, 130.72it/s]
[Rank 0] Train Epoch 2:  30%|███       | 609/2000 [00:04<00:10, 134.34it/s]
[Rank 2] Train Epoch 2:  31%|███       | 616/2000 [00:04<00:11, 125.36it/s]
[Rank 1] Train Epoch 2:  31%|███       | 620/2000 [00:04<00:10, 132.56it/s]
[Rank 0] Train Epoch 2:  31%|███       | 623/2000 [00:04<00:10, 134.05it/s]
[Rank 2] Train Epoch 2:  32%|███▏      | 631/2000 [00:04<00:10, 130.84it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 607 | Mem: 26.53MB, Util: 66%  global_step : 4607
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 608 | Mem: 26.53MB, Util: 66%  global_step : 4608
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 609 | Mem: 26.53MB, Util: 66%  global_step : 4609
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 610 | Mem: 26.53MB, Util: 66%  global_step : 4610
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 611 | Mem: 26.53MB, Util: 66%  global_step : 4611
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 612 | Mem: 26.53MB, Util: 66%  global_step : 4612
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 613 | Mem: 26.53MB, Util: 66%  global_step : 4613
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 614 | Mem: 26.53MB, Util: 66%  global_step : 4614


[Rank 1] Train Epoch 2:  32%|███▏      | 634/2000 [00:04<00:10, 131.92it/s]
[Rank 2] Train Epoch 2:  32%|███▏      | 646/2000 [00:04<00:10, 134.42it/s]
[Rank 0] Train Epoch 2:  32%|███▏      | 637/2000 [00:04<00:10, 133.29it/s]
[Rank 1] Train Epoch 2:  32%|███▏      | 649/2000 [00:04<00:09, 135.70it/s]
[Rank 2] Train Epoch 2:  33%|███▎      | 661/2000 [00:04<00:09, 137.84it/s]
[Rank 0] Train Epoch 2:  33%|███▎      | 651/2000 [00:04<00:10, 132.68it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 638 | Mem: 26.53MB, Util: 95%  global_step : 4638
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 639 | Mem: 26.53MB, Util: 95%  global_step : 4639
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 640 | Mem: 26.53MB, Util: 95%  global_step : 4640
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 641 | Mem: 26.53MB, Util: 95%  global_step : 4641
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 642 | Mem: 26.53MB, Util: 95%  global_step : 4642
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 643 | Mem: 26.53MB, Util: 95%  global_step : 4643
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 644 | Mem: 26.53MB, Util: 95%  global_step : 4644
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 645 | Mem: 26.53MB, Util: 95%  global_step

[Rank 1] Train Epoch 2:  33%|███▎      | 664/2000 [00:04<00:09, 139.24it/s]
[Rank 2] Train Epoch 2:  34%|███▍      | 676/2000 [00:04<00:09, 140.12it/s]
[Rank 0] Train Epoch 2:  33%|███▎      | 665/2000 [00:04<00:10, 129.68it/s]
[Rank 1] Train Epoch 2:  34%|███▍      | 679/2000 [00:04<00:09, 142.11it/s]
[Rank 2] Train Epoch 2:  35%|███▍      | 691/2000 [00:05<00:09, 141.78it/s]
[Rank 0] Train Epoch 2:  34%|███▍      | 680/2000 [00:05<00:09, 133.52it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 668 | Mem: 26.53MB, Util: 100%  global_step : 4668
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 669 | Mem: 26.53MB, Util: 100%  global_step : 4669
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 670 | Mem: 26.53MB, Util: 100%  global_step : 4670
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 671 | Mem: 26.53MB, Util: 100%  global_step : 4671
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 672 | Mem: 26.53MB, Util: 100%  global_step : 4672
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 673 | Mem: 26.53MB, Util: 100%  global_step : 4673
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 674 | Mem: 26.53MB, Util: 100%  global_step : 4674
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 675 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  35%|███▍      | 694/2000 [00:05<00:09, 144.15it/s]
[Rank 0] Train Epoch 2:  35%|███▍      | 695/2000 [00:05<00:09, 137.74it/s]
[Rank 1] Train Epoch 2:  35%|███▌      | 709/2000 [00:05<00:10, 125.65it/s]
[Rank 2] Train Epoch 2:  35%|███▌      | 706/2000 [00:05<00:10, 121.64it/s]
[Rank 0] Train Epoch 2:  35%|███▌      | 709/2000 [00:05<00:09, 133.80it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 699 | Mem: 26.53MB, Util: 100%  global_step : 4699
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 700 | Mem: 26.53MB, Util: 100%  global_step : 4700
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 701 | Mem: 26.53MB, Util: 100%  global_step : 4701
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 702 | Mem: 26.53MB, Util: 100%  global_step : 4702
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 703 | Mem: 26.53MB, Util: 100%  global_step : 4703
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 704 | Mem: 26.53MB, Util: 100%  global_step : 4704
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 692 | Mem: 26.53MB, Util: 65%  global_step : 4692
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 2, Batch 693 | Mem: 26.53MB, Util: 65%  global_

[Rank 1] Train Epoch 2:  36%|███▌      | 723/2000 [00:05<00:09, 129.42it/s]
[Rank 2] Train Epoch 2:  36%|███▌      | 721/2000 [00:05<00:09, 128.18it/s]
[Rank 2] Train Epoch 2:  37%|███▋      | 736/2000 [00:05<00:09, 132.93it/s]
[Rank 0] Train Epoch 2:  36%|███▌      | 723/2000 [00:05<00:09, 133.67it/s]
[Rank 1] Train Epoch 2:  37%|███▋      | 738/2000 [00:05<00:09, 134.10it/s]
[Rank 2] Train Epoch 2:  38%|███▊      | 751/2000 [00:05<00:09, 136.30it/s]
[Rank 0] Train Epoch 2:  37%|███▋      | 738/2000 [00:05<00:09, 138.03it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 720 | Mem: 26.53MB, Util: 97%  global_step : 4720
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 721 | Mem: 26.53MB, Util: 97%  global_step : 4721
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 722 | Mem: 26.53MB, Util: 97%  global_step : 4722
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 723 | Mem: 26.53MB, Util: 97%  global_step : 4723
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 724 | Mem: 26.53MB, Util: 97%  global_step : 4724
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 725 | Mem: 26.53MB, Util: 97%  global_step : 4725
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 726 | Mem: 26.53MB, Util: 97%  global_step : 4726
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 727 | Mem: 26.53MB, Util: 97%  global_step

[Rank 1] Train Epoch 2:  38%|███▊      | 753/2000 [00:05<00:09, 136.92it/s]
[Rank 2] Train Epoch 2:  38%|███▊      | 766/2000 [00:05<00:08, 138.81it/s]
[Rank 0] Train Epoch 2:  38%|███▊      | 753/2000 [00:05<00:08, 141.41it/s]
[Rank 1] Train Epoch 2:  38%|███▊      | 768/2000 [00:05<00:08, 138.70it/s]
[Rank 2] Train Epoch 2:  39%|███▉      | 781/2000 [00:05<00:08, 141.35it/s]
[Rank 0] Train Epoch 2:  38%|███▊      | 768/2000 [00:05<00:08, 141.98it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 751 | Mem: 26.53MB, Util: 100%  global_step : 4751
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 752 | Mem: 26.53MB, Util: 100%  global_step : 4752
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 753 | Mem: 26.53MB, Util: 100%  global_step : 4753
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 754 | Mem: 26.53MB, Util: 100%  global_step : 4754
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 755 | Mem: 26.53MB, Util: 100%  global_step : 4755
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 756 | Mem: 26.53MB, Util: 100%  global_step : 4756
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 757 | Mem: 26.53MB, Util: 100%  global_step : 4757
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 758 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  39%|███▉      | 783/2000 [00:05<00:08, 140.18it/s]
[Rank 2] Train Epoch 2:  40%|███▉      | 796/2000 [00:05<00:08, 143.12it/s]
[Rank 0] Train Epoch 2:  39%|███▉      | 783/2000 [00:05<00:08, 142.46it/s]
[Rank 1] Train Epoch 2:  40%|███▉      | 798/2000 [00:05<00:08, 140.32it/s]
[Rank 0] Train Epoch 2:  40%|███▉      | 798/2000 [00:05<00:08, 142.03it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 782 | Mem: 26.53MB, Util: 100%  global_step : 4782
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 783 | Mem: 26.53MB, Util: 100%  global_step : 4783
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 784 | Mem: 26.53MB, Util: 100%  global_step : 4784
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 785 | Mem: 26.53MB, Util: 100%  global_step : 4785
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 786 | Mem: 26.53MB, Util: 100%  global_step : 4786
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 787 | Mem: 26.53MB, Util: 100%  global_step : 4787
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 788 | Mem: 26.53MB, Util: 100%  global_step : 4788
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 789 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  41%|████      | 813/2000 [00:05<00:08, 140.58it/s]
[Rank 2] Train Epoch 2:  41%|████      | 811/2000 [00:05<00:08, 136.81it/s]
[Rank 0] Train Epoch 2:  41%|████      | 813/2000 [00:05<00:08, 142.03it/s]
[Rank 1] Train Epoch 2:  41%|████▏     | 828/2000 [00:06<00:08, 142.16it/s]
[Rank 2] Train Epoch 2:  41%|████▏     | 826/2000 [00:06<00:08, 139.08it/s]
[Rank 0] Train Epoch 2:  41%|████▏     | 828/2000 [00:06<00:08, 142.64it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 810 | Mem: 26.53MB, Util: 100%  global_step : 4810
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 811 | Mem: 26.53MB, Util: 100%  global_step : 4811
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 812 | Mem: 26.53MB, Util: 100%  global_step : 4812
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 813 | Mem: 26.53MB, Util: 100%  global_step : 4813
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 814 | Mem: 26.53MB, Util: 100%  global_step : 4814
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 815 | Mem: 26.53MB, Util: 100%  global_step : 4815
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 816 | Mem: 26.53MB, Util: 100%  global_step : 4816
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 817 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  42%|████▏     | 843/2000 [00:06<00:08, 142.64it/s]
[Rank 2] Train Epoch 2:  42%|████▏     | 841/2000 [00:06<00:08, 140.86it/s]
[Rank 2] Train Epoch 2:  43%|████▎     | 856/2000 [00:06<00:08, 142.79it/s]
[Rank 0] Train Epoch 2:  42%|████▏     | 843/2000 [00:06<00:08, 142.92it/s]
[Rank 1] Train Epoch 2:  43%|████▎     | 858/2000 [00:06<00:07, 142.89it/s]
[Rank 2] Train Epoch 2:  44%|████▎     | 871/2000 [00:06<00:07, 144.16it/s]
[Rank 0] Train Epoch 2:  43%|████▎     | 858/2000 [00:06<00:07, 143.62it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 840 | Mem: 26.53MB, Util: 93%  global_step : 4840
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 841 | Mem: 26.53MB, Util: 93%  global_step : 4841
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 842 | Mem: 26.53MB, Util: 93%  global_step : 4842
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 843 | Mem: 26.53MB, Util: 93%  global_step : 4843
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 844 | Mem: 26.53MB, Util: 93%  global_step : 4844
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 845 | Mem: 26.53MB, Util: 93%  global_step : 4845
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 846 | Mem: 26.53MB, Util: 93%  global_step : 4846
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 847 | Mem: 26.53MB, Util: 93%  global_step

[Rank 1] Train Epoch 2:  44%|████▎     | 873/2000 [00:06<00:07, 143.50it/s]
[Rank 2] Train Epoch 2:  44%|████▍     | 886/2000 [00:06<00:07, 145.16it/s]
[Rank 0] Train Epoch 2:  44%|████▎     | 873/2000 [00:06<00:07, 143.66it/s]
[Rank 1] Train Epoch 2:  44%|████▍     | 888/2000 [00:06<00:07, 142.92it/s]
[Rank 0] Train Epoch 2:  44%|████▍     | 888/2000 [00:06<00:07, 142.31it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 871 | Mem: 26.53MB, Util: 100%  global_step : 4871
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 872 | Mem: 26.53MB, Util: 100%  global_step : 4872
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 873 | Mem: 26.53MB, Util: 100%  global_step : 4873
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 874 | Mem: 26.53MB, Util: 100%  global_step : 4874
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 875 | Mem: 26.53MB, Util: 100%  global_step : 4875
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 876 | Mem: 26.53MB, Util: 100%  global_step : 4876
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 877 | Mem: 26.53MB, Util: 100%  global_step : 4877
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 878 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  45%|████▌     | 903/2000 [00:06<00:07, 142.21it/s]
[Rank 2] Train Epoch 2:  45%|████▌     | 901/2000 [00:06<00:07, 139.13it/s]
[Rank 0] Train Epoch 2:  45%|████▌     | 903/2000 [00:06<00:07, 144.13it/s]
[Rank 1] Train Epoch 2:  46%|████▌     | 918/2000 [00:06<00:07, 143.40it/s]
[Rank 2] Train Epoch 2:  46%|████▌     | 917/2000 [00:06<00:07, 144.53it/s]
[Rank 0] Train Epoch 2:  46%|████▌     | 918/2000 [00:06<00:07, 143.71it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 900 | Mem: 26.53MB, Util: 100%  global_step : 4900
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 901 | Mem: 26.53MB, Util: 100%  global_step : 4901
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 902 | Mem: 26.53MB, Util: 100%  global_step : 4902
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 903 | Mem: 26.53MB, Util: 100%  global_step : 4903
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 904 | Mem: 26.53MB, Util: 100%  global_step : 4904
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 905 | Mem: 26.53MB, Util: 100%  global_step : 4905
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 906 | Mem: 26.53MB, Util: 100%  global_step : 4906
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 907 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  47%|████▋     | 933/2000 [00:06<00:07, 143.91it/s]
[Rank 2] Train Epoch 2:  47%|████▋     | 933/2000 [00:06<00:07, 148.64it/s]
[Rank 2] Train Epoch 2:  47%|████▋     | 949/2000 [00:06<00:06, 150.79it/s]
[Rank 0] Train Epoch 2:  47%|████▋     | 933/2000 [00:06<00:07, 143.97it/s]
[Rank 1] Train Epoch 2:  47%|████▋     | 948/2000 [00:06<00:07, 142.90it/s]
[Rank 2] Train Epoch 2:  48%|████▊     | 965/2000 [00:07<00:06, 152.87it/s]
[Rank 0] Train Epoch 2:  47%|████▋     | 948/2000 [00:06<00:07, 143.01it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 932 | Mem: 26.53MB, Util: 93%  global_step : 4932
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 933 | Mem: 26.53MB, Util: 93%  global_step : 4933
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 934 | Mem: 26.53MB, Util: 93%  global_step : 4934
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 935 | Mem: 26.53MB, Util: 93%  global_step : 4935
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 936 | Mem: 26.53MB, Util: 93%  global_step : 4936
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 937 | Mem: 26.53MB, Util: 93%  global_step : 4937
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 938 | Mem: 26.53MB, Util: 93%  global_step : 4938
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 939 | Mem: 26.53MB, Util: 93%  global_step

[Rank 1] Train Epoch 2:  48%|████▊     | 963/2000 [00:06<00:07, 144.86it/s]
[Rank 2] Train Epoch 2:  49%|████▉     | 981/2000 [00:07<00:06, 154.28it/s]
[Rank 0] Train Epoch 2:  48%|████▊     | 966/2000 [00:06<00:06, 152.56it/s]
[Rank 1] Train Epoch 2:  49%|████▉     | 978/2000 [00:07<00:07, 145.78it/s]
[Rank 2] Train Epoch 2:  50%|████▉     | 997/2000 [00:07<00:06, 155.54it/s]
[Rank 0] Train Epoch 2:  49%|████▉     | 984/2000 [00:07<00:06, 158.88it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 965 | Mem: 26.53MB, Util: 100%  global_step : 4965
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 966 | Mem: 26.53MB, Util: 100%  global_step : 4966
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 967 | Mem: 26.53MB, Util: 100%  global_step : 4967
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 968 | Mem: 26.53MB, Util: 100%  global_step : 4968
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 969 | Mem: 26.53MB, Util: 100%  global_step : 4969
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 970 | Mem: 26.53MB, Util: 100%  global_step : 4970
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 971 | Mem: 26.53MB, Util: 100%  global_step : 4971
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 972 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 2:  50%|████▉     | 993/2000 [00:07<00:06, 146.56it/s]
[Rank 0] Train Epoch 2:  50%|█████     | 1001/2000 [00:07<00:07, 142.06it/s]
[Rank 1] Train Epoch 2:  50%|█████     | 1008/2000 [00:07<00:06, 146.54it/s]
[Rank 2] Train Epoch 2:  51%|█████     | 1013/2000 [00:07<00:07, 136.28it/s]
[Rank 0] Train Epoch 2:  51%|█████     | 1017/2000 [00:07<00:06, 144.17it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 999 | Mem: 26.53MB, Util: 100%  global_step : 4999
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1000 | Mem: 26.53MB, Util: 100%  global_step : 5000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1001 | Mem: 26.53MB, Util: 100%  global_step : 5001
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1002 | Mem: 26.53MB, Util: 100%  global_step : 5002
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1003 | Mem: 26.53MB, Util: 100%  global_step : 5003
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1004 | Mem: 26.53MB, Util: 100%  global_step : 5004
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1005 | Mem: 26.53MB, Util: 100%  global_step : 5005
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1006 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 2:  51%|█████     | 1023/2000 [00:07<00:06, 146.80it/s]
[Rank 2] Train Epoch 2:  52%|█████▏    | 1030/2000 [00:07<00:06, 143.16it/s]
[Rank 0] Train Epoch 2:  52%|█████▏    | 1032/2000 [00:07<00:07, 138.10it/s]
[Rank 1] Train Epoch 2:  52%|█████▏    | 1038/2000 [00:07<00:06, 146.52it/s]
[Rank 2] Train Epoch 2:  52%|█████▏    | 1046/2000 [00:07<00:06, 147.55it/s]
[Rank 0] Train Epoch 2:  52%|█████▏    | 1047/2000 [00:07<00:06, 140.10it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1024 | Mem: 26.53MB, Util: 98%  global_step : 5024
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1025 | Mem: 26.53MB, Util: 98%  global_step : 5025
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1026 | Mem: 26.53MB, Util: 98%  global_step : 5026
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1027 | Mem: 26.53MB, Util: 98%  global_step : 5027
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1028 | Mem: 26.53MB, Util: 98%  global_step : 5028
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1029 | Mem: 26.53MB, Util: 98%  global_step : 5029
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1030 | Mem: 26.53MB, Util: 98%  global_step : 5030
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1031 | Mem: 26.53MB, Util: 98%  glo

[Rank 1] Train Epoch 2:  53%|█████▎    | 1053/2000 [00:07<00:06, 146.56it/s]
[Rank 2] Train Epoch 2:  53%|█████▎    | 1063/2000 [00:07<00:06, 152.17it/s]
[Rank 0] Train Epoch 2:  53%|█████▎    | 1062/2000 [00:07<00:06, 140.95it/s]
[Rank 1] Train Epoch 2:  53%|█████▎    | 1068/2000 [00:07<00:06, 145.07it/s]
[Rank 2] Train Epoch 2:  54%|█████▍    | 1079/2000 [00:07<00:05, 154.24it/s]
[Rank 0] Train Epoch 2:  54%|█████▍    | 1077/2000 [00:07<00:06, 136.30it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1057 | Mem: 26.53MB, Util: 97%  global_step : 5057
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1058 | Mem: 26.53MB, Util: 97%  global_step : 5058
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1059 | Mem: 26.53MB, Util: 97%  global_step : 5059
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1060 | Mem: 26.53MB, Util: 97%  global_step : 5060
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1061 | Mem: 26.53MB, Util: 97%  global_step : 5061
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1062 | Mem: 26.53MB, Util: 97%  global_step : 5062
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1063 | Mem: 26.53MB, Util: 97%  global_step : 5063
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1064 | Mem: 26.53MB, Util: 97%  glo

[Rank 1] Train Epoch 2:  54%|█████▍    | 1083/2000 [00:07<00:06, 144.39it/s]
[Rank 2] Train Epoch 2:  55%|█████▍    | 1095/2000 [00:07<00:05, 155.90it/s]
[Rank 0] Train Epoch 2:  55%|█████▍    | 1092/2000 [00:07<00:06, 138.83it/s]
[Rank 1] Train Epoch 2:  55%|█████▍    | 1098/2000 [00:07<00:06, 145.64it/s]
[Rank 0] Train Epoch 2:  55%|█████▌    | 1107/2000 [00:08<00:06, 139.06it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1091 | Mem: 26.53MB, Util: 100%  global_step : 5091
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1092 | Mem: 26.53MB, Util: 100%  global_step : 5092
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1093 | Mem: 26.53MB, Util: 100%  global_step : 5093
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1094 | Mem: 26.53MB, Util: 100%  global_step : 5094
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1095 | Mem: 26.53MB, Util: 100%  global_step : 5095
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1096 | Mem: 26.53MB, Util: 100%  global_step : 5096
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1097 | Mem: 26.53MB, Util: 100%  global_step : 5097
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1098 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 2:  56%|█████▌    | 1113/2000 [00:08<00:06, 134.95it/s]
[Rank 2] Train Epoch 2:  56%|█████▌    | 1111/2000 [00:08<00:07, 122.59it/s]
[Rank 0] Train Epoch 2:  56%|█████▌    | 1121/2000 [00:08<00:06, 138.91it/s]
[Rank 1] Train Epoch 2:  56%|█████▋    | 1128/2000 [00:08<00:06, 138.42it/s]
[Rank 2] Train Epoch 2:  56%|█████▋    | 1126/2000 [00:08<00:06, 129.17it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1109 | Mem: 26.53MB, Util: 100%  global_step : 5109
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1110 | Mem: 26.53MB, Util: 100%  global_step : 5110
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1111 | Mem: 26.53MB, Util: 100%  global_step : 5111
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1112 | Mem: 26.53MB, Util: 100%  global_step : 5112
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1113 | Mem: 26.53MB, Util: 100%  global_step : 5113
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1114 | Mem: 26.53MB, Util: 100%  global_step : 5114
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1115 | Mem: 26.53MB, Util: 100%  global_step : 5115
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1116 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  57%|█████▋    | 1136/2000 [00:08<00:06, 139.50it/s]
[Rank 1] Train Epoch 2:  57%|█████▋    | 1143/2000 [00:08<00:06, 139.41it/s]
[Rank 2] Train Epoch 2:  57%|█████▋    | 1141/2000 [00:08<00:06, 133.75it/s]
[Rank 2] Train Epoch 2:  58%|█████▊    | 1156/2000 [00:08<00:06, 137.72it/s]
[Rank 0] Train Epoch 2:  57%|█████▊    | 1150/2000 [00:08<00:06, 137.58it/s]
[Rank 1] Train Epoch 2:  58%|█████▊    | 1158/2000 [00:08<00:06, 140.12it/s]
[Rank 2] Train Epoch 2:  59%|█████▊    | 1171/2000 [00:08<00:05, 140.37it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1140 | Mem: 26.53MB, Util: 94%  global_step : 5140
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1141 | Mem: 26.53MB, Util: 94%  global_step : 5141
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1142 | Mem: 26.53MB, Util: 94%  global_step : 5142
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1143 | Mem: 26.53MB, Util: 94%  global_step : 5143
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1144 | Mem: 26.53MB, Util: 94%  global_step : 5144
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1145 | Mem: 26.53MB, Util: 94%  global_step : 5145
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1146 | Mem: 26.53MB, Util: 94%  global_step : 5146
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1147 | Mem: 26.53MB, Util: 94%  glo

[Rank 0] Train Epoch 2:  58%|█████▊    | 1164/2000 [00:08<00:06, 136.53it/s]
[Rank 2] Train Epoch 2:  59%|█████▉    | 1187/2000 [00:08<00:05, 143.32it/s]
[Rank 0] Train Epoch 2:  59%|█████▉    | 1178/2000 [00:08<00:06, 136.34it/s]
[Rank 1] Train Epoch 2:  59%|█████▊    | 1173/2000 [00:08<00:06, 136.49it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1171 | Mem: 26.53MB, Util: 100%  global_step : 5171
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1172 | Mem: 26.53MB, Util: 100%  global_step : 5172
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1173 | Mem: 26.53MB, Util: 100%  global_step : 5173
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1174 | Mem: 26.53MB, Util: 100%  global_step : 5174
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1175 | Mem: 26.53MB, Util: 100%  global_step : 5175
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1176 | Mem: 26.53MB, Util: 100%  global_step : 5176
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1177 | Mem: 26.53MB, Util: 100%  global_step : 5177
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1178 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  60%|█████▉    | 1192/2000 [00:08<00:05, 135.68it/s]
[Rank 1] Train Epoch 2:  59%|█████▉    | 1187/2000 [00:08<00:06, 134.94it/s]
[Rank 2] Train Epoch 2:  60%|██████    | 1202/2000 [00:08<00:06, 122.53it/s]
[Rank 0] Train Epoch 2:  60%|██████    | 1206/2000 [00:08<00:05, 135.29it/s]
[Rank 1] Train Epoch 2:  60%|██████    | 1201/2000 [00:08<00:06, 128.57it/s]
[Rank 2] Train Epoch 2:  61%|██████    | 1218/2000 [00:08<00:06, 129.99it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1200 | Mem: 26.53MB, Util: 100%  global_step : 5200
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1201 | Mem: 26.53MB, Util: 100%  global_step : 5201
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1202 | Mem: 26.53MB, Util: 100%  global_step : 5202
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1203 | Mem: 26.53MB, Util: 100%  global_step : 5203
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1204 | Mem: 26.53MB, Util: 100%  global_step : 5204
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1205 | Mem: 26.53MB, Util: 100%  global_step : 5205
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1206 | Mem: 26.53MB, Util: 100%  global_step : 5206
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1207 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  61%|██████    | 1220/2000 [00:08<00:05, 134.97it/s]
[Rank 1] Train Epoch 2:  61%|██████    | 1216/2000 [00:08<00:05, 132.50it/s]
[Rank 2] Train Epoch 2:  62%|██████▏   | 1233/2000 [00:08<00:05, 134.75it/s]
[Rank 0] Train Epoch 2:  62%|██████▏   | 1234/2000 [00:08<00:05, 134.50it/s]
[Rank 1] Train Epoch 2:  62%|██████▏   | 1231/2000 [00:08<00:05, 135.03it/s]
[Rank 2] Train Epoch 2:  62%|██████▏   | 1249/2000 [00:09<00:05, 139.22it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1224 | Mem: 26.53MB, Util: 100%  global_step : 5224
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1225 | Mem: 26.53MB, Util: 100%  global_step : 5225
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1226 | Mem: 26.53MB, Util: 100%  global_step : 5226
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1227 | Mem: 26.53MB, Util: 100%  global_step : 5227
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1228 | Mem: 26.53MB, Util: 100%  global_step : 5228
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1229 | Mem: 26.53MB, Util: 100%  global_step : 5229
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1230 | Mem: 26.53MB, Util: 100%  global_step : 5230
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1231 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  62%|██████▏   | 1248/2000 [00:09<00:05, 134.79it/s]
[Rank 1] Train Epoch 2:  62%|██████▏   | 1246/2000 [00:09<00:05, 137.09it/s]
[Rank 2] Train Epoch 2:  63%|██████▎   | 1264/2000 [00:09<00:05, 141.52it/s]
[Rank 0] Train Epoch 2:  63%|██████▎   | 1262/2000 [00:09<00:05, 135.28it/s]
[Rank 1] Train Epoch 2:  63%|██████▎   | 1261/2000 [00:09<00:05, 139.92it/s]
[Rank 2] Train Epoch 2:  64%|██████▍   | 1280/2000 [00:09<00:04, 145.42it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1255 | Mem: 26.53MB, Util: 98%  global_step : 5255
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1256 | Mem: 26.53MB, Util: 98%  global_step : 5256
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1257 | Mem: 26.53MB, Util: 98%  global_step : 5257
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1258 | Mem: 26.53MB, Util: 98%  global_step : 5258
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1259 | Mem: 26.53MB, Util: 98%  global_step : 5259
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1260 | Mem: 26.53MB, Util: 98%  global_step : 5260
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1261 | Mem: 26.53MB, Util: 98%  global_step : 5261
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1262 | Mem: 26.53MB, Util: 98%  glo

[Rank 0] Train Epoch 2:  64%|██████▍   | 1276/2000 [00:09<00:05, 135.01it/s]
[Rank 1] Train Epoch 2:  64%|██████▍   | 1277/2000 [00:09<00:04, 144.78it/s]
[Rank 2] Train Epoch 2:  65%|██████▍   | 1296/2000 [00:09<00:04, 149.23it/s]
[Rank 0] Train Epoch 2:  64%|██████▍   | 1290/2000 [00:09<00:05, 135.03it/s]
[Rank 1] Train Epoch 2:  65%|██████▍   | 1293/2000 [00:09<00:04, 147.58it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1287 | Mem: 26.53MB, Util: 100%  global_step : 5287
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1288 | Mem: 26.53MB, Util: 100%  global_step : 5288
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1289 | Mem: 26.53MB, Util: 100%  global_step : 5289
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1290 | Mem: 26.53MB, Util: 100%  global_step : 5290
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1291 | Mem: 26.53MB, Util: 100%  global_step : 5291
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1292 | Mem: 26.53MB, Util: 100%  global_step : 5292
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1293 | Mem: 26.53MB, Util: 100%  global_step : 5293
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1294 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  65%|██████▌   | 1304/2000 [00:09<00:05, 132.43it/s]
[Rank 1] Train Epoch 2:  65%|██████▌   | 1308/2000 [00:09<00:05, 124.75it/s]
[Rank 2] Train Epoch 2:  66%|██████▌   | 1312/2000 [00:09<00:05, 122.00it/s]
[Rank 0] Train Epoch 2:  66%|██████▌   | 1319/2000 [00:09<00:05, 136.12it/s]
[Rank 1] Train Epoch 2:  66%|██████▌   | 1323/2000 [00:09<00:05, 129.22it/s]
[Rank 2] Train Epoch 2:  66%|██████▋   | 1328/2000 [00:09<00:05, 131.37it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1307 | Mem: 26.53MB, Util: 100%  global_step : 5307
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1308 | Mem: 26.53MB, Util: 100%  global_step : 5308
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1309 | Mem: 26.53MB, Util: 100%  global_step : 5309
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1310 | Mem: 26.53MB, Util: 100%  global_step : 5310
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1311 | Mem: 26.53MB, Util: 100%  global_step : 5311
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1312 | Mem: 26.53MB, Util: 100%  global_step : 5312
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1313 | Mem: 26.53MB, Util: 100%  global_step : 5313
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1314 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  67%|██████▋   | 1334/2000 [00:09<00:04, 138.35it/s]
[Rank 1] Train Epoch 2:  67%|██████▋   | 1337/2000 [00:09<00:05, 131.86it/s]
[Rank 2] Train Epoch 2:  67%|██████▋   | 1345/2000 [00:09<00:04, 139.36it/s]
[Rank 1] Train Epoch 2:  68%|██████▊   | 1351/2000 [00:09<00:04, 133.81it/s]
[Rank 0] Train Epoch 2:  67%|██████▋   | 1349/2000 [00:09<00:04, 140.25it/s]
[Rank 2] Train Epoch 2:  68%|██████▊   | 1361/2000 [00:09<00:04, 144.80it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1340 | Mem: 26.53MB, Util: 99%  global_step : 5340
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1341 | Mem: 26.53MB, Util: 99%  global_step : 5341
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1342 | Mem: 26.53MB, Util: 99%  global_step : 5342
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1343 | Mem: 26.53MB, Util: 99%  global_step : 5343
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1344 | Mem: 26.53MB, Util: 99%  global_step : 5344
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1345 | Mem: 26.53MB, Util: 99%  global_step : 5345
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1346 | Mem: 26.53MB, Util: 99%  global_step : 5346
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1347 | Mem: 26.53MB, Util: 99%  glo

[Rank 0] Train Epoch 2:  68%|██████▊   | 1364/2000 [00:09<00:04, 140.93it/s]
[Rank 1] Train Epoch 2:  68%|██████▊   | 1366/2000 [00:09<00:04, 135.90it/s]
[Rank 2] Train Epoch 2:  69%|██████▉   | 1378/2000 [00:09<00:04, 149.55it/s]
[Rank 1] Train Epoch 2:  69%|██████▉   | 1381/2000 [00:10<00:04, 137.41it/s]
[Rank 0] Train Epoch 2:  69%|██████▉   | 1379/2000 [00:09<00:04, 141.85it/s]
[Rank 2] Train Epoch 2:  70%|██████▉   | 1394/2000 [00:10<00:03, 152.10it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1374 | Mem: 26.53MB, Util: 100%  global_step : 5374
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1375 | Mem: 26.53MB, Util: 100%  global_step : 5375
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1376 | Mem: 26.53MB, Util: 100%  global_step : 5376
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1377 | Mem: 26.53MB, Util: 100%  global_step : 5377
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1378 | Mem: 26.53MB, Util: 100%  global_step : 5378
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1379 | Mem: 26.53MB, Util: 100%  global_step : 5379
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1380 | Mem: 26.53MB, Util: 100%  global_step : 5380
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1381 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 2:  70%|██████▉   | 1396/2000 [00:10<00:04, 138.45it/s]
[Rank 0] Train Epoch 2:  70%|██████▉   | 1394/2000 [00:10<00:04, 140.68it/s]
[Rank 1] Train Epoch 2:  71%|███████   | 1412/2000 [00:10<00:04, 144.21it/s]
[Rank 0] Train Epoch 2:  70%|███████   | 1409/2000 [00:10<00:04, 133.51it/s]
[Rank 2] Train Epoch 2:  70%|███████   | 1410/2000 [00:10<00:04, 122.59it/s]
[Rank 2] Train Epoch 2:  71%|███████▏  | 1426/2000 [00:10<00:04, 131.58it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1400 | Mem: 26.53MB, Util: 100%  global_step : 5400
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1401 | Mem: 26.53MB, Util: 100%  global_step : 5401
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1402 | Mem: 26.53MB, Util: 100%  global_step : 5402
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1403 | Mem: 26.53MB, Util: 100%  global_step : 5403
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1404 | Mem: 26.53MB, Util: 100%  global_step : 5404
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1405 | Mem: 26.53MB, Util: 100%  global_step : 5405
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1406 | Mem: 26.53MB, Util: 100%  global_step : 5406
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1407 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 2:  71%|███████▏  | 1427/2000 [00:10<00:04, 143.24it/s]
[Rank 0] Train Epoch 2:  71%|███████   | 1423/2000 [00:10<00:04, 134.41it/s]
[Rank 2] Train Epoch 2:  72%|███████▏  | 1442/2000 [00:10<00:04, 138.47it/s]
[Rank 1] Train Epoch 2:  72%|███████▏  | 1442/2000 [00:10<00:03, 142.91it/s]
[Rank 0] Train Epoch 2:  72%|███████▏  | 1437/2000 [00:10<00:04, 134.66it/s]
[Rank 2] Train Epoch 2:  73%|███████▎  | 1459/2000 [00:10<00:03, 144.74it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1426 | Mem: 26.53MB, Util: 98%  global_step : 5426
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1427 | Mem: 26.53MB, Util: 98%  global_step : 5427
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1428 | Mem: 26.53MB, Util: 98%  global_step : 5428
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1429 | Mem: 26.53MB, Util: 98%  global_step : 5429
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1430 | Mem: 26.53MB, Util: 98%  global_step : 5430
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1431 | Mem: 26.53MB, Util: 98%  global_step : 5431
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1432 | Mem: 26.53MB, Util: 98%  global_step : 5432
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1433 | Mem: 26.53MB, Util: 98%  glo

[Rank 1] Train Epoch 2:  73%|███████▎  | 1457/2000 [00:10<00:03, 142.74it/s]
[Rank 0] Train Epoch 2:  73%|███████▎  | 1451/2000 [00:10<00:04, 134.62it/s]
[Rank 2] Train Epoch 2:  74%|███████▍  | 1475/2000 [00:10<00:03, 148.23it/s]
[Rank 1] Train Epoch 2:  74%|███████▎  | 1472/2000 [00:10<00:03, 142.92it/s]
[Rank 0] Train Epoch 2:  73%|███████▎  | 1465/2000 [00:10<00:03, 135.41it/s]
[Rank 2] Train Epoch 2:  75%|███████▍  | 1491/2000 [00:10<00:03, 148.49it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1460 | Mem: 26.53MB, Util: 100%  global_step : 5460
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1461 | Mem: 26.53MB, Util: 100%  global_step : 5461
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1462 | Mem: 26.53MB, Util: 100%  global_step : 5462
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1463 | Mem: 26.53MB, Util: 100%  global_step : 5463
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1464 | Mem: 26.53MB, Util: 100%  global_step : 5464
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1465 | Mem: 26.53MB, Util: 100%  global_step : 5465
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1466 | Mem: 26.53MB, Util: 100%  global_step : 5466
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1467 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 2:  74%|███████▍  | 1487/2000 [00:10<00:03, 142.31it/s]
[Rank 0] Train Epoch 2:  74%|███████▍  | 1479/2000 [00:10<00:03, 136.39it/s]
[Rank 0] Train Epoch 2:  75%|███████▍  | 1494/2000 [00:10<00:03, 137.52it/s]
[Rank 2] Train Epoch 2:  75%|███████▌  | 1507/2000 [00:10<00:04, 117.10it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1492 | Mem: 26.53MB, Util: 100%  global_step : 5492
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1493 | Mem: 26.53MB, Util: 100%  global_step : 5493
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1494 | Mem: 26.53MB, Util: 100%  global_step : 5494
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1495 | Mem: 26.53MB, Util: 100%  global_step : 5495
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1496 | Mem: 26.53MB, Util: 100%  global_step : 5496
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1497 | Mem: 26.53MB, Util: 100%  global_step : 5497
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1498 | Mem: 26.53MB, Util: 100%  global_step : 5498
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1499 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 2:  75%|███████▌  | 1502/2000 [00:10<00:04, 124.42it/s]
[Rank 0] Train Epoch 2:  75%|███████▌  | 1508/2000 [00:10<00:03, 137.89it/s]
[Rank 2] Train Epoch 2:  76%|███████▌  | 1522/2000 [00:11<00:03, 124.36it/s]
[Rank 1] Train Epoch 2:  76%|███████▌  | 1516/2000 [00:11<00:03, 125.94it/s]
[Rank 0] Train Epoch 2:  76%|███████▌  | 1522/2000 [00:11<00:03, 137.60it/s]
[Rank 2] Train Epoch 2:  77%|███████▋  | 1537/2000 [00:11<00:03, 129.59it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1508 | Mem: 26.53MB, Util: 100%  global_step : 5508
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1509 | Mem: 26.53MB, Util: 100%  global_step : 5509
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1510 | Mem: 26.53MB, Util: 100%  global_step : 5510
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1511 | Mem: 26.53MB, Util: 100%  global_step : 5511
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1512 | Mem: 26.53MB, Util: 100%  global_step : 5512
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1513 | Mem: 26.53MB, Util: 100%  global_step : 5513
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1514 | Mem: 26.53MB, Util: 100%  global_step : 5514
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1515 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 2:  77%|███████▋  | 1531/2000 [00:11<00:03, 132.23it/s]
[Rank 0] Train Epoch 2:  77%|███████▋  | 1536/2000 [00:11<00:03, 138.18it/s]
[Rank 2] Train Epoch 2:  78%|███████▊  | 1552/2000 [00:11<00:03, 133.75it/s]
[Rank 1] Train Epoch 2:  77%|███████▋  | 1546/2000 [00:11<00:03, 136.50it/s]
[Rank 0] Train Epoch 2:  78%|███████▊  | 1551/2000 [00:11<00:03, 139.01it/s]
[Rank 2] Train Epoch 2:  78%|███████▊  | 1567/2000 [00:11<00:03, 137.60it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1539 | Mem: 26.53MB, Util: 97%  global_step : 5539
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1540 | Mem: 26.53MB, Util: 97%  global_step : 5540
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1541 | Mem: 26.53MB, Util: 97%  global_step : 5541
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1542 | Mem: 26.53MB, Util: 97%  global_step : 5542
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1543 | Mem: 26.53MB, Util: 97%  global_step : 5543
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1544 | Mem: 26.53MB, Util: 97%  global_step : 5544
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1545 | Mem: 26.53MB, Util: 97%  global_step : 5545
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1546 | Mem: 26.53MB, Util: 97%  glo

[Rank 1] Train Epoch 2:  78%|███████▊  | 1564/2000 [00:11<00:02, 148.16it/s]
[Rank 0] Train Epoch 2:  78%|███████▊  | 1565/2000 [00:11<00:03, 138.98it/s]
[Rank 2] Train Epoch 2:  79%|███████▉  | 1582/2000 [00:11<00:02, 139.58it/s]
[Rank 1] Train Epoch 2:  79%|███████▉  | 1583/2000 [00:11<00:02, 157.98it/s]
[Rank 0] Train Epoch 2:  79%|███████▉  | 1579/2000 [00:11<00:03, 138.83it/s]
[Rank 2] Train Epoch 2:  80%|███████▉  | 1597/2000 [00:11<00:02, 141.81it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1569 | Mem: 26.53MB, Util: 100%  global_step : 5569
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1570 | Mem: 26.53MB, Util: 100%  global_step : 5570
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1571 | Mem: 26.53MB, Util: 100%  global_step : 5571
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1572 | Mem: 26.53MB, Util: 100%  global_step : 5572
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1573 | Mem: 26.53MB, Util: 100%  global_step : 5573
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1574 | Mem: 26.53MB, Util: 100%  global_step : 5574
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1575 | Mem: 26.53MB, Util: 100%  global_step : 5575
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1576 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  80%|███████▉  | 1593/2000 [00:11<00:02, 138.97it/s]
[Rank 1] Train Epoch 2:  80%|████████  | 1601/2000 [00:11<00:03, 128.94it/s]
[Rank 0] Train Epoch 2:  80%|████████  | 1607/2000 [00:11<00:02, 136.59it/s]
[Rank 2] Train Epoch 2:  81%|████████  | 1612/2000 [00:11<00:02, 130.37it/s]
[Rank 1] Train Epoch 2:  81%|████████  | 1620/2000 [00:11<00:02, 142.30it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1600 | Mem: 26.53MB, Util: 100%  global_step : 5600
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1601 | Mem: 26.53MB, Util: 100%  global_step : 5601
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1602 | Mem: 26.53MB, Util: 100%  global_step : 5602
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1603 | Mem: 26.53MB, Util: 100%  global_step : 5603
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1604 | Mem: 26.53MB, Util: 100%  global_step : 5604
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1605 | Mem: 26.53MB, Util: 100%  global_step : 5605
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1606 | Mem: 26.53MB, Util: 100%  global_step : 5606
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1607 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  81%|████████  | 1623/2000 [00:11<00:02, 140.73it/s]
[Rank 2] Train Epoch 2:  81%|████████▏ | 1627/2000 [00:11<00:02, 134.97it/s]
[Rank 1] Train Epoch 2:  82%|████████▏ | 1638/2000 [00:11<00:02, 151.86it/s]
[Rank 1] Train Epoch 2:  83%|████████▎ | 1656/2000 [00:11<00:02, 159.03it/s]
[Rank 0] Train Epoch 2:  82%|████████▏ | 1638/2000 [00:11<00:02, 143.14it/s]
[Rank 2] Train Epoch 2:  82%|████████▏ | 1642/2000 [00:11<00:02, 137.89it/s]
[Rank 1] Train Epoch 2:  84%|████████▎ | 1674/2000 [00:12<00:01, 164.54it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1626 | Mem: 26.53MB, Util: 98%  global_step : 5626
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1627 | Mem: 26.53MB, Util: 98%  global_step : 5627
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1628 | Mem: 26.53MB, Util: 98%  global_step : 5628
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1629 | Mem: 26.53MB, Util: 98%  global_step : 5629
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1630 | Mem: 26.53MB, Util: 98%  global_step : 5630
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1631 | Mem: 26.53MB, Util: 98%  global_step : 5631
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1632 | Mem: 26.53MB, Util: 98%  global_step : 5632
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1633 | Mem: 26.53MB, Util: 98%  glo

[Rank 0] Train Epoch 2:  83%|████████▎ | 1653/2000 [00:11<00:02, 139.83it/s]
[Rank 2] Train Epoch 2:  83%|████████▎ | 1657/2000 [00:12<00:02, 140.49it/s]
[Rank 1] Train Epoch 2:  85%|████████▍ | 1693/2000 [00:12<00:01, 168.77it/s]
[Rank 0] Train Epoch 2:  83%|████████▎ | 1668/2000 [00:12<00:02, 139.92it/s]
[Rank 2] Train Epoch 2:  84%|████████▎ | 1673/2000 [00:12<00:02, 143.49it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1656 | Mem: 26.53MB, Util: 100%  global_step : 5656
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1657 | Mem: 26.53MB, Util: 100%  global_step : 5657
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1658 | Mem: 26.53MB, Util: 100%  global_step : 5658
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1659 | Mem: 26.53MB, Util: 100%  global_step : 5659
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1660 | Mem: 26.53MB, Util: 100%  global_step : 5660
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1661 | Mem: 26.53MB, Util: 100%  global_step : 5661
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1662 | Mem: 26.53MB, Util: 100%  global_step : 5662
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1663 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  84%|████████▍ | 1683/2000 [00:12<00:02, 140.48it/s]
[Rank 2] Train Epoch 2:  84%|████████▍ | 1690/2000 [00:12<00:02, 149.03it/s]
[Rank 0] Train Epoch 2:  85%|████████▍ | 1698/2000 [00:12<00:02, 137.75it/s]
[Rank 2] Train Epoch 2:  85%|████████▌ | 1705/2000 [00:12<00:02, 127.38it/s]
[Rank 1] Train Epoch 2:  86%|████████▌ | 1711/2000 [00:12<00:02, 118.14it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1689 | Mem: 26.53MB, Util: 100%  global_step : 5689
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1690 | Mem: 26.53MB, Util: 100%  global_step : 5690
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1691 | Mem: 26.53MB, Util: 100%  global_step : 5691
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1692 | Mem: 26.53MB, Util: 100%  global_step : 5692
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1693 | Mem: 26.53MB, Util: 100%  global_step : 5693
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1694 | Mem: 26.53MB, Util: 100%  global_step : 5694
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1695 | Mem: 26.53MB, Util: 100%  global_step : 5695
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1696 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  86%|████████▌ | 1713/2000 [00:12<00:02, 140.87it/s]
[Rank 2] Train Epoch 2:  86%|████████▌ | 1721/2000 [00:12<00:02, 135.32it/s]
[Rank 1] Train Epoch 2:  86%|████████▋ | 1729/2000 [00:12<00:02, 131.39it/s]
[Rank 0] Train Epoch 2:  86%|████████▋ | 1728/2000 [00:12<00:01, 143.23it/s]
[Rank 2] Train Epoch 2:  87%|████████▋ | 1737/2000 [00:12<00:01, 141.76it/s]
[Rank 1] Train Epoch 2:  87%|████████▋ | 1748/2000 [00:12<00:01, 143.50it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1712 | Mem: 26.53MB, Util: 100%  global_step : 5712
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1713 | Mem: 26.53MB, Util: 100%  global_step : 5713
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1714 | Mem: 26.53MB, Util: 100%  global_step : 5714
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1715 | Mem: 26.53MB, Util: 100%  global_step : 5715
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1716 | Mem: 26.53MB, Util: 100%  global_step : 5716
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1717 | Mem: 26.53MB, Util: 100%  global_step : 5717
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1718 | Mem: 26.53MB, Util: 100%  global_step : 5718
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1719 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  87%|████████▋ | 1744/2000 [00:12<00:01, 145.58it/s]
[Rank 2] Train Epoch 2:  88%|████████▊ | 1754/2000 [00:12<00:01, 147.40it/s]
[Rank 1] Train Epoch 2:  88%|████████▊ | 1767/2000 [00:12<00:01, 153.44it/s]
[Rank 0] Train Epoch 2:  88%|████████▊ | 1759/2000 [00:12<00:01, 140.60it/s]
[Rank 2] Train Epoch 2:  88%|████████▊ | 1770/2000 [00:12<00:01, 147.57it/s]
[Rank 1] Train Epoch 2:  89%|████████▉ | 1786/2000 [00:12<00:01, 160.87it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1745 | Mem: 26.53MB, Util: 76%  global_step : 5745
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1746 | Mem: 26.53MB, Util: 76%  global_step : 5746
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1747 | Mem: 26.53MB, Util: 76%  global_step : 5747
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1748 | Mem: 26.53MB, Util: 76%  global_step : 5748
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1749 | Mem: 26.53MB, Util: 76%  global_step : 5749
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1750 | Mem: 26.53MB, Util: 76%  global_step : 5750
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1751 | Mem: 26.53MB, Util: 76%  global_step : 5751
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1752 | Mem: 26.53MB, Util: 76%  glo

[Rank 0] Train Epoch 2:  89%|████████▊ | 1774/2000 [00:12<00:01, 138.05it/s]
[Rank 2] Train Epoch 2:  89%|████████▉ | 1785/2000 [00:12<00:01, 142.23it/s]
[Rank 0] Train Epoch 2:  90%|████████▉ | 1790/2000 [00:12<00:01, 141.99it/s]
[Rank 2] Train Epoch 2:  90%|█████████ | 1800/2000 [00:13<00:01, 142.97it/s]
[Rank 1] Train Epoch 2:  90%|█████████ | 1804/2000 [00:13<00:01, 120.47it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1777 | Mem: 26.53MB, Util: 91%  global_step : 5777
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1778 | Mem: 26.53MB, Util: 91%  global_step : 5778
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1779 | Mem: 26.53MB, Util: 91%  global_step : 5779
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1780 | Mem: 26.53MB, Util: 91%  global_step : 5780
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1781 | Mem: 26.53MB, Util: 91%  global_step : 5781
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1782 | Mem: 26.53MB, Util: 91%  global_step : 5782
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1783 | Mem: 26.53MB, Util: 91%  global_step : 5783
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1784 | Mem: 26.53MB, Util: 91%  glo

[Rank 0] Train Epoch 2:  90%|█████████ | 1805/2000 [00:13<00:01, 144.13it/s]
[Rank 2] Train Epoch 2:  91%|█████████ | 1815/2000 [00:13<00:01, 137.41it/s]
[Rank 1] Train Epoch 2:  91%|█████████ | 1823/2000 [00:13<00:01, 134.24it/s]
[Rank 0] Train Epoch 2:  91%|█████████ | 1820/2000 [00:13<00:01, 140.61it/s]
[Rank 2] Train Epoch 2:  92%|█████████▏| 1830/2000 [00:13<00:01, 139.73it/s]
[Rank 1] Train Epoch 2:  92%|█████████▏| 1841/2000 [00:13<00:01, 144.57it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1804 | Mem: 26.53MB, Util: 100%  global_step : 5804
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1805 | Mem: 26.53MB, Util: 100%  global_step : 5805
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1806 | Mem: 26.53MB, Util: 100%  global_step : 5806
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1807 | Mem: 26.53MB, Util: 100%  global_step : 5807
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1808 | Mem: 26.53MB, Util: 100%  global_step : 5808
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1809 | Mem: 26.53MB, Util: 100%  global_step : 5809
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1810 | Mem: 26.53MB, Util: 100%  global_step : 5810
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1811 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  92%|█████████▏| 1835/2000 [00:13<00:01, 140.47it/s]
[Rank 2] Train Epoch 2:  92%|█████████▏| 1845/2000 [00:13<00:01, 141.80it/s]
[Rank 1] Train Epoch 2:  93%|█████████▎| 1860/2000 [00:13<00:00, 154.00it/s]
[Rank 0] Train Epoch 2:  92%|█████████▎| 1850/2000 [00:13<00:01, 140.17it/s]
[Rank 2] Train Epoch 2:  93%|█████████▎| 1860/2000 [00:13<00:00, 143.35it/s]
[Rank 1] Train Epoch 2:  94%|█████████▍| 1878/2000 [00:13<00:00, 160.49it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1834 | Mem: 26.53MB, Util: 91%  global_step : 5834
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1835 | Mem: 26.53MB, Util: 91%  global_step : 5835
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1836 | Mem: 26.53MB, Util: 91%  global_step : 5836
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1837 | Mem: 26.53MB, Util: 91%  global_step : 5837
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1838 | Mem: 26.53MB, Util: 91%  global_step : 5838
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1839 | Mem: 26.53MB, Util: 91%  global_step : 5839
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1840 | Mem: 26.53MB, Util: 91%  global_step : 5840
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1841 | Mem: 26.53MB, Util: 91%  glo

[Rank 0] Train Epoch 2:  93%|█████████▎| 1865/2000 [00:13<00:00, 140.50it/s]
[Rank 2] Train Epoch 2:  94%|█████████▍| 1875/2000 [00:13<00:00, 144.02it/s]
[Rank 1] Train Epoch 2:  95%|█████████▍| 1896/2000 [00:13<00:00, 165.57it/s]
[Rank 0] Train Epoch 2:  94%|█████████▍| 1880/2000 [00:13<00:00, 140.90it/s]
[Rank 2] Train Epoch 2:  94%|█████████▍| 1890/2000 [00:13<00:00, 144.05it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1865 | Mem: 26.53MB, Util: 96%  global_step : 5865
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1866 | Mem: 26.53MB, Util: 96%  global_step : 5866
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1867 | Mem: 26.53MB, Util: 96%  global_step : 5867
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1868 | Mem: 26.53MB, Util: 96%  global_step : 5868
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1869 | Mem: 26.53MB, Util: 96%  global_step : 5869
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1870 | Mem: 26.53MB, Util: 96%  global_step : 5870
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1871 | Mem: 26.53MB, Util: 96%  global_step : 5871
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1872 | Mem: 26.53MB, Util: 96%  glo

[Rank 0] Train Epoch 2:  95%|█████████▍| 1895/2000 [00:13<00:00, 140.01it/s]
[Rank 2] Train Epoch 2:  95%|█████████▌| 1905/2000 [00:13<00:00, 132.05it/s]
[Rank 0] Train Epoch 2:  96%|█████████▌| 1910/2000 [00:13<00:00, 139.24it/s]
[Rank 2] Train Epoch 2:  96%|█████████▌| 1921/2000 [00:13<00:00, 138.01it/s]
[Rank 1] Train Epoch 2:  96%|█████████▌| 1914/2000 [00:13<00:00, 116.92it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1896 | Mem: 26.53MB, Util: 100%  global_step : 5896
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1897 | Mem: 26.53MB, Util: 100%  global_step : 5897
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1898 | Mem: 26.53MB, Util: 100%  global_step : 5898
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1899 | Mem: 26.53MB, Util: 100%  global_step : 5899
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1900 | Mem: 26.53MB, Util: 100%  global_step : 5900
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1901 | Mem: 26.53MB, Util: 100%  global_step : 5901
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1902 | Mem: 26.53MB, Util: 100%  global_step : 5902
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1903 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  96%|█████████▋| 1926/2000 [00:13<00:00, 143.39it/s]
[Rank 2] Train Epoch 2:  97%|█████████▋| 1936/2000 [00:14<00:00, 140.75it/s]
[Rank 1] Train Epoch 2:  97%|█████████▋| 1932/2000 [00:13<00:00, 130.23it/s]
[Rank 0] Train Epoch 2:  97%|█████████▋| 1941/2000 [00:14<00:00, 144.59it/s]
[Rank 2] Train Epoch 2:  98%|█████████▊| 1951/2000 [00:14<00:00, 143.19it/s]
[Rank 1] Train Epoch 2:  98%|█████████▊| 1950/2000 [00:14<00:00, 141.68it/s]
[Rank 1] Train Epoch 2:  98%|█████████▊| 1968/2000 [00:14<00:00, 151.10it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1922 | Mem: 26.53MB, Util: 100%  global_step : 5922
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1923 | Mem: 26.53MB, Util: 100%  global_step : 5923
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1924 | Mem: 26.53MB, Util: 100%  global_step : 5924
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1925 | Mem: 26.53MB, Util: 100%  global_step : 5925
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1926 | Mem: 26.53MB, Util: 100%  global_step : 5926
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1927 | Mem: 26.53MB, Util: 100%  global_step : 5927
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1928 | Mem: 26.53MB, Util: 100%  global_step : 5928
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1929 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  98%|█████████▊| 1956/2000 [00:14<00:00, 145.84it/s]
[Rank 2] Train Epoch 2:  98%|█████████▊| 1967/2000 [00:14<00:00, 145.48it/s]
[Rank 1] Train Epoch 2:  99%|█████████▉| 1987/2000 [00:14<00:00, 158.87it/s]
[Rank 0] Train Epoch 2:  99%|█████████▊| 1971/2000 [00:14<00:00, 147.04it/s]
[Rank 2] Train Epoch 2:  99%|█████████▉| 1982/2000 [00:14<00:00, 146.24it/s]
[Rank 1] Train Epoch 2: 100%|██████████| 2000/2000 [00:14<00:00, 139.86it/s]
[Rank 1] Test Epoch 2:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1954 | Mem: 26.53MB, Util: 100%  global_step : 5954
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1955 | Mem: 26.53MB, Util: 100%  global_step : 5955
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1956 | Mem: 26.53MB, Util: 100%  global_step : 5956
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1957 | Mem: 26.53MB, Util: 100%  global_step : 5957
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1958 | Mem: 26.53MB, Util: 100%  global_step : 5958
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1959 | Mem: 26.53MB, Util: 100%  global_step : 5959
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1960 | Mem: 26.53MB, Util: 100%  global_step : 5960
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1961 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 2:  99%|█████████▉| 1986/2000 [00:14<00:00, 147.53it/s]
[Rank 2] Train Epoch 2: 100%|██████████| 2000/2000 [00:14<00:00, 138.32it/s]
[Rank 2] Test Epoch 2:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Test Epoch 2:   0%|          | 1/334 [00:00<00:38,  8.59it/s]
[Rank 0] Train Epoch 2: 100%|██████████| 2000/2000 [00:14<00:00, 138.73it/s]
[Rank 0] Test Epoch 2:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Test Epoch 2:  10%|▉         | 33/334 [00:00<00:00, 326.62it/s]
[Rank 1] Test Epoch 2:  11%|█         | 36/334 [00:00<00:01, 193.84it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1985 | Mem: 26.53MB, Util: 100%  global_step : 5985
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1986 | Mem: 26.53MB, Util: 100%  global_step : 5986
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1987 | Mem: 26.53MB, Util: 100%  global_step : 5987
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1988 | Mem: 26.53MB, Util: 100%  global_step : 5988
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1989 | Mem: 26.53MB, Util: 100%  global_step : 5989
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1990 | Mem: 26.53MB, Util: 100%  global_step : 5990
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1991 | Mem: 26.53MB, Util: 100%  global_step : 5991
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 2, Batch 1992 | Mem: 26.53MB, Util: 1

[Rank 0] Test Epoch 2:  10%|▉         | 32/334 [00:00<00:00, 312.69it/s]
[Rank 2] Test Epoch 2:  21%|██        | 69/334 [00:00<00:00, 345.93it/s]
[Rank 1] Test Epoch 2:  21%|██        | 70/334 [00:00<00:01, 256.27it/s]
[Rank 0] Test Epoch 2:  20%|██        | 68/334 [00:00<00:00, 337.27it/s]
[Rank 2] Test Epoch 2:  31%|███▏      | 105/334 [00:00<00:00, 349.81it/s]
[Rank 1] Test Epoch 2:  31%|███▏      | 105/334 [00:00<00:00, 291.19it/s]
[Rank 0] Test Epoch 2:  31%|███       | 103/334 [00:00<00:00, 341.01it/s]
[Rank 2] Test Epoch 2:  42%|████▏     | 140/334 [00:00<00:00, 349.09it/s]
[Rank 1] Test Epoch 2:  42%|████▏     | 140/334 [00:00<00:00, 309.08it/s]
[Rank 0] Test Epoch 2:  41%|████▏     | 138/334 [00:00<00:00, 344.30it/s]
[Rank 2] Test Epoch 2:  52%|█████▏    | 175/334 [00:00<00:00, 344.93it/s]
[Rank 1] Test Epoch 2:  52%|█████▏    | 174/334 [00:00<00:00, 318.43it/s]
[Rank 0] Test Epoch 2:  52%|█████▏    | 173/334 [00:00<00:00, 344.92it/s]
[Rank 2] Test Epoch 2:  63%|██████▎   | 21

[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Rank 0] Epoch 2 | Loss: 0.4037, Acc: 0.8527, Model Checksum: 420a4e2eca39228bd6ae89ef89f0efd9
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [ NodeId f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791 Rank 0] Epoch 2 | Loss: 0.4037, Acc: 0.8527, Model Checksum: 420a4e2eca39228bd6ae89ef89f0efd9
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 6000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Rank 2] Epoch 2 | Loss: 0.3811, Acc: 0.8614, Model Checksum: 420a4e2eca39228bd6ae89ef89f0efd9
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [ NodeId 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 Rank 2] Epoch 2 | Loss: 0.3811, Acc: 0.8614, Model Checksum: 420a4e2eca39228bd6ae89ef89f0efd9
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 6000
[36m(RayTrainWorker pid=401, ip=10.254.6.1

[Rank 0] Train Epoch 3:   2%|▏         | 30/2000 [00:00<00:13, 144.95it/s]
[Rank 2] Train Epoch 3:   2%|▏         | 30/2000 [00:00<00:13, 147.88it/s]
[Rank 1] Train Epoch 3:   2%|▏         | 34/2000 [00:00<00:12, 151.37it/s]
[Rank 0] Train Epoch 3:   2%|▏         | 45/2000 [00:00<00:13, 143.80it/s]
[Rank 2] Train Epoch 3:   2%|▏         | 46/2000 [00:00<00:12, 150.61it/s]
[Rank 1] Train Epoch 3:   2%|▎         | 50/2000 [00:00<00:13, 142.68it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 16 | Mem: 26.53MB, Util: 3%  global_step : 6016
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 17 | Mem: 26.53MB, Util: 3%  global_step : 6017
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 18 | Mem: 26.53MB, Util: 3%  global_step : 6018
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 19 | Mem: 26.53MB, Util: 3%  global_step : 6019
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 20 | Mem: 26.53MB, Util: 3%  global_step : 6020
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 21 | Mem: 26.53MB, Util: 3%  global_step : 6021
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 22 | Mem: 26.53MB, Util: 3%  global_step : 6022
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 23 | Mem: 26.53MB, Util: 3%  global_step : 6023
[36m(RayTrainWo

[Rank 0] Train Epoch 3:   3%|▎         | 60/2000 [00:00<00:13, 143.82it/s]
[Rank 2] Train Epoch 3:   3%|▎         | 62/2000 [00:00<00:12, 151.40it/s]
[Rank 1] Train Epoch 3:   3%|▎         | 65/2000 [00:00<00:13, 142.71it/s]
[Rank 0] Train Epoch 3:   4%|▍         | 75/2000 [00:00<00:13, 143.80it/s]
[Rank 2] Train Epoch 3:   4%|▍         | 78/2000 [00:00<00:12, 151.76it/s]
[Rank 1] Train Epoch 3:   4%|▍         | 80/2000 [00:00<00:13, 139.81it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 47 | Mem: 26.53MB, Util: 74%  global_step : 6047
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 48 | Mem: 26.53MB, Util: 74%  global_step : 6048
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 49 | Mem: 26.53MB, Util: 74%  global_step : 6049
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 50 | Mem: 26.53MB, Util: 74%  global_step : 6050
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 51 | Mem: 26.53MB, Util: 74%  global_step : 6051
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 52 | Mem: 26.53MB, Util: 74%  global_step : 6052
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 53 | Mem: 26.53MB, Util: 74%  global_step : 6053
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 54 | Mem: 26.53MB, Util: 74%  global_step : 6054
[36m(Ra

[Rank 0] Train Epoch 3:   4%|▍         | 90/2000 [00:00<00:13, 143.02it/s]
[Rank 2] Train Epoch 3:   5%|▍         | 94/2000 [00:00<00:12, 152.06it/s]
[Rank 1] Train Epoch 3:   5%|▍         | 95/2000 [00:00<00:13, 141.53it/s]
[Rank 0] Train Epoch 3:   5%|▌         | 105/2000 [00:00<00:13, 139.19it/s]
[Rank 1] Train Epoch 3:   6%|▌         | 110/2000 [00:00<00:13, 142.49it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 77 | Mem: 26.53MB, Util: 92%  global_step : 6077
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 78 | Mem: 26.53MB, Util: 92%  global_step : 6078
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 79 | Mem: 26.53MB, Util: 92%  global_step : 6079
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 80 | Mem: 26.53MB, Util: 92%  global_step : 6080
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 81 | Mem: 26.53MB, Util: 92%  global_step : 6081
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 82 | Mem: 26.53MB, Util: 92%  global_step : 6082
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 83 | Mem: 26.53MB, Util: 92%  global_step : 6083
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 84 | Mem: 26.53MB, Util: 92%  global_step : 6084
[36m(Ra

[Rank 0] Train Epoch 3:   6%|▌         | 119/2000 [00:00<00:13, 135.08it/s]
[Rank 2] Train Epoch 3:   6%|▌         | 110/2000 [00:00<00:15, 120.58it/s]
[Rank 1] Train Epoch 3:   6%|▋         | 125/2000 [00:00<00:13, 140.88it/s]
[Rank 0] Train Epoch 3:   7%|▋         | 133/2000 [00:00<00:13, 135.69it/s]
[Rank 2] Train Epoch 3:   6%|▌         | 124/2000 [00:00<00:15, 123.42it/s]
[Rank 1] Train Epoch 3:   7%|▋         | 140/2000 [00:00<00:13, 141.71it/s]
[Rank 0] Train Epoch 3:   7%|▋         | 147/2000 [00:01<00:13, 135.06it/s]
[Rank 2] Train Epoch 3:   7%|▋         | 138/2000 [00:01<00:14, 126.61it/s]
[Rank 1] Train Epoch 3:   8%|▊         | 155/2000 [00:01<00:13, 141.61it/s]
[Rank 0] Train Epoch 3:   8%|▊         | 161/2000 [00:01<00:13, 134.74it/s]
[Rank 2] Train Epoch 3:   8%|▊         | 152/2000 [00:01<00:14, 129.22it/s]
[Rank 1] Train Epoch 3:   8%|▊         | 170/2000 [00:01<00:12, 142.14it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 133 | Mem: 26.53MB, Util: 99%  global_step : 6133
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 134 | Mem: 26.53MB, Util: 99%  global_step : 6134
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 135 | Mem: 26.53MB, Util: 99%  global_step : 6135
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 136 | Mem: 26.53MB, Util: 99%  global_step : 6136
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 137 | Mem: 26.53MB, Util: 99%  global_step : 6137
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 138 | Mem: 26.53MB, Util: 99%  global_step : 6138
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 139 | Mem: 26.53MB, Util: 99%  global_step : 6139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 140 | Mem: 26.53MB, Util: 99%  global_step : 6140


[Rank 0] Train Epoch 3:   9%|▉         | 175/2000 [00:01<00:13, 134.36it/s]
[Rank 2] Train Epoch 3:   8%|▊         | 166/2000 [00:01<00:14, 130.84it/s]
[Rank 1] Train Epoch 3:   9%|▉         | 185/2000 [00:01<00:12, 144.01it/s]
[Rank 0] Train Epoch 3:   9%|▉         | 189/2000 [00:01<00:13, 134.28it/s]
[Rank 2] Train Epoch 3:   9%|▉         | 180/2000 [00:01<00:13, 132.46it/s]
[Rank 0] Train Epoch 3:  10%|█         | 203/2000 [00:01<00:13, 134.61it/s]
[Rank 2] Train Epoch 3:  10%|▉         | 194/2000 [00:01<00:13, 133.61it/s]
[Rank 1] Train Epoch 3:  10%|█         | 201/2000 [00:01<00:14, 126.40it/s]
[Rank 0] Train Epoch 3:  11%|█         | 217/2000 [00:01<00:13, 134.92it/s]
[Rank 2] Train Epoch 3:  10%|█         | 208/2000 [00:01<00:13, 133.64it/s]
[Rank 2] Train Epoch 3:  11%|█         | 222/2000 [00:01<00:13, 134.34it/s]
[Rank 1] Train Epoch 3:  11%|█         | 218/2000 [00:01<00:13, 135.96it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 189 | Mem: 26.53MB, Util: 100%  global_step : 6189
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 190 | Mem: 26.53MB, Util: 100%  global_step : 6190
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 191 | Mem: 26.53MB, Util: 100%  global_step : 6191
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 192 | Mem: 26.53MB, Util: 100%  global_step : 6192
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 193 | Mem: 26.53MB, Util: 100%  global_step : 6193
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 194 | Mem: 26.53MB, Util: 100%  global_step : 6194
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 195 | Mem: 26.53MB, Util: 100%  global_step : 6195
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 196 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  12%|█▏        | 231/2000 [00:01<00:13, 134.81it/s]
[Rank 1] Train Epoch 3:  12%|█▏        | 234/2000 [00:01<00:12, 141.08it/s]
[Rank 2] Train Epoch 3:  12%|█▏        | 236/2000 [00:01<00:13, 133.35it/s]
[Rank 1] Train Epoch 3:  12%|█▎        | 250/2000 [00:01<00:12, 145.02it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 217 | Mem: 26.53MB, Util: 86%  global_step : 6217
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 218 | Mem: 26.53MB, Util: 86%  global_step : 6218
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 219 | Mem: 26.53MB, Util: 86%  global_step : 6219
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 220 | Mem: 26.53MB, Util: 86%  global_step : 6220
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 221 | Mem: 26.53MB, Util: 86%  global_step : 6221
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 222 | Mem: 26.53MB, Util: 86%  global_step : 6222
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 223 | Mem: 26.53MB, Util: 86%  global_step : 6223
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 224 | Mem: 26.53MB, Util: 86%  global_step : 6224


[Rank 0] Train Epoch 3:  12%|█▏        | 246/2000 [00:01<00:12, 136.32it/s]
[Rank 2] Train Epoch 3:  12%|█▎        | 250/2000 [00:01<00:13, 132.27it/s]
[Rank 1] Train Epoch 3:  13%|█▎        | 266/2000 [00:01<00:11, 147.49it/s]
[Rank 0] Train Epoch 3:  13%|█▎        | 260/2000 [00:01<00:12, 136.27it/s]
[Rank 2] Train Epoch 3:  13%|█▎        | 264/2000 [00:01<00:13, 132.25it/s]
[Rank 1] Train Epoch 3:  14%|█▍        | 282/2000 [00:01<00:11, 149.40it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 245 | Mem: 26.53MB, Util: 81%  global_step : 6245
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 246 | Mem: 26.53MB, Util: 81%  global_step : 6246
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 247 | Mem: 26.53MB, Util: 81%  global_step : 6247
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 248 | Mem: 26.53MB, Util: 81%  global_step : 6248
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 249 | Mem: 26.53MB, Util: 81%  global_step : 6249
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 250 | Mem: 26.53MB, Util: 81%  global_step : 6250
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 251 | Mem: 26.53MB, Util: 81%  global_step : 6251
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 252 | Mem: 26.53MB, Util: 81%  global_step : 6252


[Rank 0] Train Epoch 3:  14%|█▍        | 275/2000 [00:02<00:12, 137.84it/s]
[Rank 0] Train Epoch 3:  14%|█▍        | 289/2000 [00:02<00:12, 138.44it/s]
[Rank 2] Train Epoch 3:  14%|█▍        | 278/2000 [00:02<00:13, 132.39it/s]
[Rank 1] Train Epoch 3:  15%|█▍        | 298/2000 [00:02<00:11, 151.57it/s]
[Rank 2] Train Epoch 3:  15%|█▍        | 292/2000 [00:02<00:13, 127.22it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 274 | Mem: 26.53MB, Util: 96%  global_step : 6274
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 275 | Mem: 26.53MB, Util: 96%  global_step : 6275
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 276 | Mem: 26.53MB, Util: 96%  global_step : 6276
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 277 | Mem: 26.53MB, Util: 96%  global_step : 6277
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 278 | Mem: 26.53MB, Util: 96%  global_step : 6278
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 279 | Mem: 26.53MB, Util: 96%  global_step : 6279
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 280 | Mem: 26.53MB, Util: 96%  global_step : 6280
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 281 | Mem: 26.53MB, Util: 96%  global_step : 6281


[Rank 0] Train Epoch 3:  15%|█▌        | 303/2000 [00:02<00:13, 122.61it/s]
[Rank 2] Train Epoch 3:  15%|█▌        | 306/2000 [00:02<00:13, 128.43it/s]
[Rank 1] Train Epoch 3:  16%|█▌        | 314/2000 [00:02<00:15, 111.34it/s]
[Rank 0] Train Epoch 3:  16%|█▌        | 318/2000 [00:02<00:13, 128.81it/s]
[Rank 2] Train Epoch 3:  16%|█▌        | 320/2000 [00:02<00:12, 129.87it/s]
[Rank 1] Train Epoch 3:  16%|█▋        | 330/2000 [00:02<00:13, 121.78it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 300 | Mem: 26.53MB, Util: 100%  global_step : 6300
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 301 | Mem: 26.53MB, Util: 100%  global_step : 6301
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 302 | Mem: 26.53MB, Util: 100%  global_step : 6302
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 303 | Mem: 26.53MB, Util: 100%  global_step : 6303
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 304 | Mem: 26.53MB, Util: 100%  global_step : 6304
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 305 | Mem: 26.53MB, Util: 100%  global_step : 6305
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 306 | Mem: 26.53MB, Util: 100%  global_step : 6306
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 307 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  17%|█▋        | 333/2000 [00:02<00:12, 133.28it/s]
[Rank 2] Train Epoch 3:  17%|█▋        | 334/2000 [00:02<00:12, 129.32it/s]
[Rank 1] Train Epoch 3:  17%|█▋        | 346/2000 [00:02<00:12, 130.12it/s]
[Rank 0] Train Epoch 3:  17%|█▋        | 348/2000 [00:02<00:12, 135.47it/s]
[Rank 2] Train Epoch 3:  17%|█▋        | 347/2000 [00:02<00:12, 128.98it/s]
[Rank 1] Train Epoch 3:  18%|█▊        | 362/2000 [00:02<00:11, 137.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 327 | Mem: 26.53MB, Util: 100%  global_step : 6327
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 328 | Mem: 26.53MB, Util: 100%  global_step : 6328
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 329 | Mem: 26.53MB, Util: 100%  global_step : 6329
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 330 | Mem: 26.53MB, Util: 100%  global_step : 6330
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 331 | Mem: 26.53MB, Util: 100%  global_step : 6331
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 332 | Mem: 26.53MB, Util: 100%  global_step : 6332
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 333 | Mem: 26.53MB, Util: 100%  global_step : 6333
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 334 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  18%|█▊        | 363/2000 [00:02<00:11, 137.23it/s]
[Rank 2] Train Epoch 3:  18%|█▊        | 360/2000 [00:02<00:12, 127.85it/s]
[Rank 1] Train Epoch 3:  19%|█▉        | 378/2000 [00:02<00:11, 142.52it/s]
[Rank 0] Train Epoch 3:  19%|█▉        | 377/2000 [00:02<00:11, 137.83it/s]
[Rank 2] Train Epoch 3:  19%|█▊        | 373/2000 [00:02<00:12, 128.26it/s]
[Rank 1] Train Epoch 3:  20%|█▉        | 394/2000 [00:02<00:10, 146.06it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 356 | Mem: 26.53MB, Util: 99%  global_step : 6356
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 357 | Mem: 26.53MB, Util: 99%  global_step : 6357
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 358 | Mem: 26.53MB, Util: 99%  global_step : 6358
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 359 | Mem: 26.53MB, Util: 99%  global_step : 6359
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 360 | Mem: 26.53MB, Util: 99%  global_step : 6360
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 361 | Mem: 26.53MB, Util: 99%  global_step : 6361
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 362 | Mem: 26.53MB, Util: 99%  global_step : 6362
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 363 | Mem: 26.53MB, Util: 99%  global_step : 6363


[Rank 0] Train Epoch 3:  20%|█▉        | 392/2000 [00:02<00:11, 139.99it/s]
[Rank 2] Train Epoch 3:  19%|█▉        | 387/2000 [00:02<00:12, 129.72it/s]
[Rank 2] Train Epoch 3:  20%|██        | 401/2000 [00:03<00:12, 130.49it/s]
[Rank 1] Train Epoch 3:  20%|██        | 410/2000 [00:03<00:14, 109.61it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 386 | Mem: 26.53MB, Util: 100%  global_step : 6386
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 387 | Mem: 26.53MB, Util: 100%  global_step : 6387
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 388 | Mem: 26.53MB, Util: 100%  global_step : 6388
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 389 | Mem: 26.53MB, Util: 100%  global_step : 6389
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 390 | Mem: 26.53MB, Util: 100%  global_step : 6390
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 391 | Mem: 26.53MB, Util: 100%  global_step : 6391
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 392 | Mem: 26.53MB, Util: 100%  global_step : 6392
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 393 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  20%|██        | 407/2000 [00:03<00:13, 117.66it/s]
[Rank 2] Train Epoch 3:  21%|██        | 415/2000 [00:03<00:12, 129.08it/s]
[Rank 1] Train Epoch 3:  21%|██▏       | 425/2000 [00:03<00:13, 117.96it/s]
[Rank 0] Train Epoch 3:  21%|██        | 422/2000 [00:03<00:12, 124.89it/s]
[Rank 2] Train Epoch 3:  21%|██▏       | 429/2000 [00:03<00:12, 129.64it/s]
[Rank 1] Train Epoch 3:  22%|██▏       | 440/2000 [00:03<00:12, 124.16it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 406 | Mem: 26.53MB, Util: 100%  global_step : 6406
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 407 | Mem: 26.53MB, Util: 100%  global_step : 6407
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 408 | Mem: 26.53MB, Util: 100%  global_step : 6408
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 409 | Mem: 26.53MB, Util: 100%  global_step : 6409
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 410 | Mem: 26.53MB, Util: 100%  global_step : 6410
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 411 | Mem: 26.53MB, Util: 100%  global_step : 6411
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 412 | Mem: 26.53MB, Util: 100%  global_step : 6412
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 413 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  22%|██▏       | 437/2000 [00:03<00:12, 130.20it/s]
[Rank 2] Train Epoch 3:  22%|██▏       | 442/2000 [00:03<00:12, 128.81it/s]
[Rank 1] Train Epoch 3:  23%|██▎       | 455/2000 [00:03<00:11, 128.99it/s]
[Rank 0] Train Epoch 3:  23%|██▎       | 451/2000 [00:03<00:11, 131.10it/s]
[Rank 2] Train Epoch 3:  23%|██▎       | 455/2000 [00:03<00:12, 128.40it/s]
[Rank 1] Train Epoch 3:  24%|██▎       | 470/2000 [00:03<00:11, 132.66it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 436 | Mem: 26.53MB, Util: 98%  global_step : 6436
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 437 | Mem: 26.53MB, Util: 98%  global_step : 6437
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 438 | Mem: 26.53MB, Util: 98%  global_step : 6438
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 439 | Mem: 26.53MB, Util: 98%  global_step : 6439
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 440 | Mem: 26.53MB, Util: 98%  global_step : 6440
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 441 | Mem: 26.53MB, Util: 98%  global_step : 6441
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 442 | Mem: 26.53MB, Util: 98%  global_step : 6442
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 443 | Mem: 26.53MB, Util: 98%  global_step : 6443


[Rank 0] Train Epoch 3:  23%|██▎       | 465/2000 [00:03<00:11, 133.10it/s]
[Rank 2] Train Epoch 3:  23%|██▎       | 468/2000 [00:03<00:12, 126.93it/s]
[Rank 1] Train Epoch 3:  24%|██▍       | 485/2000 [00:03<00:11, 135.44it/s]
[Rank 0] Train Epoch 3:  24%|██▍       | 479/2000 [00:03<00:11, 133.69it/s]
[Rank 2] Train Epoch 3:  24%|██▍       | 482/2000 [00:03<00:11, 129.86it/s]
[Rank 1] Train Epoch 3:  25%|██▌       | 500/2000 [00:03<00:10, 137.99it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 464 | Mem: 26.53MB, Util: 100%  global_step : 6464
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 465 | Mem: 26.53MB, Util: 100%  global_step : 6465
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 466 | Mem: 26.53MB, Util: 100%  global_step : 6466
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 467 | Mem: 26.53MB, Util: 100%  global_step : 6467
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 468 | Mem: 26.53MB, Util: 100%  global_step : 6468
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 469 | Mem: 26.53MB, Util: 100%  global_step : 6469
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 470 | Mem: 26.53MB, Util: 100%  global_step : 6470
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 471 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  25%|██▍       | 493/2000 [00:03<00:11, 134.52it/s]
[Rank 2] Train Epoch 3:  25%|██▍       | 495/2000 [00:03<00:11, 127.57it/s]
[Rank 0] Train Epoch 3:  25%|██▌       | 507/2000 [00:03<00:12, 115.51it/s]
[Rank 2] Train Epoch 3:  25%|██▌       | 509/2000 [00:03<00:11, 129.54it/s]
[Rank 1] Train Epoch 3:  26%|██▌       | 515/2000 [00:03<00:13, 112.24it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 492 | Mem: 26.53MB, Util: 100%  global_step : 6492
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 493 | Mem: 26.53MB, Util: 100%  global_step : 6493
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 494 | Mem: 26.53MB, Util: 100%  global_step : 6494
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 495 | Mem: 26.53MB, Util: 100%  global_step : 6495
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 496 | Mem: 26.53MB, Util: 100%  global_step : 6496
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 497 | Mem: 26.53MB, Util: 100%  global_step : 6497
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 498 | Mem: 26.53MB, Util: 100%  global_step : 6498
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 499 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  26%|██▌       | 521/2000 [00:03<00:12, 121.64it/s]
[Rank 2] Train Epoch 3:  26%|██▌       | 523/2000 [00:03<00:11, 130.01it/s]
[Rank 1] Train Epoch 3:  26%|██▋       | 530/2000 [00:03<00:12, 120.27it/s]
[Rank 0] Train Epoch 3:  27%|██▋       | 535/2000 [00:04<00:11, 125.38it/s]
[Rank 2] Train Epoch 3:  27%|██▋       | 537/2000 [00:04<00:11, 126.97it/s]
[Rank 1] Train Epoch 3:  27%|██▋       | 545/2000 [00:04<00:11, 126.61it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 513 | Mem: 26.53MB, Util: 99%  global_step : 6513
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 514 | Mem: 26.53MB, Util: 99%  global_step : 6514
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 515 | Mem: 26.53MB, Util: 99%  global_step : 6515
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 516 | Mem: 26.53MB, Util: 99%  global_step : 6516
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 517 | Mem: 26.53MB, Util: 99%  global_step : 6517
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 518 | Mem: 26.53MB, Util: 99%  global_step : 6518
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 519 | Mem: 26.53MB, Util: 99%  global_step : 6519
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 520 | Mem: 26.53MB, Util: 99%  global_step : 6520


[Rank 0] Train Epoch 3:  27%|██▋       | 549/2000 [00:04<00:11, 129.28it/s]
[Rank 2] Train Epoch 3:  28%|██▊       | 550/2000 [00:04<00:11, 127.28it/s]
[Rank 1] Train Epoch 3:  28%|██▊       | 561/2000 [00:04<00:10, 133.07it/s]
[Rank 0] Train Epoch 3:  28%|██▊       | 563/2000 [00:04<00:11, 130.45it/s]
[Rank 2] Train Epoch 3:  28%|██▊       | 563/2000 [00:04<00:11, 126.17it/s]
[Rank 1] Train Epoch 3:  29%|██▉       | 576/2000 [00:04<00:10, 135.17it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 541 | Mem: 26.53MB, Util: 100%  global_step : 6541
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 542 | Mem: 26.53MB, Util: 100%  global_step : 6542
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 543 | Mem: 26.53MB, Util: 100%  global_step : 6543
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 544 | Mem: 26.53MB, Util: 100%  global_step : 6544
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 545 | Mem: 26.53MB, Util: 100%  global_step : 6545
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 546 | Mem: 26.53MB, Util: 100%  global_step : 6546
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 547 | Mem: 26.53MB, Util: 100%  global_step : 6547
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 548 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  29%|██▉       | 577/2000 [00:04<00:10, 131.11it/s]
[Rank 2] Train Epoch 3:  29%|██▉       | 577/2000 [00:04<00:11, 129.08it/s]
[Rank 1] Train Epoch 3:  30%|██▉       | 591/2000 [00:04<00:10, 137.76it/s]
[Rank 0] Train Epoch 3:  30%|██▉       | 591/2000 [00:04<00:10, 132.72it/s]
[Rank 2] Train Epoch 3:  30%|██▉       | 590/2000 [00:04<00:10, 128.91it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 569 | Mem: 26.53MB, Util: 100%  global_step : 6569
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 570 | Mem: 26.53MB, Util: 100%  global_step : 6570
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 571 | Mem: 26.53MB, Util: 100%  global_step : 6571
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 572 | Mem: 26.53MB, Util: 100%  global_step : 6572
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 573 | Mem: 26.53MB, Util: 100%  global_step : 6573
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 574 | Mem: 26.53MB, Util: 100%  global_step : 6574
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 575 | Mem: 26.53MB, Util: 100%  global_step : 6575
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 576 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  30%|███       | 605/2000 [00:04<00:11, 119.61it/s]
[Rank 2] Train Epoch 3:  30%|███       | 604/2000 [00:04<00:10, 131.54it/s]
[Rank 1] Train Epoch 3:  30%|███       | 606/2000 [00:04<00:12, 113.86it/s]
[Rank 0] Train Epoch 3:  31%|███       | 619/2000 [00:04<00:11, 123.43it/s]
[Rank 2] Train Epoch 3:  31%|███       | 618/2000 [00:04<00:10, 131.40it/s]
[Rank 1] Train Epoch 3:  31%|███       | 621/2000 [00:04<00:11, 121.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 597 | Mem: 26.53MB, Util: 100%  global_step : 6597
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 598 | Mem: 26.53MB, Util: 100%  global_step : 6598
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 599 | Mem: 26.53MB, Util: 100%  global_step : 6599
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 600 | Mem: 26.53MB, Util: 100%  global_step : 6600
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 601 | Mem: 26.53MB, Util: 100%  global_step : 6601
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 602 | Mem: 26.53MB, Util: 100%  global_step : 6602
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 603 | Mem: 26.53MB, Util: 100%  global_step : 6603
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 604 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  32%|███▏      | 632/2000 [00:04<00:10, 124.58it/s]
[Rank 2] Train Epoch 3:  32%|███▏      | 632/2000 [00:04<00:10, 130.88it/s]
[Rank 1] Train Epoch 3:  32%|███▏      | 636/2000 [00:04<00:10, 127.48it/s]
[Rank 0] Train Epoch 3:  32%|███▏      | 645/2000 [00:04<00:10, 125.97it/s]
[Rank 2] Train Epoch 3:  32%|███▏      | 646/2000 [00:04<00:10, 130.93it/s]
[Rank 1] Train Epoch 3:  33%|███▎      | 651/2000 [00:04<00:10, 131.97it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 619 | Mem: 26.53MB, Util: 87%  global_step : 6619
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 620 | Mem: 26.53MB, Util: 87%  global_step : 6620
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 621 | Mem: 26.53MB, Util: 87%  global_step : 6621
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 622 | Mem: 26.53MB, Util: 87%  global_step : 6622
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 623 | Mem: 26.53MB, Util: 87%  global_step : 6623
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 624 | Mem: 26.53MB, Util: 87%  global_step : 6624
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 625 | Mem: 26.53MB, Util: 87%  global_step : 6625
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 626 | Mem: 26.53MB, Util: 87%  global_step : 6626


[Rank 0] Train Epoch 3:  33%|███▎      | 658/2000 [00:04<00:10, 126.05it/s]
[Rank 2] Train Epoch 3:  33%|███▎      | 660/2000 [00:05<00:10, 130.59it/s]
[Rank 1] Train Epoch 3:  33%|███▎      | 666/2000 [00:04<00:09, 134.66it/s]
[Rank 0] Train Epoch 3:  34%|███▎      | 673/2000 [00:05<00:10, 131.53it/s]
[Rank 2] Train Epoch 3:  34%|███▎      | 674/2000 [00:05<00:10, 129.19it/s]
[Rank 1] Train Epoch 3:  34%|███▍      | 681/2000 [00:05<00:09, 136.77it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 646 | Mem: 26.53MB, Util: 83%  global_step : 6646
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 647 | Mem: 26.53MB, Util: 83%  global_step : 6647
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 648 | Mem: 26.53MB, Util: 83%  global_step : 6648
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 649 | Mem: 26.53MB, Util: 83%  global_step : 6649
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 650 | Mem: 26.53MB, Util: 83%  global_step : 6650
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 651 | Mem: 26.53MB, Util: 83%  global_step : 6651
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 652 | Mem: 26.53MB, Util: 83%  global_step : 6652
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 653 | Mem: 26.53MB, Util: 83%  global_step : 6653


[Rank 0] Train Epoch 3:  35%|███▍      | 691/2000 [00:05<00:09, 144.28it/s]
[Rank 2] Train Epoch 3:  34%|███▍      | 688/2000 [00:05<00:10, 130.05it/s]
[Rank 1] Train Epoch 3:  35%|███▍      | 696/2000 [00:05<00:09, 138.32it/s]
[Rank 2] Train Epoch 3:  35%|███▌      | 702/2000 [00:05<00:10, 129.11it/s]
[Rank 1] Train Epoch 3:  36%|███▌      | 711/2000 [00:05<00:11, 116.65it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 676 | Mem: 26.53MB, Util: 65%  global_step : 6676
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 677 | Mem: 26.53MB, Util: 65%  global_step : 6677
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 678 | Mem: 26.53MB, Util: 65%  global_step : 6678
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 679 | Mem: 26.53MB, Util: 65%  global_step : 6679
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 680 | Mem: 26.53MB, Util: 65%  global_step : 6680
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 681 | Mem: 26.53MB, Util: 65%  global_step : 6681
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 682 | Mem: 26.53MB, Util: 65%  global_step : 6682
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 683 | Mem: 26.53MB, Util: 65%  global_step : 6683


[Rank 0] Train Epoch 3:  35%|███▌      | 706/2000 [00:05<00:10, 124.40it/s]
[Rank 2] Train Epoch 3:  36%|███▌      | 715/2000 [00:05<00:10, 125.53it/s]
[Rank 1] Train Epoch 3:  36%|███▋      | 726/2000 [00:05<00:10, 123.11it/s]
[Rank 0] Train Epoch 3:  36%|███▌      | 721/2000 [00:05<00:09, 130.18it/s]
[Rank 2] Train Epoch 3:  36%|███▋      | 728/2000 [00:05<00:10, 123.93it/s]
[Rank 1] Train Epoch 3:  37%|███▋      | 741/2000 [00:05<00:09, 128.40it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 701 | Mem: 26.53MB, Util: 99%  global_step : 6701
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 702 | Mem: 26.53MB, Util: 99%  global_step : 6702
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 703 | Mem: 26.53MB, Util: 99%  global_step : 6703
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 704 | Mem: 26.53MB, Util: 99%  global_step : 6704
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 705 | Mem: 26.53MB, Util: 99%  global_step : 6705
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 706 | Mem: 26.53MB, Util: 99%  global_step : 6706
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 707 | Mem: 26.53MB, Util: 99%  global_step : 6707
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 708 | Mem: 26.53MB, Util: 99%  global_step : 6708


[Rank 0] Train Epoch 3:  37%|███▋      | 736/2000 [00:05<00:09, 133.46it/s]
[Rank 2] Train Epoch 3:  37%|███▋      | 741/2000 [00:05<00:10, 125.19it/s]
[Rank 1] Train Epoch 3:  38%|███▊      | 756/2000 [00:05<00:09, 132.51it/s]
[Rank 0] Train Epoch 3:  38%|███▊      | 751/2000 [00:05<00:09, 136.43it/s]
[Rank 2] Train Epoch 3:  38%|███▊      | 755/2000 [00:05<00:09, 127.95it/s]
[Rank 1] Train Epoch 3:  39%|███▊      | 771/2000 [00:05<00:09, 135.59it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 731 | Mem: 26.53MB, Util: 97%  global_step : 6731
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 732 | Mem: 26.53MB, Util: 97%  global_step : 6732
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 733 | Mem: 26.53MB, Util: 97%  global_step : 6733
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 734 | Mem: 26.53MB, Util: 97%  global_step : 6734
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 735 | Mem: 26.53MB, Util: 97%  global_step : 6735
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 736 | Mem: 26.53MB, Util: 97%  global_step : 6736
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 737 | Mem: 26.53MB, Util: 97%  global_step : 6737
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 738 | Mem: 26.53MB, Util: 97%  global_step : 6738


[Rank 0] Train Epoch 3:  38%|███▊      | 766/2000 [00:05<00:08, 138.11it/s]
[Rank 2] Train Epoch 3:  38%|███▊      | 768/2000 [00:05<00:09, 127.00it/s]
[Rank 1] Train Epoch 3:  39%|███▉      | 786/2000 [00:05<00:08, 137.03it/s]
[Rank 0] Train Epoch 3:  39%|███▉      | 780/2000 [00:05<00:08, 138.13it/s]
[Rank 2] Train Epoch 3:  39%|███▉      | 781/2000 [00:05<00:09, 127.03it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 761 | Mem: 26.53MB, Util: 100%  global_step : 6761
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 762 | Mem: 26.53MB, Util: 100%  global_step : 6762
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 763 | Mem: 26.53MB, Util: 100%  global_step : 6763
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 764 | Mem: 26.53MB, Util: 100%  global_step : 6764
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 765 | Mem: 26.53MB, Util: 100%  global_step : 6765
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 766 | Mem: 26.53MB, Util: 100%  global_step : 6766
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 767 | Mem: 26.53MB, Util: 100%  global_step : 6767
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 768 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  40%|███▉      | 798/2000 [00:05<00:08, 149.99it/s]
[Rank 2] Train Epoch 3:  40%|███▉      | 794/2000 [00:06<00:09, 127.84it/s]
[Rank 1] Train Epoch 3:  40%|████      | 801/2000 [00:06<00:10, 109.46it/s]
[Rank 2] Train Epoch 3:  40%|████      | 807/2000 [00:06<00:09, 126.43it/s]
[Rank 1] Train Epoch 3:  41%|████      | 816/2000 [00:06<00:10, 117.59it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 793 | Mem: 26.53MB, Util: 100%  global_step : 6793
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 794 | Mem: 26.53MB, Util: 100%  global_step : 6794
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 795 | Mem: 26.53MB, Util: 100%  global_step : 6795
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 796 | Mem: 26.53MB, Util: 100%  global_step : 6796
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 797 | Mem: 26.53MB, Util: 100%  global_step : 6797
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 798 | Mem: 26.53MB, Util: 100%  global_step : 6798
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 799 | Mem: 26.53MB, Util: 100%  global_step : 6799
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 785 | Mem: 26.53MB, Util: 61%  global_step

[Rank 0] Train Epoch 3:  41%|████      | 814/2000 [00:06<00:10, 114.56it/s]
[Rank 2] Train Epoch 3:  41%|████      | 821/2000 [00:06<00:09, 127.49it/s]
[Rank 1] Train Epoch 3:  42%|████▏     | 831/2000 [00:06<00:09, 124.08it/s]
[Rank 0] Train Epoch 3:  42%|████▏     | 832/2000 [00:06<00:09, 129.69it/s]
[Rank 2] Train Epoch 3:  42%|████▏     | 835/2000 [00:06<00:08, 129.45it/s]
[Rank 1] Train Epoch 3:  42%|████▏     | 845/2000 [00:06<00:09, 127.28it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 808 | Mem: 26.53MB, Util: 100%  global_step : 6808
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 809 | Mem: 26.53MB, Util: 100%  global_step : 6809
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 810 | Mem: 26.53MB, Util: 100%  global_step : 6810
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 811 | Mem: 26.53MB, Util: 100%  global_step : 6811
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 812 | Mem: 26.53MB, Util: 100%  global_step : 6812
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 813 | Mem: 26.53MB, Util: 100%  global_step : 6813
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 814 | Mem: 26.53MB, Util: 100%  global_step : 6814
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 815 | Mem: 26.53MB, Util: 100%  global_step

[Rank 0] Train Epoch 3:  42%|████▎     | 850/2000 [00:06<00:08, 142.20it/s]
[Rank 2] Train Epoch 3:  42%|████▏     | 848/2000 [00:06<00:08, 128.55it/s]
[Rank 1] Train Epoch 3:  43%|████▎     | 861/2000 [00:06<00:08, 134.21it/s]
[Rank 0] Train Epoch 3:  43%|████▎     | 868/2000 [00:06<00:07, 152.04it/s]
[Rank 2] Train Epoch 3:  43%|████▎     | 861/2000 [00:06<00:09, 124.17it/s]
[Rank 1] Train Epoch 3:  44%|████▍     | 877/2000 [00:06<00:07, 141.09it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 846 | Mem: 26.53MB, Util: 99%  global_step : 6846
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 847 | Mem: 26.53MB, Util: 99%  global_step : 6847
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 848 | Mem: 26.53MB, Util: 99%  global_step : 6848
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 849 | Mem: 26.53MB, Util: 99%  global_step : 6849
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 850 | Mem: 26.53MB, Util: 99%  global_step : 6850
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 851 | Mem: 26.53MB, Util: 99%  global_step : 6851
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 852 | Mem: 26.53MB, Util: 99%  global_step : 6852
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 853 | Mem: 26.53MB, Util: 99%  global_step : 6853


[Rank 0] Train Epoch 3:  44%|████▍     | 885/2000 [00:06<00:07, 155.06it/s]
[Rank 2] Train Epoch 3:  44%|████▎     | 874/2000 [00:06<00:09, 124.98it/s]
[Rank 1] Train Epoch 3:  45%|████▍     | 893/2000 [00:06<00:07, 144.41it/s]
[Rank 2] Train Epoch 3:  44%|████▍     | 887/2000 [00:06<00:08, 126.11it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 883 | Mem: 26.53MB, Util: 100%  global_step : 6883
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 884 | Mem: 26.53MB, Util: 100%  global_step : 6884
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 885 | Mem: 26.53MB, Util: 100%  global_step : 6885
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 886 | Mem: 26.53MB, Util: 100%  global_step : 6886
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 887 | Mem: 26.53MB, Util: 100%  global_step : 6887
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 888 | Mem: 26.53MB, Util: 100%  global_step : 6888
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 889 | Mem: 26.53MB, Util: 100%  global_step : 6889
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 890 | Mem: 26.53MB, Util: 100%  global_step

[Rank 2] Train Epoch 3:  45%|████▌     | 900/2000 [00:06<00:08, 126.43it/s]
[Rank 1] Train Epoch 3:  45%|████▌     | 908/2000 [00:06<00:09, 111.53it/s]
[Rank 0] Train Epoch 3:  45%|████▌     | 902/2000 [00:06<00:10, 99.86it/s] 
[Rank 2] Train Epoch 3:  46%|████▌     | 913/2000 [00:07<00:08, 127.03it/s]
[Rank 1] Train Epoch 3:  46%|████▌     | 924/2000 [00:07<00:08, 122.20it/s]
[Rank 0] Train Epoch 3:  46%|████▌     | 920/2000 [00:07<00:09, 115.86it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 890 | Mem: 26.53MB, Util: 58%  global_step : 6890
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 891 | Mem: 26.53MB, Util: 58%  global_step : 6891
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 892 | Mem: 26.53MB, Util: 58%  global_step : 6892
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 893 | Mem: 26.53MB, Util: 58%  global_step : 6893
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 894 | Mem: 26.53MB, Util: 58%  global_step : 6894
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 895 | Mem: 26.53MB, Util: 58%  global_step : 6895
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 896 | Mem: 26.53MB, Util: 58%  global_step : 6896
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 897 | Mem: 26.53MB, Util: 58%  global_step

[Rank 2] Train Epoch 3:  46%|████▋     | 926/2000 [00:07<00:08, 127.80it/s]
[Rank 1] Train Epoch 3:  47%|████▋     | 940/2000 [00:07<00:08, 130.49it/s]
[Rank 0] Train Epoch 3:  47%|████▋     | 938/2000 [00:07<00:08, 129.82it/s]
[Rank 2] Train Epoch 3:  47%|████▋     | 940/2000 [00:07<00:08, 129.06it/s]
[Rank 1] Train Epoch 3:  48%|████▊     | 956/2000 [00:07<00:07, 137.39it/s]
[Rank 0] Train Epoch 3:  48%|████▊     | 956/2000 [00:07<00:07, 141.79it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 916 | Mem: 26.53MB, Util: 61%  global_step : 6916
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 917 | Mem: 26.53MB, Util: 61%  global_step : 6917
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 918 | Mem: 26.53MB, Util: 61%  global_step : 6918
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 919 | Mem: 26.53MB, Util: 61%  global_step : 6919
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 920 | Mem: 26.53MB, Util: 61%  global_step : 6920
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 921 | Mem: 26.53MB, Util: 61%  global_step : 6921
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 922 | Mem: 26.53MB, Util: 61%  global_step : 6922
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 923 | Mem: 26.53MB, Util: 61%  global_step

[Rank 2] Train Epoch 3:  48%|████▊     | 953/2000 [00:07<00:08, 128.91it/s]
[Rank 1] Train Epoch 3:  49%|████▊     | 972/2000 [00:07<00:07, 141.90it/s]
[Rank 0] Train Epoch 3:  49%|████▊     | 974/2000 [00:07<00:06, 150.14it/s]
[Rank 2] Train Epoch 3:  48%|████▊     | 967/2000 [00:07<00:07, 131.02it/s]
[Rank 1] Train Epoch 3:  49%|████▉     | 988/2000 [00:07<00:07, 144.56it/s]
[Rank 0] Train Epoch 3:  50%|████▉     | 992/2000 [00:07<00:06, 155.12it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 943 | Mem: 26.53MB, Util: 63%  global_step : 6943
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 944 | Mem: 26.53MB, Util: 63%  global_step : 6944
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 945 | Mem: 26.53MB, Util: 63%  global_step : 6945
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 946 | Mem: 26.53MB, Util: 63%  global_step : 6946
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 947 | Mem: 26.53MB, Util: 63%  global_step : 6947
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 948 | Mem: 26.53MB, Util: 63%  global_step : 6948
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 949 | Mem: 26.53MB, Util: 63%  global_step : 6949
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 950 | Mem: 26.53MB, Util: 63%  global_step

[Rank 2] Train Epoch 3:  49%|████▉     | 981/2000 [00:07<00:07, 130.08it/s]
[Rank 2] Train Epoch 3:  50%|████▉     | 995/2000 [00:07<00:07, 132.11it/s]
[Rank 1] Train Epoch 3:  50%|█████     | 1003/2000 [00:07<00:08, 111.90it/s]
[Rank 0] Train Epoch 3:  50%|█████     | 1009/2000 [00:07<00:09, 105.68it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 971 | Mem: 26.53MB, Util: 61%  global_step : 6971
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 972 | Mem: 26.53MB, Util: 61%  global_step : 6972
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 973 | Mem: 26.53MB, Util: 61%  global_step : 6973
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 974 | Mem: 26.53MB, Util: 61%  global_step : 6974
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 975 | Mem: 26.53MB, Util: 61%  global_step : 6975
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 976 | Mem: 26.53MB, Util: 61%  global_step : 6976
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 977 | Mem: 26.53MB, Util: 61%  global_step : 6977
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 978 | Mem: 26.53MB, Util: 61%  global_step

[Rank 2] Train Epoch 3:  50%|█████     | 1009/2000 [00:07<00:07, 131.74it/s]
[Rank 1] Train Epoch 3:  51%|█████     | 1019/2000 [00:07<00:08, 121.81it/s]
[Rank 0] Train Epoch 3:  51%|█████▏    | 1028/2000 [00:07<00:07, 121.63it/s]
[Rank 2] Train Epoch 3:  51%|█████     | 1023/2000 [00:07<00:07, 128.45it/s]
[Rank 1] Train Epoch 3:  52%|█████▏    | 1035/2000 [00:07<00:07, 130.97it/s]
[Rank 0] Train Epoch 3:  52%|█████▏    | 1046/2000 [00:07<00:07, 134.66it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 999 | Mem: 26.53MB, Util: 64%  global_step : 6999
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1000 | Mem: 26.53MB, Util: 64%  global_step : 7000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1001 | Mem: 26.53MB, Util: 64%  global_step : 7001
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1002 | Mem: 26.53MB, Util: 64%  global_step : 7002
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1003 | Mem: 26.53MB, Util: 64%  global_step : 7003
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1004 | Mem: 26.53MB, Util: 64%  global_step : 7004
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1005 | Mem: 26.53MB, Util: 64%  global_step : 7005
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1006 | Mem: 26.53MB, Util: 64%  glob

[Rank 2] Train Epoch 3:  52%|█████▏    | 1037/2000 [00:07<00:07, 129.05it/s]
[Rank 1] Train Epoch 3:  52%|█████▎    | 1050/2000 [00:07<00:07, 135.21it/s]
[Rank 0] Train Epoch 3:  53%|█████▎    | 1065/2000 [00:08<00:06, 146.21it/s]
[Rank 2] Train Epoch 3:  52%|█████▎    | 1050/2000 [00:08<00:07, 126.93it/s]
[Rank 1] Train Epoch 3:  53%|█████▎    | 1066/2000 [00:08<00:06, 141.06it/s]
[Rank 0] Train Epoch 3:  54%|█████▍    | 1082/2000 [00:08<00:06, 148.36it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1025 | Mem: 26.53MB, Util: 62%  global_step : 7025
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1026 | Mem: 26.53MB, Util: 62%  global_step : 7026
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1027 | Mem: 26.53MB, Util: 62%  global_step : 7027
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1028 | Mem: 26.53MB, Util: 62%  global_step : 7028
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1029 | Mem: 26.53MB, Util: 62%  global_step : 7029
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1030 | Mem: 26.53MB, Util: 62%  global_step : 7030
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1031 | Mem: 26.53MB, Util: 62%  global_step : 7031
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1032 | Mem: 26.53MB, Util: 62%  glo

[Rank 2] Train Epoch 3:  53%|█████▎    | 1063/2000 [00:08<00:07, 125.38it/s]
[Rank 1] Train Epoch 3:  54%|█████▍    | 1082/2000 [00:08<00:06, 144.89it/s]
[Rank 2] Train Epoch 3:  54%|█████▍    | 1076/2000 [00:08<00:07, 126.08it/s]
[Rank 1] Train Epoch 3:  55%|█████▍    | 1098/2000 [00:08<00:06, 147.74it/s]
[Rank 0] Train Epoch 3:  55%|█████▍    | 1099/2000 [00:08<00:06, 141.53it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1051 | Mem: 26.53MB, Util: 60%  global_step : 7051
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1052 | Mem: 26.53MB, Util: 60%  global_step : 7052
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1053 | Mem: 26.53MB, Util: 60%  global_step : 7053
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1054 | Mem: 26.53MB, Util: 60%  global_step : 7054
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1055 | Mem: 26.53MB, Util: 60%  global_step : 7055
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1056 | Mem: 26.53MB, Util: 60%  global_step : 7056
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1057 | Mem: 26.53MB, Util: 60%  global_step : 7057
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1058 | Mem: 26.53MB, Util: 60%  glo

[Rank 2] Train Epoch 3:  54%|█████▍    | 1089/2000 [00:08<00:07, 126.35it/s]
[Rank 2] Train Epoch 3:  55%|█████▌    | 1102/2000 [00:08<00:07, 126.38it/s]
[Rank 0] Train Epoch 3:  56%|█████▌    | 1115/2000 [00:08<00:09, 97.92it/s] 


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1078 | Mem: 26.53MB, Util: 59%  global_step : 7078
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1079 | Mem: 26.53MB, Util: 59%  global_step : 7079
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1080 | Mem: 26.53MB, Util: 59%  global_step : 7080
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1081 | Mem: 26.53MB, Util: 59%  global_step : 7081
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1082 | Mem: 26.53MB, Util: 59%  global_step : 7082
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1083 | Mem: 26.53MB, Util: 59%  global_step : 7083
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1084 | Mem: 26.53MB, Util: 59%  global_step : 7084
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1085 | Mem: 26.53MB, Util: 59%  glo

[Rank 2] Train Epoch 3:  56%|█████▌    | 1115/2000 [00:08<00:06, 127.27it/s]
[Rank 1] Train Epoch 3:  56%|█████▌    | 1114/2000 [00:08<00:08, 104.95it/s]
[Rank 0] Train Epoch 3:  56%|█████▋    | 1130/2000 [00:08<00:08, 107.17it/s]
[Rank 2] Train Epoch 3:  56%|█████▋    | 1129/2000 [00:08<00:06, 130.59it/s]
[Rank 1] Train Epoch 3:  56%|█████▋    | 1130/2000 [00:08<00:07, 116.13it/s]
[Rank 0] Train Epoch 3:  57%|█████▋    | 1145/2000 [00:08<00:07, 115.13it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1104 | Mem: 26.53MB, Util: 62%  global_step : 7104
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1105 | Mem: 26.53MB, Util: 62%  global_step : 7105
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1106 | Mem: 26.53MB, Util: 62%  global_step : 7106
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1107 | Mem: 26.53MB, Util: 62%  global_step : 7107
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1108 | Mem: 26.53MB, Util: 62%  global_step : 7108
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1109 | Mem: 26.53MB, Util: 62%  global_step : 7109
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1110 | Mem: 26.53MB, Util: 62%  global_step : 7110
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1111 | Mem: 26.53MB, Util: 62%  glo

[Rank 2] Train Epoch 3:  57%|█████▋    | 1143/2000 [00:08<00:06, 132.16it/s]
[Rank 1] Train Epoch 3:  57%|█████▋    | 1145/2000 [00:08<00:06, 123.77it/s]
[Rank 0] Train Epoch 3:  58%|█████▊    | 1160/2000 [00:08<00:06, 122.23it/s]
[Rank 2] Train Epoch 3:  58%|█████▊    | 1157/2000 [00:08<00:06, 128.28it/s]
[Rank 1] Train Epoch 3:  58%|█████▊    | 1161/2000 [00:08<00:06, 132.76it/s]
[Rank 0] Train Epoch 3:  59%|█████▉    | 1175/2000 [00:08<00:06, 127.29it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1132 | Mem: 26.53MB, Util: 60%  global_step : 7132
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1133 | Mem: 26.53MB, Util: 60%  global_step : 7133
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1134 | Mem: 26.53MB, Util: 60%  global_step : 7134
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1135 | Mem: 26.53MB, Util: 60%  global_step : 7135
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1136 | Mem: 26.53MB, Util: 60%  global_step : 7136
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1137 | Mem: 26.53MB, Util: 60%  global_step : 7137
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1138 | Mem: 26.53MB, Util: 60%  global_step : 7138
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1139 | Mem: 26.53MB, Util: 60%  glo

[Rank 2] Train Epoch 3:  59%|█████▊    | 1171/2000 [00:09<00:06, 130.40it/s]
[Rank 1] Train Epoch 3:  59%|█████▉    | 1177/2000 [00:08<00:05, 138.44it/s]
[Rank 0] Train Epoch 3:  59%|█████▉    | 1189/2000 [00:09<00:06, 129.98it/s]
[Rank 2] Train Epoch 3:  59%|█████▉    | 1185/2000 [00:09<00:06, 129.88it/s]
[Rank 1] Train Epoch 3:  60%|█████▉    | 1193/2000 [00:09<00:05, 142.81it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1159 | Mem: 26.53MB, Util: 66%  global_step : 7159
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1160 | Mem: 26.53MB, Util: 66%  global_step : 7160
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1161 | Mem: 26.53MB, Util: 66%  global_step : 7161
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1162 | Mem: 26.53MB, Util: 66%  global_step : 7162
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1163 | Mem: 26.53MB, Util: 66%  global_step : 7163
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1164 | Mem: 26.53MB, Util: 66%  global_step : 7164
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1165 | Mem: 26.53MB, Util: 66%  global_step : 7165
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1166 | Mem: 26.53MB, Util: 66%  glo

[Rank 2] Train Epoch 3:  60%|█████▉    | 1199/2000 [00:09<00:06, 130.51it/s]
[Rank 0] Train Epoch 3:  60%|██████    | 1203/2000 [00:09<00:06, 115.11it/s]
[Rank 2] Train Epoch 3:  61%|██████    | 1213/2000 [00:09<00:06, 130.62it/s]
[Rank 1] Train Epoch 3:  60%|██████    | 1208/2000 [00:09<00:07, 111.48it/s]
[Rank 0] Train Epoch 3:  61%|██████    | 1221/2000 [00:09<00:05, 130.50it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1186 | Mem: 26.53MB, Util: 58%  global_step : 7186
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1187 | Mem: 26.53MB, Util: 58%  global_step : 7187
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1188 | Mem: 26.53MB, Util: 58%  global_step : 7188
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1189 | Mem: 26.53MB, Util: 58%  global_step : 7189
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1190 | Mem: 26.53MB, Util: 58%  global_step : 7190
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1191 | Mem: 26.53MB, Util: 58%  global_step : 7191
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1192 | Mem: 26.53MB, Util: 58%  global_step : 7192
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1193 | Mem: 26.53MB, Util: 58%  glo

[Rank 2] Train Epoch 3:  61%|██████▏   | 1227/2000 [00:09<00:05, 129.92it/s]
[Rank 1] Train Epoch 3:  61%|██████    | 1224/2000 [00:09<00:06, 121.17it/s]
[Rank 0] Train Epoch 3:  62%|██████▏   | 1238/2000 [00:09<00:05, 138.97it/s]
[Rank 1] Train Epoch 3:  62%|██████▏   | 1239/2000 [00:09<00:05, 128.30it/s]
[Rank 0] Train Epoch 3:  63%|██████▎   | 1253/2000 [00:09<00:05, 140.34it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1214 | Mem: 26.53MB, Util: 62%  global_step : 7214
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1215 | Mem: 26.53MB, Util: 62%  global_step : 7215
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1216 | Mem: 26.53MB, Util: 62%  global_step : 7216
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1217 | Mem: 26.53MB, Util: 62%  global_step : 7217
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1218 | Mem: 26.53MB, Util: 62%  global_step : 7218
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1219 | Mem: 26.53MB, Util: 62%  global_step : 7219
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1220 | Mem: 26.53MB, Util: 62%  global_step : 7220
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1221 | Mem: 26.53MB, Util: 62%  glo

[Rank 2] Train Epoch 3:  62%|██████▏   | 1241/2000 [00:09<00:05, 127.22it/s]
[Rank 1] Train Epoch 3:  63%|██████▎   | 1255/2000 [00:09<00:05, 134.28it/s]
[Rank 0] Train Epoch 3:  63%|██████▎   | 1268/2000 [00:09<00:05, 140.11it/s]
[Rank 2] Train Epoch 3:  63%|██████▎   | 1254/2000 [00:09<00:05, 126.73it/s]
[Rank 1] Train Epoch 3:  64%|██████▎   | 1270/2000 [00:09<00:05, 137.42it/s]
[Rank 0] Train Epoch 3:  64%|██████▍   | 1283/2000 [00:09<00:05, 139.78it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1240 | Mem: 26.53MB, Util: 65%  global_step : 7240
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1241 | Mem: 26.53MB, Util: 65%  global_step : 7241
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1242 | Mem: 26.53MB, Util: 65%  global_step : 7242
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1243 | Mem: 26.53MB, Util: 65%  global_step : 7243
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1244 | Mem: 26.53MB, Util: 65%  global_step : 7244
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1245 | Mem: 26.53MB, Util: 65%  global_step : 7245
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1246 | Mem: 26.53MB, Util: 65%  global_step : 7246
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1247 | Mem: 26.53MB, Util: 65%  glo

[Rank 2] Train Epoch 3:  63%|██████▎   | 1268/2000 [00:09<00:05, 128.81it/s]
[Rank 1] Train Epoch 3:  64%|██████▍   | 1286/2000 [00:09<00:05, 141.42it/s]
[Rank 0] Train Epoch 3:  65%|██████▍   | 1298/2000 [00:09<00:05, 140.18it/s]
[Rank 2] Train Epoch 3:  64%|██████▍   | 1281/2000 [00:09<00:05, 127.59it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1267 | Mem: 26.53MB, Util: 60%  global_step : 7267
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1268 | Mem: 26.53MB, Util: 60%  global_step : 7268
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1269 | Mem: 26.53MB, Util: 60%  global_step : 7269
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1270 | Mem: 26.53MB, Util: 60%  global_step : 7270
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1271 | Mem: 26.53MB, Util: 60%  global_step : 7271
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1272 | Mem: 26.53MB, Util: 60%  global_step : 7272
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1273 | Mem: 26.53MB, Util: 60%  global_step : 7273
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1274 | Mem: 26.53MB, Util: 60%  glo

[Rank 2] Train Epoch 3:  65%|██████▍   | 1294/2000 [00:09<00:05, 126.20it/s]
[Rank 1] Train Epoch 3:  65%|██████▌   | 1301/2000 [00:10<00:06, 107.94it/s]
[Rank 0] Train Epoch 3:  66%|██████▌   | 1313/2000 [00:10<00:06, 104.84it/s]
[Rank 2] Train Epoch 3:  65%|██████▌   | 1308/2000 [00:10<00:05, 128.69it/s]
[Rank 1] Train Epoch 3:  66%|██████▌   | 1317/2000 [00:10<00:05, 118.20it/s]
[Rank 0] Train Epoch 3:  66%|██████▋   | 1328/2000 [00:10<00:05, 113.70it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1293 | Mem: 26.53MB, Util: 62%  global_step : 7293
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1294 | Mem: 26.53MB, Util: 62%  global_step : 7294
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1295 | Mem: 26.53MB, Util: 62%  global_step : 7295
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1296 | Mem: 26.53MB, Util: 62%  global_step : 7296
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1297 | Mem: 26.53MB, Util: 62%  global_step : 7297
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1298 | Mem: 26.53MB, Util: 62%  global_step : 7298
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1299 | Mem: 26.53MB, Util: 62%  global_step : 7299
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1300 | Mem: 26.53MB, Util: 62%  glo

[Rank 2] Train Epoch 3:  66%|██████▌   | 1321/2000 [00:10<00:05, 128.74it/s]
[Rank 1] Train Epoch 3:  67%|██████▋   | 1332/2000 [00:10<00:05, 125.31it/s]
[Rank 0] Train Epoch 3:  67%|██████▋   | 1343/2000 [00:10<00:05, 121.06it/s]
[Rank 2] Train Epoch 3:  67%|██████▋   | 1335/2000 [00:10<00:05, 129.34it/s]
[Rank 1] Train Epoch 3:  67%|██████▋   | 1347/2000 [00:10<00:05, 129.65it/s]
[Rank 0] Train Epoch 3:  68%|██████▊   | 1358/2000 [00:10<00:05, 127.25it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1320 | Mem: 26.53MB, Util: 63%  global_step : 7320
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1321 | Mem: 26.53MB, Util: 63%  global_step : 7321
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1322 | Mem: 26.53MB, Util: 63%  global_step : 7322
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1323 | Mem: 26.53MB, Util: 63%  global_step : 7323
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1324 | Mem: 26.53MB, Util: 63%  global_step : 7324
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1325 | Mem: 26.53MB, Util: 63%  global_step : 7325
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1326 | Mem: 26.53MB, Util: 63%  global_step : 7326
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1327 | Mem: 26.53MB, Util: 63%  glo

[Rank 2] Train Epoch 3:  67%|██████▋   | 1348/2000 [00:10<00:05, 126.77it/s]
[Rank 1] Train Epoch 3:  68%|██████▊   | 1362/2000 [00:10<00:04, 132.70it/s]
[Rank 0] Train Epoch 3:  69%|██████▊   | 1373/2000 [00:10<00:04, 130.82it/s]
[Rank 2] Train Epoch 3:  68%|██████▊   | 1361/2000 [00:10<00:05, 126.68it/s]
[Rank 1] Train Epoch 3:  69%|██████▉   | 1376/2000 [00:10<00:04, 133.80it/s]
[Rank 0] Train Epoch 3:  69%|██████▉   | 1388/2000 [00:10<00:04, 134.14it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1347 | Mem: 26.53MB, Util: 63%  global_step : 7347
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1348 | Mem: 26.53MB, Util: 63%  global_step : 7348
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1349 | Mem: 26.53MB, Util: 63%  global_step : 7349
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1350 | Mem: 26.53MB, Util: 63%  global_step : 7350
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1351 | Mem: 26.53MB, Util: 63%  global_step : 7351
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1352 | Mem: 26.53MB, Util: 63%  global_step : 7352
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1353 | Mem: 26.53MB, Util: 63%  global_step : 7353
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1354 | Mem: 26.53MB, Util: 63%  glo

[Rank 2] Train Epoch 3:  69%|██████▉   | 1375/2000 [00:10<00:04, 128.80it/s]
[Rank 1] Train Epoch 3:  70%|██████▉   | 1393/2000 [00:10<00:04, 141.95it/s]
[Rank 2] Train Epoch 3:  69%|██████▉   | 1388/2000 [00:10<00:04, 128.15it/s]
[Rank 0] Train Epoch 3:  70%|███████   | 1402/2000 [00:10<00:05, 114.20it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1374 | Mem: 26.53MB, Util: 63%  global_step : 7374
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1375 | Mem: 26.53MB, Util: 63%  global_step : 7375
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1376 | Mem: 26.53MB, Util: 63%  global_step : 7376
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1377 | Mem: 26.53MB, Util: 63%  global_step : 7377
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1378 | Mem: 26.53MB, Util: 63%  global_step : 7378
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1379 | Mem: 26.53MB, Util: 63%  global_step : 7379
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1380 | Mem: 26.53MB, Util: 63%  global_step : 7380
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1381 | Mem: 26.53MB, Util: 63%  glo

[Rank 2] Train Epoch 3:  70%|███████   | 1402/2000 [00:10<00:04, 130.10it/s]
[Rank 1] Train Epoch 3:  70%|███████   | 1408/2000 [00:10<00:05, 113.84it/s]
[Rank 0] Train Epoch 3:  71%|███████   | 1417/2000 [00:10<00:04, 121.42it/s]
[Rank 2] Train Epoch 3:  71%|███████   | 1416/2000 [00:10<00:04, 130.91it/s]
[Rank 1] Train Epoch 3:  71%|███████   | 1423/2000 [00:10<00:04, 120.62it/s]
[Rank 0] Train Epoch 3:  72%|███████▏  | 1432/2000 [00:11<00:04, 127.36it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1401 | Mem: 26.53MB, Util: 64%  global_step : 7401
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1402 | Mem: 26.53MB, Util: 64%  global_step : 7402
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1403 | Mem: 26.53MB, Util: 64%  global_step : 7403
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1404 | Mem: 26.53MB, Util: 64%  global_step : 7404
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1405 | Mem: 26.53MB, Util: 64%  global_step : 7405
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1406 | Mem: 26.53MB, Util: 64%  global_step : 7406
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1407 | Mem: 26.53MB, Util: 64%  global_step : 7407
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1408 | Mem: 26.53MB, Util: 64%  glo

[Rank 2] Train Epoch 3:  72%|███████▏  | 1430/2000 [00:11<00:04, 129.27it/s]
[Rank 1] Train Epoch 3:  72%|███████▏  | 1438/2000 [00:11<00:04, 125.90it/s]
[Rank 0] Train Epoch 3:  72%|███████▏  | 1447/2000 [00:11<00:04, 131.36it/s]
[Rank 2] Train Epoch 3:  72%|███████▏  | 1444/2000 [00:11<00:04, 129.65it/s]
[Rank 1] Train Epoch 3:  73%|███████▎  | 1452/2000 [00:11<00:04, 129.45it/s]
[Rank 0] Train Epoch 3:  73%|███████▎  | 1462/2000 [00:11<00:03, 134.78it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1428 | Mem: 26.53MB, Util: 64%  global_step : 7428
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1429 | Mem: 26.53MB, Util: 64%  global_step : 7429
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1430 | Mem: 26.53MB, Util: 64%  global_step : 7430
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1431 | Mem: 26.53MB, Util: 64%  global_step : 7431
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1432 | Mem: 26.53MB, Util: 64%  global_step : 7432
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1433 | Mem: 26.53MB, Util: 64%  global_step : 7433
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1434 | Mem: 26.53MB, Util: 64%  global_step : 7434
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1435 | Mem: 26.53MB, Util: 64%  glo

[Rank 2] Train Epoch 3:  73%|███████▎  | 1457/2000 [00:11<00:04, 127.94it/s]
[Rank 1] Train Epoch 3:  73%|███████▎  | 1467/2000 [00:11<00:04, 132.89it/s]
[Rank 0] Train Epoch 3:  74%|███████▍  | 1477/2000 [00:11<00:03, 136.46it/s]
[Rank 2] Train Epoch 3:  74%|███████▎  | 1470/2000 [00:11<00:04, 125.77it/s]
[Rank 1] Train Epoch 3:  74%|███████▍  | 1481/2000 [00:11<00:03, 134.42it/s]
[Rank 0] Train Epoch 3:  75%|███████▍  | 1492/2000 [00:11<00:03, 137.89it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1454 | Mem: 26.53MB, Util: 61%  global_step : 7454
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1455 | Mem: 26.53MB, Util: 61%  global_step : 7455
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1456 | Mem: 26.53MB, Util: 61%  global_step : 7456
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1457 | Mem: 26.53MB, Util: 61%  global_step : 7457
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1458 | Mem: 26.53MB, Util: 61%  global_step : 7458
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1459 | Mem: 26.53MB, Util: 61%  global_step : 7459
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1460 | Mem: 26.53MB, Util: 61%  global_step : 7460
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1461 | Mem: 26.53MB, Util: 61%  glo

[Rank 2] Train Epoch 3:  74%|███████▍  | 1483/2000 [00:11<00:04, 125.23it/s]
[Rank 1] Train Epoch 3:  75%|███████▍  | 1496/2000 [00:11<00:03, 136.71it/s]
[Rank 0] Train Epoch 3:  75%|███████▌  | 1506/2000 [00:11<00:04, 114.64it/s]
[Rank 2] Train Epoch 3:  75%|███████▍  | 1497/2000 [00:11<00:03, 128.82it/s]
[Rank 1] Train Epoch 3:  76%|███████▌  | 1510/2000 [00:11<00:04, 116.32it/s]
[Rank 0] Train Epoch 3:  76%|███████▌  | 1521/2000 [00:11<00:03, 121.84it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1480 | Mem: 26.53MB, Util: 60%  global_step : 7480
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1481 | Mem: 26.53MB, Util: 60%  global_step : 7481
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1482 | Mem: 26.53MB, Util: 60%  global_step : 7482
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1483 | Mem: 26.53MB, Util: 60%  global_step : 7483
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1484 | Mem: 26.53MB, Util: 60%  global_step : 7484
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1485 | Mem: 26.53MB, Util: 60%  global_step : 7485
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1486 | Mem: 26.53MB, Util: 60%  global_step : 7486
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1487 | Mem: 26.53MB, Util: 60%  glo

[Rank 2] Train Epoch 3:  76%|███████▌  | 1512/2000 [00:11<00:03, 134.95it/s]
[Rank 1] Train Epoch 3:  76%|███████▋  | 1525/2000 [00:11<00:03, 123.27it/s]
[Rank 0] Train Epoch 3:  77%|███████▋  | 1536/2000 [00:11<00:03, 127.35it/s]
[Rank 2] Train Epoch 3:  76%|███████▋  | 1528/2000 [00:11<00:03, 141.68it/s]
[Rank 1] Train Epoch 3:  77%|███████▋  | 1540/2000 [00:11<00:03, 128.51it/s]
[Rank 0] Train Epoch 3:  78%|███████▊  | 1551/2000 [00:11<00:03, 131.61it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1509 | Mem: 26.53MB, Util: 61%  global_step : 7509
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1510 | Mem: 26.53MB, Util: 61%  global_step : 7510
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1511 | Mem: 26.53MB, Util: 61%  global_step : 7511
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1512 | Mem: 26.53MB, Util: 61%  global_step : 7512
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1513 | Mem: 26.53MB, Util: 61%  global_step : 7513
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1514 | Mem: 26.53MB, Util: 61%  global_step : 7514
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1515 | Mem: 26.53MB, Util: 61%  global_step : 7515
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1516 | Mem: 26.53MB, Util: 61%  glo

[Rank 2] Train Epoch 3:  77%|███████▋  | 1544/2000 [00:11<00:03, 145.01it/s]
[Rank 0] Train Epoch 3:  78%|███████▊  | 1566/2000 [00:12<00:03, 134.50it/s]
[Rank 2] Train Epoch 3:  78%|███████▊  | 1560/2000 [00:11<00:02, 147.45it/s]
[Rank 1] Train Epoch 3:  78%|███████▊  | 1555/2000 [00:11<00:03, 132.34it/s]
[Rank 0] Train Epoch 3:  79%|███████▉  | 1581/2000 [00:12<00:03, 136.72it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1542 | Mem: 26.53MB, Util: 97%  global_step : 7542
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1543 | Mem: 26.53MB, Util: 97%  global_step : 7543
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1544 | Mem: 26.53MB, Util: 97%  global_step : 7544
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1545 | Mem: 26.53MB, Util: 97%  global_step : 7545
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1546 | Mem: 26.53MB, Util: 97%  global_step : 7546
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1547 | Mem: 26.53MB, Util: 97%  global_step : 7547
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1548 | Mem: 26.53MB, Util: 97%  global_step : 7548
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1549 | Mem: 26.53MB, Util: 97%  glo

[Rank 2] Train Epoch 3:  79%|███████▉  | 1577/2000 [00:12<00:02, 151.41it/s]
[Rank 1] Train Epoch 3:  78%|███████▊  | 1570/2000 [00:12<00:03, 135.04it/s]
[Rank 0] Train Epoch 3:  80%|███████▉  | 1596/2000 [00:12<00:02, 138.43it/s]
[Rank 2] Train Epoch 3:  80%|███████▉  | 1593/2000 [00:12<00:02, 147.03it/s]
[Rank 1] Train Epoch 3:  79%|███████▉  | 1585/2000 [00:12<00:03, 136.95it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1574 | Mem: 26.53MB, Util: 100%  global_step : 7574
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1575 | Mem: 26.53MB, Util: 100%  global_step : 7575
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1576 | Mem: 26.53MB, Util: 100%  global_step : 7576
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1577 | Mem: 26.53MB, Util: 100%  global_step : 7577
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1578 | Mem: 26.53MB, Util: 100%  global_step : 7578
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1579 | Mem: 26.53MB, Util: 100%  global_step : 7579
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1580 | Mem: 26.53MB, Util: 100%  global_step : 7580
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 3, Batch 1581 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 3:  81%|████████  | 1611/2000 [00:12<00:02, 139.57it/s]
[Rank 2] Train Epoch 3:  80%|████████  | 1608/2000 [00:12<00:02, 131.24it/s]
[Rank 1] Train Epoch 3:  80%|████████  | 1600/2000 [00:12<00:02, 139.01it/s]
[Rank 0] Train Epoch 3:  81%|████████▏ | 1626/2000 [00:12<00:02, 139.87it/s]
[Rank 1] Train Epoch 3:  81%|████████  | 1615/2000 [00:12<00:02, 139.66it/s]
[Rank 2] Train Epoch 3:  81%|████████  | 1623/2000 [00:12<00:02, 135.93it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1597 | Mem: 26.53MB, Util: 69%  global_step : 7597
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1598 | Mem: 26.53MB, Util: 69%  global_step : 7598
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1599 | Mem: 26.53MB, Util: 69%  global_step : 7599
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1600 | Mem: 26.53MB, Util: 69%  global_step : 7600
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1601 | Mem: 26.53MB, Util: 68%  global_step : 7601
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1602 | Mem: 26.53MB, Util: 68%  global_step : 7602
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1603 | Mem: 26.53MB, Util: 68%  global_step : 7603
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1604 | Mem: 26.53MB, Util: 68%  global_step

[Rank 0] Train Epoch 3:  82%|████████▏ | 1641/2000 [00:12<00:02, 141.33it/s]
[Rank 1] Train Epoch 3:  82%|████████▏ | 1630/2000 [00:12<00:02, 140.30it/s]
[Rank 2] Train Epoch 3:  82%|████████▏ | 1638/2000 [00:12<00:02, 138.83it/s]
[Rank 1] Train Epoch 3:  82%|████████▏ | 1645/2000 [00:12<00:02, 141.30it/s]
[Rank 0] Train Epoch 3:  83%|████████▎ | 1656/2000 [00:12<00:02, 142.21it/s]
[Rank 2] Train Epoch 3:  83%|████████▎ | 1654/2000 [00:12<00:02, 144.61it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1627 | Mem: 26.53MB, Util: 68%  global_step : 7627
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1628 | Mem: 26.53MB, Util: 68%  global_step : 7628
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1629 | Mem: 26.53MB, Util: 68%  global_step : 7629
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1630 | Mem: 26.53MB, Util: 68%  global_step : 7630
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1631 | Mem: 26.53MB, Util: 68%  global_step : 7631
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1632 | Mem: 26.53MB, Util: 68%  global_step : 7632
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1633 | Mem: 26.53MB, Util: 68%  global_step : 7633
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 3, Batch 1634 | Mem: 26.53MB, Util: 68%  global_step

[Rank 1] Train Epoch 3:  83%|████████▎ | 1660/2000 [00:12<00:02, 141.91it/s]
[Rank 0] Train Epoch 3:  84%|████████▎ | 1671/2000 [00:12<00:02, 142.39it/s]
[Rank 2] Train Epoch 3:  84%|████████▎ | 1670/2000 [00:12<00:02, 146.54it/s]
[Rank 1] Train Epoch 3:  84%|████████▍ | 1675/2000 [00:12<00:02, 142.56it/s]
[Rank 0] Train Epoch 3:  84%|████████▍ | 1686/2000 [00:12<00:02, 142.58it/s]
[Rank 2] Train Epoch 3:  84%|████████▍ | 1686/2000 [00:12<00:02, 150.36it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1657 | Mem: 26.53MB, Util: 87%  global_step : 7657
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1658 | Mem: 26.53MB, Util: 87%  global_step : 7658
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1659 | Mem: 26.53MB, Util: 87%  global_step : 7659
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1660 | Mem: 26.53MB, Util: 87%  global_step : 7660
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1661 | Mem: 26.53MB, Util: 87%  global_step : 7661
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1662 | Mem: 26.53MB, Util: 87%  global_step : 7662
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1663 | Mem: 26.53MB, Util: 87%  global_step : 7663
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1664 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  84%|████████▍ | 1690/2000 [00:12<00:02, 142.47it/s]
[Rank 0] Train Epoch 3:  85%|████████▌ | 1701/2000 [00:12<00:02, 142.87it/s]
[Rank 2] Train Epoch 3:  85%|████████▌ | 1702/2000 [00:13<00:02, 134.23it/s]
[Rank 1] Train Epoch 3:  85%|████████▌ | 1705/2000 [00:12<00:02, 142.59it/s]
[Rank 0] Train Epoch 3:  86%|████████▌ | 1716/2000 [00:13<00:01, 143.36it/s]
[Rank 2] Train Epoch 3:  86%|████████▌ | 1718/2000 [00:13<00:02, 140.09it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1687 | Mem: 26.53MB, Util: 79%  global_step : 7687
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1688 | Mem: 26.53MB, Util: 79%  global_step : 7688
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1689 | Mem: 26.53MB, Util: 79%  global_step : 7689
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1690 | Mem: 26.53MB, Util: 79%  global_step : 7690
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1691 | Mem: 26.53MB, Util: 79%  global_step : 7691
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1692 | Mem: 26.53MB, Util: 79%  global_step : 7692
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1693 | Mem: 26.53MB, Util: 79%  global_step : 7693
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1694 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  86%|████████▌ | 1720/2000 [00:13<00:01, 142.22it/s]
[Rank 2] Train Epoch 3:  87%|████████▋ | 1734/2000 [00:13<00:01, 144.52it/s]
[Rank 0] Train Epoch 3:  87%|████████▋ | 1731/2000 [00:13<00:01, 143.97it/s]
[Rank 1] Train Epoch 3:  87%|████████▋ | 1735/2000 [00:13<00:01, 142.28it/s]
[Rank 2] Train Epoch 3:  88%|████████▊ | 1750/2000 [00:13<00:01, 148.41it/s]
[Rank 0] Train Epoch 3:  87%|████████▋ | 1746/2000 [00:13<00:01, 144.51it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1716 | Mem: 26.53MB, Util: 81%  global_step : 7716
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1717 | Mem: 26.53MB, Util: 81%  global_step : 7717
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1718 | Mem: 26.53MB, Util: 81%  global_step : 7718
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1719 | Mem: 26.53MB, Util: 81%  global_step : 7719
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1720 | Mem: 26.53MB, Util: 81%  global_step : 7720
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1721 | Mem: 26.53MB, Util: 81%  global_step : 7721
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1722 | Mem: 26.53MB, Util: 81%  global_step : 7722
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1723 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  88%|████████▊ | 1750/2000 [00:13<00:01, 142.84it/s]
[Rank 2] Train Epoch 3:  88%|████████▊ | 1767/2000 [00:13<00:01, 152.25it/s]
[Rank 0] Train Epoch 3:  88%|████████▊ | 1761/2000 [00:13<00:01, 142.23it/s]
[Rank 1] Train Epoch 3:  88%|████████▊ | 1765/2000 [00:13<00:01, 143.29it/s]
[Rank 0] Train Epoch 3:  89%|████████▉ | 1776/2000 [00:13<00:01, 143.08it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1746 | Mem: 26.53MB, Util: 75%  global_step : 7746
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1747 | Mem: 26.53MB, Util: 75%  global_step : 7747
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1748 | Mem: 26.53MB, Util: 75%  global_step : 7748
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1749 | Mem: 26.53MB, Util: 75%  global_step : 7749
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1750 | Mem: 26.53MB, Util: 68%  global_step : 7750
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1751 | Mem: 26.53MB, Util: 68%  global_step : 7751
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1752 | Mem: 26.53MB, Util: 68%  global_step : 7752
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1753 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  89%|████████▉ | 1780/2000 [00:13<00:01, 143.79it/s]
[Rank 2] Train Epoch 3:  89%|████████▉ | 1783/2000 [00:13<00:01, 149.54it/s]
[Rank 0] Train Epoch 3:  90%|████████▉ | 1791/2000 [00:13<00:01, 140.84it/s]
[Rank 1] Train Epoch 3:  90%|████████▉ | 1795/2000 [00:13<00:01, 142.67it/s]
[Rank 2] Train Epoch 3:  90%|████████▉ | 1799/2000 [00:13<00:01, 148.95it/s]
[Rank 0] Train Epoch 3:  90%|█████████ | 1806/2000 [00:13<00:01, 141.92it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1776 | Mem: 26.53MB, Util: 68%  global_step : 7776
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1777 | Mem: 26.53MB, Util: 68%  global_step : 7777
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1778 | Mem: 26.53MB, Util: 68%  global_step : 7778
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1779 | Mem: 26.53MB, Util: 68%  global_step : 7779
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1780 | Mem: 26.53MB, Util: 77%  global_step : 7780
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1781 | Mem: 26.53MB, Util: 77%  global_step : 7781
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1782 | Mem: 26.53MB, Util: 77%  global_step : 7782
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1783 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  90%|█████████ | 1810/2000 [00:13<00:01, 142.26it/s]
[Rank 2] Train Epoch 3:  91%|█████████ | 1814/2000 [00:13<00:01, 130.00it/s]
[Rank 0] Train Epoch 3:  91%|█████████ | 1821/2000 [00:13<00:01, 142.63it/s]
[Rank 1] Train Epoch 3:  91%|█████████▏| 1825/2000 [00:13<00:01, 142.43it/s]
[Rank 2] Train Epoch 3:  91%|█████████▏| 1828/2000 [00:13<00:01, 130.58it/s]
[Rank 0] Train Epoch 3:  92%|█████████▏| 1836/2000 [00:13<00:01, 143.27it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1805 | Mem: 26.53MB, Util: 77%  global_step : 7805
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1806 | Mem: 26.53MB, Util: 77%  global_step : 7806
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1807 | Mem: 26.53MB, Util: 77%  global_step : 7807
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1808 | Mem: 26.53MB, Util: 77%  global_step : 7808
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1809 | Mem: 26.53MB, Util: 91%  global_step : 7809
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1810 | Mem: 26.53MB, Util: 91%  global_step : 7810
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1811 | Mem: 26.53MB, Util: 91%  global_step : 7811
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1812 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  92%|█████████▏| 1840/2000 [00:13<00:01, 143.29it/s]
[Rank 2] Train Epoch 3:  92%|█████████▏| 1842/2000 [00:13<00:01, 132.18it/s]
[Rank 0] Train Epoch 3:  93%|█████████▎| 1851/2000 [00:14<00:01, 143.93it/s]
[Rank 1] Train Epoch 3:  93%|█████████▎| 1855/2000 [00:14<00:01, 142.67it/s]
[Rank 2] Train Epoch 3:  93%|█████████▎| 1856/2000 [00:14<00:01, 133.70it/s]
[Rank 0] Train Epoch 3:  93%|█████████▎| 1866/2000 [00:14<00:00, 143.92it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1834 | Mem: 26.53MB, Util: 91%  global_step : 7834
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1835 | Mem: 26.53MB, Util: 91%  global_step : 7835
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1836 | Mem: 26.53MB, Util: 91%  global_step : 7836
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1837 | Mem: 26.53MB, Util: 91%  global_step : 7837
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1838 | Mem: 26.53MB, Util: 91%  global_step : 7838
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1839 | Mem: 26.53MB, Util: 89%  global_step : 7839
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1840 | Mem: 26.53MB, Util: 89%  global_step : 7840
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1841 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 3:  94%|█████████▎| 1870/2000 [00:14<00:00, 141.94it/s]
[Rank 2] Train Epoch 3:  94%|█████████▎| 1870/2000 [00:14<00:00, 133.05it/s]
[Rank 0] Train Epoch 3:  94%|█████████▍| 1881/2000 [00:14<00:00, 143.61it/s]
[Rank 1] Train Epoch 3:  94%|█████████▍| 1885/2000 [00:14<00:00, 141.30it/s]
[Rank 2] Train Epoch 3:  94%|█████████▍| 1884/2000 [00:14<00:00, 134.19it/s]
[Rank 0] Train Epoch 3:  95%|█████████▍| 1896/2000 [00:14<00:00, 143.94it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1864 | Mem: 26.53MB, Util: 89%  global_step : 7864
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1865 | Mem: 26.53MB, Util: 89%  global_step : 7865
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1866 | Mem: 26.53MB, Util: 89%  global_step : 7866
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1867 | Mem: 26.53MB, Util: 89%  global_step : 7867
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1868 | Mem: 26.53MB, Util: 100%  global_step : 7868
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1869 | Mem: 26.53MB, Util: 100%  global_step : 7869
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1870 | Mem: 26.53MB, Util: 100%  global_step : 7870
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1871 | Mem: 26.53MB, Uti

[Rank 1] Train Epoch 3:  95%|█████████▌| 1900/2000 [00:14<00:00, 140.44it/s]
[Rank 2] Train Epoch 3:  95%|█████████▍| 1898/2000 [00:14<00:00, 134.60it/s]
[Rank 1] Train Epoch 3:  96%|█████████▌| 1915/2000 [00:14<00:00, 130.68it/s]
[Rank 2] Train Epoch 3:  96%|█████████▌| 1913/2000 [00:14<00:00, 136.84it/s]
[Rank 0] Train Epoch 3:  96%|█████████▌| 1911/2000 [00:14<00:00, 125.40it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1892 | Mem: 26.53MB, Util: 100%  global_step : 7892
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1893 | Mem: 26.53MB, Util: 100%  global_step : 7893
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1894 | Mem: 26.53MB, Util: 100%  global_step : 7894
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1895 | Mem: 26.53MB, Util: 100%  global_step : 7895
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1896 | Mem: 26.53MB, Util: 100%  global_step : 7896
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1897 | Mem: 26.53MB, Util: 100%  global_step : 7897
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1898 | Mem: 26.53MB, Util: 100%  global_step : 7898
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1899 | Mem: 26.53MB,

[Rank 1] Train Epoch 3:  96%|█████████▋| 1930/2000 [00:14<00:00, 133.31it/s]
[Rank 2] Train Epoch 3:  96%|█████████▋| 1928/2000 [00:14<00:00, 140.63it/s]
[Rank 0] Train Epoch 3:  96%|█████████▋| 1925/2000 [00:14<00:00, 128.42it/s]
[Rank 1] Train Epoch 3:  97%|█████████▋| 1945/2000 [00:14<00:00, 135.90it/s]
[Rank 2] Train Epoch 3:  97%|█████████▋| 1943/2000 [00:14<00:00, 141.93it/s]
[Rank 0] Train Epoch 3:  97%|█████████▋| 1939/2000 [00:14<00:00, 130.27it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1917 | Mem: 26.53MB, Util: 100%  global_step : 7917
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1918 | Mem: 26.53MB, Util: 100%  global_step : 7918
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1919 | Mem: 26.53MB, Util: 98%  global_step : 7919
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1920 | Mem: 26.53MB, Util: 98%  global_step : 7920
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1921 | Mem: 26.53MB, Util: 98%  global_step : 7921
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1922 | Mem: 26.53MB, Util: 98%  global_step : 7922
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1923 | Mem: 26.53MB, Util: 98%  global_step : 7923
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1924 | Mem: 26.53MB, Util

[Rank 1] Train Epoch 3:  98%|█████████▊| 1960/2000 [00:14<00:00, 137.56it/s]
[Rank 2] Train Epoch 3:  98%|█████████▊| 1958/2000 [00:14<00:00, 143.20it/s]
[Rank 0] Train Epoch 3:  98%|█████████▊| 1953/2000 [00:14<00:00, 131.42it/s]
[Rank 1] Train Epoch 3:  99%|█████████▉| 1976/2000 [00:14<00:00, 141.63it/s]
[Rank 2] Train Epoch 3:  99%|█████████▊| 1973/2000 [00:14<00:00, 144.56it/s]
[Rank 0] Train Epoch 3:  98%|█████████▊| 1967/2000 [00:14<00:00, 132.06it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1946 | Mem: 26.53MB, Util: 98%  global_step : 7946
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1947 | Mem: 26.53MB, Util: 98%  global_step : 7947
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1948 | Mem: 26.53MB, Util: 100%  global_step : 7948
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1949 | Mem: 26.53MB, Util: 100%  global_step : 7949
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1950 | Mem: 26.53MB, Util: 100%  global_step : 7950
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1951 | Mem: 26.53MB, Util: 100%  global_step : 7951
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1952 | Mem: 26.53MB, Util: 100%  global_step : 7952
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1953 | Mem: 26.53MB, U

[Rank 1] Train Epoch 3: 100%|█████████▉| 1991/2000 [00:15<00:00, 143.23it/s]
[Rank 2] Train Epoch 3:  99%|█████████▉| 1988/2000 [00:15<00:00, 145.95it/s]
[Rank 0] Train Epoch 3:  99%|█████████▉| 1981/2000 [00:15<00:00, 132.89it/s]
[Rank 1] Train Epoch 3: 100%|██████████| 2000/2000 [00:15<00:00, 132.59it/s]
[Rank 1] Test Epoch 3:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Train Epoch 3: 100%|██████████| 2000/2000 [00:15<00:00, 132.43it/s]
[Rank 2] Test Epoch 3:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Train Epoch 3: 100%|██████████| 2000/2000 [00:15<00:00, 132.04it/s]
[Rank 0] Test Epoch 3:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1977 | Mem: 26.53MB, Util: 100%  global_step : 7977
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1978 | Mem: 26.53MB, Util: 100%  global_step : 7978
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1979 | Mem: 26.53MB, Util: 100%  global_step : 7979
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1980 | Mem: 26.53MB, Util: 100%  global_step : 7980
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1981 | Mem: 26.53MB, Util: 100%  global_step : 7981
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1982 | Mem: 26.53MB, Util: 100%  global_step : 7982
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1983 | Mem: 26.53MB, Util: 100%  global_step : 7983
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 3, Batch 1984 | Mem: 26.53MB,

[Rank 1] Test Epoch 3:   5%|▍         | 16/334 [00:00<00:02, 157.92it/s]
[Rank 2] Test Epoch 3:   3%|▎         | 11/334 [00:00<00:02, 109.25it/s]
[Rank 0] Test Epoch 3:  10%|▉         | 33/334 [00:00<00:00, 325.88it/s]
[Rank 1] Test Epoch 3:  15%|█▌        | 51/334 [00:00<00:01, 268.55it/s]
[Rank 2] Test Epoch 3:  13%|█▎        | 45/334 [00:00<00:01, 243.04it/s]
[Rank 0] Test Epoch 3:  20%|██        | 68/334 [00:00<00:00, 336.36it/s]
[Rank 1] Test Epoch 3:  25%|██▌       | 85/334 [00:00<00:00, 298.35it/s]
[Rank 2] Test Epoch 3:  24%|██▎       | 79/334 [00:00<00:00, 286.42it/s]
[Rank 0] Test Epoch 3:  31%|███       | 103/334 [00:00<00:00, 339.17it/s]
[Rank 1] Test Epoch 3:  36%|███▌      | 119/334 [00:00<00:00, 311.97it/s]
[Rank 2] Test Epoch 3:  45%|████▍     | 149/334 [00:00<00:00, 323.55it/s]
[Rank 0] Test Epoch 3:  41%|████▏     | 138/334 [00:00<00:00, 340.54it/s]
[Rank 1] Test Epoch 3:  46%|████▌     | 154/334 [00:00<00:00, 323.11it/s]
[Rank 2] Test Epoch 3:  55%|█████▌    | 184/33

[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Rank 0] Epoch 3 | Loss: 0.3867, Acc: 0.8623, Model Checksum: 9f76870aca8d829a6383a0416958bc38
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [ NodeId f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791 Rank 0] Epoch 3 | Loss: 0.3867, Acc: 0.8623, Model Checksum: 9f76870aca8d829a6383a0416958bc38
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [Rank 1] Epoch 3 | Loss: 0.3958, Acc: 0.8563, Model Checksum: 9f76870aca8d829a6383a0416958bc38
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [ NodeId d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83243d Rank 1] Epoch 3 | Loss: 0.3958, Acc: 0.8563, Model Checksum: 9f76870aca8d829a6383a0416958bc38
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 8000
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1 | Mem: 26.53MB, Util: 3%  global_step : 8001
[36m(RayTrainWorker pid=398, ip=10.25

[Rank 1] Train Epoch 4:   1%|          | 14/2000 [00:00<00:14, 136.30it/s]
[Rank 2] Train Epoch 4:   1%|          | 12/2000 [00:00<00:16, 118.64it/s]
[Rank 0] Train Epoch 4:   1%|▏         | 26/2000 [00:00<00:15, 126.53it/s]
[Rank 1] Train Epoch 4:   1%|▏         | 29/2000 [00:00<00:14, 139.84it/s]
[Rank 2] Train Epoch 4:   1%|▏         | 26/2000 [00:00<00:15, 126.11it/s]
[Rank 0] Train Epoch 4:   2%|▏         | 44/2000 [00:00<00:13, 149.22it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 13 | Mem: 26.53MB, Util: 3%  global_step : 8013
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 14 | Mem: 26.53MB, Util: 3%  global_step : 8014
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 15 | Mem: 26.53MB, Util: 3%  global_step : 8015
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 16 | Mem: 26.53MB, Util: 3%  global_step : 8016
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 17 | Mem: 26.53MB, Util: 3%  global_step : 8017
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 18 | Mem: 26.53MB, Util: 3%  global_step : 8018
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 19 | Mem: 26.53MB, Util: 3%  global_step : 8019
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 20 | Mem: 26.53MB, Util: 3%  global_step : 8020
[36m(RayTrainWo

[Rank 1] Train Epoch 4:   2%|▏         | 44/2000 [00:00<00:13, 140.28it/s]
[Rank 2] Train Epoch 4:   2%|▏         | 40/2000 [00:00<00:15, 129.82it/s]
[Rank 0] Train Epoch 4:   3%|▎         | 62/2000 [00:00<00:12, 159.86it/s]
[Rank 1] Train Epoch 4:   3%|▎         | 59/2000 [00:00<00:13, 140.19it/s]
[Rank 2] Train Epoch 4:   3%|▎         | 54/2000 [00:00<00:14, 132.35it/s]
[Rank 0] Train Epoch 4:   4%|▍         | 80/2000 [00:00<00:11, 165.88it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 45 | Mem: 26.53MB, Util: 76%  global_step : 8045
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 46 | Mem: 26.53MB, Util: 76%  global_step : 8046
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 47 | Mem: 26.53MB, Util: 76%  global_step : 8047
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 48 | Mem: 26.53MB, Util: 76%  global_step : 8048
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 49 | Mem: 26.53MB, Util: 76%  global_step : 8049
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 50 | Mem: 26.53MB, Util: 76%  global_step : 8050
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 51 | Mem: 26.53MB, Util: 76%  global_step : 8051
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 52 | Mem: 26.53MB, Util: 76%  global_step : 8052
[36m(Ra

[Rank 1] Train Epoch 4:   4%|▍         | 75/2000 [00:00<00:13, 143.72it/s]
[Rank 2] Train Epoch 4:   3%|▎         | 68/2000 [00:00<00:14, 134.02it/s]
[Rank 0] Train Epoch 4:   5%|▍         | 98/2000 [00:00<00:11, 169.98it/s]
[Rank 1] Train Epoch 4:   4%|▍         | 90/2000 [00:00<00:13, 145.48it/s]
[Rank 2] Train Epoch 4:   4%|▍         | 82/2000 [00:00<00:14, 131.43it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 83 | Mem: 26.53MB, Util: 100%  global_step : 8083
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 84 | Mem: 26.53MB, Util: 100%  global_step : 8084
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 85 | Mem: 26.53MB, Util: 100%  global_step : 8085
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 86 | Mem: 26.53MB, Util: 100%  global_step : 8086
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 87 | Mem: 26.53MB, Util: 100%  global_step : 8087
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 88 | Mem: 26.53MB, Util: 100%  global_step : 8088
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 89 | Mem: 26.53MB, Util: 100%  global_step : 8089
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 90 | Mem: 26.53MB, Util: 100%  global_step : 8090


[Rank 2] Train Epoch 4:   5%|▍         | 99/2000 [00:00<00:13, 140.98it/s]
[Rank 0] Train Epoch 4:   6%|▌         | 116/2000 [00:00<00:15, 122.95it/s]
[Rank 1] Train Epoch 4:   5%|▌         | 105/2000 [00:00<00:14, 128.93it/s]
[Rank 2] Train Epoch 4:   6%|▌         | 114/2000 [00:00<00:13, 140.88it/s]
[Rank 0] Train Epoch 4:   7%|▋         | 134/2000 [00:00<00:13, 136.53it/s]
[Rank 1] Train Epoch 4:   6%|▌         | 119/2000 [00:00<00:14, 128.86it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 100 | Mem: 26.53MB, Util: 100%  global_step : 8100
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 101 | Mem: 26.53MB, Util: 100%  global_step : 8101
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 102 | Mem: 26.53MB, Util: 100%  global_step : 8102
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 103 | Mem: 26.53MB, Util: 100%  global_step : 8103
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 104 | Mem: 26.53MB, Util: 100%  global_step : 8104
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 105 | Mem: 26.53MB, Util: 100%  global_step : 8105
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 106 | Mem: 26.53MB, Util: 99%  global_step : 8106
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 107 | Mem: 26.53MB, Util: 99%  global_step :

[Rank 2] Train Epoch 4:   6%|▋         | 129/2000 [00:00<00:13, 137.36it/s]
[Rank 0] Train Epoch 4:   8%|▊         | 152/2000 [00:01<00:12, 147.11it/s]
[Rank 1] Train Epoch 4:   7%|▋         | 133/2000 [00:00<00:14, 130.56it/s]
[Rank 2] Train Epoch 4:   7%|▋         | 143/2000 [00:01<00:13, 136.39it/s]
[Rank 0] Train Epoch 4:   8%|▊         | 170/2000 [00:01<00:11, 155.70it/s]
[Rank 1] Train Epoch 4:   7%|▋         | 147/2000 [00:01<00:14, 131.51it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 135 | Mem: 26.53MB, Util: 99%  global_step : 8135
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 136 | Mem: 26.53MB, Util: 99%  global_step : 8136
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 137 | Mem: 26.53MB, Util: 99%  global_step : 8137
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 138 | Mem: 26.53MB, Util: 99%  global_step : 8138
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 139 | Mem: 26.53MB, Util: 99%  global_step : 8139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 140 | Mem: 26.53MB, Util: 99%  global_step : 8140
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 141 | Mem: 26.53MB, Util: 99%  global_step : 8141
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 142 | Mem: 26.53MB, Util: 100%  global_step : 8142

[Rank 2] Train Epoch 4:   8%|▊         | 157/2000 [00:01<00:13, 135.30it/s]
[Rank 0] Train Epoch 4:   9%|▉         | 188/2000 [00:01<00:11, 160.71it/s]
[Rank 1] Train Epoch 4:   8%|▊         | 161/2000 [00:01<00:13, 132.36it/s]
[Rank 2] Train Epoch 4:   9%|▊         | 171/2000 [00:01<00:13, 134.61it/s]
[Rank 1] Train Epoch 4:   9%|▉         | 175/2000 [00:01<00:13, 132.83it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 172 | Mem: 26.53MB, Util: 100%  global_step : 8172
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 173 | Mem: 26.53MB, Util: 100%  global_step : 8173
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 174 | Mem: 26.53MB, Util: 100%  global_step : 8174
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 175 | Mem: 26.53MB, Util: 100%  global_step : 8175
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 176 | Mem: 26.53MB, Util: 100%  global_step : 8176
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 177 | Mem: 26.53MB, Util: 100%  global_step : 8177
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 178 | Mem: 26.53MB, Util: 100%  global_step : 8178
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 4, Batch 179 | Mem: 26.53MB, Util: 100%  global_step

[Rank 2] Train Epoch 4:   9%|▉         | 185/2000 [00:01<00:13, 131.46it/s]
[Rank 1] Train Epoch 4:   9%|▉         | 189/2000 [00:01<00:13, 134.49it/s]
[Rank 2] Train Epoch 4:  10%|▉         | 199/2000 [00:01<00:13, 132.56it/s]
[Rank 0] Train Epoch 4:  10%|█         | 205/2000 [00:01<00:16, 107.44it/s]
[Rank 1] Train Epoch 4:  10%|█         | 203/2000 [00:01<00:13, 131.17it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 187 | Mem: 26.53MB, Util: 67%  global_step : 8187
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 188 | Mem: 26.53MB, Util: 67%  global_step : 8188
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 189 | Mem: 26.53MB, Util: 67%  global_step : 8189
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 190 | Mem: 26.53MB, Util: 67%  global_step : 8190
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 191 | Mem: 26.53MB, Util: 77%  global_step : 8191
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 192 | Mem: 26.53MB, Util: 77%  global_step : 8192
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 193 | Mem: 26.53MB, Util: 77%  global_step : 8193
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 194 | Mem: 26.53MB, Util: 77%  glo

[Rank 2] Train Epoch 4:  11%|█         | 213/2000 [00:01<00:13, 133.40it/s]
[Rank 0] Train Epoch 4:  11%|█         | 223/2000 [00:01<00:14, 122.19it/s]
[Rank 1] Train Epoch 4:  11%|█         | 218/2000 [00:01<00:13, 133.98it/s]
[Rank 2] Train Epoch 4:  11%|█▏        | 227/2000 [00:01<00:13, 133.60it/s]
[Rank 0] Train Epoch 4:  12%|█▏        | 238/2000 [00:01<00:13, 128.46it/s]
[Rank 1] Train Epoch 4:  12%|█▏        | 233/2000 [00:01<00:13, 135.69it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 214 | Mem: 26.53MB, Util: 77%  global_step : 8214
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 215 | Mem: 26.53MB, Util: 77%  global_step : 8215
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 216 | Mem: 26.53MB, Util: 77%  global_step : 8216
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 217 | Mem: 26.53MB, Util: 77%  global_step : 8217
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 218 | Mem: 26.53MB, Util: 91%  global_step : 8218
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 219 | Mem: 26.53MB, Util: 91%  global_step : 8219
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 220 | Mem: 26.53MB, Util: 91%  global_step : 8220
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 221 | Mem: 26.53MB, Util: 91%  glo

[Rank 2] Train Epoch 4:  12%|█▏        | 241/2000 [00:01<00:13, 134.18it/s]
[Rank 0] Train Epoch 4:  13%|█▎        | 253/2000 [00:01<00:13, 130.91it/s]
[Rank 1] Train Epoch 4:  12%|█▏        | 247/2000 [00:01<00:13, 130.78it/s]
[Rank 2] Train Epoch 4:  13%|█▎        | 255/2000 [00:01<00:13, 134.00it/s]
[Rank 0] Train Epoch 4:  13%|█▎        | 268/2000 [00:01<00:12, 133.29it/s]
[Rank 1] Train Epoch 4:  13%|█▎        | 261/2000 [00:01<00:13, 132.69it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 242 | Mem: 26.53MB, Util: 91%  global_step : 8242
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 243 | Mem: 26.53MB, Util: 91%  global_step : 8243
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 244 | Mem: 26.53MB, Util: 91%  global_step : 8244
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 245 | Mem: 26.53MB, Util: 99%  global_step : 8245
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 246 | Mem: 26.53MB, Util: 99%  global_step : 8246
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 247 | Mem: 26.53MB, Util: 99%  global_step : 8247
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 248 | Mem: 26.53MB, Util: 99%  global_step : 8248
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 249 | Mem: 26.53MB, Util: 99%  glo

[Rank 2] Train Epoch 4:  13%|█▎        | 269/2000 [00:02<00:12, 133.66it/s]
[Rank 0] Train Epoch 4:  14%|█▍        | 283/2000 [00:02<00:12, 133.93it/s]
[Rank 1] Train Epoch 4:  14%|█▍        | 275/2000 [00:02<00:12, 132.91it/s]
[Rank 2] Train Epoch 4:  14%|█▍        | 283/2000 [00:02<00:12, 134.01it/s]
[Rank 0] Train Epoch 4:  15%|█▍        | 298/2000 [00:02<00:12, 134.20it/s]
[Rank 1] Train Epoch 4:  14%|█▍        | 289/2000 [00:02<00:12, 132.18it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 270 | Mem: 26.53MB, Util: 99%  global_step : 8270
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 271 | Mem: 26.53MB, Util: 99%  global_step : 8271
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 272 | Mem: 26.53MB, Util: 68%  global_step : 8272
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 273 | Mem: 26.53MB, Util: 68%  global_step : 8273
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 274 | Mem: 26.53MB, Util: 68%  global_step : 8274
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 275 | Mem: 26.53MB, Util: 68%  global_step : 8275
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 276 | Mem: 26.53MB, Util: 68%  global_step : 8276
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 277 | Mem: 26.53MB, Util: 68%  glo

[Rank 2] Train Epoch 4:  15%|█▍        | 297/2000 [00:02<00:12, 133.87it/s]
[Rank 1] Train Epoch 4:  15%|█▌        | 303/2000 [00:02<00:13, 129.37it/s]
[Rank 2] Train Epoch 4:  16%|█▌        | 311/2000 [00:02<00:13, 128.74it/s]
[Rank 0] Train Epoch 4:  16%|█▌        | 312/2000 [00:02<00:15, 111.72it/s]
[Rank 1] Train Epoch 4:  16%|█▌        | 317/2000 [00:02<00:12, 132.31it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 296 | Mem: 26.53MB, Util: 63%  global_step : 8296
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 297 | Mem: 26.53MB, Util: 63%  global_step : 8297
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 298 | Mem: 26.53MB, Util: 63%  global_step : 8298
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 299 | Mem: 26.53MB, Util: 63%  global_step : 8299
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 300 | Mem: 26.53MB, Util: 63%  global_step : 8300
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 301 | Mem: 26.53MB, Util: 63%  global_step : 8301
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 302 | Mem: 26.53MB, Util: 63%  global_step : 8302
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 303 | Mem: 26.53MB, Util: 63%  glo

[Rank 2] Train Epoch 4:  16%|█▋        | 325/2000 [00:02<00:12, 130.51it/s]
[Rank 0] Train Epoch 4:  16%|█▋        | 326/2000 [00:02<00:14, 117.27it/s]
[Rank 1] Train Epoch 4:  17%|█▋        | 331/2000 [00:02<00:12, 134.37it/s]
[Rank 2] Train Epoch 4:  17%|█▋        | 339/2000 [00:02<00:12, 131.93it/s]
[Rank 0] Train Epoch 4:  17%|█▋        | 340/2000 [00:02<00:13, 121.71it/s]
[Rank 1] Train Epoch 4:  17%|█▋        | 346/2000 [00:02<00:12, 136.48it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 325 | Mem: 26.53MB, Util: 86%  global_step : 8325
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 326 | Mem: 26.53MB, Util: 86%  global_step : 8326
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 327 | Mem: 26.53MB, Util: 86%  global_step : 8327
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 328 | Mem: 26.53MB, Util: 86%  global_step : 8328
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 329 | Mem: 26.53MB, Util: 86%  global_step : 8329
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 330 | Mem: 26.53MB, Util: 86%  global_step : 8330
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 331 | Mem: 26.53MB, Util: 86%  global_step : 8331
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 332 | Mem: 26.53MB, Util: 86%  glo

[Rank 2] Train Epoch 4:  18%|█▊        | 353/2000 [00:02<00:12, 132.51it/s]
[Rank 0] Train Epoch 4:  18%|█▊        | 354/2000 [00:02<00:13, 125.45it/s]
[Rank 1] Train Epoch 4:  18%|█▊        | 361/2000 [00:02<00:11, 137.92it/s]
[Rank 2] Train Epoch 4:  18%|█▊        | 367/2000 [00:02<00:12, 130.47it/s]
[Rank 0] Train Epoch 4:  18%|█▊        | 368/2000 [00:02<00:12, 128.42it/s]
[Rank 1] Train Epoch 4:  19%|█▉        | 376/2000 [00:02<00:11, 139.06it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 355 | Mem: 26.53MB, Util: 100%  global_step : 8355
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 356 | Mem: 26.53MB, Util: 100%  global_step : 8356
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 357 | Mem: 26.53MB, Util: 100%  global_step : 8357
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 358 | Mem: 26.53MB, Util: 100%  global_step : 8358
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 359 | Mem: 26.53MB, Util: 100%  global_step : 8359
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 360 | Mem: 26.53MB, Util: 100%  global_step : 8360
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 361 | Mem: 26.53MB, Util: 100%  global_step : 8361
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 362 | Mem: 26.53MB, Util: 1

[Rank 2] Train Epoch 4:  19%|█▉        | 381/2000 [00:02<00:12, 131.82it/s]
[Rank 0] Train Epoch 4:  19%|█▉        | 382/2000 [00:02<00:12, 129.94it/s]
[Rank 1] Train Epoch 4:  20%|█▉        | 391/2000 [00:02<00:11, 140.50it/s]
[Rank 2] Train Epoch 4:  20%|█▉        | 397/2000 [00:02<00:11, 139.71it/s]
[Rank 0] Train Epoch 4:  20%|█▉        | 396/2000 [00:02<00:12, 131.85it/s]
[Rank 1] Train Epoch 4:  20%|██        | 406/2000 [00:03<00:12, 127.44it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 385 | Mem: 26.53MB, Util: 100%  global_step : 8385
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 386 | Mem: 26.53MB, Util: 100%  global_step : 8386
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 387 | Mem: 26.53MB, Util: 100%  global_step : 8387
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 388 | Mem: 26.53MB, Util: 100%  global_step : 8388
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 389 | Mem: 26.53MB, Util: 100%  global_step : 8389
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 390 | Mem: 26.53MB, Util: 100%  global_step : 8390
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 391 | Mem: 26.53MB, Util: 100%  global_step : 8391
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 392 | Mem: 26.53MB, Util: 1

[Rank 2] Train Epoch 4:  21%|██        | 412/2000 [00:03<00:11, 139.56it/s]
[Rank 0] Train Epoch 4:  20%|██        | 410/2000 [00:03<00:11, 132.66it/s]
[Rank 1] Train Epoch 4:  21%|██        | 420/2000 [00:03<00:12, 130.55it/s]
[Rank 2] Train Epoch 4:  21%|██▏       | 428/2000 [00:03<00:10, 145.34it/s]
[Rank 0] Train Epoch 4:  21%|██        | 424/2000 [00:03<00:11, 133.33it/s]
[Rank 1] Train Epoch 4:  22%|██▏       | 435/2000 [00:03<00:11, 133.48it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 409 | Mem: 26.53MB, Util: 99%  global_step : 8409
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 410 | Mem: 26.53MB, Util: 99%  global_step : 8410
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 411 | Mem: 26.53MB, Util: 99%  global_step : 8411
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 412 | Mem: 26.53MB, Util: 99%  global_step : 8412
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 413 | Mem: 26.53MB, Util: 99%  global_step : 8413
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 414 | Mem: 26.53MB, Util: 99%  global_step : 8414
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 415 | Mem: 26.53MB, Util: 99%  global_step : 8415
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 416 | Mem: 26.53MB, Util: 99%  glo

[Rank 2] Train Epoch 4:  22%|██▏       | 444/2000 [00:03<00:10, 148.83it/s]
[Rank 0] Train Epoch 4:  22%|██▏       | 438/2000 [00:03<00:11, 134.15it/s]
[Rank 1] Train Epoch 4:  22%|██▏       | 449/2000 [00:03<00:11, 135.30it/s]
[Rank 2] Train Epoch 4:  23%|██▎       | 460/2000 [00:03<00:10, 151.74it/s]
[Rank 0] Train Epoch 4:  23%|██▎       | 452/2000 [00:03<00:11, 134.60it/s]
[Rank 1] Train Epoch 4:  23%|██▎       | 464/2000 [00:03<00:11, 137.10it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 438 | Mem: 26.53MB, Util: 88%  global_step : 8438
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 439 | Mem: 26.53MB, Util: 88%  global_step : 8439
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 440 | Mem: 26.53MB, Util: 88%  global_step : 8440
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 441 | Mem: 26.53MB, Util: 88%  global_step : 8441
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 442 | Mem: 26.53MB, Util: 88%  global_step : 8442
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 443 | Mem: 26.53MB, Util: 88%  global_step : 8443
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 444 | Mem: 26.53MB, Util: 88%  global_step : 8444
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 445 | Mem: 26.53MB, Util: 88%  glo

[Rank 2] Train Epoch 4:  24%|██▍       | 476/2000 [00:03<00:09, 153.26it/s]
[Rank 0] Train Epoch 4:  23%|██▎       | 466/2000 [00:03<00:11, 134.18it/s]
[Rank 1] Train Epoch 4:  24%|██▍       | 478/2000 [00:03<00:11, 137.28it/s]
[Rank 2] Train Epoch 4:  25%|██▍       | 492/2000 [00:03<00:09, 154.56it/s]
[Rank 0] Train Epoch 4:  24%|██▍       | 480/2000 [00:03<00:11, 133.39it/s]
[Rank 1] Train Epoch 4:  25%|██▍       | 493/2000 [00:03<00:10, 139.40it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 468 | Mem: 26.53MB, Util: 100%  global_step : 8468
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 469 | Mem: 26.53MB, Util: 100%  global_step : 8469
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 470 | Mem: 26.53MB, Util: 100%  global_step : 8470
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 471 | Mem: 26.53MB, Util: 100%  global_step : 8471
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 472 | Mem: 26.53MB, Util: 100%  global_step : 8472
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 473 | Mem: 26.53MB, Util: 100%  global_step : 8473
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 474 | Mem: 26.53MB, Util: 100%  global_step : 8474
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 475 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 4:  25%|██▍       | 494/2000 [00:03<00:11, 131.95it/s]
[Rank 1] Train Epoch 4:  25%|██▌       | 507/2000 [00:03<00:11, 126.63it/s]
[Rank 2] Train Epoch 4:  25%|██▌       | 508/2000 [00:03<00:12, 115.07it/s]
[Rank 1] Train Epoch 4:  26%|██▌       | 524/2000 [00:03<00:10, 138.19it/s]
[Rank 2] Train Epoch 4:  26%|██▋       | 526/2000 [00:03<00:11, 128.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 498 | Mem: 26.53MB, Util: 100%  global_step : 8498
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 499 | Mem: 26.53MB, Util: 100%  global_step : 8499
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 500 | Mem: 26.53MB, Util: 100%  global_step : 8500
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 501 | Mem: 26.53MB, Util: 100%  global_step : 8501
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 502 | Mem: 26.53MB, Util: 100%  global_step : 8502
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 503 | Mem: 26.53MB, Util: 100%  global_step : 8503
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 504 | Mem: 26.53MB, Util: 100%  global_step : 8504
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 505 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 4:  25%|██▌       | 508/2000 [00:04<00:38, 39.15it/s] 
[Rank 1] Train Epoch 4:  27%|██▋       | 539/2000 [00:04<00:30, 47.49it/s] 
[Rank 2] Train Epoch 4:  27%|██▋       | 541/2000 [00:04<00:30, 48.45it/s] 
[Rank 0] Train Epoch 4:  26%|██▋       | 526/2000 [00:04<00:27, 53.96it/s]
[Rank 1] Train Epoch 4:  28%|██▊       | 554/2000 [00:04<00:24, 59.45it/s]
[Rank 2] Train Epoch 4:  28%|██▊       | 557/2000 [00:04<00:23, 61.12it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 549 | Mem: 26.53MB, Util: 100%  global_step : 8549
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 550 | Mem: 26.53MB, Util: 100%  global_step : 8550
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 551 | Mem: 26.53MB, Util: 100%  global_step : 8551
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 552 | Mem: 26.53MB, Util: 100%  global_step : 8552
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 553 | Mem: 26.53MB, Util: 100%  global_step : 8553
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 554 | Mem: 26.53MB, Util: 100%  global_step : 8554
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 555 | Mem: 26.53MB, Util: 100%  global_step : 8555
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 556 | Mem: 26.53MB, Util: 100%  glo

[Rank 0] Train Epoch 4:  27%|██▋       | 544/2000 [00:04<00:20, 70.40it/s]
[Rank 1] Train Epoch 4:  28%|██▊       | 569/2000 [00:04<00:19, 72.42it/s]
[Rank 2] Train Epoch 4:  29%|██▊       | 571/2000 [00:04<00:19, 71.96it/s]
[Rank 0] Train Epoch 4:  28%|██▊       | 562/2000 [00:04<00:16, 87.48it/s]
[Rank 1] Train Epoch 4:  29%|██▉       | 584/2000 [00:05<00:16, 85.58it/s]
[Rank 2] Train Epoch 4:  29%|██▉       | 586/2000 [00:05<00:16, 84.65it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 580 | Mem: 26.53MB, Util: 100%  global_step : 8580
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 581 | Mem: 26.53MB, Util: 100%  global_step : 8581
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 582 | Mem: 26.53MB, Util: 100%  global_step : 8582
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 583 | Mem: 26.53MB, Util: 100%  global_step : 8583
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 584 | Mem: 26.53MB, Util: 100%  global_step : 8584
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 585 | Mem: 26.53MB, Util: 100%  global_step : 8585
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 586 | Mem: 26.53MB, Util: 100%  global_step : 8586
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 587 | Mem: 26.53MB, Util: 100%  glo

[Rank 0] Train Epoch 4:  29%|██▉       | 580/2000 [00:05<00:13, 104.21it/s]
[Rank 1] Train Epoch 4:  30%|██▉       | 599/2000 [00:05<00:14, 98.03it/s]
[Rank 2] Train Epoch 4:  30%|███       | 601/2000 [00:05<00:16, 86.75it/s]
[Rank 0] Train Epoch 4:  30%|██▉       | 598/2000 [00:05<00:11, 119.39it/s]
[Rank 2] Train Epoch 4:  31%|███       | 616/2000 [00:05<00:14, 98.45it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 601 | Mem: 26.53MB, Util: 100%  global_step : 8601
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 602 | Mem: 26.53MB, Util: 100%  global_step : 8602
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 603 | Mem: 26.53MB, Util: 100%  global_step : 8603
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 604 | Mem: 26.53MB, Util: 100%  global_step : 8604
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 605 | Mem: 26.53MB, Util: 100%  global_step : 8605
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 606 | Mem: 26.53MB, Util: 100%  global_step : 8606
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 607 | Mem: 26.53MB, Util: 100%  global_step : 8607
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 608 | Mem: 26.53MB, Util: 100%  glo

[Rank 0] Train Epoch 4:  31%|███       | 616/2000 [00:05<00:10, 132.54it/s]
[Rank 1] Train Epoch 4:  31%|███       | 613/2000 [00:05<00:14, 96.45it/s]
[Rank 2] Train Epoch 4:  32%|███▏      | 631/2000 [00:05<00:12, 109.20it/s]
[Rank 0] Train Epoch 4:  32%|███▏      | 634/2000 [00:05<00:09, 142.78it/s]
[Rank 1] Train Epoch 4:  31%|███▏      | 628/2000 [00:05<00:12, 107.52it/s]
[Rank 2] Train Epoch 4:  32%|███▏      | 646/2000 [00:05<00:11, 117.29it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 632 | Mem: 26.53MB, Util: 84%  global_step : 8632
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 633 | Mem: 26.53MB, Util: 84%  global_step : 8633
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 634 | Mem: 26.53MB, Util: 84%  global_step : 8634
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 635 | Mem: 26.53MB, Util: 84%  global_step : 8635
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 636 | Mem: 26.53MB, Util: 84%  global_step : 8636
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 637 | Mem: 26.53MB, Util: 84%  global_step : 8637
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 638 | Mem: 26.53MB, Util: 84%  global_step : 8638
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 639 | Mem: 26.53MB, Util: 84%  global_step

[Rank 0] Train Epoch 4:  33%|███▎      | 652/2000 [00:05<00:08, 151.25it/s]
[Rank 1] Train Epoch 4:  32%|███▏      | 642/2000 [00:05<00:11, 115.22it/s]
[Rank 2] Train Epoch 4:  33%|███▎      | 661/2000 [00:05<00:10, 123.51it/s]
[Rank 0] Train Epoch 4:  34%|███▎      | 670/2000 [00:05<00:08, 157.90it/s]
[Rank 1] Train Epoch 4:  33%|███▎      | 656/2000 [00:05<00:11, 116.27it/s]
[Rank 2] Train Epoch 4:  34%|███▍      | 676/2000 [00:05<00:10, 129.03it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 661 | Mem: 26.53MB, Util: 76%  global_step : 8661
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 662 | Mem: 26.53MB, Util: 76%  global_step : 8662
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 663 | Mem: 26.53MB, Util: 76%  global_step : 8663
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 664 | Mem: 26.53MB, Util: 76%  global_step : 8664
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 665 | Mem: 26.53MB, Util: 76%  global_step : 8665
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 666 | Mem: 26.53MB, Util: 76%  global_step : 8666
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 667 | Mem: 26.53MB, Util: 76%  global_step : 8667
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 668 | Mem: 26.53MB, Util: 76%  global_step

[Rank 0] Train Epoch 4:  34%|███▍      | 688/2000 [00:05<00:08, 163.41it/s]
[Rank 1] Train Epoch 4:  34%|███▎      | 671/2000 [00:05<00:10, 123.39it/s]
[Rank 2] Train Epoch 4:  35%|███▍      | 691/2000 [00:05<00:09, 133.52it/s]
[Rank 1] Train Epoch 4:  34%|███▍      | 686/2000 [00:05<00:10, 128.26it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 691 | Mem: 26.53MB, Util: 98%  global_step : 8691
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 692 | Mem: 26.53MB, Util: 98%  global_step : 8692
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 693 | Mem: 26.53MB, Util: 98%  global_step : 8693
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 694 | Mem: 26.53MB, Util: 98%  global_step : 8694
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 695 | Mem: 26.53MB, Util: 98%  global_step : 8695
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 696 | Mem: 26.53MB, Util: 98%  global_step : 8696
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 697 | Mem: 26.53MB, Util: 98%  global_step : 8697
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 698 | Mem: 26.53MB, Util: 98%  global_step

[Rank 0] Train Epoch 4:  35%|███▌      | 706/2000 [00:05<00:10, 118.14it/s]
[Rank 1] Train Epoch 4:  35%|███▌      | 701/2000 [00:05<00:09, 132.45it/s]
[Rank 2] Train Epoch 4:  35%|███▌      | 706/2000 [00:05<00:10, 128.82it/s]
[Rank 0] Train Epoch 4:  36%|███▌      | 724/2000 [00:06<00:09, 131.28it/s]
[Rank 1] Train Epoch 4:  36%|███▌      | 715/2000 [00:05<00:09, 132.62it/s]
[Rank 2] Train Epoch 4:  36%|███▌      | 721/2000 [00:06<00:09, 133.50it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 718 | Mem: 26.53MB, Util: 100%  global_step : 8718
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 719 | Mem: 26.53MB, Util: 100%  global_step : 8719
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 720 | Mem: 26.53MB, Util: 100%  global_step : 8720
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 721 | Mem: 26.53MB, Util: 100%  global_step : 8721
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 722 | Mem: 26.53MB, Util: 100%  global_step : 8722
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 723 | Mem: 26.53MB, Util: 100%  global_step : 8723
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 724 | Mem: 26.53MB, Util: 100%  global_step : 8724
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 725 | Mem: 26.53MB, Util: 100%  glo

[Rank 0] Train Epoch 4:  37%|███▋      | 742/2000 [00:06<00:08, 142.44it/s]
[Rank 1] Train Epoch 4:  36%|███▋      | 729/2000 [00:06<00:09, 134.36it/s]
[Rank 2] Train Epoch 4:  37%|███▋      | 736/2000 [00:06<00:09, 137.45it/s]
[Rank 0] Train Epoch 4:  38%|███▊      | 760/2000 [00:06<00:08, 151.40it/s]
[Rank 1] Train Epoch 4:  37%|███▋      | 744/2000 [00:06<00:09, 138.34it/s]
[Rank 2] Train Epoch 4:  38%|███▊      | 751/2000 [00:06<00:08, 139.57it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 749 | Mem: 26.53MB, Util: 97%  global_step : 8749
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 750 | Mem: 26.53MB, Util: 97%  global_step : 8750
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 751 | Mem: 26.53MB, Util: 97%  global_step : 8751
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 752 | Mem: 26.53MB, Util: 97%  global_step : 8752
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 753 | Mem: 26.53MB, Util: 97%  global_step : 8753
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 754 | Mem: 26.53MB, Util: 97%  global_step : 8754
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 755 | Mem: 26.53MB, Util: 97%  global_step : 8755
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 756 | Mem: 26.53MB, Util: 97%  global_step

[Rank 0] Train Epoch 4:  39%|███▉      | 777/2000 [00:06<00:08, 150.49it/s]
[Rank 1] Train Epoch 4:  38%|███▊      | 759/2000 [00:06<00:09, 136.21it/s]
[Rank 2] Train Epoch 4:  38%|███▊      | 766/2000 [00:06<00:08, 139.93it/s]
[Rank 0] Train Epoch 4:  40%|███▉      | 794/2000 [00:06<00:08, 148.72it/s]
[Rank 1] Train Epoch 4:  39%|███▊      | 773/2000 [00:06<00:09, 136.16it/s]
[Rank 2] Train Epoch 4:  39%|███▉      | 781/2000 [00:06<00:08, 141.96it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 779 | Mem: 26.53MB, Util: 100%  global_step : 8779
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 780 | Mem: 26.53MB, Util: 100%  global_step : 8780
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 781 | Mem: 26.53MB, Util: 100%  global_step : 8781
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 782 | Mem: 26.53MB, Util: 100%  global_step : 8782
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 783 | Mem: 26.53MB, Util: 100%  global_step : 8783
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 784 | Mem: 26.53MB, Util: 100%  global_step : 8784
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 785 | Mem: 26.53MB, Util: 100%  global_step : 8785
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 786 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 4:  39%|███▉      | 787/2000 [00:06<00:08, 136.11it/s]
[Rank 2] Train Epoch 4:  40%|███▉      | 796/2000 [00:06<00:08, 142.87it/s]
[Rank 0] Train Epoch 4:  40%|████      | 810/2000 [00:06<00:10, 112.06it/s]
[Rank 1] Train Epoch 4:  40%|████      | 801/2000 [00:06<00:08, 134.52it/s]
[Rank 2] Train Epoch 4:  41%|████      | 811/2000 [00:06<00:09, 127.74it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 803 | Mem: 26.53MB, Util: 100%  global_step : 8803
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 804 | Mem: 26.53MB, Util: 100%  global_step : 8804
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 805 | Mem: 26.53MB, Util: 100%  global_step : 8805
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 806 | Mem: 26.53MB, Util: 100%  global_step : 8806
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 807 | Mem: 26.53MB, Util: 100%  global_step : 8807
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 808 | Mem: 26.53MB, Util: 100%  global_step : 8808
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 809 | Mem: 26.53MB, Util: 100%  global_step : 8809
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 810 | Mem: 26.53MB, Util: 100%  glo

[Rank 0] Train Epoch 4:  41%|████▏     | 825/2000 [00:06<00:09, 119.03it/s]
[Rank 1] Train Epoch 4:  41%|████      | 815/2000 [00:06<00:08, 133.36it/s]
[Rank 2] Train Epoch 4:  41%|████▏     | 826/2000 [00:06<00:08, 132.92it/s]
[Rank 0] Train Epoch 4:  42%|████▏     | 840/2000 [00:06<00:09, 125.36it/s]
[Rank 1] Train Epoch 4:  41%|████▏     | 829/2000 [00:06<00:08, 132.69it/s]
[Rank 2] Train Epoch 4:  42%|████▏     | 841/2000 [00:06<00:08, 136.86it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 834 | Mem: 26.53MB, Util: 97%  global_step : 8834
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 835 | Mem: 26.53MB, Util: 97%  global_step : 8835
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 836 | Mem: 26.53MB, Util: 97%  global_step : 8836
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 837 | Mem: 26.53MB, Util: 97%  global_step : 8837
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 838 | Mem: 26.53MB, Util: 97%  global_step : 8838
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 839 | Mem: 26.53MB, Util: 97%  global_step : 8839
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 840 | Mem: 26.53MB, Util: 97%  global_step : 8840
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 841 | Mem: 26.53MB, Util: 97%  global_step

[Rank 1] Train Epoch 4:  42%|████▏     | 843/2000 [00:06<00:09, 126.69it/s]
[Rank 0] Train Epoch 4:  43%|████▎     | 856/2000 [00:07<00:08, 133.30it/s]
[Rank 2] Train Epoch 4:  43%|████▎     | 856/2000 [00:07<00:08, 139.75it/s]
[Rank 1] Train Epoch 4:  43%|████▎     | 857/2000 [00:07<00:08, 129.40it/s]
[Rank 0] Train Epoch 4:  44%|████▎     | 871/2000 [00:07<00:08, 136.33it/s]
[Rank 2] Train Epoch 4:  44%|████▎     | 871/2000 [00:07<00:07, 142.24it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 865 | Mem: 26.53MB, Util: 100%  global_step : 8865
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 866 | Mem: 26.53MB, Util: 100%  global_step : 8866
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 867 | Mem: 26.53MB, Util: 100%  global_step : 8867
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 868 | Mem: 26.53MB, Util: 100%  global_step : 8868
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 869 | Mem: 26.53MB, Util: 100%  global_step : 8869
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 870 | Mem: 26.53MB, Util: 100%  global_step : 8870
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 871 | Mem: 26.53MB, Util: 100%  global_step : 8871
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 872 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 4:  44%|████▎     | 871/2000 [00:07<00:08, 131.09it/s]
[Rank 0] Train Epoch 4:  44%|████▍     | 886/2000 [00:07<00:08, 137.66it/s]
[Rank 2] Train Epoch 4:  44%|████▍     | 886/2000 [00:07<00:07, 143.13it/s]
[Rank 1] Train Epoch 4:  44%|████▍     | 885/2000 [00:07<00:08, 131.57it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 895 | Mem: 26.53MB, Util: 100%  global_step : 8895
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 896 | Mem: 26.53MB, Util: 100%  global_step : 8896
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 897 | Mem: 26.53MB, Util: 100%  global_step : 8897
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 898 | Mem: 26.53MB, Util: 100%  global_step : 8898
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 899 | Mem: 26.53MB, Util: 100%  global_step : 8899
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 891 | Mem: 26.53MB, Util: 67%  global_step : 8891
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 892 | Mem: 26.53MB, Util: 67%  global_step : 8892
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 893 | Mem: 26.53MB, Util: 67%  glo

[Rank 1] Train Epoch 4:  45%|████▍     | 899/2000 [00:07<00:08, 132.81it/s]
[Rank 0] Train Epoch 4:  45%|████▌     | 901/2000 [00:07<00:09, 114.81it/s]
[Rank 2] Train Epoch 4:  45%|████▌     | 901/2000 [00:07<00:09, 115.31it/s]
[Rank 1] Train Epoch 4:  46%|████▌     | 914/2000 [00:07<00:08, 135.68it/s]
[Rank 0] Train Epoch 4:  46%|████▌     | 916/2000 [00:07<00:08, 122.47it/s]
[Rank 2] Train Epoch 4:  46%|████▌     | 916/2000 [00:07<00:08, 123.37it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 913 | Mem: 26.53MB, Util: 100%  global_step : 8913
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 914 | Mem: 26.53MB, Util: 92%  global_step : 8914
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 915 | Mem: 26.53MB, Util: 92%  global_step : 8915
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 916 | Mem: 26.53MB, Util: 92%  global_step : 8916
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 917 | Mem: 26.53MB, Util: 92%  global_step : 8917
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 918 | Mem: 26.53MB, Util: 92%  global_step : 8918
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 919 | Mem: 26.53MB, Util: 92%  global_step : 8919
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 920 | Mem: 26.53MB, Util: 92%  global_ste

[Rank 1] Train Epoch 4:  46%|████▋     | 929/2000 [00:07<00:07, 137.46it/s]
[Rank 0] Train Epoch 4:  47%|████▋     | 931/2000 [00:07<00:08, 128.39it/s]
[Rank 2] Train Epoch 4:  47%|████▋     | 931/2000 [00:07<00:08, 129.43it/s]
[Rank 1] Train Epoch 4:  47%|████▋     | 943/2000 [00:07<00:07, 137.23it/s]
[Rank 0] Train Epoch 4:  47%|████▋     | 946/2000 [00:07<00:07, 132.96it/s]
[Rank 2] Train Epoch 4:  47%|████▋     | 946/2000 [00:07<00:07, 134.20it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 944 | Mem: 26.53MB, Util: 92%  global_step : 8944
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 945 | Mem: 26.53MB, Util: 92%  global_step : 8945
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 946 | Mem: 26.53MB, Util: 92%  global_step : 8946
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 947 | Mem: 26.53MB, Util: 100%  global_step : 8947
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 948 | Mem: 26.53MB, Util: 100%  global_step : 8948
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 949 | Mem: 26.53MB, Util: 100%  global_step : 8949
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 950 | Mem: 26.53MB, Util: 100%  global_step : 8950
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 951 | Mem: 26.53MB, Util: 100%  global

[Rank 1] Train Epoch 4:  48%|████▊     | 957/2000 [00:07<00:07, 137.06it/s]
[Rank 0] Train Epoch 4:  48%|████▊     | 961/2000 [00:07<00:07, 136.46it/s]
[Rank 2] Train Epoch 4:  48%|████▊     | 961/2000 [00:07<00:07, 137.33it/s]
[Rank 1] Train Epoch 4:  49%|████▊     | 971/2000 [00:07<00:07, 137.12it/s]
[Rank 0] Train Epoch 4:  49%|████▉     | 976/2000 [00:07<00:07, 139.13it/s]
[Rank 2] Train Epoch 4:  49%|████▉     | 976/2000 [00:07<00:07, 139.65it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 974 | Mem: 26.53MB, Util: 100%  global_step : 8974
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 975 | Mem: 26.53MB, Util: 100%  global_step : 8975
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 976 | Mem: 26.53MB, Util: 100%  global_step : 8976
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 977 | Mem: 26.53MB, Util: 100%  global_step : 8977
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 978 | Mem: 26.53MB, Util: 100%  global_step : 8978
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 979 | Mem: 26.53MB, Util: 100%  global_step : 8979
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 980 | Mem: 26.53MB, Util: 100%  global_step : 8980
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 981 | Mem: 26.53MB, Util: 100%  glo

[Rank 1] Train Epoch 4:  49%|████▉     | 986/2000 [00:08<00:07, 138.08it/s]
[Rank 0] Train Epoch 4:  50%|████▉     | 991/2000 [00:08<00:07, 140.63it/s]
[Rank 2] Train Epoch 4:  50%|████▉     | 991/2000 [00:08<00:07, 141.05it/s]
[Rank 1] Train Epoch 4:  50%|█████     | 1000/2000 [00:08<00:07, 137.81it/s]
[Rank 0] Train Epoch 4:  50%|█████     | 1006/2000 [00:08<00:07, 130.26it/s]
[Rank 2] Train Epoch 4:  50%|█████     | 1006/2000 [00:08<00:07, 129.75it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1000 | Mem: 26.53MB, Util: 100%  global_step : 9000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1001 | Mem: 26.53MB, Util: 100%  global_step : 9001
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1002 | Mem: 26.53MB, Util: 100%  global_step : 9002
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1003 | Mem: 26.53MB, Util: 100%  global_step : 9003
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1004 | Mem: 26.53MB, Util: 100%  global_step : 9004
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1005 | Mem: 26.53MB, Util: 100%  global_step : 9005
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1006 | Mem: 26.53MB, Util: 100%  global_step : 9006
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1007 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  51%|█████     | 1014/2000 [00:08<00:07, 137.81it/s]
[Rank 0] Train Epoch 4:  51%|█████     | 1021/2000 [00:08<00:07, 133.95it/s]
[Rank 2] Train Epoch 4:  51%|█████     | 1021/2000 [00:08<00:07, 134.47it/s]
[Rank 1] Train Epoch 4:  51%|█████▏    | 1028/2000 [00:08<00:07, 138.04it/s]
[Rank 0] Train Epoch 4:  52%|█████▏    | 1036/2000 [00:08<00:07, 136.33it/s]
[Rank 2] Train Epoch 4:  52%|█████▏    | 1036/2000 [00:08<00:07, 137.35it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1030 | Mem: 26.53MB, Util: 100%  global_step : 9030
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1031 | Mem: 26.53MB, Util: 100%  global_step : 9031
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1032 | Mem: 26.53MB, Util: 95%  global_step : 9032
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1033 | Mem: 26.53MB, Util: 95%  global_step : 9033
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1034 | Mem: 26.53MB, Util: 95%  global_step : 9034
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1035 | Mem: 26.53MB, Util: 95%  global_step : 9035
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1036 | Mem: 26.53MB, Util: 95%  global_step : 9036
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1037 | Mem: 26.53MB, Util: 95%  g

[Rank 1] Train Epoch 4:  52%|█████▏    | 1042/2000 [00:08<00:06, 137.57it/s]
[Rank 0] Train Epoch 4:  52%|█████▎    | 1050/2000 [00:08<00:06, 137.15it/s]
[Rank 2] Train Epoch 4:  53%|█████▎    | 1051/2000 [00:08<00:06, 140.04it/s]
[Rank 1] Train Epoch 4:  53%|█████▎    | 1056/2000 [00:08<00:06, 136.37it/s]
[Rank 0] Train Epoch 4:  53%|█████▎    | 1065/2000 [00:08<00:06, 138.24it/s]
[Rank 2] Train Epoch 4:  53%|█████▎    | 1066/2000 [00:08<00:06, 141.42it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1060 | Mem: 26.53MB, Util: 100%  global_step : 9060
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1061 | Mem: 26.53MB, Util: 100%  global_step : 9061
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1062 | Mem: 26.53MB, Util: 100%  global_step : 9062
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1063 | Mem: 26.53MB, Util: 100%  global_step : 9063
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1064 | Mem: 26.53MB, Util: 100%  global_step : 9064
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1065 | Mem: 26.53MB, Util: 100%  global_step : 9065
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1066 | Mem: 26.53MB, Util: 100%  global_step : 9066
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1067 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  54%|█████▎    | 1070/2000 [00:08<00:06, 135.69it/s]
[Rank 0] Train Epoch 4:  54%|█████▍    | 1079/2000 [00:08<00:06, 138.29it/s]
[Rank 2] Train Epoch 4:  54%|█████▍    | 1081/2000 [00:08<00:06, 139.42it/s]
[Rank 1] Train Epoch 4:  54%|█████▍    | 1084/2000 [00:08<00:06, 135.77it/s]
[Rank 0] Train Epoch 4:  55%|█████▍    | 1094/2000 [00:08<00:06, 139.07it/s]
[Rank 2] Train Epoch 4:  55%|█████▍    | 1096/2000 [00:08<00:06, 141.73it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1089 | Mem: 26.53MB, Util: 100%  global_step : 9089
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1090 | Mem: 26.53MB, Util: 100%  global_step : 9090
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1091 | Mem: 26.53MB, Util: 100%  global_step : 9091
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1092 | Mem: 26.53MB, Util: 100%  global_step : 9092
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1093 | Mem: 26.53MB, Util: 100%  global_step : 9093
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1094 | Mem: 26.53MB, Util: 100%  global_step : 9094
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1095 | Mem: 26.53MB, Util: 100%  global_step : 9095
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1096 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  55%|█████▍    | 1098/2000 [00:08<00:06, 135.70it/s]
[Rank 0] Train Epoch 4:  55%|█████▌    | 1108/2000 [00:08<00:06, 130.03it/s]
[Rank 2] Train Epoch 4:  56%|█████▌    | 1111/2000 [00:08<00:06, 127.41it/s]
[Rank 1] Train Epoch 4:  56%|█████▌    | 1113/2000 [00:08<00:06, 137.65it/s]
[Rank 0] Train Epoch 4:  56%|█████▌    | 1123/2000 [00:09<00:06, 134.17it/s]
[Rank 2] Train Epoch 4:  56%|█████▋    | 1126/2000 [00:09<00:06, 132.52it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1114 | Mem: 26.53MB, Util: 96%  global_step : 9114
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1115 | Mem: 26.53MB, Util: 96%  global_step : 9115
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1116 | Mem: 26.53MB, Util: 96%  global_step : 9116
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1117 | Mem: 26.53MB, Util: 96%  global_step : 9117
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1118 | Mem: 26.53MB, Util: 96%  global_step : 9118
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1119 | Mem: 26.53MB, Util: 96%  global_step : 9119
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1120 | Mem: 26.53MB, Util: 96%  global_step : 9120
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1121 | Mem: 26.53MB, Util: 96%  glo

[Rank 1] Train Epoch 4:  56%|█████▋    | 1128/2000 [00:09<00:06, 138.80it/s]
[Rank 0] Train Epoch 4:  57%|█████▋    | 1138/2000 [00:09<00:06, 136.32it/s]
[Rank 2] Train Epoch 4:  57%|█████▋    | 1140/2000 [00:09<00:06, 133.47it/s]
[Rank 1] Train Epoch 4:  57%|█████▋    | 1142/2000 [00:09<00:06, 137.27it/s]
[Rank 0] Train Epoch 4:  58%|█████▊    | 1153/2000 [00:09<00:06, 138.19it/s]
[Rank 2] Train Epoch 4:  58%|█████▊    | 1155/2000 [00:09<00:06, 137.84it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1143 | Mem: 26.53MB, Util: 94%  global_step : 9143
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1144 | Mem: 26.53MB, Util: 94%  global_step : 9144
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1145 | Mem: 26.53MB, Util: 94%  global_step : 9145
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1146 | Mem: 26.53MB, Util: 94%  global_step : 9146
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1147 | Mem: 26.53MB, Util: 94%  global_step : 9147
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1148 | Mem: 26.53MB, Util: 94%  global_step : 9148
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1149 | Mem: 26.53MB, Util: 94%  global_step : 9149
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1150 | Mem: 26.53MB, Util: 94%  glo

[Rank 1] Train Epoch 4:  58%|█████▊    | 1156/2000 [00:09<00:06, 137.92it/s]
[Rank 0] Train Epoch 4:  58%|█████▊    | 1167/2000 [00:09<00:06, 138.59it/s]
[Rank 2] Train Epoch 4:  58%|█████▊    | 1170/2000 [00:09<00:05, 140.78it/s]
[Rank 1] Train Epoch 4:  59%|█████▊    | 1171/2000 [00:09<00:05, 139.01it/s]
[Rank 0] Train Epoch 4:  59%|█████▉    | 1181/2000 [00:09<00:05, 138.44it/s]
[Rank 2] Train Epoch 4:  59%|█████▉    | 1185/2000 [00:09<00:05, 142.43it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1174 | Mem: 26.53MB, Util: 95%  global_step : 9174
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1175 | Mem: 26.53MB, Util: 95%  global_step : 9175
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1176 | Mem: 26.53MB, Util: 95%  global_step : 9176
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1177 | Mem: 26.53MB, Util: 95%  global_step : 9177
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1178 | Mem: 26.53MB, Util: 95%  global_step : 9178
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1179 | Mem: 26.53MB, Util: 95%  global_step : 9179
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1180 | Mem: 26.53MB, Util: 95%  global_step : 9180
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1181 | Mem: 26.53MB, Util: 95%  glo

[Rank 1] Train Epoch 4:  59%|█████▉    | 1186/2000 [00:09<00:05, 141.20it/s]
[Rank 0] Train Epoch 4:  60%|█████▉    | 1196/2000 [00:09<00:05, 139.48it/s]
[Rank 2] Train Epoch 4:  60%|██████    | 1200/2000 [00:09<00:05, 143.98it/s]
[Rank 1] Train Epoch 4:  60%|██████    | 1201/2000 [00:09<00:05, 142.37it/s]
[Rank 0] Train Epoch 4:  61%|██████    | 1211/2000 [00:09<00:05, 140.49it/s]
[Rank 2] Train Epoch 4:  61%|██████    | 1215/2000 [00:09<00:05, 137.34it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1203 | Mem: 26.53MB, Util: 100%  global_step : 9203
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1204 | Mem: 26.53MB, Util: 100%  global_step : 9204
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1205 | Mem: 26.53MB, Util: 100%  global_step : 9205
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1206 | Mem: 26.53MB, Util: 100%  global_step : 9206
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1207 | Mem: 26.53MB, Util: 100%  global_step : 9207
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1208 | Mem: 26.53MB, Util: 100%  global_step : 9208
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1209 | Mem: 26.53MB, Util: 100%  global_step : 9209
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1210 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  61%|██████    | 1216/2000 [00:09<00:05, 141.21it/s]
[Rank 0] Train Epoch 4:  61%|██████▏   | 1226/2000 [00:09<00:05, 140.20it/s]
[Rank 2] Train Epoch 4:  62%|██████▏   | 1230/2000 [00:09<00:05, 140.71it/s]
[Rank 1] Train Epoch 4:  62%|██████▏   | 1231/2000 [00:09<00:05, 140.97it/s]
[Rank 0] Train Epoch 4:  62%|██████▏   | 1241/2000 [00:09<00:05, 140.52it/s]
[Rank 2] Train Epoch 4:  62%|██████▏   | 1245/2000 [00:09<00:05, 142.81it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1234 | Mem: 26.53MB, Util: 95%  global_step : 9234
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1235 | Mem: 26.53MB, Util: 95%  global_step : 9235
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1236 | Mem: 26.53MB, Util: 95%  global_step : 9236
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1237 | Mem: 26.53MB, Util: 95%  global_step : 9237
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1238 | Mem: 26.53MB, Util: 95%  global_step : 9238
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1239 | Mem: 26.53MB, Util: 95%  global_step : 9239
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1240 | Mem: 26.53MB, Util: 95%  global_step : 9240
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1241 | Mem: 26.53MB, Util: 95%  glo

[Rank 1] Train Epoch 4:  62%|██████▏   | 1246/2000 [00:09<00:05, 141.63it/s]
[Rank 0] Train Epoch 4:  63%|██████▎   | 1256/2000 [00:09<00:05, 140.94it/s]
[Rank 2] Train Epoch 4:  63%|██████▎   | 1260/2000 [00:09<00:05, 144.35it/s]
[Rank 1] Train Epoch 4:  63%|██████▎   | 1261/2000 [00:09<00:05, 141.83it/s]
[Rank 0] Train Epoch 4:  64%|██████▎   | 1271/2000 [00:10<00:05, 141.34it/s]
[Rank 2] Train Epoch 4:  64%|██████▍   | 1275/2000 [00:10<00:05, 140.80it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1264 | Mem: 26.53MB, Util: 100%  global_step : 9264
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1265 | Mem: 26.53MB, Util: 100%  global_step : 9265
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1266 | Mem: 26.53MB, Util: 100%  global_step : 9266
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1267 | Mem: 26.53MB, Util: 100%  global_step : 9267
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1268 | Mem: 26.53MB, Util: 100%  global_step : 9268
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1269 | Mem: 26.53MB, Util: 100%  global_step : 9269
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1270 | Mem: 26.53MB, Util: 100%  global_step : 9270
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1271 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  64%|██████▍   | 1276/2000 [00:10<00:05, 141.87it/s]
[Rank 0] Train Epoch 4:  64%|██████▍   | 1286/2000 [00:10<00:05, 141.42it/s]
[Rank 2] Train Epoch 4:  64%|██████▍   | 1290/2000 [00:10<00:05, 140.42it/s]
[Rank 1] Train Epoch 4:  65%|██████▍   | 1291/2000 [00:10<00:05, 141.45it/s]
[Rank 0] Train Epoch 4:  65%|██████▌   | 1301/2000 [00:10<00:05, 137.07it/s]
[Rank 2] Train Epoch 4:  65%|██████▌   | 1305/2000 [00:10<00:05, 132.76it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1293 | Mem: 26.53MB, Util: 100%  global_step : 9293
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1294 | Mem: 26.53MB, Util: 100%  global_step : 9294
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1295 | Mem: 26.53MB, Util: 100%  global_step : 9295
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1296 | Mem: 26.53MB, Util: 100%  global_step : 9296
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1297 | Mem: 26.53MB, Util: 100%  global_step : 9297
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1298 | Mem: 26.53MB, Util: 100%  global_step : 9298
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1299 | Mem: 26.53MB, Util: 100%  global_step : 9299
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1300 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  65%|██████▌   | 1306/2000 [00:10<00:05, 136.66it/s]
[Rank 0] Train Epoch 4:  66%|██████▌   | 1315/2000 [00:10<00:04, 137.76it/s]
[Rank 2] Train Epoch 4:  66%|██████▌   | 1319/2000 [00:10<00:05, 134.51it/s]
[Rank 1] Train Epoch 4:  66%|██████▌   | 1320/2000 [00:10<00:04, 136.11it/s]
[Rank 0] Train Epoch 4:  66%|██████▋   | 1329/2000 [00:10<00:04, 137.07it/s]
[Rank 2] Train Epoch 4:  67%|██████▋   | 1333/2000 [00:10<00:04, 135.65it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1319 | Mem: 26.53MB, Util: 97%  global_step : 9319
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1320 | Mem: 26.53MB, Util: 97%  global_step : 9320
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1321 | Mem: 26.53MB, Util: 97%  global_step : 9321
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1322 | Mem: 26.53MB, Util: 97%  global_step : 9322
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1323 | Mem: 26.53MB, Util: 97%  global_step : 9323
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1324 | Mem: 26.53MB, Util: 97%  global_step : 9324
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1325 | Mem: 26.53MB, Util: 97%  global_step : 9325
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1326 | Mem: 26.53MB, Util: 97%  glo

[Rank 1] Train Epoch 4:  67%|██████▋   | 1334/2000 [00:10<00:04, 134.86it/s]
[Rank 0] Train Epoch 4:  67%|██████▋   | 1343/2000 [00:10<00:04, 137.78it/s]
[Rank 2] Train Epoch 4:  67%|██████▋   | 1347/2000 [00:10<00:04, 136.38it/s]
[Rank 1] Train Epoch 4:  67%|██████▋   | 1348/2000 [00:10<00:04, 134.65it/s]
[Rank 0] Train Epoch 4:  68%|██████▊   | 1358/2000 [00:10<00:04, 138.38it/s]
[Rank 2] Train Epoch 4:  68%|██████▊   | 1361/2000 [00:10<00:04, 137.11it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1348 | Mem: 26.53MB, Util: 100%  global_step : 9348
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1349 | Mem: 26.53MB, Util: 100%  global_step : 9349
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1350 | Mem: 26.53MB, Util: 100%  global_step : 9350
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1351 | Mem: 26.53MB, Util: 100%  global_step : 9351
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1352 | Mem: 26.53MB, Util: 100%  global_step : 9352
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1353 | Mem: 26.53MB, Util: 100%  global_step : 9353
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1354 | Mem: 26.53MB, Util: 100%  global_step : 9354
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1355 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  68%|██████▊   | 1362/2000 [00:10<00:04, 133.85it/s]
[Rank 0] Train Epoch 4:  69%|██████▊   | 1373/2000 [00:10<00:04, 139.62it/s]
[Rank 2] Train Epoch 4:  69%|██████▉   | 1376/2000 [00:10<00:04, 138.16it/s]
[Rank 1] Train Epoch 4:  69%|██████▉   | 1376/2000 [00:10<00:04, 134.16it/s]
[Rank 0] Train Epoch 4:  69%|██████▉   | 1387/2000 [00:10<00:04, 139.27it/s]
[Rank 2] Train Epoch 4:  70%|██████▉   | 1390/2000 [00:10<00:04, 137.69it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1377 | Mem: 26.53MB, Util: 100%  global_step : 9377
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1378 | Mem: 26.53MB, Util: 100%  global_step : 9378
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1379 | Mem: 26.53MB, Util: 100%  global_step : 9379
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1380 | Mem: 26.53MB, Util: 100%  global_step : 9380
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1381 | Mem: 26.53MB, Util: 100%  global_step : 9381
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1382 | Mem: 26.53MB, Util: 100%  global_step : 9382
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1383 | Mem: 26.53MB, Util: 100%  global_step : 9383
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1384 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  70%|██████▉   | 1390/2000 [00:10<00:04, 134.00it/s]
[Rank 1] Train Epoch 4:  70%|███████   | 1404/2000 [00:11<00:04, 132.90it/s]
[Rank 0] Train Epoch 4:  70%|███████   | 1401/2000 [00:11<00:04, 127.24it/s]
[Rank 2] Train Epoch 4:  70%|███████   | 1404/2000 [00:11<00:04, 126.78it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1402 | Mem: 26.53MB, Util: 100%  global_step : 9402
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1403 | Mem: 26.53MB, Util: 100%  global_step : 9403
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1404 | Mem: 26.53MB, Util: 100%  global_step : 9404
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1405 | Mem: 26.53MB, Util: 100%  global_step : 9405
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1406 | Mem: 26.53MB, Util: 100%  global_step : 9406
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1407 | Mem: 26.53MB, Util: 100%  global_step : 9407
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1408 | Mem: 26.53MB, Util: 100%  global_step : 9408
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1409 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  71%|███████   | 1418/2000 [00:11<00:04, 132.03it/s]
[Rank 0] Train Epoch 4:  71%|███████   | 1416/2000 [00:11<00:04, 131.06it/s]
[Rank 2] Train Epoch 4:  71%|███████   | 1419/2000 [00:11<00:04, 130.65it/s]
[Rank 1] Train Epoch 4:  72%|███████▏  | 1432/2000 [00:11<00:04, 132.54it/s]
[Rank 0] Train Epoch 4:  72%|███████▏  | 1430/2000 [00:11<00:04, 133.17it/s]
[Rank 2] Train Epoch 4:  72%|███████▏  | 1434/2000 [00:11<00:04, 132.25it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1432 | Mem: 26.53MB, Util: 96%  global_step : 9432
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1433 | Mem: 26.53MB, Util: 96%  global_step : 9433
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1434 | Mem: 26.53MB, Util: 96%  global_step : 9434
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1435 | Mem: 26.53MB, Util: 96%  global_step : 9435
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1436 | Mem: 26.53MB, Util: 96%  global_step : 9436
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1437 | Mem: 26.53MB, Util: 96%  global_step : 9437
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1438 | Mem: 26.53MB, Util: 96%  global_step : 9438
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1439 | Mem: 26.53MB, Util: 96%  glo

[Rank 1] Train Epoch 4:  72%|███████▏  | 1446/2000 [00:11<00:04, 130.15it/s]
[Rank 0] Train Epoch 4:  72%|███████▏  | 1444/2000 [00:11<00:04, 134.15it/s]
[Rank 2] Train Epoch 4:  72%|███████▎  | 1450/2000 [00:11<00:03, 138.00it/s]
[Rank 1] Train Epoch 4:  73%|███████▎  | 1460/2000 [00:11<00:04, 129.34it/s]
[Rank 0] Train Epoch 4:  73%|███████▎  | 1459/2000 [00:11<00:03, 136.38it/s]
[Rank 2] Train Epoch 4:  73%|███████▎  | 1465/2000 [00:11<00:03, 139.23it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1462 | Mem: 26.53MB, Util: 100%  global_step : 9462
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1463 | Mem: 26.53MB, Util: 100%  global_step : 9463
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1464 | Mem: 26.53MB, Util: 100%  global_step : 9464
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1465 | Mem: 26.53MB, Util: 100%  global_step : 9465
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1466 | Mem: 26.53MB, Util: 100%  global_step : 9466
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1467 | Mem: 26.53MB, Util: 100%  global_step : 9467
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1468 | Mem: 26.53MB, Util: 100%  global_step : 9468
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1469 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  74%|███████▎  | 1474/2000 [00:11<00:04, 130.48it/s]
[Rank 0] Train Epoch 4:  74%|███████▎  | 1473/2000 [00:11<00:03, 137.25it/s]
[Rank 2] Train Epoch 4:  74%|███████▍  | 1480/2000 [00:11<00:03, 139.86it/s]
[Rank 1] Train Epoch 4:  74%|███████▍  | 1488/2000 [00:11<00:03, 131.58it/s]
[Rank 0] Train Epoch 4:  74%|███████▍  | 1487/2000 [00:11<00:03, 137.37it/s]
[Rank 2] Train Epoch 4:  75%|███████▍  | 1499/2000 [00:11<00:03, 153.36it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1495 | Mem: 26.53MB, Util: 100%  global_step : 9495
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1496 | Mem: 26.53MB, Util: 100%  global_step : 9496
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1497 | Mem: 26.53MB, Util: 100%  global_step : 9497
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1498 | Mem: 26.53MB, Util: 100%  global_step : 9498
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1499 | Mem: 26.53MB, Util: 100%  global_step : 9499
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1490 | Mem: 26.53MB, Util: 64%  global_step : 9490
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1491 | Mem: 26.53MB, Util: 66%  global_step : 9491
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1492 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 4:  75%|███████▌  | 1502/2000 [00:11<00:03, 132.46it/s]
[Rank 0] Train Epoch 4:  75%|███████▌  | 1501/2000 [00:11<00:04, 121.94it/s]
[Rank 2] Train Epoch 4:  76%|███████▌  | 1515/2000 [00:11<00:03, 126.03it/s]
[Rank 1] Train Epoch 4:  76%|███████▌  | 1516/2000 [00:11<00:03, 133.01it/s]
[Rank 0] Train Epoch 4:  76%|███████▌  | 1515/2000 [00:11<00:03, 126.71it/s]
[Rank 2] Train Epoch 4:  77%|███████▋  | 1534/2000 [00:11<00:03, 141.62it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1517 | Mem: 26.53MB, Util: 100%  global_step : 9517
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1518 | Mem: 26.53MB, Util: 100%  global_step : 9518
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1519 | Mem: 26.53MB, Util: 100%  global_step : 9519
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1520 | Mem: 26.53MB, Util: 100%  global_step : 9520
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1521 | Mem: 26.53MB, Util: 100%  global_step : 9521
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1522 | Mem: 26.53MB, Util: 100%  global_step : 9522
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1523 | Mem: 26.53MB, Util: 99%  global_step : 9523
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1524 | Mem: 26.53MB, Util: 99

[Rank 1] Train Epoch 4:  76%|███████▋  | 1530/2000 [00:12<00:03, 133.56it/s]
[Rank 0] Train Epoch 4:  76%|███████▋  | 1530/2000 [00:12<00:03, 131.28it/s]
[Rank 2] Train Epoch 4:  78%|███████▊  | 1553/2000 [00:12<00:02, 153.88it/s]
[Rank 1] Train Epoch 4:  77%|███████▋  | 1544/2000 [00:12<00:03, 133.62it/s]
[Rank 0] Train Epoch 4:  77%|███████▋  | 1545/2000 [00:12<00:03, 134.71it/s]
[Rank 2] Train Epoch 4:  78%|███████▊  | 1570/2000 [00:12<00:02, 156.06it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1557 | Mem: 26.53MB, Util: 99%  global_step : 9557
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1558 | Mem: 26.53MB, Util: 99%  global_step : 9558
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1559 | Mem: 26.53MB, Util: 99%  global_step : 9559
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1560 | Mem: 26.53MB, Util: 99%  global_step : 9560
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1561 | Mem: 26.53MB, Util: 100%  global_step : 9561
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1562 | Mem: 26.53MB, Util: 100%  global_step : 9562
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1563 | Mem: 26.53MB, Util: 100%  global_step : 9563
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1564 | Mem: 26.53MB, Util: 100% 

[Rank 1] Train Epoch 4:  78%|███████▊  | 1558/2000 [00:12<00:03, 133.82it/s]
[Rank 0] Train Epoch 4:  78%|███████▊  | 1559/2000 [00:12<00:03, 135.52it/s]
[Rank 2] Train Epoch 4:  79%|███████▉  | 1587/2000 [00:12<00:02, 151.96it/s]
[Rank 1] Train Epoch 4:  79%|███████▊  | 1572/2000 [00:12<00:03, 134.63it/s]
[Rank 0] Train Epoch 4:  79%|███████▊  | 1573/2000 [00:12<00:03, 132.95it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1588 | Mem: 26.53MB, Util: 100%  global_step : 9588
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1589 | Mem: 26.53MB, Util: 100%  global_step : 9589
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1590 | Mem: 26.53MB, Util: 100%  global_step : 9590
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1591 | Mem: 26.53MB, Util: 100%  global_step : 9591
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1592 | Mem: 26.53MB, Util: 100%  global_step : 9592
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1593 | Mem: 26.53MB, Util: 100%  global_step : 9593
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1594 | Mem: 26.53MB, Util: 100%  global_step : 9594
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1595 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  79%|███████▉  | 1586/2000 [00:12<00:03, 133.84it/s]
[Rank 0] Train Epoch 4:  79%|███████▉  | 1587/2000 [00:12<00:03, 133.00it/s]
[Rank 1] Train Epoch 4:  80%|████████  | 1600/2000 [00:12<00:02, 133.78it/s]
[Rank 0] Train Epoch 4:  80%|████████  | 1601/2000 [00:12<00:03, 128.61it/s]
[Rank 2] Train Epoch 4:  80%|████████  | 1603/2000 [00:12<00:03, 107.63it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1600 | Mem: 26.53MB, Util: 100%  global_step : 9600
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1601 | Mem: 26.53MB, Util: 100%  global_step : 9601
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1602 | Mem: 26.53MB, Util: 100%  global_step : 9602
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1603 | Mem: 26.53MB, Util: 100%  global_step : 9603
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1604 | Mem: 26.53MB, Util: 100%  global_step : 9604
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1605 | Mem: 26.53MB, Util: 100%  global_step : 9605
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1606 | Mem: 26.53MB, Util: 100%  global_step : 9606
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1607 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  81%|████████  | 1614/2000 [00:12<00:02, 135.16it/s]
[Rank 0] Train Epoch 4:  81%|████████  | 1615/2000 [00:12<00:02, 130.52it/s]
[Rank 2] Train Epoch 4:  81%|████████  | 1618/2000 [00:12<00:03, 116.06it/s]
[Rank 1] Train Epoch 4:  81%|████████▏ | 1628/2000 [00:12<00:02, 135.20it/s]
[Rank 0] Train Epoch 4:  81%|████████▏ | 1629/2000 [00:12<00:02, 131.94it/s]
[Rank 2] Train Epoch 4:  82%|████████▏ | 1632/2000 [00:12<00:03, 121.03it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1626 | Mem: 26.53MB, Util: 100%  global_step : 9626
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1627 | Mem: 26.53MB, Util: 100%  global_step : 9627
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1628 | Mem: 26.53MB, Util: 100%  global_step : 9628
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1629 | Mem: 26.53MB, Util: 100%  global_step : 9629
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1630 | Mem: 26.53MB, Util: 100%  global_step : 9630
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1631 | Mem: 26.53MB, Util: 100%  global_step : 9631
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1632 | Mem: 26.53MB, Util: 95%  global_step : 9632
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1633 | Mem: 26.53MB, Util: 95

[Rank 1] Train Epoch 4:  82%|████████▏ | 1642/2000 [00:12<00:02, 136.06it/s]
[Rank 0] Train Epoch 4:  82%|████████▏ | 1643/2000 [00:12<00:02, 133.07it/s]
[Rank 2] Train Epoch 4:  83%|████████▎ | 1651/2000 [00:12<00:02, 136.95it/s]
[Rank 1] Train Epoch 4:  83%|████████▎ | 1656/2000 [00:12<00:02, 131.14it/s]
[Rank 0] Train Epoch 4:  83%|████████▎ | 1658/2000 [00:12<00:02, 136.05it/s]
[Rank 2] Train Epoch 4:  83%|████████▎ | 1668/2000 [00:12<00:02, 144.19it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1662 | Mem: 26.53MB, Util: 95%  global_step : 9662
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1663 | Mem: 26.53MB, Util: 95%  global_step : 9663
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1664 | Mem: 26.53MB, Util: 95%  global_step : 9664
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1665 | Mem: 26.53MB, Util: 95%  global_step : 9665
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1666 | Mem: 26.53MB, Util: 95%  global_step : 9666
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1667 | Mem: 26.53MB, Util: 100%  global_step : 9667
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1668 | Mem: 26.53MB, Util: 100%  global_step : 9668
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1669 | Mem: 26.53MB, Util: 100%  

[Rank 1] Train Epoch 4:  84%|████████▎ | 1670/2000 [00:13<00:02, 131.90it/s]
[Rank 0] Train Epoch 4:  84%|████████▎ | 1673/2000 [00:13<00:02, 137.06it/s]
[Rank 2] Train Epoch 4:  84%|████████▍ | 1684/2000 [00:13<00:02, 147.14it/s]
[Rank 1] Train Epoch 4:  84%|████████▍ | 1684/2000 [00:13<00:02, 131.97it/s]
[Rank 0] Train Epoch 4:  84%|████████▍ | 1688/2000 [00:13<00:02, 138.15it/s]
[Rank 2] Train Epoch 4:  85%|████████▌ | 1700/2000 [00:13<00:02, 147.18it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1693 | Mem: 26.53MB, Util: 100%  global_step : 9693
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1694 | Mem: 26.53MB, Util: 100%  global_step : 9694
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1695 | Mem: 26.53MB, Util: 100%  global_step : 9695
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1696 | Mem: 26.53MB, Util: 100%  global_step : 9696
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1697 | Mem: 26.53MB, Util: 100%  global_step : 9697
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1698 | Mem: 26.53MB, Util: 100%  global_step : 9698
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1699 | Mem: 26.53MB, Util: 100%  global_step : 9699
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1685 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 4:  85%|████████▍ | 1698/2000 [00:13<00:02, 133.03it/s]
[Rank 0] Train Epoch 4:  85%|████████▌ | 1702/2000 [00:13<00:02, 127.77it/s]
[Rank 1] Train Epoch 4:  86%|████████▌ | 1712/2000 [00:13<00:02, 130.66it/s]
[Rank 0] Train Epoch 4:  86%|████████▌ | 1716/2000 [00:13<00:02, 130.82it/s]
[Rank 2] Train Epoch 4:  86%|████████▌ | 1716/2000 [00:13<00:02, 113.49it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1708 | Mem: 26.53MB, Util: 100%  global_step : 9708
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1709 | Mem: 26.53MB, Util: 100%  global_step : 9709
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1710 | Mem: 26.53MB, Util: 100%  global_step : 9710
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1711 | Mem: 26.53MB, Util: 100%  global_step : 9711
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1712 | Mem: 26.53MB, Util: 100%  global_step : 9712
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1713 | Mem: 26.53MB, Util: 96%  global_step : 9713
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1714 | Mem: 26.53MB, Util: 96%  global_step : 9714
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1715 | Mem: 26.53MB, Util: 96%

[Rank 1] Train Epoch 4:  86%|████████▋ | 1726/2000 [00:13<00:02, 131.63it/s]
[Rank 0] Train Epoch 4:  87%|████████▋ | 1731/2000 [00:13<00:02, 134.25it/s]
[Rank 2] Train Epoch 4:  87%|████████▋ | 1732/2000 [00:13<00:02, 124.17it/s]
[Rank 1] Train Epoch 4:  87%|████████▋ | 1740/2000 [00:13<00:01, 132.56it/s]
[Rank 0] Train Epoch 4:  87%|████████▋ | 1746/2000 [00:13<00:01, 136.42it/s]
[Rank 2] Train Epoch 4:  87%|████████▋ | 1749/2000 [00:13<00:01, 133.46it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1741 | Mem: 26.53MB, Util: 96%  global_step : 9741
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1742 | Mem: 26.53MB, Util: 100%  global_step : 9742
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1743 | Mem: 26.53MB, Util: 100%  global_step : 9743
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1744 | Mem: 26.53MB, Util: 100%  global_step : 9744
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1745 | Mem: 26.53MB, Util: 100%  global_step : 9745
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1746 | Mem: 26.53MB, Util: 100%  global_step : 9746
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1747 | Mem: 26.53MB, Util: 100%  global_step : 9747
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1748 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 4:  88%|████████▊ | 1754/2000 [00:13<00:01, 132.63it/s]
[Rank 0] Train Epoch 4:  88%|████████▊ | 1760/2000 [00:13<00:01, 133.56it/s]
[Rank 2] Train Epoch 4:  88%|████████▊ | 1765/2000 [00:13<00:01, 138.81it/s]
[Rank 2] Train Epoch 4:  89%|████████▉ | 1780/2000 [00:13<00:01, 141.59it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1773 | Mem: 26.53MB, Util: 100%  global_step : 9773
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1774 | Mem: 26.53MB, Util: 100%  global_step : 9774
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1775 | Mem: 26.53MB, Util: 100%  global_step : 9775
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1776 | Mem: 26.53MB, Util: 100%  global_step : 9776
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1777 | Mem: 26.53MB, Util: 100%  global_step : 9777
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1778 | Mem: 26.53MB, Util: 100%  global_step : 9778
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1779 | Mem: 26.53MB, Util: 100%  global_step : 9779
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1780 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  88%|████████▊ | 1768/2000 [00:13<00:01, 131.77it/s]
[Rank 0] Train Epoch 4:  89%|████████▉ | 1775/2000 [00:13<00:01, 135.69it/s]
[Rank 0] Train Epoch 4:  89%|████████▉ | 1789/2000 [00:13<00:01, 136.60it/s]
[Rank 2] Train Epoch 4:  90%|████████▉ | 1796/2000 [00:13<00:01, 145.28it/s]
[Rank 1] Train Epoch 4:  89%|████████▉ | 1782/2000 [00:13<00:01, 132.23it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1800 | Mem: 26.53MB, Util: 100%  global_step : 9800
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1801 | Mem: 26.53MB, Util: 100%  global_step : 9801
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1802 | Mem: 26.53MB, Util: 100%  global_step : 9802
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1803 | Mem: 26.53MB, Util: 100%  global_step : 9803
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1804 | Mem: 26.53MB, Util: 100%  global_step : 9804
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1795 | Mem: 26.53MB, Util: 63%  global_step : 9795
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1796 | Mem: 26.53MB, Util: 63%  global_step : 9796
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 4, Batch 1797 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 4:  90%|████████▉ | 1796/2000 [00:14<00:01, 133.25it/s]
[Rank 1] Train Epoch 4:  90%|█████████ | 1810/2000 [00:14<00:01, 134.82it/s]
[Rank 0] Train Epoch 4:  90%|█████████ | 1803/2000 [00:14<00:01, 125.05it/s]
[Rank 2] Train Epoch 4:  91%|█████████ | 1811/2000 [00:14<00:01, 114.28it/s]
[Rank 1] Train Epoch 4:  91%|█████████▏| 1826/2000 [00:14<00:01, 140.36it/s]
[Rank 0] Train Epoch 4:  91%|█████████ | 1818/2000 [00:14<00:01, 131.05it/s]
[Rank 2] Train Epoch 4:  91%|█████████▏| 1828/2000 [00:14<00:01, 126.12it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1822 | Mem: 26.53MB, Util: 99%  global_step : 9822
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1823 | Mem: 26.53MB, Util: 99%  global_step : 9823
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1824 | Mem: 26.53MB, Util: 99%  global_step : 9824
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1825 | Mem: 26.53MB, Util: 99%  global_step : 9825
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1826 | Mem: 26.53MB, Util: 99%  global_step : 9826
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1827 | Mem: 26.53MB, Util: 99%  global_step : 9827
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1828 | Mem: 26.53MB, Util: 99%  global_step : 9828
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1829 | Mem: 26.53MB, Util: 99%  glo

[Rank 1] Train Epoch 4:  92%|█████████▏| 1842/2000 [00:14<00:01, 143.79it/s]
[Rank 0] Train Epoch 4:  92%|█████████▏| 1833/2000 [00:14<00:01, 134.20it/s]
[Rank 2] Train Epoch 4:  92%|█████████▏| 1845/2000 [00:14<00:01, 135.80it/s]
[Rank 1] Train Epoch 4:  93%|█████████▎| 1857/2000 [00:14<00:00, 144.78it/s]
[Rank 0] Train Epoch 4:  92%|█████████▏| 1848/2000 [00:14<00:01, 136.65it/s]
[Rank 2] Train Epoch 4:  93%|█████████▎| 1862/2000 [00:14<00:00, 142.90it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1856 | Mem: 26.53MB, Util: 100%  global_step : 9856
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1857 | Mem: 26.53MB, Util: 100%  global_step : 9857
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1858 | Mem: 26.53MB, Util: 100%  global_step : 9858
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1859 | Mem: 26.53MB, Util: 100%  global_step : 9859
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1860 | Mem: 26.53MB, Util: 100%  global_step : 9860
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1861 | Mem: 26.53MB, Util: 100%  global_step : 9861
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1862 | Mem: 26.53MB, Util: 100%  global_step : 9862
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1863 | Mem: 26.53MB, Util: 1

[Rank 1] Train Epoch 4:  94%|█████████▎| 1872/2000 [00:14<00:00, 146.19it/s]
[Rank 0] Train Epoch 4:  93%|█████████▎| 1863/2000 [00:14<00:00, 138.34it/s]
[Rank 2] Train Epoch 4:  94%|█████████▍| 1879/2000 [00:14<00:00, 148.06it/s]
[Rank 1] Train Epoch 4:  94%|█████████▍| 1887/2000 [00:14<00:00, 146.15it/s]
[Rank 0] Train Epoch 4:  94%|█████████▍| 1878/2000 [00:14<00:00, 140.76it/s]
[Rank 2] Train Epoch 4:  95%|█████████▍| 1896/2000 [00:14<00:00, 152.30it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1890 | Mem: 26.53MB, Util: 100%  global_step : 9890
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1891 | Mem: 26.53MB, Util: 100%  global_step : 9891
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1892 | Mem: 26.53MB, Util: 100%  global_step : 9892
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1893 | Mem: 26.53MB, Util: 100%  global_step : 9893
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1894 | Mem: 26.53MB, Util: 100%  global_step : 9894
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1895 | Mem: 26.53MB, Util: 100%  global_step : 9895
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1896 | Mem: 26.53MB, Util: 100%  global_step : 9896
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1897 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 4:  95%|█████████▍| 1893/2000 [00:14<00:00, 142.16it/s]
[Rank 1] Train Epoch 4:  95%|█████████▌| 1902/2000 [00:14<00:00, 137.62it/s]
[Rank 0] Train Epoch 4:  95%|█████████▌| 1908/2000 [00:14<00:00, 142.32it/s]
[Rank 2] Train Epoch 4:  96%|█████████▌| 1912/2000 [00:14<00:00, 127.02it/s]
[Rank 1] Train Epoch 4:  96%|█████████▌| 1917/2000 [00:14<00:00, 138.68it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1911 | Mem: 26.53MB, Util: 100%  global_step : 9911
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1912 | Mem: 26.53MB, Util: 100%  global_step : 9912
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1913 | Mem: 26.53MB, Util: 100%  global_step : 9913
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1914 | Mem: 26.53MB, Util: 100%  global_step : 9914
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1915 | Mem: 26.53MB, Util: 100%  global_step : 9915
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1916 | Mem: 26.53MB, Util: 100%  global_step : 9916
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1917 | Mem: 26.53MB, Util: 100%  global_step : 9917
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1918 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 4:  96%|█████████▌| 1923/2000 [00:14<00:00, 143.05it/s]
[Rank 2] Train Epoch 4:  96%|█████████▋| 1929/2000 [00:14<00:00, 135.73it/s]
[Rank 1] Train Epoch 4:  97%|█████████▋| 1932/2000 [00:14<00:00, 140.09it/s]
[Rank 0] Train Epoch 4:  97%|█████████▋| 1938/2000 [00:15<00:00, 143.02it/s]
[Rank 2] Train Epoch 4:  97%|█████████▋| 1946/2000 [00:15<00:00, 142.37it/s]
[Rank 1] Train Epoch 4:  97%|█████████▋| 1947/2000 [00:15<00:00, 140.39it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1945 | Mem: 26.53MB, Util: 97%  global_step : 9945
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1946 | Mem: 26.53MB, Util: 97%  global_step : 9946
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1947 | Mem: 26.53MB, Util: 97%  global_step : 9947
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1948 | Mem: 26.53MB, Util: 97%  global_step : 9948
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1949 | Mem: 26.53MB, Util: 97%  global_step : 9949
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1950 | Mem: 26.53MB, Util: 97%  global_step : 9950
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1951 | Mem: 26.53MB, Util: 97%  global_step : 9951
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1952 | Mem: 26.53MB, Util: 97%  glo

[Rank 0] Train Epoch 4:  98%|█████████▊| 1953/2000 [00:15<00:00, 142.43it/s]
[Rank 2] Train Epoch 4:  98%|█████████▊| 1963/2000 [00:15<00:00, 148.21it/s]
[Rank 1] Train Epoch 4:  98%|█████████▊| 1962/2000 [00:15<00:00, 141.19it/s]
[Rank 0] Train Epoch 4:  98%|█████████▊| 1968/2000 [00:15<00:00, 142.01it/s]
[Rank 2] Train Epoch 4:  99%|█████████▉| 1980/2000 [00:15<00:00, 152.16it/s]
[Rank 1] Train Epoch 4:  99%|█████████▉| 1977/2000 [00:15<00:00, 141.31it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1979 | Mem: 26.53MB, Util: 100%  global_step : 9979
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1980 | Mem: 26.53MB, Util: 100%  global_step : 9980
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1981 | Mem: 26.53MB, Util: 100%  global_step : 9981
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1982 | Mem: 26.53MB, Util: 100%  global_step : 9982
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1983 | Mem: 26.53MB, Util: 100%  global_step : 9983
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1984 | Mem: 26.53MB, Util: 100%  global_step : 9984
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1985 | Mem: 26.53MB, Util: 100%  global_step : 9985
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 4, Batch 1986 | Mem: 26.53MB, Util: 1

[Rank 0] Train Epoch 4:  99%|█████████▉| 1983/2000 [00:15<00:00, 139.64it/s]
[Rank 2] Train Epoch 4: 100%|██████████| 2000/2000 [00:15<00:00, 130.29it/s]
[Rank 2] Test Epoch 4:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Train Epoch 4: 100%|██████████| 2000/2000 [00:15<00:00, 129.64it/s]
[Rank 1] Test Epoch 4:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Train Epoch 4: 100%|██████████| 2000/2000 [00:15<00:00, 129.53it/s]
[Rank 0] Test Epoch 4:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Test Epoch 4:   2%|▏         | 7/334 [00:00<00:04, 69.02it/s]
[Rank 1] Test Epoch 4:  10%|▉         | 33/334 [00:00<00:00, 329.56it/s]
[Rank 0] Test Epoch 4:  10%|█         | 34/334 [00:00<00:00, 331.31it/s]
[Rank 2] Test Epoch 4:  13%|█▎        | 42/334 [00:00<00:01, 232.24it/s]
[Rank 1] Test Epoch 4:  20%|██        | 68/334 [00:00<00:00, 339.82it/s]
[Rank 0] Test Epoch 4:  21%|██        | 69/334 [00:00<00:00, 337.54it/s]
[Rank 2] Test Epoch 4:  23%|██▎       | 78/334 [00:00<00:00, 289.42

[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Rank 0] Epoch 4 | Loss: 0.3679, Acc: 0.8677, Model Checksum: 2d8dec44db1a651130f75494f8f662f1
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [ NodeId f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791 Rank 0] Epoch 4 | Loss: 0.3679, Acc: 0.8677, Model Checksum: 2d8dec44db1a651130f75494f8f662f1
[36m(TunerInternal pid=767)[0m 
[36m(TunerInternal pid=767)[0m Training finished iteration 5 at 2025-04-07 12:56:34. Total running time: 7min 43s
[36m(TunerInternal pid=767)[0m ╭────────────────────────────────────────────╮
[36m(TunerInternal pid=767)[0m │ Training result                            │
[36m(TunerInternal pid=767)[0m ├────────────────────────────────────────────┤
[36m(TunerInternal pid=767)[0m │ checkpoint_dir_name                        │
[36m(TunerInternal pid=767)[0m │ time_this_iter_s                  16.39632 │
[36m(TunerInternal pid=767)[0m │ time_total_s                     313.56648 │
[36m(TunerInte

[Rank 0] Train Epoch 5:   1%|▏         | 29/2000 [00:00<00:14, 140.15it/s]
[Rank 2] Train Epoch 5:   1%|▏         | 29/2000 [00:00<00:13, 145.00it/s]
[Rank 1] Train Epoch 5:   1%|▏         | 29/2000 [00:00<00:14, 137.26it/s]
[Rank 0] Train Epoch 5:   2%|▏         | 44/2000 [00:00<00:14, 138.90it/s]
[Rank 2] Train Epoch 5:   2%|▏         | 45/2000 [00:00<00:13, 148.57it/s]
[Rank 1] Train Epoch 5:   2%|▏         | 44/2000 [00:00<00:14, 139.07it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 14 | Mem: 26.53MB, Util: 9%  global_step : 10014
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 15 | Mem: 26.53MB, Util: 9%  global_step : 10015
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 16 | Mem: 26.53MB, Util: 9%  global_step : 10016
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 17 | Mem: 26.53MB, Util: 9%  global_step : 10017
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 18 | Mem: 26.53MB, Util: 9%  global_step : 10018
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 19 | Mem: 26.53MB, Util: 9%  global_step : 10019
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 20 | Mem: 26.53MB, Util: 9%  global_step : 10020
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 21 | Mem: 26.53MB, Util: 9%  global_step : 10021
[36m(Ra

[Rank 2] Train Epoch 5:   3%|▎         | 60/2000 [00:00<00:13, 149.03it/s]
[Rank 2] Train Epoch 5:   4%|▍         | 75/2000 [00:00<00:12, 149.00it/s]
[Rank 1] Train Epoch 5:   3%|▎         | 59/2000 [00:00<00:13, 140.03it/s]
[Rank 0] Train Epoch 5:   3%|▎         | 58/2000 [00:00<00:14, 133.64it/s]
[Rank 0] Train Epoch 5:   4%|▎         | 73/2000 [00:00<00:13, 138.69it/s]
[Rank 2] Train Epoch 5:   4%|▍         | 90/2000 [00:00<00:12, 149.07it/s]
[Rank 1] Train Epoch 5:   4%|▎         | 74/2000 [00:00<00:14, 134.96it/s]
[Rank 0] Train Epoch 5:   4%|▍         | 88/2000 [00:00<00:13, 139.19it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 73 | Mem: 26.53MB, Util: 64%  global_step : 10073
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 74 | Mem: 26.53MB, Util: 64%  global_step : 10074
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 75 | Mem: 26.53MB, Util: 64%  global_step : 10075
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 76 | Mem: 26.53MB, Util: 64%  global_step : 10076
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 77 | Mem: 26.53MB, Util: 64%  global_step : 10077
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 78 | Mem: 26.53MB, Util: 64%  global_step : 10078
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 79 | Mem: 26.53MB, Util: 64%  global_step : 10079
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 80 | Mem: 26.53MB, Util: 64%  global_step : 10080


[Rank 1] Train Epoch 5:   4%|▍         | 88/2000 [00:00<00:14, 131.36it/s]
[Rank 2] Train Epoch 5:   5%|▌         | 105/2000 [00:00<00:16, 112.55it/s]
[Rank 1] Train Epoch 5:   5%|▌         | 102/2000 [00:00<00:14, 130.52it/s]
[Rank 0] Train Epoch 5:   5%|▌         | 102/2000 [00:00<00:14, 130.31it/s]
[Rank 2] Train Epoch 5:   6%|▌         | 120/2000 [00:00<00:15, 122.11it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 100 | Mem: 26.53MB, Util: 84%  global_step : 10100
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 101 | Mem: 26.53MB, Util: 84%  global_step : 10101
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 102 | Mem: 26.53MB, Util: 84%  global_step : 10102
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 103 | Mem: 26.53MB, Util: 84%  global_step : 10103
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 104 | Mem: 26.53MB, Util: 84%  global_step : 10104
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 105 | Mem: 26.53MB, Util: 84%  global_step : 10105
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 106 | Mem: 26.53MB, Util: 84%  global_step : 10106
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 107 | Mem: 26.53MB, Util: 84%  global_step 

[Rank 1] Train Epoch 5:   6%|▌         | 116/2000 [00:00<00:14, 129.29it/s]
[Rank 0] Train Epoch 5:   6%|▌         | 116/2000 [00:00<00:14, 132.84it/s]
[Rank 2] Train Epoch 5:   7%|▋         | 135/2000 [00:01<00:14, 129.47it/s]
[Rank 1] Train Epoch 5:   6%|▋         | 129/2000 [00:00<00:14, 128.56it/s]
[Rank 0] Train Epoch 5:   7%|▋         | 131/2000 [00:00<00:13, 135.65it/s]
[Rank 2] Train Epoch 5:   8%|▊         | 150/2000 [00:01<00:13, 134.38it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 129 | Mem: 26.53MB, Util: 97%  global_step : 10129
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 130 | Mem: 26.53MB, Util: 97%  global_step : 10130
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 131 | Mem: 26.53MB, Util: 97%  global_step : 10131
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 132 | Mem: 26.53MB, Util: 97%  global_step : 10132
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 133 | Mem: 26.53MB, Util: 97%  global_step : 10133
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 134 | Mem: 26.53MB, Util: 97%  global_step : 10134
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 135 | Mem: 26.53MB, Util: 97%  global_step : 10135
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 136 | Mem: 26.53MB, Util: 97%  global_step 

[Rank 1] Train Epoch 5:   7%|▋         | 142/2000 [00:01<00:14, 128.60it/s]
[Rank 0] Train Epoch 5:   7%|▋         | 146/2000 [00:01<00:13, 137.63it/s]
[Rank 2] Train Epoch 5:   8%|▊         | 165/2000 [00:01<00:13, 138.74it/s]
[Rank 1] Train Epoch 5:   8%|▊         | 155/2000 [00:01<00:14, 128.62it/s]
[Rank 0] Train Epoch 5:   8%|▊         | 161/2000 [00:01<00:13, 138.91it/s]
[Rank 2] Train Epoch 5:   9%|▉         | 180/2000 [00:01<00:12, 141.86it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 159 | Mem: 26.53MB, Util: 100%  global_step : 10159
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 160 | Mem: 26.53MB, Util: 100%  global_step : 10160
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 161 | Mem: 26.53MB, Util: 100%  global_step : 10161
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 162 | Mem: 26.53MB, Util: 100%  global_step : 10162
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 163 | Mem: 26.53MB, Util: 100%  global_step : 10163
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 164 | Mem: 26.53MB, Util: 100%  global_step : 10164
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 165 | Mem: 26.53MB, Util: 100%  global_step : 10165
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 166 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 5:   8%|▊         | 168/2000 [00:01<00:14, 128.51it/s]
[Rank 0] Train Epoch 5:   9%|▉         | 176/2000 [00:01<00:12, 140.43it/s]
[Rank 2] Train Epoch 5:  10%|▉         | 196/2000 [00:01<00:12, 144.43it/s]
[Rank 1] Train Epoch 5:   9%|▉         | 181/2000 [00:01<00:14, 128.61it/s]
[Rank 0] Train Epoch 5:  10%|▉         | 191/2000 [00:01<00:12, 141.67it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 189 | Mem: 26.53MB, Util: 100%  global_step : 10189
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 190 | Mem: 26.53MB, Util: 100%  global_step : 10190
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 191 | Mem: 26.53MB, Util: 100%  global_step : 10191
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 192 | Mem: 26.53MB, Util: 100%  global_step : 10192
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 193 | Mem: 26.53MB, Util: 100%  global_step : 10193
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 194 | Mem: 26.53MB, Util: 100%  global_step : 10194
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 195 | Mem: 26.53MB, Util: 100%  global_step : 10195
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 196 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 5:  10%|▉         | 194/2000 [00:01<00:14, 128.62it/s]
[Rank 0] Train Epoch 5:  10%|█         | 206/2000 [00:01<00:15, 115.93it/s]
[Rank 2] Train Epoch 5:  11%|█         | 211/2000 [00:01<00:16, 110.41it/s]
[Rank 1] Train Epoch 5:  10%|█         | 207/2000 [00:01<00:13, 128.51it/s]
[Rank 0] Train Epoch 5:  11%|█         | 221/2000 [00:01<00:14, 123.10it/s]
[Rank 2] Train Epoch 5:  11%|█▏        | 227/2000 [00:01<00:14, 120.46it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 208 | Mem: 26.53MB, Util: 100%  global_step : 10208
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 209 | Mem: 26.53MB, Util: 100%  global_step : 10209
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 210 | Mem: 26.53MB, Util: 100%  global_step : 10210
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 211 | Mem: 26.53MB, Util: 100%  global_step : 10211
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 212 | Mem: 26.53MB, Util: 100%  global_step : 10212
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 213 | Mem: 26.53MB, Util: 100%  global_step : 10213
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 214 | Mem: 26.53MB, Util: 100%  global_step : 10214
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 215 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 5:  11%|█         | 220/2000 [00:01<00:14, 125.85it/s]
[Rank 0] Train Epoch 5:  12%|█▏        | 236/2000 [00:01<00:13, 128.40it/s]
[Rank 2] Train Epoch 5:  12%|█▏        | 243/2000 [00:01<00:13, 128.30it/s]
[Rank 1] Train Epoch 5:  12%|█▏        | 234/2000 [00:01<00:13, 129.53it/s]
[Rank 0] Train Epoch 5:  13%|█▎        | 251/2000 [00:01<00:13, 133.27it/s]
[Rank 2] Train Epoch 5:  13%|█▎        | 259/2000 [00:01<00:12, 134.49it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 238 | Mem: 26.53MB, Util: 97%  global_step : 10238
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 239 | Mem: 26.53MB, Util: 97%  global_step : 10239
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 240 | Mem: 26.53MB, Util: 97%  global_step : 10240
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 241 | Mem: 26.53MB, Util: 97%  global_step : 10241
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 242 | Mem: 26.53MB, Util: 97%  global_step : 10242
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 243 | Mem: 26.53MB, Util: 97%  global_step : 10243
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 244 | Mem: 26.53MB, Util: 97%  global_step : 10244
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 245 | Mem: 26.53MB, Util: 97%  global_step 

[Rank 1] Train Epoch 5:  12%|█▏        | 249/2000 [00:01<00:13, 133.55it/s]
[Rank 0] Train Epoch 5:  13%|█▎        | 266/2000 [00:01<00:12, 136.87it/s]
[Rank 2] Train Epoch 5:  14%|█▎        | 274/2000 [00:02<00:12, 138.34it/s]
[Rank 1] Train Epoch 5:  13%|█▎        | 264/2000 [00:02<00:12, 135.81it/s]
[Rank 0] Train Epoch 5:  14%|█▍        | 281/2000 [00:02<00:12, 139.93it/s]
[Rank 2] Train Epoch 5:  14%|█▍        | 289/2000 [00:02<00:12, 141.12it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 269 | Mem: 26.53MB, Util: 100%  global_step : 10269
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 270 | Mem: 26.53MB, Util: 100%  global_step : 10270
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 271 | Mem: 26.53MB, Util: 100%  global_step : 10271
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 272 | Mem: 26.53MB, Util: 100%  global_step : 10272
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 273 | Mem: 26.53MB, Util: 100%  global_step : 10273
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 274 | Mem: 26.53MB, Util: 100%  global_step : 10274
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 275 | Mem: 26.53MB, Util: 100%  global_step : 10275
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 276 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 5:  14%|█▍        | 279/2000 [00:02<00:12, 137.58it/s]
[Rank 0] Train Epoch 5:  15%|█▍        | 296/2000 [00:02<00:12, 141.28it/s]
[Rank 1] Train Epoch 5:  15%|█▍        | 295/2000 [00:02<00:12, 141.95it/s]
[Rank 0] Train Epoch 5:  16%|█▌        | 311/2000 [00:02<00:12, 133.70it/s]
[Rank 2] Train Epoch 5:  15%|█▌        | 304/2000 [00:02<00:13, 127.10it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 300 | Mem: 26.53MB, Util: 100%  global_step : 10300
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 301 | Mem: 26.53MB, Util: 100%  global_step : 10301
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 302 | Mem: 26.53MB, Util: 100%  global_step : 10302
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 303 | Mem: 26.53MB, Util: 100%  global_step : 10303
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 304 | Mem: 26.53MB, Util: 100%  global_step : 10304
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 305 | Mem: 26.53MB, Util: 100%  global_step : 10305
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 306 | Mem: 26.53MB, Util: 100%  global_step : 10306
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 307 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 5:  16%|█▌        | 313/2000 [00:02<00:11, 151.82it/s]
[Rank 0] Train Epoch 5:  16%|█▋        | 326/2000 [00:02<00:12, 136.35it/s]
[Rank 2] Train Epoch 5:  16%|█▌        | 320/2000 [00:02<00:12, 133.68it/s]
[Rank 1] Train Epoch 5:  17%|█▋        | 331/2000 [00:02<00:10, 159.07it/s]
[Rank 0] Train Epoch 5:  17%|█▋        | 341/2000 [00:02<00:11, 138.92it/s]
[Rank 2] Train Epoch 5:  17%|█▋        | 335/2000 [00:02<00:12, 137.53it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 327 | Mem: 26.53MB, Util: 100%  global_step : 10327
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 328 | Mem: 26.53MB, Util: 100%  global_step : 10328
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 329 | Mem: 26.53MB, Util: 100%  global_step : 10329
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 330 | Mem: 26.53MB, Util: 100%  global_step : 10330
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 331 | Mem: 26.53MB, Util: 100%  global_step : 10331
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 332 | Mem: 26.53MB, Util: 71%  global_step : 10332
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 333 | Mem: 26.53MB, Util: 71%  global_step : 10333
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 334 | Mem: 26.53MB, Util: 71%  global_

[Rank 1] Train Epoch 5:  17%|█▋        | 349/2000 [00:02<00:10, 163.61it/s]
[Rank 0] Train Epoch 5:  18%|█▊        | 356/2000 [00:02<00:11, 140.52it/s]
[Rank 2] Train Epoch 5:  18%|█▊        | 351/2000 [00:02<00:11, 141.50it/s]
[Rank 1] Train Epoch 5:  18%|█▊        | 367/2000 [00:02<00:09, 167.39it/s]
[Rank 0] Train Epoch 5:  19%|█▊        | 371/2000 [00:02<00:11, 142.53it/s]
[Rank 2] Train Epoch 5:  18%|█▊        | 366/2000 [00:02<00:11, 143.40it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 357 | Mem: 26.53MB, Util: 71%  global_step : 10357
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 358 | Mem: 26.53MB, Util: 71%  global_step : 10358
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 359 | Mem: 26.53MB, Util: 71%  global_step : 10359
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 360 | Mem: 26.53MB, Util: 71%  global_step : 10360
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 361 | Mem: 26.53MB, Util: 71%  global_step : 10361
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 362 | Mem: 26.53MB, Util: 71%  global_step : 10362
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 363 | Mem: 26.53MB, Util: 71%  global_step : 10363
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 364 | Mem: 26.53MB, Util: 71%  global_step 

[Rank 1] Train Epoch 5:  19%|█▉        | 385/2000 [00:02<00:09, 170.62it/s]
[Rank 0] Train Epoch 5:  19%|█▉        | 386/2000 [00:02<00:11, 143.47it/s]
[Rank 2] Train Epoch 5:  19%|█▉        | 382/2000 [00:02<00:11, 145.59it/s]
[Rank 0] Train Epoch 5:  20%|██        | 401/2000 [00:02<00:11, 143.63it/s]
[Rank 2] Train Epoch 5:  20%|█▉        | 398/2000 [00:02<00:10, 146.95it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 388 | Mem: 26.53MB, Util: 71%  global_step : 10388
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 389 | Mem: 26.53MB, Util: 71%  global_step : 10389
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 390 | Mem: 26.53MB, Util: 71%  global_step : 10390
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 391 | Mem: 26.53MB, Util: 73%  global_step : 10391
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 392 | Mem: 26.53MB, Util: 73%  global_step : 10392
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 393 | Mem: 26.53MB, Util: 73%  global_step : 10393
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 394 | Mem: 26.53MB, Util: 73%  global_step : 10394
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 395 | Mem: 26.53MB, Util: 73%  global_step 

[Rank 1] Train Epoch 5:  20%|██        | 403/2000 [00:02<00:12, 127.05it/s]
[Rank 0] Train Epoch 5:  21%|██        | 416/2000 [00:03<00:10, 144.11it/s]
[Rank 2] Train Epoch 5:  21%|██        | 413/2000 [00:03<00:11, 138.90it/s]
[Rank 1] Train Epoch 5:  21%|██        | 421/2000 [00:03<00:11, 138.62it/s]
[Rank 0] Train Epoch 5:  22%|██▏       | 431/2000 [00:03<00:10, 145.41it/s]
[Rank 2] Train Epoch 5:  21%|██▏       | 428/2000 [00:03<00:11, 141.63it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 418 | Mem: 26.53MB, Util: 73%  global_step : 10418
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 419 | Mem: 26.53MB, Util: 73%  global_step : 10419
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 420 | Mem: 26.53MB, Util: 73%  global_step : 10420
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 421 | Mem: 26.53MB, Util: 73%  global_step : 10421
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 422 | Mem: 26.53MB, Util: 73%  global_step : 10422
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 423 | Mem: 26.53MB, Util: 73%  global_step : 10423
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 424 | Mem: 26.53MB, Util: 71%  global_step : 10424
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 425 | Mem: 26.53MB, Util: 71%  global_step 

[Rank 1] Train Epoch 5:  22%|██▏       | 439/2000 [00:03<00:10, 147.87it/s]
[Rank 0] Train Epoch 5:  22%|██▏       | 447/2000 [00:03<00:10, 146.88it/s]
[Rank 2] Train Epoch 5:  22%|██▏       | 443/2000 [00:03<00:10, 143.64it/s]
[Rank 1] Train Epoch 5:  23%|██▎       | 457/2000 [00:03<00:09, 155.67it/s]
[Rank 0] Train Epoch 5:  23%|██▎       | 463/2000 [00:03<00:10, 147.88it/s]
[Rank 2] Train Epoch 5:  23%|██▎       | 459/2000 [00:03<00:10, 145.66it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 449 | Mem: 26.53MB, Util: 71%  global_step : 10449
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 450 | Mem: 26.53MB, Util: 71%  global_step : 10450
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 451 | Mem: 26.53MB, Util: 71%  global_step : 10451
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 452 | Mem: 26.53MB, Util: 71%  global_step : 10452
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 453 | Mem: 26.53MB, Util: 71%  global_step : 10453
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 454 | Mem: 26.53MB, Util: 71%  global_step : 10454
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 455 | Mem: 26.53MB, Util: 72%  global_step : 10455
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 456 | Mem: 26.53MB, Util: 72%  global_step 

[Rank 1] Train Epoch 5:  24%|██▍       | 475/2000 [00:03<00:09, 160.81it/s]
[Rank 0] Train Epoch 5:  24%|██▍       | 478/2000 [00:03<00:10, 147.39it/s]
[Rank 2] Train Epoch 5:  24%|██▎       | 474/2000 [00:03<00:10, 146.33it/s]
[Rank 1] Train Epoch 5:  25%|██▍       | 493/2000 [00:03<00:09, 164.50it/s]
[Rank 0] Train Epoch 5:  25%|██▍       | 493/2000 [00:03<00:10, 146.75it/s]
[Rank 2] Train Epoch 5:  24%|██▍       | 489/2000 [00:03<00:10, 146.96it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 480 | Mem: 26.53MB, Util: 72%  global_step : 10480
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 481 | Mem: 26.53MB, Util: 72%  global_step : 10481
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 482 | Mem: 26.53MB, Util: 72%  global_step : 10482
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 483 | Mem: 26.53MB, Util: 72%  global_step : 10483
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 484 | Mem: 26.53MB, Util: 72%  global_step : 10484
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 485 | Mem: 26.53MB, Util: 73%  global_step : 10485
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 486 | Mem: 26.53MB, Util: 73%  global_step : 10486
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 487 | Mem: 26.53MB, Util: 73%  global_step 

[Rank 0] Train Epoch 5:  25%|██▌       | 508/2000 [00:03<00:10, 147.16it/s]
[Rank 2] Train Epoch 5:  25%|██▌       | 504/2000 [00:03<00:10, 145.79it/s]
[Rank 1] Train Epoch 5:  26%|██▌       | 510/2000 [00:03<00:11, 129.10it/s]
[Rank 0] Train Epoch 5:  26%|██▌       | 523/2000 [00:03<00:10, 146.95it/s]
[Rank 2] Train Epoch 5:  26%|██▌       | 519/2000 [00:03<00:10, 144.21it/s]
[Rank 1] Train Epoch 5:  26%|██▋       | 528/2000 [00:03<00:10, 140.17it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 510 | Mem: 26.53MB, Util: 73%  global_step : 10510
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 511 | Mem: 26.53MB, Util: 73%  global_step : 10511
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 512 | Mem: 26.53MB, Util: 73%  global_step : 10512
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 513 | Mem: 26.53MB, Util: 73%  global_step : 10513
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 514 | Mem: 26.53MB, Util: 73%  global_step : 10514
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 515 | Mem: 26.53MB, Util: 73%  global_step : 10515
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 516 | Mem: 26.53MB, Util: 70%  global_step : 10516
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 517 | Mem: 26.53MB, Util: 70%  global_step 

[Rank 0] Train Epoch 5:  27%|██▋       | 538/2000 [00:03<00:09, 146.70it/s]
[Rank 2] Train Epoch 5:  27%|██▋       | 534/2000 [00:03<00:10, 145.28it/s]
[Rank 1] Train Epoch 5:  27%|██▋       | 544/2000 [00:03<00:10, 138.08it/s]
[Rank 0] Train Epoch 5:  28%|██▊       | 554/2000 [00:03<00:09, 147.99it/s]
[Rank 2] Train Epoch 5:  28%|██▊       | 550/2000 [00:03<00:09, 149.50it/s]
[Rank 1] Train Epoch 5:  28%|██▊       | 559/2000 [00:04<00:10, 135.82it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 541 | Mem: 26.53MB, Util: 70%  global_step : 10541
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 542 | Mem: 26.53MB, Util: 70%  global_step : 10542
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 543 | Mem: 26.53MB, Util: 91%  global_step : 10543
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 544 | Mem: 26.53MB, Util: 91%  global_step : 10544
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 545 | Mem: 26.53MB, Util: 91%  global_step : 10545
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 546 | Mem: 26.53MB, Util: 91%  global_step : 10546
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 547 | Mem: 26.53MB, Util: 91%  global_step : 10547
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 548 | Mem: 26.53MB, Util: 91%  global_step 

[Rank 0] Train Epoch 5:  28%|██▊       | 569/2000 [00:04<00:09, 147.66it/s]
[Rank 2] Train Epoch 5:  28%|██▊       | 566/2000 [00:04<00:09, 151.20it/s]
[Rank 1] Train Epoch 5:  29%|██▊       | 574/2000 [00:04<00:10, 133.97it/s]
[Rank 0] Train Epoch 5:  29%|██▉       | 584/2000 [00:04<00:09, 146.81it/s]
[Rank 2] Train Epoch 5:  29%|██▉       | 582/2000 [00:04<00:09, 149.72it/s]
[Rank 1] Train Epoch 5:  29%|██▉       | 588/2000 [00:04<00:10, 132.61it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 572 | Mem: 26.53MB, Util: 91%  global_step : 10572
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 573 | Mem: 26.53MB, Util: 91%  global_step : 10573
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 574 | Mem: 26.53MB, Util: 79%  global_step : 10574
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 575 | Mem: 26.53MB, Util: 79%  global_step : 10575
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 576 | Mem: 26.53MB, Util: 79%  global_step : 10576
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 577 | Mem: 26.53MB, Util: 79%  global_step : 10577
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 578 | Mem: 26.53MB, Util: 79%  global_step : 10578
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 579 | Mem: 26.53MB, Util: 79%  global_step 

[Rank 0] Train Epoch 5:  30%|██▉       | 599/2000 [00:04<00:09, 144.54it/s]
[Rank 2] Train Epoch 5:  30%|██▉       | 599/2000 [00:04<00:09, 153.61it/s]
[Rank 1] Train Epoch 5:  30%|███       | 602/2000 [00:04<00:10, 131.35it/s]
[Rank 2] Train Epoch 5:  31%|███       | 615/2000 [00:04<00:10, 133.38it/s]
[Rank 1] Train Epoch 5:  31%|███       | 616/2000 [00:04<00:10, 130.79it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 600 | Mem: 26.53MB, Util: 100%  global_step : 10600
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 601 | Mem: 26.53MB, Util: 100%  global_step : 10601
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 602 | Mem: 26.53MB, Util: 100%  global_step : 10602
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 603 | Mem: 26.53MB, Util: 100%  global_step : 10603
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 604 | Mem: 26.53MB, Util: 100%  global_step : 10604
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 605 | Mem: 26.53MB, Util: 100%  global_step : 10605
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 606 | Mem: 26.53MB, Util: 100%  global_step : 10606
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 607 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  31%|███       | 614/2000 [00:04<00:10, 133.01it/s]
[Rank 2] Train Epoch 5:  32%|███▏      | 631/2000 [00:04<00:09, 140.29it/s]
[Rank 1] Train Epoch 5:  32%|███▏      | 630/2000 [00:04<00:10, 130.24it/s]
[Rank 0] Train Epoch 5:  31%|███▏      | 628/2000 [00:04<00:10, 133.52it/s]
[Rank 2] Train Epoch 5:  32%|███▏      | 648/2000 [00:04<00:09, 146.81it/s]
[Rank 1] Train Epoch 5:  32%|███▏      | 644/2000 [00:04<00:10, 129.61it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 626 | Mem: 26.53MB, Util: 100%  global_step : 10626
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 627 | Mem: 26.53MB, Util: 98%  global_step : 10627
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 628 | Mem: 26.53MB, Util: 98%  global_step : 10628
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 629 | Mem: 26.53MB, Util: 98%  global_step : 10629
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 630 | Mem: 26.53MB, Util: 98%  global_step : 10630
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 631 | Mem: 26.53MB, Util: 98%  global_step : 10631
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 632 | Mem: 26.53MB, Util: 98%  global_step : 10632
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 633 | Mem: 26.53MB, Util: 98%  global_step

[Rank 0] Train Epoch 5:  32%|███▏      | 644/2000 [00:04<00:09, 139.66it/s]
[Rank 2] Train Epoch 5:  33%|███▎      | 665/2000 [00:04<00:08, 151.13it/s]
[Rank 1] Train Epoch 5:  33%|███▎      | 658/2000 [00:04<00:10, 128.83it/s]
[Rank 0] Train Epoch 5:  33%|███▎      | 659/2000 [00:04<00:09, 139.40it/s]
[Rank 2] Train Epoch 5:  34%|███▍      | 682/2000 [00:04<00:08, 155.55it/s]
[Rank 1] Train Epoch 5:  34%|███▎      | 671/2000 [00:04<00:10, 128.35it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 657 | Mem: 26.53MB, Util: 100%  global_step : 10657
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 658 | Mem: 26.53MB, Util: 100%  global_step : 10658
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 659 | Mem: 26.53MB, Util: 100%  global_step : 10659
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 660 | Mem: 26.53MB, Util: 100%  global_step : 10660
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 661 | Mem: 26.53MB, Util: 100%  global_step : 10661
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 662 | Mem: 26.53MB, Util: 100%  global_step : 10662
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 663 | Mem: 26.53MB, Util: 100%  global_step : 10663
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 664 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  34%|███▎      | 674/2000 [00:04<00:09, 139.29it/s]
[Rank 2] Train Epoch 5:  35%|███▍      | 699/2000 [00:04<00:08, 156.53it/s]
[Rank 1] Train Epoch 5:  34%|███▍      | 685/2000 [00:04<00:10, 128.86it/s]
[Rank 0] Train Epoch 5:  34%|███▍      | 688/2000 [00:04<00:09, 138.91it/s]
[Rank 1] Train Epoch 5:  35%|███▍      | 698/2000 [00:05<00:10, 128.57it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 686 | Mem: 26.53MB, Util: 100%  global_step : 10686
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 687 | Mem: 26.53MB, Util: 100%  global_step : 10687
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 688 | Mem: 26.53MB, Util: 100%  global_step : 10688
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 689 | Mem: 26.53MB, Util: 100%  global_step : 10689
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 690 | Mem: 26.53MB, Util: 100%  global_step : 10690
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 691 | Mem: 26.53MB, Util: 100%  global_step : 10691
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 692 | Mem: 26.53MB, Util: 100%  global_step : 10692
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 693 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  35%|███▌      | 702/2000 [00:05<00:11, 115.83it/s]
[Rank 1] Train Epoch 5:  36%|███▌      | 711/2000 [00:05<00:10, 126.46it/s]
[Rank 0] Train Epoch 5:  36%|███▌      | 717/2000 [00:05<00:10, 123.31it/s]
[Rank 2] Train Epoch 5:  36%|███▌      | 715/2000 [00:05<00:11, 108.12it/s]
[Rank 1] Train Epoch 5:  36%|███▋      | 726/2000 [00:05<00:09, 131.12it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 706 | Mem: 26.53MB, Util: 100%  global_step : 10706
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 707 | Mem: 26.53MB, Util: 100%  global_step : 10707
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 708 | Mem: 26.53MB, Util: 100%  global_step : 10708
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 709 | Mem: 26.53MB, Util: 100%  global_step : 10709
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 710 | Mem: 26.53MB, Util: 100%  global_step : 10710
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 711 | Mem: 26.53MB, Util: 100%  global_step : 10711
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 712 | Mem: 26.53MB, Util: 100%  global_step : 10712
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 713 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  37%|███▋      | 732/2000 [00:05<00:09, 129.24it/s]
[Rank 2] Train Epoch 5:  37%|███▋      | 732/2000 [00:05<00:10, 121.08it/s]
[Rank 1] Train Epoch 5:  37%|███▋      | 741/2000 [00:05<00:09, 134.95it/s]
[Rank 0] Train Epoch 5:  37%|███▋      | 747/2000 [00:05<00:09, 132.76it/s]
[Rank 2] Train Epoch 5:  37%|███▋      | 748/2000 [00:05<00:09, 130.17it/s]
[Rank 1] Train Epoch 5:  38%|███▊      | 756/2000 [00:05<00:09, 137.13it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 736 | Mem: 26.53MB, Util: 97%  global_step : 10736
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 737 | Mem: 26.53MB, Util: 97%  global_step : 10737
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 738 | Mem: 26.53MB, Util: 97%  global_step : 10738
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 739 | Mem: 26.53MB, Util: 97%  global_step : 10739
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 740 | Mem: 26.53MB, Util: 97%  global_step : 10740
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 741 | Mem: 26.53MB, Util: 97%  global_step : 10741
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 742 | Mem: 26.53MB, Util: 97%  global_step : 10742
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 743 | Mem: 26.53MB, Util: 97%  global_step 

[Rank 0] Train Epoch 5:  38%|███▊      | 762/2000 [00:05<00:09, 136.19it/s]
[Rank 2] Train Epoch 5:  38%|███▊      | 765/2000 [00:05<00:08, 138.49it/s]
[Rank 1] Train Epoch 5:  39%|███▊      | 771/2000 [00:05<00:08, 139.07it/s]
[Rank 0] Train Epoch 5:  39%|███▉      | 777/2000 [00:05<00:08, 138.15it/s]
[Rank 2] Train Epoch 5:  39%|███▉      | 782/2000 [00:05<00:08, 144.73it/s]
[Rank 1] Train Epoch 5:  39%|███▉      | 786/2000 [00:05<00:08, 140.13it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 766 | Mem: 26.53MB, Util: 100%  global_step : 10766
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 767 | Mem: 26.53MB, Util: 100%  global_step : 10767
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 768 | Mem: 26.53MB, Util: 100%  global_step : 10768
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 769 | Mem: 26.53MB, Util: 100%  global_step : 10769
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 770 | Mem: 26.53MB, Util: 100%  global_step : 10770
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 771 | Mem: 26.53MB, Util: 100%  global_step : 10771
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 772 | Mem: 26.53MB, Util: 100%  global_step : 10772
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 773 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  40%|███▉      | 792/2000 [00:05<00:08, 139.37it/s]
[Rank 2] Train Epoch 5:  40%|███▉      | 799/2000 [00:05<00:08, 149.83it/s]
[Rank 1] Train Epoch 5:  40%|████      | 801/2000 [00:05<00:08, 140.77it/s]
[Rank 0] Train Epoch 5:  40%|████      | 807/2000 [00:05<00:08, 133.74it/s]
[Rank 1] Train Epoch 5:  41%|████      | 816/2000 [00:05<00:08, 141.45it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 796 | Mem: 26.53MB, Util: 100%  global_step : 10796
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 797 | Mem: 26.53MB, Util: 100%  global_step : 10797
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 798 | Mem: 26.53MB, Util: 100%  global_step : 10798
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 799 | Mem: 26.53MB, Util: 100%  global_step : 10799
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 800 | Mem: 26.53MB, Util: 100%  global_step : 10800
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 801 | Mem: 26.53MB, Util: 100%  global_step : 10801
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 802 | Mem: 26.53MB, Util: 100%  global_step : 10802
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 803 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  41%|████      | 821/2000 [00:05<00:08, 134.92it/s]
[Rank 2] Train Epoch 5:  41%|████      | 815/2000 [00:05<00:10, 113.81it/s]
[Rank 1] Train Epoch 5:  42%|████▏     | 831/2000 [00:06<00:08, 141.16it/s]
[Rank 0] Train Epoch 5:  42%|████▏     | 835/2000 [00:06<00:08, 132.92it/s]
[Rank 2] Train Epoch 5:  42%|████▏     | 830/2000 [00:06<00:09, 120.19it/s]
[Rank 1] Train Epoch 5:  42%|████▏     | 846/2000 [00:06<00:08, 141.81it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 823 | Mem: 26.53MB, Util: 98%  global_step : 10823
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 824 | Mem: 26.53MB, Util: 98%  global_step : 10824
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 825 | Mem: 26.53MB, Util: 98%  global_step : 10825
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 826 | Mem: 26.53MB, Util: 98%  global_step : 10826
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 827 | Mem: 26.53MB, Util: 98%  global_step : 10827
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 828 | Mem: 26.53MB, Util: 98%  global_step : 10828
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 829 | Mem: 26.53MB, Util: 98%  global_step : 10829
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 830 | Mem: 26.53MB, Util: 98%  global_step 

[Rank 0] Train Epoch 5:  42%|████▏     | 849/2000 [00:06<00:08, 134.35it/s]
[Rank 2] Train Epoch 5:  42%|████▏     | 845/2000 [00:06<00:09, 125.44it/s]
[Rank 1] Train Epoch 5:  43%|████▎     | 861/2000 [00:06<00:08, 141.91it/s]
[Rank 0] Train Epoch 5:  43%|████▎     | 864/2000 [00:06<00:08, 135.88it/s]
[Rank 2] Train Epoch 5:  43%|████▎     | 859/2000 [00:06<00:08, 127.93it/s]
[Rank 1] Train Epoch 5:  44%|████▍     | 876/2000 [00:06<00:07, 142.32it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 850 | Mem: 26.53MB, Util: 99%  global_step : 10850
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 851 | Mem: 26.53MB, Util: 99%  global_step : 10851
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 852 | Mem: 26.53MB, Util: 99%  global_step : 10852
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 853 | Mem: 26.53MB, Util: 99%  global_step : 10853
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 854 | Mem: 26.53MB, Util: 99%  global_step : 10854
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 855 | Mem: 26.53MB, Util: 99%  global_step : 10855
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 856 | Mem: 26.53MB, Util: 99%  global_step : 10856
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 857 | Mem: 26.53MB, Util: 99%  global_step 

[Rank 0] Train Epoch 5:  44%|████▍     | 879/2000 [00:06<00:08, 137.38it/s]
[Rank 2] Train Epoch 5:  44%|████▎     | 873/2000 [00:06<00:08, 130.96it/s]
[Rank 1] Train Epoch 5:  45%|████▍     | 891/2000 [00:06<00:07, 141.97it/s]
[Rank 0] Train Epoch 5:  45%|████▍     | 893/2000 [00:06<00:08, 137.54it/s]
[Rank 2] Train Epoch 5:  44%|████▍     | 887/2000 [00:06<00:08, 132.92it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 880 | Mem: 26.53MB, Util: 89%  global_step : 10880
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 881 | Mem: 26.53MB, Util: 89%  global_step : 10881
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 882 | Mem: 26.53MB, Util: 89%  global_step : 10882
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 883 | Mem: 26.53MB, Util: 89%  global_step : 10883
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 884 | Mem: 26.53MB, Util: 89%  global_step : 10884
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 885 | Mem: 26.53MB, Util: 89%  global_step : 10885
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 886 | Mem: 26.53MB, Util: 89%  global_step : 10886
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 887 | Mem: 26.53MB, Util: 89%  global_step 

[Rank 0] Train Epoch 5:  45%|████▌     | 907/2000 [00:06<00:08, 136.00it/s]
[Rank 2] Train Epoch 5:  45%|████▌     | 901/2000 [00:06<00:08, 134.29it/s]
[Rank 1] Train Epoch 5:  45%|████▌     | 906/2000 [00:06<00:08, 130.42it/s]
[Rank 0] Train Epoch 5:  46%|████▌     | 922/2000 [00:06<00:07, 136.93it/s]
[Rank 2] Train Epoch 5:  46%|████▌     | 915/2000 [00:06<00:08, 135.35it/s]
[Rank 1] Train Epoch 5:  46%|████▌     | 921/2000 [00:06<00:08, 133.75it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 908 | Mem: 26.53MB, Util: 100%  global_step : 10908
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 909 | Mem: 26.53MB, Util: 100%  global_step : 10909
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 910 | Mem: 26.53MB, Util: 100%  global_step : 10910
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 911 | Mem: 26.53MB, Util: 100%  global_step : 10911
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 912 | Mem: 26.53MB, Util: 100%  global_step : 10912
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 913 | Mem: 26.53MB, Util: 100%  global_step : 10913
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 914 | Mem: 26.53MB, Util: 100%  global_step : 10914
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 915 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 5:  47%|████▋     | 936/2000 [00:06<00:07, 137.32it/s]
[Rank 2] Train Epoch 5:  46%|████▋     | 929/2000 [00:06<00:07, 136.10it/s]
[Rank 1] Train Epoch 5:  47%|████▋     | 935/2000 [00:06<00:07, 135.26it/s]
[Rank 0] Train Epoch 5:  48%|████▊     | 951/2000 [00:06<00:07, 138.20it/s]
[Rank 2] Train Epoch 5:  47%|████▋     | 943/2000 [00:06<00:07, 135.37it/s]
[Rank 1] Train Epoch 5:  48%|████▊     | 950/2000 [00:06<00:07, 137.28it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 937 | Mem: 26.53MB, Util: 91%  global_step : 10937
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 938 | Mem: 26.53MB, Util: 91%  global_step : 10938
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 939 | Mem: 26.53MB, Util: 91%  global_step : 10939
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 940 | Mem: 26.53MB, Util: 91%  global_step : 10940
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 941 | Mem: 26.53MB, Util: 91%  global_step : 10941
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 942 | Mem: 26.53MB, Util: 91%  global_step : 10942
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 943 | Mem: 26.53MB, Util: 91%  global_step : 10943
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 944 | Mem: 26.53MB, Util: 91%  global_step 

[Rank 0] Train Epoch 5:  48%|████▊     | 965/2000 [00:07<00:07, 138.21it/s]
[Rank 2] Train Epoch 5:  48%|████▊     | 957/2000 [00:06<00:07, 135.35it/s]
[Rank 1] Train Epoch 5:  48%|████▊     | 965/2000 [00:07<00:07, 138.46it/s]
[Rank 0] Train Epoch 5:  49%|████▉     | 979/2000 [00:07<00:07, 138.58it/s]
[Rank 2] Train Epoch 5:  49%|████▊     | 971/2000 [00:07<00:07, 136.28it/s]
[Rank 1] Train Epoch 5:  49%|████▉     | 980/2000 [00:07<00:07, 139.86it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 967 | Mem: 26.53MB, Util: 99%  global_step : 10967
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 968 | Mem: 26.53MB, Util: 99%  global_step : 10968
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 969 | Mem: 26.53MB, Util: 99%  global_step : 10969
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 970 | Mem: 26.53MB, Util: 99%  global_step : 10970
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 971 | Mem: 26.53MB, Util: 99%  global_step : 10971
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 972 | Mem: 26.53MB, Util: 99%  global_step : 10972
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 973 | Mem: 26.53MB, Util: 99%  global_step : 10973
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 974 | Mem: 26.53MB, Util: 99%  global_step 

[Rank 0] Train Epoch 5:  50%|████▉     | 993/2000 [00:07<00:07, 138.55it/s]
[Rank 2] Train Epoch 5:  49%|████▉     | 985/2000 [00:07<00:07, 136.82it/s]
[Rank 2] Train Epoch 5:  50%|████▉     | 999/2000 [00:07<00:07, 136.69it/s]
[Rank 1] Train Epoch 5:  50%|████▉     | 995/2000 [00:07<00:07, 140.27it/s]
[Rank 0] Train Epoch 5:  50%|█████     | 1007/2000 [00:07<00:07, 134.22it/s]
[Rank 2] Train Epoch 5:  51%|█████     | 1013/2000 [00:07<00:07, 136.11it/s]
[Rank 1] Train Epoch 5:  50%|█████     | 1010/2000 [00:07<00:07, 132.00it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 996 | Mem: 26.53MB, Util: 100%  global_step : 10996
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 997 | Mem: 26.53MB, Util: 100%  global_step : 10997
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 998 | Mem: 26.53MB, Util: 100%  global_step : 10998
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 999 | Mem: 26.53MB, Util: 100%  global_step : 10999
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1000 | Mem: 26.53MB, Util: 100%  global_step : 11000
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1001 | Mem: 26.53MB, Util: 100%  global_step : 11001
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1002 | Mem: 26.53MB, Util: 100%  global_step : 11002
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1003 | Mem: 26.53MB, Util: 100%  

[Rank 0] Train Epoch 5:  51%|█████     | 1021/2000 [00:07<00:07, 135.48it/s]
[Rank 2] Train Epoch 5:  51%|█████▏    | 1027/2000 [00:07<00:07, 136.15it/s]
[Rank 1] Train Epoch 5:  51%|█████     | 1024/2000 [00:07<00:07, 134.01it/s]
[Rank 0] Train Epoch 5:  52%|█████▏    | 1036/2000 [00:07<00:07, 137.42it/s]
[Rank 2] Train Epoch 5:  52%|█████▏    | 1041/2000 [00:07<00:07, 135.99it/s]
[Rank 1] Train Epoch 5:  52%|█████▏    | 1038/2000 [00:07<00:07, 135.04it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1024 | Mem: 26.53MB, Util: 100%  global_step : 11024
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1025 | Mem: 26.53MB, Util: 100%  global_step : 11025
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1026 | Mem: 26.53MB, Util: 100%  global_step : 11026
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1027 | Mem: 26.53MB, Util: 100%  global_step : 11027
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1028 | Mem: 26.53MB, Util: 100%  global_step : 11028
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1029 | Mem: 26.53MB, Util: 100%  global_step : 11029
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1030 | Mem: 26.53MB, Util: 100%  global_step : 11030
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1031 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 5:  53%|█████▎    | 1051/2000 [00:07<00:06, 138.37it/s]
[Rank 2] Train Epoch 5:  53%|█████▎    | 1055/2000 [00:07<00:06, 136.37it/s]
[Rank 1] Train Epoch 5:  53%|█████▎    | 1053/2000 [00:07<00:06, 137.13it/s]
[Rank 0] Train Epoch 5:  53%|█████▎    | 1066/2000 [00:07<00:06, 139.53it/s]
[Rank 2] Train Epoch 5:  53%|█████▎    | 1069/2000 [00:07<00:06, 136.72it/s]
[Rank 1] Train Epoch 5:  53%|█████▎    | 1067/2000 [00:07<00:06, 137.89it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1054 | Mem: 26.53MB, Util: 96%  global_step : 11054
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1055 | Mem: 26.53MB, Util: 96%  global_step : 11055
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1056 | Mem: 26.53MB, Util: 96%  global_step : 11056
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1057 | Mem: 26.53MB, Util: 96%  global_step : 11057
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1058 | Mem: 26.53MB, Util: 96%  global_step : 11058
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1059 | Mem: 26.53MB, Util: 96%  global_step : 11059
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1060 | Mem: 26.53MB, Util: 96%  global_step : 11060
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1061 | Mem: 26.53MB, Util: 96%  glob

[Rank 0] Train Epoch 5:  54%|█████▍    | 1080/2000 [00:07<00:06, 139.25it/s]
[Rank 2] Train Epoch 5:  54%|█████▍    | 1083/2000 [00:07<00:06, 136.66it/s]
[Rank 1] Train Epoch 5:  54%|█████▍    | 1082/2000 [00:07<00:06, 138.66it/s]
[Rank 0] Train Epoch 5:  55%|█████▍    | 1095/2000 [00:07<00:06, 139.73it/s]
[Rank 2] Train Epoch 5:  55%|█████▍    | 1097/2000 [00:08<00:06, 137.01it/s]
[Rank 1] Train Epoch 5:  55%|█████▍    | 1097/2000 [00:07<00:06, 139.88it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1084 | Mem: 26.53MB, Util: 100%  global_step : 11084
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1085 | Mem: 26.53MB, Util: 100%  global_step : 11085
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1086 | Mem: 26.53MB, Util: 100%  global_step : 11086
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1087 | Mem: 26.53MB, Util: 100%  global_step : 11087
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1088 | Mem: 26.53MB, Util: 100%  global_step : 11088
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1089 | Mem: 26.53MB, Util: 100%  global_step : 11089
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1090 | Mem: 26.53MB, Util: 100%  global_step : 11090
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1091 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 5:  55%|█████▌    | 1109/2000 [00:08<00:06, 132.16it/s]
[Rank 2] Train Epoch 5:  56%|█████▌    | 1111/2000 [00:08<00:06, 136.83it/s]
[Rank 1] Train Epoch 5:  56%|█████▌    | 1112/2000 [00:08<00:06, 131.77it/s]
[Rank 0] Train Epoch 5:  56%|█████▌    | 1123/2000 [00:08<00:06, 134.14it/s]
[Rank 2] Train Epoch 5:  56%|█████▋    | 1125/2000 [00:08<00:06, 136.28it/s]
[Rank 1] Train Epoch 5:  56%|█████▋    | 1127/2000 [00:08<00:06, 134.31it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1111 | Mem: 26.53MB, Util: 100%  global_step : 11111
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1112 | Mem: 26.53MB, Util: 100%  global_step : 11112
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1113 | Mem: 26.53MB, Util: 100%  global_step : 11113
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1114 | Mem: 26.53MB, Util: 100%  global_step : 11114
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1115 | Mem: 26.53MB, Util: 95%  global_step : 11115
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1116 | Mem: 26.53MB, Util: 95%  global_step : 11116
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1117 | Mem: 26.53MB, Util: 95%  global_step : 11117
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1118 | Mem: 26.53MB, Util: 95%  

[Rank 0] Train Epoch 5:  57%|█████▋    | 1137/2000 [00:08<00:06, 133.75it/s]
[Rank 2] Train Epoch 5:  57%|█████▋    | 1139/2000 [00:08<00:06, 136.85it/s]
[Rank 1] Train Epoch 5:  57%|█████▋    | 1142/2000 [00:08<00:06, 136.46it/s]
[Rank 0] Train Epoch 5:  58%|█████▊    | 1151/2000 [00:08<00:06, 134.72it/s]
[Rank 2] Train Epoch 5:  58%|█████▊    | 1153/2000 [00:08<00:06, 136.84it/s]
[Rank 1] Train Epoch 5:  58%|█████▊    | 1157/2000 [00:08<00:06, 138.22it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1139 | Mem: 26.53MB, Util: 95%  global_step : 11139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1140 | Mem: 26.53MB, Util: 95%  global_step : 11140
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1141 | Mem: 26.53MB, Util: 95%  global_step : 11141
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1142 | Mem: 26.53MB, Util: 95%  global_step : 11142
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1143 | Mem: 26.53MB, Util: 92%  global_step : 11143
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1144 | Mem: 26.53MB, Util: 92%  global_step : 11144
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1145 | Mem: 26.53MB, Util: 92%  global_step : 11145
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1146 | Mem: 26.53MB, Util: 92%  glob

[Rank 0] Train Epoch 5:  58%|█████▊    | 1167/2000 [00:08<00:05, 139.54it/s]
[Rank 2] Train Epoch 5:  58%|█████▊    | 1167/2000 [00:08<00:06, 136.38it/s]
[Rank 1] Train Epoch 5:  59%|█████▊    | 1172/2000 [00:08<00:05, 139.36it/s]
[Rank 0] Train Epoch 5:  59%|█████▉    | 1181/2000 [00:08<00:05, 137.41it/s]
[Rank 2] Train Epoch 5:  59%|█████▉    | 1181/2000 [00:08<00:05, 136.85it/s]
[Rank 1] Train Epoch 5:  59%|█████▉    | 1187/2000 [00:08<00:05, 139.68it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1170 | Mem: 26.53MB, Util: 92%  global_step : 11170
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1171 | Mem: 26.53MB, Util: 92%  global_step : 11171
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1172 | Mem: 26.53MB, Util: 92%  global_step : 11172
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1173 | Mem: 26.53MB, Util: 99%  global_step : 11173
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1174 | Mem: 26.53MB, Util: 99%  global_step : 11174
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1175 | Mem: 26.53MB, Util: 99%  global_step : 11175
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1176 | Mem: 26.53MB, Util: 99%  global_step : 11176
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1177 | Mem: 26.53MB, Util: 99%  glob

[Rank 0] Train Epoch 5:  60%|█████▉    | 1195/2000 [00:08<00:05, 137.88it/s]
[Rank 2] Train Epoch 5:  60%|█████▉    | 1195/2000 [00:08<00:05, 137.23it/s]
[Rank 1] Train Epoch 5:  60%|██████    | 1201/2000 [00:08<00:06, 131.52it/s]
[Rank 0] Train Epoch 5:  60%|██████    | 1209/2000 [00:08<00:05, 134.17it/s]
[Rank 2] Train Epoch 5:  60%|██████    | 1209/2000 [00:08<00:05, 137.01it/s]
[Rank 1] Train Epoch 5:  61%|██████    | 1215/2000 [00:08<00:06, 129.51it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1198 | Mem: 26.53MB, Util: 99%  global_step : 11198
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1199 | Mem: 26.53MB, Util: 99%  global_step : 11199
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1200 | Mem: 26.53MB, Util: 100%  global_step : 11200
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1201 | Mem: 26.53MB, Util: 100%  global_step : 11201
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1202 | Mem: 26.53MB, Util: 100%  global_step : 11202
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1203 | Mem: 26.53MB, Util: 100%  global_step : 11203
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1204 | Mem: 26.53MB, Util: 100%  global_step : 11204
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1205 | Mem: 26.53MB, Util: 100%

[Rank 0] Train Epoch 5:  61%|██████    | 1224/2000 [00:08<00:05, 137.01it/s]
[Rank 2] Train Epoch 5:  61%|██████    | 1223/2000 [00:08<00:05, 136.64it/s]
[Rank 0] Train Epoch 5:  62%|██████▏   | 1238/2000 [00:09<00:05, 134.51it/s]
[Rank 2] Train Epoch 5:  62%|██████▏   | 1237/2000 [00:09<00:05, 136.55it/s]
[Rank 1] Train Epoch 5:  61%|██████▏   | 1229/2000 [00:08<00:06, 125.13it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1226 | Mem: 26.53MB, Util: 100%  global_step : 11226
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1227 | Mem: 26.53MB, Util: 100%  global_step : 11227
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1228 | Mem: 26.53MB, Util: 100%  global_step : 11228
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1229 | Mem: 26.53MB, Util: 100%  global_step : 11229
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1230 | Mem: 26.53MB, Util: 94%  global_step : 11230
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1231 | Mem: 26.53MB, Util: 94%  global_step : 11231
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1232 | Mem: 26.53MB, Util: 94%  global_step : 11232
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1233 | Mem: 26.53MB, Util: 94%  

[Rank 0] Train Epoch 5:  63%|██████▎   | 1252/2000 [00:09<00:05, 135.31it/s]
[Rank 2] Train Epoch 5:  63%|██████▎   | 1251/2000 [00:09<00:05, 136.24it/s]
[Rank 1] Train Epoch 5:  62%|██████▏   | 1244/2000 [00:09<00:05, 129.84it/s]
[Rank 0] Train Epoch 5:  63%|██████▎   | 1266/2000 [00:09<00:05, 136.52it/s]
[Rank 2] Train Epoch 5:  63%|██████▎   | 1265/2000 [00:09<00:05, 136.37it/s]
[Rank 1] Train Epoch 5:  63%|██████▎   | 1259/2000 [00:09<00:05, 134.40it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1254 | Mem: 26.53MB, Util: 94%  global_step : 11254
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1255 | Mem: 26.53MB, Util: 94%  global_step : 11255
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1256 | Mem: 26.53MB, Util: 94%  global_step : 11256
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1257 | Mem: 26.53MB, Util: 94%  global_step : 11257
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1258 | Mem: 26.53MB, Util: 100%  global_step : 11258
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1259 | Mem: 26.53MB, Util: 100%  global_step : 11259
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1260 | Mem: 26.53MB, Util: 100%  global_step : 11260
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1261 | Mem: 26.53MB, Util: 100%  

[Rank 0] Train Epoch 5:  64%|██████▍   | 1280/2000 [00:09<00:05, 137.53it/s]
[Rank 2] Train Epoch 5:  64%|██████▍   | 1279/2000 [00:09<00:05, 135.14it/s]
[Rank 1] Train Epoch 5:  64%|██████▍   | 1275/2000 [00:09<00:05, 141.40it/s]
[Rank 0] Train Epoch 5:  65%|██████▍   | 1295/2000 [00:09<00:05, 138.67it/s]
[Rank 2] Train Epoch 5:  65%|██████▍   | 1293/2000 [00:09<00:05, 132.75it/s]
[Rank 1] Train Epoch 5:  64%|██████▍   | 1290/2000 [00:09<00:05, 141.24it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1284 | Mem: 26.53MB, Util: 100%  global_step : 11284
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1285 | Mem: 26.53MB, Util: 100%  global_step : 11285
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1286 | Mem: 26.53MB, Util: 100%  global_step : 11286
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1287 | Mem: 26.53MB, Util: 100%  global_step : 11287
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1288 | Mem: 26.53MB, Util: 100%  global_step : 11288
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1289 | Mem: 26.53MB, Util: 100%  global_step : 11289
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1290 | Mem: 26.53MB, Util: 100%  global_step : 11290
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1291 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 5:  65%|██████▌   | 1309/2000 [00:09<00:05, 130.36it/s]
[Rank 2] Train Epoch 5:  65%|██████▌   | 1307/2000 [00:09<00:05, 133.28it/s]
[Rank 1] Train Epoch 5:  65%|██████▌   | 1305/2000 [00:09<00:05, 133.85it/s]
[Rank 0] Train Epoch 5:  66%|██████▌   | 1323/2000 [00:09<00:05, 132.59it/s]
[Rank 2] Train Epoch 5:  66%|██████▌   | 1321/2000 [00:09<00:05, 133.86it/s]
[Rank 1] Train Epoch 5:  66%|██████▌   | 1320/2000 [00:09<00:05, 135.93it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1310 | Mem: 26.53MB, Util: 100%  global_step : 11310
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1311 | Mem: 26.53MB, Util: 100%  global_step : 11311
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1312 | Mem: 26.53MB, Util: 100%  global_step : 11312
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1313 | Mem: 26.53MB, Util: 100%  global_step : 11313
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1314 | Mem: 26.53MB, Util: 100%  global_step : 11314
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1315 | Mem: 26.53MB, Util: 100%  global_step : 11315
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1316 | Mem: 26.53MB, Util: 100%  global_step : 11316
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1317 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 5:  67%|██████▋   | 1338/2000 [00:09<00:04, 135.34it/s]
[Rank 2] Train Epoch 5:  67%|██████▋   | 1335/2000 [00:09<00:04, 134.61it/s]
[Rank 1] Train Epoch 5:  67%|██████▋   | 1335/2000 [00:09<00:04, 138.08it/s]
[Rank 0] Train Epoch 5:  68%|██████▊   | 1352/2000 [00:09<00:04, 133.94it/s]
[Rank 2] Train Epoch 5:  67%|██████▋   | 1349/2000 [00:09<00:04, 135.29it/s]
[Rank 1] Train Epoch 5:  68%|██████▊   | 1350/2000 [00:09<00:04, 139.00it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1340 | Mem: 26.53MB, Util: 97%  global_step : 11340
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1341 | Mem: 26.53MB, Util: 97%  global_step : 11341
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1342 | Mem: 26.53MB, Util: 97%  global_step : 11342
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1343 | Mem: 26.53MB, Util: 97%  global_step : 11343
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1344 | Mem: 26.53MB, Util: 97%  global_step : 11344
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1345 | Mem: 26.53MB, Util: 97%  global_step : 11345
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1346 | Mem: 26.53MB, Util: 97%  global_step : 11346
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1347 | Mem: 26.53MB, Util: 97%  glob

[Rank 0] Train Epoch 5:  68%|██████▊   | 1367/2000 [00:09<00:04, 136.02it/s]
[Rank 2] Train Epoch 5:  68%|██████▊   | 1363/2000 [00:09<00:04, 132.38it/s]
[Rank 1] Train Epoch 5:  68%|██████▊   | 1365/2000 [00:09<00:04, 139.53it/s]
[Rank 0] Train Epoch 5:  69%|██████▉   | 1382/2000 [00:10<00:04, 137.33it/s]
[Rank 2] Train Epoch 5:  69%|██████▉   | 1377/2000 [00:10<00:04, 133.06it/s]
[Rank 1] Train Epoch 5:  69%|██████▉   | 1380/2000 [00:10<00:04, 140.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1369 | Mem: 26.53MB, Util: 100%  global_step : 11369
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1370 | Mem: 26.53MB, Util: 100%  global_step : 11370
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1371 | Mem: 26.53MB, Util: 100%  global_step : 11371
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1372 | Mem: 26.53MB, Util: 100%  global_step : 11372
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1373 | Mem: 26.53MB, Util: 100%  global_step : 11373
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1374 | Mem: 26.53MB, Util: 100%  global_step : 11374
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1375 | Mem: 26.53MB, Util: 100%  global_step : 11375
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1376 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 5:  70%|██████▉   | 1397/2000 [00:10<00:04, 138.43it/s]
[Rank 2] Train Epoch 5:  70%|██████▉   | 1391/2000 [00:10<00:04, 134.07it/s]
[Rank 1] Train Epoch 5:  70%|██████▉   | 1395/2000 [00:10<00:04, 140.84it/s]
[Rank 2] Train Epoch 5:  70%|███████   | 1405/2000 [00:10<00:04, 134.56it/s]
[Rank 1] Train Epoch 5:  70%|███████   | 1410/2000 [00:10<00:04, 127.56it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1398 | Mem: 26.53MB, Util: 100%  global_step : 11398
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1399 | Mem: 26.53MB, Util: 100%  global_step : 11399
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1400 | Mem: 26.53MB, Util: 100%  global_step : 11400
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1401 | Mem: 26.53MB, Util: 100%  global_step : 11401
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1402 | Mem: 26.53MB, Util: 100%  global_step : 11402
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1403 | Mem: 26.53MB, Util: 100%  global_step : 11403
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1404 | Mem: 26.53MB, Util: 100%  global_step : 11404
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1405 | Mem: 26.53MB, Util: 10

[Rank 2] Train Epoch 5:  71%|███████   | 1419/2000 [00:10<00:04, 135.29it/s]
[Rank 0] Train Epoch 5:  71%|███████   | 1411/2000 [00:10<00:04, 126.55it/s]
[Rank 1] Train Epoch 5:  71%|███████▏  | 1425/2000 [00:10<00:04, 131.51it/s]
[Rank 0] Train Epoch 5:  71%|███████▏  | 1426/2000 [00:10<00:04, 130.53it/s]
[Rank 2] Train Epoch 5:  72%|███████▏  | 1433/2000 [00:10<00:04, 135.30it/s]
[Rank 1] Train Epoch 5:  72%|███████▏  | 1439/2000 [00:10<00:04, 133.66it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1423 | Mem: 26.53MB, Util: 96%  global_step : 11423
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1424 | Mem: 26.53MB, Util: 96%  global_step : 11424
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1425 | Mem: 26.53MB, Util: 96%  global_step : 11425
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1426 | Mem: 26.53MB, Util: 96%  global_step : 11426
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1427 | Mem: 26.53MB, Util: 96%  global_step : 11427
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1428 | Mem: 26.53MB, Util: 96%  global_step : 11428
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1429 | Mem: 26.53MB, Util: 96%  global_step : 11429
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 5, Batch 1430 | Mem: 26.53MB, Util: 96%  glob

[Rank 0] Train Epoch 5:  72%|███████▏  | 1440/2000 [00:10<00:04, 132.72it/s]
[Rank 2] Train Epoch 5:  72%|███████▏  | 1447/2000 [00:10<00:04, 135.54it/s]
[Rank 1] Train Epoch 5:  73%|███████▎  | 1454/2000 [00:10<00:03, 136.66it/s]
[Rank 0] Train Epoch 5:  73%|███████▎  | 1455/2000 [00:10<00:04, 134.98it/s]
[Rank 2] Train Epoch 5:  73%|███████▎  | 1461/2000 [00:10<00:03, 135.88it/s]
[Rank 1] Train Epoch 5:  73%|███████▎  | 1468/2000 [00:10<00:03, 137.06it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1452 | Mem: 26.53MB, Util: 79%  global_step : 11452
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1453 | Mem: 26.53MB, Util: 91%  global_step : 11453
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1454 | Mem: 26.53MB, Util: 91%  global_step : 11454
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1455 | Mem: 26.53MB, Util: 91%  global_step : 11455
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1456 | Mem: 26.53MB, Util: 91%  global_step : 11456
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1457 | Mem: 26.53MB, Util: 91%  global_step : 11457
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1458 | Mem: 26.53MB, Util: 91%  global_step : 11458
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1459 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 5:  74%|███████▍  | 1475/2000 [00:10<00:03, 135.09it/s]
[Rank 0] Train Epoch 5:  73%|███████▎  | 1469/2000 [00:10<00:03, 136.06it/s]
[Rank 1] Train Epoch 5:  74%|███████▍  | 1483/2000 [00:10<00:03, 138.25it/s]
[Rank 2] Train Epoch 5:  74%|███████▍  | 1489/2000 [00:10<00:03, 135.55it/s]
[Rank 0] Train Epoch 5:  74%|███████▍  | 1483/2000 [00:10<00:03, 134.60it/s]
[Rank 1] Train Epoch 5:  75%|███████▍  | 1497/2000 [00:10<00:03, 138.67it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1480 | Mem: 26.53MB, Util: 91%  global_step : 11480
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1481 | Mem: 26.53MB, Util: 66%  global_step : 11481
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1482 | Mem: 26.53MB, Util: 66%  global_step : 11482
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1483 | Mem: 26.53MB, Util: 66%  global_step : 11483
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1484 | Mem: 26.53MB, Util: 66%  global_step : 11484
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1485 | Mem: 26.53MB, Util: 66%  global_step : 11485
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1486 | Mem: 26.53MB, Util: 66%  global_step : 11486
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1487 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 5:  75%|███████▌  | 1503/2000 [00:11<00:03, 135.86it/s]
[Rank 0] Train Epoch 5:  75%|███████▍  | 1497/2000 [00:10<00:03, 135.81it/s]
[Rank 1] Train Epoch 5:  76%|███████▌  | 1511/2000 [00:11<00:03, 130.07it/s]
[Rank 2] Train Epoch 5:  76%|███████▌  | 1517/2000 [00:11<00:03, 136.18it/s]
[Rank 0] Train Epoch 5:  76%|███████▌  | 1511/2000 [00:11<00:03, 135.52it/s]
[Rank 1] Train Epoch 5:  76%|███████▋  | 1526/2000 [00:11<00:03, 134.06it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1508 | Mem: 26.53MB, Util: 66%  global_step : 11508
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1509 | Mem: 26.53MB, Util: 66%  global_step : 11509
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1510 | Mem: 26.53MB, Util: 66%  global_step : 11510
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1511 | Mem: 26.53MB, Util: 66%  global_step : 11511
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1512 | Mem: 26.53MB, Util: 66%  global_step : 11512
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1513 | Mem: 26.53MB, Util: 66%  global_step : 11513
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1514 | Mem: 26.53MB, Util: 66%  global_step : 11514
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1515 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 5:  77%|███████▋  | 1531/2000 [00:11<00:03, 136.21it/s]
[Rank 0] Train Epoch 5:  76%|███████▋  | 1526/2000 [00:11<00:03, 136.96it/s]
[Rank 1] Train Epoch 5:  77%|███████▋  | 1540/2000 [00:11<00:03, 135.56it/s]
[Rank 2] Train Epoch 5:  77%|███████▋  | 1545/2000 [00:11<00:03, 136.31it/s]
[Rank 0] Train Epoch 5:  77%|███████▋  | 1540/2000 [00:11<00:03, 135.71it/s]
[Rank 1] Train Epoch 5:  78%|███████▊  | 1555/2000 [00:11<00:03, 138.17it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1537 | Mem: 26.53MB, Util: 66%  global_step : 11537
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1538 | Mem: 26.53MB, Util: 66%  global_step : 11538
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1539 | Mem: 26.53MB, Util: 66%  global_step : 11539
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1540 | Mem: 26.53MB, Util: 65%  global_step : 11540
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1541 | Mem: 26.53MB, Util: 65%  global_step : 11541
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1542 | Mem: 26.53MB, Util: 65%  global_step : 11542
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1543 | Mem: 26.53MB, Util: 65%  global_step : 11543
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1544 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 5:  78%|███████▊  | 1559/2000 [00:11<00:03, 136.71it/s]
[Rank 0] Train Epoch 5:  78%|███████▊  | 1554/2000 [00:11<00:03, 136.82it/s]
[Rank 1] Train Epoch 5:  78%|███████▊  | 1570/2000 [00:11<00:03, 139.80it/s]
[Rank 2] Train Epoch 5:  79%|███████▊  | 1574/2000 [00:11<00:03, 137.89it/s]
[Rank 0] Train Epoch 5:  78%|███████▊  | 1568/2000 [00:11<00:03, 137.55it/s]
[Rank 1] Train Epoch 5:  79%|███████▉  | 1585/2000 [00:11<00:02, 140.06it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1566 | Mem: 26.53MB, Util: 65%  global_step : 11566
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1567 | Mem: 26.53MB, Util: 65%  global_step : 11567
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1568 | Mem: 26.53MB, Util: 67%  global_step : 11568
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1569 | Mem: 26.53MB, Util: 67%  global_step : 11569
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1570 | Mem: 26.53MB, Util: 67%  global_step : 11570
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1571 | Mem: 26.53MB, Util: 67%  global_step : 11571
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1572 | Mem: 26.53MB, Util: 67%  global_step : 11572
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1573 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 5:  79%|███████▉  | 1589/2000 [00:11<00:02, 139.10it/s]
[Rank 0] Train Epoch 5:  79%|███████▉  | 1582/2000 [00:11<00:03, 138.04it/s]
[Rank 1] Train Epoch 5:  80%|████████  | 1600/2000 [00:11<00:02, 137.12it/s]
[Rank 2] Train Epoch 5:  80%|████████  | 1603/2000 [00:11<00:02, 137.97it/s]
[Rank 0] Train Epoch 5:  80%|███████▉  | 1596/2000 [00:11<00:02, 138.26it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1595 | Mem: 26.53MB, Util: 67%  global_step : 11595
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1596 | Mem: 26.53MB, Util: 67%  global_step : 11596
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1597 | Mem: 26.53MB, Util: 66%  global_step : 11597
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1598 | Mem: 26.53MB, Util: 66%  global_step : 11598
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1599 | Mem: 26.53MB, Util: 66%  global_step : 11599
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1600 | Mem: 26.53MB, Util: 66%  global_step : 11600
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1601 | Mem: 26.53MB, Util: 66%  global_step : 11601
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1602 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 5:  81%|████████  | 1617/2000 [00:11<00:02, 138.43it/s]
[Rank 0] Train Epoch 5:  80%|████████  | 1610/2000 [00:11<00:02, 135.57it/s]
[Rank 1] Train Epoch 5:  81%|████████  | 1614/2000 [00:11<00:02, 130.77it/s]
[Rank 2] Train Epoch 5:  82%|████████▏ | 1632/2000 [00:11<00:02, 139.09it/s]
[Rank 0] Train Epoch 5:  81%|████████▏ | 1625/2000 [00:11<00:02, 137.04it/s]
[Rank 1] Train Epoch 5:  81%|████████▏ | 1628/2000 [00:11<00:02, 131.35it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1624 | Mem: 26.53MB, Util: 66%  global_step : 11624
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1625 | Mem: 26.53MB, Util: 81%  global_step : 11625
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1626 | Mem: 26.53MB, Util: 81%  global_step : 11626
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1627 | Mem: 26.53MB, Util: 81%  global_step : 11627
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1628 | Mem: 26.53MB, Util: 81%  global_step : 11628
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1629 | Mem: 26.53MB, Util: 81%  global_step : 11629
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1630 | Mem: 26.53MB, Util: 81%  global_step : 11630
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1631 | Mem: 26.53MB, Util: 8

[Rank 2] Train Epoch 5:  82%|████████▏ | 1647/2000 [00:12<00:02, 139.45it/s]
[Rank 0] Train Epoch 5:  82%|████████▏ | 1640/2000 [00:11<00:02, 138.24it/s]
[Rank 1] Train Epoch 5:  82%|████████▏ | 1642/2000 [00:12<00:02, 132.07it/s]
[Rank 2] Train Epoch 5:  83%|████████▎ | 1662/2000 [00:12<00:02, 139.74it/s]
[Rank 0] Train Epoch 5:  83%|████████▎ | 1655/2000 [00:12<00:02, 139.17it/s]
[Rank 1] Train Epoch 5:  83%|████████▎ | 1656/2000 [00:12<00:02, 132.75it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1653 | Mem: 26.53MB, Util: 100%  global_step : 11653
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1654 | Mem: 26.53MB, Util: 100%  global_step : 11654
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1655 | Mem: 26.53MB, Util: 100%  global_step : 11655
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1656 | Mem: 26.53MB, Util: 100%  global_step : 11656
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1657 | Mem: 26.53MB, Util: 100%  global_step : 11657
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1658 | Mem: 26.53MB, Util: 100%  global_step : 11658
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1659 | Mem: 26.53MB, Util: 100%  global_step : 11659
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1660 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  84%|████████▍ | 1677/2000 [00:12<00:02, 139.95it/s]
[Rank 0] Train Epoch 5:  84%|████████▎ | 1670/2000 [00:12<00:02, 140.67it/s]
[Rank 1] Train Epoch 5:  84%|████████▎ | 1670/2000 [00:12<00:02, 132.82it/s]
[Rank 2] Train Epoch 5:  85%|████████▍ | 1692/2000 [00:12<00:02, 140.13it/s]
[Rank 0] Train Epoch 5:  84%|████████▍ | 1685/2000 [00:12<00:02, 138.55it/s]
[Rank 1] Train Epoch 5:  84%|████████▍ | 1684/2000 [00:12<00:02, 133.31it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1683 | Mem: 26.53MB, Util: 100%  global_step : 11683
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1684 | Mem: 26.53MB, Util: 100%  global_step : 11684
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1685 | Mem: 26.53MB, Util: 100%  global_step : 11685
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1686 | Mem: 26.53MB, Util: 100%  global_step : 11686
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1687 | Mem: 26.53MB, Util: 100%  global_step : 11687
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1688 | Mem: 26.53MB, Util: 100%  global_step : 11688
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1689 | Mem: 26.53MB, Util: 100%  global_step : 11689
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1690 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  85%|████████▌ | 1707/2000 [00:12<00:02, 127.32it/s]
[Rank 0] Train Epoch 5:  85%|████████▌ | 1700/2000 [00:12<00:02, 139.28it/s]
[Rank 1] Train Epoch 5:  85%|████████▍ | 1698/2000 [00:12<00:02, 133.82it/s]
[Rank 2] Train Epoch 5:  86%|████████▌ | 1721/2000 [00:12<00:02, 130.34it/s]
[Rank 0] Train Epoch 5:  86%|████████▌ | 1714/2000 [00:12<00:02, 127.62it/s]
[Rank 1] Train Epoch 5:  86%|████████▌ | 1712/2000 [00:12<00:02, 133.19it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1707 | Mem: 26.53MB, Util: 100%  global_step : 11707
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1708 | Mem: 26.53MB, Util: 100%  global_step : 11708
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1709 | Mem: 26.53MB, Util: 100%  global_step : 11709
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1710 | Mem: 26.53MB, Util: 100%  global_step : 11710
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1711 | Mem: 26.53MB, Util: 100%  global_step : 11711
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1712 | Mem: 26.53MB, Util: 100%  global_step : 11712
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1713 | Mem: 26.53MB, Util: 100%  global_step : 11713
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1714 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  87%|████████▋ | 1736/2000 [00:12<00:01, 133.51it/s]
[Rank 0] Train Epoch 5:  86%|████████▋ | 1729/2000 [00:12<00:02, 131.81it/s]
[Rank 1] Train Epoch 5:  86%|████████▋ | 1726/2000 [00:12<00:02, 133.51it/s]
[Rank 2] Train Epoch 5:  88%|████████▊ | 1751/2000 [00:12<00:01, 135.75it/s]
[Rank 0] Train Epoch 5:  87%|████████▋ | 1744/2000 [00:12<00:01, 134.97it/s]
[Rank 1] Train Epoch 5:  87%|████████▋ | 1740/2000 [00:12<00:01, 133.74it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1737 | Mem: 26.53MB, Util: 93%  global_step : 11737
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1738 | Mem: 26.53MB, Util: 93%  global_step : 11738
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1739 | Mem: 26.53MB, Util: 93%  global_step : 11739
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1740 | Mem: 26.53MB, Util: 93%  global_step : 11740
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1741 | Mem: 26.53MB, Util: 93%  global_step : 11741
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1742 | Mem: 26.53MB, Util: 93%  global_step : 11742
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1743 | Mem: 26.53MB, Util: 93%  global_step : 11743
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1744 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 5:  88%|████████▊ | 1766/2000 [00:12<00:01, 137.29it/s]
[Rank 0] Train Epoch 5:  88%|████████▊ | 1758/2000 [00:12<00:01, 136.31it/s]
[Rank 1] Train Epoch 5:  88%|████████▊ | 1754/2000 [00:12<00:01, 133.11it/s]
[Rank 2] Train Epoch 5:  89%|████████▉ | 1781/2000 [00:13<00:01, 138.18it/s]
[Rank 0] Train Epoch 5:  89%|████████▊ | 1772/2000 [00:12<00:01, 136.92it/s]
[Rank 1] Train Epoch 5:  88%|████████▊ | 1768/2000 [00:12<00:01, 133.13it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1766 | Mem: 26.53MB, Util: 100%  global_step : 11766
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1767 | Mem: 26.53MB, Util: 100%  global_step : 11767
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1768 | Mem: 26.53MB, Util: 100%  global_step : 11768
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1769 | Mem: 26.53MB, Util: 100%  global_step : 11769
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1770 | Mem: 26.53MB, Util: 100%  global_step : 11770
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1771 | Mem: 26.53MB, Util: 100%  global_step : 11771
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1772 | Mem: 26.53MB, Util: 100%  global_step : 11772
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1773 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  90%|████████▉ | 1795/2000 [00:13<00:01, 138.56it/s]
[Rank 0] Train Epoch 5:  89%|████████▉ | 1787/2000 [00:13<00:01, 138.65it/s]
[Rank 1] Train Epoch 5:  89%|████████▉ | 1782/2000 [00:13<00:01, 132.81it/s]
[Rank 0] Train Epoch 5:  90%|█████████ | 1801/2000 [00:13<00:01, 123.75it/s]
[Rank 1] Train Epoch 5:  90%|████████▉ | 1796/2000 [00:13<00:01, 132.18it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1796 | Mem: 26.53MB, Util: 100%  global_step : 11796
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1797 | Mem: 26.53MB, Util: 100%  global_step : 11797
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1798 | Mem: 26.53MB, Util: 100%  global_step : 11798
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1799 | Mem: 26.53MB, Util: 100%  global_step : 11799
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1800 | Mem: 26.53MB, Util: 100%  global_step : 11800
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1801 | Mem: 26.53MB, Util: 100%  global_step : 11801
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1802 | Mem: 26.53MB, Util: 100%  global_step : 11802
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1803 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  90%|█████████ | 1809/2000 [00:13<00:01, 124.35it/s]
[Rank 0] Train Epoch 5:  91%|█████████ | 1816/2000 [00:13<00:01, 128.65it/s]
[Rank 1] Train Epoch 5:  90%|█████████ | 1810/2000 [00:13<00:01, 132.02it/s]
[Rank 2] Train Epoch 5:  91%|█████████ | 1824/2000 [00:13<00:01, 129.39it/s]
[Rank 0] Train Epoch 5:  92%|█████████▏| 1831/2000 [00:13<00:01, 132.16it/s]
[Rank 1] Train Epoch 5:  91%|█████████ | 1824/2000 [00:13<00:01, 130.68it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1819 | Mem: 26.53MB, Util: 100%  global_step : 11819
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1820 | Mem: 26.53MB, Util: 100%  global_step : 11820
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1821 | Mem: 26.53MB, Util: 100%  global_step : 11821
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1822 | Mem: 26.53MB, Util: 100%  global_step : 11822
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1823 | Mem: 26.53MB, Util: 100%  global_step : 11823
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1824 | Mem: 26.53MB, Util: 100%  global_step : 11824
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1825 | Mem: 26.53MB, Util: 100%  global_step : 11825
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1826 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  92%|█████████▏| 1839/2000 [00:13<00:01, 132.58it/s]
[Rank 0] Train Epoch 5:  92%|█████████▏| 1846/2000 [00:13<00:01, 134.98it/s]
[Rank 1] Train Epoch 5:  92%|█████████▏| 1838/2000 [00:13<00:01, 130.20it/s]
[Rank 2] Train Epoch 5:  93%|█████████▎| 1853/2000 [00:13<00:01, 133.03it/s]
[Rank 0] Train Epoch 5:  93%|█████████▎| 1861/2000 [00:13<00:01, 137.21it/s]
[Rank 1] Train Epoch 5:  93%|█████████▎| 1852/2000 [00:13<00:01, 130.22it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1848 | Mem: 26.53MB, Util: 97%  global_step : 11848
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1849 | Mem: 26.53MB, Util: 97%  global_step : 11849
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1850 | Mem: 26.53MB, Util: 97%  global_step : 11850
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1851 | Mem: 26.53MB, Util: 97%  global_step : 11851
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1852 | Mem: 26.53MB, Util: 97%  global_step : 11852
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1853 | Mem: 26.53MB, Util: 97%  global_step : 11853
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1854 | Mem: 26.53MB, Util: 97%  global_step : 11854
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1855 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 5:  93%|█████████▎| 1867/2000 [00:13<00:00, 133.66it/s]
[Rank 0] Train Epoch 5:  94%|█████████▍| 1875/2000 [00:13<00:00, 137.79it/s]
[Rank 1] Train Epoch 5:  93%|█████████▎| 1866/2000 [00:13<00:01, 130.94it/s]
[Rank 2] Train Epoch 5:  94%|█████████▍| 1881/2000 [00:13<00:00, 133.69it/s]
[Rank 0] Train Epoch 5:  94%|█████████▍| 1890/2000 [00:13<00:00, 139.21it/s]
[Rank 1] Train Epoch 5:  94%|█████████▍| 1880/2000 [00:13<00:00, 131.05it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1876 | Mem: 26.53MB, Util: 100%  global_step : 11876
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1877 | Mem: 26.53MB, Util: 100%  global_step : 11877
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1878 | Mem: 26.53MB, Util: 100%  global_step : 11878
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1879 | Mem: 26.53MB, Util: 100%  global_step : 11879
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1880 | Mem: 26.53MB, Util: 100%  global_step : 11880
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1881 | Mem: 26.53MB, Util: 100%  global_step : 11881
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1882 | Mem: 26.53MB, Util: 100%  global_step : 11882
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1883 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  95%|█████████▍| 1895/2000 [00:13<00:00, 135.05it/s]
[Rank 1] Train Epoch 5:  95%|█████████▍| 1894/2000 [00:13<00:00, 131.46it/s]
[Rank 2] Train Epoch 5:  95%|█████████▌| 1909/2000 [00:14<00:00, 122.14it/s]
[Rank 0] Train Epoch 5:  95%|█████████▌| 1904/2000 [00:13<00:00, 119.33it/s]
[Rank 1] Train Epoch 5:  95%|█████████▌| 1908/2000 [00:14<00:00, 131.10it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1900 | Mem: 26.53MB, Util: 100%  global_step : 11900
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1901 | Mem: 26.53MB, Util: 100%  global_step : 11901
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1902 | Mem: 26.53MB, Util: 100%  global_step : 11902
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1903 | Mem: 26.53MB, Util: 100%  global_step : 11903
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1904 | Mem: 26.53MB, Util: 100%  global_step : 11904
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1905 | Mem: 26.53MB, Util: 100%  global_step : 11905
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1906 | Mem: 26.53MB, Util: 100%  global_step : 11906
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1907 | Mem: 26.53MB, 

[Rank 2] Train Epoch 5:  96%|█████████▌| 1923/2000 [00:14<00:00, 126.69it/s]
[Rank 0] Train Epoch 5:  96%|█████████▌| 1919/2000 [00:14<00:00, 125.36it/s]
[Rank 1] Train Epoch 5:  96%|█████████▌| 1922/2000 [00:14<00:00, 131.25it/s]
[Rank 2] Train Epoch 5:  97%|█████████▋| 1937/2000 [00:14<00:00, 130.33it/s]
[Rank 1] Train Epoch 5:  97%|█████████▋| 1936/2000 [00:14<00:00, 131.29it/s]
[Rank 0] Train Epoch 5:  97%|█████████▋| 1933/2000 [00:14<00:00, 128.73it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1928 | Mem: 26.53MB, Util: 97%  global_step : 11928
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1929 | Mem: 26.53MB, Util: 97%  global_step : 11929
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1930 | Mem: 26.53MB, Util: 97%  global_step : 11930
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1931 | Mem: 26.53MB, Util: 97%  global_step : 11931
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1932 | Mem: 26.53MB, Util: 97%  global_step : 11932
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1933 | Mem: 26.53MB, Util: 97%  global_step : 11933
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1934 | Mem: 26.53MB, Util: 97%  global_step : 11934
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1935 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 5:  98%|█████████▊| 1951/2000 [00:14<00:00, 132.76it/s]
[Rank 1] Train Epoch 5:  98%|█████████▊| 1950/2000 [00:14<00:00, 130.99it/s]
[Rank 0] Train Epoch 5:  97%|█████████▋| 1948/2000 [00:14<00:00, 132.33it/s]
[Rank 0] Train Epoch 5:  98%|█████████▊| 1962/2000 [00:14<00:00, 134.33it/s]
[Rank 2] Train Epoch 5:  98%|█████████▊| 1965/2000 [00:14<00:00, 133.58it/s]
[Rank 1] Train Epoch 5:  98%|█████████▊| 1964/2000 [00:14<00:00, 130.56it/s]
[Rank 0] Train Epoch 5:  99%|█████████▉| 1977/2000 [00:14<00:00, 136.64it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1957 | Mem: 26.53MB, Util: 99%  global_step : 11957
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1958 | Mem: 26.53MB, Util: 99%  global_step : 11958
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1959 | Mem: 26.53MB, Util: 99%  global_step : 11959
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1960 | Mem: 26.53MB, Util: 99%  global_step : 11960
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1961 | Mem: 26.53MB, Util: 99%  global_step : 11961
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1962 | Mem: 26.53MB, Util: 99%  global_step : 11962
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1963 | Mem: 26.53MB, Util: 99%  global_step : 11963
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1964 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 5:  99%|█████████▉| 1979/2000 [00:14<00:00, 134.68it/s]
[Rank 1] Train Epoch 5:  99%|█████████▉| 1978/2000 [00:14<00:00, 131.03it/s]
[Rank 2] Train Epoch 5: 100%|██████████| 2000/2000 [00:14<00:00, 135.97it/s]
[Rank 2] Test Epoch 5:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Train Epoch 5: 100%|█████████▉| 1992/2000 [00:14<00:00, 131.06it/s]
[Rank 0] Train Epoch 5: 100%|██████████| 2000/2000 [00:14<00:00, 136.21it/s]
[Rank 0] Test Epoch 5:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1986 | Mem: 26.53MB, Util: 100%  global_step : 11986
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1987 | Mem: 26.53MB, Util: 100%  global_step : 11987
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1988 | Mem: 26.53MB, Util: 100%  global_step : 11988
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1989 | Mem: 26.53MB, Util: 100%  global_step : 11989
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1990 | Mem: 26.53MB, Util: 100%  global_step : 11990
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1991 | Mem: 26.53MB, Util: 100%  global_step : 11991
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1992 | Mem: 26.53MB, Util: 100%  global_step : 11992
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 5, Batch 1993 | Mem: 26.53MB, 

[Rank 2] Test Epoch 5:   6%|▌         | 19/334 [00:00<00:01, 186.90it/s]
[Rank 1] Train Epoch 5: 100%|██████████| 2000/2000 [00:14<00:00, 135.82it/s]
[Rank 1] Test Epoch 5:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Test Epoch 5:  10%|▉         | 33/334 [00:00<00:00, 328.86it/s]
[Rank 0] Test Epoch 5:   6%|▋         | 21/334 [00:00<00:01, 207.45it/s]
[Rank 2] Test Epoch 5:  15%|█▍        | 50/334 [00:00<00:01, 254.13it/s]
[Rank 1] Test Epoch 5:  20%|██        | 68/334 [00:00<00:00, 336.40it/s]
[Rank 0] Test Epoch 5:  16%|█▋        | 55/334 [00:00<00:00, 284.73it/s]
[Rank 2] Test Epoch 5:  23%|██▎       | 77/334 [00:00<00:00, 258.71it/s]
[Rank 1] Test Epoch 5:  31%|███       | 104/334 [00:00<00:00, 345.30it/s]
[Rank 0] Test Epoch 5:  27%|██▋       | 89/334 [00:00<00:00, 308.89it/s]
[Rank 2] Test Epoch 5:  32%|███▏      | 108/334 [00:00<00:00, 276.35it/s]
[Rank 1] Test Epoch 5:  42%|████▏     | 140/334 [00:00<00:00, 347.50it/s]
[Rank 0] Test Epoch 5:  37%|███▋      | 124/334 [00:00

[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [Rank 1] Epoch 5 | Loss: 0.3746, Acc: 0.8620, Model Checksum: a79a70f6b682fe903b506a63e5f28422
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [ NodeId d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83243d Rank 1] Epoch 5 | Loss: 0.3746, Acc: 0.8620, Model Checksum: a79a70f6b682fe903b506a63e5f28422
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 12000
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1 | Mem: 26.53MB, Util: 3%  global_step : 12001
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 2 | Mem: 26.53MB, Util: 3%  global_step : 12002
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 3 | Mem: 26.53MB, Util: 3%  global_step : 12003
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 4 | Mem: 26.53MB, Util: 3%  global_step : 12004
[36m(RayTrainWo

[Rank 2] Test Epoch 5:  97%|█████████▋| 325/334 [00:01<00:00, 302.40it/s]
[Rank 1] Train Epoch 6:   1%|          | 14/2000 [00:00<00:14, 139.98it/s]
[Rank 0] Train Epoch 6:   1%|          | 14/2000 [00:00<00:14, 137.72it/s]
[Rank 2] Test Epoch 5: 100%|██████████| 334/334 [00:01<00:00, 290.78it/s]
[Rank 2] Train Epoch 6:   0%|          | 0/2000 [00:00<?, ?it/s]
[Rank 1] Train Epoch 6:   1%|▏         | 29/2000 [00:00<00:14, 140.27it/s]
[Rank 0] Train Epoch 6:   1%|▏         | 28/2000 [00:00<00:14, 137.31it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 22 | Mem: 26.53MB, Util: 17%  global_step : 12022
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 23 | Mem: 26.53MB, Util: 17%  global_step : 12023
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 24 | Mem: 26.53MB, Util: 17%  global_step : 12024
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 25 | Mem: 26.53MB, Util: 17%  global_step : 12025
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 26 | Mem: 26.53MB, Util: 17%  global_step : 12026
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 27 | Mem: 26.53MB, Util: 17%  global_step : 12027
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 28 | Mem: 26.53MB, Util: 17%  global_step : 12028
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 29 | Mem: 26.53MB, Util: 17%  glob

[Rank 2] Train Epoch 6:   1%|          | 13/2000 [00:00<00:15, 125.03it/s]
[Rank 1] Train Epoch 6:   2%|▏         | 44/2000 [00:00<00:14, 136.65it/s]
[Rank 0] Train Epoch 6:   2%|▏         | 42/2000 [00:00<00:14, 134.57it/s]
[Rank 2] Train Epoch 6:   1%|▏         | 28/2000 [00:00<00:14, 134.14it/s]
[Rank 1] Train Epoch 6:   3%|▎         | 58/2000 [00:00<00:14, 135.25it/s]
[Rank 0] Train Epoch 6:   3%|▎         | 56/2000 [00:00<00:14, 134.02it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 50 | Mem: 26.53MB, Util: 100%  global_step : 12050
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 51 | Mem: 26.53MB, Util: 100%  global_step : 12051
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 52 | Mem: 26.53MB, Util: 100%  global_step : 12052
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 53 | Mem: 26.53MB, Util: 100%  global_step : 12053
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 54 | Mem: 26.53MB, Util: 100%  global_step : 12054
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 55 | Mem: 26.53MB, Util: 100%  global_step : 12055
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 56 | Mem: 26.53MB, Util: 100%  global_step : 12056
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 57 | Mem: 26.53MB, Util: 10

[Rank 2] Train Epoch 6:   2%|▏         | 43/2000 [00:00<00:14, 136.82it/s]
[Rank 1] Train Epoch 6:   4%|▎         | 72/2000 [00:00<00:14, 135.17it/s]
[Rank 0] Train Epoch 6:   4%|▎         | 70/2000 [00:00<00:14, 134.92it/s]
[Rank 2] Train Epoch 6:   3%|▎         | 57/2000 [00:00<00:14, 137.90it/s]
[Rank 1] Train Epoch 6:   4%|▍         | 86/2000 [00:00<00:14, 134.47it/s]
[Rank 0] Train Epoch 6:   4%|▍         | 84/2000 [00:00<00:14, 134.46it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 78 | Mem: 26.53MB, Util: 100%  global_step : 12078
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 79 | Mem: 26.53MB, Util: 100%  global_step : 12079
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 80 | Mem: 26.53MB, Util: 100%  global_step : 12080
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 81 | Mem: 26.53MB, Util: 100%  global_step : 12081
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 82 | Mem: 26.53MB, Util: 100%  global_step : 12082
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 83 | Mem: 26.53MB, Util: 100%  global_step : 12083
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 84 | Mem: 26.53MB, Util: 100%  global_step : 12084
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 85 | Mem: 26.53MB, Util: 10

[Rank 2] Train Epoch 6:   4%|▎         | 71/2000 [00:00<00:14, 134.74it/s]
[Rank 1] Train Epoch 6:   5%|▌         | 100/2000 [00:00<00:14, 135.00it/s]
[Rank 0] Train Epoch 6:   5%|▍         | 98/2000 [00:00<00:14, 134.08it/s]
[Rank 2] Train Epoch 6:   4%|▍         | 85/2000 [00:00<00:14, 135.52it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 100 | Mem: 26.53MB, Util: 100%  global_step : 12100
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 101 | Mem: 26.53MB, Util: 100%  global_step : 12101
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 102 | Mem: 26.53MB, Util: 100%  global_step : 12102
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 97 | Mem: 26.53MB, Util: 62%  global_step : 12097
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 98 | Mem: 26.53MB, Util: 62%  global_step : 12098
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 99 | Mem: 26.53MB, Util: 62%  global_step : 12099
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 100 | Mem: 26.53MB, Util: 62%  global_step : 12100
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 101 | Mem: 26.53MB, Util: 62%  global_

[Rank 2] Train Epoch 6:   5%|▍         | 99/2000 [00:00<00:13, 136.32it/s]
[Rank 0] Train Epoch 6:   6%|▌         | 112/2000 [00:00<00:19, 95.78it/s]
[Rank 2] Train Epoch 6:   6%|▌         | 113/2000 [00:00<00:13, 136.98it/s]
[Rank 1] Train Epoch 6:   6%|▌         | 114/2000 [00:00<00:20, 92.37it/s] 
[Rank 0] Train Epoch 6:   6%|▋         | 127/2000 [00:01<00:17, 108.23it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 117 | Mem: 26.53MB, Util: 100%  global_step : 12117
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 118 | Mem: 26.53MB, Util: 100%  global_step : 12118
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 119 | Mem: 26.53MB, Util: 100%  global_step : 12119
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 120 | Mem: 26.53MB, Util: 100%  global_step : 12120
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 121 | Mem: 26.53MB, Util: 100%  global_step : 12121
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 122 | Mem: 26.53MB, Util: 100%  global_step : 12122
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 123 | Mem: 26.53MB, Util: 100%  global_step : 12123
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 124 | Mem: 26.53MB, Util: 100%  glob

[Rank 2] Train Epoch 6:   6%|▋         | 127/2000 [00:00<00:13, 137.54it/s]
[Rank 1] Train Epoch 6:   6%|▋         | 129/2000 [00:01<00:17, 104.92it/s]
[Rank 0] Train Epoch 6:   7%|▋         | 142/2000 [00:01<00:15, 118.68it/s]
[Rank 2] Train Epoch 6:   7%|▋         | 141/2000 [00:01<00:13, 137.60it/s]
[Rank 2] Train Epoch 6:   8%|▊         | 155/2000 [00:01<00:13, 138.08it/s]
[Rank 1] Train Epoch 6:   7%|▋         | 144/2000 [00:01<00:16, 114.52it/s]
[Rank 0] Train Epoch 6:   8%|▊         | 158/2000 [00:01<00:14, 128.42it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 149 | Mem: 26.53MB, Util: 83%  global_step : 12149
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 150 | Mem: 26.53MB, Util: 83%  global_step : 12150
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 151 | Mem: 26.53MB, Util: 83%  global_step : 12151
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 152 | Mem: 26.53MB, Util: 83%  global_step : 12152
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 153 | Mem: 26.53MB, Util: 83%  global_step : 12153
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 154 | Mem: 26.53MB, Util: 83%  global_step : 12154
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 155 | Mem: 26.53MB, Util: 83%  global_step : 12155
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 156 | Mem: 26.53MB, Util: 83%  global_step 

[Rank 2] Train Epoch 6:   8%|▊         | 170/2000 [00:01<00:13, 138.77it/s]
[Rank 1] Train Epoch 6:   8%|▊         | 159/2000 [00:01<00:15, 122.09it/s]
[Rank 0] Train Epoch 6:   9%|▊         | 174/2000 [00:01<00:13, 136.00it/s]
[Rank 2] Train Epoch 6:   9%|▉         | 184/2000 [00:01<00:13, 136.88it/s]
[Rank 1] Train Epoch 6:   9%|▊         | 174/2000 [00:01<00:14, 127.54it/s]
[Rank 0] Train Epoch 6:  10%|▉         | 190/2000 [00:01<00:12, 141.96it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 182 | Mem: 26.53MB, Util: 100%  global_step : 12182
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 183 | Mem: 26.53MB, Util: 100%  global_step : 12183
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 184 | Mem: 26.53MB, Util: 100%  global_step : 12184
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 185 | Mem: 26.53MB, Util: 100%  global_step : 12185
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 186 | Mem: 26.53MB, Util: 100%  global_step : 12186
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 187 | Mem: 26.53MB, Util: 100%  global_step : 12187
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 188 | Mem: 26.53MB, Util: 100%  global_step : 12188
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 189 | Mem: 26.53MB, Util: 100%  glob

[Rank 2] Train Epoch 6:  10%|█         | 200/2000 [00:01<00:12, 141.73it/s]
[Rank 1] Train Epoch 6:   9%|▉         | 189/2000 [00:01<00:13, 131.10it/s]
[Rank 0] Train Epoch 6:  10%|█         | 205/2000 [00:01<00:14, 123.81it/s]
[Rank 2] Train Epoch 6:  11%|█         | 215/2000 [00:01<00:12, 143.07it/s]
[Rank 1] Train Epoch 6:  10%|█         | 204/2000 [00:01<00:13, 133.66it/s]
[Rank 0] Train Epoch 6:  11%|█         | 221/2000 [00:01<00:13, 132.53it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 205 | Mem: 26.53MB, Util: 100%  global_step : 12205
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 206 | Mem: 26.53MB, Util: 100%  global_step : 12206
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 207 | Mem: 26.53MB, Util: 100%  global_step : 12207
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 208 | Mem: 26.53MB, Util: 100%  global_step : 12208
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 209 | Mem: 26.53MB, Util: 100%  global_step : 12209
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 210 | Mem: 26.53MB, Util: 100%  global_step : 12210
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 211 | Mem: 26.53MB, Util: 100%  global_step : 12211
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 212 | Mem: 26.53MB, Util: 100%  glob

[Rank 2] Train Epoch 6:  12%|█▏        | 231/2000 [00:01<00:12, 145.54it/s]
[Rank 1] Train Epoch 6:  11%|█         | 219/2000 [00:01<00:13, 136.06it/s]
[Rank 0] Train Epoch 6:  12%|█▏        | 237/2000 [00:01<00:12, 138.53it/s]
[Rank 2] Train Epoch 6:  12%|█▏        | 246/2000 [00:01<00:11, 146.42it/s]
[Rank 1] Train Epoch 6:  12%|█▏        | 234/2000 [00:01<00:12, 138.37it/s]
[Rank 0] Train Epoch 6:  13%|█▎        | 253/2000 [00:01<00:12, 142.76it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 238 | Mem: 26.53MB, Util: 96%  global_step : 12238
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 239 | Mem: 26.53MB, Util: 96%  global_step : 12239
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 240 | Mem: 26.53MB, Util: 96%  global_step : 12240
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 241 | Mem: 26.53MB, Util: 96%  global_step : 12241
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 242 | Mem: 26.53MB, Util: 96%  global_step : 12242
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 243 | Mem: 26.53MB, Util: 96%  global_step : 12243
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 244 | Mem: 26.53MB, Util: 96%  global_step : 12244
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 245 | Mem: 26.53MB, Util: 96%  global_step 

[Rank 2] Train Epoch 6:  13%|█▎        | 262/2000 [00:01<00:11, 148.24it/s]
[Rank 1] Train Epoch 6:  12%|█▏        | 249/2000 [00:01<00:12, 139.75it/s]
[Rank 0] Train Epoch 6:  13%|█▎        | 269/2000 [00:02<00:11, 146.28it/s]
[Rank 2] Train Epoch 6:  14%|█▍        | 278/2000 [00:01<00:11, 149.58it/s]
[Rank 1] Train Epoch 6:  13%|█▎        | 264/2000 [00:02<00:12, 141.08it/s]
[Rank 0] Train Epoch 6:  14%|█▍        | 284/2000 [00:02<00:12, 141.72it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 270 | Mem: 26.53MB, Util: 100%  global_step : 12270
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 271 | Mem: 26.53MB, Util: 100%  global_step : 12271
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 272 | Mem: 26.53MB, Util: 100%  global_step : 12272
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 273 | Mem: 26.53MB, Util: 100%  global_step : 12273
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 274 | Mem: 26.53MB, Util: 100%  global_step : 12274
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 275 | Mem: 26.53MB, Util: 100%  global_step : 12275
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 276 | Mem: 26.53MB, Util: 100%  global_step : 12276
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 277 | Mem: 26.53MB, Util: 100%  glob

[Rank 2] Train Epoch 6:  15%|█▍        | 294/2000 [00:02<00:11, 150.26it/s]
[Rank 1] Train Epoch 6:  14%|█▍        | 279/2000 [00:02<00:12, 140.95it/s]
[Rank 0] Train Epoch 6:  15%|█▌        | 300/2000 [00:02<00:11, 144.72it/s]
[Rank 1] Train Epoch 6:  15%|█▍        | 294/2000 [00:02<00:12, 141.32it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 300 | Mem: 26.53MB, Util: 100%  global_step : 12300
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 301 | Mem: 26.53MB, Util: 100%  global_step : 12301
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 302 | Mem: 26.53MB, Util: 100%  global_step : 12302
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 303 | Mem: 26.53MB, Util: 100%  global_step : 12303
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 304 | Mem: 26.53MB, Util: 100%  global_step : 12304
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 305 | Mem: 26.53MB, Util: 100%  global_step : 12305
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 306 | Mem: 26.53MB, Util: 100%  global_step : 12306
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 307 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 6:  15%|█▌        | 309/2000 [00:02<00:11, 141.63it/s]
[Rank 2] Train Epoch 6:  16%|█▌        | 310/2000 [00:02<00:16, 104.76it/s]
[Rank 1] Train Epoch 6:  16%|█▌        | 324/2000 [00:02<00:11, 143.63it/s]
[Rank 2] Train Epoch 6:  16%|█▌        | 323/2000 [00:02<00:15, 110.22it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 317 | Mem: 26.53MB, Util: 37%  global_step : 12317
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 318 | Mem: 26.53MB, Util: 37%  global_step : 12318
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 319 | Mem: 26.53MB, Util: 37%  global_step : 12319
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 320 | Mem: 26.53MB, Util: 37%  global_step : 12320
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 321 | Mem: 26.53MB, Util: 37%  global_step : 12321
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 322 | Mem: 26.53MB, Util: 37%  global_step : 12322
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 323 | Mem: 26.53MB, Util: 37%  global_step : 12323
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 324 | Mem: 26.53MB, Util: 37%  glob

[Rank 0] Train Epoch 6:  16%|█▌        | 315/2000 [00:03<00:47, 35.82it/s] 
[Rank 2] Train Epoch 6:  17%|█▋        | 338/2000 [00:03<00:39, 41.64it/s] 
[Rank 1] Train Epoch 6:  17%|█▋        | 339/2000 [00:03<00:41, 40.00it/s] 
[Rank 0] Train Epoch 6:  16%|█▋        | 329/2000 [00:03<00:37, 45.10it/s]
[Rank 2] Train Epoch 6:  18%|█▊        | 352/2000 [00:03<00:31, 51.82it/s]
[Rank 1] Train Epoch 6:  18%|█▊        | 353/2000 [00:03<00:32, 49.99it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 312 | Mem: 26.53MB, Util: 100%  global_step : 12312
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 313 | Mem: 26.53MB, Util: 0%  global_step : 12313
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 314 | Mem: 26.53MB, Util: 0%  global_step : 12314
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 315 | Mem: 26.53MB, Util: 0%  global_step : 12315
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 316 | Mem: 26.53MB, Util: 0%  global_step : 12316
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 317 | Mem: 26.53MB, Util: 0%  global_step : 12317
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 318 | Mem: 26.53MB, Util: 0%  global_step : 12318
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 6, Batch 337 | Mem: 26.53MB, Util: 100%  global_step : 1

[Rank 0] Train Epoch 6:  17%|█▋        | 344/2000 [00:03<00:29, 56.89it/s]
[Rank 2] Train Epoch 6:  18%|█▊        | 367/2000 [00:03<00:25, 64.38it/s]
[Rank 1] Train Epoch 6:  18%|█▊        | 367/2000 [00:03<00:26, 61.32it/s]
[Rank 0] Train Epoch 6:  18%|█▊        | 359/2000 [00:03<00:23, 69.26it/s]
[Rank 2] Train Epoch 6:  19%|█▉        | 382/2000 [00:03<00:20, 77.29it/s]
[Rank 1] Train Epoch 6:  19%|█▉        | 381/2000 [00:03<00:22, 73.09it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 334 | Mem: 26.53MB, Util: 0%  global_step : 12334
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 335 | Mem: 26.53MB, Util: 0%  global_step : 12335
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 336 | Mem: 26.53MB, Util: 0%  global_step : 12336
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 337 | Mem: 26.53MB, Util: 0%  global_step : 12337
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 338 | Mem: 26.53MB, Util: 0%  global_step : 12338
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 339 | Mem: 26.53MB, Util: 0%  global_step : 12339
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 340 | Mem: 26.53MB, Util: 0%  global_step : 12340
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 341 | Mem: 26.53MB, Util: 0%  global_step : 12341


[Rank 0] Train Epoch 6:  19%|█▊        | 374/2000 [00:03<00:19, 82.02it/s]
[Rank 2] Train Epoch 6:  20%|█▉        | 397/2000 [00:03<00:17, 90.14it/s]
[Rank 1] Train Epoch 6:  20%|█▉        | 396/2000 [00:03<00:18, 86.53it/s]
[Rank 0] Train Epoch 6:  19%|█▉        | 389/2000 [00:03<00:17, 94.16it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 365 | Mem: 26.53MB, Util: 61%  global_step : 12365
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 366 | Mem: 26.53MB, Util: 61%  global_step : 12366
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 367 | Mem: 26.53MB, Util: 61%  global_step : 12367
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 368 | Mem: 26.53MB, Util: 61%  global_step : 12368
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 369 | Mem: 26.53MB, Util: 61%  global_step : 12369
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 370 | Mem: 26.53MB, Util: 61%  global_step : 12370
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 371 | Mem: 26.53MB, Util: 61%  global_step : 12371
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 372 | Mem: 26.53MB, Util: 61%  global_step 

[Rank 0] Train Epoch 6:  20%|██        | 404/2000 [00:04<00:15, 103.66it/s]
[Rank 2] Train Epoch 6:  21%|██        | 411/2000 [00:03<00:20, 76.05it/s]
[Rank 1] Train Epoch 6:  20%|██        | 410/2000 [00:04<00:21, 74.78it/s]
[Rank 0] Train Epoch 6:  21%|██        | 419/2000 [00:04<00:14, 112.57it/s]
[Rank 2] Train Epoch 6:  21%|██▏       | 426/2000 [00:04<00:17, 89.36it/s]
[Rank 1] Train Epoch 6:  21%|██        | 424/2000 [00:04<00:18, 85.84it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 395 | Mem: 26.53MB, Util: 73%  global_step : 12395
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 396 | Mem: 26.53MB, Util: 73%  global_step : 12396
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 397 | Mem: 26.53MB, Util: 73%  global_step : 12397
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 398 | Mem: 26.53MB, Util: 73%  global_step : 12398
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 399 | Mem: 26.53MB, Util: 73%  global_step : 12399
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 400 | Mem: 26.53MB, Util: 73%  global_step : 12400
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 401 | Mem: 26.53MB, Util: 73%  global_step : 12401
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 402 | Mem: 26.53MB, Util: 73%  global_step 

[Rank 0] Train Epoch 6:  22%|██▏       | 434/2000 [00:04<00:13, 120.15it/s]
[Rank 2] Train Epoch 6:  22%|██▏       | 441/2000 [00:04<00:15, 101.75it/s]
[Rank 1] Train Epoch 6:  22%|██▏       | 439/2000 [00:04<00:15, 97.90it/s]
[Rank 0] Train Epoch 6:  22%|██▏       | 448/2000 [00:04<00:12, 123.05it/s]
[Rank 1] Train Epoch 6:  23%|██▎       | 454/2000 [00:04<00:14, 108.33it/s]
[Rank 2] Train Epoch 6:  23%|██▎       | 457/2000 [00:04<00:13, 113.77it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 424 | Mem: 26.53MB, Util: 71%  global_step : 12424
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 425 | Mem: 26.53MB, Util: 71%  global_step : 12425
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 426 | Mem: 26.53MB, Util: 71%  global_step : 12426
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 427 | Mem: 26.53MB, Util: 71%  global_step : 12427
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 428 | Mem: 26.53MB, Util: 71%  global_step : 12428
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 429 | Mem: 26.53MB, Util: 71%  global_step : 12429
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 430 | Mem: 26.53MB, Util: 71%  global_step : 12430
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 431 | Mem: 26.53MB, Util: 71%  global_step 

[Rank 0] Train Epoch 6:  23%|██▎       | 463/2000 [00:04<00:11, 129.06it/s]
[Rank 1] Train Epoch 6:  23%|██▎       | 469/2000 [00:04<00:13, 116.59it/s]
[Rank 2] Train Epoch 6:  24%|██▎       | 473/2000 [00:04<00:12, 123.30it/s]
[Rank 0] Train Epoch 6:  24%|██▍       | 478/2000 [00:04<00:11, 133.29it/s]
[Rank 1] Train Epoch 6:  24%|██▍       | 484/2000 [00:04<00:12, 123.42it/s]
[Rank 2] Train Epoch 6:  24%|██▍       | 489/2000 [00:04<00:11, 130.60it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 453 | Mem: 26.53MB, Util: 72%  global_step : 12453
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 454 | Mem: 26.53MB, Util: 72%  global_step : 12454
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 455 | Mem: 26.53MB, Util: 72%  global_step : 12455
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 456 | Mem: 26.53MB, Util: 72%  global_step : 12456
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 457 | Mem: 26.53MB, Util: 72%  global_step : 12457
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 458 | Mem: 26.53MB, Util: 76%  global_step : 12458
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 459 | Mem: 26.53MB, Util: 76%  global_step : 12459
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 460 | Mem: 26.53MB, Util: 76%  global_step 

[Rank 0] Train Epoch 6:  25%|██▍       | 493/2000 [00:04<00:11, 136.36it/s]
[Rank 2] Train Epoch 6:  25%|██▌       | 504/2000 [00:04<00:12, 119.90it/s]
[Rank 1] Train Epoch 6:  25%|██▍       | 498/2000 [00:04<00:11, 127.59it/s]
[Rank 0] Train Epoch 6:  25%|██▌       | 508/2000 [00:04<00:10, 135.79it/s]
[Rank 2] Train Epoch 6:  26%|██▌       | 519/2000 [00:04<00:11, 127.40it/s]
[Rank 1] Train Epoch 6:  26%|██▌       | 512/2000 [00:04<00:11, 130.69it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 483 | Mem: 26.53MB, Util: 76%  global_step : 12483
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 484 | Mem: 26.53MB, Util: 76%  global_step : 12484
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 485 | Mem: 26.53MB, Util: 76%  global_step : 12485
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 486 | Mem: 26.53MB, Util: 76%  global_step : 12486
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 487 | Mem: 26.53MB, Util: 76%  global_step : 12487
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 488 | Mem: 26.53MB, Util: 72%  global_step : 12488
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 489 | Mem: 26.53MB, Util: 72%  global_step : 12489
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 490 | Mem: 26.53MB, Util: 72%  global_step 

[Rank 0] Train Epoch 6:  26%|██▌       | 522/2000 [00:04<00:11, 133.99it/s]
[Rank 2] Train Epoch 6:  27%|██▋       | 535/2000 [00:04<00:10, 134.37it/s]
[Rank 1] Train Epoch 6:  26%|██▋       | 526/2000 [00:04<00:11, 133.13it/s]
[Rank 0] Train Epoch 6:  27%|██▋       | 536/2000 [00:05<00:10, 133.21it/s]
[Rank 2] Train Epoch 6:  28%|██▊       | 551/2000 [00:04<00:10, 139.20it/s]
[Rank 1] Train Epoch 6:  27%|██▋       | 540/2000 [00:05<00:10, 134.72it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 512 | Mem: 26.53MB, Util: 72%  global_step : 12512
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 513 | Mem: 26.53MB, Util: 72%  global_step : 12513
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 514 | Mem: 26.53MB, Util: 72%  global_step : 12514
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 515 | Mem: 26.53MB, Util: 73%  global_step : 12515
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 516 | Mem: 26.53MB, Util: 73%  global_step : 12516
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 517 | Mem: 26.53MB, Util: 73%  global_step : 12517
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 518 | Mem: 26.53MB, Util: 73%  global_step : 12518
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 519 | Mem: 26.53MB, Util: 73%  global_step 

[Rank 0] Train Epoch 6:  28%|██▊       | 550/2000 [00:05<00:10, 133.63it/s]
[Rank 1] Train Epoch 6:  28%|██▊       | 555/2000 [00:05<00:10, 137.04it/s]
[Rank 2] Train Epoch 6:  28%|██▊       | 567/2000 [00:05<00:10, 142.72it/s]
[Rank 0] Train Epoch 6:  28%|██▊       | 564/2000 [00:05<00:10, 134.22it/s]
[Rank 1] Train Epoch 6:  28%|██▊       | 570/2000 [00:05<00:10, 137.46it/s]
[Rank 2] Train Epoch 6:  29%|██▉       | 583/2000 [00:05<00:09, 145.05it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 540 | Mem: 26.53MB, Util: 73%  global_step : 12540
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 541 | Mem: 26.53MB, Util: 73%  global_step : 12541
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 542 | Mem: 26.53MB, Util: 73%  global_step : 12542
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 543 | Mem: 26.53MB, Util: 64%  global_step : 12543
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 544 | Mem: 26.53MB, Util: 64%  global_step : 12544
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 545 | Mem: 26.53MB, Util: 64%  global_step : 12545
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 546 | Mem: 26.53MB, Util: 64%  global_step : 12546
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 547 | Mem: 26.53MB, Util: 64%  global_step 

[Rank 0] Train Epoch 6:  29%|██▉       | 578/2000 [00:05<00:10, 134.13it/s]
[Rank 1] Train Epoch 6:  29%|██▉       | 584/2000 [00:05<00:10, 134.10it/s]
[Rank 2] Train Epoch 6:  30%|██▉       | 599/2000 [00:05<00:09, 147.46it/s]
[Rank 0] Train Epoch 6:  30%|██▉       | 592/2000 [00:05<00:10, 133.67it/s]
[Rank 1] Train Epoch 6:  30%|██▉       | 598/2000 [00:05<00:10, 134.08it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 568 | Mem: 26.53MB, Util: 64%  global_step : 12568
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 569 | Mem: 26.53MB, Util: 64%  global_step : 12569
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 570 | Mem: 26.53MB, Util: 66%  global_step : 12570
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 571 | Mem: 26.53MB, Util: 66%  global_step : 12571
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 572 | Mem: 26.53MB, Util: 66%  global_step : 12572
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 573 | Mem: 26.53MB, Util: 66%  global_step : 12573
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 574 | Mem: 26.53MB, Util: 66%  global_step : 12574
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 575 | Mem: 26.53MB, Util: 66%  global_step 

[Rank 0] Train Epoch 6:  30%|███       | 606/2000 [00:05<00:10, 133.19it/s]
[Rank 2] Train Epoch 6:  31%|███       | 614/2000 [00:05<00:11, 117.43it/s]
[Rank 1] Train Epoch 6:  31%|███       | 612/2000 [00:05<00:11, 125.52it/s]
[Rank 0] Train Epoch 6:  31%|███       | 620/2000 [00:05<00:10, 132.94it/s]
[Rank 2] Train Epoch 6:  32%|███▏      | 630/2000 [00:05<00:10, 126.24it/s]
[Rank 1] Train Epoch 6:  31%|███▏      | 626/2000 [00:05<00:10, 127.14it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 596 | Mem: 26.53MB, Util: 66%  global_step : 12596
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 597 | Mem: 26.53MB, Util: 66%  global_step : 12597
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 598 | Mem: 26.53MB, Util: 66%  global_step : 12598
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 599 | Mem: 26.53MB, Util: 66%  global_step : 12599
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 600 | Mem: 26.53MB, Util: 66%  global_step : 12600
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 601 | Mem: 26.53MB, Util: 66%  global_step : 12601
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 602 | Mem: 26.53MB, Util: 66%  global_step : 12602
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 603 | Mem: 26.53MB, Util: 66%  global_step 

[Rank 0] Train Epoch 6:  32%|███▏      | 634/2000 [00:05<00:10, 132.76it/s]
[Rank 1] Train Epoch 6:  32%|███▏      | 640/2000 [00:05<00:10, 128.57it/s]
[Rank 2] Train Epoch 6:  32%|███▏      | 646/2000 [00:05<00:10, 133.05it/s]
[Rank 0] Train Epoch 6:  32%|███▏      | 648/2000 [00:05<00:10, 134.47it/s]
[Rank 1] Train Epoch 6:  33%|███▎      | 654/2000 [00:05<00:10, 129.21it/s]
[Rank 2] Train Epoch 6:  33%|███▎      | 661/2000 [00:05<00:09, 137.16it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 624 | Mem: 26.53MB, Util: 66%  global_step : 12624
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 625 | Mem: 26.53MB, Util: 66%  global_step : 12625
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 626 | Mem: 26.53MB, Util: 66%  global_step : 12626
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 627 | Mem: 26.53MB, Util: 65%  global_step : 12627
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 628 | Mem: 26.53MB, Util: 65%  global_step : 12628
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 629 | Mem: 26.53MB, Util: 65%  global_step : 12629
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 630 | Mem: 26.53MB, Util: 65%  global_step : 12630
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 631 | Mem: 26.53MB, Util: 65%  global_step 

[Rank 0] Train Epoch 6:  33%|███▎      | 662/2000 [00:05<00:09, 134.49it/s]
[Rank 2] Train Epoch 6:  34%|███▍      | 676/2000 [00:05<00:09, 140.61it/s]
[Rank 1] Train Epoch 6:  33%|███▎      | 667/2000 [00:06<00:10, 128.34it/s]
[Rank 0] Train Epoch 6:  34%|███▍      | 676/2000 [00:06<00:09, 135.98it/s]
[Rank 1] Train Epoch 6:  34%|███▍      | 682/2000 [00:06<00:09, 133.66it/s]
[Rank 2] Train Epoch 6:  35%|███▍      | 692/2000 [00:05<00:09, 144.28it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 652 | Mem: 26.53MB, Util: 65%  global_step : 12652
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 653 | Mem: 26.53MB, Util: 65%  global_step : 12653
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 654 | Mem: 26.53MB, Util: 65%  global_step : 12654
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 655 | Mem: 26.53MB, Util: 65%  global_step : 12655
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 656 | Mem: 26.53MB, Util: 74%  global_step : 12656
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 657 | Mem: 26.53MB, Util: 74%  global_step : 12657
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 658 | Mem: 26.53MB, Util: 74%  global_step : 12658
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 659 | Mem: 26.53MB, Util: 74%  global_step 

[Rank 0] Train Epoch 6:  35%|███▍      | 691/2000 [00:06<00:09, 137.47it/s]
[Rank 1] Train Epoch 6:  35%|███▍      | 697/2000 [00:06<00:09, 136.87it/s]
[Rank 0] Train Epoch 6:  35%|███▌      | 705/2000 [00:06<00:09, 134.96it/s]
[Rank 1] Train Epoch 6:  36%|███▌      | 712/2000 [00:06<00:09, 139.28it/s]
[Rank 2] Train Epoch 6:  35%|███▌      | 707/2000 [00:06<00:10, 119.21it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 682 | Mem: 26.53MB, Util: 74%  global_step : 12682
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 683 | Mem: 26.53MB, Util: 74%  global_step : 12683
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 684 | Mem: 26.53MB, Util: 100%  global_step : 12684
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 685 | Mem: 26.53MB, Util: 100%  global_step : 12685
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 686 | Mem: 26.53MB, Util: 100%  global_step : 12686
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 687 | Mem: 26.53MB, Util: 100%  global_step : 12687
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 688 | Mem: 26.53MB, Util: 100%  global_step : 12688
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 689 | Mem: 26.53MB, Util: 100%  global

[Rank 0] Train Epoch 6:  36%|███▌      | 719/2000 [00:06<00:09, 134.92it/s]
[Rank 1] Train Epoch 6:  36%|███▋      | 727/2000 [00:06<00:09, 140.06it/s]
[Rank 2] Train Epoch 6:  36%|███▌      | 722/2000 [00:06<00:10, 124.73it/s]
[Rank 0] Train Epoch 6:  37%|███▋      | 733/2000 [00:06<00:09, 135.16it/s]
[Rank 1] Train Epoch 6:  37%|███▋      | 742/2000 [00:06<00:09, 135.18it/s]
[Rank 2] Train Epoch 6:  37%|███▋      | 736/2000 [00:06<00:09, 127.86it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 710 | Mem: 26.53MB, Util: 100%  global_step : 12710
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 711 | Mem: 26.53MB, Util: 100%  global_step : 12711
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 712 | Mem: 26.53MB, Util: 100%  global_step : 12712
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 713 | Mem: 26.53MB, Util: 100%  global_step : 12713
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 714 | Mem: 26.53MB, Util: 100%  global_step : 12714
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 715 | Mem: 26.53MB, Util: 100%  global_step : 12715
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 716 | Mem: 26.53MB, Util: 100%  global_step : 12716
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 717 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 6:  37%|███▋      | 747/2000 [00:06<00:09, 133.58it/s]
[Rank 1] Train Epoch 6:  38%|███▊      | 756/2000 [00:06<00:09, 134.53it/s]
[Rank 2] Train Epoch 6:  38%|███▊      | 750/2000 [00:06<00:09, 130.37it/s]
[Rank 0] Train Epoch 6:  38%|███▊      | 762/2000 [00:06<00:09, 136.49it/s]
[Rank 1] Train Epoch 6:  38%|███▊      | 770/2000 [00:06<00:09, 133.40it/s]
[Rank 2] Train Epoch 6:  38%|███▊      | 765/2000 [00:06<00:09, 133.32it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 738 | Mem: 26.53MB, Util: 70%  global_step : 12738
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 739 | Mem: 26.53MB, Util: 70%  global_step : 12739
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 740 | Mem: 26.53MB, Util: 70%  global_step : 12740
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 741 | Mem: 26.53MB, Util: 70%  global_step : 12741
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 742 | Mem: 26.53MB, Util: 70%  global_step : 12742
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 743 | Mem: 26.53MB, Util: 70%  global_step : 12743
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 744 | Mem: 26.53MB, Util: 70%  global_step : 12744
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 745 | Mem: 26.53MB, Util: 70%  global_step 

[Rank 0] Train Epoch 6:  39%|███▉      | 777/2000 [00:06<00:08, 137.85it/s]
[Rank 2] Train Epoch 6:  39%|███▉      | 779/2000 [00:06<00:09, 134.68it/s]
[Rank 1] Train Epoch 6:  39%|███▉      | 784/2000 [00:06<00:09, 132.60it/s]
[Rank 0] Train Epoch 6:  40%|███▉      | 792/2000 [00:06<00:08, 139.79it/s]
[Rank 1] Train Epoch 6:  40%|███▉      | 798/2000 [00:07<00:09, 132.27it/s]
[Rank 2] Train Epoch 6:  40%|███▉      | 793/2000 [00:06<00:08, 135.67it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 767 | Mem: 26.53MB, Util: 65%  global_step : 12767
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 768 | Mem: 26.53MB, Util: 65%  global_step : 12768
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 769 | Mem: 26.53MB, Util: 65%  global_step : 12769
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 770 | Mem: 26.53MB, Util: 65%  global_step : 12770
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 771 | Mem: 26.53MB, Util: 65%  global_step : 12771
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 772 | Mem: 26.53MB, Util: 65%  global_step : 12772
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 773 | Mem: 26.53MB, Util: 65%  global_step : 12773
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 774 | Mem: 26.53MB, Util: 65%  global_step 

[Rank 0] Train Epoch 6:  40%|████      | 806/2000 [00:07<00:09, 129.92it/s]
[Rank 2] Train Epoch 6:  40%|████      | 807/2000 [00:06<00:09, 124.67it/s]
[Rank 1] Train Epoch 6:  41%|████      | 812/2000 [00:07<00:09, 131.76it/s]
[Rank 0] Train Epoch 6:  41%|████      | 820/2000 [00:07<00:08, 132.69it/s]
[Rank 2] Train Epoch 6:  41%|████      | 822/2000 [00:07<00:08, 130.91it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 797 | Mem: 26.53MB, Util: 92%  global_step : 12797
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 798 | Mem: 26.53MB, Util: 92%  global_step : 12798
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 799 | Mem: 26.53MB, Util: 92%  global_step : 12799
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 800 | Mem: 26.53MB, Util: 92%  global_step : 12800
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 801 | Mem: 26.53MB, Util: 92%  global_step : 12801
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 802 | Mem: 26.53MB, Util: 92%  global_step : 12802
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 803 | Mem: 26.53MB, Util: 92%  global_step : 12803
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 804 | Mem: 26.53MB, Util: 92%  global_step 

[Rank 0] Train Epoch 6:  42%|████▏     | 834/2000 [00:07<00:08, 134.60it/s]
[Rank 1] Train Epoch 6:  41%|████▏     | 826/2000 [00:07<00:08, 130.94it/s]
[Rank 2] Train Epoch 6:  42%|████▏     | 836/2000 [00:07<00:08, 132.40it/s]
[Rank 0] Train Epoch 6:  42%|████▏     | 849/2000 [00:07<00:08, 137.17it/s]
[Rank 1] Train Epoch 6:  42%|████▏     | 840/2000 [00:07<00:08, 131.01it/s]
[Rank 2] Train Epoch 6:  42%|████▎     | 850/2000 [00:07<00:08, 133.21it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 823 | Mem: 26.53MB, Util: 97%  global_step : 12823
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 824 | Mem: 26.53MB, Util: 97%  global_step : 12824
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 825 | Mem: 26.53MB, Util: 97%  global_step : 12825
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 826 | Mem: 26.53MB, Util: 97%  global_step : 12826
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 827 | Mem: 26.53MB, Util: 97%  global_step : 12827
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 828 | Mem: 26.53MB, Util: 97%  global_step : 12828
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 829 | Mem: 26.53MB, Util: 97%  global_step : 12829
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 830 | Mem: 26.53MB, Util: 97%  global_step 

[Rank 0] Train Epoch 6:  43%|████▎     | 864/2000 [00:07<00:08, 139.76it/s]
[Rank 1] Train Epoch 6:  43%|████▎     | 854/2000 [00:07<00:08, 131.07it/s]
[Rank 2] Train Epoch 6:  43%|████▎     | 864/2000 [00:07<00:08, 133.42it/s]
[Rank 0] Train Epoch 6:  44%|████▍     | 879/2000 [00:07<00:07, 140.37it/s]
[Rank 1] Train Epoch 6:  43%|████▎     | 868/2000 [00:07<00:08, 130.97it/s]
[Rank 2] Train Epoch 6:  44%|████▍     | 878/2000 [00:07<00:08, 133.25it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 853 | Mem: 26.53MB, Util: 100%  global_step : 12853
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 854 | Mem: 26.53MB, Util: 100%  global_step : 12854
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 855 | Mem: 26.53MB, Util: 100%  global_step : 12855
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 856 | Mem: 26.53MB, Util: 100%  global_step : 12856
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 857 | Mem: 26.53MB, Util: 100%  global_step : 12857
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 858 | Mem: 26.53MB, Util: 100%  global_step : 12858
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 859 | Mem: 26.53MB, Util: 100%  global_step : 12859
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 860 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 6:  45%|████▍     | 894/2000 [00:07<00:07, 141.79it/s]
[Rank 1] Train Epoch 6:  44%|████▍     | 882/2000 [00:07<00:08, 131.00it/s]
[Rank 2] Train Epoch 6:  45%|████▍     | 892/2000 [00:07<00:08, 134.17it/s]
[Rank 1] Train Epoch 6:  45%|████▍     | 896/2000 [00:07<00:08, 131.58it/s]
[Rank 2] Train Epoch 6:  45%|████▌     | 906/2000 [00:07<00:08, 123.48it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 883 | Mem: 26.53MB, Util: 100%  global_step : 12883
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 884 | Mem: 26.53MB, Util: 100%  global_step : 12884
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 885 | Mem: 26.53MB, Util: 100%  global_step : 12885
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 886 | Mem: 26.53MB, Util: 100%  global_step : 12886
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 887 | Mem: 26.53MB, Util: 100%  global_step : 12887
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 888 | Mem: 26.53MB, Util: 100%  global_step : 12888
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 889 | Mem: 26.53MB, Util: 100%  global_step : 12889
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 890 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 6:  45%|████▌     | 909/2000 [00:07<00:09, 120.16it/s]
[Rank 1] Train Epoch 6:  46%|████▌     | 910/2000 [00:07<00:08, 131.11it/s]
[Rank 2] Train Epoch 6:  46%|████▌     | 920/2000 [00:07<00:08, 126.74it/s]
[Rank 0] Train Epoch 6:  46%|████▌     | 924/2000 [00:07<00:08, 125.81it/s]
[Rank 1] Train Epoch 6:  46%|████▌     | 924/2000 [00:07<00:08, 131.00it/s]
[Rank 2] Train Epoch 6:  47%|████▋     | 933/2000 [00:07<00:08, 126.10it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 905 | Mem: 26.53MB, Util: 100%  global_step : 12905
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 906 | Mem: 26.53MB, Util: 100%  global_step : 12906
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 907 | Mem: 26.53MB, Util: 100%  global_step : 12907
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 908 | Mem: 26.53MB, Util: 100%  global_step : 12908
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 909 | Mem: 26.53MB, Util: 100%  global_step : 12909
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 910 | Mem: 26.53MB, Util: 100%  global_step : 12910
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 911 | Mem: 26.53MB, Util: 100%  global_step : 12911
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 912 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 6:  47%|████▋     | 939/2000 [00:08<00:08, 129.87it/s]
[Rank 1] Train Epoch 6:  47%|████▋     | 938/2000 [00:08<00:08, 130.86it/s]
[Rank 2] Train Epoch 6:  47%|████▋     | 947/2000 [00:07<00:08, 127.49it/s]
[Rank 0] Train Epoch 6:  48%|████▊     | 954/2000 [00:08<00:07, 133.72it/s]
[Rank 1] Train Epoch 6:  48%|████▊     | 952/2000 [00:08<00:07, 131.46it/s]
[Rank 2] Train Epoch 6:  48%|████▊     | 960/2000 [00:08<00:08, 127.02it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 935 | Mem: 26.53MB, Util: 99%  global_step : 12935
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 936 | Mem: 26.53MB, Util: 99%  global_step : 12936
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 937 | Mem: 26.53MB, Util: 99%  global_step : 12937
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 938 | Mem: 26.53MB, Util: 99%  global_step : 12938
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 939 | Mem: 26.53MB, Util: 99%  global_step : 12939
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 940 | Mem: 26.53MB, Util: 99%  global_step : 12940
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 941 | Mem: 26.53MB, Util: 99%  global_step : 12941
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 942 | Mem: 26.53MB, Util: 99%  global_step 

[Rank 0] Train Epoch 6:  48%|████▊     | 969/2000 [00:08<00:07, 137.32it/s]
[Rank 1] Train Epoch 6:  48%|████▊     | 966/2000 [00:08<00:07, 132.19it/s]
[Rank 2] Train Epoch 6:  49%|████▊     | 974/2000 [00:08<00:07, 130.04it/s]
[Rank 0] Train Epoch 6:  49%|████▉     | 984/2000 [00:08<00:07, 138.82it/s]
[Rank 1] Train Epoch 6:  49%|████▉     | 980/2000 [00:08<00:07, 133.48it/s]
[Rank 2] Train Epoch 6:  49%|████▉     | 988/2000 [00:08<00:07, 132.55it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 965 | Mem: 26.53MB, Util: 100%  global_step : 12965
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 966 | Mem: 26.53MB, Util: 100%  global_step : 12966
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 967 | Mem: 26.53MB, Util: 100%  global_step : 12967
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 968 | Mem: 26.53MB, Util: 100%  global_step : 12968
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 969 | Mem: 26.53MB, Util: 100%  global_step : 12969
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 970 | Mem: 26.53MB, Util: 100%  global_step : 12970
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 971 | Mem: 26.53MB, Util: 100%  global_step : 12971
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 972 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 6:  50%|████▉     | 999/2000 [00:08<00:07, 139.43it/s]
[Rank 1] Train Epoch 6:  50%|████▉     | 994/2000 [00:08<00:07, 134.77it/s]
[Rank 2] Train Epoch 6:  50%|█████     | 1002/2000 [00:08<00:07, 133.83it/s]
[Rank 0] Train Epoch 6:  51%|█████     | 1014/2000 [00:08<00:07, 123.53it/s]
[Rank 1] Train Epoch 6:  50%|█████     | 1008/2000 [00:08<00:07, 133.22it/s]
[Rank 2] Train Epoch 6:  51%|█████     | 1016/2000 [00:08<00:07, 134.78it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 995 | Mem: 26.53MB, Util: 100%  global_step : 12995
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 996 | Mem: 26.53MB, Util: 100%  global_step : 12996
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 997 | Mem: 26.53MB, Util: 100%  global_step : 12997
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 998 | Mem: 26.53MB, Util: 100%  global_step : 12998
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 999 | Mem: 26.53MB, Util: 100%  global_step : 12999
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1000 | Mem: 26.53MB, Util: 100%  global_step : 13000
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1001 | Mem: 26.53MB, Util: 100%  global_step : 13001
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1002 | Mem: 26.53MB, Util: 100%  g

[Rank 0] Train Epoch 6:  51%|█████▏    | 1029/2000 [00:08<00:07, 129.16it/s]
[Rank 1] Train Epoch 6:  51%|█████     | 1022/2000 [00:08<00:07, 132.72it/s]
[Rank 2] Train Epoch 6:  52%|█████▏    | 1030/2000 [00:08<00:07, 135.62it/s]
[Rank 0] Train Epoch 6:  52%|█████▏    | 1044/2000 [00:08<00:07, 132.47it/s]
[Rank 1] Train Epoch 6:  52%|█████▏    | 1036/2000 [00:08<00:07, 131.65it/s]
[Rank 2] Train Epoch 6:  52%|█████▏    | 1044/2000 [00:08<00:07, 135.64it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1018 | Mem: 26.53MB, Util: 100%  global_step : 13018
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1019 | Mem: 26.53MB, Util: 100%  global_step : 13019
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1020 | Mem: 26.53MB, Util: 100%  global_step : 13020
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1021 | Mem: 26.53MB, Util: 100%  global_step : 13021
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1022 | Mem: 26.53MB, Util: 100%  global_step : 13022
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1023 | Mem: 26.53MB, Util: 100%  global_step : 13023
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1024 | Mem: 26.53MB, Util: 100%  global_step : 13024
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1025 | Mem: 26.53MB, Util: 96

[Rank 0] Train Epoch 6:  53%|█████▎    | 1059/2000 [00:08<00:06, 135.36it/s]
[Rank 1] Train Epoch 6:  52%|█████▎    | 1050/2000 [00:08<00:07, 131.62it/s]
[Rank 2] Train Epoch 6:  53%|█████▎    | 1058/2000 [00:08<00:06, 136.14it/s]
[Rank 0] Train Epoch 6:  54%|█████▎    | 1074/2000 [00:09<00:06, 138.18it/s]
[Rank 1] Train Epoch 6:  53%|█████▎    | 1064/2000 [00:09<00:07, 131.95it/s]
[Rank 2] Train Epoch 6:  54%|█████▎    | 1072/2000 [00:08<00:06, 136.25it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1048 | Mem: 26.53MB, Util: 96%  global_step : 13048
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1049 | Mem: 26.53MB, Util: 96%  global_step : 13049
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1050 | Mem: 26.53MB, Util: 96%  global_step : 13050
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1051 | Mem: 26.53MB, Util: 96%  global_step : 13051
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1052 | Mem: 26.53MB, Util: 96%  global_step : 13052
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1053 | Mem: 26.53MB, Util: 96%  global_step : 13053
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1054 | Mem: 26.53MB, Util: 100%  global_step : 13054
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1055 | Mem: 26.53MB, Util: 100%  gl

[Rank 0] Train Epoch 6:  54%|█████▍    | 1088/2000 [00:09<00:06, 133.80it/s]
[Rank 1] Train Epoch 6:  54%|█████▍    | 1078/2000 [00:09<00:07, 131.50it/s]
[Rank 2] Train Epoch 6:  54%|█████▍    | 1086/2000 [00:09<00:06, 136.68it/s]
[Rank 1] Train Epoch 6:  55%|█████▍    | 1092/2000 [00:09<00:06, 131.35it/s]
[Rank 2] Train Epoch 6:  55%|█████▌    | 1100/2000 [00:09<00:06, 134.33it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1077 | Mem: 26.53MB, Util: 100%  global_step : 13077
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1078 | Mem: 26.53MB, Util: 100%  global_step : 13078
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1079 | Mem: 26.53MB, Util: 100%  global_step : 13079
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1080 | Mem: 26.53MB, Util: 100%  global_step : 13080
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1081 | Mem: 26.53MB, Util: 100%  global_step : 13081
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1082 | Mem: 26.53MB, Util: 100%  global_step : 13082
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1083 | Mem: 26.53MB, Util: 100%  global_step : 13083
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1084 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 6:  55%|█████▌    | 1102/2000 [00:09<00:07, 119.50it/s]
[Rank 1] Train Epoch 6:  55%|█████▌    | 1106/2000 [00:09<00:06, 131.34it/s]
[Rank 2] Train Epoch 6:  56%|█████▌    | 1114/2000 [00:09<00:07, 125.69it/s]
[Rank 0] Train Epoch 6:  56%|█████▌    | 1116/2000 [00:09<00:07, 124.81it/s]
[Rank 1] Train Epoch 6:  56%|█████▌    | 1120/2000 [00:09<00:06, 131.37it/s]
[Rank 2] Train Epoch 6:  56%|█████▋    | 1128/2000 [00:09<00:06, 128.57it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1100 | Mem: 26.53MB, Util: 100%  global_step : 13100
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1101 | Mem: 26.53MB, Util: 100%  global_step : 13101
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1102 | Mem: 26.53MB, Util: 100%  global_step : 13102
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1103 | Mem: 26.53MB, Util: 100%  global_step : 13103
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1104 | Mem: 26.53MB, Util: 100%  global_step : 13104
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1105 | Mem: 26.53MB, Util: 100%  global_step : 13105
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1106 | Mem: 26.53MB, Util: 100%  global_step : 13106
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1107 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 6:  56%|█████▋    | 1130/2000 [00:09<00:06, 127.84it/s]
[Rank 1] Train Epoch 6:  57%|█████▋    | 1134/2000 [00:09<00:06, 131.41it/s]
[Rank 2] Train Epoch 6:  57%|█████▋    | 1142/2000 [00:09<00:06, 131.24it/s]
[Rank 0] Train Epoch 6:  57%|█████▋    | 1144/2000 [00:09<00:06, 130.76it/s]
[Rank 1] Train Epoch 6:  57%|█████▋    | 1148/2000 [00:09<00:06, 131.35it/s]
[Rank 0] Train Epoch 6:  58%|█████▊    | 1158/2000 [00:09<00:06, 133.24it/s]
[Rank 2] Train Epoch 6:  58%|█████▊    | 1156/2000 [00:09<00:06, 132.89it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1128 | Mem: 26.53MB, Util: 100%  global_step : 13128
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1129 | Mem: 26.53MB, Util: 100%  global_step : 13129
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1130 | Mem: 26.53MB, Util: 100%  global_step : 13130
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1131 | Mem: 26.53MB, Util: 100%  global_step : 13131
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1132 | Mem: 26.53MB, Util: 92%  global_step : 13132
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1133 | Mem: 26.53MB, Util: 92%  global_step : 13133
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1134 | Mem: 26.53MB, Util: 92%  global_step : 13134
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 6, Batch 1135 | Mem: 26.53MB, Util: 92%  

[Rank 1] Train Epoch 6:  58%|█████▊    | 1162/2000 [00:09<00:06, 131.01it/s]
[Rank 0] Train Epoch 6:  59%|█████▊    | 1173/2000 [00:09<00:06, 136.43it/s]
[Rank 2] Train Epoch 6:  58%|█████▊    | 1170/2000 [00:09<00:06, 133.78it/s]
[Rank 1] Train Epoch 6:  59%|█████▉    | 1176/2000 [00:09<00:06, 132.63it/s]
[Rank 0] Train Epoch 6:  59%|█████▉    | 1188/2000 [00:09<00:05, 138.08it/s]
[Rank 2] Train Epoch 6:  59%|█████▉    | 1184/2000 [00:09<00:06, 134.67it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1154 | Mem: 26.53MB, Util: 61%  global_step : 13154
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1155 | Mem: 26.53MB, Util: 61%  global_step : 13155
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1156 | Mem: 26.53MB, Util: 61%  global_step : 13156
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1157 | Mem: 26.53MB, Util: 61%  global_step : 13157
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1158 | Mem: 26.53MB, Util: 61%  global_step : 13158
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1159 | Mem: 26.53MB, Util: 61%  global_step : 13159
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1160 | Mem: 26.53MB, Util: 61%  global_step : 13160
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1161 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  60%|█████▉    | 1190/2000 [00:09<00:06, 133.19it/s]
[Rank 2] Train Epoch 6:  60%|█████▉    | 1198/2000 [00:09<00:05, 134.42it/s]
[Rank 1] Train Epoch 6:  60%|██████    | 1204/2000 [00:10<00:05, 133.53it/s]
[Rank 0] Train Epoch 6:  60%|██████    | 1202/2000 [00:10<00:06, 124.79it/s]
[Rank 2] Train Epoch 6:  61%|██████    | 1212/2000 [00:09<00:06, 127.87it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1182 | Mem: 26.53MB, Util: 61%  global_step : 13182
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1183 | Mem: 26.53MB, Util: 61%  global_step : 13183
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1184 | Mem: 26.53MB, Util: 61%  global_step : 13184
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1185 | Mem: 26.53MB, Util: 65%  global_step : 13185
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1186 | Mem: 26.53MB, Util: 65%  global_step : 13186
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1187 | Mem: 26.53MB, Util: 65%  global_step : 13187
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1188 | Mem: 26.53MB, Util: 65%  global_step : 13188
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1189 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  61%|██████    | 1218/2000 [00:10<00:05, 133.85it/s]
[Rank 0] Train Epoch 6:  61%|██████    | 1216/2000 [00:10<00:06, 128.68it/s]
[Rank 2] Train Epoch 6:  61%|██████▏   | 1226/2000 [00:10<00:05, 130.26it/s]
[Rank 1] Train Epoch 6:  62%|██████▏   | 1232/2000 [00:10<00:05, 134.51it/s]
[Rank 0] Train Epoch 6:  62%|██████▏   | 1231/2000 [00:10<00:05, 132.25it/s]
[Rank 2] Train Epoch 6:  62%|██████▏   | 1240/2000 [00:10<00:05, 132.08it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1210 | Mem: 26.53MB, Util: 65%  global_step : 13210
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1211 | Mem: 26.53MB, Util: 65%  global_step : 13211
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1212 | Mem: 26.53MB, Util: 65%  global_step : 13212
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1213 | Mem: 26.53MB, Util: 65%  global_step : 13213
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1214 | Mem: 26.53MB, Util: 65%  global_step : 13214
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1215 | Mem: 26.53MB, Util: 66%  global_step : 13215
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1216 | Mem: 26.53MB, Util: 66%  global_step : 13216
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1217 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  62%|██████▏   | 1246/2000 [00:10<00:05, 134.79it/s]
[Rank 0] Train Epoch 6:  62%|██████▏   | 1245/2000 [00:10<00:05, 130.13it/s]
[Rank 2] Train Epoch 6:  63%|██████▎   | 1254/2000 [00:10<00:05, 133.27it/s]
[Rank 1] Train Epoch 6:  63%|██████▎   | 1260/2000 [00:10<00:05, 134.62it/s]
[Rank 0] Train Epoch 6:  63%|██████▎   | 1259/2000 [00:10<00:05, 131.79it/s]
[Rank 2] Train Epoch 6:  63%|██████▎   | 1268/2000 [00:10<00:05, 134.01it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1238 | Mem: 26.53MB, Util: 66%  global_step : 13238
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1239 | Mem: 26.53MB, Util: 66%  global_step : 13239
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1240 | Mem: 26.53MB, Util: 66%  global_step : 13240
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1241 | Mem: 26.53MB, Util: 67%  global_step : 13241
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1242 | Mem: 26.53MB, Util: 67%  global_step : 13242
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1243 | Mem: 26.53MB, Util: 67%  global_step : 13243
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1244 | Mem: 26.53MB, Util: 67%  global_step : 13244
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1245 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  64%|██████▎   | 1274/2000 [00:10<00:05, 135.02it/s]
[Rank 0] Train Epoch 6:  64%|██████▎   | 1273/2000 [00:10<00:05, 132.98it/s]
[Rank 2] Train Epoch 6:  64%|██████▍   | 1282/2000 [00:10<00:05, 133.86it/s]
[Rank 1] Train Epoch 6:  64%|██████▍   | 1288/2000 [00:10<00:05, 134.81it/s]
[Rank 0] Train Epoch 6:  64%|██████▍   | 1287/2000 [00:10<00:05, 134.23it/s]
[Rank 2] Train Epoch 6:  65%|██████▍   | 1296/2000 [00:10<00:05, 134.30it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1266 | Mem: 26.53MB, Util: 67%  global_step : 13266
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1267 | Mem: 26.53MB, Util: 67%  global_step : 13267
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1268 | Mem: 26.53MB, Util: 67%  global_step : 13268
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1269 | Mem: 26.53MB, Util: 70%  global_step : 13269
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1270 | Mem: 26.53MB, Util: 70%  global_step : 13270
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1271 | Mem: 26.53MB, Util: 70%  global_step : 13271
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1272 | Mem: 26.53MB, Util: 70%  global_step : 13272
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1273 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  65%|██████▌   | 1302/2000 [00:10<00:05, 135.69it/s]
[Rank 0] Train Epoch 6:  65%|██████▌   | 1301/2000 [00:10<00:05, 134.41it/s]
[Rank 2] Train Epoch 6:  66%|██████▌   | 1310/2000 [00:10<00:05, 134.34it/s]
[Rank 1] Train Epoch 6:  66%|██████▌   | 1316/2000 [00:10<00:05, 135.51it/s]
[Rank 0] Train Epoch 6:  66%|██████▌   | 1315/2000 [00:10<00:05, 134.83it/s]
[Rank 2] Train Epoch 6:  66%|██████▌   | 1324/2000 [00:10<00:05, 134.85it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1295 | Mem: 26.53MB, Util: 70%  global_step : 13295
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1296 | Mem: 26.53MB, Util: 70%  global_step : 13296
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1297 | Mem: 26.53MB, Util: 65%  global_step : 13297
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1298 | Mem: 26.53MB, Util: 65%  global_step : 13298
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1299 | Mem: 26.53MB, Util: 65%  global_step : 13299
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1300 | Mem: 26.53MB, Util: 65%  global_step : 13300
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1301 | Mem: 26.53MB, Util: 65%  global_step : 13301
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1302 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  66%|██████▋   | 1330/2000 [00:11<00:04, 134.75it/s]
[Rank 0] Train Epoch 6:  66%|██████▋   | 1329/2000 [00:10<00:04, 134.57it/s]
[Rank 2] Train Epoch 6:  67%|██████▋   | 1338/2000 [00:10<00:04, 135.19it/s]
[Rank 1] Train Epoch 6:  67%|██████▋   | 1344/2000 [00:11<00:04, 134.82it/s]
[Rank 0] Train Epoch 6:  67%|██████▋   | 1343/2000 [00:11<00:04, 134.77it/s]
[Rank 2] Train Epoch 6:  68%|██████▊   | 1352/2000 [00:11<00:04, 135.68it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1323 | Mem: 26.53MB, Util: 65%  global_step : 13323
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1324 | Mem: 26.53MB, Util: 67%  global_step : 13324
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1325 | Mem: 26.53MB, Util: 67%  global_step : 13325
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1326 | Mem: 26.53MB, Util: 67%  global_step : 13326
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1327 | Mem: 26.53MB, Util: 67%  global_step : 13327
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1328 | Mem: 26.53MB, Util: 67%  global_step : 13328
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1329 | Mem: 26.53MB, Util: 67%  global_step : 13329
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1330 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  68%|██████▊   | 1358/2000 [00:11<00:04, 134.35it/s]
[Rank 0] Train Epoch 6:  68%|██████▊   | 1358/2000 [00:11<00:04, 136.38it/s]
[Rank 2] Train Epoch 6:  68%|██████▊   | 1366/2000 [00:11<00:04, 133.70it/s]
[Rank 1] Train Epoch 6:  69%|██████▊   | 1372/2000 [00:11<00:04, 133.96it/s]
[Rank 2] Train Epoch 6:  69%|██████▉   | 1380/2000 [00:11<00:04, 134.94it/s]
[Rank 0] Train Epoch 6:  69%|██████▊   | 1372/2000 [00:11<00:04, 135.57it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1351 | Mem: 26.53MB, Util: 67%  global_step : 13351
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1352 | Mem: 26.53MB, Util: 67%  global_step : 13352
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1353 | Mem: 26.53MB, Util: 67%  global_step : 13353
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1354 | Mem: 26.53MB, Util: 65%  global_step : 13354
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1355 | Mem: 26.53MB, Util: 65%  global_step : 13355
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1356 | Mem: 26.53MB, Util: 65%  global_step : 13356
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1357 | Mem: 26.53MB, Util: 65%  global_step : 13357
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1358 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  69%|██████▉   | 1386/2000 [00:11<00:04, 133.93it/s]
[Rank 2] Train Epoch 6:  70%|██████▉   | 1394/2000 [00:11<00:04, 135.00it/s]
[Rank 0] Train Epoch 6:  69%|██████▉   | 1386/2000 [00:11<00:04, 135.61it/s]
[Rank 1] Train Epoch 6:  70%|███████   | 1400/2000 [00:11<00:04, 133.96it/s]
[Rank 2] Train Epoch 6:  70%|███████   | 1408/2000 [00:11<00:04, 132.28it/s]
[Rank 0] Train Epoch 6:  70%|███████   | 1400/2000 [00:11<00:04, 134.29it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1378 | Mem: 26.53MB, Util: 65%  global_step : 13378
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1379 | Mem: 26.53MB, Util: 65%  global_step : 13379
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1380 | Mem: 26.53MB, Util: 65%  global_step : 13380
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1381 | Mem: 26.53MB, Util: 65%  global_step : 13381
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1382 | Mem: 26.53MB, Util: 64%  global_step : 13382
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1383 | Mem: 26.53MB, Util: 64%  global_step : 13383
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1384 | Mem: 26.53MB, Util: 64%  global_step : 13384
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1385 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  71%|███████   | 1414/2000 [00:11<00:04, 130.37it/s]
[Rank 2] Train Epoch 6:  71%|███████   | 1422/2000 [00:11<00:04, 130.67it/s]
[Rank 0] Train Epoch 6:  71%|███████   | 1414/2000 [00:11<00:04, 131.05it/s]
[Rank 1] Train Epoch 6:  71%|███████▏  | 1428/2000 [00:11<00:04, 131.84it/s]
[Rank 2] Train Epoch 6:  72%|███████▏  | 1436/2000 [00:11<00:04, 132.63it/s]
[Rank 0] Train Epoch 6:  71%|███████▏  | 1428/2000 [00:11<00:04, 131.61it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1406 | Mem: 26.53MB, Util: 64%  global_step : 13406
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1407 | Mem: 26.53MB, Util: 64%  global_step : 13407
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1408 | Mem: 26.53MB, Util: 64%  global_step : 13408
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1409 | Mem: 26.53MB, Util: 64%  global_step : 13409
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1410 | Mem: 26.53MB, Util: 64%  global_step : 13410
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1411 | Mem: 26.53MB, Util: 64%  global_step : 13411
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1412 | Mem: 26.53MB, Util: 64%  global_step : 13412
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1413 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  72%|███████▏  | 1442/2000 [00:11<00:04, 133.82it/s]
[Rank 2] Train Epoch 6:  73%|███████▎  | 1451/2000 [00:11<00:04, 135.10it/s]
[Rank 0] Train Epoch 6:  72%|███████▏  | 1442/2000 [00:11<00:04, 133.07it/s]
[Rank 1] Train Epoch 6:  73%|███████▎  | 1456/2000 [00:11<00:04, 132.56it/s]
[Rank 2] Train Epoch 6:  73%|███████▎  | 1465/2000 [00:11<00:03, 136.34it/s]
[Rank 0] Train Epoch 6:  73%|███████▎  | 1456/2000 [00:11<00:04, 133.86it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1433 | Mem: 26.53MB, Util: 64%  global_step : 13433
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1434 | Mem: 26.53MB, Util: 64%  global_step : 13434
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1435 | Mem: 26.53MB, Util: 64%  global_step : 13435
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1436 | Mem: 26.53MB, Util: 63%  global_step : 13436
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1437 | Mem: 26.53MB, Util: 63%  global_step : 13437
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1438 | Mem: 26.53MB, Util: 63%  global_step : 13438
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1439 | Mem: 26.53MB, Util: 63%  global_step : 13439
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1440 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  74%|███████▎  | 1471/2000 [00:12<00:03, 135.18it/s]
[Rank 2] Train Epoch 6:  74%|███████▍  | 1480/2000 [00:11<00:03, 137.79it/s]
[Rank 0] Train Epoch 6:  74%|███████▎  | 1470/2000 [00:12<00:03, 133.62it/s]
[Rank 1] Train Epoch 6:  74%|███████▍  | 1485/2000 [00:12<00:03, 136.32it/s]
[Rank 2] Train Epoch 6:  75%|███████▍  | 1494/2000 [00:12<00:03, 138.35it/s]
[Rank 0] Train Epoch 6:  74%|███████▍  | 1484/2000 [00:12<00:03, 133.39it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1461 | Mem: 26.53MB, Util: 65%  global_step : 13461
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1462 | Mem: 26.53MB, Util: 65%  global_step : 13462
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1463 | Mem: 26.53MB, Util: 65%  global_step : 13463
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1464 | Mem: 26.53MB, Util: 65%  global_step : 13464
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1465 | Mem: 26.53MB, Util: 65%  global_step : 13465
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1466 | Mem: 26.53MB, Util: 65%  global_step : 13466
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1467 | Mem: 26.53MB, Util: 65%  global_step : 13467
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1468 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  75%|███████▍  | 1499/2000 [00:12<00:03, 129.79it/s]
[Rank 0] Train Epoch 6:  75%|███████▍  | 1498/2000 [00:12<00:03, 132.55it/s]
[Rank 1] Train Epoch 6:  76%|███████▌  | 1513/2000 [00:12<00:03, 129.84it/s]
[Rank 2] Train Epoch 6:  75%|███████▌  | 1508/2000 [00:12<00:03, 125.87it/s]
[Rank 0] Train Epoch 6:  76%|███████▌  | 1512/2000 [00:12<00:03, 127.97it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1491 | Mem: 26.53MB, Util: 64%  global_step : 13491
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1492 | Mem: 26.53MB, Util: 64%  global_step : 13492
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1493 | Mem: 26.53MB, Util: 64%  global_step : 13493
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1494 | Mem: 26.53MB, Util: 64%  global_step : 13494
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1495 | Mem: 26.53MB, Util: 64%  global_step : 13495
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1496 | Mem: 26.53MB, Util: 64%  global_step : 13496
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1497 | Mem: 26.53MB, Util: 64%  global_step : 13497
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1498 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  76%|███████▋  | 1527/2000 [00:12<00:03, 129.91it/s]
[Rank 2] Train Epoch 6:  76%|███████▌  | 1522/2000 [00:12<00:03, 129.71it/s]
[Rank 0] Train Epoch 6:  76%|███████▋  | 1526/2000 [00:12<00:03, 128.58it/s]
[Rank 1] Train Epoch 6:  77%|███████▋  | 1541/2000 [00:12<00:03, 130.45it/s]
[Rank 2] Train Epoch 6:  77%|███████▋  | 1537/2000 [00:12<00:03, 133.42it/s]
[Rank 0] Train Epoch 6:  77%|███████▋  | 1540/2000 [00:12<00:03, 129.26it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1516 | Mem: 26.53MB, Util: 77%  global_step : 13516
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1517 | Mem: 26.53MB, Util: 77%  global_step : 13517
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1518 | Mem: 26.53MB, Util: 77%  global_step : 13518
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1519 | Mem: 26.53MB, Util: 77%  global_step : 13519
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1520 | Mem: 26.53MB, Util: 77%  global_step : 13520
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1521 | Mem: 26.53MB, Util: 77%  global_step : 13521
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1522 | Mem: 26.53MB, Util: 77%  global_step : 13522
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1523 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  78%|███████▊  | 1555/2000 [00:12<00:03, 130.84it/s]
[Rank 2] Train Epoch 6:  78%|███████▊  | 1552/2000 [00:12<00:03, 137.71it/s]
[Rank 0] Train Epoch 6:  78%|███████▊  | 1554/2000 [00:12<00:03, 129.64it/s]
[Rank 1] Train Epoch 6:  78%|███████▊  | 1569/2000 [00:12<00:03, 131.35it/s]
[Rank 2] Train Epoch 6:  78%|███████▊  | 1567/2000 [00:12<00:03, 140.85it/s]
[Rank 0] Train Epoch 6:  78%|███████▊  | 1568/2000 [00:12<00:03, 130.43it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1543 | Mem: 26.53MB, Util: 66%  global_step : 13543
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1544 | Mem: 26.53MB, Util: 66%  global_step : 13544
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1545 | Mem: 26.53MB, Util: 66%  global_step : 13545
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1546 | Mem: 26.53MB, Util: 66%  global_step : 13546
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1547 | Mem: 26.53MB, Util: 66%  global_step : 13547
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1548 | Mem: 26.53MB, Util: 66%  global_step : 13548
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1549 | Mem: 26.53MB, Util: 66%  global_step : 13549
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1550 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  79%|███████▉  | 1583/2000 [00:12<00:03, 132.35it/s]
[Rank 2] Train Epoch 6:  79%|███████▉  | 1582/2000 [00:12<00:02, 142.77it/s]
[Rank 0] Train Epoch 6:  79%|███████▉  | 1582/2000 [00:12<00:03, 130.45it/s]
[Rank 1] Train Epoch 6:  80%|███████▉  | 1597/2000 [00:13<00:03, 133.70it/s]
[Rank 2] Train Epoch 6:  80%|███████▉  | 1598/2000 [00:12<00:02, 145.09it/s]
[Rank 0] Train Epoch 6:  80%|███████▉  | 1596/2000 [00:13<00:03, 132.95it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1571 | Mem: 26.53MB, Util: 65%  global_step : 13571
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1572 | Mem: 26.53MB, Util: 65%  global_step : 13572
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1573 | Mem: 26.53MB, Util: 65%  global_step : 13573
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1574 | Mem: 26.53MB, Util: 65%  global_step : 13574
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1575 | Mem: 26.53MB, Util: 65%  global_step : 13575
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1576 | Mem: 26.53MB, Util: 65%  global_step : 13576
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1577 | Mem: 26.53MB, Util: 65%  global_step : 13577
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1578 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  81%|████████  | 1611/2000 [00:13<00:02, 133.01it/s]
[Rank 2] Train Epoch 6:  81%|████████  | 1613/2000 [00:12<00:03, 121.43it/s]
[Rank 0] Train Epoch 6:  80%|████████  | 1610/2000 [00:13<00:02, 133.15it/s]
[Rank 1] Train Epoch 6:  81%|████████▏ | 1625/2000 [00:13<00:02, 133.02it/s]
[Rank 2] Train Epoch 6:  81%|████████▏ | 1628/2000 [00:13<00:02, 128.56it/s]
[Rank 0] Train Epoch 6:  81%|████████  | 1624/2000 [00:13<00:02, 131.16it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1599 | Mem: 26.53MB, Util: 84%  global_step : 13599
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1600 | Mem: 26.53MB, Util: 84%  global_step : 13600
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1601 | Mem: 26.53MB, Util: 84%  global_step : 13601
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1602 | Mem: 26.53MB, Util: 84%  global_step : 13602
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1603 | Mem: 26.53MB, Util: 84%  global_step : 13603
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1604 | Mem: 26.53MB, Util: 84%  global_step : 13604
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1605 | Mem: 26.53MB, Util: 84%  global_step : 13605
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1606 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  82%|████████▏ | 1639/2000 [00:13<00:02, 133.91it/s]
[Rank 2] Train Epoch 6:  82%|████████▏ | 1643/2000 [00:13<00:02, 134.04it/s]
[Rank 0] Train Epoch 6:  82%|████████▏ | 1638/2000 [00:13<00:02, 129.51it/s]
[Rank 1] Train Epoch 6:  83%|████████▎ | 1653/2000 [00:13<00:02, 134.73it/s]
[Rank 2] Train Epoch 6:  83%|████████▎ | 1658/2000 [00:13<00:02, 138.25it/s]
[Rank 0] Train Epoch 6:  83%|████████▎ | 1652/2000 [00:13<00:02, 132.46it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1627 | Mem: 26.53MB, Util: 97%  global_step : 13627
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1628 | Mem: 26.53MB, Util: 97%  global_step : 13628
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1629 | Mem: 26.53MB, Util: 97%  global_step : 13629
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1630 | Mem: 26.53MB, Util: 97%  global_step : 13630
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1631 | Mem: 26.53MB, Util: 97%  global_step : 13631
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1632 | Mem: 26.53MB, Util: 97%  global_step : 13632
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1633 | Mem: 26.53MB, Util: 97%  global_step : 13633
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1634 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  83%|████████▎ | 1667/2000 [00:13<00:02, 136.04it/s]
[Rank 2] Train Epoch 6:  84%|████████▎ | 1673/2000 [00:13<00:02, 141.46it/s]
[Rank 0] Train Epoch 6:  83%|████████▎ | 1666/2000 [00:13<00:02, 131.86it/s]
[Rank 1] Train Epoch 6:  84%|████████▍ | 1681/2000 [00:13<00:02, 135.46it/s]
[Rank 2] Train Epoch 6:  84%|████████▍ | 1688/2000 [00:13<00:02, 143.17it/s]
[Rank 0] Train Epoch 6:  84%|████████▍ | 1681/2000 [00:13<00:02, 134.71it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1655 | Mem: 26.53MB, Util: 92%  global_step : 13655
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1656 | Mem: 26.53MB, Util: 92%  global_step : 13656
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1657 | Mem: 26.53MB, Util: 92%  global_step : 13657
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1658 | Mem: 26.53MB, Util: 92%  global_step : 13658
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1659 | Mem: 26.53MB, Util: 92%  global_step : 13659
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1660 | Mem: 26.53MB, Util: 92%  global_step : 13660
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1661 | Mem: 26.53MB, Util: 92%  global_step : 13661
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1662 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  85%|████████▍ | 1695/2000 [00:13<00:02, 136.25it/s]
[Rank 0] Train Epoch 6:  85%|████████▍ | 1695/2000 [00:13<00:02, 135.79it/s]
[Rank 1] Train Epoch 6:  85%|████████▌ | 1709/2000 [00:13<00:02, 131.62it/s]
[Rank 2] Train Epoch 6:  85%|████████▌ | 1703/2000 [00:13<00:02, 119.07it/s]
[Rank 0] Train Epoch 6:  86%|████████▌ | 1710/2000 [00:13<00:02, 137.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1684 | Mem: 26.53MB, Util: 100%  global_step : 13684
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1685 | Mem: 26.53MB, Util: 100%  global_step : 13685
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1686 | Mem: 26.53MB, Util: 100%  global_step : 13686
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1687 | Mem: 26.53MB, Util: 100%  global_step : 13687
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1688 | Mem: 26.53MB, Util: 100%  global_step : 13688
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1689 | Mem: 26.53MB, Util: 100%  global_step : 13689
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1690 | Mem: 26.53MB, Util: 100%  global_step : 13690
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1691 | Mem: 2

[Rank 1] Train Epoch 6:  86%|████████▌ | 1723/2000 [00:13<00:02, 132.11it/s]
[Rank 2] Train Epoch 6:  86%|████████▌ | 1718/2000 [00:13<00:02, 126.18it/s]
[Rank 0] Train Epoch 6:  86%|████████▋ | 1725/2000 [00:13<00:01, 138.56it/s]
[Rank 1] Train Epoch 6:  87%|████████▋ | 1737/2000 [00:14<00:01, 132.21it/s]
[Rank 2] Train Epoch 6:  87%|████████▋ | 1733/2000 [00:13<00:02, 132.22it/s]
[Rank 0] Train Epoch 6:  87%|████████▋ | 1739/2000 [00:14<00:01, 138.64it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1711 | Mem: 26.53MB, Util: 100%  global_step : 13711
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1712 | Mem: 26.53MB, Util: 100%  global_step : 13712
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1713 | Mem: 26.53MB, Util: 100%  global_step : 13713
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1714 | Mem: 26.53MB, Util: 100%  global_step : 13714
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1715 | Mem: 26.53MB, Util: 100%  global_step : 13715
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1716 | Mem: 26.53MB, Util: 100%  global_step : 13716
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1717 | Mem: 26.53MB, Util: 100%  global_step : 13717
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1718 | Mem: 2

[Rank 1] Train Epoch 6:  88%|████████▊ | 1751/2000 [00:14<00:01, 132.20it/s]
[Rank 2] Train Epoch 6:  87%|████████▋ | 1748/2000 [00:13<00:01, 136.59it/s]
[Rank 0] Train Epoch 6:  88%|████████▊ | 1753/2000 [00:14<00:01, 135.03it/s]
[Rank 1] Train Epoch 6:  88%|████████▊ | 1765/2000 [00:14<00:01, 131.95it/s]
[Rank 2] Train Epoch 6:  88%|████████▊ | 1763/2000 [00:14<00:01, 139.84it/s]
[Rank 0] Train Epoch 6:  88%|████████▊ | 1767/2000 [00:14<00:01, 135.23it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1739 | Mem: 26.53MB, Util: 82%  global_step : 13739
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1740 | Mem: 26.53MB, Util: 82%  global_step : 13740
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1741 | Mem: 26.53MB, Util: 82%  global_step : 13741
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1742 | Mem: 26.53MB, Util: 82%  global_step : 13742
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1743 | Mem: 26.53MB, Util: 82%  global_step : 13743
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1744 | Mem: 26.53MB, Util: 82%  global_step : 13744
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1745 | Mem: 26.53MB, Util: 82%  global_step : 13745
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1746 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  89%|████████▉ | 1779/2000 [00:14<00:01, 131.90it/s]
[Rank 2] Train Epoch 6:  89%|████████▉ | 1778/2000 [00:14<00:01, 141.81it/s]
[Rank 2] Train Epoch 6:  90%|████████▉ | 1793/2000 [00:14<00:01, 143.64it/s]
[Rank 0] Train Epoch 6:  89%|████████▉ | 1781/2000 [00:14<00:01, 134.59it/s]
[Rank 1] Train Epoch 6:  90%|████████▉ | 1793/2000 [00:14<00:01, 132.32it/s]
[Rank 0] Train Epoch 6:  90%|████████▉ | 1795/2000 [00:14<00:01, 135.21it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1766 | Mem: 26.53MB, Util: 66%  global_step : 13766
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1767 | Mem: 26.53MB, Util: 66%  global_step : 13767
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1768 | Mem: 26.53MB, Util: 66%  global_step : 13768
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1769 | Mem: 26.53MB, Util: 66%  global_step : 13769
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1770 | Mem: 26.53MB, Util: 66%  global_step : 13770
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1771 | Mem: 26.53MB, Util: 66%  global_step : 13771
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1772 | Mem: 26.53MB, Util: 66%  global_step : 13772
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1773 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  90%|█████████ | 1807/2000 [00:14<00:01, 132.77it/s]
[Rank 2] Train Epoch 6:  90%|█████████ | 1808/2000 [00:14<00:01, 118.35it/s]
[Rank 0] Train Epoch 6:  90%|█████████ | 1809/2000 [00:14<00:01, 128.78it/s]
[Rank 1] Train Epoch 6:  91%|█████████ | 1821/2000 [00:14<00:01, 132.85it/s]
[Rank 2] Train Epoch 6:  91%|█████████ | 1823/2000 [00:14<00:01, 125.67it/s]
[Rank 0] Train Epoch 6:  91%|█████████ | 1823/2000 [00:14<00:01, 130.82it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1794 | Mem: 26.53MB, Util: 65%  global_step : 13794
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1795 | Mem: 26.53MB, Util: 65%  global_step : 13795
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1796 | Mem: 26.53MB, Util: 65%  global_step : 13796
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1797 | Mem: 26.53MB, Util: 65%  global_step : 13797
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1798 | Mem: 26.53MB, Util: 65%  global_step : 13798
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1799 | Mem: 26.53MB, Util: 65%  global_step : 13799
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1800 | Mem: 26.53MB, Util: 65%  global_step : 13800
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1801 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  92%|█████████▏| 1835/2000 [00:14<00:01, 132.42it/s]
[Rank 2] Train Epoch 6:  92%|█████████▏| 1838/2000 [00:14<00:01, 131.41it/s]
[Rank 0] Train Epoch 6:  92%|█████████▏| 1837/2000 [00:14<00:01, 132.40it/s]
[Rank 2] Train Epoch 6:  93%|█████████▎| 1853/2000 [00:14<00:01, 136.18it/s]
[Rank 0] Train Epoch 6:  93%|█████████▎| 1851/2000 [00:14<00:01, 133.61it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1822 | Mem: 26.53MB, Util: 67%  global_step : 13822
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1823 | Mem: 26.53MB, Util: 67%  global_step : 13823
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1824 | Mem: 26.53MB, Util: 67%  global_step : 13824
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1825 | Mem: 26.53MB, Util: 67%  global_step : 13825
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1826 | Mem: 26.53MB, Util: 67%  global_step : 13826
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1827 | Mem: 26.53MB, Util: 67%  global_step : 13827
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1828 | Mem: 26.53MB, Util: 67%  global_step : 13828
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1829 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  92%|█████████▏| 1849/2000 [00:14<00:01, 130.70it/s]
[Rank 1] Train Epoch 6:  93%|█████████▎| 1863/2000 [00:15<00:01, 132.49it/s]
[Rank 2] Train Epoch 6:  93%|█████████▎| 1868/2000 [00:14<00:00, 139.78it/s]
[Rank 0] Train Epoch 6:  93%|█████████▎| 1865/2000 [00:15<00:01, 134.52it/s]
[Rank 1] Train Epoch 6:  94%|█████████▍| 1877/2000 [00:15<00:00, 133.67it/s]
[Rank 2] Train Epoch 6:  94%|█████████▍| 1883/2000 [00:14<00:00, 142.46it/s]
[Rank 0] Train Epoch 6:  94%|█████████▍| 1879/2000 [00:15<00:00, 135.35it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1848 | Mem: 26.53MB, Util: 66%  global_step : 13848
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1849 | Mem: 26.53MB, Util: 66%  global_step : 13849
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1850 | Mem: 26.53MB, Util: 66%  global_step : 13850
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1851 | Mem: 26.53MB, Util: 66%  global_step : 13851
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1852 | Mem: 26.53MB, Util: 66%  global_step : 13852
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1853 | Mem: 26.53MB, Util: 66%  global_step : 13853
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1854 | Mem: 26.53MB, Util: 66%  global_step : 13854
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1855 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  95%|█████████▍| 1891/2000 [00:15<00:00, 134.04it/s]
[Rank 2] Train Epoch 6:  95%|█████████▍| 1898/2000 [00:15<00:00, 143.62it/s]
[Rank 0] Train Epoch 6:  95%|█████████▍| 1893/2000 [00:15<00:00, 132.81it/s]
[Rank 1] Train Epoch 6:  95%|█████████▌| 1905/2000 [00:15<00:00, 134.70it/s]
[Rank 0] Train Epoch 6:  95%|█████████▌| 1907/2000 [00:15<00:00, 131.24it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1877 | Mem: 26.53MB, Util: 62%  global_step : 13877
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1878 | Mem: 26.53MB, Util: 62%  global_step : 13878
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1879 | Mem: 26.53MB, Util: 62%  global_step : 13879
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1880 | Mem: 26.53MB, Util: 62%  global_step : 13880
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1881 | Mem: 26.53MB, Util: 62%  global_step : 13881
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1882 | Mem: 26.53MB, Util: 62%  global_step : 13882
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1883 | Mem: 26.53MB, Util: 62%  global_step : 13883
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1884 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  96%|█████████▌| 1919/2000 [00:15<00:00, 135.51it/s]
[Rank 2] Train Epoch 6:  96%|█████████▌| 1913/2000 [00:15<00:00, 119.82it/s]
[Rank 0] Train Epoch 6:  96%|█████████▌| 1922/2000 [00:15<00:00, 134.56it/s]
[Rank 1] Train Epoch 6:  97%|█████████▋| 1933/2000 [00:15<00:00, 136.03it/s]
[Rank 2] Train Epoch 6:  96%|█████████▋| 1928/2000 [00:15<00:00, 127.28it/s]
[Rank 0] Train Epoch 6:  97%|█████████▋| 1936/2000 [00:15<00:00, 135.88it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1905 | Mem: 26.53MB, Util: 66%  global_step : 13905
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1906 | Mem: 26.53MB, Util: 66%  global_step : 13906
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1907 | Mem: 26.53MB, Util: 66%  global_step : 13907
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1908 | Mem: 26.53MB, Util: 66%  global_step : 13908
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1909 | Mem: 26.53MB, Util: 66%  global_step : 13909
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1910 | Mem: 26.53MB, Util: 66%  global_step : 13910
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1911 | Mem: 26.53MB, Util: 66%  global_step : 13911
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1912 | Mem: 26.53MB,

[Rank 1] Train Epoch 6:  97%|█████████▋| 1947/2000 [00:15<00:00, 135.42it/s]
[Rank 2] Train Epoch 6:  97%|█████████▋| 1944/2000 [00:15<00:00, 134.38it/s]
[Rank 0] Train Epoch 6:  98%|█████████▊| 1951/2000 [00:15<00:00, 137.17it/s]
[Rank 1] Train Epoch 6:  98%|█████████▊| 1961/2000 [00:15<00:00, 135.00it/s]
[Rank 2] Train Epoch 6:  98%|█████████▊| 1959/2000 [00:15<00:00, 137.99it/s]
[Rank 0] Train Epoch 6:  98%|█████████▊| 1966/2000 [00:15<00:00, 138.22it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1934 | Mem: 26.53MB, Util: 68%  global_step : 13934
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1935 | Mem: 26.53MB, Util: 68%  global_step : 13935
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1936 | Mem: 26.53MB, Util: 68%  global_step : 13936
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1937 | Mem: 26.53MB, Util: 68%  global_step : 13937
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1938 | Mem: 26.53MB, Util: 68%  global_step : 13938
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1939 | Mem: 26.53MB, Util: 68%  global_step : 13939
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1940 | Mem: 26.53MB, Util: 68%  global_step : 13940
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1941 | Mem: 26.53MB,

[Rank 2] Train Epoch 6:  99%|█████████▊| 1974/2000 [00:15<00:00, 140.79it/s]
[Rank 0] Train Epoch 6:  99%|█████████▉| 1981/2000 [00:15<00:00, 138.90it/s]
[Rank 1] Train Epoch 6:  99%|█████████▉| 1975/2000 [00:15<00:00, 131.14it/s]
[Rank 2] Train Epoch 6: 100%|██████████| 2000/2000 [00:15<00:00, 126.36it/s]
[Rank 2] Test Epoch 6:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Train Epoch 6: 100%|██████████| 2000/2000 [00:16<00:00, 124.95it/s]
[Rank 0] Test Epoch 6:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Train Epoch 6: 100%|██████████| 2000/2000 [00:16<00:00, 124.52it/s]
[Rank 1] Test Epoch 6:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1988 | Mem: 26.53MB, Util: 65%  global_step : 13988
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1989 | Mem: 26.53MB, Util: 65%  global_step : 13989
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1990 | Mem: 26.53MB, Util: 65%  global_step : 13990
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1991 | Mem: 26.53MB, Util: 65%  global_step : 13991
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1992 | Mem: 26.53MB, Util: 65%  global_step : 13992
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1993 | Mem: 26.53MB, Util: 65%  global_step : 13993
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1994 | Mem: 26.53MB, Util: 65%  global_step : 13994
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 6, Batch 1995 | Mem: 26.53MB,

[Rank 2] Test Epoch 6:   1%|          | 3/334 [00:00<00:11, 29.66it/s]
[Rank 0] Test Epoch 6:   5%|▌         | 18/334 [00:00<00:01, 177.94it/s]
[Rank 1] Test Epoch 6:  10%|█         | 34/334 [00:00<00:00, 335.54it/s]
[Rank 2] Test Epoch 6:  12%|█▏        | 39/334 [00:00<00:01, 223.02it/s]
[Rank 0] Test Epoch 6:  16%|█▌        | 52/334 [00:00<00:01, 271.51it/s]
[Rank 1] Test Epoch 6:  20%|██        | 68/334 [00:00<00:00, 338.00it/s]
[Rank 2] Test Epoch 6:  22%|██▏       | 75/334 [00:00<00:00, 285.05it/s]
[Rank 0] Test Epoch 6:  26%|██▌       | 86/334 [00:00<00:00, 301.94it/s]
[Rank 1] Test Epoch 6:  31%|███       | 102/334 [00:00<00:00, 337.35it/s]
[Rank 2] Test Epoch 6:  34%|███▎      | 112/334 [00:00<00:00, 315.93it/s]
[Rank 0] Test Epoch 6:  36%|███▌      | 121/334 [00:00<00:00, 319.45it/s]
[Rank 1] Test Epoch 6:  41%|████      | 136/334 [00:00<00:00, 338.10it/s]
[Rank 2] Test Epoch 6:  45%|████▍     | 149/334 [00:00<00:00, 332.38it/s]
[Rank 0] Test Epoch 6:  46%|████▋     | 155/334 

[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Rank 2] Epoch 6 | Loss: 0.3269, Acc: 0.8839, Model Checksum: bd4b8f3e89aaa6579a8995072d1e24f8
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [ NodeId 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 Rank 2] Epoch 6 | Loss: 0.3269, Acc: 0.8839, Model Checksum: bd4b8f3e89aaa6579a8995072d1e24f8
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 14000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1 | Mem: 26.53MB, Util: 3%  global_step : 14001
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 2 | Mem: 26.53MB, Util: 3%  global_step : 14002
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 3 | Mem: 26.53MB, Util: 3%  global_step : 14003
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 4 | Mem: 26.53MB, Util: 3%  global_step : 14004
[36m(RayTrainWorker pi

[Rank 2] Train Epoch 7:   1%|          | 14/2000 [00:00<00:14, 139.97it/s]
[Rank 0] Train Epoch 7:   1%|          | 12/2000 [00:00<00:17, 112.53it/s]
[Rank 1] Train Epoch 7:   1%|          | 13/2000 [00:00<00:15, 125.40it/s]
[Rank 2] Train Epoch 7:   1%|▏         | 29/2000 [00:00<00:13, 144.05it/s]
[Rank 0] Train Epoch 7:   1%|▏         | 26/2000 [00:00<00:15, 125.21it/s]
[Rank 1] Train Epoch 7:   1%|▏         | 27/2000 [00:00<00:14, 132.05it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 20 | Mem: 26.53MB, Util: 3%  global_step : 14020
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 21 | Mem: 26.53MB, Util: 3%  global_step : 14021
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 22 | Mem: 26.53MB, Util: 3%  global_step : 14022
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 23 | Mem: 26.53MB, Util: 3%  global_step : 14023
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 24 | Mem: 26.53MB, Util: 3%  global_step : 14024
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 25 | Mem: 26.53MB, Util: 3%  global_step : 14025
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 26 | Mem: 26.53MB, Util: 3%  global_step : 14026
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 27 | Mem: 26.53MB, Util: 3%  global_step : 14027


[Rank 2] Train Epoch 7:   2%|▏         | 44/2000 [00:00<00:13, 145.33it/s]
[Rank 0] Train Epoch 7:   2%|▏         | 40/2000 [00:00<00:14, 131.12it/s]
[Rank 1] Train Epoch 7:   2%|▏         | 41/2000 [00:00<00:15, 127.15it/s]
[Rank 2] Train Epoch 7:   3%|▎         | 59/2000 [00:00<00:13, 147.02it/s]
[Rank 0] Train Epoch 7:   3%|▎         | 54/2000 [00:00<00:14, 133.93it/s]
[Rank 1] Train Epoch 7:   3%|▎         | 55/2000 [00:00<00:15, 128.47it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 52 | Mem: 26.53MB, Util: 86%  global_step : 14052
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 53 | Mem: 26.53MB, Util: 86%  global_step : 14053
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 54 | Mem: 26.53MB, Util: 86%  global_step : 14054
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 55 | Mem: 26.53MB, Util: 86%  global_step : 14055
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 56 | Mem: 26.53MB, Util: 86%  global_step : 14056
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 57 | Mem: 26.53MB, Util: 100%  global_step : 14057
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 58 | Mem: 26.53MB, Util: 100%  global_step : 14058
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 59 | Mem: 26.53MB, Util: 100%  global_st

[Rank 2] Train Epoch 7:   4%|▎         | 74/2000 [00:00<00:13, 147.31it/s]
[Rank 0] Train Epoch 7:   3%|▎         | 68/2000 [00:00<00:14, 134.93it/s]
[Rank 1] Train Epoch 7:   3%|▎         | 69/2000 [00:00<00:14, 129.43it/s]
[Rank 2] Train Epoch 7:   4%|▍         | 90/2000 [00:00<00:12, 148.34it/s]
[Rank 0] Train Epoch 7:   4%|▍         | 82/2000 [00:00<00:14, 134.80it/s]
[Rank 1] Train Epoch 7:   4%|▍         | 83/2000 [00:00<00:14, 129.44it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 83 | Mem: 26.53MB, Util: 100%  global_step : 14083
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 84 | Mem: 26.53MB, Util: 100%  global_step : 14084
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 85 | Mem: 26.53MB, Util: 100%  global_step : 14085
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 86 | Mem: 26.53MB, Util: 100%  global_step : 14086
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 87 | Mem: 26.53MB, Util: 100%  global_step : 14087
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 88 | Mem: 26.53MB, Util: 100%  global_step : 14088
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 89 | Mem: 26.53MB, Util: 100%  global_step : 14089
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 90 | Mem: 26.53MB, Util: 100%  glob

[Rank 0] Train Epoch 7:   5%|▍         | 96/2000 [00:00<00:14, 135.06it/s]
[Rank 1] Train Epoch 7:   5%|▍         | 97/2000 [00:00<00:14, 129.91it/s]
[Rank 2] Train Epoch 7:   5%|▌         | 105/2000 [00:00<00:18, 100.46it/s]
[Rank 0] Train Epoch 7:   6%|▌         | 110/2000 [00:00<00:14, 131.00it/s]
[Rank 1] Train Epoch 7:   6%|▌         | 111/2000 [00:00<00:14, 130.25it/s]
[Rank 2] Train Epoch 7:   6%|▌         | 120/2000 [00:00<00:16, 111.88it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 100 | Mem: 26.53MB, Util: 100%  global_step : 14100
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 101 | Mem: 26.53MB, Util: 100%  global_step : 14101
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 102 | Mem: 26.53MB, Util: 100%  global_step : 14102
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 103 | Mem: 26.53MB, Util: 100%  global_step : 14103
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 104 | Mem: 26.53MB, Util: 100%  global_step : 14104
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 105 | Mem: 26.53MB, Util: 100%  global_step : 14105
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 106 | Mem: 26.53MB, Util: 100%  global_step : 14106
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 100 | Mem: 26.53MB, Util: 96%

[Rank 0] Train Epoch 7:   6%|▌         | 124/2000 [00:00<00:14, 129.52it/s]
[Rank 1] Train Epoch 7:   6%|▋         | 125/2000 [00:00<00:14, 130.36it/s]
[Rank 2] Train Epoch 7:   7%|▋         | 135/2000 [00:01<00:15, 120.83it/s]
[Rank 0] Train Epoch 7:   7%|▋         | 138/2000 [00:01<00:14, 131.94it/s]
[Rank 1] Train Epoch 7:   7%|▋         | 139/2000 [00:01<00:14, 130.57it/s]
[Rank 2] Train Epoch 7:   8%|▊         | 150/2000 [00:01<00:14, 127.33it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 123 | Mem: 26.53MB, Util: 100%  global_step : 14123
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 124 | Mem: 26.53MB, Util: 100%  global_step : 14124
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 125 | Mem: 26.53MB, Util: 100%  global_step : 14125
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 126 | Mem: 26.53MB, Util: 100%  global_step : 14126
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 127 | Mem: 26.53MB, Util: 97%  global_step : 14127
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 128 | Mem: 26.53MB, Util: 97%  global_step : 14128
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 129 | Mem: 26.53MB, Util: 97%  global_step : 14129
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 130 | Mem: 26.53MB, Util: 97%  

[Rank 0] Train Epoch 7:   8%|▊         | 152/2000 [00:01<00:13, 133.98it/s]
[Rank 1] Train Epoch 7:   8%|▊         | 153/2000 [00:01<00:14, 130.80it/s]
[Rank 2] Train Epoch 7:   8%|▊         | 165/2000 [00:01<00:13, 132.67it/s]
[Rank 0] Train Epoch 7:   8%|▊         | 167/2000 [00:01<00:13, 136.41it/s]
[Rank 1] Train Epoch 7:   8%|▊         | 167/2000 [00:01<00:14, 130.46it/s]
[Rank 2] Train Epoch 7:   9%|▉         | 180/2000 [00:01<00:13, 136.51it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 153 | Mem: 26.53MB, Util: 97%  global_step : 14153
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 154 | Mem: 26.53MB, Util: 97%  global_step : 14154
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 155 | Mem: 26.53MB, Util: 97%  global_step : 14155
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 156 | Mem: 26.53MB, Util: 97%  global_step : 14156
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 157 | Mem: 26.53MB, Util: 100%  global_step : 14157
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 158 | Mem: 26.53MB, Util: 100%  global_step : 14158
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 159 | Mem: 26.53MB, Util: 100%  global_step : 14159
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 160 | Mem: 26.53MB, Util: 100%  

[Rank 0] Train Epoch 7:   9%|▉         | 182/2000 [00:01<00:13, 138.55it/s]
[Rank 1] Train Epoch 7:   9%|▉         | 181/2000 [00:01<00:13, 130.44it/s]
[Rank 2] Train Epoch 7:  10%|▉         | 195/2000 [00:01<00:12, 139.65it/s]
[Rank 0] Train Epoch 7:  10%|▉         | 196/2000 [00:01<00:13, 134.65it/s]
[Rank 1] Train Epoch 7:  10%|▉         | 195/2000 [00:01<00:13, 130.57it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 184 | Mem: 26.53MB, Util: 100%  global_step : 14184
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 185 | Mem: 26.53MB, Util: 100%  global_step : 14185
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 186 | Mem: 26.53MB, Util: 100%  global_step : 14186
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 187 | Mem: 26.53MB, Util: 100%  global_step : 14187
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 188 | Mem: 26.53MB, Util: 100%  global_step : 14188
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 189 | Mem: 26.53MB, Util: 100%  global_step : 14189
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 190 | Mem: 26.53MB, Util: 100%  global_step : 14190
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 191 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 7:  10%|█         | 210/2000 [00:01<00:14, 124.53it/s]
[Rank 1] Train Epoch 7:  10%|█         | 209/2000 [00:01<00:13, 130.47it/s]
[Rank 2] Train Epoch 7:  10%|█         | 210/2000 [00:01<00:15, 114.83it/s]
[Rank 0] Train Epoch 7:  11%|█         | 224/2000 [00:01<00:13, 128.09it/s]
[Rank 1] Train Epoch 7:  11%|█         | 223/2000 [00:01<00:13, 130.68it/s]
[Rank 2] Train Epoch 7:  11%|█▏        | 225/2000 [00:01<00:14, 123.18it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 202 | Mem: 26.53MB, Util: 100%  global_step : 14202
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 203 | Mem: 26.53MB, Util: 100%  global_step : 14203
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 204 | Mem: 26.53MB, Util: 100%  global_step : 14204
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 205 | Mem: 26.53MB, Util: 100%  global_step : 14205
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 206 | Mem: 26.53MB, Util: 100%  global_step : 14206
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 207 | Mem: 26.53MB, Util: 100%  global_step : 14207
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 208 | Mem: 26.53MB, Util: 100%  global_step : 14208
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 209 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 7:  12%|█▏        | 238/2000 [00:01<00:13, 130.51it/s]
[Rank 1] Train Epoch 7:  12%|█▏        | 237/2000 [00:01<00:13, 130.58it/s]
[Rank 2] Train Epoch 7:  12%|█▏        | 240/2000 [00:01<00:13, 129.62it/s]
[Rank 0] Train Epoch 7:  13%|█▎        | 252/2000 [00:01<00:13, 132.60it/s]
[Rank 1] Train Epoch 7:  13%|█▎        | 251/2000 [00:01<00:13, 130.13it/s]
[Rank 2] Train Epoch 7:  13%|█▎        | 255/2000 [00:01<00:12, 134.39it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 233 | Mem: 26.53MB, Util: 100%  global_step : 14233
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 234 | Mem: 26.53MB, Util: 100%  global_step : 14234
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 235 | Mem: 26.53MB, Util: 100%  global_step : 14235
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 236 | Mem: 26.53MB, Util: 100%  global_step : 14236
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 237 | Mem: 26.53MB, Util: 100%  global_step : 14237
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 238 | Mem: 26.53MB, Util: 98%  global_step : 14238
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 239 | Mem: 26.53MB, Util: 98%  global_step : 14239
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 240 | Mem: 26.53MB, Util: 98% 

[Rank 0] Train Epoch 7:  13%|█▎        | 266/2000 [00:02<00:12, 134.02it/s]
[Rank 1] Train Epoch 7:  13%|█▎        | 265/2000 [00:02<00:13, 130.17it/s]
[Rank 2] Train Epoch 7:  14%|█▎        | 270/2000 [00:02<00:12, 138.43it/s]
[Rank 0] Train Epoch 7:  14%|█▍        | 280/2000 [00:02<00:12, 134.39it/s]
[Rank 1] Train Epoch 7:  14%|█▍        | 279/2000 [00:02<00:13, 130.27it/s]
[Rank 2] Train Epoch 7:  14%|█▍        | 285/2000 [00:02<00:12, 141.30it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 264 | Mem: 26.53MB, Util: 98%  global_step : 14264
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 265 | Mem: 26.53MB, Util: 98%  global_step : 14265
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 266 | Mem: 26.53MB, Util: 98%  global_step : 14266
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 267 | Mem: 26.53MB, Util: 98%  global_step : 14267
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 268 | Mem: 26.53MB, Util: 98%  global_step : 14268
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 269 | Mem: 26.53MB, Util: 100%  global_step : 14269
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 270 | Mem: 26.53MB, Util: 100%  global_step : 14270
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 271 | Mem: 26.53MB, Util: 100%  g

[Rank 0] Train Epoch 7:  15%|█▍        | 294/2000 [00:02<00:12, 133.96it/s]
[Rank 2] Train Epoch 7:  15%|█▌        | 300/2000 [00:02<00:11, 143.28it/s]
[Rank 1] Train Epoch 7:  15%|█▍        | 293/2000 [00:02<00:13, 129.88it/s]
[Rank 1] Train Epoch 7:  15%|█▌        | 306/2000 [00:02<00:13, 129.60it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 295 | Mem: 26.53MB, Util: 100%  global_step : 14295
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 296 | Mem: 26.53MB, Util: 100%  global_step : 14296
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 297 | Mem: 26.53MB, Util: 100%  global_step : 14297
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 298 | Mem: 26.53MB, Util: 100%  global_step : 14298
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 299 | Mem: 26.53MB, Util: 100%  global_step : 14299
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 296 | Mem: 26.53MB, Util: 100%  global_step : 14296
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 297 | Mem: 26.53MB, Util: 100%  global_step : 14297
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 298 | Mem: 26.53MB, Util: 100% 

[Rank 0] Train Epoch 7:  15%|█▌        | 308/2000 [00:02<00:13, 122.19it/s]
[Rank 2] Train Epoch 7:  16%|█▌        | 315/2000 [00:02<00:14, 113.34it/s]
[Rank 0] Train Epoch 7:  16%|█▌        | 322/2000 [00:02<00:13, 126.20it/s]
[Rank 1] Train Epoch 7:  16%|█▌        | 320/2000 [00:02<00:12, 130.12it/s]
[Rank 2] Train Epoch 7:  16%|█▋        | 330/2000 [00:02<00:13, 121.81it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 312 | Mem: 26.53MB, Util: 100%  global_step : 14312
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 313 | Mem: 26.53MB, Util: 97%  global_step : 14313
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 314 | Mem: 26.53MB, Util: 97%  global_step : 14314
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 315 | Mem: 26.53MB, Util: 97%  global_step : 14315
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 316 | Mem: 26.53MB, Util: 97%  global_step : 14316
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 317 | Mem: 26.53MB, Util: 97%  global_step : 14317
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 318 | Mem: 26.53MB, Util: 97%  global_step : 14318
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 319 | Mem: 26.53MB, Util: 97%  glo

[Rank 0] Train Epoch 7:  17%|█▋        | 336/2000 [00:02<00:12, 128.29it/s]
[Rank 1] Train Epoch 7:  17%|█▋        | 334/2000 [00:02<00:12, 130.63it/s]
[Rank 2] Train Epoch 7:  17%|█▋        | 346/2000 [00:02<00:12, 129.25it/s]
[Rank 0] Train Epoch 7:  18%|█▊        | 350/2000 [00:02<00:12, 131.10it/s]
[Rank 1] Train Epoch 7:  17%|█▋        | 348/2000 [00:02<00:12, 130.70it/s]
[Rank 2] Train Epoch 7:  18%|█▊        | 361/2000 [00:02<00:12, 134.41it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 343 | Mem: 26.53MB, Util: 100%  global_step : 14343
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 344 | Mem: 26.53MB, Util: 100%  global_step : 14344
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 345 | Mem: 26.53MB, Util: 100%  global_step : 14345
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 346 | Mem: 26.53MB, Util: 100%  global_step : 14346
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 347 | Mem: 26.53MB, Util: 100%  global_step : 14347
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 348 | Mem: 26.53MB, Util: 100%  global_step : 14348
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 349 | Mem: 26.53MB, Util: 100%  global_step : 14349
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 350 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 7:  18%|█▊        | 364/2000 [00:02<00:12, 132.28it/s]
[Rank 1] Train Epoch 7:  18%|█▊        | 362/2000 [00:02<00:12, 130.69it/s]
[Rank 2] Train Epoch 7:  19%|█▉        | 376/2000 [00:02<00:11, 138.45it/s]
[Rank 1] Train Epoch 7:  19%|█▉        | 376/2000 [00:02<00:12, 131.00it/s]
[Rank 0] Train Epoch 7:  19%|█▉        | 378/2000 [00:02<00:12, 132.98it/s]
[Rank 2] Train Epoch 7:  20%|█▉        | 391/2000 [00:02<00:11, 140.96it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 374 | Mem: 26.53MB, Util: 100%  global_step : 14374
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 375 | Mem: 26.53MB, Util: 100%  global_step : 14375
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 376 | Mem: 26.53MB, Util: 100%  global_step : 14376
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 377 | Mem: 26.53MB, Util: 100%  global_step : 14377
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 378 | Mem: 26.53MB, Util: 100%  global_step : 14378
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 379 | Mem: 26.53MB, Util: 100%  global_step : 14379
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 380 | Mem: 26.53MB, Util: 100%  global_step : 14380
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 381 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  20%|█▉        | 390/2000 [00:02<00:12, 130.43it/s]
[Rank 0] Train Epoch 7:  20%|█▉        | 392/2000 [00:02<00:12, 133.91it/s]
[Rank 2] Train Epoch 7:  20%|██        | 406/2000 [00:03<00:13, 114.23it/s]
[Rank 1] Train Epoch 7:  20%|██        | 404/2000 [00:03<00:12, 130.68it/s]
[Rank 0] Train Epoch 7:  20%|██        | 406/2000 [00:03<00:12, 124.70it/s]
[Rank 2] Train Epoch 7:  21%|██        | 421/2000 [00:03<00:12, 122.79it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 400 | Mem: 26.53MB, Util: 100%  global_step : 14400
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 401 | Mem: 26.53MB, Util: 100%  global_step : 14401
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 402 | Mem: 26.53MB, Util: 100%  global_step : 14402
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 403 | Mem: 26.53MB, Util: 100%  global_step : 14403
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 404 | Mem: 26.53MB, Util: 100%  global_step : 14404
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 405 | Mem: 26.53MB, Util: 100%  global_step : 14405
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 406 | Mem: 26.53MB, Util: 100%  global_step : 14406
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 407 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  21%|██        | 418/2000 [00:03<00:12, 130.34it/s]
[Rank 0] Train Epoch 7:  21%|██        | 420/2000 [00:03<00:12, 127.72it/s]
[Rank 2] Train Epoch 7:  22%|██▏       | 436/2000 [00:03<00:12, 129.83it/s]
[Rank 1] Train Epoch 7:  22%|██▏       | 432/2000 [00:03<00:12, 130.48it/s]
[Rank 0] Train Epoch 7:  22%|██▏       | 433/2000 [00:03<00:12, 125.38it/s]
[Rank 2] Train Epoch 7:  23%|██▎       | 451/2000 [00:03<00:11, 134.94it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 423 | Mem: 26.53MB, Util: 97%  global_step : 14423
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 424 | Mem: 26.53MB, Util: 97%  global_step : 14424
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 425 | Mem: 26.53MB, Util: 97%  global_step : 14425
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 426 | Mem: 26.53MB, Util: 97%  global_step : 14426
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 427 | Mem: 26.53MB, Util: 97%  global_step : 14427
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 428 | Mem: 26.53MB, Util: 97%  global_step : 14428
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 429 | Mem: 26.53MB, Util: 97%  global_step : 14429
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 430 | Mem: 26.53MB, Util: 97%  glob

[Rank 1] Train Epoch 7:  22%|██▏       | 446/2000 [00:03<00:11, 130.70it/s]
[Rank 0] Train Epoch 7:  22%|██▏       | 447/2000 [00:03<00:12, 129.08it/s]
[Rank 2] Train Epoch 7:  23%|██▎       | 466/2000 [00:03<00:11, 138.44it/s]
[Rank 1] Train Epoch 7:  23%|██▎       | 460/2000 [00:03<00:11, 130.85it/s]
[Rank 0] Train Epoch 7:  23%|██▎       | 461/2000 [00:03<00:11, 128.38it/s]
[Rank 2] Train Epoch 7:  24%|██▍       | 481/2000 [00:03<00:10, 141.56it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 454 | Mem: 26.53MB, Util: 97%  global_step : 14454
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 455 | Mem: 26.53MB, Util: 100%  global_step : 14455
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 456 | Mem: 26.53MB, Util: 100%  global_step : 14456
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 457 | Mem: 26.53MB, Util: 100%  global_step : 14457
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 458 | Mem: 26.53MB, Util: 100%  global_step : 14458
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 459 | Mem: 26.53MB, Util: 100%  global_step : 14459
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 460 | Mem: 26.53MB, Util: 100%  global_step : 14460
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 461 | Mem: 26.53MB, Util: 100

[Rank 1] Train Epoch 7:  24%|██▎       | 474/2000 [00:03<00:11, 130.72it/s]
[Rank 0] Train Epoch 7:  24%|██▍       | 475/2000 [00:03<00:11, 131.16it/s]
[Rank 2] Train Epoch 7:  25%|██▍       | 496/2000 [00:03<00:10, 139.38it/s]
[Rank 1] Train Epoch 7:  24%|██▍       | 488/2000 [00:03<00:11, 130.99it/s]
[Rank 0] Train Epoch 7:  24%|██▍       | 489/2000 [00:03<00:11, 133.21it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 485 | Mem: 26.53MB, Util: 100%  global_step : 14485
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 486 | Mem: 26.53MB, Util: 100%  global_step : 14486
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 487 | Mem: 26.53MB, Util: 100%  global_step : 14487
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 488 | Mem: 26.53MB, Util: 100%  global_step : 14488
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 489 | Mem: 26.53MB, Util: 100%  global_step : 14489
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 490 | Mem: 26.53MB, Util: 100%  global_step : 14490
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 491 | Mem: 26.53MB, Util: 100%  global_step : 14491
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 492 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  25%|██▌       | 502/2000 [00:03<00:11, 130.97it/s]
[Rank 0] Train Epoch 7:  25%|██▌       | 503/2000 [00:03<00:11, 129.08it/s]
[Rank 2] Train Epoch 7:  26%|██▌       | 511/2000 [00:03<00:12, 115.13it/s]
[Rank 1] Train Epoch 7:  26%|██▌       | 516/2000 [00:03<00:11, 130.57it/s]
[Rank 0] Train Epoch 7:  26%|██▌       | 519/2000 [00:03<00:10, 135.90it/s]
[Rank 2] Train Epoch 7:  26%|██▋       | 526/2000 [00:04<00:11, 123.22it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 502 | Mem: 26.53MB, Util: 100%  global_step : 14502
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 503 | Mem: 26.53MB, Util: 100%  global_step : 14503
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 504 | Mem: 26.53MB, Util: 100%  global_step : 14504
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 505 | Mem: 26.53MB, Util: 100%  global_step : 14505
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 506 | Mem: 26.53MB, Util: 100%  global_step : 14506
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 507 | Mem: 26.53MB, Util: 100%  global_step : 14507
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 508 | Mem: 26.53MB, Util: 100%  global_step : 14508
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 509 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  26%|██▋       | 530/2000 [00:04<00:11, 130.73it/s]
[Rank 0] Train Epoch 7:  27%|██▋       | 535/2000 [00:04<00:10, 140.49it/s]
[Rank 2] Train Epoch 7:  27%|██▋       | 541/2000 [00:04<00:11, 129.71it/s]
[Rank 1] Train Epoch 7:  27%|██▋       | 544/2000 [00:04<00:11, 130.40it/s]
[Rank 0] Train Epoch 7:  28%|██▊       | 551/2000 [00:04<00:10, 143.59it/s]
[Rank 2] Train Epoch 7:  28%|██▊       | 556/2000 [00:04<00:10, 133.68it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 533 | Mem: 26.53MB, Util: 96%  global_step : 14533
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 534 | Mem: 26.53MB, Util: 96%  global_step : 14534
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 535 | Mem: 26.53MB, Util: 96%  global_step : 14535
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 536 | Mem: 26.53MB, Util: 96%  global_step : 14536
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 537 | Mem: 26.53MB, Util: 96%  global_step : 14537
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 538 | Mem: 26.53MB, Util: 96%  global_step : 14538
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 539 | Mem: 26.53MB, Util: 96%  global_step : 14539
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 540 | Mem: 26.53MB, Util: 96%  glob

[Rank 1] Train Epoch 7:  28%|██▊       | 558/2000 [00:04<00:11, 129.92it/s]
[Rank 0] Train Epoch 7:  28%|██▊       | 566/2000 [00:04<00:10, 134.27it/s]
[Rank 2] Train Epoch 7:  29%|██▊       | 571/2000 [00:04<00:10, 137.62it/s]
[Rank 1] Train Epoch 7:  29%|██▊       | 571/2000 [00:04<00:11, 129.91it/s]
[Rank 0] Train Epoch 7:  29%|██▉       | 582/2000 [00:04<00:10, 139.15it/s]
[Rank 2] Train Epoch 7:  29%|██▉       | 586/2000 [00:04<00:10, 140.44it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 564 | Mem: 26.53MB, Util: 100%  global_step : 14564
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 565 | Mem: 26.53MB, Util: 100%  global_step : 14565
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 566 | Mem: 26.53MB, Util: 100%  global_step : 14566
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 567 | Mem: 26.53MB, Util: 100%  global_step : 14567
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 568 | Mem: 26.53MB, Util: 100%  global_step : 14568
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 569 | Mem: 26.53MB, Util: 100%  global_step : 14569
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 570 | Mem: 26.53MB, Util: 100%  global_step : 14570
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 571 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  29%|██▉       | 584/2000 [00:04<00:10, 129.69it/s]
[Rank 0] Train Epoch 7:  30%|██▉       | 597/2000 [00:04<00:10, 139.12it/s]
[Rank 1] Train Epoch 7:  30%|██▉       | 598/2000 [00:04<00:10, 129.81it/s]
[Rank 2] Train Epoch 7:  30%|███       | 601/2000 [00:04<00:12, 113.52it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 595 | Mem: 26.53MB, Util: 100%  global_step : 14595
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 596 | Mem: 26.53MB, Util: 100%  global_step : 14596
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 597 | Mem: 26.53MB, Util: 100%  global_step : 14597
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 598 | Mem: 26.53MB, Util: 100%  global_step : 14598
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 599 | Mem: 26.53MB, Util: 100%  global_step : 14599
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 7, Batch 591 | Mem: 26.53MB, Util: 66%  global_step : 14591
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 7, Batch 592 | Mem: 26.53MB, Util: 66%  global_step : 14592
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 7, Batch 593 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 7:  31%|███       | 611/2000 [00:04<00:10, 128.41it/s]
[Rank 0] Train Epoch 7:  31%|███       | 611/2000 [00:04<00:12, 115.30it/s]
[Rank 2] Train Epoch 7:  31%|███       | 616/2000 [00:04<00:11, 121.67it/s]
[Rank 1] Train Epoch 7:  31%|███▏      | 625/2000 [00:04<00:10, 129.96it/s]
[Rank 0] Train Epoch 7:  31%|███▏      | 625/2000 [00:04<00:11, 121.23it/s]
[Rank 2] Train Epoch 7:  32%|███▏      | 631/2000 [00:04<00:10, 128.17it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 612 | Mem: 26.53MB, Util: 100%  global_step : 14612
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 613 | Mem: 26.53MB, Util: 100%  global_step : 14613
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 614 | Mem: 26.53MB, Util: 100%  global_step : 14614
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 615 | Mem: 26.53MB, Util: 100%  global_step : 14615
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 616 | Mem: 26.53MB, Util: 100%  global_step : 14616
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 617 | Mem: 26.53MB, Util: 100%  global_step : 14617
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 618 | Mem: 26.53MB, Util: 100%  global_step : 14618
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 619 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  32%|███▏      | 639/2000 [00:04<00:10, 132.39it/s]
[Rank 0] Train Epoch 7:  32%|███▏      | 639/2000 [00:04<00:10, 125.72it/s]
[Rank 2] Train Epoch 7:  32%|███▏      | 646/2000 [00:04<00:10, 133.30it/s]
[Rank 1] Train Epoch 7:  33%|███▎      | 653/2000 [00:05<00:10, 132.98it/s]
[Rank 0] Train Epoch 7:  33%|███▎      | 653/2000 [00:04<00:10, 128.84it/s]
[Rank 2] Train Epoch 7:  33%|███▎      | 661/2000 [00:05<00:09, 137.27it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 643 | Mem: 26.53MB, Util: 96%  global_step : 14643
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 644 | Mem: 26.53MB, Util: 96%  global_step : 14644
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 645 | Mem: 26.53MB, Util: 96%  global_step : 14645
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 646 | Mem: 26.53MB, Util: 96%  global_step : 14646
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 647 | Mem: 26.53MB, Util: 96%  global_step : 14647
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 648 | Mem: 26.53MB, Util: 96%  global_step : 14648
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 649 | Mem: 26.53MB, Util: 96%  global_step : 14649
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 650 | Mem: 26.53MB, Util: 96%  glob

[Rank 1] Train Epoch 7:  33%|███▎      | 667/2000 [00:05<00:09, 133.54it/s]
[Rank 0] Train Epoch 7:  33%|███▎      | 667/2000 [00:05<00:10, 130.99it/s]
[Rank 2] Train Epoch 7:  34%|███▍      | 676/2000 [00:05<00:09, 140.55it/s]
[Rank 1] Train Epoch 7:  34%|███▍      | 681/2000 [00:05<00:09, 134.11it/s]
[Rank 0] Train Epoch 7:  34%|███▍      | 681/2000 [00:05<00:09, 133.16it/s]
[Rank 2] Train Epoch 7:  35%|███▍      | 691/2000 [00:05<00:09, 141.98it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 674 | Mem: 26.53MB, Util: 100%  global_step : 14674
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 675 | Mem: 26.53MB, Util: 100%  global_step : 14675
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 676 | Mem: 26.53MB, Util: 100%  global_step : 14676
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 677 | Mem: 26.53MB, Util: 100%  global_step : 14677
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 678 | Mem: 26.53MB, Util: 100%  global_step : 14678
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 679 | Mem: 26.53MB, Util: 100%  global_step : 14679
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 680 | Mem: 26.53MB, Util: 100%  global_step : 14680
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 681 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  35%|███▍      | 695/2000 [00:05<00:09, 131.91it/s]
[Rank 0] Train Epoch 7:  35%|███▍      | 695/2000 [00:05<00:09, 134.58it/s]
[Rank 2] Train Epoch 7:  35%|███▌      | 706/2000 [00:05<00:10, 117.84it/s]
[Rank 1] Train Epoch 7:  35%|███▌      | 709/2000 [00:05<00:09, 130.38it/s]
[Rank 0] Train Epoch 7:  35%|███▌      | 709/2000 [00:05<00:10, 127.01it/s]
[Rank 2] Train Epoch 7:  36%|███▌      | 721/2000 [00:05<00:10, 125.32it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 700 | Mem: 26.53MB, Util: 100%  global_step : 14700
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 701 | Mem: 26.53MB, Util: 100%  global_step : 14701
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 702 | Mem: 26.53MB, Util: 100%  global_step : 14702
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 703 | Mem: 26.53MB, Util: 100%  global_step : 14703
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 704 | Mem: 26.53MB, Util: 100%  global_step : 14704
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 705 | Mem: 26.53MB, Util: 100%  global_step : 14705
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 706 | Mem: 26.53MB, Util: 100%  global_step : 14706
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 707 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  36%|███▌      | 723/2000 [00:05<00:09, 131.14it/s]
[Rank 0] Train Epoch 7:  36%|███▌      | 723/2000 [00:05<00:09, 130.28it/s]
[Rank 2] Train Epoch 7:  37%|███▋      | 736/2000 [00:05<00:09, 131.07it/s]
[Rank 1] Train Epoch 7:  37%|███▋      | 737/2000 [00:05<00:09, 131.63it/s]
[Rank 0] Train Epoch 7:  37%|███▋      | 738/2000 [00:05<00:09, 133.19it/s]
[Rank 2] Train Epoch 7:  38%|███▊      | 751/2000 [00:05<00:09, 135.49it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 724 | Mem: 26.53MB, Util: 100%  global_step : 14724
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 725 | Mem: 26.53MB, Util: 100%  global_step : 14725
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 726 | Mem: 26.53MB, Util: 100%  global_step : 14726
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 727 | Mem: 26.53MB, Util: 100%  global_step : 14727
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 728 | Mem: 26.53MB, Util: 100%  global_step : 14728
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 729 | Mem: 26.53MB, Util: 100%  global_step : 14729
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 730 | Mem: 26.53MB, Util: 100%  global_step : 14730
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 731 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  38%|███▊      | 751/2000 [00:05<00:09, 131.62it/s]
[Rank 0] Train Epoch 7:  38%|███▊      | 753/2000 [00:05<00:09, 135.47it/s]
[Rank 2] Train Epoch 7:  38%|███▊      | 765/2000 [00:05<00:09, 136.59it/s]
[Rank 1] Train Epoch 7:  38%|███▊      | 765/2000 [00:05<00:09, 131.84it/s]
[Rank 0] Train Epoch 7:  38%|███▊      | 767/2000 [00:05<00:09, 133.51it/s]
[Rank 2] Train Epoch 7:  39%|███▉      | 780/2000 [00:05<00:08, 140.03it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 754 | Mem: 26.53MB, Util: 97%  global_step : 14754
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 755 | Mem: 26.53MB, Util: 97%  global_step : 14755
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 756 | Mem: 26.53MB, Util: 97%  global_step : 14756
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 757 | Mem: 26.53MB, Util: 97%  global_step : 14757
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 758 | Mem: 26.53MB, Util: 97%  global_step : 14758
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 759 | Mem: 26.53MB, Util: 97%  global_step : 14759
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 760 | Mem: 26.53MB, Util: 97%  global_step : 14760
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 761 | Mem: 26.53MB, Util: 97%  glob

[Rank 1] Train Epoch 7:  39%|███▉      | 779/2000 [00:05<00:09, 131.52it/s]
[Rank 0] Train Epoch 7:  39%|███▉      | 781/2000 [00:05<00:09, 135.18it/s]
[Rank 2] Train Epoch 7:  40%|███▉      | 795/2000 [00:06<00:08, 142.77it/s]
[Rank 1] Train Epoch 7:  40%|███▉      | 793/2000 [00:06<00:09, 132.02it/s]
[Rank 0] Train Epoch 7:  40%|███▉      | 796/2000 [00:06<00:08, 136.74it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 785 | Mem: 26.53MB, Util: 100%  global_step : 14785
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 786 | Mem: 26.53MB, Util: 100%  global_step : 14786
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 787 | Mem: 26.53MB, Util: 100%  global_step : 14787
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 788 | Mem: 26.53MB, Util: 100%  global_step : 14788
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 789 | Mem: 26.53MB, Util: 100%  global_step : 14789
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 790 | Mem: 26.53MB, Util: 100%  global_step : 14790
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 791 | Mem: 26.53MB, Util: 100%  global_step : 14791
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 792 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  40%|████      | 807/2000 [00:06<00:09, 132.43it/s]
[Rank 0] Train Epoch 7:  40%|████      | 810/2000 [00:06<00:09, 122.24it/s]
[Rank 2] Train Epoch 7:  40%|████      | 810/2000 [00:06<00:10, 117.67it/s]
[Rank 1] Train Epoch 7:  41%|████      | 821/2000 [00:06<00:08, 132.43it/s]
[Rank 0] Train Epoch 7:  41%|████▏     | 825/2000 [00:06<00:09, 127.45it/s]
[Rank 2] Train Epoch 7:  41%|████▏     | 825/2000 [00:06<00:09, 125.19it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 804 | Mem: 26.53MB, Util: 100%  global_step : 14804
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 805 | Mem: 26.53MB, Util: 100%  global_step : 14805
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 806 | Mem: 26.53MB, Util: 100%  global_step : 14806
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 807 | Mem: 26.53MB, Util: 100%  global_step : 14807
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 808 | Mem: 26.53MB, Util: 100%  global_step : 14808
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 809 | Mem: 26.53MB, Util: 100%  global_step : 14809
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 810 | Mem: 26.53MB, Util: 100%  global_step : 14810
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 811 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  42%|████▏     | 835/2000 [00:06<00:08, 129.86it/s]
[Rank 0] Train Epoch 7:  42%|████▏     | 839/2000 [00:06<00:08, 130.57it/s]
[Rank 2] Train Epoch 7:  42%|████▏     | 840/2000 [00:06<00:08, 131.02it/s]
[Rank 1] Train Epoch 7:  42%|████▏     | 848/2000 [00:06<00:08, 128.57it/s]
[Rank 0] Train Epoch 7:  43%|████▎     | 853/2000 [00:06<00:08, 132.15it/s]
[Rank 2] Train Epoch 7:  43%|████▎     | 855/2000 [00:06<00:08, 135.81it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 845 | Mem: 26.53MB, Util: 82%  global_step : 14845
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 835 | Mem: 26.53MB, Util: 97%  global_step : 14835
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 836 | Mem: 26.53MB, Util: 97%  global_step : 14836
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 837 | Mem: 26.53MB, Util: 97%  global_step : 14837
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 838 | Mem: 26.53MB, Util: 97%  global_step : 14838
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 839 | Mem: 26.53MB, Util: 97%  global_step : 14839
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 840 | Mem: 26.53MB, Util: 97%  global_step : 14840
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 841 | Mem: 26.53MB, Util: 97%  globa

[Rank 1] Train Epoch 7:  43%|████▎     | 862/2000 [00:06<00:08, 130.47it/s]
[Rank 0] Train Epoch 7:  43%|████▎     | 867/2000 [00:06<00:08, 133.70it/s]
[Rank 2] Train Epoch 7:  44%|████▎     | 870/2000 [00:06<00:08, 139.16it/s]
[Rank 1] Train Epoch 7:  44%|████▍     | 876/2000 [00:06<00:08, 132.27it/s]
[Rank 0] Train Epoch 7:  44%|████▍     | 881/2000 [00:06<00:08, 134.95it/s]
[Rank 2] Train Epoch 7:  44%|████▍     | 885/2000 [00:06<00:08, 135.75it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 860 | Mem: 26.53MB, Util: 96%  global_step : 14860
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 861 | Mem: 26.53MB, Util: 96%  global_step : 14861
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 862 | Mem: 26.53MB, Util: 96%  global_step : 14862
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 863 | Mem: 26.53MB, Util: 96%  global_step : 14863
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 864 | Mem: 26.53MB, Util: 96%  global_step : 14864
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 865 | Mem: 26.53MB, Util: 96%  global_step : 14865
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 866 | Mem: 26.53MB, Util: 96%  global_step : 14866
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 867 | Mem: 26.53MB, Util: 96%  global_step 

[Rank 1] Train Epoch 7:  44%|████▍     | 890/2000 [00:06<00:08, 133.30it/s]
[Rank 0] Train Epoch 7:  45%|████▍     | 896/2000 [00:06<00:08, 137.47it/s]
[Rank 2] Train Epoch 7:  45%|████▍     | 899/2000 [00:06<00:08, 135.62it/s]
[Rank 1] Train Epoch 7:  45%|████▌     | 904/2000 [00:06<00:08, 129.93it/s]
[Rank 0] Train Epoch 7:  46%|████▌     | 910/2000 [00:06<00:08, 122.93it/s]
[Rank 2] Train Epoch 7:  46%|████▌     | 913/2000 [00:07<00:09, 115.08it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 890 | Mem: 26.53MB, Util: 100%  global_step : 14890
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 891 | Mem: 26.53MB, Util: 100%  global_step : 14891
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 892 | Mem: 26.53MB, Util: 100%  global_step : 14892
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 893 | Mem: 26.53MB, Util: 100%  global_step : 14893
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 894 | Mem: 26.53MB, Util: 100%  global_step : 14894
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 895 | Mem: 26.53MB, Util: 100%  global_step : 14895
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 896 | Mem: 26.53MB, Util: 100%  global_step : 14896
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 897 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 7:  46%|████▌     | 918/2000 [00:07<00:08, 130.49it/s]
[Rank 0] Train Epoch 7:  46%|████▋     | 925/2000 [00:07<00:08, 127.80it/s]
[Rank 2] Train Epoch 7:  46%|████▋     | 927/2000 [00:07<00:08, 119.70it/s]
[Rank 1] Train Epoch 7:  47%|████▋     | 932/2000 [00:07<00:08, 129.01it/s]
[Rank 0] Train Epoch 7:  47%|████▋     | 939/2000 [00:07<00:08, 130.90it/s]
[Rank 2] Train Epoch 7:  47%|████▋     | 941/2000 [00:07<00:08, 123.14it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 913 | Mem: 26.53MB, Util: 100%  global_step : 14913
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 914 | Mem: 26.53MB, Util: 100%  global_step : 14914
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 915 | Mem: 26.53MB, Util: 100%  global_step : 14915
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 916 | Mem: 26.53MB, Util: 100%  global_step : 14916
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 917 | Mem: 26.53MB, Util: 100%  global_step : 14917
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 918 | Mem: 26.53MB, Util: 100%  global_step : 14918
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 919 | Mem: 26.53MB, Util: 100%  global_step : 14919
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 920 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 7:  47%|████▋     | 946/2000 [00:07<00:08, 130.03it/s]
[Rank 0] Train Epoch 7:  48%|████▊     | 953/2000 [00:07<00:07, 133.21it/s]
[Rank 2] Train Epoch 7:  48%|████▊     | 955/2000 [00:07<00:08, 125.78it/s]
[Rank 1] Train Epoch 7:  48%|████▊     | 960/2000 [00:07<00:07, 130.84it/s]
[Rank 0] Train Epoch 7:  48%|████▊     | 967/2000 [00:07<00:07, 133.97it/s]
[Rank 2] Train Epoch 7:  48%|████▊     | 969/2000 [00:07<00:08, 127.88it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 943 | Mem: 26.53MB, Util: 96%  global_step : 14943
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 944 | Mem: 26.53MB, Util: 96%  global_step : 14944
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 945 | Mem: 26.53MB, Util: 96%  global_step : 14945
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 946 | Mem: 26.53MB, Util: 96%  global_step : 14946
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 947 | Mem: 26.53MB, Util: 96%  global_step : 14947
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 948 | Mem: 26.53MB, Util: 96%  global_step : 14948
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 949 | Mem: 26.53MB, Util: 96%  global_step : 14949
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 950 | Mem: 26.53MB, Util: 96%  global_step 

[Rank 1] Train Epoch 7:  49%|████▊     | 974/2000 [00:07<00:07, 131.85it/s]
[Rank 0] Train Epoch 7:  49%|████▉     | 981/2000 [00:07<00:07, 135.59it/s]
[Rank 2] Train Epoch 7:  49%|████▉     | 983/2000 [00:07<00:07, 129.38it/s]
[Rank 1] Train Epoch 7:  49%|████▉     | 989/2000 [00:07<00:07, 135.10it/s]
[Rank 0] Train Epoch 7:  50%|████▉     | 995/2000 [00:07<00:07, 136.65it/s]
[Rank 2] Train Epoch 7:  50%|████▉     | 997/2000 [00:07<00:07, 130.47it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 972 | Mem: 26.53MB, Util: 100%  global_step : 14972
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 973 | Mem: 26.53MB, Util: 100%  global_step : 14973
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 974 | Mem: 26.53MB, Util: 100%  global_step : 14974
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 975 | Mem: 26.53MB, Util: 100%  global_step : 14975
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 976 | Mem: 26.53MB, Util: 100%  global_step : 14976
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 977 | Mem: 26.53MB, Util: 100%  global_step : 14977
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 978 | Mem: 26.53MB, Util: 100%  global_step : 14978
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 979 | Mem: 26.53MB, Util: 100%  glob

[Rank 1] Train Epoch 7:  50%|█████     | 1003/2000 [00:07<00:07, 135.13it/s]
[Rank 0] Train Epoch 7:  50%|█████     | 1009/2000 [00:07<00:07, 124.97it/s]
[Rank 1] Train Epoch 7:  51%|█████     | 1019/2000 [00:07<00:06, 140.72it/s]
[Rank 0] Train Epoch 7:  51%|█████     | 1023/2000 [00:07<00:07, 128.25it/s]
[Rank 2] Train Epoch 7:  51%|█████     | 1011/2000 [00:07<00:07, 130.48it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1000 | Mem: 26.53MB, Util: 100%  global_step : 15000
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1001 | Mem: 26.53MB, Util: 100%  global_step : 15001
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1002 | Mem: 26.53MB, Util: 100%  global_step : 15002
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1003 | Mem: 26.53MB, Util: 100%  global_step : 15003
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1004 | Mem: 26.53MB, Util: 100%  global_step : 15004
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1005 | Mem: 26.53MB, Util: 100%  global_step : 15005
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1006 | Mem: 26.53MB, Util: 100%  global_step : 15006
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1007 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  52%|█████▏    | 1035/2000 [00:07<00:06, 143.64it/s]
[Rank 2] Train Epoch 7:  51%|█████▏    | 1025/2000 [00:07<00:07, 131.62it/s]
[Rank 0] Train Epoch 7:  52%|█████▏    | 1038/2000 [00:07<00:07, 131.84it/s]
[Rank 1] Train Epoch 7:  53%|█████▎    | 1051/2000 [00:07<00:06, 146.16it/s]
[Rank 2] Train Epoch 7:  52%|█████▏    | 1039/2000 [00:07<00:07, 132.30it/s]
[Rank 0] Train Epoch 7:  53%|█████▎    | 1052/2000 [00:08<00:07, 133.52it/s]
[Rank 2] Train Epoch 7:  53%|█████▎    | 1053/2000 [00:08<00:07, 134.20it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1024 | Mem: 26.53MB, Util: 96%  global_step : 15024
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1025 | Mem: 26.53MB, Util: 96%  global_step : 15025
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1026 | Mem: 26.53MB, Util: 96%  global_step : 15026
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1027 | Mem: 26.53MB, Util: 96%  global_step : 15027
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1028 | Mem: 26.53MB, Util: 96%  global_step : 15028
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1029 | Mem: 26.53MB, Util: 96%  global_step : 15029
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1030 | Mem: 26.53MB, Util: 96%  global_step : 15030
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1031 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 7:  53%|█████▎    | 1067/2000 [00:08<00:06, 150.06it/s]
[Rank 0] Train Epoch 7:  53%|█████▎    | 1066/2000 [00:08<00:06, 134.22it/s]
[Rank 2] Train Epoch 7:  53%|█████▎    | 1067/2000 [00:08<00:06, 133.99it/s]
[Rank 1] Train Epoch 7:  54%|█████▍    | 1083/2000 [00:08<00:06, 150.99it/s]
[Rank 0] Train Epoch 7:  54%|█████▍    | 1081/2000 [00:08<00:06, 136.08it/s]
[Rank 2] Train Epoch 7:  54%|█████▍    | 1081/2000 [00:08<00:06, 133.63it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1054 | Mem: 26.53MB, Util: 97%  global_step : 15054
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1055 | Mem: 26.53MB, Util: 97%  global_step : 15055
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1056 | Mem: 26.53MB, Util: 97%  global_step : 15056
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1057 | Mem: 26.53MB, Util: 97%  global_step : 15057
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1058 | Mem: 26.53MB, Util: 97%  global_step : 15058
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1059 | Mem: 26.53MB, Util: 97%  global_step : 15059
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1060 | Mem: 26.53MB, Util: 97%  global_step : 15060
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1061 | Mem: 26.53MB, Util: 97%  glob

[Rank 1] Train Epoch 7:  55%|█████▍    | 1099/2000 [00:08<00:05, 151.93it/s]
[Rank 0] Train Epoch 7:  55%|█████▍    | 1095/2000 [00:08<00:06, 133.97it/s]
[Rank 2] Train Epoch 7:  55%|█████▍    | 1095/2000 [00:08<00:06, 133.48it/s]
[Rank 2] Train Epoch 7:  55%|█████▌    | 1109/2000 [00:08<00:06, 133.21it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1083 | Mem: 26.53MB, Util: 100%  global_step : 15083
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1084 | Mem: 26.53MB, Util: 100%  global_step : 15084
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1085 | Mem: 26.53MB, Util: 100%  global_step : 15085
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1086 | Mem: 26.53MB, Util: 100%  global_step : 15086
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1087 | Mem: 26.53MB, Util: 100%  global_step : 15087
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1088 | Mem: 26.53MB, Util: 100%  global_step : 15088
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1089 | Mem: 26.53MB, Util: 100%  global_step : 15089
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1090 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  56%|█████▌    | 1115/2000 [00:08<00:07, 119.01it/s]
[Rank 0] Train Epoch 7:  55%|█████▌    | 1109/2000 [00:08<00:06, 128.06it/s]
[Rank 0] Train Epoch 7:  56%|█████▌    | 1123/2000 [00:08<00:06, 131.13it/s]
[Rank 2] Train Epoch 7:  56%|█████▌    | 1123/2000 [00:08<00:06, 133.46it/s]
[Rank 1] Train Epoch 7:  57%|█████▋    | 1131/2000 [00:08<00:06, 128.69it/s]
[Rank 0] Train Epoch 7:  57%|█████▋    | 1137/2000 [00:08<00:06, 133.01it/s]
[Rank 2] Train Epoch 7:  57%|█████▋    | 1137/2000 [00:08<00:06, 133.85it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1108 | Mem: 26.53MB, Util: 100%  global_step : 15108
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1109 | Mem: 26.53MB, Util: 100%  global_step : 15109
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1110 | Mem: 26.53MB, Util: 100%  global_step : 15110
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1111 | Mem: 26.53MB, Util: 100%  global_step : 15111
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1112 | Mem: 26.53MB, Util: 100%  global_step : 15112
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1113 | Mem: 26.53MB, Util: 100%  global_step : 15113
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1114 | Mem: 26.53MB, Util: 100%  global_step : 15114
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1115 | Mem: 26.53MB, Util: 10

[Rank 1] Train Epoch 7:  57%|█████▋    | 1147/2000 [00:08<00:06, 135.10it/s]
[Rank 2] Train Epoch 7:  58%|█████▊    | 1151/2000 [00:08<00:06, 134.00it/s]
[Rank 0] Train Epoch 7:  58%|█████▊    | 1152/2000 [00:08<00:06, 135.25it/s]
[Rank 1] Train Epoch 7:  58%|█████▊    | 1163/2000 [00:08<00:06, 139.30it/s]
[Rank 2] Train Epoch 7:  58%|█████▊    | 1165/2000 [00:08<00:06, 134.09it/s]
[Rank 0] Train Epoch 7:  58%|█████▊    | 1166/2000 [00:08<00:06, 135.80it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1138 | Mem: 26.53MB, Util: 85%  global_step : 15138
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1139 | Mem: 26.53MB, Util: 85%  global_step : 15139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1140 | Mem: 26.53MB, Util: 85%  global_step : 15140
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1141 | Mem: 26.53MB, Util: 85%  global_step : 15141
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1142 | Mem: 26.53MB, Util: 85%  global_step : 15142
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1143 | Mem: 26.53MB, Util: 85%  global_step : 15143
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1144 | Mem: 26.53MB, Util: 85%  global_step : 15144
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1145 | Mem: 26.53MB, Util: 85%  glob

[Rank 1] Train Epoch 7:  59%|█████▉    | 1178/2000 [00:08<00:05, 141.80it/s]
[Rank 2] Train Epoch 7:  59%|█████▉    | 1179/2000 [00:09<00:06, 133.93it/s]
[Rank 0] Train Epoch 7:  59%|█████▉    | 1181/2000 [00:08<00:05, 138.37it/s]
[Rank 1] Train Epoch 7:  60%|█████▉    | 1194/2000 [00:09<00:05, 144.65it/s]
[Rank 2] Train Epoch 7:  60%|█████▉    | 1193/2000 [00:09<00:06, 134.46it/s]
[Rank 0] Train Epoch 7:  60%|█████▉    | 1196/2000 [00:09<00:05, 140.85it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1165 | Mem: 26.53MB, Util: 66%  global_step : 15165
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1166 | Mem: 26.53MB, Util: 66%  global_step : 15166
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1167 | Mem: 26.53MB, Util: 66%  global_step : 15167
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1168 | Mem: 26.53MB, Util: 66%  global_step : 15168
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1169 | Mem: 26.53MB, Util: 66%  global_step : 15169
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1170 | Mem: 26.53MB, Util: 66%  global_step : 15170
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1171 | Mem: 26.53MB, Util: 66%  global_step : 15171
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1172 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  60%|██████    | 1207/2000 [00:09<00:05, 134.56it/s]
[Rank 1] Train Epoch 7:  60%|██████    | 1209/2000 [00:09<00:06, 117.65it/s]
[Rank 2] Train Epoch 7:  61%|██████    | 1221/2000 [00:09<00:05, 134.95it/s]
[Rank 0] Train Epoch 7:  61%|██████    | 1211/2000 [00:09<00:06, 131.41it/s]
[Rank 1] Train Epoch 7:  61%|██████▏   | 1225/2000 [00:09<00:06, 127.32it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1193 | Mem: 26.53MB, Util: 66%  global_step : 15193
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1194 | Mem: 26.53MB, Util: 66%  global_step : 15194
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1195 | Mem: 26.53MB, Util: 66%  global_step : 15195
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1196 | Mem: 26.53MB, Util: 66%  global_step : 15196
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1197 | Mem: 26.53MB, Util: 66%  global_step : 15197
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1198 | Mem: 26.53MB, Util: 66%  global_step : 15198
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1199 | Mem: 26.53MB, Util: 66%  global_step : 15199
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1200 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  62%|██████▏   | 1235/2000 [00:09<00:05, 135.02it/s]
[Rank 0] Train Epoch 7:  61%|██████▏   | 1226/2000 [00:09<00:05, 135.80it/s]
[Rank 1] Train Epoch 7:  62%|██████▏   | 1241/2000 [00:09<00:05, 134.65it/s]
[Rank 2] Train Epoch 7:  62%|██████▏   | 1249/2000 [00:09<00:05, 135.51it/s]
[Rank 0] Train Epoch 7:  62%|██████▏   | 1241/2000 [00:09<00:05, 138.83it/s]
[Rank 1] Train Epoch 7:  63%|██████▎   | 1257/2000 [00:09<00:05, 140.54it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1222 | Mem: 26.53MB, Util: 66%  global_step : 15222
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1223 | Mem: 26.53MB, Util: 66%  global_step : 15223
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1224 | Mem: 26.53MB, Util: 66%  global_step : 15224
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1225 | Mem: 26.53MB, Util: 66%  global_step : 15225
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1226 | Mem: 26.53MB, Util: 66%  global_step : 15226
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1227 | Mem: 26.53MB, Util: 66%  global_step : 15227
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1228 | Mem: 26.53MB, Util: 66%  global_step : 15228
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1229 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  63%|██████▎   | 1263/2000 [00:09<00:05, 136.52it/s]
[Rank 0] Train Epoch 7:  63%|██████▎   | 1256/2000 [00:09<00:05, 141.01it/s]
[Rank 1] Train Epoch 7:  64%|██████▎   | 1272/2000 [00:09<00:05, 142.41it/s]
[Rank 2] Train Epoch 7:  64%|██████▍   | 1277/2000 [00:09<00:05, 137.28it/s]
[Rank 0] Train Epoch 7:  64%|██████▎   | 1271/2000 [00:09<00:05, 139.45it/s]
[Rank 1] Train Epoch 7:  64%|██████▍   | 1287/2000 [00:09<00:04, 143.63it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1250 | Mem: 26.53MB, Util: 66%  global_step : 15250
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1251 | Mem: 26.53MB, Util: 66%  global_step : 15251
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1252 | Mem: 26.53MB, Util: 66%  global_step : 15252
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1253 | Mem: 26.53MB, Util: 66%  global_step : 15253
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1254 | Mem: 26.53MB, Util: 66%  global_step : 15254
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1255 | Mem: 26.53MB, Util: 66%  global_step : 15255
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1256 | Mem: 26.53MB, Util: 66%  global_step : 15256
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1257 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  65%|██████▍   | 1291/2000 [00:09<00:05, 137.22it/s]
[Rank 0] Train Epoch 7:  64%|██████▍   | 1286/2000 [00:09<00:05, 139.89it/s]
[Rank 1] Train Epoch 7:  65%|██████▌   | 1302/2000 [00:09<00:05, 120.21it/s]
[Rank 2] Train Epoch 7:  65%|██████▌   | 1305/2000 [00:09<00:05, 137.17it/s]
[Rank 0] Train Epoch 7:  65%|██████▌   | 1301/2000 [00:09<00:05, 128.48it/s]
[Rank 1] Train Epoch 7:  66%|██████▌   | 1317/2000 [00:09<00:05, 125.72it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1280 | Mem: 26.53MB, Util: 66%  global_step : 15280
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1281 | Mem: 26.53MB, Util: 66%  global_step : 15281
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1282 | Mem: 26.53MB, Util: 66%  global_step : 15282
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1283 | Mem: 26.53MB, Util: 66%  global_step : 15283
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1284 | Mem: 26.53MB, Util: 66%  global_step : 15284
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1285 | Mem: 26.53MB, Util: 66%  global_step : 15285
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1286 | Mem: 26.53MB, Util: 66%  global_step : 15286
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1287 | Mem: 26.53MB, Util: 6

[Rank 0] Train Epoch 7:  66%|██████▌   | 1316/2000 [00:09<00:05, 133.35it/s]
[Rank 2] Train Epoch 7:  66%|██████▌   | 1319/2000 [00:10<00:04, 136.53it/s]
[Rank 1] Train Epoch 7:  67%|██████▋   | 1332/2000 [00:10<00:05, 129.98it/s]
[Rank 0] Train Epoch 7:  67%|██████▋   | 1331/2000 [00:10<00:04, 137.55it/s]
[Rank 2] Train Epoch 7:  67%|██████▋   | 1333/2000 [00:10<00:04, 136.20it/s]
[Rank 1] Train Epoch 7:  67%|██████▋   | 1347/2000 [00:10<00:04, 133.93it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1309 | Mem: 26.53MB, Util: 100%  global_step : 15309
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1310 | Mem: 26.53MB, Util: 100%  global_step : 15310
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1311 | Mem: 26.53MB, Util: 100%  global_step : 15311
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1312 | Mem: 26.53MB, Util: 100%  global_step : 15312
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1313 | Mem: 26.53MB, Util: 100%  global_step : 15313
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1314 | Mem: 26.53MB, Util: 100%  global_step : 15314
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1315 | Mem: 26.53MB, Util: 100%  global_step : 15315
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1316 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 7:  67%|██████▋   | 1346/2000 [00:10<00:04, 140.71it/s]
[Rank 2] Train Epoch 7:  67%|██████▋   | 1347/2000 [00:10<00:04, 136.42it/s]
[Rank 1] Train Epoch 7:  68%|██████▊   | 1362/2000 [00:10<00:04, 137.45it/s]
[Rank 2] Train Epoch 7:  68%|██████▊   | 1361/2000 [00:10<00:04, 137.14it/s]
[Rank 0] Train Epoch 7:  68%|██████▊   | 1361/2000 [00:10<00:04, 141.91it/s]
[Rank 1] Train Epoch 7:  69%|██████▉   | 1377/2000 [00:10<00:04, 139.38it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1340 | Mem: 26.53MB, Util: 96%  global_step : 15340
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1341 | Mem: 26.53MB, Util: 96%  global_step : 15341
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1342 | Mem: 26.53MB, Util: 96%  global_step : 15342
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1343 | Mem: 26.53MB, Util: 96%  global_step : 15343
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1344 | Mem: 26.53MB, Util: 96%  global_step : 15344
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1345 | Mem: 26.53MB, Util: 96%  global_step : 15345
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1346 | Mem: 26.53MB, Util: 96%  global_step : 15346
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1347 | Mem: 26.53MB, Util: 96%  glob

[Rank 2] Train Epoch 7:  69%|██████▉   | 1375/2000 [00:10<00:04, 137.38it/s]
[Rank 0] Train Epoch 7:  69%|██████▉   | 1376/2000 [00:10<00:04, 143.34it/s]
[Rank 1] Train Epoch 7:  70%|██████▉   | 1392/2000 [00:10<00:04, 141.64it/s]
[Rank 2] Train Epoch 7:  69%|██████▉   | 1389/2000 [00:10<00:04, 137.61it/s]
[Rank 0] Train Epoch 7:  70%|██████▉   | 1391/2000 [00:10<00:04, 144.47it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1366 | Mem: 26.53MB, Util: 65%  global_step : 15366
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1367 | Mem: 26.53MB, Util: 65%  global_step : 15367
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1368 | Mem: 26.53MB, Util: 65%  global_step : 15368
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1369 | Mem: 26.53MB, Util: 65%  global_step : 15369
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1370 | Mem: 26.53MB, Util: 65%  global_step : 15370
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1371 | Mem: 26.53MB, Util: 65%  global_step : 15371
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1372 | Mem: 26.53MB, Util: 65%  global_step : 15372
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1373 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  70%|███████   | 1403/2000 [00:10<00:04, 137.95it/s]
[Rank 0] Train Epoch 7:  70%|███████   | 1406/2000 [00:10<00:04, 127.06it/s]
[Rank 1] Train Epoch 7:  70%|███████   | 1407/2000 [00:10<00:04, 129.57it/s]
[Rank 2] Train Epoch 7:  71%|███████   | 1417/2000 [00:10<00:04, 137.81it/s]
[Rank 0] Train Epoch 7:  71%|███████   | 1421/2000 [00:10<00:04, 132.38it/s]
[Rank 1] Train Epoch 7:  71%|███████   | 1422/2000 [00:10<00:04, 132.57it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1395 | Mem: 26.53MB, Util: 66%  global_step : 15395
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1396 | Mem: 26.53MB, Util: 66%  global_step : 15396
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1397 | Mem: 26.53MB, Util: 66%  global_step : 15397
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1398 | Mem: 26.53MB, Util: 66%  global_step : 15398
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1399 | Mem: 26.53MB, Util: 66%  global_step : 15399
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1400 | Mem: 26.53MB, Util: 66%  global_step : 15400
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1401 | Mem: 26.53MB, Util: 66%  global_step : 15401
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1402 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  72%|███████▏  | 1431/2000 [00:10<00:04, 137.93it/s]
[Rank 0] Train Epoch 7:  72%|███████▏  | 1436/2000 [00:10<00:04, 136.37it/s]
[Rank 1] Train Epoch 7:  72%|███████▏  | 1436/2000 [00:10<00:04, 134.11it/s]
[Rank 2] Train Epoch 7:  72%|███████▏  | 1445/2000 [00:10<00:04, 137.86it/s]
[Rank 0] Train Epoch 7:  73%|███████▎  | 1451/2000 [00:10<00:03, 139.34it/s]
[Rank 1] Train Epoch 7:  73%|███████▎  | 1451/2000 [00:10<00:04, 135.90it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1424 | Mem: 26.53MB, Util: 67%  global_step : 15424
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1425 | Mem: 26.53MB, Util: 67%  global_step : 15425
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1426 | Mem: 26.53MB, Util: 67%  global_step : 15426
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1427 | Mem: 26.53MB, Util: 67%  global_step : 15427
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1428 | Mem: 26.53MB, Util: 67%  global_step : 15428
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1429 | Mem: 26.53MB, Util: 67%  global_step : 15429
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1430 | Mem: 26.53MB, Util: 67%  global_step : 15430
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1431 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  73%|███████▎  | 1459/2000 [00:11<00:03, 138.09it/s]
[Rank 0] Train Epoch 7:  73%|███████▎  | 1466/2000 [00:11<00:03, 137.66it/s]
[Rank 1] Train Epoch 7:  73%|███████▎  | 1466/2000 [00:11<00:03, 137.51it/s]
[Rank 2] Train Epoch 7:  74%|███████▎  | 1473/2000 [00:11<00:03, 138.36it/s]
[Rank 0] Train Epoch 7:  74%|███████▍  | 1481/2000 [00:11<00:03, 140.73it/s]
[Rank 1] Train Epoch 7:  74%|███████▍  | 1481/2000 [00:11<00:03, 140.22it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1453 | Mem: 26.53MB, Util: 65%  global_step : 15453
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1454 | Mem: 26.53MB, Util: 65%  global_step : 15454
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1455 | Mem: 26.53MB, Util: 65%  global_step : 15455
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1456 | Mem: 26.53MB, Util: 65%  global_step : 15456
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1457 | Mem: 26.53MB, Util: 65%  global_step : 15457
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1458 | Mem: 26.53MB, Util: 65%  global_step : 15458
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1459 | Mem: 26.53MB, Util: 65%  global_step : 15459
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1460 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  74%|███████▍  | 1487/2000 [00:11<00:03, 137.94it/s]
[Rank 0] Train Epoch 7:  75%|███████▍  | 1497/2000 [00:11<00:03, 143.99it/s]
[Rank 1] Train Epoch 7:  75%|███████▍  | 1496/2000 [00:11<00:03, 141.90it/s]
[Rank 2] Train Epoch 7:  75%|███████▌  | 1501/2000 [00:11<00:03, 137.94it/s]
[Rank 0] Train Epoch 7:  76%|███████▌  | 1512/2000 [00:11<00:03, 131.17it/s]
[Rank 1] Train Epoch 7:  76%|███████▌  | 1511/2000 [00:11<00:03, 133.63it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1482 | Mem: 26.53MB, Util: 65%  global_step : 15482
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1483 | Mem: 26.53MB, Util: 65%  global_step : 15483
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1484 | Mem: 26.53MB, Util: 65%  global_step : 15484
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1485 | Mem: 26.53MB, Util: 65%  global_step : 15485
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1486 | Mem: 26.53MB, Util: 65%  global_step : 15486
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1487 | Mem: 26.53MB, Util: 65%  global_step : 15487
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1488 | Mem: 26.53MB, Util: 65%  global_step : 15488
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1489 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  76%|███████▌  | 1515/2000 [00:11<00:03, 138.01it/s]
[Rank 0] Train Epoch 7:  76%|███████▋  | 1528/2000 [00:11<00:03, 136.85it/s]
[Rank 1] Train Epoch 7:  76%|███████▋  | 1525/2000 [00:11<00:03, 135.29it/s]
[Rank 2] Train Epoch 7:  76%|███████▋  | 1530/2000 [00:11<00:03, 139.88it/s]
[Rank 0] Train Epoch 7:  77%|███████▋  | 1543/2000 [00:11<00:03, 140.18it/s]
[Rank 1] Train Epoch 7:  77%|███████▋  | 1540/2000 [00:11<00:03, 136.99it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1511 | Mem: 26.53MB, Util: 68%  global_step : 15511
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1512 | Mem: 26.53MB, Util: 68%  global_step : 15512
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1513 | Mem: 26.53MB, Util: 68%  global_step : 15513
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1514 | Mem: 26.53MB, Util: 68%  global_step : 15514
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1515 | Mem: 26.53MB, Util: 68%  global_step : 15515
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1516 | Mem: 26.53MB, Util: 68%  global_step : 15516
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1517 | Mem: 26.53MB, Util: 68%  global_step : 15517
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1518 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  77%|███████▋  | 1545/2000 [00:11<00:03, 140.76it/s]
[Rank 0] Train Epoch 7:  78%|███████▊  | 1559/2000 [00:11<00:03, 143.51it/s]
[Rank 1] Train Epoch 7:  78%|███████▊  | 1555/2000 [00:11<00:03, 138.34it/s]
[Rank 2] Train Epoch 7:  78%|███████▊  | 1560/2000 [00:11<00:03, 140.72it/s]
[Rank 0] Train Epoch 7:  79%|███████▉  | 1575/2000 [00:11<00:02, 145.54it/s]
[Rank 1] Train Epoch 7:  78%|███████▊  | 1570/2000 [00:11<00:03, 139.26it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1541 | Mem: 26.53MB, Util: 74%  global_step : 15541
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1542 | Mem: 26.53MB, Util: 74%  global_step : 15542
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1543 | Mem: 26.53MB, Util: 74%  global_step : 15543
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1544 | Mem: 26.53MB, Util: 74%  global_step : 15544
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1545 | Mem: 26.53MB, Util: 74%  global_step : 15545
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1546 | Mem: 26.53MB, Util: 74%  global_step : 15546
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1547 | Mem: 26.53MB, Util: 74%  global_step : 15547
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1548 | Mem: 26.53MB, Util: 7

[Rank 2] Train Epoch 7:  79%|███████▉  | 1576/2000 [00:11<00:02, 143.73it/s]
[Rank 0] Train Epoch 7:  80%|███████▉  | 1590/2000 [00:11<00:02, 146.79it/s]
[Rank 1] Train Epoch 7:  79%|███████▉  | 1585/2000 [00:11<00:02, 140.60it/s]
[Rank 2] Train Epoch 7:  80%|███████▉  | 1591/2000 [00:12<00:02, 144.45it/s]
[Rank 1] Train Epoch 7:  80%|████████  | 1600/2000 [00:12<00:02, 141.33it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1572 | Mem: 26.53MB, Util: 86%  global_step : 15572
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1573 | Mem: 26.53MB, Util: 86%  global_step : 15573
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1574 | Mem: 26.53MB, Util: 86%  global_step : 15574
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1575 | Mem: 26.53MB, Util: 86%  global_step : 15575
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1576 | Mem: 26.53MB, Util: 86%  global_step : 15576
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1577 | Mem: 26.53MB, Util: 86%  global_step : 15577
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1578 | Mem: 26.53MB, Util: 86%  global_step : 15578
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1579 | Mem: 26.53MB, Util: 8

[Rank 2] Train Epoch 7:  80%|████████  | 1606/2000 [00:12<00:02, 140.99it/s]
[Rank 0] Train Epoch 7:  80%|████████  | 1605/2000 [00:12<00:02, 132.05it/s]
[Rank 1] Train Epoch 7:  81%|████████  | 1615/2000 [00:12<00:02, 142.64it/s]
[Rank 2] Train Epoch 7:  81%|████████  | 1621/2000 [00:12<00:02, 142.58it/s]
[Rank 0] Train Epoch 7:  81%|████████  | 1620/2000 [00:12<00:02, 135.19it/s]
[Rank 1] Train Epoch 7:  82%|████████▏ | 1630/2000 [00:12<00:02, 143.14it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1601 | Mem: 26.53MB, Util: 100%  global_step : 15601
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1602 | Mem: 26.53MB, Util: 100%  global_step : 15602
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1603 | Mem: 26.53MB, Util: 100%  global_step : 15603
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1604 | Mem: 26.53MB, Util: 100%  global_step : 15604
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1605 | Mem: 26.53MB, Util: 100%  global_step : 15605
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1606 | Mem: 26.53MB, Util: 100%  global_step : 15606
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1607 | Mem: 26.53MB, Util: 100%  global_step : 15607
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1608 | Mem: 26.53MB, 

[Rank 2] Train Epoch 7:  82%|████████▏ | 1636/2000 [00:12<00:02, 143.45it/s]
[Rank 0] Train Epoch 7:  82%|████████▏ | 1635/2000 [00:12<00:02, 138.52it/s]
[Rank 1] Train Epoch 7:  82%|████████▏ | 1645/2000 [00:12<00:02, 144.54it/s]
[Rank 2] Train Epoch 7:  83%|████████▎ | 1651/2000 [00:12<00:02, 144.08it/s]
[Rank 0] Train Epoch 7:  82%|████████▎ | 1650/2000 [00:12<00:02, 141.19it/s]
[Rank 1] Train Epoch 7:  83%|████████▎ | 1660/2000 [00:12<00:02, 144.71it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1631 | Mem: 26.53MB, Util: 97%  global_step : 15631
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1632 | Mem: 26.53MB, Util: 97%  global_step : 15632
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1633 | Mem: 26.53MB, Util: 97%  global_step : 15633
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1634 | Mem: 26.53MB, Util: 97%  global_step : 15634
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1635 | Mem: 26.53MB, Util: 97%  global_step : 15635
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1636 | Mem: 26.53MB, Util: 97%  global_step : 15636
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1637 | Mem: 26.53MB, Util: 97%  global_step : 15637
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1638 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7:  83%|████████▎ | 1666/2000 [00:12<00:02, 145.15it/s]
[Rank 0] Train Epoch 7:  83%|████████▎ | 1666/2000 [00:12<00:02, 144.23it/s]
[Rank 1] Train Epoch 7:  84%|████████▍ | 1675/2000 [00:12<00:02, 142.80it/s]
[Rank 2] Train Epoch 7:  84%|████████▍ | 1681/2000 [00:12<00:02, 145.37it/s]
[Rank 0] Train Epoch 7:  84%|████████▍ | 1681/2000 [00:12<00:02, 144.73it/s]
[Rank 1] Train Epoch 7:  84%|████████▍ | 1690/2000 [00:12<00:02, 143.40it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1662 | Mem: 26.53MB, Util: 95%  global_step : 15662
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1663 | Mem: 26.53MB, Util: 95%  global_step : 15663
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1664 | Mem: 26.53MB, Util: 95%  global_step : 15664
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1665 | Mem: 26.53MB, Util: 95%  global_step : 15665
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1666 | Mem: 26.53MB, Util: 95%  global_step : 15666
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1667 | Mem: 26.53MB, Util: 95%  global_step : 15667
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1668 | Mem: 26.53MB, Util: 95%  global_step : 15668
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1669 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7:  85%|████████▍ | 1696/2000 [00:12<00:02, 146.09it/s]
[Rank 0] Train Epoch 7:  85%|████████▍ | 1697/2000 [00:12<00:02, 146.55it/s]
[Rank 1] Train Epoch 7:  85%|████████▌ | 1705/2000 [00:12<00:02, 143.29it/s]
[Rank 2] Train Epoch 7:  86%|████████▌ | 1711/2000 [00:12<00:02, 140.64it/s]
[Rank 0] Train Epoch 7:  86%|████████▌ | 1712/2000 [00:12<00:02, 140.16it/s]
[Rank 1] Train Epoch 7:  86%|████████▌ | 1720/2000 [00:12<00:01, 143.42it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1693 | Mem: 26.53MB, Util: 98%  global_step : 15693
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1694 | Mem: 26.53MB, Util: 98%  global_step : 15694
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1695 | Mem: 26.53MB, Util: 98%  global_step : 15695
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1696 | Mem: 26.53MB, Util: 98%  global_step : 15696
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1697 | Mem: 26.53MB, Util: 98%  global_step : 15697
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1698 | Mem: 26.53MB, Util: 98%  global_step : 15698
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1699 | Mem: 26.53MB, Util: 98%  global_step : 15699
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1700 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7:  86%|████████▋ | 1726/2000 [00:12<00:01, 142.55it/s]
[Rank 0] Train Epoch 7:  86%|████████▋ | 1728/2000 [00:12<00:01, 143.30it/s]
[Rank 1] Train Epoch 7:  87%|████████▋ | 1735/2000 [00:12<00:01, 142.50it/s]
[Rank 2] Train Epoch 7:  87%|████████▋ | 1741/2000 [00:13<00:01, 144.09it/s]
[Rank 0] Train Epoch 7:  87%|████████▋ | 1743/2000 [00:12<00:01, 144.72it/s]
[Rank 1] Train Epoch 7:  88%|████████▊ | 1750/2000 [00:13<00:01, 143.14it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1722 | Mem: 26.53MB, Util: 90%  global_step : 15722
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1723 | Mem: 26.53MB, Util: 90%  global_step : 15723
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1724 | Mem: 26.53MB, Util: 90%  global_step : 15724
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1725 | Mem: 26.53MB, Util: 90%  global_step : 15725
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1726 | Mem: 26.53MB, Util: 90%  global_step : 15726
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1727 | Mem: 26.53MB, Util: 90%  global_step : 15727
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1728 | Mem: 26.53MB, Util: 90%  global_step : 15728
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1729 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7:  88%|████████▊ | 1756/2000 [00:13<00:01, 144.38it/s]
[Rank 0] Train Epoch 7:  88%|████████▊ | 1758/2000 [00:13<00:01, 146.03it/s]
[Rank 1] Train Epoch 7:  88%|████████▊ | 1765/2000 [00:13<00:01, 142.96it/s]
[Rank 2] Train Epoch 7:  89%|████████▊ | 1771/2000 [00:13<00:01, 144.59it/s]
[Rank 0] Train Epoch 7:  89%|████████▊ | 1773/2000 [00:13<00:01, 143.82it/s]
[Rank 1] Train Epoch 7:  89%|████████▉ | 1780/2000 [00:13<00:01, 143.16it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1753 | Mem: 26.53MB, Util: 92%  global_step : 15753
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1754 | Mem: 26.53MB, Util: 92%  global_step : 15754
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1755 | Mem: 26.53MB, Util: 92%  global_step : 15755
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1756 | Mem: 26.53MB, Util: 92%  global_step : 15756
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1757 | Mem: 26.53MB, Util: 92%  global_step : 15757
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1758 | Mem: 26.53MB, Util: 92%  global_step : 15758
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1759 | Mem: 26.53MB, Util: 92%  global_step : 15759
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1760 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7:  89%|████████▉ | 1786/2000 [00:13<00:01, 145.27it/s]
[Rank 0] Train Epoch 7:  89%|████████▉ | 1788/2000 [00:13<00:01, 143.61it/s]
[Rank 1] Train Epoch 7:  90%|████████▉ | 1795/2000 [00:13<00:01, 142.70it/s]
[Rank 2] Train Epoch 7:  90%|█████████ | 1801/2000 [00:13<00:01, 139.51it/s]
[Rank 0] Train Epoch 7:  90%|█████████ | 1803/2000 [00:13<00:01, 137.03it/s]
[Rank 1] Train Epoch 7:  90%|█████████ | 1810/2000 [00:13<00:01, 143.11it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1784 | Mem: 26.53MB, Util: 100%  global_step : 15784
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1785 | Mem: 26.53MB, Util: 100%  global_step : 15785
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1786 | Mem: 26.53MB, Util: 100%  global_step : 15786
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1787 | Mem: 26.53MB, Util: 100%  global_step : 15787
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1788 | Mem: 26.53MB, Util: 100%  global_step : 15788
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1789 | Mem: 26.53MB, Util: 100%  global_step : 15789
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1790 | Mem: 26.53MB, Util: 100%  global_step : 15790
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1791 | Mem: 26.53MB, 

[Rank 2] Train Epoch 7:  91%|█████████ | 1816/2000 [00:13<00:01, 141.48it/s]
[Rank 0] Train Epoch 7:  91%|█████████ | 1817/2000 [00:13<00:01, 136.80it/s]
[Rank 1] Train Epoch 7:  91%|█████████▏| 1825/2000 [00:13<00:01, 142.99it/s]
[Rank 2] Train Epoch 7:  92%|█████████▏| 1831/2000 [00:13<00:01, 138.92it/s]
[Rank 0] Train Epoch 7:  92%|█████████▏| 1832/2000 [00:13<00:01, 139.80it/s]
[Rank 1] Train Epoch 7:  92%|█████████▏| 1840/2000 [00:13<00:01, 141.61it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1812 | Mem: 26.53MB, Util: 100%  global_step : 15812
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1813 | Mem: 26.53MB, Util: 100%  global_step : 15813
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1814 | Mem: 26.53MB, Util: 100%  global_step : 15814
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1815 | Mem: 26.53MB, Util: 100%  global_step : 15815
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1816 | Mem: 26.53MB, Util: 100%  global_step : 15816
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1817 | Mem: 26.53MB, Util: 100%  global_step : 15817
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1818 | Mem: 26.53MB, Util: 100%  global_step : 15818
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1819 | Mem: 26.53MB, 

[Rank 2] Train Epoch 7:  92%|█████████▏| 1845/2000 [00:13<00:01, 139.01it/s]
[Rank 0] Train Epoch 7:  92%|█████████▏| 1847/2000 [00:13<00:01, 142.24it/s]
[Rank 1] Train Epoch 7:  93%|█████████▎| 1855/2000 [00:13<00:01, 140.70it/s]
[Rank 2] Train Epoch 7:  93%|█████████▎| 1860/2000 [00:13<00:00, 141.11it/s]
[Rank 0] Train Epoch 7:  93%|█████████▎| 1862/2000 [00:13<00:00, 142.87it/s]
[Rank 1] Train Epoch 7:  94%|█████████▎| 1870/2000 [00:13<00:00, 141.17it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1841 | Mem: 26.53MB, Util: 97%  global_step : 15841
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1842 | Mem: 26.53MB, Util: 97%  global_step : 15842
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1843 | Mem: 26.53MB, Util: 97%  global_step : 15843
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1844 | Mem: 26.53MB, Util: 97%  global_step : 15844
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1845 | Mem: 26.53MB, Util: 97%  global_step : 15845
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1846 | Mem: 26.53MB, Util: 97%  global_step : 15846
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1847 | Mem: 26.53MB, Util: 97%  global_step : 15847
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1848 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7:  94%|█████████▍| 1875/2000 [00:13<00:00, 143.09it/s]
[Rank 0] Train Epoch 7:  94%|█████████▍| 1877/2000 [00:13<00:00, 144.32it/s]
[Rank 1] Train Epoch 7:  94%|█████████▍| 1885/2000 [00:14<00:00, 138.96it/s]
[Rank 2] Train Epoch 7:  94%|█████████▍| 1890/2000 [00:14<00:00, 143.96it/s]
[Rank 0] Train Epoch 7:  95%|█████████▍| 1892/2000 [00:14<00:00, 144.85it/s]
[Rank 1] Train Epoch 7:  95%|█████████▌| 1900/2000 [00:14<00:00, 139.96it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1871 | Mem: 26.53MB, Util: 69%  global_step : 15871
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1872 | Mem: 26.53MB, Util: 69%  global_step : 15872
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1873 | Mem: 26.53MB, Util: 69%  global_step : 15873
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1874 | Mem: 26.53MB, Util: 69%  global_step : 15874
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1875 | Mem: 26.53MB, Util: 69%  global_step : 15875
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1876 | Mem: 26.53MB, Util: 69%  global_step : 15876
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1877 | Mem: 26.53MB, Util: 69%  global_step : 15877
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1878 | Mem: 26.53MB, Util: 6

[Rank 2] Train Epoch 7:  95%|█████████▌| 1905/2000 [00:14<00:00, 138.00it/s]
[Rank 0] Train Epoch 7:  95%|█████████▌| 1907/2000 [00:14<00:00, 136.64it/s]
[Rank 1] Train Epoch 7:  96%|█████████▌| 1915/2000 [00:14<00:00, 140.98it/s]
[Rank 0] Train Epoch 7:  96%|█████████▌| 1922/2000 [00:14<00:00, 138.68it/s]
[Rank 2] Train Epoch 7:  96%|█████████▌| 1920/2000 [00:14<00:00, 140.74it/s]
[Rank 1] Train Epoch 7:  96%|█████████▋| 1930/2000 [00:14<00:00, 141.91it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1900 | Mem: 26.53MB, Util: 100%  global_step : 15900
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1901 | Mem: 26.53MB, Util: 100%  global_step : 15901
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1902 | Mem: 26.53MB, Util: 100%  global_step : 15902
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1903 | Mem: 26.53MB, Util: 100%  global_step : 15903
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1904 | Mem: 26.53MB, Util: 100%  global_step : 15904
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1905 | Mem: 26.53MB, Util: 100%  global_step : 15905
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1906 | Mem: 26.53MB, Util: 100%  global_step : 15906
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1907 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 7:  97%|█████████▋| 1937/2000 [00:14<00:00, 140.86it/s]
[Rank 2] Train Epoch 7:  97%|█████████▋| 1935/2000 [00:14<00:00, 142.39it/s]
[Rank 1] Train Epoch 7:  97%|█████████▋| 1945/2000 [00:14<00:00, 142.22it/s]
[Rank 0] Train Epoch 7:  98%|█████████▊| 1952/2000 [00:14<00:00, 141.45it/s]
[Rank 2] Train Epoch 7:  98%|█████████▊| 1950/2000 [00:14<00:00, 143.64it/s]
[Rank 1] Train Epoch 7:  98%|█████████▊| 1960/2000 [00:14<00:00, 142.73it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1930 | Mem: 26.53MB, Util: 100%  global_step : 15930
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1931 | Mem: 26.53MB, Util: 100%  global_step : 15931
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1932 | Mem: 26.53MB, Util: 100%  global_step : 15932
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1933 | Mem: 26.53MB, Util: 100%  global_step : 15933
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1934 | Mem: 26.53MB, Util: 100%  global_step : 15934
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1935 | Mem: 26.53MB, Util: 100%  global_step : 15935
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1936 | Mem: 26.53MB, Util: 100%  global_step : 15936
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 7, Batch 1937 | Mem: 26.53MB, Util: 10

[Rank 2] Train Epoch 7:  98%|█████████▊| 1965/2000 [00:14<00:00, 144.30it/s]
[Rank 0] Train Epoch 7:  98%|█████████▊| 1967/2000 [00:14<00:00, 143.47it/s]
[Rank 1] Train Epoch 7:  99%|█████████▉| 1975/2000 [00:14<00:00, 143.65it/s]
[Rank 2] Train Epoch 7:  99%|█████████▉| 1980/2000 [00:14<00:00, 144.89it/s]
[Rank 0] Train Epoch 7:  99%|█████████▉| 1982/2000 [00:14<00:00, 144.52it/s]
[Rank 1] Train Epoch 7: 100%|█████████▉| 1990/2000 [00:14<00:00, 143.90it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1961 | Mem: 26.53MB, Util: 96%  global_step : 15961
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1962 | Mem: 26.53MB, Util: 96%  global_step : 15962
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1963 | Mem: 26.53MB, Util: 96%  global_step : 15963
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1964 | Mem: 26.53MB, Util: 96%  global_step : 15964
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1965 | Mem: 26.53MB, Util: 96%  global_step : 15965
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1966 | Mem: 26.53MB, Util: 96%  global_step : 15966
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1967 | Mem: 26.53MB, Util: 96%  global_step : 15967
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1968 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 7: 100%|██████████| 2000/2000 [00:14<00:00, 134.54it/s]
[Rank 2] Test Epoch 7:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Train Epoch 7: 100%|██████████| 2000/2000 [00:14<00:00, 135.07it/s]
[Rank 0] Test Epoch 7:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Train Epoch 7: 100%|██████████| 2000/2000 [00:14<00:00, 134.95it/s]
[Rank 1] Test Epoch 7:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Test Epoch 7:  10%|▉         | 33/334 [00:00<00:00, 323.14it/s]
[Rank 0] Test Epoch 7:  10%|▉         | 33/334 [00:00<00:00, 324.13it/s]
[Rank 1] Test Epoch 7:  10%|█         | 34/334 [00:00<00:00, 333.64it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1992 | Mem: 26.53MB, Util: 100%  global_step : 15992
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1993 | Mem: 26.53MB, Util: 100%  global_step : 15993
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1994 | Mem: 26.53MB, Util: 100%  global_step : 15994
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1995 | Mem: 26.53MB, Util: 100%  global_step : 15995
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1996 | Mem: 26.53MB, Util: 100%  global_step : 15996
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1997 | Mem: 26.53MB, Util: 100%  global_step : 15997
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1998 | Mem: 26.53MB, Util: 100%  global_step : 15998
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 7, Batch 1999 | Mem: 26.50MB, 

[Rank 2] Test Epoch 7:  21%|██        | 69/334 [00:00<00:00, 343.90it/s]
[Rank 0] Test Epoch 7:  20%|██        | 68/334 [00:00<00:00, 337.64it/s]
[Rank 1] Test Epoch 7:  21%|██        | 69/334 [00:00<00:00, 341.11it/s]
[Rank 2] Test Epoch 7:  32%|███▏      | 106/334 [00:00<00:00, 352.50it/s]
[Rank 0] Test Epoch 7:  31%|███       | 103/334 [00:00<00:00, 339.59it/s]
[Rank 1] Test Epoch 7:  31%|███       | 104/334 [00:00<00:00, 344.06it/s]
[Rank 1] Test Epoch 7:  42%|████▏     | 139/334 [00:00<00:00, 344.34it/s]
[Rank 2] Test Epoch 7:  43%|████▎     | 142/334 [00:00<00:00, 354.23it/s]
[Rank 0] Test Epoch 7:  41%|████▏     | 138/334 [00:00<00:00, 341.50it/s]
[Rank 1] Test Epoch 7:  52%|█████▏    | 175/334 [00:00<00:00, 347.22it/s]
[Rank 2] Test Epoch 7:  53%|█████▎    | 178/334 [00:00<00:00, 355.56it/s]
[Rank 0] Test Epoch 7:  52%|█████▏    | 173/334 [00:00<00:00, 342.15it/s]
[Rank 1] Test Epoch 7:  63%|██████▎   | 210/334 [00:00<00:00, 346.94it/s]
[Rank 2] Test Epoch 7:  64%|██████▍   | 2

[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Rank 2] Epoch 7 | Loss: 0.3199, Acc: 0.8833, Model Checksum: 5d5d8647f0bfbe7bf1da6f4ba766a0a7
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [ NodeId 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 Rank 2] Epoch 7 | Loss: 0.3199, Acc: 0.8833, Model Checksum: 5d5d8647f0bfbe7bf1da6f4ba766a0a7
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 16000
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1 | Mem: 26.53MB, Util: 3%  global_step : 16001
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 2 | Mem: 26.53MB, Util: 3%  global_step : 16002
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 3 | Mem: 26.53MB, Util: 3%  global_step : 16003
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Rank 0] Epoch 7 | Loss: 0.3465, Acc: 0.8758, Model Checksum: 5d5d8647f0bfbe7bf1da6f4ba766a0a7
[

[Rank 2] Train Epoch 8:   1%|          | 14/2000 [00:00<00:14, 137.15it/s]
[Rank 0] Train Epoch 8:   1%|          | 14/2000 [00:00<00:14, 134.01it/s]
[Rank 1] Train Epoch 8:   1%|          | 14/2000 [00:00<00:14, 133.13it/s]
[Rank 2] Train Epoch 8:   1%|▏         | 29/2000 [00:00<00:13, 141.41it/s]
[Rank 0] Train Epoch 8:   1%|▏         | 29/2000 [00:00<00:14, 140.34it/s]
[Rank 1] Train Epoch 8:   1%|▏         | 29/2000 [00:00<00:14, 138.04it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 19 | Mem: 26.53MB, Util: 3%  global_step : 16019
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 20 | Mem: 26.53MB, Util: 18%  global_step : 16020
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 21 | Mem: 26.53MB, Util: 18%  global_step : 16021
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 22 | Mem: 26.53MB, Util: 18%  global_step : 16022
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 23 | Mem: 26.53MB, Util: 18%  global_step : 16023
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 24 | Mem: 26.53MB, Util: 18%  global_step : 16024
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 25 | Mem: 26.53MB, Util: 18%  global_step : 16025
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 26 | Mem: 26.53MB, Util: 18%  global_step :

[Rank 1] Train Epoch 8:   2%|▏         | 44/2000 [00:00<00:13, 141.48it/s]
[Rank 1] Train Epoch 8:   3%|▎         | 59/2000 [00:00<00:13, 144.26it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 52 | Mem: 26.53MB, Util: 52%  global_step : 16052
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 53 | Mem: 26.53MB, Util: 52%  global_step : 16053
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 54 | Mem: 26.53MB, Util: 52%  global_step : 16054
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 55 | Mem: 26.53MB, Util: 52%  global_step : 16055
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 56 | Mem: 26.53MB, Util: 79%  global_step : 16056
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 57 | Mem: 26.53MB, Util: 79%  global_step : 16057
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 58 | Mem: 26.53MB, Util: 79%  global_step : 16058
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 59 | Mem: 26.53MB, Util: 79%  glob

[Rank 2] Train Epoch 8:   2%|▏         | 44/2000 [00:00<00:40, 48.27it/s] 
[Rank 2] Train Epoch 8:   3%|▎         | 60/2000 [00:00<00:28, 67.77it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 60 | Mem: 26.53MB, Util: 100%  global_step : 16060
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 61 | Mem: 26.53MB, Util: 100%  global_step : 16061
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 37 | Mem: 26.53MB, Util: 61%  global_step : 16037
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 62 | Mem: 26.53MB, Util: 100%  global_step : 16062
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 63 | Mem: 26.53MB, Util: 100%  global_step : 16063
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 64 | Mem: 26.53MB, Util: 100%  global_step : 16064
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 62 | Mem: 26.53MB, Util: 100%  global_step : 16062
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 63 | Mem: 26.53MB, Util: 100%  glo

[Rank 2] Train Epoch 8:   4%|▎         | 72/2000 [00:02<01:49, 17.58it/s]
[Rank 0] Train Epoch 8:   2%|▏         | 44/2000 [00:02<02:32, 12.80it/s] 
[Rank 1] Train Epoch 8:   4%|▎         | 74/2000 [00:02<01:53, 17.03it/s] 
[Rank 2] Train Epoch 8:   4%|▍         | 86/2000 [00:02<01:16, 24.91it/s]
[Rank 0] Train Epoch 8:   3%|▎         | 58/2000 [00:02<01:39, 19.56it/s]
[Rank 1] Train Epoch 8:   4%|▍         | 88/2000 [00:02<01:20, 23.83it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 51 | Mem: 26.53MB, Util: 0%  global_step : 16051
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 52 | Mem: 26.53MB, Util: 0%  global_step : 16052
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 53 | Mem: 26.53MB, Util: 0%  global_step : 16053
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 54 | Mem: 26.53MB, Util: 0%  global_step : 16054
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 55 | Mem: 26.53MB, Util: 0%  global_step : 16055
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 56 | Mem: 26.53MB, Util: 0%  global_step : 16056
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 57 | Mem: 26.53MB, Util: 0%  global_step : 16057
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 58 | Mem: 26.53MB, Util: 0%  global_step : 16058
[36m(Ra

[Rank 2] Train Epoch 8:   5%|▌         | 100/2000 [00:02<00:56, 33.88it/s]
[Rank 0] Train Epoch 8:   4%|▎         | 72/2000 [00:02<01:08, 28.04it/s]
[Rank 0] Train Epoch 8:   4%|▍         | 87/2000 [00:02<00:49, 39.01it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 80 | Mem: 26.53MB, Util: 61%  global_step : 16080
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 81 | Mem: 26.53MB, Util: 61%  global_step : 16081
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 82 | Mem: 26.53MB, Util: 61%  global_step : 16082
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 83 | Mem: 26.53MB, Util: 61%  global_step : 16083
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 84 | Mem: 26.53MB, Util: 61%  global_step : 16084
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 85 | Mem: 26.53MB, Util: 61%  global_step : 16085
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 86 | Mem: 26.53MB, Util: 61%  global_step : 16086
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 87 | Mem: 26.53MB, Util: 61%  global_step : 16087


[Rank 0] Train Epoch 8:   5%|▌         | 100/2000 [00:03<00:38, 49.49it/s]
[Rank 1] Train Epoch 8:   5%|▌         | 101/2000 [00:03<01:07, 28.21it/s]
[Rank 2] Train Epoch 8:   6%|▌         | 111/2000 [00:03<00:51, 36.42it/s]
[Rank 0] Train Epoch 8:   6%|▌         | 115/2000 [00:03<00:29, 63.19it/s]
[Rank 1] Train Epoch 8:   6%|▌         | 116/2000 [00:03<00:48, 38.58it/s]
[Rank 2] Train Epoch 8:   6%|▋         | 127/2000 [00:03<00:37, 50.07it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 109 | Mem: 26.53MB, Util: 67%  global_step : 16109
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 110 | Mem: 26.53MB, Util: 67%  global_step : 16110
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 111 | Mem: 26.53MB, Util: 67%  global_step : 16111
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 112 | Mem: 26.53MB, Util: 67%  global_step : 16112
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 113 | Mem: 26.53MB, Util: 67%  global_step : 16113
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 114 | Mem: 26.53MB, Util: 67%  global_step : 16114
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 115 | Mem: 26.53MB, Util: 67%  global_step : 16115
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 116 | Mem: 26.53MB, Util: 67%  global_step 

[Rank 0] Train Epoch 8:   6%|▋         | 130/2000 [00:03<00:24, 76.94it/s]
[Rank 1] Train Epoch 8:   7%|▋         | 131/2000 [00:03<00:36, 50.69it/s]
[Rank 2] Train Epoch 8:   7%|▋         | 143/2000 [00:03<00:28, 64.79it/s]
[Rank 0] Train Epoch 8:   7%|▋         | 145/2000 [00:03<00:20, 90.29it/s]
[Rank 1] Train Epoch 8:   7%|▋         | 147/2000 [00:03<00:28, 64.86it/s]
[Rank 2] Train Epoch 8:   8%|▊         | 159/2000 [00:03<00:23, 79.83it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 139 | Mem: 26.53MB, Util: 67%  global_step : 16139
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 140 | Mem: 26.53MB, Util: 67%  global_step : 16140
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 141 | Mem: 26.53MB, Util: 67%  global_step : 16141
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 142 | Mem: 26.53MB, Util: 67%  global_step : 16142
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 143 | Mem: 26.53MB, Util: 67%  global_step : 16143
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 144 | Mem: 26.53MB, Util: 67%  global_step : 16144
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 145 | Mem: 26.53MB, Util: 67%  global_step : 16145
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 146 | Mem: 26.53MB, Util: 67%  global_step 

[Rank 0] Train Epoch 8:   8%|▊         | 160/2000 [00:03<00:17, 102.44it/s]
[Rank 1] Train Epoch 8:   8%|▊         | 163/2000 [00:03<00:23, 79.32it/s]
[Rank 2] Train Epoch 8:   9%|▉         | 175/2000 [00:03<00:19, 93.98it/s]
[Rank 0] Train Epoch 8:   9%|▉         | 175/2000 [00:03<00:16, 111.73it/s]
[Rank 1] Train Epoch 8:   9%|▉         | 179/2000 [00:03<00:19, 93.23it/s]
[Rank 2] Train Epoch 8:  10%|▉         | 191/2000 [00:03<00:16, 106.64it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 169 | Mem: 26.53MB, Util: 68%  global_step : 16169
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 170 | Mem: 26.53MB, Util: 68%  global_step : 16170
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 171 | Mem: 26.53MB, Util: 68%  global_step : 16171
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 172 | Mem: 26.53MB, Util: 68%  global_step : 16172
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 173 | Mem: 26.53MB, Util: 68%  global_step : 16173
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 174 | Mem: 26.53MB, Util: 68%  global_step : 16174
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 175 | Mem: 26.53MB, Util: 68%  global_step : 16175
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 176 | Mem: 26.53MB, Util: 68%  global_step 

[Rank 0] Train Epoch 8:  10%|▉         | 190/2000 [00:03<00:15, 119.48it/s]
[Rank 1] Train Epoch 8:  10%|▉         | 195/2000 [00:03<00:17, 105.54it/s]
[Rank 2] Train Epoch 8:  10%|█         | 206/2000 [00:03<00:17, 104.80it/s]
[Rank 0] Train Epoch 8:  10%|█         | 205/2000 [00:03<00:14, 122.62it/s]
[Rank 1] Train Epoch 8:  10%|█         | 210/2000 [00:03<00:16, 105.82it/s]
[Rank 2] Train Epoch 8:  11%|█         | 222/2000 [00:03<00:15, 116.48it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 198 | Mem: 26.53MB, Util: 69%  global_step : 16198
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 199 | Mem: 26.53MB, Util: 69%  global_step : 16199
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 200 | Mem: 26.53MB, Util: 69%  global_step : 16200
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 201 | Mem: 26.53MB, Util: 69%  global_step : 16201
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 202 | Mem: 26.53MB, Util: 69%  global_step : 16202
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 203 | Mem: 26.53MB, Util: 69%  global_step : 16203
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 204 | Mem: 26.53MB, Util: 69%  global_step : 16204
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 205 | Mem: 26.53MB, Util: 69%  global_step 

[Rank 0] Train Epoch 8:  11%|█         | 219/2000 [00:03<00:14, 126.75it/s]
[Rank 1] Train Epoch 8:  11%|█▏        | 226/2000 [00:03<00:15, 117.07it/s]
[Rank 2] Train Epoch 8:  12%|█▏        | 238/2000 [00:04<00:14, 125.65it/s]
[Rank 0] Train Epoch 8:  12%|█▏        | 233/2000 [00:03<00:13, 130.02it/s]
[Rank 1] Train Epoch 8:  12%|█▏        | 242/2000 [00:04<00:13, 126.46it/s]
[Rank 2] Train Epoch 8:  13%|█▎        | 254/2000 [00:04<00:13, 133.33it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 227 | Mem: 26.53MB, Util: 66%  global_step : 16227
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 228 | Mem: 26.53MB, Util: 66%  global_step : 16228
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 229 | Mem: 26.53MB, Util: 66%  global_step : 16229
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 230 | Mem: 26.53MB, Util: 64%  global_step : 16230
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 231 | Mem: 26.53MB, Util: 64%  global_step : 16231
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 232 | Mem: 26.53MB, Util: 64%  global_step : 16232
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 233 | Mem: 26.53MB, Util: 64%  global_step : 16233
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 234 | Mem: 26.53MB, Util: 64%  global_step 

[Rank 0] Train Epoch 8:  12%|█▏        | 247/2000 [00:04<00:13, 132.18it/s]
[Rank 1] Train Epoch 8:  13%|█▎        | 257/2000 [00:04<00:13, 132.33it/s]
[Rank 2] Train Epoch 8:  14%|█▎        | 270/2000 [00:04<00:12, 138.83it/s]
[Rank 0] Train Epoch 8:  13%|█▎        | 261/2000 [00:04<00:13, 133.64it/s]
[Rank 1] Train Epoch 8:  14%|█▎        | 273/2000 [00:04<00:12, 137.87it/s]
[Rank 2] Train Epoch 8:  14%|█▍        | 286/2000 [00:04<00:11, 143.41it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 256 | Mem: 26.53MB, Util: 64%  global_step : 16256
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 257 | Mem: 26.53MB, Util: 64%  global_step : 16257
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 258 | Mem: 26.53MB, Util: 66%  global_step : 16258
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 259 | Mem: 26.53MB, Util: 66%  global_step : 16259
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 260 | Mem: 26.53MB, Util: 66%  global_step : 16260
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 261 | Mem: 26.53MB, Util: 66%  global_step : 16261
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 262 | Mem: 26.53MB, Util: 66%  global_step : 16262
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 263 | Mem: 26.53MB, Util: 66%  global_step 

[Rank 0] Train Epoch 8:  14%|█▍        | 275/2000 [00:04<00:12, 135.01it/s]
[Rank 1] Train Epoch 8:  14%|█▍        | 288/2000 [00:04<00:12, 141.00it/s]
[Rank 0] Train Epoch 8:  14%|█▍        | 290/2000 [00:04<00:12, 137.87it/s]
[Rank 1] Train Epoch 8:  15%|█▌        | 303/2000 [00:04<00:13, 123.79it/s]
[Rank 2] Train Epoch 8:  15%|█▌        | 302/2000 [00:04<00:13, 122.96it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 286 | Mem: 26.53MB, Util: 66%  global_step : 16286
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 287 | Mem: 26.53MB, Util: 68%  global_step : 16287
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 288 | Mem: 26.53MB, Util: 68%  global_step : 16288
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 289 | Mem: 26.53MB, Util: 68%  global_step : 16289
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 290 | Mem: 26.53MB, Util: 68%  global_step : 16290
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 291 | Mem: 26.53MB, Util: 68%  global_step : 16291
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 292 | Mem: 26.53MB, Util: 68%  global_step : 16292
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 293 | Mem: 26.53MB, Util: 68%  global_step 

[Rank 0] Train Epoch 8:  15%|█▌        | 305/2000 [00:04<00:12, 140.24it/s]
[Rank 1] Train Epoch 8:  16%|█▌        | 319/2000 [00:04<00:12, 131.70it/s]
[Rank 2] Train Epoch 8:  16%|█▌        | 317/2000 [00:04<00:13, 129.43it/s]
[Rank 0] Train Epoch 8:  16%|█▌        | 320/2000 [00:04<00:11, 141.96it/s]
[Rank 1] Train Epoch 8:  17%|█▋        | 335/2000 [00:04<00:12, 137.73it/s]
[Rank 2] Train Epoch 8:  17%|█▋        | 332/2000 [00:04<00:12, 133.83it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 316 | Mem: 26.53MB, Util: 68%  global_step : 16316
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 317 | Mem: 26.53MB, Util: 68%  global_step : 16317
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 318 | Mem: 26.53MB, Util: 68%  global_step : 16318
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 319 | Mem: 26.53MB, Util: 68%  global_step : 16319
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 320 | Mem: 26.53MB, Util: 72%  global_step : 16320
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 321 | Mem: 26.53MB, Util: 72%  global_step : 16321
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 322 | Mem: 26.53MB, Util: 72%  global_step : 16322
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 323 | Mem: 26.53MB, Util: 72%  global_step 

[Rank 0] Train Epoch 8:  17%|█▋        | 335/2000 [00:04<00:11, 142.43it/s]
[Rank 1] Train Epoch 8:  18%|█▊        | 350/2000 [00:04<00:11, 139.64it/s]
[Rank 2] Train Epoch 8:  17%|█▋        | 347/2000 [00:04<00:11, 137.92it/s]
[Rank 0] Train Epoch 8:  18%|█▊        | 350/2000 [00:04<00:11, 143.94it/s]
[Rank 1] Train Epoch 8:  18%|█▊        | 365/2000 [00:04<00:11, 142.51it/s]
[Rank 2] Train Epoch 8:  18%|█▊        | 362/2000 [00:04<00:11, 140.62it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 347 | Mem: 26.53MB, Util: 72%  global_step : 16347
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 348 | Mem: 26.53MB, Util: 72%  global_step : 16348
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 349 | Mem: 26.53MB, Util: 72%  global_step : 16349
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 350 | Mem: 26.53MB, Util: 71%  global_step : 16350
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 351 | Mem: 26.53MB, Util: 71%  global_step : 16351
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 352 | Mem: 26.53MB, Util: 71%  global_step : 16352
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 353 | Mem: 26.53MB, Util: 71%  global_step : 16353
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 354 | Mem: 26.53MB, Util: 71%  global_step 

[Rank 0] Train Epoch 8:  18%|█▊        | 365/2000 [00:04<00:11, 144.03it/s]
[Rank 1] Train Epoch 8:  19%|█▉        | 381/2000 [00:04<00:11, 145.78it/s]
[Rank 2] Train Epoch 8:  19%|█▉        | 377/2000 [00:04<00:11, 142.76it/s]
[Rank 0] Train Epoch 8:  19%|█▉        | 380/2000 [00:05<00:11, 143.56it/s]
[Rank 1] Train Epoch 8:  20%|█▉        | 397/2000 [00:05<00:10, 147.67it/s]
[Rank 2] Train Epoch 8:  20%|█▉        | 392/2000 [00:05<00:11, 144.04it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 377 | Mem: 26.53MB, Util: 71%  global_step : 16377
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 378 | Mem: 26.53MB, Util: 71%  global_step : 16378
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 379 | Mem: 26.53MB, Util: 71%  global_step : 16379
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 380 | Mem: 26.53MB, Util: 70%  global_step : 16380
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 381 | Mem: 26.53MB, Util: 70%  global_step : 16381
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 382 | Mem: 26.53MB, Util: 70%  global_step : 16382
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 383 | Mem: 26.53MB, Util: 70%  global_step : 16383
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 384 | Mem: 26.53MB, Util: 70%  global_step 

[Rank 0] Train Epoch 8:  20%|█▉        | 395/2000 [00:05<00:11, 142.09it/s]
[Rank 2] Train Epoch 8:  20%|██        | 407/2000 [00:05<00:11, 136.77it/s]
[Rank 0] Train Epoch 8:  20%|██        | 410/2000 [00:05<00:11, 141.00it/s]
[Rank 1] Train Epoch 8:  21%|██        | 412/2000 [00:05<00:11, 135.02it/s]
[Rank 2] Train Epoch 8:  21%|██        | 422/2000 [00:05<00:11, 140.46it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 406 | Mem: 26.53MB, Util: 70%  global_step : 16406
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 407 | Mem: 26.53MB, Util: 70%  global_step : 16407
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 408 | Mem: 26.53MB, Util: 67%  global_step : 16408
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 409 | Mem: 26.53MB, Util: 67%  global_step : 16409
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 410 | Mem: 26.53MB, Util: 67%  global_step : 16410
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 411 | Mem: 26.53MB, Util: 67%  global_step : 16411
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 412 | Mem: 26.53MB, Util: 67%  global_step : 16412
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 413 | Mem: 26.53MB, Util: 67%  global_step 

[Rank 0] Train Epoch 8:  21%|██▏       | 425/2000 [00:05<00:11, 141.83it/s]
[Rank 1] Train Epoch 8:  21%|██▏       | 428/2000 [00:05<00:11, 139.90it/s]
[Rank 2] Train Epoch 8:  22%|██▏       | 437/2000 [00:05<00:10, 142.43it/s]
[Rank 0] Train Epoch 8:  22%|██▏       | 440/2000 [00:05<00:10, 142.91it/s]
[Rank 1] Train Epoch 8:  22%|██▏       | 444/2000 [00:05<00:10, 143.51it/s]
[Rank 2] Train Epoch 8:  23%|██▎       | 452/2000 [00:05<00:10, 140.98it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 436 | Mem: 26.53MB, Util: 67%  global_step : 16436
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 437 | Mem: 26.53MB, Util: 67%  global_step : 16437
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 438 | Mem: 26.53MB, Util: 69%  global_step : 16438
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 439 | Mem: 26.53MB, Util: 69%  global_step : 16439
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 440 | Mem: 26.53MB, Util: 69%  global_step : 16440
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 441 | Mem: 26.53MB, Util: 69%  global_step : 16441
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 442 | Mem: 26.53MB, Util: 69%  global_step : 16442
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 443 | Mem: 26.53MB, Util: 69%  global_step 

[Rank 0] Train Epoch 8:  23%|██▎       | 455/2000 [00:05<00:10, 144.10it/s]
[Rank 1] Train Epoch 8:  23%|██▎       | 460/2000 [00:05<00:10, 146.52it/s]
[Rank 2] Train Epoch 8:  23%|██▎       | 467/2000 [00:05<00:10, 143.04it/s]
[Rank 0] Train Epoch 8:  24%|██▎       | 470/2000 [00:05<00:10, 144.09it/s]
[Rank 1] Train Epoch 8:  24%|██▍       | 476/2000 [00:05<00:10, 147.97it/s]
[Rank 2] Train Epoch 8:  24%|██▍       | 482/2000 [00:05<00:10, 143.93it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 467 | Mem: 26.53MB, Util: 69%  global_step : 16467
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 468 | Mem: 26.53MB, Util: 69%  global_step : 16468
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 469 | Mem: 26.53MB, Util: 69%  global_step : 16469
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 470 | Mem: 26.53MB, Util: 69%  global_step : 16470
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 471 | Mem: 26.53MB, Util: 69%  global_step : 16471
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 472 | Mem: 26.53MB, Util: 69%  global_step : 16472
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 473 | Mem: 26.53MB, Util: 69%  global_step : 16473
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 474 | Mem: 26.53MB, Util: 69%  global_step 

[Rank 0] Train Epoch 8:  24%|██▍       | 485/2000 [00:05<00:10, 144.60it/s]
[Rank 1] Train Epoch 8:  25%|██▍       | 491/2000 [00:05<00:10, 145.39it/s]
[Rank 2] Train Epoch 8:  25%|██▍       | 497/2000 [00:05<00:10, 145.15it/s]
[Rank 0] Train Epoch 8:  25%|██▌       | 500/2000 [00:05<00:10, 144.70it/s]
[Rank 1] Train Epoch 8:  25%|██▌       | 506/2000 [00:05<00:10, 137.25it/s]
[Rank 2] Train Epoch 8:  26%|██▌       | 512/2000 [00:05<00:10, 143.55it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 498 | Mem: 26.53MB, Util: 70%  global_step : 16498
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 499 | Mem: 26.53MB, Util: 70%  global_step : 16499
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 500 | Mem: 26.53MB, Util: 70%  global_step : 16500
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 501 | Mem: 26.53MB, Util: 70%  global_step : 16501
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 502 | Mem: 26.53MB, Util: 70%  global_step : 16502
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 503 | Mem: 26.53MB, Util: 70%  global_step : 16503
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 504 | Mem: 26.53MB, Util: 70%  global_step : 16504
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 505 | Mem: 26.53MB, Util: 70%  global_step 

[Rank 0] Train Epoch 8:  26%|██▌       | 515/2000 [00:05<00:10, 145.31it/s]
[Rank 1] Train Epoch 8:  26%|██▌       | 522/2000 [00:05<00:10, 142.51it/s]
[Rank 2] Train Epoch 8:  26%|██▋       | 527/2000 [00:06<00:10, 144.67it/s]
[Rank 0] Train Epoch 8:  26%|██▋       | 530/2000 [00:06<00:10, 145.55it/s]
[Rank 1] Train Epoch 8:  27%|██▋       | 538/2000 [00:06<00:09, 146.24it/s]
[Rank 2] Train Epoch 8:  27%|██▋       | 543/2000 [00:06<00:09, 147.90it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 529 | Mem: 26.53MB, Util: 70%  global_step : 16529
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 530 | Mem: 26.53MB, Util: 70%  global_step : 16530
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 531 | Mem: 26.53MB, Util: 70%  global_step : 16531
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 532 | Mem: 26.53MB, Util: 70%  global_step : 16532
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 533 | Mem: 26.53MB, Util: 70%  global_step : 16533
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 534 | Mem: 26.53MB, Util: 70%  global_step : 16534
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 535 | Mem: 26.53MB, Util: 70%  global_step : 16535
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 536 | Mem: 26.53MB, Util: 70%  global_step 

[Rank 0] Train Epoch 8:  27%|██▋       | 545/2000 [00:06<00:09, 146.15it/s]
[Rank 1] Train Epoch 8:  28%|██▊       | 553/2000 [00:06<00:09, 146.58it/s]
[Rank 2] Train Epoch 8:  28%|██▊       | 558/2000 [00:06<00:09, 147.86it/s]
[Rank 0] Train Epoch 8:  28%|██▊       | 560/2000 [00:06<00:09, 145.71it/s]
[Rank 1] Train Epoch 8:  28%|██▊       | 568/2000 [00:06<00:09, 146.56it/s]
[Rank 2] Train Epoch 8:  29%|██▊       | 573/2000 [00:06<00:09, 146.74it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 559 | Mem: 26.53MB, Util: 72%  global_step : 16559
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 560 | Mem: 26.53MB, Util: 72%  global_step : 16560
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 561 | Mem: 26.53MB, Util: 72%  global_step : 16561
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 562 | Mem: 26.53MB, Util: 72%  global_step : 16562
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 563 | Mem: 26.53MB, Util: 72%  global_step : 16563
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 564 | Mem: 26.53MB, Util: 72%  global_step : 16564
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 565 | Mem: 26.53MB, Util: 72%  global_step : 16565
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 566 | Mem: 26.53MB, Util: 72%  global_step 

[Rank 0] Train Epoch 8:  29%|██▉       | 575/2000 [00:06<00:09, 145.75it/s]
[Rank 1] Train Epoch 8:  29%|██▉       | 583/2000 [00:06<00:09, 145.28it/s]
[Rank 0] Train Epoch 8:  30%|██▉       | 590/2000 [00:06<00:09, 146.15it/s]
[Rank 2] Train Epoch 8:  29%|██▉       | 589/2000 [00:06<00:09, 147.98it/s]
[Rank 0] Train Epoch 8:  30%|███       | 605/2000 [00:06<00:09, 145.99it/s]
[Rank 1] Train Epoch 8:  30%|██▉       | 598/2000 [00:06<00:09, 145.65it/s]
[Rank 2] Train Epoch 8:  30%|███       | 604/2000 [00:06<00:09, 143.02it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 592 | Mem: 26.53MB, Util: 100%  global_step : 16592
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 593 | Mem: 26.53MB, Util: 100%  global_step : 16593
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 594 | Mem: 26.53MB, Util: 100%  global_step : 16594
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 595 | Mem: 26.53MB, Util: 100%  global_step : 16595
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 596 | Mem: 26.53MB, Util: 100%  global_step : 16596
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 597 | Mem: 26.53MB, Util: 100%  global_step : 16597
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 598 | Mem: 26.53MB, Util: 100%  global_step : 16598
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 599 | Mem: 26.53MB, 

[Rank 1] Train Epoch 8:  31%|███       | 613/2000 [00:06<00:09, 140.23it/s]
[Rank 0] Train Epoch 8:  31%|███       | 620/2000 [00:06<00:09, 146.22it/s]
[Rank 2] Train Epoch 8:  31%|███       | 620/2000 [00:06<00:09, 147.32it/s]
[Rank 1] Train Epoch 8:  31%|███▏      | 628/2000 [00:06<00:09, 142.08it/s]
[Rank 0] Train Epoch 8:  32%|███▏      | 635/2000 [00:06<00:09, 146.14it/s]
[Rank 2] Train Epoch 8:  32%|███▏      | 637/2000 [00:06<00:08, 151.49it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 621 | Mem: 26.53MB, Util: 100%  global_step : 16621
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 622 | Mem: 26.53MB, Util: 90%  global_step : 16622
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 623 | Mem: 26.53MB, Util: 90%  global_step : 16623
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 624 | Mem: 26.53MB, Util: 90%  global_step : 16624
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 625 | Mem: 26.53MB, Util: 90%  global_step : 16625
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 626 | Mem: 26.53MB, Util: 90%  global_step : 16626
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 627 | Mem: 26.53MB, Util: 90%  global_step : 16627
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 628 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 8:  32%|███▏      | 643/2000 [00:06<00:09, 143.53it/s]
[Rank 0] Train Epoch 8:  32%|███▎      | 650/2000 [00:06<00:09, 145.05it/s]
[Rank 2] Train Epoch 8:  33%|███▎      | 653/2000 [00:06<00:08, 153.63it/s]
[Rank 1] Train Epoch 8:  33%|███▎      | 658/2000 [00:06<00:09, 142.50it/s]
[Rank 0] Train Epoch 8:  33%|███▎      | 665/2000 [00:06<00:09, 144.28it/s]
[Rank 2] Train Epoch 8:  33%|███▎      | 669/2000 [00:06<00:08, 155.26it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 651 | Mem: 26.53MB, Util: 86%  global_step : 16651
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 652 | Mem: 26.53MB, Util: 86%  global_step : 16652
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 653 | Mem: 26.53MB, Util: 86%  global_step : 16653
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 654 | Mem: 26.53MB, Util: 86%  global_step : 16654
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 655 | Mem: 26.53MB, Util: 86%  global_step : 16655
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 656 | Mem: 26.53MB, Util: 86%  global_step : 16656
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 657 | Mem: 26.53MB, Util: 86%  global_step : 16657
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 658 | Mem: 26.53MB, Util: 8

[Rank 1] Train Epoch 8:  34%|███▎      | 673/2000 [00:07<00:09, 142.83it/s]
[Rank 0] Train Epoch 8:  34%|███▍      | 680/2000 [00:07<00:09, 143.69it/s]
[Rank 2] Train Epoch 8:  34%|███▍      | 685/2000 [00:07<00:08, 153.06it/s]
[Rank 1] Train Epoch 8:  34%|███▍      | 688/2000 [00:07<00:09, 142.88it/s]
[Rank 0] Train Epoch 8:  35%|███▍      | 695/2000 [00:07<00:09, 144.37it/s]
[Rank 2] Train Epoch 8:  35%|███▌      | 701/2000 [00:07<00:09, 132.88it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 681 | Mem: 26.53MB, Util: 91%  global_step : 16681
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 682 | Mem: 26.53MB, Util: 91%  global_step : 16682
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 683 | Mem: 26.53MB, Util: 91%  global_step : 16683
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 684 | Mem: 26.53MB, Util: 91%  global_step : 16684
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 685 | Mem: 26.53MB, Util: 91%  global_step : 16685
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 686 | Mem: 26.53MB, Util: 91%  global_step : 16686
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 687 | Mem: 26.53MB, Util: 91%  global_step : 16687
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 688 | Mem: 26.53MB, Util: 9

[Rank 1] Train Epoch 8:  35%|███▌      | 703/2000 [00:07<00:09, 143.33it/s]
[Rank 0] Train Epoch 8:  36%|███▌      | 710/2000 [00:07<00:08, 144.39it/s]
[Rank 2] Train Epoch 8:  36%|███▌      | 717/2000 [00:07<00:09, 139.02it/s]
[Rank 1] Train Epoch 8:  36%|███▌      | 718/2000 [00:07<00:08, 144.54it/s]
[Rank 0] Train Epoch 8:  36%|███▋      | 725/2000 [00:07<00:08, 144.76it/s]
[Rank 2] Train Epoch 8:  37%|███▋      | 733/2000 [00:07<00:08, 144.70it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 711 | Mem: 26.53MB, Util: 94%  global_step : 16711
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 712 | Mem: 26.53MB, Util: 94%  global_step : 16712
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 713 | Mem: 26.53MB, Util: 94%  global_step : 16713
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 714 | Mem: 26.53MB, Util: 94%  global_step : 16714
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 715 | Mem: 26.53MB, Util: 94%  global_step : 16715
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 716 | Mem: 26.53MB, Util: 94%  global_step : 16716
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 717 | Mem: 26.53MB, Util: 94%  global_step : 16717
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 718 | Mem: 26.53MB, Util: 9

[Rank 1] Train Epoch 8:  37%|███▋      | 733/2000 [00:07<00:08, 142.13it/s]
[Rank 0] Train Epoch 8:  37%|███▋      | 740/2000 [00:07<00:08, 146.15it/s]
[Rank 2] Train Epoch 8:  37%|███▋      | 749/2000 [00:07<00:08, 148.08it/s]
[Rank 1] Train Epoch 8:  37%|███▋      | 748/2000 [00:07<00:08, 144.08it/s]
[Rank 0] Train Epoch 8:  38%|███▊      | 756/2000 [00:07<00:08, 148.36it/s]
[Rank 2] Train Epoch 8:  38%|███▊      | 765/2000 [00:07<00:08, 151.38it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 740 | Mem: 26.53MB, Util: 92%  global_step : 16740
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 741 | Mem: 26.53MB, Util: 92%  global_step : 16741
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 742 | Mem: 26.53MB, Util: 92%  global_step : 16742
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 743 | Mem: 26.53MB, Util: 92%  global_step : 16743
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 744 | Mem: 26.53MB, Util: 92%  global_step : 16744
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 745 | Mem: 26.53MB, Util: 92%  global_step : 16745
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 746 | Mem: 26.53MB, Util: 92%  global_step : 16746
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 747 | Mem: 26.53MB, Util: 9

[Rank 1] Train Epoch 8:  38%|███▊      | 763/2000 [00:07<00:08, 144.48it/s]
[Rank 0] Train Epoch 8:  39%|███▊      | 772/2000 [00:07<00:08, 149.64it/s]
[Rank 2] Train Epoch 8:  39%|███▉      | 782/2000 [00:07<00:07, 154.11it/s]
[Rank 1] Train Epoch 8:  39%|███▉      | 778/2000 [00:07<00:08, 144.88it/s]
[Rank 0] Train Epoch 8:  39%|███▉      | 788/2000 [00:07<00:08, 150.88it/s]
[Rank 2] Train Epoch 8:  40%|███▉      | 798/2000 [00:07<00:07, 152.41it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 771 | Mem: 26.53MB, Util: 73%  global_step : 16771
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 772 | Mem: 26.53MB, Util: 73%  global_step : 16772
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 773 | Mem: 26.53MB, Util: 73%  global_step : 16773
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 774 | Mem: 26.53MB, Util: 73%  global_step : 16774
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 775 | Mem: 26.53MB, Util: 73%  global_step : 16775
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 776 | Mem: 26.53MB, Util: 73%  global_step : 16776
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 777 | Mem: 26.53MB, Util: 73%  global_step : 16777
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 778 | Mem: 26.53MB, Util: 7

[Rank 1] Train Epoch 8:  40%|███▉      | 793/2000 [00:07<00:08, 144.63it/s]
[Rank 1] Train Epoch 8:  40%|████      | 808/2000 [00:07<00:08, 144.50it/s]
[Rank 0] Train Epoch 8:  40%|████      | 804/2000 [00:07<00:08, 137.08it/s]
[Rank 2] Train Epoch 8:  41%|████      | 814/2000 [00:08<00:09, 131.35it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 801 | Mem: 26.53MB, Util: 72%  global_step : 16801
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 802 | Mem: 26.53MB, Util: 72%  global_step : 16802
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 803 | Mem: 26.53MB, Util: 72%  global_step : 16803
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 804 | Mem: 26.53MB, Util: 72%  global_step : 16804
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 805 | Mem: 26.53MB, Util: 72%  global_step : 16805
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 806 | Mem: 26.53MB, Util: 72%  global_step : 16806
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 807 | Mem: 26.53MB, Util: 72%  global_step : 16807
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 808 | Mem: 26.53MB, Util: 7

[Rank 1] Train Epoch 8:  41%|████      | 823/2000 [00:08<00:08, 143.11it/s]
[Rank 0] Train Epoch 8:  41%|████      | 819/2000 [00:08<00:08, 139.40it/s]
[Rank 2] Train Epoch 8:  41%|████▏     | 829/2000 [00:08<00:08, 135.00it/s]
[Rank 1] Train Epoch 8:  42%|████▏     | 838/2000 [00:08<00:08, 142.77it/s]
[Rank 0] Train Epoch 8:  42%|████▏     | 835/2000 [00:08<00:08, 143.25it/s]
[Rank 2] Train Epoch 8:  42%|████▏     | 844/2000 [00:08<00:08, 137.92it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 830 | Mem: 26.53MB, Util: 82%  global_step : 16830
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 831 | Mem: 26.53MB, Util: 82%  global_step : 16831
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 832 | Mem: 26.53MB, Util: 82%  global_step : 16832
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 833 | Mem: 26.53MB, Util: 82%  global_step : 16833
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 834 | Mem: 26.53MB, Util: 82%  global_step : 16834
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 835 | Mem: 26.53MB, Util: 82%  global_step : 16835
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 836 | Mem: 26.53MB, Util: 82%  global_step : 16836
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 837 | Mem: 26.53MB, Util: 8

[Rank 1] Train Epoch 8:  43%|████▎     | 853/2000 [00:08<00:08, 142.73it/s]
[Rank 0] Train Epoch 8:  43%|████▎     | 851/2000 [00:08<00:07, 145.90it/s]
[Rank 2] Train Epoch 8:  43%|████▎     | 859/2000 [00:08<00:08, 139.53it/s]
[Rank 1] Train Epoch 8:  43%|████▎     | 868/2000 [00:08<00:07, 142.48it/s]
[Rank 0] Train Epoch 8:  43%|████▎     | 866/2000 [00:08<00:07, 145.20it/s]
[Rank 2] Train Epoch 8:  44%|████▎     | 874/2000 [00:08<00:07, 141.05it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 860 | Mem: 26.53MB, Util: 81%  global_step : 16860
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 861 | Mem: 26.53MB, Util: 81%  global_step : 16861
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 862 | Mem: 26.53MB, Util: 81%  global_step : 16862
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 863 | Mem: 26.53MB, Util: 81%  global_step : 16863
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 864 | Mem: 26.53MB, Util: 81%  global_step : 16864
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 865 | Mem: 26.53MB, Util: 81%  global_step : 16865
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 866 | Mem: 26.53MB, Util: 81%  global_step : 16866
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 867 | Mem: 26.53MB, Util: 8

[Rank 1] Train Epoch 8:  44%|████▍     | 883/2000 [00:08<00:07, 141.43it/s]
[Rank 0] Train Epoch 8:  44%|████▍     | 881/2000 [00:08<00:07, 144.71it/s]
[Rank 2] Train Epoch 8:  44%|████▍     | 889/2000 [00:08<00:08, 138.39it/s]
[Rank 1] Train Epoch 8:  45%|████▍     | 898/2000 [00:08<00:07, 141.86it/s]
[Rank 0] Train Epoch 8:  45%|████▍     | 896/2000 [00:08<00:07, 144.29it/s]
[Rank 2] Train Epoch 8:  45%|████▌     | 903/2000 [00:08<00:07, 137.99it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 890 | Mem: 26.53MB, Util: 68%  global_step : 16890
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 891 | Mem: 26.53MB, Util: 68%  global_step : 16891
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 892 | Mem: 26.53MB, Util: 68%  global_step : 16892
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 893 | Mem: 26.53MB, Util: 68%  global_step : 16893
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 894 | Mem: 26.53MB, Util: 68%  global_step : 16894
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 895 | Mem: 26.53MB, Util: 68%  global_step : 16895
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 896 | Mem: 26.53MB, Util: 68%  global_step : 16896
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 897 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 8:  46%|████▌     | 913/2000 [00:08<00:07, 140.23it/s]
[Rank 0] Train Epoch 8:  46%|████▌     | 911/2000 [00:08<00:07, 138.89it/s]
[Rank 2] Train Epoch 8:  46%|████▌     | 918/2000 [00:08<00:07, 140.48it/s]
[Rank 1] Train Epoch 8:  46%|████▋     | 928/2000 [00:08<00:08, 133.85it/s]
[Rank 0] Train Epoch 8:  46%|████▋     | 925/2000 [00:08<00:07, 138.94it/s]
[Rank 2] Train Epoch 8:  47%|████▋     | 933/2000 [00:08<00:07, 142.42it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 917 | Mem: 26.53MB, Util: 67%  global_step : 16917
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 918 | Mem: 26.53MB, Util: 67%  global_step : 16918
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 919 | Mem: 26.53MB, Util: 67%  global_step : 16919
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 920 | Mem: 26.53MB, Util: 67%  global_step : 16920
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 921 | Mem: 26.53MB, Util: 67%  global_step : 16921
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 922 | Mem: 26.53MB, Util: 67%  global_step : 16922
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 923 | Mem: 26.53MB, Util: 67%  global_step : 16923
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 924 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 8:  47%|████▋     | 943/2000 [00:08<00:07, 136.01it/s]
[Rank 0] Train Epoch 8:  47%|████▋     | 940/2000 [00:08<00:07, 140.71it/s]
[Rank 2] Train Epoch 8:  47%|████▋     | 948/2000 [00:08<00:07, 143.97it/s]
[Rank 1] Train Epoch 8:  48%|████▊     | 958/2000 [00:09<00:07, 138.25it/s]
[Rank 0] Train Epoch 8:  48%|████▊     | 955/2000 [00:08<00:07, 141.50it/s]
[Rank 2] Train Epoch 8:  48%|████▊     | 963/2000 [00:09<00:07, 143.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 946 | Mem: 26.53MB, Util: 60%  global_step : 16946
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 947 | Mem: 26.53MB, Util: 60%  global_step : 16947
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 948 | Mem: 26.53MB, Util: 60%  global_step : 16948
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 949 | Mem: 26.53MB, Util: 60%  global_step : 16949
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 950 | Mem: 26.53MB, Util: 60%  global_step : 16950
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 951 | Mem: 26.53MB, Util: 60%  global_step : 16951
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 952 | Mem: 26.53MB, Util: 60%  global_step : 16952
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 953 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 8:  49%|████▊     | 973/2000 [00:09<00:07, 139.54it/s]
[Rank 0] Train Epoch 8:  48%|████▊     | 970/2000 [00:09<00:07, 141.97it/s]
[Rank 2] Train Epoch 8:  49%|████▉     | 978/2000 [00:09<00:07, 145.26it/s]
[Rank 1] Train Epoch 8:  49%|████▉     | 988/2000 [00:09<00:07, 140.97it/s]
[Rank 0] Train Epoch 8:  49%|████▉     | 985/2000 [00:09<00:07, 142.47it/s]
[Rank 2] Train Epoch 8:  50%|████▉     | 993/2000 [00:09<00:06, 145.50it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 976 | Mem: 26.53MB, Util: 68%  global_step : 16976
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 977 | Mem: 26.53MB, Util: 68%  global_step : 16977
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 978 | Mem: 26.53MB, Util: 68%  global_step : 16978
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 979 | Mem: 26.53MB, Util: 68%  global_step : 16979
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 980 | Mem: 26.53MB, Util: 68%  global_step : 16980
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 981 | Mem: 26.53MB, Util: 68%  global_step : 16981
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 982 | Mem: 26.53MB, Util: 68%  global_step : 16982
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 983 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 8:  50%|█████     | 1003/2000 [00:09<00:07, 141.33it/s]
[Rank 0] Train Epoch 8:  50%|█████     | 1000/2000 [00:09<00:06, 143.77it/s]
[Rank 2] Train Epoch 8:  50%|█████     | 1008/2000 [00:09<00:07, 130.01it/s]
[Rank 1] Train Epoch 8:  51%|█████     | 1018/2000 [00:09<00:06, 141.09it/s]
[Rank 0] Train Epoch 8:  51%|█████     | 1015/2000 [00:09<00:07, 132.89it/s]
[Rank 2] Train Epoch 8:  51%|█████     | 1023/2000 [00:09<00:07, 134.99it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1005 | Mem: 26.53MB, Util: 69%  global_step : 17005
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1006 | Mem: 26.53MB, Util: 69%  global_step : 17006
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1007 | Mem: 26.53MB, Util: 69%  global_step : 17007
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1008 | Mem: 26.53MB, Util: 69%  global_step : 17008
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1009 | Mem: 26.53MB, Util: 69%  global_step : 17009
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1010 | Mem: 26.53MB, Util: 69%  global_step : 17010
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1011 | Mem: 26.53MB, Util: 69%  global_step : 17011
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1012 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  52%|█████▏    | 1033/2000 [00:09<00:06, 138.69it/s]
[Rank 0] Train Epoch 8:  52%|█████▏    | 1030/2000 [00:09<00:07, 136.54it/s]
[Rank 2] Train Epoch 8:  52%|█████▏    | 1038/2000 [00:09<00:06, 138.86it/s]
[Rank 1] Train Epoch 8:  52%|█████▏    | 1048/2000 [00:09<00:06, 140.06it/s]
[Rank 0] Train Epoch 8:  52%|█████▏    | 1045/2000 [00:09<00:06, 138.63it/s]
[Rank 2] Train Epoch 8:  53%|█████▎    | 1053/2000 [00:09<00:06, 141.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1034 | Mem: 26.53MB, Util: 69%  global_step : 17034
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1035 | Mem: 26.53MB, Util: 69%  global_step : 17035
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1036 | Mem: 26.53MB, Util: 69%  global_step : 17036
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1037 | Mem: 26.53MB, Util: 69%  global_step : 17037
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1038 | Mem: 26.53MB, Util: 69%  global_step : 17038
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1039 | Mem: 26.53MB, Util: 69%  global_step : 17039
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1040 | Mem: 26.53MB, Util: 69%  global_step : 17040
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1041 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  53%|█████▎    | 1063/2000 [00:09<00:06, 139.74it/s]
[Rank 0] Train Epoch 8:  53%|█████▎    | 1060/2000 [00:09<00:06, 139.69it/s]
[Rank 2] Train Epoch 8:  53%|█████▎    | 1068/2000 [00:09<00:06, 143.84it/s]
[Rank 1] Train Epoch 8:  54%|█████▍    | 1078/2000 [00:09<00:06, 141.54it/s]
[Rank 0] Train Epoch 8:  54%|█████▍    | 1075/2000 [00:09<00:06, 140.99it/s]
[Rank 2] Train Epoch 8:  54%|█████▍    | 1083/2000 [00:09<00:06, 145.60it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1063 | Mem: 26.53MB, Util: 66%  global_step : 17063
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1064 | Mem: 26.53MB, Util: 66%  global_step : 17064
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1065 | Mem: 26.53MB, Util: 66%  global_step : 17065
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1066 | Mem: 26.53MB, Util: 66%  global_step : 17066
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1067 | Mem: 26.53MB, Util: 66%  global_step : 17067
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1068 | Mem: 26.53MB, Util: 66%  global_step : 17068
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1069 | Mem: 26.53MB, Util: 66%  global_step : 17069
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1070 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  55%|█████▍    | 1093/2000 [00:09<00:06, 143.65it/s]
[Rank 0] Train Epoch 8:  55%|█████▍    | 1090/2000 [00:09<00:06, 140.99it/s]
[Rank 2] Train Epoch 8:  55%|█████▍    | 1098/2000 [00:10<00:06, 146.02it/s]
[Rank 1] Train Epoch 8:  55%|█████▌    | 1108/2000 [00:10<00:06, 145.38it/s]
[Rank 0] Train Epoch 8:  55%|█████▌    | 1105/2000 [00:10<00:06, 139.46it/s]
[Rank 2] Train Epoch 8:  56%|█████▌    | 1113/2000 [00:10<00:06, 133.37it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1093 | Mem: 26.53MB, Util: 68%  global_step : 17093
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1094 | Mem: 26.53MB, Util: 68%  global_step : 17094
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1095 | Mem: 26.53MB, Util: 68%  global_step : 17095
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1096 | Mem: 26.53MB, Util: 68%  global_step : 17096
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1097 | Mem: 26.53MB, Util: 68%  global_step : 17097
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1098 | Mem: 26.53MB, Util: 68%  global_step : 17098
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1099 | Mem: 26.53MB, Util: 68%  global_step : 17099
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1100 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  56%|█████▌    | 1123/2000 [00:10<00:06, 145.22it/s]
[Rank 0] Train Epoch 8:  56%|█████▌    | 1119/2000 [00:10<00:06, 138.43it/s]
[Rank 2] Train Epoch 8:  56%|█████▋    | 1128/2000 [00:10<00:06, 136.39it/s]
[Rank 1] Train Epoch 8:  57%|█████▋    | 1138/2000 [00:10<00:05, 143.86it/s]
[Rank 0] Train Epoch 8:  57%|█████▋    | 1133/2000 [00:10<00:06, 136.75it/s]
[Rank 2] Train Epoch 8:  57%|█████▋    | 1144/2000 [00:10<00:06, 141.47it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1124 | Mem: 26.53MB, Util: 74%  global_step : 17124
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1125 | Mem: 26.53MB, Util: 74%  global_step : 17125
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1126 | Mem: 26.53MB, Util: 74%  global_step : 17126
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1127 | Mem: 26.53MB, Util: 74%  global_step : 17127
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1128 | Mem: 26.53MB, Util: 74%  global_step : 17128
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1129 | Mem: 26.53MB, Util: 74%  global_step : 17129
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1130 | Mem: 26.53MB, Util: 74%  global_step : 17130
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1131 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  58%|█████▊    | 1153/2000 [00:10<00:05, 143.42it/s]
[Rank 0] Train Epoch 8:  57%|█████▋    | 1147/2000 [00:10<00:06, 136.65it/s]
[Rank 2] Train Epoch 8:  58%|█████▊    | 1160/2000 [00:10<00:05, 144.56it/s]
[Rank 0] Train Epoch 8:  58%|█████▊    | 1161/2000 [00:10<00:06, 136.34it/s]
[Rank 2] Train Epoch 8:  59%|█████▉    | 1176/2000 [00:10<00:05, 147.13it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1153 | Mem: 26.53MB, Util: 90%  global_step : 17153
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1154 | Mem: 26.53MB, Util: 90%  global_step : 17154
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1155 | Mem: 26.53MB, Util: 90%  global_step : 17155
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1156 | Mem: 26.53MB, Util: 90%  global_step : 17156
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1157 | Mem: 26.53MB, Util: 90%  global_step : 17157
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1158 | Mem: 26.53MB, Util: 90%  global_step : 17158
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1159 | Mem: 26.53MB, Util: 90%  global_step : 17159
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1160 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  58%|█████▊    | 1168/2000 [00:10<00:05, 140.55it/s]
[Rank 0] Train Epoch 8:  59%|█████▉    | 1175/2000 [00:10<00:06, 136.72it/s]
[Rank 2] Train Epoch 8:  60%|█████▉    | 1192/2000 [00:10<00:05, 149.10it/s]
[Rank 1] Train Epoch 8:  59%|█████▉    | 1183/2000 [00:10<00:05, 140.11it/s]
[Rank 0] Train Epoch 8:  59%|█████▉    | 1189/2000 [00:10<00:05, 136.16it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1181 | Mem: 26.53MB, Util: 100%  global_step : 17181
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1182 | Mem: 26.53MB, Util: 100%  global_step : 17182
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1183 | Mem: 26.53MB, Util: 100%  global_step : 17183
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1184 | Mem: 26.53MB, Util: 100%  global_step : 17184
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1185 | Mem: 26.53MB, Util: 100%  global_step : 17185
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1186 | Mem: 26.53MB, Util: 100%  global_step : 17186
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1187 | Mem: 26.53MB, Util: 100%  global_step : 17187
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1188 | Mem: 2

[Rank 1] Train Epoch 8:  60%|█████▉    | 1198/2000 [00:10<00:05, 140.23it/s]
[Rank 0] Train Epoch 8:  60%|██████    | 1203/2000 [00:10<00:05, 135.88it/s]
[Rank 2] Train Epoch 8:  60%|██████    | 1207/2000 [00:10<00:06, 125.40it/s]
[Rank 1] Train Epoch 8:  61%|██████    | 1213/2000 [00:10<00:06, 130.97it/s]
[Rank 0] Train Epoch 8:  61%|██████    | 1217/2000 [00:10<00:05, 136.09it/s]
[Rank 2] Train Epoch 8:  61%|██████    | 1223/2000 [00:10<00:05, 133.27it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1206 | Mem: 26.53MB, Util: 100%  global_step : 17206
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1207 | Mem: 26.53MB, Util: 100%  global_step : 17207
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1208 | Mem: 26.53MB, Util: 100%  global_step : 17208
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1209 | Mem: 26.53MB, Util: 100%  global_step : 17209
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1210 | Mem: 26.53MB, Util: 100%  global_step : 17210
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1211 | Mem: 26.53MB, Util: 100%  global_step : 17211
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1212 | Mem: 26.53MB, Util: 100%  global_step : 17212
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1213 | Mem: 2

[Rank 1] Train Epoch 8:  61%|██████▏   | 1227/2000 [00:10<00:05, 133.31it/s]
[Rank 0] Train Epoch 8:  62%|██████▏   | 1231/2000 [00:11<00:05, 136.25it/s]
[Rank 2] Train Epoch 8:  62%|██████▏   | 1239/2000 [00:11<00:05, 139.12it/s]
[Rank 1] Train Epoch 8:  62%|██████▏   | 1242/2000 [00:11<00:05, 135.92it/s]
[Rank 0] Train Epoch 8:  62%|██████▏   | 1245/2000 [00:11<00:05, 136.02it/s]
[Rank 2] Train Epoch 8:  63%|██████▎   | 1255/2000 [00:11<00:05, 143.41it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1235 | Mem: 26.53MB, Util: 97%  global_step : 17235
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1236 | Mem: 26.53MB, Util: 97%  global_step : 17236
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1237 | Mem: 26.53MB, Util: 97%  global_step : 17237
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1238 | Mem: 26.53MB, Util: 97%  global_step : 17238
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1239 | Mem: 26.53MB, Util: 97%  global_step : 17239
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1240 | Mem: 26.53MB, Util: 97%  global_step : 17240
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1241 | Mem: 26.53MB, Util: 97%  global_step : 17241
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1242 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  63%|██████▎   | 1256/2000 [00:11<00:05, 136.92it/s]
[Rank 0] Train Epoch 8:  63%|██████▎   | 1259/2000 [00:11<00:05, 136.30it/s]
[Rank 2] Train Epoch 8:  64%|██████▎   | 1271/2000 [00:11<00:04, 146.48it/s]
[Rank 1] Train Epoch 8:  64%|██████▎   | 1271/2000 [00:11<00:05, 138.10it/s]
[Rank 0] Train Epoch 8:  64%|██████▎   | 1273/2000 [00:11<00:05, 135.87it/s]
[Rank 2] Train Epoch 8:  64%|██████▍   | 1287/2000 [00:11<00:04, 148.89it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1264 | Mem: 26.53MB, Util: 100%  global_step : 17264
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1265 | Mem: 26.53MB, Util: 100%  global_step : 17265
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1266 | Mem: 26.53MB, Util: 100%  global_step : 17266
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1267 | Mem: 26.53MB, Util: 100%  global_step : 17267
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1268 | Mem: 26.53MB, Util: 100%  global_step : 17268
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1269 | Mem: 26.53MB, Util: 100%  global_step : 17269
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1270 | Mem: 26.53MB, Util: 100%  global_step : 17270
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1271 | Mem: 2

[Rank 1] Train Epoch 8:  64%|██████▍   | 1285/2000 [00:11<00:05, 138.34it/s]
[Rank 0] Train Epoch 8:  64%|██████▍   | 1287/2000 [00:11<00:05, 136.11it/s]
[Rank 2] Train Epoch 8:  65%|██████▌   | 1303/2000 [00:11<00:05, 120.69it/s]
[Rank 1] Train Epoch 8:  65%|██████▍   | 1299/2000 [00:11<00:05, 135.09it/s]
[Rank 0] Train Epoch 8:  65%|██████▌   | 1301/2000 [00:11<00:05, 136.73it/s]
[Rank 2] Train Epoch 8:  66%|██████▌   | 1318/2000 [00:11<00:05, 127.14it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1292 | Mem: 26.53MB, Util: 100%  global_step : 17292
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1293 | Mem: 26.53MB, Util: 100%  global_step : 17293
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1294 | Mem: 26.53MB, Util: 100%  global_step : 17294
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1295 | Mem: 26.53MB, Util: 100%  global_step : 17295
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1296 | Mem: 26.53MB, Util: 100%  global_step : 17296
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1297 | Mem: 26.53MB, Util: 100%  global_step : 17297
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1298 | Mem: 26.53MB, Util: 100%  global_step : 17298
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1299 | Mem: 2

[Rank 1] Train Epoch 8:  66%|██████▌   | 1313/2000 [00:11<00:05, 132.66it/s]
[Rank 0] Train Epoch 8:  66%|██████▌   | 1315/2000 [00:11<00:04, 137.30it/s]
[Rank 2] Train Epoch 8:  67%|██████▋   | 1333/2000 [00:11<00:05, 132.61it/s]
[Rank 1] Train Epoch 8:  66%|██████▋   | 1327/2000 [00:11<00:05, 134.46it/s]
[Rank 0] Train Epoch 8:  66%|██████▋   | 1329/2000 [00:11<00:04, 136.99it/s]
[Rank 0] Train Epoch 8:  67%|██████▋   | 1343/2000 [00:11<00:04, 137.37it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1319 | Mem: 26.53MB, Util: 100%  global_step : 17319
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1320 | Mem: 26.53MB, Util: 100%  global_step : 17320
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1321 | Mem: 26.53MB, Util: 100%  global_step : 17321
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1322 | Mem: 26.53MB, Util: 100%  global_step : 17322
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1323 | Mem: 26.53MB, Util: 100%  global_step : 17323
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1324 | Mem: 26.53MB, Util: 100%  global_step : 17324
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1325 | Mem: 26.53MB, Util: 100%  global_step : 17325
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1326 | Mem: 2

[Rank 2] Train Epoch 8:  67%|██████▋   | 1348/2000 [00:11<00:04, 137.20it/s]
[Rank 1] Train Epoch 8:  67%|██████▋   | 1341/2000 [00:11<00:04, 135.41it/s]
[Rank 0] Train Epoch 8:  68%|██████▊   | 1357/2000 [00:11<00:04, 137.00it/s]
[Rank 2] Train Epoch 8:  68%|██████▊   | 1363/2000 [00:11<00:04, 140.17it/s]
[Rank 1] Train Epoch 8:  68%|██████▊   | 1355/2000 [00:11<00:04, 136.56it/s]
[Rank 0] Train Epoch 8:  69%|██████▊   | 1371/2000 [00:12<00:04, 136.85it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1350 | Mem: 26.53MB, Util: 94%  global_step : 17350
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1351 | Mem: 26.53MB, Util: 94%  global_step : 17351
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1352 | Mem: 26.53MB, Util: 94%  global_step : 17352
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1353 | Mem: 26.53MB, Util: 94%  global_step : 17353
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1354 | Mem: 26.53MB, Util: 94%  global_step : 17354
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1355 | Mem: 26.53MB, Util: 94%  global_step : 17355
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1356 | Mem: 26.53MB, Util: 94%  global_step : 17356
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1357 | Mem: 26.53MB, Util: 9

[Rank 2] Train Epoch 8:  69%|██████▉   | 1378/2000 [00:12<00:04, 142.10it/s]
[Rank 1] Train Epoch 8:  68%|██████▊   | 1369/2000 [00:12<00:04, 136.91it/s]
[Rank 0] Train Epoch 8:  69%|██████▉   | 1385/2000 [00:12<00:04, 135.22it/s]
[Rank 2] Train Epoch 8:  70%|██████▉   | 1393/2000 [00:12<00:04, 143.80it/s]
[Rank 1] Train Epoch 8:  69%|██████▉   | 1383/2000 [00:12<00:04, 137.65it/s]
[Rank 0] Train Epoch 8:  70%|███████   | 1400/2000 [00:12<00:04, 138.08it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1381 | Mem: 26.53MB, Util: 100%  global_step : 17381
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1382 | Mem: 26.53MB, Util: 100%  global_step : 17382
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1383 | Mem: 26.53MB, Util: 100%  global_step : 17383
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1384 | Mem: 26.53MB, Util: 100%  global_step : 17384
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1385 | Mem: 26.53MB, Util: 100%  global_step : 17385
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1386 | Mem: 26.53MB, Util: 100%  global_step : 17386
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1387 | Mem: 26.53MB, Util: 100%  global_step : 17387
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1388 | Mem: 26.53MB, 

[Rank 1] Train Epoch 8:  70%|██████▉   | 1397/2000 [00:12<00:04, 138.02it/s]
[Rank 0] Train Epoch 8:  71%|███████   | 1415/2000 [00:12<00:04, 139.66it/s]
[Rank 2] Train Epoch 8:  70%|███████   | 1408/2000 [00:12<00:04, 126.26it/s]
[Rank 1] Train Epoch 8:  71%|███████   | 1411/2000 [00:12<00:04, 136.14it/s]
[Rank 0] Train Epoch 8:  72%|███████▏  | 1430/2000 [00:12<00:04, 140.63it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1405 | Mem: 26.53MB, Util: 100%  global_step : 17405
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1406 | Mem: 26.53MB, Util: 100%  global_step : 17406
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1407 | Mem: 26.53MB, Util: 100%  global_step : 17407
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1408 | Mem: 26.53MB, Util: 100%  global_step : 17408
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1409 | Mem: 26.53MB, Util: 100%  global_step : 17409
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1410 | Mem: 26.53MB, Util: 100%  global_step : 17410
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1411 | Mem: 26.53MB, Util: 100%  global_step : 17411
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1412 | Mem: 2

[Rank 1] Train Epoch 8:  71%|███████▏  | 1426/2000 [00:12<00:04, 138.39it/s]
[Rank 2] Train Epoch 8:  71%|███████   | 1423/2000 [00:12<00:04, 131.87it/s]
[Rank 0] Train Epoch 8:  72%|███████▏  | 1445/2000 [00:12<00:03, 141.15it/s]
[Rank 1] Train Epoch 8:  72%|███████▏  | 1441/2000 [00:12<00:03, 139.77it/s]
[Rank 2] Train Epoch 8:  72%|███████▏  | 1438/2000 [00:12<00:04, 136.10it/s]
[Rank 0] Train Epoch 8:  73%|███████▎  | 1460/2000 [00:12<00:03, 141.98it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1435 | Mem: 26.53MB, Util: 71%  global_step : 17435
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1436 | Mem: 26.53MB, Util: 71%  global_step : 17436
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1437 | Mem: 26.53MB, Util: 71%  global_step : 17437
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1438 | Mem: 26.53MB, Util: 71%  global_step : 17438
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1439 | Mem: 26.53MB, Util: 71%  global_step : 17439
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1440 | Mem: 26.53MB, Util: 71%  global_step : 17440
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1441 | Mem: 26.53MB, Util: 71%  global_step : 17441
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1442 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  73%|███████▎  | 1456/2000 [00:12<00:03, 140.95it/s]
[Rank 2] Train Epoch 8:  73%|███████▎  | 1453/2000 [00:12<00:03, 139.20it/s]
[Rank 0] Train Epoch 8:  74%|███████▍  | 1475/2000 [00:12<00:03, 142.58it/s]
[Rank 1] Train Epoch 8:  74%|███████▎  | 1471/2000 [00:12<00:03, 141.52it/s]
[Rank 2] Train Epoch 8:  73%|███████▎  | 1468/2000 [00:12<00:03, 141.42it/s]
[Rank 0] Train Epoch 8:  74%|███████▍  | 1490/2000 [00:12<00:03, 143.19it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1465 | Mem: 26.53MB, Util: 75%  global_step : 17465
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1466 | Mem: 26.53MB, Util: 75%  global_step : 17466
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1467 | Mem: 26.53MB, Util: 75%  global_step : 17467
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1468 | Mem: 26.53MB, Util: 75%  global_step : 17468
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1469 | Mem: 26.53MB, Util: 75%  global_step : 17469
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1470 | Mem: 26.53MB, Util: 75%  global_step : 17470
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1471 | Mem: 26.53MB, Util: 75%  global_step : 17471
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1472 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  74%|███████▍  | 1486/2000 [00:12<00:03, 142.27it/s]
[Rank 2] Train Epoch 8:  74%|███████▍  | 1483/2000 [00:12<00:03, 143.63it/s]
[Rank 0] Train Epoch 8:  75%|███████▌  | 1505/2000 [00:12<00:03, 143.34it/s]
[Rank 1] Train Epoch 8:  75%|███████▌  | 1501/2000 [00:12<00:03, 142.74it/s]
[Rank 2] Train Epoch 8:  75%|███████▍  | 1498/2000 [00:12<00:03, 144.60it/s]


[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1497 | Mem: 26.53MB, Util: 100%  global_step : 17497
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1498 | Mem: 26.53MB, Util: 100%  global_step : 17498
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1499 | Mem: 26.53MB, Util: 100%  global_step : 17499
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1500 | Mem: 26.53MB, Util: 100%  global_step : 17500
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1501 | Mem: 26.53MB, Util: 100%  global_step : 17501
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1502 | Mem: 26.53MB, Util: 100%  global_step : 17502
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1503 | Mem: 26.53MB, Util: 100%  global_step : 17503
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [GPU LOG] Epoch 8, Batch 1504 | Mem: 26.53MB, 

[Rank 1] Train Epoch 8:  76%|███████▌  | 1516/2000 [00:13<00:03, 142.25it/s]
[Rank 2] Train Epoch 8:  76%|███████▌  | 1513/2000 [00:13<00:03, 138.34it/s]
[Rank 0] Train Epoch 8:  76%|███████▌  | 1520/2000 [00:13<00:03, 134.65it/s]
[Rank 1] Train Epoch 8:  77%|███████▋  | 1531/2000 [00:13<00:03, 142.41it/s]
[Rank 2] Train Epoch 8:  76%|███████▋  | 1528/2000 [00:13<00:03, 141.13it/s]
[Rank 0] Train Epoch 8:  77%|███████▋  | 1535/2000 [00:13<00:03, 137.33it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1525 | Mem: 26.53MB, Util: 77%  global_step : 17525
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1526 | Mem: 26.53MB, Util: 77%  global_step : 17526
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1527 | Mem: 26.53MB, Util: 77%  global_step : 17527
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1528 | Mem: 26.53MB, Util: 77%  global_step : 17528
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1529 | Mem: 26.53MB, Util: 77%  global_step : 17529
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1530 | Mem: 26.53MB, Util: 77%  global_step : 17530
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1531 | Mem: 26.53MB, Util: 77%  global_step : 17531
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1532 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  77%|███████▋  | 1546/2000 [00:13<00:03, 140.65it/s]
[Rank 2] Train Epoch 8:  77%|███████▋  | 1543/2000 [00:13<00:03, 142.41it/s]
[Rank 0] Train Epoch 8:  78%|███████▊  | 1550/2000 [00:13<00:03, 139.02it/s]
[Rank 1] Train Epoch 8:  78%|███████▊  | 1561/2000 [00:13<00:03, 141.22it/s]
[Rank 2] Train Epoch 8:  78%|███████▊  | 1558/2000 [00:13<00:03, 144.31it/s]
[Rank 0] Train Epoch 8:  78%|███████▊  | 1565/2000 [00:13<00:03, 140.59it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1554 | Mem: 26.53MB, Util: 100%  global_step : 17554
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1555 | Mem: 26.53MB, Util: 100%  global_step : 17555
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1556 | Mem: 26.53MB, Util: 100%  global_step : 17556
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1557 | Mem: 26.53MB, Util: 100%  global_step : 17557
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1558 | Mem: 26.53MB, Util: 100%  global_step : 17558
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1559 | Mem: 26.53MB, Util: 100%  global_step : 17559
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1560 | Mem: 26.53MB, Util: 100%  global_step : 17560
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1561 | Mem: 2

[Rank 1] Train Epoch 8:  79%|███████▉  | 1576/2000 [00:13<00:03, 140.75it/s]
[Rank 2] Train Epoch 8:  79%|███████▊  | 1573/2000 [00:13<00:02, 144.80it/s]
[Rank 0] Train Epoch 8:  79%|███████▉  | 1580/2000 [00:13<00:02, 141.42it/s]
[Rank 0] Train Epoch 8:  80%|███████▉  | 1595/2000 [00:13<00:02, 143.02it/s]
[Rank 1] Train Epoch 8:  80%|███████▉  | 1591/2000 [00:13<00:02, 140.30it/s]
[Rank 2] Train Epoch 8:  79%|███████▉  | 1588/2000 [00:13<00:02, 145.58it/s]
[Rank 0] Train Epoch 8:  80%|████████  | 1610/2000 [00:13<00:02, 142.78it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1583 | Mem: 26.53MB, Util: 100%  global_step : 17583
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1584 | Mem: 26.53MB, Util: 100%  global_step : 17584
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1585 | Mem: 26.53MB, Util: 100%  global_step : 17585
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1586 | Mem: 26.53MB, Util: 100%  global_step : 17586
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1587 | Mem: 26.53MB, Util: 100%  global_step : 17587
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1588 | Mem: 26.53MB, Util: 100%  global_step : 17588
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1589 | Mem: 26.53MB, Util: 100%  global_step : 17589
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1590 | Mem: 2

[Rank 2] Train Epoch 8:  80%|████████  | 1603/2000 [00:13<00:03, 131.56it/s]
[Rank 1] Train Epoch 8:  80%|████████  | 1606/2000 [00:13<00:02, 139.65it/s]
[Rank 0] Train Epoch 8:  81%|████████▏ | 1625/2000 [00:13<00:02, 143.09it/s]
[Rank 1] Train Epoch 8:  81%|████████  | 1621/2000 [00:13<00:02, 140.51it/s]
[Rank 2] Train Epoch 8:  81%|████████  | 1618/2000 [00:13<00:02, 136.12it/s]
[Rank 0] Train Epoch 8:  82%|████████▏ | 1640/2000 [00:13<00:02, 143.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1612 | Mem: 26.53MB, Util: 100%  global_step : 17612
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1613 | Mem: 26.53MB, Util: 100%  global_step : 17613
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1614 | Mem: 26.53MB, Util: 100%  global_step : 17614
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1615 | Mem: 26.53MB, Util: 100%  global_step : 17615
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1616 | Mem: 26.53MB, Util: 100%  global_step : 17616
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1617 | Mem: 26.53MB, Util: 100%  global_step : 17617
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1618 | Mem: 26.53MB, Util: 100%  global_step : 17618
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1619 | Mem: 2

[Rank 1] Train Epoch 8:  82%|████████▏ | 1636/2000 [00:13<00:02, 141.12it/s]
[Rank 2] Train Epoch 8:  82%|████████▏ | 1633/2000 [00:13<00:02, 139.39it/s]
[Rank 0] Train Epoch 8:  83%|████████▎ | 1655/2000 [00:14<00:02, 143.81it/s]
[Rank 1] Train Epoch 8:  83%|████████▎ | 1651/2000 [00:13<00:02, 142.14it/s]
[Rank 2] Train Epoch 8:  82%|████████▏ | 1648/2000 [00:14<00:02, 141.51it/s]
[Rank 0] Train Epoch 8:  84%|████████▎ | 1670/2000 [00:14<00:02, 143.78it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1642 | Mem: 26.53MB, Util: 85%  global_step : 17642
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1643 | Mem: 26.53MB, Util: 85%  global_step : 17643
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1644 | Mem: 26.53MB, Util: 85%  global_step : 17644
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1645 | Mem: 26.53MB, Util: 85%  global_step : 17645
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1646 | Mem: 26.53MB, Util: 85%  global_step : 17646
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1647 | Mem: 26.53MB, Util: 85%  global_step : 17647
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1648 | Mem: 26.53MB, Util: 85%  global_step : 17648
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1649 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  83%|████████▎ | 1666/2000 [00:14<00:02, 142.41it/s]
[Rank 2] Train Epoch 8:  83%|████████▎ | 1663/2000 [00:14<00:02, 142.85it/s]
[Rank 0] Train Epoch 8:  84%|████████▍ | 1685/2000 [00:14<00:02, 143.68it/s]
[Rank 1] Train Epoch 8:  84%|████████▍ | 1681/2000 [00:14<00:02, 143.04it/s]
[Rank 2] Train Epoch 8:  84%|████████▍ | 1678/2000 [00:14<00:02, 143.47it/s]
[Rank 0] Train Epoch 8:  85%|████████▌ | 1700/2000 [00:14<00:02, 143.88it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1672 | Mem: 26.53MB, Util: 69%  global_step : 17672
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1673 | Mem: 26.53MB, Util: 69%  global_step : 17673
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1674 | Mem: 26.53MB, Util: 69%  global_step : 17674
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1675 | Mem: 26.53MB, Util: 69%  global_step : 17675
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1676 | Mem: 26.53MB, Util: 69%  global_step : 17676
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1677 | Mem: 26.53MB, Util: 69%  global_step : 17677
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1678 | Mem: 26.53MB, Util: 69%  global_step : 17678
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1679 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  85%|████████▍ | 1696/2000 [00:14<00:02, 143.08it/s]
[Rank 2] Train Epoch 8:  85%|████████▍ | 1693/2000 [00:14<00:02, 144.65it/s]
[Rank 0] Train Epoch 8:  86%|████████▌ | 1715/2000 [00:14<00:01, 143.91it/s]
[Rank 1] Train Epoch 8:  86%|████████▌ | 1711/2000 [00:14<00:02, 141.14it/s]
[Rank 2] Train Epoch 8:  85%|████████▌ | 1708/2000 [00:14<00:02, 139.75it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1701 | Mem: 26.53MB, Util: 85%  global_step : 17701
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1702 | Mem: 26.53MB, Util: 85%  global_step : 17702
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1703 | Mem: 26.53MB, Util: 85%  global_step : 17703
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1704 | Mem: 26.53MB, Util: 85%  global_step : 17704
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1705 | Mem: 26.53MB, Util: 85%  global_step : 17705
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1706 | Mem: 26.53MB, Util: 85%  global_step : 17706
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1707 | Mem: 26.53MB, Util: 85%  global_step : 17707
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1708 | Mem: 26.53MB, Util: 85%  glob

[Rank 0] Train Epoch 8:  86%|████████▋ | 1730/2000 [00:14<00:01, 142.01it/s]
[Rank 1] Train Epoch 8:  86%|████████▋ | 1726/2000 [00:14<00:01, 139.56it/s]
[Rank 2] Train Epoch 8:  86%|████████▌ | 1723/2000 [00:14<00:01, 142.03it/s]
[Rank 0] Train Epoch 8:  87%|████████▋ | 1745/2000 [00:14<00:01, 141.74it/s]
[Rank 1] Train Epoch 8:  87%|████████▋ | 1740/2000 [00:14<00:01, 138.27it/s]
[Rank 2] Train Epoch 8:  87%|████████▋ | 1738/2000 [00:14<00:01, 143.87it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1730 | Mem: 26.53MB, Util: 79%  global_step : 17730
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1731 | Mem: 26.53MB, Util: 79%  global_step : 17731
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1732 | Mem: 26.53MB, Util: 79%  global_step : 17732
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1733 | Mem: 26.53MB, Util: 79%  global_step : 17733
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1734 | Mem: 26.53MB, Util: 79%  global_step : 17734
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1735 | Mem: 26.53MB, Util: 79%  global_step : 17735
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1736 | Mem: 26.53MB, Util: 79%  global_step : 17736
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1737 | Mem: 26.53MB, Util: 79%  glob

[Rank 0] Train Epoch 8:  88%|████████▊ | 1760/2000 [00:14<00:01, 141.29it/s]
[Rank 1] Train Epoch 8:  88%|████████▊ | 1754/2000 [00:14<00:01, 137.27it/s]
[Rank 2] Train Epoch 8:  88%|████████▊ | 1753/2000 [00:14<00:01, 145.14it/s]
[Rank 1] Train Epoch 8:  88%|████████▊ | 1768/2000 [00:14<00:01, 137.10it/s]
[Rank 2] Train Epoch 8:  88%|████████▊ | 1768/2000 [00:14<00:01, 146.31it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1760 | Mem: 26.53MB, Util: 99%  global_step : 17760
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1761 | Mem: 26.53MB, Util: 99%  global_step : 17761
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1762 | Mem: 26.53MB, Util: 99%  global_step : 17762
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1763 | Mem: 26.53MB, Util: 99%  global_step : 17763
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1764 | Mem: 26.53MB, Util: 99%  global_step : 17764
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1765 | Mem: 26.53MB, Util: 99%  global_step : 17765
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1766 | Mem: 26.53MB, Util: 99%  global_step : 17766
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1767 | Mem: 26.53MB, Util: 99%  glob

[Rank 0] Train Epoch 8:  89%|████████▉ | 1775/2000 [00:14<00:01, 140.35it/s]
[Rank 1] Train Epoch 8:  89%|████████▉ | 1782/2000 [00:14<00:01, 137.02it/s]
[Rank 2] Train Epoch 8:  89%|████████▉ | 1783/2000 [00:14<00:01, 146.73it/s]
[Rank 0] Train Epoch 8:  90%|████████▉ | 1790/2000 [00:14<00:01, 140.15it/s]
[Rank 1] Train Epoch 8:  90%|████████▉ | 1796/2000 [00:15<00:01, 136.90it/s]
[Rank 2] Train Epoch 8:  90%|████████▉ | 1798/2000 [00:15<00:01, 147.19it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1789 | Mem: 26.53MB, Util: 100%  global_step : 17789
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1790 | Mem: 26.53MB, Util: 100%  global_step : 17790
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1791 | Mem: 26.53MB, Util: 100%  global_step : 17791
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1792 | Mem: 26.53MB, Util: 100%  global_step : 17792
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1793 | Mem: 26.53MB, Util: 100%  global_step : 17793
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1794 | Mem: 26.53MB, Util: 100%  global_step : 17794
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1795 | Mem: 26.53MB, Util: 100%  global_step : 17795
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 8, Batch 1796 | Mem: 26.53MB, Util: 10

[Rank 0] Train Epoch 8:  90%|█████████ | 1805/2000 [00:15<00:01, 131.88it/s]
[Rank 1] Train Epoch 8:  90%|█████████ | 1810/2000 [00:15<00:01, 136.86it/s]
[Rank 2] Train Epoch 8:  91%|█████████ | 1813/2000 [00:15<00:01, 126.01it/s]
[Rank 1] Train Epoch 8:  91%|█████████ | 1824/2000 [00:15<00:01, 136.33it/s]
[Rank 0] Train Epoch 8:  91%|█████████ | 1820/2000 [00:15<00:01, 134.61it/s]
[Rank 2] Train Epoch 8:  91%|█████████▏| 1828/2000 [00:15<00:01, 131.37it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1815 | Mem: 26.53MB, Util: 65%  global_step : 17815
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1816 | Mem: 26.53MB, Util: 65%  global_step : 17816
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1817 | Mem: 26.53MB, Util: 65%  global_step : 17817
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1818 | Mem: 26.53MB, Util: 65%  global_step : 17818
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1819 | Mem: 26.53MB, Util: 65%  global_step : 17819
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1820 | Mem: 26.53MB, Util: 65%  global_step : 17820
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1821 | Mem: 26.53MB, Util: 64%  global_step : 17821
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1822 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  92%|█████████▏| 1838/2000 [00:15<00:01, 135.71it/s]
[Rank 0] Train Epoch 8:  92%|█████████▏| 1835/2000 [00:15<00:01, 136.50it/s]
[Rank 2] Train Epoch 8:  92%|█████████▏| 1844/2000 [00:15<00:01, 137.45it/s]
[Rank 1] Train Epoch 8:  93%|█████████▎| 1852/2000 [00:15<00:01, 134.74it/s]
[Rank 0] Train Epoch 8:  92%|█████████▎| 1850/2000 [00:15<00:01, 135.31it/s]
[Rank 2] Train Epoch 8:  93%|█████████▎| 1859/2000 [00:15<00:01, 139.73it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1843 | Mem: 26.53MB, Util: 64%  global_step : 17843
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1844 | Mem: 26.53MB, Util: 64%  global_step : 17844
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1845 | Mem: 26.53MB, Util: 64%  global_step : 17845
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1846 | Mem: 26.53MB, Util: 64%  global_step : 17846
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1847 | Mem: 26.53MB, Util: 64%  global_step : 17847
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1848 | Mem: 26.53MB, Util: 64%  global_step : 17848
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1849 | Mem: 26.53MB, Util: 63%  global_step : 17849
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1850 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  93%|█████████▎| 1866/2000 [00:15<00:00, 134.51it/s]
[Rank 0] Train Epoch 8:  93%|█████████▎| 1864/2000 [00:15<00:01, 134.74it/s]
[Rank 2] Train Epoch 8:  94%|█████████▎| 1874/2000 [00:15<00:00, 141.41it/s]
[Rank 1] Train Epoch 8:  94%|█████████▍| 1880/2000 [00:15<00:00, 134.34it/s]
[Rank 0] Train Epoch 8:  94%|█████████▍| 1879/2000 [00:15<00:00, 136.34it/s]
[Rank 2] Train Epoch 8:  94%|█████████▍| 1889/2000 [00:15<00:00, 142.84it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1871 | Mem: 26.53MB, Util: 63%  global_step : 17871
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1872 | Mem: 26.53MB, Util: 63%  global_step : 17872
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1873 | Mem: 26.53MB, Util: 63%  global_step : 17873
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1874 | Mem: 26.53MB, Util: 63%  global_step : 17874
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1875 | Mem: 26.53MB, Util: 63%  global_step : 17875
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1876 | Mem: 26.53MB, Util: 63%  global_step : 17876
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1877 | Mem: 26.53MB, Util: 63%  global_step : 17877
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1878 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  95%|█████████▍| 1894/2000 [00:15<00:00, 134.03it/s]
[Rank 0] Train Epoch 8:  95%|█████████▍| 1893/2000 [00:15<00:00, 136.29it/s]
[Rank 1] Train Epoch 8:  95%|█████████▌| 1908/2000 [00:15<00:00, 133.21it/s]
[Rank 0] Train Epoch 8:  95%|█████████▌| 1907/2000 [00:15<00:00, 129.17it/s]
[Rank 2] Train Epoch 8:  95%|█████████▌| 1904/2000 [00:15<00:00, 118.91it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1899 | Mem: 26.53MB, Util: 64%  global_step : 17899
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1900 | Mem: 26.53MB, Util: 64%  global_step : 17900
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1901 | Mem: 26.53MB, Util: 64%  global_step : 17901
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1902 | Mem: 26.53MB, Util: 64%  global_step : 17902
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1903 | Mem: 26.53MB, Util: 64%  global_step : 17903
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1904 | Mem: 26.53MB, Util: 64%  global_step : 17904
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1905 | Mem: 26.53MB, Util: 64%  global_step : 17905
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1906 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  96%|█████████▌| 1922/2000 [00:15<00:00, 132.98it/s]
[Rank 0] Train Epoch 8:  96%|█████████▌| 1921/2000 [00:15<00:00, 131.71it/s]
[Rank 2] Train Epoch 8:  96%|█████████▌| 1919/2000 [00:15<00:00, 125.70it/s]
[Rank 1] Train Epoch 8:  97%|█████████▋| 1936/2000 [00:16<00:00, 133.03it/s]
[Rank 0] Train Epoch 8:  97%|█████████▋| 1936/2000 [00:16<00:00, 134.41it/s]
[Rank 2] Train Epoch 8:  97%|█████████▋| 1934/2000 [00:16<00:00, 130.99it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1927 | Mem: 26.53MB, Util: 66%  global_step : 17927
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1928 | Mem: 26.53MB, Util: 65%  global_step : 17928
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1929 | Mem: 26.53MB, Util: 65%  global_step : 17929
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1930 | Mem: 26.53MB, Util: 65%  global_step : 17930
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1931 | Mem: 26.53MB, Util: 65%  global_step : 17931
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1932 | Mem: 26.53MB, Util: 65%  global_step : 17932
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1933 | Mem: 26.53MB, Util: 65%  global_step : 17933
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1934 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  98%|█████████▊| 1950/2000 [00:16<00:00, 132.58it/s]
[Rank 0] Train Epoch 8:  98%|█████████▊| 1951/2000 [00:16<00:00, 136.43it/s]
[Rank 2] Train Epoch 8:  97%|█████████▋| 1949/2000 [00:16<00:00, 135.47it/s]
[Rank 1] Train Epoch 8:  98%|█████████▊| 1964/2000 [00:16<00:00, 133.39it/s]
[Rank 0] Train Epoch 8:  98%|█████████▊| 1965/2000 [00:16<00:00, 135.55it/s]
[Rank 2] Train Epoch 8:  98%|█████████▊| 1964/2000 [00:16<00:00, 138.65it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1955 | Mem: 26.53MB, Util: 65%  global_step : 17955
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1956 | Mem: 26.53MB, Util: 65%  global_step : 17956
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1957 | Mem: 26.53MB, Util: 65%  global_step : 17957
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1958 | Mem: 26.53MB, Util: 65%  global_step : 17958
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1959 | Mem: 26.53MB, Util: 65%  global_step : 17959
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1960 | Mem: 26.53MB, Util: 65%  global_step : 17960
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1961 | Mem: 26.53MB, Util: 65%  global_step : 17961
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1962 | Mem: 26.53MB,

[Rank 1] Train Epoch 8:  99%|█████████▉| 1978/2000 [00:16<00:00, 133.35it/s]
[Rank 0] Train Epoch 8:  99%|█████████▉| 1979/2000 [00:16<00:00, 135.08it/s]
[Rank 2] Train Epoch 8:  99%|█████████▉| 1979/2000 [00:16<00:00, 140.59it/s]
[Rank 1] Train Epoch 8: 100%|█████████▉| 1992/2000 [00:16<00:00, 132.88it/s]
[Rank 0] Train Epoch 8: 100%|██████████| 2000/2000 [00:16<00:00, 120.85it/s]
[Rank 0] Test Epoch 8:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Train Epoch 8: 100%|██████████| 2000/2000 [00:16<00:00, 120.92it/s]
[Rank 2] Test Epoch 8:   0%|          | 0/334 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1982 | Mem: 26.53MB, Util: 65%  global_step : 17982
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1983 | Mem: 26.53MB, Util: 66%  global_step : 17983
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1984 | Mem: 26.53MB, Util: 66%  global_step : 17984
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1985 | Mem: 26.53MB, Util: 66%  global_step : 17985
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1986 | Mem: 26.53MB, Util: 66%  global_step : 17986
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1987 | Mem: 26.53MB, Util: 66%  global_step : 17987
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1988 | Mem: 26.53MB, Util: 66%  global_step : 17988
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 8, Batch 1989 | Mem: 26.53MB,

[Rank 1] Train Epoch 8: 100%|██████████| 2000/2000 [00:16<00:00, 120.67it/s]
[Rank 1] Test Epoch 8:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Test Epoch 8:   7%|▋         | 25/334 [00:00<00:01, 243.78it/s]
[Rank 2] Test Epoch 8:   4%|▍         | 14/334 [00:00<00:02, 136.61it/s]
[Rank 1] Test Epoch 8:  10%|▉         | 33/334 [00:00<00:00, 324.64it/s]
[Rank 0] Test Epoch 8:  18%|█▊        | 60/334 [00:00<00:00, 301.66it/s]
[Rank 2] Test Epoch 8:  15%|█▍        | 50/334 [00:00<00:01, 266.19it/s]
[Rank 1] Test Epoch 8:  20%|██        | 68/334 [00:00<00:00, 334.79it/s]
[Rank 0] Test Epoch 8:  28%|██▊       | 94/334 [00:00<00:00, 318.56it/s]
[Rank 2] Test Epoch 8:  26%|██▌       | 87/334 [00:00<00:00, 311.60it/s]
[Rank 1] Test Epoch 8:  31%|███       | 102/334 [00:00<00:00, 336.98it/s]
[Rank 0] Test Epoch 8:  39%|███▊      | 129/334 [00:00<00:00, 327.71it/s]
[Rank 2] Test Epoch 8:  37%|███▋      | 124/334 [00:00<00:00, 333.41it/s]
[Rank 1] Test Epoch 8:  41%|████      | 137/334 [00:00

[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [Rank 1] Epoch 8 | Loss: 0.3509, Acc: 0.8755, Model Checksum: ee7f61445e2162c70d70532be3720c38
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [ NodeId d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83243d Rank 1] Epoch 8 | Loss: 0.3509, Acc: 0.8755, Model Checksum: ee7f61445e2162c70d70532be3720c38
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 0 | Mem: 26.53MB, Util: 3%  global_step : 18000
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1 | Mem: 26.53MB, Util: 3%  global_step : 18001
[36m(TunerInternal pid=767)[0m 
[36m(TunerInternal pid=767)[0m Training finished iteration 9 at 2025-04-07 12:57:40. Total running time: 8min 49s
[36m(TunerInternal pid=767)[0m ╭────────────────────────────────────────────╮
[36m(TunerInternal pid=767)[0m │ Training result                            │
[36m(TunerInternal pid=767)[0m ├────────────────────────────────────────────┤


[Rank 1] Train Epoch 9:   1%|          | 13/2000 [00:00<00:15, 124.79it/s]
[Rank 0] Train Epoch 9:   1%|          | 14/2000 [00:00<00:15, 131.07it/s]
[Rank 2] Train Epoch 9:   1%|          | 14/2000 [00:00<00:14, 136.06it/s]
[Rank 1] Train Epoch 9:   1%|▏         | 27/2000 [00:00<00:15, 128.92it/s]
[Rank 0] Train Epoch 9:   1%|▏         | 28/2000 [00:00<00:14, 135.58it/s]
[Rank 2] Train Epoch 9:   1%|▏         | 29/2000 [00:00<00:13, 142.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 16 | Mem: 26.53MB, Util: 37%  global_step : 18016
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 17 | Mem: 26.53MB, Util: 37%  global_step : 18017
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 18 | Mem: 26.53MB, Util: 37%  global_step : 18018
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 19 | Mem: 26.53MB, Util: 37%  global_step : 18019
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 20 | Mem: 26.53MB, Util: 37%  global_step : 18020
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 21 | Mem: 26.53MB, Util: 37%  global_step : 18021
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 22 | Mem: 26.53MB, Util: 37%  global_step : 18022
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 23 | Mem: 26.53MB, Util: 37%  glob

[Rank 1] Train Epoch 9:   2%|▏         | 41/2000 [00:00<00:15, 130.01it/s]
[Rank 2] Train Epoch 9:   2%|▏         | 44/2000 [00:00<00:13, 142.39it/s]
[Rank 0] Train Epoch 9:   2%|▏         | 42/2000 [00:00<00:14, 136.29it/s]
[Rank 1] Train Epoch 9:   3%|▎         | 54/2000 [00:00<00:14, 129.78it/s]
[Rank 2] Train Epoch 9:   3%|▎         | 59/2000 [00:00<00:13, 142.85it/s]
[Rank 0] Train Epoch 9:   3%|▎         | 56/2000 [00:00<00:14, 137.42it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 43 | Mem: 26.53MB, Util: 64%  global_step : 18043
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 44 | Mem: 26.53MB, Util: 64%  global_step : 18044
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 45 | Mem: 26.53MB, Util: 64%  global_step : 18045
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 46 | Mem: 26.53MB, Util: 64%  global_step : 18046
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 47 | Mem: 26.53MB, Util: 64%  global_step : 18047
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 48 | Mem: 26.53MB, Util: 64%  global_step : 18048
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 49 | Mem: 26.53MB, Util: 64%  global_step : 18049
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 50 | Mem: 26.53MB, Util: 64%  glob

[Rank 1] Train Epoch 9:   3%|▎         | 68/2000 [00:00<00:14, 130.35it/s]
[Rank 2] Train Epoch 9:   4%|▎         | 74/2000 [00:00<00:13, 142.96it/s]
[Rank 0] Train Epoch 9:   4%|▎         | 70/2000 [00:00<00:13, 138.29it/s]
[Rank 1] Train Epoch 9:   4%|▍         | 82/2000 [00:00<00:14, 131.49it/s]
[Rank 0] Train Epoch 9:   4%|▍         | 84/2000 [00:00<00:13, 137.93it/s]
[Rank 2] Train Epoch 9:   4%|▍         | 89/2000 [00:00<00:13, 144.02it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 71 | Mem: 26.53MB, Util: 66%  global_step : 18071
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 72 | Mem: 26.53MB, Util: 66%  global_step : 18072
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 73 | Mem: 26.53MB, Util: 66%  global_step : 18073
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 74 | Mem: 26.53MB, Util: 66%  global_step : 18074
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 75 | Mem: 26.53MB, Util: 66%  global_step : 18075
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 76 | Mem: 26.53MB, Util: 66%  global_step : 18076
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 77 | Mem: 26.53MB, Util: 66%  global_step : 18077
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 78 | Mem: 26.53MB, Util: 66%  glob

[Rank 1] Train Epoch 9:   5%|▍         | 96/2000 [00:00<00:14, 131.92it/s]
[Rank 0] Train Epoch 9:   5%|▍         | 99/2000 [00:00<00:13, 138.49it/s]
[Rank 1] Train Epoch 9:   6%|▌         | 110/2000 [00:00<00:14, 132.20it/s]
[Rank 2] Train Epoch 9:   5%|▌         | 104/2000 [00:00<00:18, 103.78it/s]
[Rank 0] Train Epoch 9:   6%|▌         | 113/2000 [00:00<00:15, 125.12it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 99 | Mem: 26.53MB, Util: 64%  global_step : 18099
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 100 | Mem: 26.53MB, Util: 64%  global_step : 18100
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 101 | Mem: 26.53MB, Util: 64%  global_step : 18101
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 102 | Mem: 26.53MB, Util: 64%  global_step : 18102
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 103 | Mem: 26.53MB, Util: 64%  global_step : 18103
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 104 | Mem: 26.53MB, Util: 64%  global_step : 18104
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 105 | Mem: 26.53MB, Util: 64%  global_step : 18105
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 106 | Mem: 26.53MB, Util: 64

[Rank 1] Train Epoch 9:   6%|▌         | 124/2000 [00:00<00:14, 133.38it/s]
[Rank 2] Train Epoch 9:   6%|▌         | 119/2000 [00:00<00:16, 113.81it/s]
[Rank 0] Train Epoch 9:   6%|▋         | 127/2000 [00:00<00:14, 128.50it/s]
[Rank 1] Train Epoch 9:   7%|▋         | 138/2000 [00:01<00:14, 132.80it/s]
[Rank 2] Train Epoch 9:   7%|▋         | 134/2000 [00:01<00:15, 122.51it/s]
[Rank 0] Train Epoch 9:   7%|▋         | 142/2000 [00:01<00:14, 132.31it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 127 | Mem: 26.53MB, Util: 66%  global_step : 18127
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 128 | Mem: 26.53MB, Util: 66%  global_step : 18128
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 129 | Mem: 26.53MB, Util: 66%  global_step : 18129
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 130 | Mem: 26.53MB, Util: 66%  global_step : 18130
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 131 | Mem: 26.53MB, Util: 66%  global_step : 18131
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 132 | Mem: 26.53MB, Util: 66%  global_step : 18132
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 133 | Mem: 26.53MB, Util: 66%  global_step : 18133
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 134 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:   8%|▊         | 152/2000 [00:01<00:13, 132.32it/s]
[Rank 2] Train Epoch 9:   7%|▋         | 149/2000 [00:01<00:14, 129.16it/s]
[Rank 0] Train Epoch 9:   8%|▊         | 156/2000 [00:01<00:13, 133.54it/s]
[Rank 1] Train Epoch 9:   8%|▊         | 166/2000 [00:01<00:13, 132.48it/s]
[Rank 2] Train Epoch 9:   8%|▊         | 164/2000 [00:01<00:13, 134.29it/s]
[Rank 0] Train Epoch 9:   9%|▊         | 171/2000 [00:01<00:13, 137.51it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 155 | Mem: 26.53MB, Util: 67%  global_step : 18155
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 156 | Mem: 26.53MB, Util: 67%  global_step : 18156
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 157 | Mem: 26.53MB, Util: 67%  global_step : 18157
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 158 | Mem: 26.53MB, Util: 67%  global_step : 18158
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 159 | Mem: 26.53MB, Util: 67%  global_step : 18159
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 160 | Mem: 26.53MB, Util: 67%  global_step : 18160
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 161 | Mem: 26.53MB, Util: 67%  global_step : 18161
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 162 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:   9%|▉         | 180/2000 [00:01<00:13, 133.23it/s]
[Rank 2] Train Epoch 9:   9%|▉         | 179/2000 [00:01<00:13, 137.82it/s]
[Rank 0] Train Epoch 9:   9%|▉         | 186/2000 [00:01<00:12, 140.03it/s]
[Rank 1] Train Epoch 9:  10%|▉         | 194/2000 [00:01<00:13, 133.84it/s]
[Rank 2] Train Epoch 9:  10%|▉         | 194/2000 [00:01<00:12, 140.57it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 183 | Mem: 26.53MB, Util: 66%  global_step : 18183
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 184 | Mem: 26.53MB, Util: 66%  global_step : 18184
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 185 | Mem: 26.53MB, Util: 66%  global_step : 18185
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 186 | Mem: 26.53MB, Util: 66%  global_step : 18186
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 187 | Mem: 26.53MB, Util: 66%  global_step : 18187
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 188 | Mem: 26.53MB, Util: 66%  global_step : 18188
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 189 | Mem: 26.53MB, Util: 66%  global_step : 18189
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 190 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  10%|█         | 208/2000 [00:01<00:13, 134.50it/s]
[Rank 2] Train Epoch 9:  10%|█         | 209/2000 [00:01<00:14, 119.86it/s]
[Rank 0] Train Epoch 9:  10%|█         | 201/2000 [00:01<00:14, 125.90it/s]
[Rank 1] Train Epoch 9:  11%|█         | 222/2000 [00:01<00:13, 134.38it/s]
[Rank 2] Train Epoch 9:  11%|█         | 224/2000 [00:01<00:14, 126.49it/s]
[Rank 0] Train Epoch 9:  11%|█         | 216/2000 [00:01<00:13, 131.85it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 211 | Mem: 26.53MB, Util: 67%  global_step : 18211
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 212 | Mem: 26.53MB, Util: 67%  global_step : 18212
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 213 | Mem: 26.53MB, Util: 67%  global_step : 18213
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 214 | Mem: 26.53MB, Util: 67%  global_step : 18214
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 215 | Mem: 26.53MB, Util: 67%  global_step : 18215
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 216 | Mem: 26.53MB, Util: 67%  global_step : 18216
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 217 | Mem: 26.53MB, Util: 67%  global_step : 18217
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 218 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  12%|█▏        | 236/2000 [00:01<00:13, 133.41it/s]
[Rank 2] Train Epoch 9:  12%|█▏        | 239/2000 [00:01<00:13, 131.99it/s]
[Rank 0] Train Epoch 9:  12%|█▏        | 231/2000 [00:01<00:12, 136.09it/s]
[Rank 1] Train Epoch 9:  12%|█▎        | 250/2000 [00:01<00:13, 133.37it/s]
[Rank 2] Train Epoch 9:  13%|█▎        | 254/2000 [00:01<00:12, 135.76it/s]
[Rank 0] Train Epoch 9:  12%|█▏        | 246/2000 [00:01<00:12, 139.05it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 239 | Mem: 26.53MB, Util: 67%  global_step : 18239
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 240 | Mem: 26.53MB, Util: 67%  global_step : 18240
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 241 | Mem: 26.53MB, Util: 67%  global_step : 18241
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 242 | Mem: 26.53MB, Util: 67%  global_step : 18242
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 243 | Mem: 26.53MB, Util: 67%  global_step : 18243
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 244 | Mem: 26.53MB, Util: 67%  global_step : 18244
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 245 | Mem: 26.53MB, Util: 67%  global_step : 18245
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 246 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  13%|█▎        | 264/2000 [00:01<00:12, 134.02it/s]
[Rank 2] Train Epoch 9:  13%|█▎        | 269/2000 [00:02<00:12, 138.83it/s]
[Rank 0] Train Epoch 9:  13%|█▎        | 261/2000 [00:01<00:12, 139.50it/s]
[Rank 1] Train Epoch 9:  14%|█▍        | 278/2000 [00:02<00:12, 134.60it/s]
[Rank 2] Train Epoch 9:  14%|█▍        | 284/2000 [00:02<00:12, 139.95it/s]
[Rank 0] Train Epoch 9:  14%|█▍        | 276/2000 [00:02<00:12, 139.61it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 267 | Mem: 26.53MB, Util: 66%  global_step : 18267
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 268 | Mem: 26.53MB, Util: 66%  global_step : 18268
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 269 | Mem: 26.53MB, Util: 66%  global_step : 18269
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 270 | Mem: 26.53MB, Util: 66%  global_step : 18270
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 271 | Mem: 26.53MB, Util: 66%  global_step : 18271
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 272 | Mem: 26.53MB, Util: 66%  global_step : 18272
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 273 | Mem: 26.53MB, Util: 66%  global_step : 18273
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 274 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  15%|█▍        | 292/2000 [00:02<00:12, 135.28it/s]
[Rank 2] Train Epoch 9:  15%|█▍        | 299/2000 [00:02<00:11, 142.10it/s]
[Rank 0] Train Epoch 9:  15%|█▍        | 291/2000 [00:02<00:12, 140.77it/s]
[Rank 1] Train Epoch 9:  15%|█▌        | 306/2000 [00:02<00:12, 135.25it/s]
[Rank 0] Train Epoch 9:  15%|█▌        | 306/2000 [00:02<00:13, 123.77it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 295 | Mem: 26.53MB, Util: 68%  global_step : 18295
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 296 | Mem: 26.53MB, Util: 68%  global_step : 18296
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 297 | Mem: 26.53MB, Util: 68%  global_step : 18297
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 298 | Mem: 26.53MB, Util: 68%  global_step : 18298
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 299 | Mem: 26.53MB, Util: 68%  global_step : 18299
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 300 | Mem: 26.53MB, Util: 68%  global_step : 18300
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 301 | Mem: 26.53MB, Util: 68%  global_step : 18301
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 302 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  16%|█▌        | 320/2000 [00:02<00:12, 135.01it/s]
[Rank 2] Train Epoch 9:  16%|█▌        | 314/2000 [00:02<00:13, 123.56it/s]
[Rank 0] Train Epoch 9:  16%|█▌        | 321/2000 [00:02<00:13, 128.87it/s]
[Rank 1] Train Epoch 9:  17%|█▋        | 334/2000 [00:02<00:12, 135.25it/s]
[Rank 2] Train Epoch 9:  16%|█▋        | 329/2000 [00:02<00:12, 128.97it/s]
[Rank 0] Train Epoch 9:  17%|█▋        | 336/2000 [00:02<00:12, 131.91it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 324 | Mem: 26.53MB, Util: 72%  global_step : 18324
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 325 | Mem: 26.53MB, Util: 72%  global_step : 18325
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 326 | Mem: 26.53MB, Util: 72%  global_step : 18326
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 327 | Mem: 26.53MB, Util: 72%  global_step : 18327
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 328 | Mem: 26.53MB, Util: 72%  global_step : 18328
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 329 | Mem: 26.53MB, Util: 72%  global_step : 18329
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 330 | Mem: 26.53MB, Util: 72%  global_step : 18330
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 331 | Mem: 26.53MB, Util: 7

[Rank 1] Train Epoch 9:  17%|█▋        | 348/2000 [00:02<00:12, 135.18it/s]
[Rank 2] Train Epoch 9:  17%|█▋        | 344/2000 [00:02<00:12, 133.98it/s]
[Rank 0] Train Epoch 9:  18%|█▊        | 351/2000 [00:02<00:12, 134.80it/s]
[Rank 1] Train Epoch 9:  18%|█▊        | 362/2000 [00:02<00:12, 135.22it/s]
[Rank 2] Train Epoch 9:  18%|█▊        | 359/2000 [00:02<00:11, 137.26it/s]
[Rank 0] Train Epoch 9:  18%|█▊        | 366/2000 [00:02<00:11, 136.46it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 352 | Mem: 26.53MB, Util: 70%  global_step : 18352
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 353 | Mem: 26.53MB, Util: 70%  global_step : 18353
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 354 | Mem: 26.53MB, Util: 70%  global_step : 18354
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 355 | Mem: 26.53MB, Util: 70%  global_step : 18355
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 356 | Mem: 26.53MB, Util: 70%  global_step : 18356
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 357 | Mem: 26.53MB, Util: 70%  global_step : 18357
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 358 | Mem: 26.53MB, Util: 70%  global_step : 18358
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 359 | Mem: 26.53MB, Util: 7

[Rank 1] Train Epoch 9:  19%|█▉        | 376/2000 [00:02<00:11, 135.42it/s]
[Rank 2] Train Epoch 9:  19%|█▊        | 374/2000 [00:02<00:11, 140.34it/s]
[Rank 2] Train Epoch 9:  19%|█▉        | 389/2000 [00:02<00:11, 142.06it/s]
[Rank 0] Train Epoch 9:  19%|█▉        | 381/2000 [00:02<00:11, 138.03it/s]
[Rank 1] Train Epoch 9:  20%|█▉        | 390/2000 [00:02<00:11, 135.07it/s]
[Rank 0] Train Epoch 9:  20%|█▉        | 396/2000 [00:02<00:11, 138.89it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 380 | Mem: 26.53MB, Util: 67%  global_step : 18380
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 381 | Mem: 26.53MB, Util: 67%  global_step : 18381
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 382 | Mem: 26.53MB, Util: 67%  global_step : 18382
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 383 | Mem: 26.53MB, Util: 67%  global_step : 18383
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 384 | Mem: 26.53MB, Util: 67%  global_step : 18384
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 385 | Mem: 26.53MB, Util: 67%  global_step : 18385
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 386 | Mem: 26.53MB, Util: 67%  global_step : 18386
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 387 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  20%|██        | 404/2000 [00:03<00:11, 136.19it/s]
[Rank 2] Train Epoch 9:  20%|██        | 404/2000 [00:03<00:12, 124.36it/s]
[Rank 0] Train Epoch 9:  20%|██        | 410/2000 [00:03<00:12, 129.89it/s]
[Rank 1] Train Epoch 9:  21%|██        | 418/2000 [00:03<00:11, 136.88it/s]
[Rank 2] Train Epoch 9:  21%|██        | 419/2000 [00:03<00:12, 130.37it/s]
[Rank 0] Train Epoch 9:  21%|██▏       | 425/2000 [00:03<00:11, 133.52it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 409 | Mem: 26.53MB, Util: 69%  global_step : 18409
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 410 | Mem: 26.53MB, Util: 69%  global_step : 18410
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 411 | Mem: 26.53MB, Util: 69%  global_step : 18411
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 412 | Mem: 26.53MB, Util: 69%  global_step : 18412
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 413 | Mem: 26.53MB, Util: 69%  global_step : 18413
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 414 | Mem: 26.53MB, Util: 69%  global_step : 18414
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 415 | Mem: 26.53MB, Util: 68%  global_step : 18415
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 416 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  22%|██▏       | 432/2000 [00:03<00:11, 137.49it/s]
[Rank 2] Train Epoch 9:  22%|██▏       | 434/2000 [00:03<00:11, 134.78it/s]
[Rank 0] Train Epoch 9:  22%|██▏       | 440/2000 [00:03<00:11, 135.99it/s]
[Rank 1] Train Epoch 9:  22%|██▏       | 447/2000 [00:03<00:11, 138.38it/s]
[Rank 2] Train Epoch 9:  22%|██▏       | 449/2000 [00:03<00:11, 137.99it/s]
[Rank 0] Train Epoch 9:  23%|██▎       | 455/2000 [00:03<00:11, 137.38it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 438 | Mem: 26.53MB, Util: 68%  global_step : 18438
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 439 | Mem: 26.53MB, Util: 68%  global_step : 18439
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 440 | Mem: 26.53MB, Util: 68%  global_step : 18440
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 441 | Mem: 26.53MB, Util: 68%  global_step : 18441
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 442 | Mem: 26.53MB, Util: 68%  global_step : 18442
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 443 | Mem: 26.53MB, Util: 68%  global_step : 18443
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 444 | Mem: 26.53MB, Util: 70%  global_step : 18444
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 445 | Mem: 26.53MB, Util: 7

[Rank 1] Train Epoch 9:  23%|██▎       | 461/2000 [00:03<00:11, 137.58it/s]
[Rank 2] Train Epoch 9:  23%|██▎       | 464/2000 [00:03<00:10, 140.08it/s]
[Rank 0] Train Epoch 9:  24%|██▎       | 470/2000 [00:03<00:11, 138.74it/s]
[Rank 1] Train Epoch 9:  24%|██▍       | 475/2000 [00:03<00:11, 136.07it/s]
[Rank 2] Train Epoch 9:  24%|██▍       | 479/2000 [00:03<00:10, 142.41it/s]
[Rank 0] Train Epoch 9:  24%|██▍       | 485/2000 [00:03<00:10, 139.39it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 466 | Mem: 26.53MB, Util: 70%  global_step : 18466
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 467 | Mem: 26.53MB, Util: 70%  global_step : 18467
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 468 | Mem: 26.53MB, Util: 70%  global_step : 18468
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 469 | Mem: 26.53MB, Util: 70%  global_step : 18469
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 470 | Mem: 26.53MB, Util: 70%  global_step : 18470
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 471 | Mem: 26.53MB, Util: 70%  global_step : 18471
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 472 | Mem: 26.53MB, Util: 68%  global_step : 18472
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 473 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  24%|██▍       | 489/2000 [00:03<00:11, 135.86it/s]
[Rank 2] Train Epoch 9:  25%|██▍       | 494/2000 [00:03<00:10, 143.75it/s]
[Rank 0] Train Epoch 9:  25%|██▌       | 500/2000 [00:03<00:10, 140.13it/s]
[Rank 1] Train Epoch 9:  25%|██▌       | 503/2000 [00:03<00:11, 135.16it/s]
[Rank 2] Train Epoch 9:  25%|██▌       | 509/2000 [00:03<00:11, 126.16it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 494 | Mem: 26.53MB, Util: 68%  global_step : 18494
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 495 | Mem: 26.53MB, Util: 68%  global_step : 18495
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 496 | Mem: 26.53MB, Util: 68%  global_step : 18496
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 497 | Mem: 26.53MB, Util: 68%  global_step : 18497
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 498 | Mem: 26.53MB, Util: 68%  global_step : 18498
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 499 | Mem: 26.53MB, Util: 68%  global_step : 18499
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 500 | Mem: 26.53MB, Util: 68%  global_step : 18500
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 501 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  26%|██▌       | 517/2000 [00:03<00:10, 135.15it/s]
[Rank 2] Train Epoch 9:  26%|██▌       | 524/2000 [00:03<00:11, 131.49it/s]
[Rank 0] Train Epoch 9:  26%|██▌       | 515/2000 [00:03<00:11, 130.33it/s]
[Rank 1] Train Epoch 9:  27%|██▋       | 531/2000 [00:03<00:10, 135.21it/s]
[Rank 2] Train Epoch 9:  27%|██▋       | 539/2000 [00:04<00:10, 135.63it/s]
[Rank 0] Train Epoch 9:  26%|██▋       | 530/2000 [00:03<00:11, 133.38it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 522 | Mem: 26.53MB, Util: 68%  global_step : 18522
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 523 | Mem: 26.53MB, Util: 68%  global_step : 18523
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 524 | Mem: 26.53MB, Util: 68%  global_step : 18524
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 525 | Mem: 26.53MB, Util: 68%  global_step : 18525
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 526 | Mem: 26.53MB, Util: 68%  global_step : 18526
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 527 | Mem: 26.53MB, Util: 66%  global_step : 18527
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 528 | Mem: 26.53MB, Util: 66%  global_step : 18528
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 529 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  27%|██▋       | 545/2000 [00:04<00:10, 134.81it/s]
[Rank 2] Train Epoch 9:  28%|██▊       | 554/2000 [00:04<00:10, 138.85it/s]
[Rank 0] Train Epoch 9:  27%|██▋       | 545/2000 [00:04<00:10, 135.89it/s]
[Rank 0] Train Epoch 9:  28%|██▊       | 559/2000 [00:04<00:10, 136.83it/s]
[Rank 1] Train Epoch 9:  28%|██▊       | 559/2000 [00:04<00:10, 134.70it/s]
[Rank 2] Train Epoch 9:  28%|██▊       | 569/2000 [00:04<00:10, 141.09it/s]
[Rank 0] Train Epoch 9:  29%|██▊       | 574/2000 [00:04<00:10, 138.23it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 550 | Mem: 26.53MB, Util: 66%  global_step : 18550
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 551 | Mem: 26.53MB, Util: 66%  global_step : 18551
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 552 | Mem: 26.53MB, Util: 66%  global_step : 18552
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 553 | Mem: 26.53MB, Util: 66%  global_step : 18553
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 554 | Mem: 26.53MB, Util: 66%  global_step : 18554
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 555 | Mem: 26.53MB, Util: 66%  global_step : 18555
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 556 | Mem: 26.53MB, Util: 66%  global_step : 18556
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 557 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  29%|██▊       | 573/2000 [00:04<00:10, 135.07it/s]
[Rank 2] Train Epoch 9:  29%|██▉       | 584/2000 [00:04<00:09, 143.64it/s]
[Rank 0] Train Epoch 9:  29%|██▉       | 588/2000 [00:04<00:10, 136.58it/s]
[Rank 1] Train Epoch 9:  29%|██▉       | 587/2000 [00:04<00:10, 135.33it/s]
[Rank 2] Train Epoch 9:  30%|██▉       | 599/2000 [00:04<00:09, 144.92it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 578 | Mem: 26.53MB, Util: 67%  global_step : 18578
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 579 | Mem: 26.53MB, Util: 67%  global_step : 18579
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 580 | Mem: 26.53MB, Util: 67%  global_step : 18580
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 581 | Mem: 26.53MB, Util: 67%  global_step : 18581
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 582 | Mem: 26.53MB, Util: 67%  global_step : 18582
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 583 | Mem: 26.53MB, Util: 67%  global_step : 18583
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 584 | Mem: 26.53MB, Util: 67%  global_step : 18584
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 585 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  30%|███       | 601/2000 [00:04<00:10, 132.42it/s]
[Rank 0] Train Epoch 9:  30%|███       | 602/2000 [00:04<00:11, 126.23it/s]
[Rank 1] Train Epoch 9:  31%|███       | 616/2000 [00:04<00:10, 135.81it/s]
[Rank 2] Train Epoch 9:  31%|███       | 614/2000 [00:04<00:11, 120.98it/s]
[Rank 0] Train Epoch 9:  31%|███       | 617/2000 [00:04<00:10, 131.89it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 606 | Mem: 26.53MB, Util: 67%  global_step : 18606
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 607 | Mem: 26.53MB, Util: 67%  global_step : 18607
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 608 | Mem: 26.53MB, Util: 67%  global_step : 18608
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 609 | Mem: 26.53MB, Util: 67%  global_step : 18609
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 610 | Mem: 26.53MB, Util: 67%  global_step : 18610
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 611 | Mem: 26.53MB, Util: 67%  global_step : 18611
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 612 | Mem: 26.53MB, Util: 67%  global_step : 18612
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 613 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  32%|███▏      | 631/2000 [00:04<00:09, 138.54it/s]
[Rank 2] Train Epoch 9:  31%|███▏      | 629/2000 [00:04<00:10, 127.30it/s]
[Rank 0] Train Epoch 9:  32%|███▏      | 632/2000 [00:04<00:10, 135.40it/s]
[Rank 1] Train Epoch 9:  32%|███▏      | 645/2000 [00:04<00:10, 135.06it/s]
[Rank 2] Train Epoch 9:  32%|███▏      | 644/2000 [00:04<00:10, 132.70it/s]
[Rank 0] Train Epoch 9:  32%|███▏      | 647/2000 [00:04<00:09, 138.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 635 | Mem: 26.53MB, Util: 65%  global_step : 18635
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 636 | Mem: 26.53MB, Util: 65%  global_step : 18636
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 637 | Mem: 26.53MB, Util: 65%  global_step : 18637
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 638 | Mem: 26.53MB, Util: 65%  global_step : 18638
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 639 | Mem: 26.53MB, Util: 65%  global_step : 18639
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 640 | Mem: 26.53MB, Util: 65%  global_step : 18640
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 641 | Mem: 26.53MB, Util: 68%  global_step : 18641
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 642 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  33%|███▎      | 659/2000 [00:04<00:09, 134.62it/s]
[Rank 2] Train Epoch 9:  33%|███▎      | 659/2000 [00:04<00:09, 136.76it/s]
[Rank 0] Train Epoch 9:  33%|███▎      | 662/2000 [00:04<00:09, 140.96it/s]
[Rank 1] Train Epoch 9:  34%|███▎      | 673/2000 [00:05<00:09, 134.65it/s]
[Rank 2] Train Epoch 9:  34%|███▎      | 674/2000 [00:05<00:09, 139.62it/s]
[Rank 0] Train Epoch 9:  34%|███▍      | 677/2000 [00:05<00:09, 141.50it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 663 | Mem: 26.53MB, Util: 68%  global_step : 18663
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 664 | Mem: 26.53MB, Util: 68%  global_step : 18664
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 665 | Mem: 26.53MB, Util: 68%  global_step : 18665
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 666 | Mem: 26.53MB, Util: 64%  global_step : 18666
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 667 | Mem: 26.53MB, Util: 64%  global_step : 18667
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 668 | Mem: 26.53MB, Util: 64%  global_step : 18668
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 669 | Mem: 26.53MB, Util: 64%  global_step : 18669
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 670 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  34%|███▍      | 687/2000 [00:05<00:09, 135.14it/s]
[Rank 2] Train Epoch 9:  34%|███▍      | 689/2000 [00:05<00:09, 141.79it/s]
[Rank 0] Train Epoch 9:  35%|███▍      | 692/2000 [00:05<00:09, 142.03it/s]
[Rank 1] Train Epoch 9:  35%|███▌      | 701/2000 [00:05<00:09, 135.49it/s]
[Rank 2] Train Epoch 9:  35%|███▌      | 704/2000 [00:05<00:10, 125.34it/s]
[Rank 0] Train Epoch 9:  35%|███▌      | 707/2000 [00:05<00:10, 127.28it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 691 | Mem: 26.53MB, Util: 64%  global_step : 18691
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 692 | Mem: 26.53MB, Util: 64%  global_step : 18692
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 693 | Mem: 26.53MB, Util: 64%  global_step : 18693
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 694 | Mem: 26.53MB, Util: 64%  global_step : 18694
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 695 | Mem: 26.53MB, Util: 67%  global_step : 18695
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 696 | Mem: 26.53MB, Util: 67%  global_step : 18696
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 697 | Mem: 26.53MB, Util: 67%  global_step : 18697
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 698 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  36%|███▌      | 715/2000 [00:05<00:09, 135.85it/s]
[Rank 2] Train Epoch 9:  36%|███▌      | 721/2000 [00:05<00:09, 136.17it/s]
[Rank 0] Train Epoch 9:  36%|███▌      | 722/2000 [00:05<00:09, 132.39it/s]
[Rank 1] Train Epoch 9:  36%|███▋      | 729/2000 [00:05<00:09, 135.89it/s]
[Rank 2] Train Epoch 9:  37%|███▋      | 738/2000 [00:05<00:08, 143.40it/s]
[Rank 0] Train Epoch 9:  37%|███▋      | 739/2000 [00:05<00:08, 141.87it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 720 | Mem: 26.53MB, Util: 67%  global_step : 18720
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 721 | Mem: 26.53MB, Util: 67%  global_step : 18721
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 722 | Mem: 26.53MB, Util: 67%  global_step : 18722
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 723 | Mem: 26.53MB, Util: 67%  global_step : 18723
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 724 | Mem: 26.53MB, Util: 67%  global_step : 18724
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 725 | Mem: 26.53MB, Util: 67%  global_step : 18725
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 726 | Mem: 26.53MB, Util: 67%  global_step : 18726
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 727 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  37%|███▋      | 743/2000 [00:05<00:09, 135.55it/s]
[Rank 2] Train Epoch 9:  38%|███▊      | 755/2000 [00:05<00:08, 149.01it/s]
[Rank 0] Train Epoch 9:  38%|███▊      | 757/2000 [00:05<00:08, 152.49it/s]
[Rank 1] Train Epoch 9:  38%|███▊      | 757/2000 [00:05<00:09, 135.40it/s]
[Rank 2] Train Epoch 9:  39%|███▊      | 772/2000 [00:05<00:08, 153.05it/s]
[Rank 0] Train Epoch 9:  39%|███▊      | 774/2000 [00:05<00:07, 154.63it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 748 | Mem: 26.53MB, Util: 67%  global_step : 18748
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 749 | Mem: 26.53MB, Util: 67%  global_step : 18749
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 750 | Mem: 26.53MB, Util: 66%  global_step : 18750
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 751 | Mem: 26.53MB, Util: 66%  global_step : 18751
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 752 | Mem: 26.53MB, Util: 66%  global_step : 18752
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 753 | Mem: 26.53MB, Util: 66%  global_step : 18753
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 754 | Mem: 26.53MB, Util: 66%  global_step : 18754
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 755 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  39%|███▊      | 771/2000 [00:05<00:09, 135.36it/s]
[Rank 2] Train Epoch 9:  39%|███▉      | 789/2000 [00:05<00:07, 156.00it/s]
[Rank 0] Train Epoch 9:  40%|███▉      | 790/2000 [00:05<00:08, 144.00it/s]
[Rank 1] Train Epoch 9:  39%|███▉      | 785/2000 [00:05<00:08, 135.34it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 776 | Mem: 26.53MB, Util: 66%  global_step : 18776
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 777 | Mem: 26.53MB, Util: 66%  global_step : 18777
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 778 | Mem: 26.53MB, Util: 66%  global_step : 18778
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 779 | Mem: 26.53MB, Util: 66%  global_step : 18779
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 780 | Mem: 26.53MB, Util: 66%  global_step : 18780
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 781 | Mem: 26.53MB, Util: 66%  global_step : 18781
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 782 | Mem: 26.53MB, Util: 66%  global_step : 18782
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 783 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  40%|███▉      | 799/2000 [00:05<00:08, 134.94it/s]
[Rank 2] Train Epoch 9:  40%|████      | 805/2000 [00:06<00:10, 115.67it/s]
[Rank 0] Train Epoch 9:  40%|████      | 805/2000 [00:05<00:10, 118.77it/s]
[Rank 1] Train Epoch 9:  41%|████      | 813/2000 [00:06<00:08, 134.92it/s]
[Rank 2] Train Epoch 9:  41%|████      | 820/2000 [00:06<00:09, 122.69it/s]
[Rank 0] Train Epoch 9:  41%|████      | 819/2000 [00:06<00:09, 123.09it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 804 | Mem: 26.53MB, Util: 66%  global_step : 18804
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 805 | Mem: 26.53MB, Util: 66%  global_step : 18805
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 806 | Mem: 26.53MB, Util: 66%  global_step : 18806
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 807 | Mem: 26.53MB, Util: 66%  global_step : 18807
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 808 | Mem: 26.53MB, Util: 67%  global_step : 18808
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 809 | Mem: 26.53MB, Util: 67%  global_step : 18809
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 810 | Mem: 26.53MB, Util: 67%  global_step : 18810
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 811 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  41%|████▏     | 827/2000 [00:06<00:08, 134.38it/s]
[Rank 2] Train Epoch 9:  42%|████▏     | 835/2000 [00:06<00:09, 129.27it/s]
[Rank 0] Train Epoch 9:  42%|████▏     | 834/2000 [00:06<00:09, 127.75it/s]
[Rank 1] Train Epoch 9:  42%|████▏     | 841/2000 [00:06<00:08, 134.70it/s]
[Rank 2] Train Epoch 9:  42%|████▎     | 850/2000 [00:06<00:08, 134.16it/s]
[Rank 0] Train Epoch 9:  42%|████▏     | 849/2000 [00:06<00:08, 131.71it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 832 | Mem: 26.53MB, Util: 67%  global_step : 18832
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 833 | Mem: 26.53MB, Util: 67%  global_step : 18833
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 834 | Mem: 26.53MB, Util: 67%  global_step : 18834
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 835 | Mem: 26.53MB, Util: 67%  global_step : 18835
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 836 | Mem: 26.53MB, Util: 67%  global_step : 18836
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 837 | Mem: 26.53MB, Util: 67%  global_step : 18837
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 838 | Mem: 26.53MB, Util: 67%  global_step : 18838
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 839 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  43%|████▎     | 855/2000 [00:06<00:08, 134.85it/s]
[Rank 2] Train Epoch 9:  43%|████▎     | 865/2000 [00:06<00:08, 137.39it/s]
[Rank 0] Train Epoch 9:  43%|████▎     | 864/2000 [00:06<00:08, 134.33it/s]
[Rank 1] Train Epoch 9:  43%|████▎     | 869/2000 [00:06<00:08, 133.28it/s]
[Rank 2] Train Epoch 9:  44%|████▍     | 880/2000 [00:06<00:07, 140.07it/s]
[Rank 0] Train Epoch 9:  44%|████▍     | 879/2000 [00:06<00:08, 136.57it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 860 | Mem: 26.53MB, Util: 67%  global_step : 18860
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 861 | Mem: 26.53MB, Util: 67%  global_step : 18861
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 862 | Mem: 26.53MB, Util: 67%  global_step : 18862
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 863 | Mem: 26.53MB, Util: 67%  global_step : 18863
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 864 | Mem: 26.53MB, Util: 66%  global_step : 18864
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 865 | Mem: 26.53MB, Util: 66%  global_step : 18865
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 866 | Mem: 26.53MB, Util: 66%  global_step : 18866
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 867 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  44%|████▍     | 883/2000 [00:06<00:08, 133.38it/s]
[Rank 2] Train Epoch 9:  45%|████▍     | 895/2000 [00:06<00:07, 142.28it/s]
[Rank 0] Train Epoch 9:  45%|████▍     | 893/2000 [00:06<00:08, 137.34it/s]
[Rank 1] Train Epoch 9:  45%|████▍     | 897/2000 [00:06<00:08, 132.28it/s]
[Rank 0] Train Epoch 9:  45%|████▌     | 907/2000 [00:06<00:08, 124.01it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 887 | Mem: 26.53MB, Util: 66%  global_step : 18887
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 888 | Mem: 26.53MB, Util: 66%  global_step : 18888
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 889 | Mem: 26.53MB, Util: 66%  global_step : 18889
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 890 | Mem: 26.53MB, Util: 66%  global_step : 18890
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 891 | Mem: 26.53MB, Util: 66%  global_step : 18891
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 892 | Mem: 26.53MB, Util: 66%  global_step : 18892
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 893 | Mem: 26.53MB, Util: 66%  global_step : 18893
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 894 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  46%|████▌     | 911/2000 [00:06<00:08, 131.17it/s]
[Rank 2] Train Epoch 9:  46%|████▌     | 910/2000 [00:06<00:09, 118.97it/s]
[Rank 0] Train Epoch 9:  46%|████▌     | 922/2000 [00:06<00:08, 129.29it/s]
[Rank 1] Train Epoch 9:  46%|████▋     | 925/2000 [00:06<00:08, 129.26it/s]
[Rank 2] Train Epoch 9:  46%|████▋     | 925/2000 [00:06<00:08, 125.80it/s]
[Rank 0] Train Epoch 9:  47%|████▋     | 937/2000 [00:06<00:08, 132.66it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 914 | Mem: 26.53MB, Util: 66%  global_step : 18914
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 915 | Mem: 26.53MB, Util: 66%  global_step : 18915
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 916 | Mem: 26.53MB, Util: 66%  global_step : 18916
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 917 | Mem: 26.53MB, Util: 64%  global_step : 18917
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 918 | Mem: 26.53MB, Util: 64%  global_step : 18918
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 919 | Mem: 26.53MB, Util: 64%  global_step : 18919
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 920 | Mem: 26.53MB, Util: 64%  global_step : 18920
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 921 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  47%|████▋     | 938/2000 [00:06<00:08, 128.30it/s]
[Rank 2] Train Epoch 9:  47%|████▋     | 940/2000 [00:07<00:08, 131.57it/s]
[Rank 2] Train Epoch 9:  48%|████▊     | 955/2000 [00:07<00:07, 135.82it/s]
[Rank 0] Train Epoch 9:  48%|████▊     | 952/2000 [00:07<00:07, 135.69it/s]
[Rank 1] Train Epoch 9:  48%|████▊     | 951/2000 [00:07<00:08, 128.18it/s]
[Rank 2] Train Epoch 9:  48%|████▊     | 970/2000 [00:07<00:07, 139.44it/s]
[Rank 0] Train Epoch 9:  48%|████▊     | 967/2000 [00:07<00:07, 137.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 940 | Mem: 26.53MB, Util: 63%  global_step : 18940
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 941 | Mem: 26.53MB, Util: 63%  global_step : 18941
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 942 | Mem: 26.53MB, Util: 63%  global_step : 18942
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 943 | Mem: 26.53MB, Util: 63%  global_step : 18943
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 944 | Mem: 26.53MB, Util: 63%  global_step : 18944
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 945 | Mem: 26.53MB, Util: 63%  global_step : 18945
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 946 | Mem: 26.53MB, Util: 63%  global_step : 18946
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 947 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  48%|████▊     | 964/2000 [00:07<00:08, 127.99it/s]
[Rank 2] Train Epoch 9:  49%|████▉     | 986/2000 [00:07<00:07, 142.55it/s]
[Rank 0] Train Epoch 9:  49%|████▉     | 982/2000 [00:07<00:07, 139.79it/s]
[Rank 1] Train Epoch 9:  49%|████▉     | 977/2000 [00:07<00:08, 126.79it/s]
[Rank 0] Train Epoch 9:  50%|████▉     | 998/2000 [00:07<00:07, 142.99it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 967 | Mem: 26.53MB, Util: 65%  global_step : 18967
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 968 | Mem: 26.53MB, Util: 65%  global_step : 18968
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 969 | Mem: 26.53MB, Util: 65%  global_step : 18969
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 970 | Mem: 26.53MB, Util: 65%  global_step : 18970
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 971 | Mem: 26.53MB, Util: 65%  global_step : 18971
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 972 | Mem: 26.53MB, Util: 65%  global_step : 18972
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 973 | Mem: 26.53MB, Util: 65%  global_step : 18973
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 974 | Mem: 26.53MB, Util: 6

[Rank 1] Train Epoch 9:  50%|████▉     | 990/2000 [00:07<00:07, 126.94it/s]
[Rank 1] Train Epoch 9:  50%|█████     | 1003/2000 [00:07<00:07, 127.33it/s]
[Rank 2] Train Epoch 9:  50%|█████     | 1001/2000 [00:07<00:09, 109.27it/s]
[Rank 0] Train Epoch 9:  51%|█████     | 1013/2000 [00:07<00:09, 108.75it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 993 | Mem: 26.53MB, Util: 63%  global_step : 18993
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 994 | Mem: 26.53MB, Util: 63%  global_step : 18994
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 995 | Mem: 26.53MB, Util: 63%  global_step : 18995
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 996 | Mem: 26.53MB, Util: 63%  global_step : 18996
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 997 | Mem: 26.53MB, Util: 63%  global_step : 18997
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 998 | Mem: 26.53MB, Util: 63%  global_step : 18998
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 999 | Mem: 26.53MB, Util: 63%  global_step : 18999
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1000 | Mem: 26.53MB, Util: 

[Rank 1] Train Epoch 9:  51%|█████     | 1017/2000 [00:07<00:07, 128.33it/s]
[Rank 2] Train Epoch 9:  51%|█████     | 1016/2000 [00:07<00:08, 118.19it/s]
[Rank 2] Train Epoch 9:  52%|█████▏    | 1031/2000 [00:07<00:07, 125.51it/s]
[Rank 0] Train Epoch 9:  51%|█████▏    | 1029/2000 [00:07<00:08, 118.93it/s]
[Rank 1] Train Epoch 9:  52%|█████▏    | 1030/2000 [00:07<00:07, 127.71it/s]
[Rank 2] Train Epoch 9:  52%|█████▏    | 1046/2000 [00:07<00:07, 131.53it/s]
[Rank 0] Train Epoch 9:  52%|█████▏    | 1045/2000 [00:07<00:07, 126.86it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1020 | Mem: 26.53MB, Util: 67%  global_step : 19020
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1021 | Mem: 26.53MB, Util: 67%  global_step : 19021
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1022 | Mem: 26.53MB, Util: 67%  global_step : 19022
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1023 | Mem: 26.53MB, Util: 67%  global_step : 19023
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1024 | Mem: 26.53MB, Util: 67%  global_step : 19024
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1025 | Mem: 26.53MB, Util: 67%  global_step : 19025
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1026 | Mem: 26.53MB, Util: 67%  global_step : 19026
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1027 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  52%|█████▏    | 1043/2000 [00:07<00:07, 127.69it/s]
[Rank 2] Train Epoch 9:  53%|█████▎    | 1061/2000 [00:07<00:06, 135.31it/s]
[Rank 0] Train Epoch 9:  53%|█████▎    | 1059/2000 [00:07<00:07, 125.21it/s]
[Rank 1] Train Epoch 9:  53%|█████▎    | 1056/2000 [00:07<00:07, 127.52it/s]
[Rank 2] Train Epoch 9:  54%|█████▍    | 1076/2000 [00:08<00:06, 138.67it/s]
[Rank 0] Train Epoch 9:  54%|█████▍    | 1075/2000 [00:08<00:06, 132.38it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1047 | Mem: 26.53MB, Util: 78%  global_step : 19047
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1048 | Mem: 26.53MB, Util: 78%  global_step : 19048
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1049 | Mem: 26.53MB, Util: 78%  global_step : 19049
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1050 | Mem: 26.53MB, Util: 78%  global_step : 19050
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1051 | Mem: 26.53MB, Util: 78%  global_step : 19051
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1052 | Mem: 26.53MB, Util: 78%  global_step : 19052
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1053 | Mem: 26.53MB, Util: 78%  global_step : 19053
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1054 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  53%|█████▎    | 1069/2000 [00:08<00:07, 127.43it/s]
[Rank 2] Train Epoch 9:  55%|█████▍    | 1091/2000 [00:08<00:06, 141.38it/s]
[Rank 0] Train Epoch 9:  55%|█████▍    | 1091/2000 [00:08<00:06, 137.18it/s]
[Rank 1] Train Epoch 9:  54%|█████▍    | 1082/2000 [00:08<00:07, 126.86it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1073 | Mem: 26.53MB, Util: 64%  global_step : 19073
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1074 | Mem: 26.53MB, Util: 64%  global_step : 19074
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1075 | Mem: 26.53MB, Util: 64%  global_step : 19075
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1076 | Mem: 26.53MB, Util: 64%  global_step : 19076
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1077 | Mem: 26.53MB, Util: 64%  global_step : 19077
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1078 | Mem: 26.53MB, Util: 64%  global_step : 19078
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1079 | Mem: 26.53MB, Util: 64%  global_step : 19079
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1080 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  55%|█████▍    | 1095/2000 [00:08<00:07, 127.31it/s]
[Rank 2] Train Epoch 9:  55%|█████▌    | 1106/2000 [00:08<00:08, 110.58it/s]
[Rank 0] Train Epoch 9:  55%|█████▌    | 1106/2000 [00:08<00:07, 116.28it/s]
[Rank 1] Train Epoch 9:  55%|█████▌    | 1108/2000 [00:08<00:06, 127.58it/s]
[Rank 2] Train Epoch 9:  56%|█████▌    | 1123/2000 [00:08<00:07, 123.43it/s]
[Rank 0] Train Epoch 9:  56%|█████▌    | 1121/2000 [00:08<00:07, 122.91it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1100 | Mem: 26.53MB, Util: 65%  global_step : 19100
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1101 | Mem: 26.53MB, Util: 65%  global_step : 19101
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1102 | Mem: 26.53MB, Util: 65%  global_step : 19102
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1103 | Mem: 26.53MB, Util: 65%  global_step : 19103
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1104 | Mem: 26.53MB, Util: 65%  global_step : 19104
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1105 | Mem: 26.53MB, Util: 65%  global_step : 19105
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1106 | Mem: 26.53MB, Util: 65%  global_step : 19106
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1107 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  56%|█████▌    | 1121/2000 [00:08<00:06, 127.53it/s]
[Rank 2] Train Epoch 9:  57%|█████▋    | 1139/2000 [00:08<00:06, 132.63it/s]
[Rank 0] Train Epoch 9:  57%|█████▋    | 1134/2000 [00:08<00:07, 122.87it/s]
[Rank 1] Train Epoch 9:  57%|█████▋    | 1134/2000 [00:08<00:06, 127.27it/s]
[Rank 2] Train Epoch 9:  58%|█████▊    | 1156/2000 [00:08<00:06, 140.47it/s]
[Rank 0] Train Epoch 9:  57%|█████▋    | 1148/2000 [00:08<00:06, 125.12it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1126 | Mem: 26.53MB, Util: 62%  global_step : 19126
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1127 | Mem: 26.53MB, Util: 62%  global_step : 19127
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1128 | Mem: 26.53MB, Util: 62%  global_step : 19128
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1129 | Mem: 26.53MB, Util: 62%  global_step : 19129
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1130 | Mem: 26.53MB, Util: 62%  global_step : 19130
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1131 | Mem: 26.53MB, Util: 62%  global_step : 19131
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1132 | Mem: 26.53MB, Util: 62%  global_step : 19132
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1133 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  57%|█████▋    | 1147/2000 [00:08<00:06, 127.53it/s]
[Rank 2] Train Epoch 9:  59%|█████▊    | 1173/2000 [00:08<00:05, 146.36it/s]
[Rank 0] Train Epoch 9:  58%|█████▊    | 1161/2000 [00:08<00:06, 125.83it/s]
[Rank 1] Train Epoch 9:  58%|█████▊    | 1160/2000 [00:08<00:06, 127.60it/s]
[Rank 2] Train Epoch 9:  59%|█████▉    | 1189/2000 [00:08<00:05, 149.77it/s]
[Rank 0] Train Epoch 9:  59%|█████▉    | 1175/2000 [00:08<00:06, 127.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1153 | Mem: 26.53MB, Util: 65%  global_step : 19153
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1154 | Mem: 26.53MB, Util: 65%  global_step : 19154
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1155 | Mem: 26.53MB, Util: 65%  global_step : 19155
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1156 | Mem: 26.53MB, Util: 65%  global_step : 19156
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1157 | Mem: 26.53MB, Util: 65%  global_step : 19157
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1158 | Mem: 26.53MB, Util: 65%  global_step : 19158
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1159 | Mem: 26.53MB, Util: 65%  global_step : 19159
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1160 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  59%|█████▊    | 1173/2000 [00:08<00:06, 127.18it/s]
[Rank 0] Train Epoch 9:  59%|█████▉    | 1189/2000 [00:08<00:06, 128.58it/s]
[Rank 1] Train Epoch 9:  59%|█████▉    | 1186/2000 [00:08<00:06, 126.93it/s]
[Rank 0] Train Epoch 9:  60%|██████    | 1202/2000 [00:09<00:06, 120.39it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1179 | Mem: 26.53MB, Util: 65%  global_step : 19179
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1180 | Mem: 26.53MB, Util: 65%  global_step : 19180
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1181 | Mem: 26.53MB, Util: 65%  global_step : 19181
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1182 | Mem: 26.53MB, Util: 65%  global_step : 19182
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1183 | Mem: 26.53MB, Util: 65%  global_step : 19183
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1184 | Mem: 26.53MB, Util: 65%  global_step : 19184
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1185 | Mem: 26.53MB, Util: 65%  global_step : 19185
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1186 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  60%|█████▉    | 1199/2000 [00:09<00:06, 127.07it/s]
[Rank 2] Train Epoch 9:  60%|██████    | 1205/2000 [00:09<00:07, 104.63it/s]
[Rank 0] Train Epoch 9:  61%|██████    | 1216/2000 [00:09<00:06, 123.42it/s]
[Rank 1] Train Epoch 9:  61%|██████    | 1212/2000 [00:09<00:06, 127.58it/s]
[Rank 2] Train Epoch 9:  61%|██████    | 1222/2000 [00:09<00:06, 117.52it/s]
[Rank 0] Train Epoch 9:  62%|██████▏   | 1230/2000 [00:09<00:06, 126.25it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1206 | Mem: 26.53MB, Util: 61%  global_step : 19206
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1207 | Mem: 26.53MB, Util: 61%  global_step : 19207
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1208 | Mem: 26.53MB, Util: 61%  global_step : 19208
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1209 | Mem: 26.53MB, Util: 61%  global_step : 19209
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1210 | Mem: 26.53MB, Util: 61%  global_step : 19210
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1211 | Mem: 26.53MB, Util: 61%  global_step : 19211
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1212 | Mem: 26.53MB, Util: 61%  global_step : 19212
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1213 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  61%|██████▏   | 1226/2000 [00:09<00:05, 129.71it/s]
[Rank 2] Train Epoch 9:  62%|██████▏   | 1239/2000 [00:09<00:05, 128.48it/s]
[Rank 0] Train Epoch 9:  62%|██████▏   | 1244/2000 [00:09<00:05, 128.60it/s]
[Rank 1] Train Epoch 9:  62%|██████▏   | 1240/2000 [00:09<00:05, 130.81it/s]
[Rank 2] Train Epoch 9:  63%|██████▎   | 1256/2000 [00:09<00:05, 137.52it/s]
[Rank 0] Train Epoch 9:  63%|██████▎   | 1258/2000 [00:09<00:05, 130.20it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1234 | Mem: 26.53MB, Util: 64%  global_step : 19234
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1235 | Mem: 26.53MB, Util: 64%  global_step : 19235
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1236 | Mem: 26.53MB, Util: 64%  global_step : 19236
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1237 | Mem: 26.53MB, Util: 64%  global_step : 19237
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1238 | Mem: 26.53MB, Util: 64%  global_step : 19238
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1239 | Mem: 26.53MB, Util: 64%  global_step : 19239
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1240 | Mem: 26.53MB, Util: 64%  global_step : 19240
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1241 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  63%|██████▎   | 1254/2000 [00:09<00:05, 131.88it/s]
[Rank 2] Train Epoch 9:  64%|██████▎   | 1273/2000 [00:09<00:05, 144.48it/s]
[Rank 0] Train Epoch 9:  64%|██████▎   | 1272/2000 [00:09<00:05, 131.17it/s]
[Rank 1] Train Epoch 9:  63%|██████▎   | 1268/2000 [00:09<00:05, 132.53it/s]
[Rank 2] Train Epoch 9:  64%|██████▍   | 1290/2000 [00:09<00:04, 149.98it/s]
[Rank 0] Train Epoch 9:  64%|██████▍   | 1286/2000 [00:09<00:05, 131.59it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1262 | Mem: 26.53MB, Util: 65%  global_step : 19262
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1263 | Mem: 26.53MB, Util: 65%  global_step : 19263
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1264 | Mem: 26.53MB, Util: 65%  global_step : 19264
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1265 | Mem: 26.53MB, Util: 65%  global_step : 19265
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1266 | Mem: 26.53MB, Util: 65%  global_step : 19266
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1267 | Mem: 26.53MB, Util: 65%  global_step : 19267
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1268 | Mem: 26.53MB, Util: 65%  global_step : 19268
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1269 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  64%|██████▍   | 1282/2000 [00:09<00:05, 132.39it/s]
[Rank 0] Train Epoch 9:  65%|██████▌   | 1300/2000 [00:09<00:05, 132.66it/s]
[Rank 1] Train Epoch 9:  65%|██████▍   | 1296/2000 [00:09<00:05, 132.24it/s]
[Rank 2] Train Epoch 9:  65%|██████▌   | 1306/2000 [00:09<00:06, 110.25it/s]
[Rank 0] Train Epoch 9:  66%|██████▌   | 1314/2000 [00:09<00:05, 131.10it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1289 | Mem: 26.53MB, Util: 66%  global_step : 19289
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1290 | Mem: 26.53MB, Util: 66%  global_step : 19290
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1291 | Mem: 26.53MB, Util: 66%  global_step : 19291
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1292 | Mem: 26.53MB, Util: 66%  global_step : 19292
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1293 | Mem: 26.53MB, Util: 66%  global_step : 19293
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1294 | Mem: 26.53MB, Util: 66%  global_step : 19294
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1295 | Mem: 26.53MB, Util: 66%  global_step : 19295
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1296 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  66%|██████▌   | 1310/2000 [00:09<00:05, 132.36it/s]
[Rank 2] Train Epoch 9:  66%|██████▌   | 1322/2000 [00:10<00:05, 121.18it/s]
[Rank 0] Train Epoch 9:  66%|██████▋   | 1328/2000 [00:10<00:05, 132.22it/s]
[Rank 1] Train Epoch 9:  66%|██████▌   | 1324/2000 [00:09<00:05, 132.68it/s]
[Rank 2] Train Epoch 9:  67%|██████▋   | 1339/2000 [00:10<00:05, 131.12it/s]
[Rank 0] Train Epoch 9:  67%|██████▋   | 1342/2000 [00:10<00:04, 132.91it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1317 | Mem: 26.53MB, Util: 66%  global_step : 19317
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1318 | Mem: 26.53MB, Util: 66%  global_step : 19318
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1319 | Mem: 26.53MB, Util: 66%  global_step : 19319
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1320 | Mem: 26.53MB, Util: 66%  global_step : 19320
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1321 | Mem: 26.53MB, Util: 66%  global_step : 19321
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1322 | Mem: 26.53MB, Util: 66%  global_step : 19322
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1323 | Mem: 26.53MB, Util: 66%  global_step : 19323
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1324 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  67%|██████▋   | 1338/2000 [00:10<00:04, 133.20it/s]
[Rank 2] Train Epoch 9:  68%|██████▊   | 1356/2000 [00:10<00:04, 139.01it/s]
[Rank 0] Train Epoch 9:  68%|██████▊   | 1356/2000 [00:10<00:04, 133.12it/s]
[Rank 1] Train Epoch 9:  68%|██████▊   | 1352/2000 [00:10<00:04, 133.59it/s]
[Rank 2] Train Epoch 9:  69%|██████▊   | 1372/2000 [00:10<00:04, 143.96it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1344 | Mem: 26.53MB, Util: 68%  global_step : 19344
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1345 | Mem: 26.53MB, Util: 68%  global_step : 19345
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1346 | Mem: 26.53MB, Util: 68%  global_step : 19346
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1347 | Mem: 26.53MB, Util: 68%  global_step : 19347
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1348 | Mem: 26.53MB, Util: 68%  global_step : 19348
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1349 | Mem: 26.53MB, Util: 68%  global_step : 19349
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1350 | Mem: 26.53MB, Util: 68%  global_step : 19350
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1351 | Mem: 26.53MB, Util: 68%  glob

[Rank 0] Train Epoch 9:  68%|██████▊   | 1370/2000 [00:10<00:04, 131.66it/s]
[Rank 1] Train Epoch 9:  68%|██████▊   | 1366/2000 [00:10<00:04, 133.47it/s]
[Rank 2] Train Epoch 9:  69%|██████▉   | 1388/2000 [00:10<00:04, 145.11it/s]
[Rank 0] Train Epoch 9:  69%|██████▉   | 1384/2000 [00:10<00:04, 129.70it/s]
[Rank 1] Train Epoch 9:  69%|██████▉   | 1380/2000 [00:10<00:04, 133.52it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1371 | Mem: 26.53MB, Util: 64%  global_step : 19371
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1372 | Mem: 26.53MB, Util: 64%  global_step : 19372
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1373 | Mem: 26.53MB, Util: 64%  global_step : 19373
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1374 | Mem: 26.53MB, Util: 64%  global_step : 19374
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1375 | Mem: 26.53MB, Util: 64%  global_step : 19375
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1376 | Mem: 26.53MB, Util: 64%  global_step : 19376
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1377 | Mem: 26.53MB, Util: 64%  global_step : 19377
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1378 | Mem: 26.53MB, Util: 64%  glob

[Rank 0] Train Epoch 9:  70%|██████▉   | 1397/2000 [00:10<00:04, 128.74it/s]
[Rank 1] Train Epoch 9:  70%|██████▉   | 1394/2000 [00:10<00:04, 133.42it/s]
[Rank 2] Train Epoch 9:  70%|███████   | 1404/2000 [00:10<00:05, 109.71it/s]
[Rank 0] Train Epoch 9:  70%|███████   | 1410/2000 [00:10<00:04, 128.60it/s]
[Rank 1] Train Epoch 9:  70%|███████   | 1408/2000 [00:10<00:04, 126.86it/s]
[Rank 2] Train Epoch 9:  71%|███████   | 1419/2000 [00:10<00:04, 118.20it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1398 | Mem: 26.53MB, Util: 63%  global_step : 19398
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1399 | Mem: 26.53MB, Util: 63%  global_step : 19399
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1400 | Mem: 26.53MB, Util: 63%  global_step : 19400
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1401 | Mem: 26.53MB, Util: 63%  global_step : 19401
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1402 | Mem: 26.53MB, Util: 63%  global_step : 19402
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1403 | Mem: 26.53MB, Util: 63%  global_step : 19403
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1404 | Mem: 26.53MB, Util: 63%  global_step : 19404
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1405 | Mem: 26.53MB, Util: 63%  glob

[Rank 0] Train Epoch 9:  71%|███████   | 1424/2000 [00:10<00:04, 129.58it/s]
[Rank 1] Train Epoch 9:  71%|███████   | 1421/2000 [00:10<00:04, 127.66it/s]
[Rank 2] Train Epoch 9:  72%|███████▏  | 1436/2000 [00:10<00:04, 129.32it/s]
[Rank 0] Train Epoch 9:  72%|███████▏  | 1438/2000 [00:10<00:04, 129.90it/s]
[Rank 1] Train Epoch 9:  72%|███████▏  | 1435/2000 [00:10<00:04, 128.64it/s]
[Rank 2] Train Epoch 9:  73%|███████▎  | 1452/2000 [00:10<00:03, 137.12it/s]


[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1425 | Mem: 26.53MB, Util: 64%  global_step : 19425
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1426 | Mem: 26.53MB, Util: 64%  global_step : 19426
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1427 | Mem: 26.53MB, Util: 64%  global_step : 19427
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1428 | Mem: 26.53MB, Util: 64%  global_step : 19428
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1429 | Mem: 26.53MB, Util: 64%  global_step : 19429
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1430 | Mem: 26.53MB, Util: 64%  global_step : 19430
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1431 | Mem: 26.53MB, Util: 64%  global_step : 19431
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [GPU LOG] Epoch 9, Batch 1432 | Mem: 26.53MB, Util: 64%  glob

[Rank 0] Train Epoch 9:  73%|███████▎  | 1452/2000 [00:10<00:04, 130.25it/s]
[Rank 1] Train Epoch 9:  72%|███████▏  | 1449/2000 [00:10<00:04, 129.22it/s]
[Rank 2] Train Epoch 9:  74%|███████▎  | 1470/2000 [00:11<00:03, 146.70it/s]
[Rank 1] Train Epoch 9:  73%|███████▎  | 1463/2000 [00:11<00:04, 130.01it/s]
[Rank 0] Train Epoch 9:  73%|███████▎  | 1466/2000 [00:11<00:04, 130.97it/s]
[Rank 2] Train Epoch 9:  74%|███████▍  | 1489/2000 [00:11<00:03, 158.34it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1453 | Mem: 26.53MB, Util: 82%  global_step : 19453
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1454 | Mem: 26.53MB, Util: 82%  global_step : 19454
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1455 | Mem: 26.53MB, Util: 82%  global_step : 19455
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1456 | Mem: 26.53MB, Util: 82%  global_step : 19456
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1457 | Mem: 26.53MB, Util: 82%  global_step : 19457
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1458 | Mem: 26.53MB, Util: 82%  global_step : 19458
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1459 | Mem: 26.53MB, Util: 70%  global_step : 19459
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1460 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  74%|███████▍  | 1477/2000 [00:11<00:04, 130.06it/s]
[Rank 0] Train Epoch 9:  74%|███████▍  | 1480/2000 [00:11<00:03, 130.06it/s]
[Rank 1] Train Epoch 9:  75%|███████▍  | 1491/2000 [00:11<00:03, 129.77it/s]
[Rank 0] Train Epoch 9:  75%|███████▍  | 1494/2000 [00:11<00:03, 129.86it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1480 | Mem: 26.53MB, Util: 66%  global_step : 19480
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1481 | Mem: 26.53MB, Util: 66%  global_step : 19481
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1482 | Mem: 26.53MB, Util: 66%  global_step : 19482
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1483 | Mem: 26.53MB, Util: 66%  global_step : 19483
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1484 | Mem: 26.53MB, Util: 66%  global_step : 19484
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1485 | Mem: 26.53MB, Util: 66%  global_step : 19485
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1486 | Mem: 26.53MB, Util: 66%  global_step : 19486
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1487 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  75%|███████▌  | 1505/2000 [00:11<00:03, 130.03it/s]
[Rank 0] Train Epoch 9:  75%|███████▌  | 1507/2000 [00:11<00:03, 129.46it/s]
[Rank 2] Train Epoch 9:  75%|███████▌  | 1506/2000 [00:11<00:04, 111.34it/s]
[Rank 1] Train Epoch 9:  76%|███████▌  | 1519/2000 [00:11<00:03, 130.36it/s]
[Rank 0] Train Epoch 9:  76%|███████▌  | 1521/2000 [00:11<00:03, 130.38it/s]
[Rank 2] Train Epoch 9:  76%|███████▋  | 1525/2000 [00:11<00:03, 127.92it/s]
[Rank 2] Train Epoch 9:  77%|███████▋  | 1544/2000 [00:11<00:03, 142.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1507 | Mem: 26.53MB, Util: 71%  global_step : 19507
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1508 | Mem: 26.53MB, Util: 71%  global_step : 19508
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1509 | Mem: 26.53MB, Util: 71%  global_step : 19509
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1510 | Mem: 26.53MB, Util: 71%  global_step : 19510
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1511 | Mem: 26.53MB, Util: 71%  global_step : 19511
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1512 | Mem: 26.53MB, Util: 71%  global_step : 19512
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1513 | Mem: 26.53MB, Util: 71%  global_step : 19513
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1514 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  77%|███████▋  | 1533/2000 [00:11<00:03, 130.33it/s]
[Rank 0] Train Epoch 9:  77%|███████▋  | 1535/2000 [00:11<00:03, 130.96it/s]
[Rank 2] Train Epoch 9:  78%|███████▊  | 1563/2000 [00:11<00:02, 154.27it/s]
[Rank 1] Train Epoch 9:  77%|███████▋  | 1547/2000 [00:11<00:03, 131.03it/s]
[Rank 0] Train Epoch 9:  77%|███████▋  | 1549/2000 [00:11<00:03, 128.65it/s]
[Rank 2] Train Epoch 9:  79%|███████▉  | 1581/2000 [00:11<00:02, 157.70it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1535 | Mem: 26.53MB, Util: 83%  global_step : 19535
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1536 | Mem: 26.53MB, Util: 83%  global_step : 19536
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1537 | Mem: 26.53MB, Util: 83%  global_step : 19537
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1538 | Mem: 26.53MB, Util: 83%  global_step : 19538
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1539 | Mem: 26.53MB, Util: 83%  global_step : 19539
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1540 | Mem: 26.53MB, Util: 83%  global_step : 19540
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1541 | Mem: 26.53MB, Util: 83%  global_step : 19541
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1542 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  78%|███████▊  | 1561/2000 [00:11<00:03, 131.35it/s]
[Rank 0] Train Epoch 9:  78%|███████▊  | 1563/2000 [00:11<00:03, 131.17it/s]
[Rank 1] Train Epoch 9:  79%|███████▉  | 1575/2000 [00:11<00:03, 130.57it/s]
[Rank 0] Train Epoch 9:  79%|███████▉  | 1578/2000 [00:11<00:03, 133.96it/s]
[Rank 2] Train Epoch 9:  80%|███████▉  | 1598/2000 [00:11<00:02, 150.87it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1562 | Mem: 26.53MB, Util: 74%  global_step : 19562
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1563 | Mem: 26.53MB, Util: 74%  global_step : 19563
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1564 | Mem: 26.53MB, Util: 74%  global_step : 19564
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1565 | Mem: 26.53MB, Util: 74%  global_step : 19565
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1566 | Mem: 26.53MB, Util: 74%  global_step : 19566
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1567 | Mem: 26.53MB, Util: 74%  global_step : 19567
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1568 | Mem: 26.53MB, Util: 74%  global_step : 19568
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1569 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  79%|███████▉  | 1589/2000 [00:12<00:03, 130.98it/s]
[Rank 0] Train Epoch 9:  80%|███████▉  | 1592/2000 [00:12<00:03, 135.42it/s]
[Rank 1] Train Epoch 9:  80%|████████  | 1603/2000 [00:12<00:03, 131.44it/s]
[Rank 0] Train Epoch 9:  80%|████████  | 1606/2000 [00:12<00:02, 131.47it/s]
[Rank 2] Train Epoch 9:  81%|████████  | 1614/2000 [00:12<00:03, 106.30it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1590 | Mem: 26.53MB, Util: 83%  global_step : 19590
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1591 | Mem: 26.53MB, Util: 83%  global_step : 19591
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1592 | Mem: 26.53MB, Util: 83%  global_step : 19592
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1593 | Mem: 26.53MB, Util: 83%  global_step : 19593
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1594 | Mem: 26.53MB, Util: 83%  global_step : 19594
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1595 | Mem: 26.53MB, Util: 83%  global_step : 19595
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1596 | Mem: 26.53MB, Util: 83%  global_step : 19596
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1597 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  81%|████████  | 1617/2000 [00:12<00:02, 131.19it/s]
[Rank 0] Train Epoch 9:  81%|████████  | 1620/2000 [00:12<00:02, 133.79it/s]
[Rank 2] Train Epoch 9:  82%|████████▏ | 1634/2000 [00:12<00:02, 124.51it/s]
[Rank 1] Train Epoch 9:  82%|████████▏ | 1631/2000 [00:12<00:02, 131.51it/s]
[Rank 0] Train Epoch 9:  82%|████████▏ | 1634/2000 [00:12<00:02, 135.27it/s]
[Rank 2] Train Epoch 9:  83%|████████▎ | 1653/2000 [00:12<00:02, 139.36it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1617 | Mem: 26.53MB, Util: 65%  global_step : 19617
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1618 | Mem: 26.53MB, Util: 65%  global_step : 19618
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1619 | Mem: 26.53MB, Util: 65%  global_step : 19619
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1620 | Mem: 26.53MB, Util: 65%  global_step : 19620
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1621 | Mem: 26.53MB, Util: 65%  global_step : 19621
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1622 | Mem: 26.53MB, Util: 65%  global_step : 19622
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1623 | Mem: 26.53MB, Util: 65%  global_step : 19623
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1624 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  82%|████████▏ | 1645/2000 [00:12<00:02, 131.79it/s]
[Rank 0] Train Epoch 9:  82%|████████▏ | 1648/2000 [00:12<00:02, 136.40it/s]
[Rank 2] Train Epoch 9:  84%|████████▎ | 1672/2000 [00:12<00:02, 151.58it/s]
[Rank 0] Train Epoch 9:  83%|████████▎ | 1663/2000 [00:12<00:02, 137.86it/s]
[Rank 2] Train Epoch 9:  84%|████████▍ | 1689/2000 [00:12<00:02, 148.97it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1645 | Mem: 26.53MB, Util: 63%  global_step : 19645
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1646 | Mem: 26.53MB, Util: 63%  global_step : 19646
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1647 | Mem: 26.53MB, Util: 63%  global_step : 19647
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1648 | Mem: 26.53MB, Util: 63%  global_step : 19648
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1649 | Mem: 26.53MB, Util: 63%  global_step : 19649
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1650 | Mem: 26.53MB, Util: 63%  global_step : 19650
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1651 | Mem: 26.53MB, Util: 63%  global_step : 19651
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1652 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  83%|████████▎ | 1659/2000 [00:12<00:02, 131.51it/s]
[Rank 0] Train Epoch 9:  84%|████████▍ | 1677/2000 [00:12<00:02, 138.35it/s]
[Rank 1] Train Epoch 9:  84%|████████▎ | 1673/2000 [00:12<00:02, 128.44it/s]
[Rank 0] Train Epoch 9:  85%|████████▍ | 1691/2000 [00:12<00:02, 133.53it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1671 | Mem: 26.53MB, Util: 64%  global_step : 19671
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1672 | Mem: 26.53MB, Util: 64%  global_step : 19672
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1673 | Mem: 26.53MB, Util: 64%  global_step : 19673
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1674 | Mem: 26.53MB, Util: 64%  global_step : 19674
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1675 | Mem: 26.53MB, Util: 64%  global_step : 19675
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1676 | Mem: 26.53MB, Util: 64%  global_step : 19676
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1677 | Mem: 26.53MB, Util: 64%  global_step : 19677
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1678 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  84%|████████▍ | 1687/2000 [00:12<00:02, 130.15it/s]
[Rank 2] Train Epoch 9:  85%|████████▌ | 1706/2000 [00:12<00:02, 101.62it/s]
[Rank 1] Train Epoch 9:  85%|████████▌ | 1701/2000 [00:12<00:02, 131.55it/s]
[Rank 0] Train Epoch 9:  85%|████████▌ | 1705/2000 [00:12<00:02, 121.89it/s]
[Rank 2] Train Epoch 9:  86%|████████▋ | 1725/2000 [00:13<00:02, 118.41it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1699 | Mem: 26.53MB, Util: 63%  global_step : 19699
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1700 | Mem: 26.53MB, Util: 63%  global_step : 19700
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1701 | Mem: 26.53MB, Util: 63%  global_step : 19701
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1702 | Mem: 26.53MB, Util: 63%  global_step : 19702
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1703 | Mem: 26.53MB, Util: 63%  global_step : 19703
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1704 | Mem: 26.53MB, Util: 63%  global_step : 19704
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1705 | Mem: 26.53MB, Util: 63%  global_step : 19705
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1706 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  86%|████████▌ | 1715/2000 [00:12<00:02, 132.70it/s]
[Rank 0] Train Epoch 9:  86%|████████▌ | 1718/2000 [00:13<00:02, 123.32it/s]
[Rank 2] Train Epoch 9:  87%|████████▋ | 1744/2000 [00:13<00:01, 133.31it/s]
[Rank 1] Train Epoch 9:  86%|████████▋ | 1729/2000 [00:13<00:02, 133.86it/s]
[Rank 0] Train Epoch 9:  87%|████████▋ | 1731/2000 [00:13<00:02, 124.94it/s]
[Rank 2] Train Epoch 9:  88%|████████▊ | 1763/2000 [00:13<00:01, 146.59it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1727 | Mem: 26.53MB, Util: 74%  global_step : 19727
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1728 | Mem: 26.53MB, Util: 74%  global_step : 19728
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1729 | Mem: 26.53MB, Util: 74%  global_step : 19729
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1730 | Mem: 26.53MB, Util: 74%  global_step : 19730
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1731 | Mem: 26.53MB, Util: 74%  global_step : 19731
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1732 | Mem: 26.53MB, Util: 74%  global_step : 19732
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1733 | Mem: 26.53MB, Util: 74%  global_step : 19733
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1734 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  87%|████████▋ | 1743/2000 [00:13<00:01, 134.77it/s]
[Rank 0] Train Epoch 9:  87%|████████▋ | 1744/2000 [00:13<00:02, 124.46it/s]
[Rank 2] Train Epoch 9:  89%|████████▉ | 1781/2000 [00:13<00:01, 153.97it/s]
[Rank 1] Train Epoch 9:  88%|████████▊ | 1757/2000 [00:13<00:01, 133.80it/s]
[Rank 0] Train Epoch 9:  88%|████████▊ | 1759/2000 [00:13<00:01, 129.82it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1755 | Mem: 26.53MB, Util: 100%  global_step : 19755
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1756 | Mem: 26.53MB, Util: 100%  global_step : 19756
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1757 | Mem: 26.53MB, Util: 100%  global_step : 19757
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1758 | Mem: 26.53MB, Util: 100%  global_step : 19758
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1759 | Mem: 26.53MB, Util: 100%  global_step : 19759
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1760 | Mem: 26.53MB, Util: 100%  global_step : 19760
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1761 | Mem: 26.53MB, Util: 100%  global_step : 19761
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1762 | Mem: 2

[Rank 1] Train Epoch 9:  89%|████████▊ | 1771/2000 [00:13<00:01, 131.52it/s]
[Rank 0] Train Epoch 9:  89%|████████▊ | 1773/2000 [00:13<00:01, 132.64it/s]
[Rank 2] Train Epoch 9:  90%|████████▉ | 1798/2000 [00:13<00:01, 130.50it/s]
[Rank 1] Train Epoch 9:  89%|████████▉ | 1785/2000 [00:13<00:01, 131.47it/s]
[Rank 0] Train Epoch 9:  89%|████████▉ | 1787/2000 [00:13<00:01, 134.69it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1782 | Mem: 26.53MB, Util: 100%  global_step : 19782
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1783 | Mem: 26.53MB, Util: 100%  global_step : 19783
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1784 | Mem: 26.53MB, Util: 100%  global_step : 19784
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1785 | Mem: 26.53MB, Util: 100%  global_step : 19785
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1786 | Mem: 26.53MB, Util: 100%  global_step : 19786
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1787 | Mem: 26.53MB, Util: 100%  global_step : 19787
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1788 | Mem: 26.53MB, Util: 100%  global_step : 19788
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1789 | Mem: 2

[Rank 1] Train Epoch 9:  90%|████████▉ | 1799/2000 [00:13<00:01, 130.50it/s]
[Rank 0] Train Epoch 9:  90%|█████████ | 1801/2000 [00:13<00:01, 132.68it/s]
[Rank 2] Train Epoch 9:  91%|█████████ | 1813/2000 [00:13<00:01, 108.75it/s]
[Rank 1] Train Epoch 9:  91%|█████████ | 1813/2000 [00:13<00:01, 130.58it/s]
[Rank 0] Train Epoch 9:  91%|█████████ | 1816/2000 [00:13<00:01, 134.71it/s]
[Rank 2] Train Epoch 9:  92%|█████████▏| 1832/2000 [00:13<00:01, 125.45it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1810 | Mem: 26.53MB, Util: 87%  global_step : 19810
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1811 | Mem: 26.53MB, Util: 87%  global_step : 19811
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1812 | Mem: 26.53MB, Util: 87%  global_step : 19812
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1813 | Mem: 26.53MB, Util: 87%  global_step : 19813
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1814 | Mem: 26.53MB, Util: 87%  global_step : 19814
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1815 | Mem: 26.53MB, Util: 87%  global_step : 19815
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1816 | Mem: 26.53MB, Util: 87%  global_step : 19816
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1817 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  91%|█████████▏| 1827/2000 [00:13<00:01, 131.40it/s]
[Rank 0] Train Epoch 9:  92%|█████████▏| 1830/2000 [00:13<00:01, 135.28it/s]
[Rank 0] Train Epoch 9:  92%|█████████▏| 1844/2000 [00:13<00:01, 136.16it/s]
[Rank 2] Train Epoch 9:  93%|█████████▎| 1851/2000 [00:13<00:01, 139.64it/s]
[Rank 1] Train Epoch 9:  92%|█████████▏| 1841/2000 [00:13<00:01, 131.04it/s]
[Rank 0] Train Epoch 9:  93%|█████████▎| 1858/2000 [00:14<00:01, 137.02it/s]
[Rank 2] Train Epoch 9:  94%|█████████▎| 1870/2000 [00:14<00:00, 151.69it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1837 | Mem: 26.53MB, Util: 67%  global_step : 19837
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1838 | Mem: 26.53MB, Util: 67%  global_step : 19838
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1839 | Mem: 26.53MB, Util: 67%  global_step : 19839
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1840 | Mem: 26.53MB, Util: 67%  global_step : 19840
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1841 | Mem: 26.53MB, Util: 67%  global_step : 19841
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1842 | Mem: 26.53MB, Util: 67%  global_step : 19842
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1843 | Mem: 26.53MB, Util: 67%  global_step : 19843
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1844 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  93%|█████████▎| 1855/2000 [00:14<00:01, 131.97it/s]
[Rank 0] Train Epoch 9:  94%|█████████▎| 1872/2000 [00:14<00:00, 137.26it/s]
[Rank 2] Train Epoch 9:  94%|█████████▍| 1887/2000 [00:14<00:00, 154.75it/s]
[Rank 1] Train Epoch 9:  93%|█████████▎| 1869/2000 [00:14<00:00, 132.92it/s]
[Rank 0] Train Epoch 9:  94%|█████████▍| 1886/2000 [00:14<00:00, 137.90it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1865 | Mem: 26.53MB, Util: 68%  global_step : 19865
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1866 | Mem: 26.53MB, Util: 68%  global_step : 19866
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1867 | Mem: 26.53MB, Util: 68%  global_step : 19867
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1868 | Mem: 26.53MB, Util: 68%  global_step : 19868
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1869 | Mem: 26.53MB, Util: 68%  global_step : 19869
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1870 | Mem: 26.53MB, Util: 68%  global_step : 19870
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1871 | Mem: 26.53MB, Util: 68%  global_step : 19871
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1872 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  94%|█████████▍| 1883/2000 [00:14<00:00, 134.22it/s]
[Rank 0] Train Epoch 9:  95%|█████████▌| 1900/2000 [00:14<00:00, 137.39it/s]
[Rank 2] Train Epoch 9:  95%|█████████▌| 1904/2000 [00:14<00:00, 105.12it/s]
[Rank 1] Train Epoch 9:  95%|█████████▍| 1897/2000 [00:14<00:00, 134.81it/s]
[Rank 2] Train Epoch 9:  96%|█████████▌| 1923/2000 [00:14<00:00, 122.05it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1894 | Mem: 26.53MB, Util: 69%  global_step : 19894
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1895 | Mem: 26.53MB, Util: 69%  global_step : 19895
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1896 | Mem: 26.53MB, Util: 69%  global_step : 19896
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1897 | Mem: 26.53MB, Util: 69%  global_step : 19897
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1898 | Mem: 26.53MB, Util: 69%  global_step : 19898
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1899 | Mem: 26.53MB, Util: 69%  global_step : 19899
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1900 | Mem: 26.53MB, Util: 69%  global_step : 19900
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1901 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  96%|█████████▌| 1911/2000 [00:14<00:00, 133.27it/s]
[Rank 0] Train Epoch 9:  96%|█████████▌| 1914/2000 [00:14<00:00, 123.27it/s]
[Rank 2] Train Epoch 9:  97%|█████████▋| 1942/2000 [00:14<00:00, 136.94it/s]
[Rank 1] Train Epoch 9:  96%|█████████▋| 1925/2000 [00:14<00:00, 134.67it/s]
[Rank 0] Train Epoch 9:  96%|█████████▋| 1928/2000 [00:14<00:00, 125.66it/s]
[Rank 2] Train Epoch 9:  98%|█████████▊| 1961/2000 [00:14<00:00, 149.49it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1922 | Mem: 26.53MB, Util: 67%  global_step : 19922
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1923 | Mem: 26.53MB, Util: 67%  global_step : 19923
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1924 | Mem: 26.53MB, Util: 67%  global_step : 19924
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1925 | Mem: 26.53MB, Util: 67%  global_step : 19925
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1926 | Mem: 26.53MB, Util: 67%  global_step : 19926
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1927 | Mem: 26.53MB, Util: 67%  global_step : 19927
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1928 | Mem: 26.53MB, Util: 67%  global_step : 19928
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1929 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  97%|█████████▋| 1939/2000 [00:14<00:00, 135.77it/s]
[Rank 0] Train Epoch 9:  97%|█████████▋| 1942/2000 [00:14<00:00, 127.15it/s]
[Rank 2] Train Epoch 9:  99%|█████████▉| 1978/2000 [00:14<00:00, 153.21it/s]
[Rank 1] Train Epoch 9:  98%|█████████▊| 1953/2000 [00:14<00:00, 136.65it/s]
[Rank 0] Train Epoch 9:  98%|█████████▊| 1955/2000 [00:14<00:00, 127.10it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1951 | Mem: 26.53MB, Util: 95%  global_step : 19951
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1952 | Mem: 26.53MB, Util: 95%  global_step : 19952
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1953 | Mem: 26.53MB, Util: 95%  global_step : 19953
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1954 | Mem: 26.53MB, Util: 95%  global_step : 19954
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1955 | Mem: 26.53MB, Util: 95%  global_step : 19955
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1956 | Mem: 26.53MB, Util: 95%  global_step : 19956
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1957 | Mem: 26.53MB, Util: 95%  global_step : 19957
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1958 | Mem: 26.53MB,

[Rank 1] Train Epoch 9:  98%|█████████▊| 1968/2000 [00:14<00:00, 138.71it/s]
[Rank 0] Train Epoch 9:  98%|█████████▊| 1968/2000 [00:14<00:00, 126.97it/s]
[Rank 2] Train Epoch 9: 100%|██████████| 2000/2000 [00:15<00:00, 133.02it/s]
[Rank 2] Test Epoch 9:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 1] Train Epoch 9:  99%|█████████▉| 1983/2000 [00:14<00:00, 140.17it/s]
[Rank 0] Train Epoch 9:  99%|█████████▉| 1981/2000 [00:15<00:00, 126.97it/s]


[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1981 | Mem: 26.53MB, Util: 100%  global_step : 19981
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1982 | Mem: 26.53MB, Util: 100%  global_step : 19982
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1983 | Mem: 26.53MB, Util: 100%  global_step : 19983
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1984 | Mem: 26.53MB, Util: 100%  global_step : 19984
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1985 | Mem: 26.53MB, Util: 100%  global_step : 19985
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1986 | Mem: 26.53MB, Util: 100%  global_step : 19986
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1987 | Mem: 26.53MB, Util: 100%  global_step : 19987
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [GPU LOG] Epoch 9, Batch 1988 | Mem: 2

[Rank 1] Train Epoch 9: 100%|██████████| 2000/2000 [00:15<00:00, 132.59it/s]
[Rank 1] Test Epoch 9:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 0] Train Epoch 9: 100%|██████████| 2000/2000 [00:15<00:00, 132.01it/s]
[Rank 0] Test Epoch 9:   0%|          | 0/334 [00:00<?, ?it/s]
[Rank 2] Test Epoch 9:   0%|          | 1/334 [00:00<00:59,  5.63it/s]
[Rank 1] Test Epoch 9:   3%|▎         | 11/334 [00:00<00:02, 108.33it/s]
[Rank 0] Test Epoch 9:  10%|▉         | 32/334 [00:00<00:00, 319.94it/s]
[Rank 2] Test Epoch 9:  11%|█         | 37/334 [00:00<00:01, 162.97it/s]
[Rank 1] Test Epoch 9:  14%|█▍        | 46/334 [00:00<00:01, 248.49it/s]
[Rank 0] Test Epoch 9:  20%|██        | 68/334 [00:00<00:00, 339.11it/s]
[Rank 2] Test Epoch 9:  22%|██▏       | 74/334 [00:00<00:01, 241.94it/s]
[Rank 1] Test Epoch 9:  25%|██▍       | 82/334 [00:00<00:00, 296.26it/s]
[Rank 0] Test Epoch 9:  31%|███       | 103/334 [00:00<00:00, 340.11it/s]
[Rank 2] Test Epoch 9:  33%|███▎      | 110/334 [00:00<00:00, 28

[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [Rank 0] Epoch 9 | Loss: 0.3358, Acc: 0.8776, Model Checksum: e8f42bd92533c342a79447a789c2dbc5
[36m(RayTrainWorker pid=604, ip=10.254.7.94)[0m [ NodeId f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791 Rank 0] Epoch 9 | Loss: 0.3358, Acc: 0.8776, Model Checksum: e8f42bd92533c342a79447a789c2dbc5
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [Rank 2] Epoch 9 | Loss: 0.3175, Acc: 0.8839, Model Checksum: e8f42bd92533c342a79447a789c2dbc5
[36m(RayTrainWorker pid=401, ip=10.254.6.117)[0m [ NodeId 8132e66e3928a4b82addde0aa19309709a58e788e1dcddeaab904191 Rank 2] Epoch 9 | Loss: 0.3175, Acc: 0.8839, Model Checksum: e8f42bd92533c342a79447a789c2dbc5
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [Rank 1] Epoch 9 | Loss: 0.3534, Acc: 0.8743, Model Checksum: e8f42bd92533c342a79447a789c2dbc5
[36m(RayTrainWorker pid=398, ip=10.254.12.140)[0m [ NodeId d78b974282fa0fa2bddc3a93a3217bbba8df4be1998f6b20ec83243d Rank 1] Epoch 9 | Loss:

[Rank 1] Test Epoch 9: 100%|██████████| 334/334 [00:01<00:00, 326.26it/s]


[36m(TunerInternal pid=767)[0m 
[36m(TunerInternal pid=767)[0m Training completed after 10 iterations at 2025-04-07 12:57:58. Total running time: 9min 8s
[36m(TunerInternal pid=767)[0m 
Training result: Result(
  metrics={'loss': 0.3358377871664051, 'accuracy': 0.877624475104979, 'epoch': 9, 'node_id': 'f94cfd48e881a8ef0b964a8593da4b704a1574ca224f294a4cfe1791', 'rank': 0, 'model_checksum': 'e8f42bd92533c342a79447a789c2dbc5', 'gpu_utilization': 64, 'gpu_mem_alloc': 26.50244140625},
  path='/home/cdsw/ray_results/TorchTrainer_2025-04-07_12-47-50/TorchTrainer_a71c6_00000_0_2025-04-07_12-48-50',
  filesystem='local',
  checkpoint=None
)


[36m(TunerInternal pid=767)[0m Wrote the latest version of all result files and experiment state to '/home/cdsw/ray_results/TorchTrainer_2025-04-07_12-47-50' in 4.7144s.


In [42]:
# Tear down in dependency order. The two calls are NOT alternatives:
#   1. ray.shutdown()      -- disconnects this session's Ray connection.
#   2. cluster.terminate() -- stops the Ray cluster started by cluster.init().
# Disconnecting first means the client is closed while the cluster is still
# up, rather than against a cluster that has already been stopped.
try:
    # Shutdown Ray programmatically (client side of the connection).
    ray.shutdown()
finally:
    # Always stop the cluster, even if the disconnect above raised, so the
    # head/worker CPU and GPU allocations are not left running.
    cluster.terminate()