From 4868ba6db26b726b2bac6f8af1289638bd496169 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Wed, 8 Oct 2025 19:01:39 -0400 Subject: [PATCH 1/8] set vars needed for ddp. --- templates/configs/_global.yaml | 3 +- templates/src/mlp/ddp/train.py | 103 +++++++++++++++++++-------------- 2 files changed, 61 insertions(+), 45 deletions(-) diff --git a/templates/configs/_global.yaml b/templates/configs/_global.yaml index 8c7fda3..8788422 100644 --- a/templates/configs/_global.yaml +++ b/templates/configs/_global.yaml @@ -21,7 +21,7 @@ hydra: submitit_folder: ${hydra.sweep.dir}/submitit_logs/%j nodes: ${oc.select:compute.nodes,null} gpus_per_node: ${oc.select:compute.slurm.gpus_per_node, ${compute.gpus_per_node}} - tasks_per_node: 1 + tasks_per_node: ${oc.select:compute.tasks_per_node, ${compute.gpus_per_node}} cpus_per_task: ${compute.cpus_per_task} mem_gb: ${compute.mem_gb} timeout_min: ${compute.timeout_min} @@ -31,4 +31,3 @@ hydra: account: ${user.slurm.account} max_num_timeout: 2 additional_parameters: ${oc.select:user.slurm.additional_parameters, {}} - diff --git a/templates/src/mlp/ddp/train.py b/templates/src/mlp/ddp/train.py index 0e4d239..d8ae255 100644 --- a/templates/src/mlp/ddp/train.py +++ b/templates/src/mlp/ddp/train.py @@ -1,14 +1,14 @@ """Distributed MLP training using PyTorch DDP.""" -import os import logging +import os import submitit import torch import torch.distributed as dist +from omegaconf import DictConfig, OmegaConf from torch import nn, optim from torch.utils.data import DataLoader, DistributedSampler, TensorDataset -from omegaconf import DictConfig, OmegaConf logger = logging.getLogger(__name__) @@ -79,6 +79,49 @@ def _setup_distributed(self, rank, world_size): world_size=world_size, ) + def _wrap_distributed(self, model, world_size, local_rank): + if world_size > 1: + return nn.parallel.DistributedDataParallel( + model, + device_ids=[local_rank] if torch.cuda.is_available() else None, + ) + return model + + def _configure_training(self, cfg): + lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-3) + num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=1000) + seed = OmegaConf.select(cfg, "trainer.seed", default=42) + return lr, num_epochs, seed + + def _get_distributed_config(self): + job_env = submitit.JobEnvironment() + return job_env, job_env.global_rank, job_env.local_rank, job_env.num_tasks + + def _prepare_environment(self, job_env, rank, local_rank, world_size): + os.environ.setdefault("RANK", str(rank)) + os.environ.setdefault("LOCAL_RANK", str(local_rank)) + os.environ.setdefault("WORLD_SIZE", str(world_size)) + + if "MASTER_ADDR" not in os.environ: + hostnames = getattr(job_env, "hostnames", None) or [job_env.hostname] + os.environ["MASTER_ADDR"] = str(hostnames[0]) + + if "MASTER_PORT" not in os.environ: + os.environ["MASTER_PORT"] = "29500" + + def _log_run_configuration(self, seed, world_size, local_rank, rank): + if rank != 0: + return + logger.info(f"Starting DDP MLP training with seed {seed}") + logger.info(f"World size: {world_size}, Local rank: {local_rank}") + if torch.cuda.is_available(): + logger.info(f"Number of available GPUs: {torch.cuda.device_count()}") + + def _set_seed(self, seed): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + def _initialize_device_and_model(self, cfg, local_rank): """Initialize device and model.""" input_dim = OmegaConf.select(cfg, "trainer.input_dim", default=10) @@ -193,64 +236,35 @@ def _train_epoch( def __call__(self, cfg): """Train the MLP model 
with DDP.""" - cfg : DictConfig = OmegaConf.create(cfg) # Ensure cfg is a DictConfig + cfg: DictConfig = OmegaConf.create(cfg) - # Create output directory out_dir = cfg.paths.out_dir os.makedirs(out_dir, exist_ok=True) - - # Get ckpt dir self.ckpt_dir = self._latest_checkpoint(out_dir) - # Configuration - lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-3) - num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=1000) - seed = OmegaConf.select(cfg, "trainer.seed", default=42) - - # Get distributed training info from environment - # TODO: None of these env vars are actually set at the moment. Need to fix this example. - rank = int(os.environ.get("RANK", "0")) - local_rank = int(os.environ.get("LOCAL_RANK", "0")) - world_size = int(os.environ.get("WORLD_SIZE", "1")) + lr, num_epochs, seed = self._configure_training(cfg) + job_env, rank, local_rank, world_size = self._get_distributed_config() - if rank == 0: - logger.info(f"Starting DDP MLP training with seed {seed}") - logger.info(f"World size: {world_size}, Local rank: {local_rank}") - - # Set seed for reproducibility (same seed on all processes) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - logger.info(f"Number of available GPUs: {torch.cuda.device_count()}") + self._prepare_environment(job_env, rank, local_rank, world_size) + self._set_seed(seed) + self._log_run_configuration(seed, world_size, local_rank, rank) - # Setup distributed training self._setup_distributed(rank, world_size) - # Setup device and model device, model = self._initialize_device_and_model(cfg, local_rank) - if rank == 0: logger.info(f"Using device: {device}") - # Wrap model with DDP - if world_size > 1: - model = nn.parallel.DistributedDataParallel( - model, - device_ids=[local_rank] if torch.cuda.is_available() else None, - ) + model = self._wrap_distributed(model, world_size, local_rank) - # Setup data and training loader, sampler = self._initialize_data_and_loader(cfg, world_size, rank) optimizer = optim.Adam(model.parameters(), lr=lr) criterion = nn.CrossEntropyLoss() - # Resume from checkpoint if available start_epoch = self._load_checkpoint_if_exists(model, optimizer, device, rank) - if rank == 0: logger.info(f"Training from epoch {start_epoch} to {num_epochs}...") - # Training loop with DDP for epoch in range(start_epoch, num_epochs): loss_sum, correct, total = self._train_epoch( model, @@ -264,15 +278,18 @@ def __call__(self, cfg): rank, ) + avg_loss = loss_sum / len(loader) + acc = 100.0 * correct / total + should_checkpoint = epoch % 100 == 0 or epoch == num_epochs - 1 + # Log metrics only on rank 0 if rank == 0: - acc = 100.0 * correct / total - avg_loss = loss_sum / len(loader) logger.info(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") - if epoch % 100 == 0 or epoch == num_epochs - 1: - if world_size > 1: - dist.barrier() + if should_checkpoint: + if world_size > 1: + dist.barrier() + if rank == 0: self._save_checkpoint( model, optimizer, epoch, out_dir, avg_loss, acc, rank ) From 2bc170b0aeb34b78ba4f325e8df379717e125149 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Wed, 15 Oct 2025 16:55:59 -0400 Subject: [PATCH 2/8] set env vars for distributed training. fixed average loss. updated readme to include info on ddp env. 
--- templates/src/mlp/ddp/README.md | 17 +++++++++++++++-- templates/src/mlp/ddp/train.py | 18 +++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/templates/src/mlp/ddp/README.md b/templates/src/mlp/ddp/README.md index c962828..68a8598 100644 --- a/templates/src/mlp/ddp/README.md +++ b/templates/src/mlp/ddp/README.md @@ -2,7 +2,7 @@ > :warning: WIP: This template is a work in progress and does not use DDP in its current state. -*Data Parallelism* lets you split your data across multiple accelerators so that you can train your model faster! +*Data Parallelism* lets you split your data across multiple accelerators so that you can train your model faster! Most of the time all your accelerators (gpus) will be on the same machine (node), and that simplifies things. However if you are using a large number of gpus that can't fit on a single machine, then you'll have to use multiple machines (nodes). For example, on the Killarney cluster, L40's have a maximum of 4 per node and H100's have a maximum of 8 per node. Data Parallelism across multiple nodes is referred to as *Distributed Data Parallelism* (DDP). By default DDP works for both single node and multi-node settings. @@ -14,4 +14,17 @@ This example implements a simple MLP using DDP. **Rank:** Integer ID for a single gpu. Unique across all nodes. (from `0` to `world_size - 1`) -**Local Rank:** Integer ID for a single gpu. Unique only within a node. (from `0` to `num_gpus_per_node - 1`) \ No newline at end of file +**Local Rank:** Integer ID for a single gpu. Unique only within a node. (from `0` to `num_gpus_per_node - 1`) + +## DDP Setup + +Unlike `torchrun`, Submitit is a **job scheduler integration**, not a distributed orchestrator. It spawns one process per GPU (or per `tasks_per_node`), but it does **not automatically set** the PyTorch environment variables (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, `MASTER_PORT`) required by `torch.distributed`. + +Therefore, this project explicitly initializes the distributed environment inside the training script using `submitit.JobEnvironment()`. +This pattern is the standard way to perform DDP initialization with Submitit when not using `torchrun` +([MosaicML Docs](https://docs.mosaicml.com/projects/composer/en/stable/examples/training_with_submitit.html), +[Hydra Submitit Launcher](https://hydra.cc/docs/plugins/submitit_launcher/), +[PyTorch Forum Discussion](https://discuss.pytorch.org/t/using-submitit-for-distributed-training/121881), +[Fairseq Example](https://github.com/facebookresearch/fairseq/blob/main/examples/language_model/submitit_train.py)). + +It works for both **single-node** and **multi-node** jobs as long as the `MASTER_ADDR` points to a hostname reachable from all nodes.
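The `train.py` changes below implement this pattern; a condensed, self-contained sketch of the same flow follows (the backend choice and the fallback port 29500 are illustrative assumptions, not values fixed by the template):

```python
import os

import submitit
import torch
import torch.distributed as dist


def init_distributed_from_submitit() -> tuple[int, int, int]:
    """Derive DDP settings from the Submitit job and initialize the process group."""
    job_env = submitit.JobEnvironment()
    rank, local_rank, world_size = job_env.global_rank, job_env.local_rank, job_env.num_tasks

    os.environ.setdefault("RANK", str(rank))
    os.environ.setdefault("LOCAL_RANK", str(local_rank))
    os.environ.setdefault("WORLD_SIZE", str(world_size))
    # The first node in the allocation hosts the rendezvous.
    os.environ.setdefault("MASTER_ADDR", str(job_env.hostnames[0]))
    os.environ.setdefault("MASTER_PORT", "29500")

    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend, init_method="env://")
    return rank, local_rank, world_size
```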
diff --git a/templates/src/mlp/ddp/train.py index d8ae255..9eda3d7 100644 --- a/templates/src/mlp/ddp/train.py +++ b/templates/src/mlp/ddp/train.py @@ -98,16 +98,22 @@ def _get_distributed_config(self): return job_env, job_env.global_rank, job_env.local_rank, job_env.num_tasks def _prepare_environment(self, job_env, rank, local_rank, world_size): + print( + f"Preparing environment for rank {rank}, local_rank {local_rank}, world_size {world_size}" + ) os.environ.setdefault("RANK", str(rank)) os.environ.setdefault("LOCAL_RANK", str(local_rank)) os.environ.setdefault("WORLD_SIZE", str(world_size)) if "MASTER_ADDR" not in os.environ: - hostnames = getattr(job_env, "hostnames", None) or [job_env.hostname] - os.environ["MASTER_ADDR"] = str(hostnames[0]) + master_addr = ( + job_env.hostnames[0] + if hasattr(job_env, "hostnames") + else job_env.hostname + ) + os.environ["MASTER_ADDR"] = str(master_addr) - if "MASTER_PORT" not in os.environ: - os.environ["MASTER_PORT"] = "29500" + os.environ.setdefault("MASTER_PORT", "29500") def _log_run_configuration(self, seed, world_size, local_rank, rank): if rank != 0: @@ -200,7 +206,6 @@ def _train_epoch( device, epoch, world_size, - rank, ): """Train for one epoch and return metrics.""" # Set epoch for DistributedSampler to ensure proper shuffling across epochs @@ -275,10 +280,9 @@ def __call__(self, cfg): device, epoch, world_size, - rank, ) - avg_loss = loss_sum / len(loader) + avg_loss = loss_sum / (len(loader) * world_size) acc = 100.0 * correct / total should_checkpoint = epoch % 100 == 0 or epoch == num_epochs - 1 From d92bd2cdfbb498b25a1ba909a918e1b452e8648d Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Thu, 16 Oct 2025 12:42:27 -0400 Subject: [PATCH 3/8] fixed typos in mlp-ddp example. --- templates/src/mlp/ddp/README.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/templates/src/mlp/ddp/README.md b/templates/src/mlp/ddp/README.md index 68a8598..02d2a5f 100644 --- a/templates/src/mlp/ddp/README.md +++ b/templates/src/mlp/ddp/README.md @@ -1,20 +1,18 @@ -# Distributed Data Parallel Example - -> :warning: WIP: This template is a work in progress and does not use DDP in its current state. +# MLP Distributed Data Parallel Template *Data Parallelism* lets you split your data across multiple accelerators so that you can train your model faster! -Most of the time all your accelerators (gpus) will be on the same machine (node), and that simplifies things. However if you are using a large number of gpus that can't fit on a single machine, then you'll have to use multiple machines (nodes). For example, on the Killarney cluster, L40's have a maximum of 4 per node and H100's have a maximum of 8 per node. Data Parallelism across multiple nodes is referred to as *Distributed Data Parallelism* (DDP). By default DDP works for both single node and multi-node settings. +Most of the time all your accelerators (GPUs) will be on the same machine (node), and that simplifies things. However if you are using a large number of GPUs that can't fit on a single machine, then you'll have to use multiple machines (nodes). For example, on the Killarney cluster, L40's have a maximum of 4 per node and H100's have a maximum of 8 per node. Data Parallelism across multiple nodes is referred to as *Distributed Data Parallelism* (DDP). By default DDP works for both single node and multi-node settings. This example implements a simple MLP using DDP. 
## DDP Background -**World Size:** The total number of gpu's across all nodes +**World Size:** The total number of GPUs across all nodes -**Rank:** Integer ID for a single gpu. Unique across all nodes. (from `0` to `world_size - 1`) +**Rank:** Integer ID for a single GPU. Unique across all nodes. (from `0` to `world_size - 1`) -**Local Rank:** Integer ID for a single gpu. Unique only within a node. (from `0` to `num_gpus_per_node - 1`) +**Local Rank:** Integer ID for a single GPU. Unique only within a node. (from `0` to `num_gpus_per_node - 1`) ## DDP Setup From 2f29a099ef654380ce427c2ac32aeb94cab31b39 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Thu, 16 Oct 2025 13:37:27 -0400 Subject: [PATCH 4/8] removed extra prints. --- templates/src/mlp/ddp/train.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/templates/src/mlp/ddp/train.py index 9eda3d7..0ac4d38 100644 --- a/templates/src/mlp/ddp/train.py +++ b/templates/src/mlp/ddp/train.py @@ -98,9 +98,6 @@ def _get_distributed_config(self): return job_env, job_env.global_rank, job_env.local_rank, job_env.num_tasks def _prepare_environment(self, job_env, rank, local_rank, world_size): - print( - f"Preparing environment for rank {rank}, local_rank {local_rank}, world_size {world_size}" - ) os.environ.setdefault("RANK", str(rank)) os.environ.setdefault("LOCAL_RANK", str(local_rank)) os.environ.setdefault("WORLD_SIZE", str(world_size)) if "MASTER_ADDR" not in os.environ: @@ -158,7 +155,7 @@ def _initialize_data_and_loader(self, cfg, world_size, rank): num_classes = OmegaConf.select(cfg, "trainer.num_classes", default=3) batch_size = OmegaConf.select(cfg, "trainer.batch_size", default=32) - dataset = create_dummy_data(1000, input_dim, num_classes) + dataset = create_dummy_data(100000, input_dim, num_classes) sampler = ( DistributedSampler( dataset, num_replicas=world_size, rank=rank, shuffle=True From 253dd532c640496007a037fb28eb23e8b964cc35 Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 17:46:50 -0400 Subject: [PATCH 5/8] fixed cpus_per_task and mem_gb presets. 
--- templates/configs/compute/bon_echo/a40_1x.yaml | 4 ++-- templates/configs/compute/bon_echo/a40_2x.yaml | 6 +++--- templates/configs/compute/bon_echo/a40_4x.yaml | 13 +++++++++++++ templates/configs/compute/killarney/cpu_1x.yaml | 10 ---------- templates/configs/compute/killarney/h100_1x.yaml | 4 ++-- templates/configs/compute/killarney/h100_2x.yaml | 12 ++++++++++++ templates/configs/compute/killarney/h100_4x.yaml | 12 ++++++++++++ templates/configs/compute/killarney/h100_8x.yaml | 4 ++-- templates/configs/compute/killarney/l40s_1x.yaml | 4 ++-- templates/configs/compute/killarney/l40s_2x.yaml | 4 ++-- templates/configs/compute/killarney/l40s_4x.yaml | 12 ++++++++++++ 11 files changed, 62 insertions(+), 23 deletions(-) create mode 100644 templates/configs/compute/bon_echo/a40_4x.yaml delete mode 100644 templates/configs/compute/killarney/cpu_1x.yaml create mode 100644 templates/configs/compute/killarney/h100_2x.yaml create mode 100644 templates/configs/compute/killarney/h100_4x.yaml create mode 100644 templates/configs/compute/killarney/l40s_4x.yaml diff --git a/templates/configs/compute/bon_echo/a40_1x.yaml b/templates/configs/compute/bon_echo/a40_1x.yaml index e5fd4cb..27e78e5 100644 --- a/templates/configs/compute/bon_echo/a40_1x.yaml +++ b/templates/configs/compute/bon_echo/a40_1x.yaml @@ -5,8 +5,8 @@ gpus_per_node: 1 time_limit: "8:00:00" timeout_min: 480 work_root: /scratch/ssd004/scratch/${oc.env:USER} -mem_gb: 16 -cpus_per_task: 16 +cpus_per_task: 8 +mem_gb: 40 gres: gpu:${.gpu_type}:${.gpus_per_node} slurm: partition: a40 diff --git a/templates/configs/compute/bon_echo/a40_2x.yaml b/templates/configs/compute/bon_echo/a40_2x.yaml index c703c97..ab2e239 100644 --- a/templates/configs/compute/bon_echo/a40_2x.yaml +++ b/templates/configs/compute/bon_echo/a40_2x.yaml @@ -2,11 +2,11 @@ cluster: bon_echo nodes: 1 gpu_type: a40 gpus_per_node: 2 -time_limit: "2:00:00" -timeout_min: 120 +time_limit: "8:00:00" +timeout_min: 480 work_root: /scratch/ssd004/scratch/${oc.env:USER} -mem_gb: 64 cpus_per_task: 8 +mem_gb: 80 gres: gpu:${.gpu_type}:${.gpus_per_node} slurm: partition: a40 diff --git a/templates/configs/compute/bon_echo/a40_4x.yaml b/templates/configs/compute/bon_echo/a40_4x.yaml new file mode 100644 index 0000000..24db9a3 --- /dev/null +++ b/templates/configs/compute/bon_echo/a40_4x.yaml @@ -0,0 +1,13 @@ +cluster: bon_echo +nodes: 1 +gpu_type: a40 +gpus_per_node: 4 +time_limit: "8:00:00" +timeout_min: 480 +work_root: /scratch/ssd004/scratch/${oc.env:USER} +cpus_per_task: 8 +mem_gb: 160 +gres: gpu:${.gpu_type}:${.gpus_per_node} +slurm: + partition: a40 + gpus_per_node: null diff --git a/templates/configs/compute/killarney/cpu_1x.yaml b/templates/configs/compute/killarney/cpu_1x.yaml deleted file mode 100644 index 4593fcb..0000000 --- a/templates/configs/compute/killarney/cpu_1x.yaml +++ /dev/null @@ -1,10 +0,0 @@ -cluster: killarney -nodes: 1 -gpus_per_node: 0 -cpus_per_task: 32 -mem_gb: 64 -work_root: /scratch/${oc.env:USER} -timeout_min: 60 -gres: null -slurm: - gpus_per_node: null diff --git a/templates/configs/compute/killarney/h100_1x.yaml b/templates/configs/compute/killarney/h100_1x.yaml index 3be27d7..7bc9632 100644 --- a/templates/configs/compute/killarney/h100_1x.yaml +++ b/templates/configs/compute/killarney/h100_1x.yaml @@ -5,8 +5,8 @@ gpus_per_node: 1 time_limit: "1:00:00" timeout_min: 60 work_root: /scratch/${oc.env:USER} -mem_gb: 256 -cpus_per_task: 24 +cpus_per_task: 6 +mem_gb: 250 gres: gpu:${.gpu_type}:${.gpus_per_node} slurm: gpus_per_node: null diff --git 
a/templates/configs/compute/killarney/h100_2x.yaml b/templates/configs/compute/killarney/h100_2x.yaml new file mode 100644 index 0000000..efb8dae --- /dev/null +++ b/templates/configs/compute/killarney/h100_2x.yaml @@ -0,0 +1,12 @@ +cluster: killarney +nodes: 1 +gpu_type: h100 +gpus_per_node: 2 +time_limit: "1:00:00" +timeout_min: 60 +work_root: /scratch/${oc.env:USER} +cpus_per_task: 6 +mem_gb: 500 +gres: gpu:${.gpu_type}:${.gpus_per_node} +slurm: + gpus_per_node: null diff --git a/templates/configs/compute/killarney/h100_4x.yaml b/templates/configs/compute/killarney/h100_4x.yaml new file mode 100644 index 0000000..0a00aba --- /dev/null +++ b/templates/configs/compute/killarney/h100_4x.yaml @@ -0,0 +1,12 @@ +cluster: killarney +nodes: 1 +gpu_type: h100 +gpus_per_node: 4 +time_limit: "1:00:00" +timeout_min: 60 +work_root: /scratch/${oc.env:USER} +cpus_per_task: 6 +mem_gb: 1000 +gres: gpu:${.gpu_type}:${.gpus_per_node} +slurm: + gpus_per_node: null diff --git a/templates/configs/compute/killarney/h100_8x.yaml b/templates/configs/compute/killarney/h100_8x.yaml index 2530d37..6b9454a 100644 --- a/templates/configs/compute/killarney/h100_8x.yaml +++ b/templates/configs/compute/killarney/h100_8x.yaml @@ -5,8 +5,8 @@ gpus_per_node: 8 time_limit: "1:00:00" timeout_min: 60 work_root: /scratch/${oc.env:USER} -mem_gb: 2048 -cpus_per_task: 96 +cpus_per_task: 6 +mem_gb: 2000 gres: gpu:${.gpu_type}:${.gpus_per_node} slurm: gpus_per_node: null diff --git a/templates/configs/compute/killarney/l40s_1x.yaml b/templates/configs/compute/killarney/l40s_1x.yaml index 956c01d..802950e 100644 --- a/templates/configs/compute/killarney/l40s_1x.yaml +++ b/templates/configs/compute/killarney/l40s_1x.yaml @@ -5,8 +5,8 @@ gpus_per_node: 1 time_limit: "1:00:00" timeout_min: 60 work_root: /scratch/${oc.env:USER} -mem_gb: 64 -cpus_per_task: 32 +cpus_per_task: 16 +mem_gb: 128 gres: gpu:${.gpu_type}:${.gpus_per_node} slurm: gpus_per_node: null diff --git a/templates/configs/compute/killarney/l40s_2x.yaml b/templates/configs/compute/killarney/l40s_2x.yaml index 26a09ce..7543219 100644 --- a/templates/configs/compute/killarney/l40s_2x.yaml +++ b/templates/configs/compute/killarney/l40s_2x.yaml @@ -5,8 +5,8 @@ gpus_per_node: 2 time_limit: "1:00:00" timeout_min: 60 work_root: /scratch/${oc.env:USER} -mem_gb: 128 -cpus_per_task: 64 +cpus_per_task: 16 +mem_gb: 256 gres: gpu:${.gpu_type}:${.gpus_per_node} slurm: gpus_per_node: null diff --git a/templates/configs/compute/killarney/l40s_4x.yaml b/templates/configs/compute/killarney/l40s_4x.yaml new file mode 100644 index 0000000..77aaca4 --- /dev/null +++ b/templates/configs/compute/killarney/l40s_4x.yaml @@ -0,0 +1,12 @@ +cluster: killarney +nodes: 1 +gpu_type: l40s +gpus_per_node: 4 +time_limit: "1:00:00" +timeout_min: 60 +work_root: /scratch/${oc.env:USER} +cpus_per_task: 16 +mem_gb: 512 +gres: gpu:${.gpu_type}:${.gpus_per_node} +slurm: + gpus_per_node: null From 108d1ed9339d6fa504524811921953fd42252b9d Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Fri, 17 Oct 2025 23:42:46 -0400 Subject: [PATCH 6/8] added documentation for function in train.py, print for non zero ranks, and updated readme to include info on env vars. 
--- templates/src/mlp/ddp/README.md | 36 ++++++++++++++++++++++++++++++--- templates/src/mlp/ddp/train.py | 10 +++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/templates/src/mlp/ddp/README.md b/templates/src/mlp/ddp/README.md index 02d2a5f..a7ca85d 100644 --- a/templates/src/mlp/ddp/README.md +++ b/templates/src/mlp/ddp/README.md @@ -16,13 +16,43 @@ This example implements a simple MLP using DDP. ## DDP Setup -Unlike `torchrun`, Submitit is a **job scheduler integration**, not a distributed orchestrator. It spawns one process per GPU (or per `tasks_per_node`), but it does **not automatically set** the PyTorch environment variables (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, `MASTER_PORT`) required by `torch.distributed`. +Unlike `torchrun`, Submitit is a **job scheduler integration**, not a distributed orchestrator. +It spawns one process per GPU (or per `tasks_per_node`), but it does **not automatically set** the PyTorch environment variables (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, `MASTER_PORT`) required by `torch.distributed`. + +However, Submitit automatically determines the distributed context (each task’s **global rank**, **local rank**, **world size**, and **hostnames**). +You don’t manually assign local ranks; you retrieve them from `submitit.JobEnvironment()` and use them to initialize PyTorch DDP: + +```python +job_env = submitit.JobEnvironment() +rank = job_env.global_rank +local_rank = job_env.local_rank +world_size = job_env.num_tasks +``` + +Once you retrieve these values, export them as environment variables and call: + +```python +torch.distributed.init_process_group(init_method="env://", backend="nccl") +``` -Therefore, this project explicitly initializes the distributed environment inside the training script using `submitit.JobEnvironment()`. This pattern is the standard way to perform DDP initialization with Submitit when not using `torchrun` ([MosaicML Docs](https://docs.mosaicml.com/projects/composer/en/stable/examples/training_with_submitit.html), [Hydra Submitit Launcher](https://hydra.cc/docs/plugins/submitit_launcher/), [PyTorch Forum Discussion](https://discuss.pytorch.org/t/using-submitit-for-distributed-training/121881), [Fairseq Example](https://github.com/facebookresearch/fairseq/blob/main/examples/language_model/submitit_train.py)). -It works for both **single-node** and **multi-node** jobs as long as the `MASTER_ADDR` points to a hostname reachable from all nodes. +Submitit also provides an optional helper class, `submitit.helpers.TorchDistributedEnvironment`, which wraps `JobEnvironment`. +It automatically exports the standard PyTorch environment variables (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT`) so that you can initialize DDP with `init_method="env://"` directly. Think of it as a convenience layer built on top of `JobEnvironment`. `JobEnvironment` also exposes extra metadata like `hostnames` and `hostname`, which can be helpful for advanced or custom multi-node configurations. + +For a minimal example that uses `submitit.helpers.TorchDistributedEnvironment()` together with +`torch.distributed.init_process_group(init_method="env://")`, see the official Submitit example +[`docs/examples/torch_distributed.py`](https://github.com/facebookincubator/submitit/blob/main/docs/examples/torch_distributed.py). + + +### Logging in DDP (Hydra + Submitit) + +To avoid duplicated lines in the global Hydra log, we log with `logger` only on **rank 0**. 
+For per-rank visibility, use `print()` on non-zero ranks. Those messages appear only in that rank’s stdout (Submitit/Slurm per-task output). + +- `logger.info(...)` (rank 0): goes to the single, global Hydra log for the run. +- `print(...)` (ranks > 0): stays in the rank-local stdout, not in the global Hydra log. diff --git a/templates/src/mlp/ddp/train.py b/templates/src/mlp/ddp/train.py index 0ac4d38..972c4f2 100644 --- a/templates/src/mlp/ddp/train.py +++ b/templates/src/mlp/ddp/train.py @@ -80,6 +80,7 @@ def _setup_distributed(self, rank, world_size): ) def _wrap_distributed(self, model, world_size, local_rank): + """Wrap the model with DDP if running in distributed mode.""" if world_size > 1: return nn.parallel.DistributedDataParallel( model, @@ -88,16 +89,19 @@ def _wrap_distributed(self, model, world_size, local_rank): return model def _configure_training(self, cfg): + """Extract core training hyperparameters from the configuration.""" lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-3) num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=1000) seed = OmegaConf.select(cfg, "trainer.seed", default=42) return lr, num_epochs, seed def _get_distributed_config(self): + """Retrieve distributed job configuration information from Submitit.""" job_env = submitit.JobEnvironment() return job_env, job_env.global_rank, job_env.local_rank, job_env.num_tasks def _prepare_environment(self, job_env, rank, local_rank, world_size): + """Set up distributed environment variables for PyTorch.""" os.environ.setdefault("RANK", str(rank)) os.environ.setdefault("LOCAL_RANK", str(local_rank)) os.environ.setdefault("WORLD_SIZE", str(world_size)) @@ -113,6 +117,7 @@ def _prepare_environment(self, job_env, rank, local_rank, world_size): os.environ.setdefault("MASTER_PORT", "29500") def _log_run_configuration(self, seed, world_size, local_rank, rank): + """Log the configuration of the current DDP run.""" if rank != 0: return logger.info(f"Starting DDP MLP training with seed {seed}") @@ -121,6 +126,7 @@ def _log_run_configuration(self, seed, world_size, local_rank, rank): logger.info(f"Number of available GPUs: {torch.cuda.device_count()}") def _set_seed(self, seed): + """Set random seeds for reproducibility across PyTorch and CUDA.""" torch.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) @@ -254,6 +260,10 @@ def __call__(self, cfg): self._setup_distributed(rank, world_size) device, model = self._initialize_device_and_model(cfg, local_rank) + if rank == 0: + logger.info(f"[Rank {rank}] Initialized on device: {device}") + else: + print(f"[Rank {rank}] Initialized on device: {device}") if rank == 0: logger.info(f"Using device: {device}") From 95a628ebbd74fb093fb78532146c1fb3568c8466 Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Wed, 22 Oct 2025 19:51:03 -0400 Subject: [PATCH 7/8] updated compute config to set default value for number of tasks per node. 
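The effect of this default can be checked directly with OmegaConf; a small sketch in which the `launcher` node stands in for the real `hydra.launcher` section and the values are made up for illustration:

```python
from omegaconf import OmegaConf

# Preset that requests one Slurm task per GPU, as in the updated compute configs.
with_tasks = OmegaConf.create({
    "compute": {"gpus_per_node": 4, "tasks_per_node": "${.gpus_per_node}"},
    "launcher": {"tasks_per_node": "${oc.select:compute.tasks_per_node, 1}"},
})
print(with_tasks.launcher.tasks_per_node)  # 4

# A preset that omits tasks_per_node falls back to the launcher default of 1.
without_tasks = OmegaConf.create({
    "compute": {"gpus_per_node": 4},
    "launcher": {"tasks_per_node": "${oc.select:compute.tasks_per_node, 1}"},
})
print(without_tasks.launcher.tasks_per_node)  # 1
```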
--- templates/configs/_global.yaml | 13 ++++++------- templates/configs/compute/killarney/h100_1x.yaml | 12 ++++++------ templates/configs/compute/killarney/h100_2x.yaml | 12 ++++++------ templates/configs/compute/killarney/h100_4x.yaml | 12 ++++++------ templates/configs/compute/killarney/h100_8x.yaml | 12 ++++++------ templates/configs/compute/killarney/l40s_1x.yaml | 10 +++++----- templates/configs/compute/killarney/l40s_2x.yaml | 12 ++++++------ templates/configs/compute/killarney/l40s_4x.yaml | 12 ++++++------ 8 files changed, 47 insertions(+), 48 deletions(-) diff --git a/templates/configs/_global.yaml b/templates/configs/_global.yaml index 8788422..80c89e0 100644 --- a/templates/configs/_global.yaml +++ b/templates/configs/_global.yaml @@ -20,14 +20,13 @@ hydra: launcher: submitit_folder: ${hydra.sweep.dir}/submitit_logs/%j nodes: ${oc.select:compute.nodes,null} - gpus_per_node: ${oc.select:compute.slurm.gpus_per_node, ${compute.gpus_per_node}} - tasks_per_node: ${oc.select:compute.tasks_per_node, ${compute.gpus_per_node}} - cpus_per_task: ${compute.cpus_per_task} + tasks_per_node: ${oc.select:compute.tasks_per_node, 1} + cpus_per_task: ${oc.select:compute.cpus_per_task, 4} mem_gb: ${compute.mem_gb} timeout_min: ${compute.timeout_min} - gres: ${oc.select:compute.gres,null} - partition: ${oc.select:compute.slurm.partition,null} - qos: ${oc.select:compute.slurm.qos,null} + gres: ${oc.select:compute.gres, null} + partition: ${oc.select:compute.slurm.partition, null} + qos: ${oc.select:compute.slurm.qos, null} account: ${user.slurm.account} max_num_timeout: 2 - additional_parameters: ${oc.select:user.slurm.additional_parameters, {}} + additional_parameters: ${oc.select:compute.slurm.additional_parameters, ${oc.select:user.slurm.additional_parameters, {}}} diff --git a/templates/configs/compute/killarney/h100_1x.yaml b/templates/configs/compute/killarney/h100_1x.yaml index 7bc9632..a002a91 100644 --- a/templates/configs/compute/killarney/h100_1x.yaml +++ b/templates/configs/compute/killarney/h100_1x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: h100 gpus_per_node: 1 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 6 -mem_gb: 250 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 6 +mem_gb: 240 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/killarney/h100_2x.yaml b/templates/configs/compute/killarney/h100_2x.yaml index efb8dae..97dfa63 100644 --- a/templates/configs/compute/killarney/h100_2x.yaml +++ b/templates/configs/compute/killarney/h100_2x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: h100 gpus_per_node: 2 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 6 -mem_gb: 500 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 6 +mem_gb: 480 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/killarney/h100_4x.yaml b/templates/configs/compute/killarney/h100_4x.yaml index 0a00aba..29c0218 100644 --- a/templates/configs/compute/killarney/h100_4x.yaml +++ b/templates/configs/compute/killarney/h100_4x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: h100 gpus_per_node: 4 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 6 -mem_gb: 1000 gres: 
gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 6 +mem_gb: 960 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/killarney/h100_8x.yaml b/templates/configs/compute/killarney/h100_8x.yaml index 6b9454a..900ec89 100644 --- a/templates/configs/compute/killarney/h100_8x.yaml +++ b/templates/configs/compute/killarney/h100_8x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: h100 gpus_per_node: 8 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 6 -mem_gb: 2000 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 6 +mem_gb: 1920 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/killarney/l40s_1x.yaml b/templates/configs/compute/killarney/l40s_1x.yaml index 802950e..eac455a 100644 --- a/templates/configs/compute/killarney/l40s_1x.yaml +++ b/templates/configs/compute/killarney/l40s_1x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: l40s gpus_per_node: 1 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 16 -mem_gb: 128 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 16 +mem_gb: 120 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: gpus_per_node: null diff --git a/templates/configs/compute/killarney/l40s_2x.yaml b/templates/configs/compute/killarney/l40s_2x.yaml index 7543219..d84976f 100644 --- a/templates/configs/compute/killarney/l40s_2x.yaml +++ b/templates/configs/compute/killarney/l40s_2x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: l40s gpus_per_node: 2 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 16 -mem_gb: 256 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 16 +mem_gb: 240 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/killarney/l40s_4x.yaml b/templates/configs/compute/killarney/l40s_4x.yaml index 77aaca4..8743ec6 100644 --- a/templates/configs/compute/killarney/l40s_4x.yaml +++ b/templates/configs/compute/killarney/l40s_4x.yaml @@ -2,11 +2,11 @@ cluster: killarney nodes: 1 gpu_type: l40s gpus_per_node: 4 -time_limit: "1:00:00" -timeout_min: 60 -work_root: /scratch/${oc.env:USER} -cpus_per_task: 16 -mem_gb: 512 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 16 +mem_gb: 480 +work_root: /scratch/${oc.env:USER} +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {} From 07b556006c886097291dd5c1b87d0b4c106f964c Mon Sep 17 00:00:00 2001 From: kohankhaki Date: Wed, 22 Oct 2025 20:04:34 -0400 Subject: [PATCH 8/8] updated bon echo config formats to match killarney. 
--- templates/configs/compute/bon_echo/a100_1x.yaml | 12 ++++++------ templates/configs/compute/bon_echo/a100_4x.yaml | 12 ++++++------ templates/configs/compute/bon_echo/a40_1x.yaml | 10 +++++----- templates/configs/compute/bon_echo/a40_2x.yaml | 10 +++++----- templates/configs/compute/bon_echo/a40_4x.yaml | 10 +++++----- templates/configs/compute/bon_echo/cpu_1x.yaml | 7 +++---- 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/templates/configs/compute/bon_echo/a100_1x.yaml b/templates/configs/compute/bon_echo/a100_1x.yaml index 9362079..8061669 100644 --- a/templates/configs/compute/bon_echo/a100_1x.yaml +++ b/templates/configs/compute/bon_echo/a100_1x.yaml @@ -2,12 +2,12 @@ cluster: bon_echo nodes: 1 gpu_type: a100 gpus_per_node: 1 -time_limit: "8:00:00" -timeout_min: 480 -work_root: /scratch/ssd004/scratch/${oc.env:USER} -mem_gb: 80 -cpus_per_task: 16 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 16 +mem_gb: 80 +work_root: /scratch/ssd004/scratch/${oc.env:USER} +timeout_min: 60 slurm: partition: a100 - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/bon_echo/a100_4x.yaml b/templates/configs/compute/bon_echo/a100_4x.yaml index 750ce59..2f8ffbd 100644 --- a/templates/configs/compute/bon_echo/a100_4x.yaml +++ b/templates/configs/compute/bon_echo/a100_4x.yaml @@ -2,12 +2,12 @@ cluster: bon_echo nodes: 1 gpu_type: a100 gpus_per_node: 4 -time_limit: "2:00:00" -timeout_min: 120 -work_root: /scratch/ssd004/scratch/${oc.env:USER} -mem_gb: 320 -cpus_per_task: 8 gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} +cpus_per_task: 16 +mem_gb: 320 +work_root: /scratch/ssd004/scratch/${oc.env:USER} +timeout_min: 60 slurm: partition: a100 - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/bon_echo/a40_1x.yaml b/templates/configs/compute/bon_echo/a40_1x.yaml index 27e78e5..fde10ac 100644 --- a/templates/configs/compute/bon_echo/a40_1x.yaml +++ b/templates/configs/compute/bon_echo/a40_1x.yaml @@ -2,12 +2,12 @@ cluster: bon_echo nodes: 1 gpu_type: a40 gpus_per_node: 1 -time_limit: "8:00:00" -timeout_min: 480 -work_root: /scratch/ssd004/scratch/${oc.env:USER} +gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} cpus_per_task: 8 mem_gb: 40 -gres: gpu:${.gpu_type}:${.gpus_per_node} +work_root: /scratch/ssd004/scratch/${oc.env:USER} +timeout_min: 60 slurm: partition: a40 - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/bon_echo/a40_2x.yaml b/templates/configs/compute/bon_echo/a40_2x.yaml index ab2e239..20ccc70 100644 --- a/templates/configs/compute/bon_echo/a40_2x.yaml +++ b/templates/configs/compute/bon_echo/a40_2x.yaml @@ -2,12 +2,12 @@ cluster: bon_echo nodes: 1 gpu_type: a40 gpus_per_node: 2 -time_limit: "8:00:00" -timeout_min: 480 -work_root: /scratch/ssd004/scratch/${oc.env:USER} +gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} cpus_per_task: 8 mem_gb: 80 -gres: gpu:${.gpu_type}:${.gpus_per_node} +work_root: /scratch/ssd004/scratch/${oc.env:USER} +timeout_min: 60 slurm: partition: a40 - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/bon_echo/a40_4x.yaml b/templates/configs/compute/bon_echo/a40_4x.yaml index 24db9a3..fb1bb35 100644 --- a/templates/configs/compute/bon_echo/a40_4x.yaml +++ b/templates/configs/compute/bon_echo/a40_4x.yaml @@ -2,12 +2,12 @@ cluster: bon_echo nodes: 1 gpu_type: a40 gpus_per_node: 4 
-time_limit: "8:00:00" -timeout_min: 480 -work_root: /scratch/ssd004/scratch/${oc.env:USER} +gres: gpu:${.gpu_type}:${.gpus_per_node} +tasks_per_node: ${.gpus_per_node} cpus_per_task: 8 mem_gb: 160 -gres: gpu:${.gpu_type}:${.gpus_per_node} +work_root: /scratch/ssd004/scratch/${oc.env:USER} +timeout_min: 60 slurm: partition: a40 - gpus_per_node: null + additional_parameters: {} diff --git a/templates/configs/compute/bon_echo/cpu_1x.yaml b/templates/configs/compute/bon_echo/cpu_1x.yaml index fadbc3f..2101d48 100644 --- a/templates/configs/compute/bon_echo/cpu_1x.yaml +++ b/templates/configs/compute/bon_echo/cpu_1x.yaml @@ -1,11 +1,10 @@ cluster: bon_echo nodes: 1 gpus_per_node: 0 +gres: null cpus_per_task: 2 mem_gb: 8 work_root: /scratch/ssd004/scratch/${oc.env:USER} -time_limit: "0:15:00" -timeout_min: 15 -gres: null +timeout_min: 60 slurm: - gpus_per_node: null + additional_parameters: {}