14 changes: 6 additions & 8 deletions templates/configs/_global.yaml
@@ -20,15 +20,13 @@ hydra:
launcher:
submitit_folder: ${hydra.sweep.dir}/submitit_logs/%j
nodes: ${oc.select:compute.nodes,null}
gpus_per_node: ${oc.select:compute.slurm.gpus_per_node, ${compute.gpus_per_node}}
tasks_per_node: 1
cpus_per_task: ${compute.cpus_per_task}
tasks_per_node: ${oc.select:compute.tasks_per_node, 1}
cpus_per_task: ${oc.select:compute.cpus_per_task, 4}
mem_gb: ${compute.mem_gb}
timeout_min: ${compute.timeout_min}
gres: ${oc.select:compute.gres,null}
partition: ${oc.select:compute.slurm.partition,null}
qos: ${oc.select:compute.slurm.qos,null}
gres: ${oc.select:compute.gres, null}
partition: ${oc.select:compute.slurm.partition, null}
qos: ${oc.select:compute.slurm.qos, null}
account: ${user.slurm.account}
max_num_timeout: 2
additional_parameters: ${oc.select:user.slurm.additional_parameters, {}}

additional_parameters: ${oc.select:compute.slurm.additional_parameters, ${oc.select:user.slurm.additional_parameters, {}}}
12 changes: 6 additions & 6 deletions templates/configs/compute/bon_echo/a100_1x.yaml
@@ -2,12 +2,12 @@ cluster: bon_echo
nodes: 1
gpu_type: a100
gpus_per_node: 1
time_limit: "8:00:00"
timeout_min: 480
work_root: /scratch/ssd004/scratch/${oc.env:USER}
mem_gb: 80
cpus_per_task: 16
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 16
mem_gb: 80
work_root: /scratch/ssd004/scratch/${oc.env:USER}
timeout_min: 60
slurm:
partition: a100
gpus_per_node: null
additional_parameters: {}
12 changes: 6 additions & 6 deletions templates/configs/compute/bon_echo/a100_4x.yaml
@@ -2,12 +2,12 @@ cluster: bon_echo
nodes: 1
gpu_type: a100
gpus_per_node: 4
time_limit: "2:00:00"
timeout_min: 120
work_root: /scratch/ssd004/scratch/${oc.env:USER}
mem_gb: 320
cpus_per_task: 8
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 16
mem_gb: 320
work_root: /scratch/ssd004/scratch/${oc.env:USER}
timeout_min: 60
slurm:
partition: a100
gpus_per_node: null
additional_parameters: {}
12 changes: 6 additions & 6 deletions templates/configs/compute/bon_echo/a40_1x.yaml
@@ -2,12 +2,12 @@ cluster: bon_echo
nodes: 1
gpu_type: a40
gpus_per_node: 1
time_limit: "8:00:00"
timeout_min: 480
work_root: /scratch/ssd004/scratch/${oc.env:USER}
mem_gb: 16
cpus_per_task: 16
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 8
mem_gb: 40
work_root: /scratch/ssd004/scratch/${oc.env:USER}
timeout_min: 60
slurm:
partition: a40
gpus_per_node: null
additional_parameters: {}
12 changes: 6 additions & 6 deletions templates/configs/compute/bon_echo/a40_2x.yaml
@@ -2,12 +2,12 @@ cluster: bon_echo
nodes: 1
gpu_type: a40
gpus_per_node: 2
time_limit: "2:00:00"
timeout_min: 120
work_root: /scratch/ssd004/scratch/${oc.env:USER}
mem_gb: 64
cpus_per_task: 8
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 8
mem_gb: 80
work_root: /scratch/ssd004/scratch/${oc.env:USER}
timeout_min: 60
slurm:
partition: a40
gpus_per_node: null
additional_parameters: {}
13 changes: 13 additions & 0 deletions templates/configs/compute/bon_echo/a40_4x.yaml
@@ -0,0 +1,13 @@
cluster: bon_echo
nodes: 1
gpu_type: a40
gpus_per_node: 4
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 8
mem_gb: 160
work_root: /scratch/ssd004/scratch/${oc.env:USER}
timeout_min: 60
slurm:
partition: a40
additional_parameters: {}
7 changes: 3 additions & 4 deletions templates/configs/compute/bon_echo/cpu_1x.yaml
@@ -1,11 +1,10 @@
cluster: bon_echo
nodes: 1
gpus_per_node: 0
gres: null
cpus_per_task: 2
mem_gb: 8
work_root: /scratch/ssd004/scratch/${oc.env:USER}
time_limit: "0:15:00"
timeout_min: 15
gres: null
timeout_min: 60
slurm:
gpus_per_node: null
additional_parameters: {}
10 changes: 0 additions & 10 deletions templates/configs/compute/killarney/cpu_1x.yaml

This file was deleted.

12 changes: 6 additions & 6 deletions templates/configs/compute/killarney/h100_1x.yaml
@@ -2,11 +2,11 @@ cluster: killarney
nodes: 1
gpu_type: h100
gpus_per_node: 1
time_limit: "1:00:00"
timeout_min: 60
work_root: /scratch/${oc.env:USER}
mem_gb: 256
cpus_per_task: 24
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 6
mem_gb: 240
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
gpus_per_node: null
additional_parameters: {}
12 changes: 12 additions & 0 deletions templates/configs/compute/killarney/h100_2x.yaml
@@ -0,0 +1,12 @@
cluster: killarney
nodes: 1
gpu_type: h100
gpus_per_node: 2
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 6
mem_gb: 480
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
additional_parameters: {}
12 changes: 12 additions & 0 deletions templates/configs/compute/killarney/h100_4x.yaml
@@ -0,0 +1,12 @@
cluster: killarney
nodes: 1
gpu_type: h100
gpus_per_node: 4
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 6
mem_gb: 960
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
additional_parameters: {}
12 changes: 6 additions & 6 deletions templates/configs/compute/killarney/h100_8x.yaml
@@ -2,11 +2,11 @@ cluster: killarney
nodes: 1
gpu_type: h100
gpus_per_node: 8
time_limit: "1:00:00"
timeout_min: 60
work_root: /scratch/${oc.env:USER}
mem_gb: 2048
cpus_per_task: 96
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 6
mem_gb: 1920
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
gpus_per_node: null
additional_parameters: {}
10 changes: 5 additions & 5 deletions templates/configs/compute/killarney/l40s_1x.yaml
@@ -2,11 +2,11 @@ cluster: killarney
nodes: 1
gpu_type: l40s
gpus_per_node: 1
time_limit: "1:00:00"
timeout_min: 60
work_root: /scratch/${oc.env:USER}
mem_gb: 64
cpus_per_task: 32
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 16
mem_gb: 120
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
gpus_per_node: null
12 changes: 6 additions & 6 deletions templates/configs/compute/killarney/l40s_2x.yaml
@@ -2,11 +2,11 @@ cluster: killarney
nodes: 1
gpu_type: l40s
gpus_per_node: 2
time_limit: "1:00:00"
timeout_min: 60
work_root: /scratch/${oc.env:USER}
mem_gb: 128
cpus_per_task: 64
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 16
mem_gb: 240
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
gpus_per_node: null
additional_parameters: {}
12 changes: 12 additions & 0 deletions templates/configs/compute/killarney/l40s_4x.yaml
@@ -0,0 +1,12 @@
cluster: killarney
nodes: 1
gpu_type: l40s
gpus_per_node: 4
gres: gpu:${.gpu_type}:${.gpus_per_node}
tasks_per_node: ${.gpus_per_node}
cpus_per_task: 16
mem_gb: 480
work_root: /scratch/${oc.env:USER}
timeout_min: 60
slurm:
additional_parameters: {}
57 changes: 49 additions & 8 deletions templates/src/mlp/ddp/README.md
@@ -1,17 +1,58 @@
# MLP Distributed Data Parallel Template

*Data Parallelism* lets you split your data across multiple accelerators so that you can train your model faster!

Most of the time all your accelerators (GPUs) will be on the same machine (node), and that simplifies things. However, if you are using a large number of GPUs that can't fit on a single machine, you'll have to use multiple machines (nodes). For example, on the Killarney cluster, L40S GPUs have a maximum of 4 per node and H100s have a maximum of 8 per node. Data Parallelism across multiple nodes is referred to as *Distributed Data Parallelism* (DDP). By default, DDP works in both single-node and multi-node settings.

This example implements a simple MLP using DDP.

## DDP Background

**World Size:** The total number of GPUs across all nodes

**Rank:** Integer ID for a single GPU. Unique across all nodes. (from `0` to `world_size - 1`)

**Local Rank:** Integer ID for a single GPU. Unique only within a node. (from `0` to `num_gpus_per_node - 1`)
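
As a quick, purely illustrative example (the numbers below are assumptions, not anything fixed by the templates): with 2 nodes of 4 GPUs each, the world size is 8, local ranks run from 0 to 3 on every node, and the global rank follows from the node index and the local rank.

```python
# Illustrative only: 2 nodes with 4 GPUs each (assumed numbers, not from the configs)
nodes = 2
gpus_per_node = 4

world_size = nodes * gpus_per_node        # 8 processes in total
local_ranks = list(range(gpus_per_node))  # 0..3 on every node

# Global rank of the GPU with local rank 2 on the second node (node index 1)
node_index, local_rank = 1, 2
rank = node_index * gpus_per_node + local_rank  # -> 6
```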

## DDP Setup

Unlike `torchrun`, Submitit is a **job scheduler integration**, not a distributed orchestrator.
It spawns one process per GPU (more precisely, one process per task, as set by `tasks_per_node`), but it does **not automatically set** the PyTorch environment variables (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, `MASTER_PORT`) required by `torch.distributed`.

However, Submitit automatically determines the distributed context (each task’s **global rank**, **local rank**, **world size**, and **hostnames**).
You don’t manually assign local ranks; you retrieve them from `submitit.JobEnvironment()` and use them to initialize PyTorch DDP:

```python
job_env = submitit.JobEnvironment()
rank = job_env.global_rank
local_rank = job_env.local_rank
world_size = job_env.num_tasks
```

Once you retrieve these values, export them as environment variables and call:

```python
torch.distributed.init_process_group(init_method="env://", backend="nccl")
```
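
Putting the two steps together, a minimal sketch might look like the following (the choice of `MASTER_PORT` and the use of the first hostname as `MASTER_ADDR` are conventions assumed here, not something Submitit enforces):

```python
import os

import submitit
import torch.distributed as dist

job_env = submitit.JobEnvironment()

# Export the variables torch.distributed expects when init_method="env://"
os.environ["RANK"] = str(job_env.global_rank)
os.environ["LOCAL_RANK"] = str(job_env.local_rank)
os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
os.environ["MASTER_ADDR"] = job_env.hostnames[0]  # first node acts as the rendezvous host
os.environ["MASTER_PORT"] = "29500"               # any free port shared by all ranks

dist.init_process_group(init_method="env://", backend="nccl")
```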

This pattern is the standard way to perform DDP initialization with Submitit when not using `torchrun`
([MosaicML Docs](https://docs.mosaicml.com/projects/composer/en/stable/examples/training_with_submitit.html),
[Hydra Submitit Launcher](https://hydra.cc/docs/plugins/submitit_launcher/),
[PyTorch Forum Discussion](https://discuss.pytorch.org/t/using-submitit-for-distributed-training/121881),
[Fairseq Example](https://github.com/facebookresearch/fairseq/blob/main/examples/language_model/submitit_train.py)).

Submitit also provides an optional helper class, `submitit.helpers.TorchDistributedEnvironment`, which wraps `JobEnvironment`.
It automatically exports the standard PyTorch environment variables (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT`) so that you can initialize DDP with `init_method="env://"` directly. Think of it as a convenience layer built on top of `JobEnvironment`. `JobEnvironment` also exposes extra metadata like `hostnames` and `hostname`, which can be helpful for advanced or custom multi-node configurations.

For a minimal example that uses `submitit.helpers.TorchDistributedEnvironment()` together with
`torch.distributed.init_process_group(init_method="env://")`, see the official Submitit example
[`docs/examples/torch_distributed.py`](https://github.com/facebookincubator/submitit/blob/main/docs/examples/torch_distributed.py).
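
For orientation, the helper-based initialization from that example boils down to roughly this (a sketch, not a drop-in replacement for the template's own setup):

```python
import submitit
import torch.distributed as dist

# export() publishes RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT
# to the environment, so the plain env:// init below can pick them up.
dist_env = submitit.helpers.TorchDistributedEnvironment().export()
dist.init_process_group(init_method="env://", backend="nccl")
print(f"rank {dist_env.rank} of {dist_env.world_size} initialized")
```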


### Logging in DDP (Hydra + Submitit)

To avoid duplicated lines in the global Hydra log, we log with `logger` only on **rank 0**.
For per-rank visibility, use `print()` on non-zero ranks. Those messages appear only in that rank’s stdout (Submitit/Slurm per-task output).

- `logger.info(...)` (rank 0): goes to the single, global Hydra log for the run.
- `print(...)` (ranks > 0): stays in the rank-local stdout, not in the global Hydra log.
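
A minimal sketch of that convention (the `rank` value is assumed to come from `submitit.JobEnvironment()` as above, and `logger` is the module logger that Hydra configures):

```python
import logging

logger = logging.getLogger(__name__)  # Hydra attaches the run's handlers to this


def log_startup(rank: int, world_size: int) -> None:
    if rank == 0:
        # Only rank 0 writes to the single, global Hydra log
        logger.info("Starting DDP training with world_size=%d", world_size)
    else:
        # Other ranks only print to their own Submitit/Slurm stdout
        print(f"[rank {rank}] worker ready", flush=True)
```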