From c3b9ca8dd83c17633c8f9814daa2add13602af2c Mon Sep 17 00:00:00 2001 From: Shawn Carere Date: Fri, 26 Sep 2025 16:40:29 -0400 Subject: [PATCH 1/4] Edits to getting started, WIP edits on template instructions --- README.md | 2 +- .../introduction-to-vector-compute/README.md | 16 ++++++-- templates/README.md | 37 ++++++++++++++----- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f96d3fc..2903e79 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ See [getting-started](./getting-started) for documentation on using Vector compu ## Templates -See [templates](./templates) for training templates with Hydra + Submitit. +See [templates](./templates) for training templates that use Hydra + Submitit to structure experiments. - Code lives under: [templates/src](./templates/src) - Cluster configs live under: [templates/configs](./templates/configs) diff --git a/getting-started/introduction-to-vector-compute/README.md b/getting-started/introduction-to-vector-compute/README.md index c142f0f..be2ef67 100644 --- a/getting-started/introduction-to-vector-compute/README.md +++ b/getting-started/introduction-to-vector-compute/README.md @@ -64,6 +64,7 @@ ssh-ed25519 AAAA5AA7OZOZ7NRB1acK54bB47h58N6AIEX4zDziR1r0nM41d3NCG0fgCArjUD45pr13 Next, open the SSH Keys page in your Alliance account: [https://ccdb.alliancecan.ca/ssh_authorized_keys](https://ccdb.alliancecan.ca/ssh_authorized_keys). Paste your key into the SSH Key field, give it a name (typically the host name of the computer where you generated it) and hit Add Key. +**NOTE:** You may need to wait up to 30 minutes after adding your ssh key for it to work when trying to login via ssh. Have lunch and come back. ## SSH Access @@ -127,6 +128,7 @@ In addition to your home directory, you have a minimum of additional 250 GB scra A detailed description of the scratch purging policy is available on the Alliance Canada website: [https://docs.alliancecan.ca/wiki/Scratch_purging_policy](https://docs.alliancecan.ca/wiki/Scratch_purging_policy) +Your scratch space directory will not exist when you initially log in. To have it set up send a request to [ops-help@vectorinstitute.ai](mailto:ops-help@vectorinstitute.ai). Include the name of your PI in the email. ## Shared projects @@ -143,7 +145,7 @@ Instead of copying these datasets on your home directory, you can create a symli ``` -ln -s /dataset/PATH_TO_DATASET ~/PATH_OF_LINK # path of link can be some place in your home directory so that PyTorch/TF can pick up the dataset to these already downloaded directories. +ln -s /datasets/PATH_TO_DATASET ~/PATH_OF_LINK # path of link can be some place in your home directory so that PyTorch/TF can pick up the dataset to these already downloaded directories. ``` @@ -162,6 +164,8 @@ Unlike the legacy Bon Echo (Vaughan) cluster, there is no dedicated checkpoint s # Migration from legacy Vaughan (Bon Echo) Cluster +**NOTE:** The approach for migrating detailed here requires that you set up a second ssh key on killarney. Your public ssh key on the vaughan cluster will be different than the one on your local machine. + The easiest way to migrate data from the legacy Vaughan (Bon Echo) Cluster to Killarney is by using a file transfer command (likely `rsync` or `scp`) from an SSH session. 
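For example, a push from a Bon Echo login node into your Killarney scratch space might look like the following; the username, hostname, and destination path are illustrative and should be adapted to your own accounts:

```bash
# Illustrative only: replace USERNAME and the paths with your own values.
rsync -avP ~/projects/my_experiment USERNAME@killarney.alliancecan.ca:/scratch/USERNAME/
```
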
Start by connecting via https://support.vectorinstitute.ai/Killarney?action=AttachFile&do=view&target=User+Guide+to+Killarney+for+Vector+Researchers.pdfsh into the legacy Bon Echo (Vaughan) cluster: @@ -377,6 +381,8 @@ gpubase_l40s_b3 32/32/0/64 gpu:l40s:4(IDX:0-3) gpu:l40s:4 [...] ``` +For CPU's, A/I/OT stands for **A**llocated, **I**dle, **O**ther (eg. down) and **T**otal. Even if the GPU's on a node are available, if there are no Idle CPU's on the node then you won't be able to use it. + ## Jupyter notebooks To run a Jupyter environment from the cluster, you can request an interactive session and start a Jupyter notebook from there. @@ -430,6 +436,7 @@ You will need a VPN connection to access this notebook. Once you are connected t # Software Environments +## Pre-installed Environments The cluster comes with preinstalled software environments called **modules**. These will allow you to access many different versions of Python, VS Code Server, RStudio Server, NodeJS and many others. To see the available preinstalled environments, run: @@ -444,7 +451,8 @@ To use an environment, use `module load`. For example, if you need to use Python module load python/3.10.12 ``` -If there isn't a preinstalled environment for your needs, you can use Poetry or python-venv. Here is a quick example of how to use python venv. +## Custom Environments +If there isn't a preinstalled environment for your needs, you can use [uv](https://docs.astral.sh/uv/), or python-venv. For ongoing projects it is highly recommended to use uv to manage dependencies. To just run something quickly one time, python-venv might be easier. Here is a quick example of how to use python venv. In the login node run the following: @@ -498,13 +506,15 @@ gpubase_l40s_b5 up 7-00:00:00 17/0/0/17 kn[085-101] ## Automatic Restarts +**NOTE:** There is currently no premption on the Killarney cluster + All jobs in our Slurm cluster have a time limit, after which they will get stopped. For longer running jobs which need more than a few hours, the [Vaughan Slurm Changes](https://support.vectorinstitute.ai/Computing?action=AttachFile&do=view&target=Vector+Vaughan+HPC+Changes+FAQ+2023.pdf) document describes how to automatically restart these. ## Checkpoints In order to avoid losing your work when your job exits, you will need to implement checkpoints - periodic snapshots of your work that you load from so you can stop and resume without much lost work. -On the legacy Bon Echo cluster, there was a dedicated checkpoint space in the file system for checkpoints. **⚠️ In Killarney, there is no dedicated checkpoint space.** Users are expected to manage their own checkpoints under their `$SCRATCH` folder. +On the legacy Bon Echo cluster, there was a dedicated checkpoint space in the file system for checkpoints. **⚠️ In Killarney, there is no dedicated checkpoint space.** Users are expected to manage their own checkpoints under their `$SCRATCH` folder. Recall that your scratch folder is not permanent, and so you'll want to move any important checkpoints to you're home or project folder. # Useful Links and Resources diff --git a/templates/README.md b/templates/README.md index b5723b3..8cc1ca3 100644 --- a/templates/README.md +++ b/templates/README.md @@ -2,6 +2,12 @@ Templates for training ML models workflows on Bon Echo and Killarney clusters using Hydra and Submitit. +[Hydra](https://hydra.cc/docs/intro/) is a python framework for creating configurable experiments that you can change through a config file. 
One of it's main uses is its ability to automatically perform hyperparameter sweeps for model training. + +[submitit](https://github.com/facebookincubator/submitit) is a simple python package that lets you submit slurm jobs programmatically and automatically access and manipulate the results of those jobs once they are complete. It also handles automatic requeing of jobs should they be inturrupted for some reason. + +Hydra conveniently has a submitit plugin that allows them to work together. Put simply, using these tools you can automatically queue up a large number of experiments, run dependent experiments sequentially, requeue long running experiments and more. + ## Layout ``` @@ -17,22 +23,30 @@ templates/ Each template directory is self-contained: it has a `launch.py`, a `train.py`, and a `config.yaml`. The `configs/` directory defines Slurm presets and shared Hydra + Submitit settings. -Hydra starts from `configs/_global.yaml`, pulls in the appropriate entries from `configs/user.yaml` and `configs/compute/*`, then merges the template's own `config.yaml` before forwarding the resolved configuration to Submitit; CLI overrides (e.g. `compute=killarney/h100_1x`) are applied in that final merge, so every launch script receives a single, fully-specified config that Submitit uses to submit or run locally. +Hydra starts from `configs/_global.yaml` and pulls in the appropriate entries from `configs/user.yaml` and `configs/compute/*`. The launch script within each template then merges the template's own local `config.yaml` before forwarding the resolved configuration to Submitit; CLI overrides (e.g. `compute=killarney/h100_1x`) are applied in that final merge, so every launch script receives a single, fully-specified config that Submitit uses to submit or run locally. + +The `_global.yaml` config contains the bulk of the autoconfiguration. Placeholders are used to automatically fill values with values from other configuration files. `hydra.launcher` arguments largely align with the CLI arguments available for the [sbatch](https://slurm.schedmd.com/sbatch.html) command. See [this](https://hydra.cc/docs/plugins/submitit_launcher/) page for the officialy available hydra slurm launcher parameters. Note that the majority of the parameters are sourced from the selected `compute` config. ## Local Setup -1) Create and activate a virtual environment: +1) Install [uv](https://docs.astral.sh/uv/getting-started/installation/) ```bash -uv venv .venv -source .venv/bin/activate +curl -LsSf https://astral.sh/uv/install.sh | sh ``` -2) Resolve and install dependencies from `pyproject.toml`: +2) Clone the vec-playbook repository ```bash -uv lock -uv sync +git clone https://github.com/VectorInstitute/vec-playbook.git ``` +3) Resolve and install dependencies from `pyproject.toml` into a virtual environment: +```bash +cd path/to/vec-playbook +uv sync # Automatically installs dependencies in vec-playbook/.venv +``` + +Finally, ensure you're working directory (by default your cluster scratch space) exists and that you have access to the resources you're requesting on the cluster. + ## Cluster Setup 1) Provide your user Slurm account and any optional parameters in `templates/configs/user.yaml`. @@ -44,17 +58,19 @@ user: # additional_parameters: # qos: m2 # example Bon Echo QoS ``` +**NOTE:** why is qos used as example of additional parameter here when it is an official launcher parameter that seems to be sourced from compute config? -Uncomment and edit `additional_parameters` entries as needed. 
Use CLI overrides for alternate accounts or QoS when launching jobs, for example `... user.slurm.account=ACCOUNT_B user.slurm.additional_parameters.qos=fast`. +Uncomment and edit `additional_parameters` entries as needed. This field is solely for sbatch arguments not already available in the [Hydra Submitit Slurm Launcher Plugin](https://hydra.cc/docs/plugins/submitit_launcher/). Use CLI overrides for alternate accounts or QoS when launching jobs, for example `... user.slurm.account=ACCOUNT_B user.slurm.additional_parameters.qos=fast`. -2) Pick a compute preset: +2) Pick a compute preset to use in the next section: - `templates/configs/compute/bon_echo/*` (A40, A100) - `templates/configs/compute/killarney/*` (L40S, H100) - Create your own preset under `templates/configs/compute/` if you need different resources (match the YAML shape used in the existing files). ## Running Templates -All launchers follow the same pattern: use `uv run python -m .launch` with Hydra overrides that select compute presets, requeue behaviour, and any template-specific hyperparameters. +All launchers follow the same pattern: use `uv run python -m .launch` with Hydra overrides that select compute presets, requeue behaviour, and any template-specific hyperparameters. uv will automatically detect the virtual environment located in `.venv` of your CWD. + ### Command Pattern ```bash @@ -65,6 +81,7 @@ uv run python -m .launch \ --multirun ``` +- `` refers to the path to the template you would like to run (within) - `compute=/` chooses the Slurm resources defined under `templates/configs/compute/` (or a custom preset you add). - `requeue=` toggles the Submitit requeue flag described in the checkpointing section. - Additional Hydra overrides use `key=value` syntax; nested keys follow the YAML structure (e.g., `trainer.learning_rate=5e-4`). From 957068d9a0b6a7f04aaeeeb5637042e449a4c2c5 Mon Sep 17 00:00:00 2001 From: Shawn Carere Date: Mon, 29 Sep 2025 11:18:53 -0400 Subject: [PATCH 2/4] edits and comments for template instructions --- templates/README.md | 94 ++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/templates/README.md b/templates/README.md index 8cc1ca3..a45aba9 100644 --- a/templates/README.md +++ b/templates/README.md @@ -6,7 +6,31 @@ Templates for training ML models workflows on Bon Echo and Killarney clusters us [submitit](https://github.com/facebookincubator/submitit) is a simple python package that lets you submit slurm jobs programmatically and automatically access and manipulate the results of those jobs once they are complete. It also handles automatic requeing of jobs should they be inturrupted for some reason. -Hydra conveniently has a submitit plugin that allows them to work together. Put simply, using these tools you can automatically queue up a large number of experiments, run dependent experiments sequentially, requeue long running experiments and more. +Hydra conveniently has a submitit plugin that allows them to work together. Put simply, using these tools you can automatically queue up a large number of experiments, run dependent experiments sequentially, requeue long running experiments and more. 
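
If you have not used Submitit on its own before, the following minimal sketch shows the core idea of programmatic job submission; the partition name and resources are placeholders, and the templates in this directory wrap this machinery for you:

```python
import submitit


def train(lr: float) -> float:
    # Placeholder training function; the templates use full Trainer classes instead.
    return lr * 2


# Submission scripts, logs and pickled results are written under this folder.
executor = submitit.AutoExecutor(folder="submitit_logs")
executor.update_parameters(timeout_min=60, slurm_partition="gpu", gpus_per_node=1)

job = executor.submit(train, 1e-3)  # submits an sbatch job
print(job.result())                 # waits for the job and returns the function's return value
```
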
+ +## Local Setup + +1) Install [uv](https://docs.astral.sh/uv/getting-started/installation/) +```bash +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + +2) Clone the vec-playbook repository +```bash +git clone https://github.com/VectorInstitute/vec-playbook.git +``` + +3) Resolve and install dependencies from `pyproject.toml` into a virtual environment: +```bash +cd path/to/vec-playbook +uv sync # Automatically installs dependencies in vec-playbook/.venv +``` + +Finally, ensure you're working directory (by default your cluster scratch space) exists and that you have access to the resources you're requesting on the cluster. + +### UV Tip for Killarney + +If you're on killarney you'll have to clone the repository into your scratch space. You can't run files stored in your home directory. The UV cache by default is located in your home directory which is a different filesystem. This breaks uv's default method of hardlinking packages to avoid having to redownload packages. You can either change your cache directory to be on the same filesystem or use `--link-mode=copy`. Avoid using symlink mode as this can break things. ## Layout @@ -27,29 +51,10 @@ Hydra starts from `configs/_global.yaml` and pulls in the appropriate entries fr The `_global.yaml` config contains the bulk of the autoconfiguration. Placeholders are used to automatically fill values with values from other configuration files. `hydra.launcher` arguments largely align with the CLI arguments available for the [sbatch](https://slurm.schedmd.com/sbatch.html) command. See [this](https://hydra.cc/docs/plugins/submitit_launcher/) page for the officialy available hydra slurm launcher parameters. Note that the majority of the parameters are sourced from the selected `compute` config. -## Local Setup - -1) Install [uv](https://docs.astral.sh/uv/getting-started/installation/) -```bash -curl -LsSf https://astral.sh/uv/install.sh | sh -``` - -2) Clone the vec-playbook repository -```bash -git clone https://github.com/VectorInstitute/vec-playbook.git -``` - -3) Resolve and install dependencies from `pyproject.toml` into a virtual environment: -```bash -cd path/to/vec-playbook -uv sync # Automatically installs dependencies in vec-playbook/.venv -``` - -Finally, ensure you're working directory (by default your cluster scratch space) exists and that you have access to the resources you're requesting on the cluster. ## Cluster Setup -1) Provide your user Slurm account and any optional parameters in `templates/configs/user.yaml`. +1) Provide your Slurm user account and any optional parameters in `templates/configs/user.yaml`. ```yaml user: @@ -69,7 +74,7 @@ Uncomment and edit `additional_parameters` entries as needed. This field is sole ## Running Templates -All launchers follow the same pattern: use `uv run python -m .launch` with Hydra overrides that select compute presets, requeue behaviour, and any template-specific hyperparameters. uv will automatically detect the virtual environment located in `.venv` of your CWD. +All launchers follow the same pattern: use `uv run python -m .launch` with Hydra overrides that select compute presets, requeue behaviour, and any template-specific hyperparameters. uv will automatically detect the virtual environment located in `.venv` of your CWD. The templates are automatically loaded as python modules by `uv`. If you add your own template you will have to sync the virtual environment using `uv sync`. 
### Command Pattern @@ -81,12 +86,14 @@ uv run python -m .launch \ --multirun ``` -- `` refers to the path to the template you would like to run (within) -- `compute=/` chooses the Slurm resources defined under `templates/configs/compute/` (or a custom preset you add). -- `requeue=` toggles the Submitit requeue flag described in the checkpointing section. +- ``: The module path to the template launch script (eg. `mlp.single`) +- `compute=/`: chooses the Slurm resources defined under `templates/configs/compute/` (or a custom preset you add). +- `requeue=`: toggles the Submitit requeue flag described in the checkpointing section. - Additional Hydra overrides use `key=value` syntax; nested keys follow the YAML structure (e.g., `trainer.learning_rate=5e-4`). -- Use of `--multirun` is required for the launcher to be picked up.. -- Prepend `+` to introduce new keys at runtime, like `+trainer.notes=baseline_a`. +- Prepend `+` to introduce new keys (not already present in config) at runtime, like `+trainer.notes=baseline_a`. +- Use of `--multirun` is required for the launcher to be picked up. + +[//]: <> (What does "picked up" mean when explaining --multirun flag?) ### Examples (single parameter set) @@ -104,7 +111,19 @@ uv run python -m llm.text_classification.launch \ --multirun ``` +Your output should look something like this: +``` +[2025-09-29 11:06:00,546][HYDRA] Submitit 'slurm' sweep output dir : /scratch/$USER/vec_jobs/20250929-110600 +[2025-09-29 11:06:00,546][HYDRA] #0 : compute=killarney/l40s_1x +``` + +[//]: <> (Why does learning_rate need the + prepended if its already in local config?) +[//]: <> (Perhaps a little more clarity on this) +[//]: <> (`+trainer.num_epochs=100` override did not work for mlp.single) +[//]: <> (multirun.yaml is long and confusing and still contains placeholders. Is there a way to save the final static config yaml?) + Hydra blocks until the job finishes (or fails). For long or interactive sessions, wrap the command in `tmux`, `screen`, or submit a wrapper script as shown below. + ### Practical Patterns for Long Jobs ```bash @@ -122,11 +141,13 @@ uv run python -m llm.text_classification.launch compute=bon_echo/a40_1x --multir Hydra sweeps expand comma-separated value lists into Cartesian products and schedule each configuration as a separate Submitit job. Output directories are numbered based on Hydra's sweep index. +[//]: <> (Sweep seems to work, but checkpoints overwrite eachother i'm assuming? 
Hydra does not create subdirectories in outputs for sweep.l) + ```bash # Sweep learning rate and hidden size for the MLP template uv run python -m mlp.single.launch \ - +trainer.learning_rate=1e-2,1e-3,1e-4 \ - +trainer.hidden_dim=64,128,256 \ + +trainer.learning_rate=1e-2,1e-3 \ + +trainer.hidden_dim=64,128 \ compute=bon_echo/a40_1x \ --multirun @@ -138,13 +159,23 @@ uv run python -m vlm.image_captioning.launch \ --multirun ``` +Your output for a sweep should look something like this: + +``` +[2025-09-29 11:06:00,546][HYDRA] Submitit 'slurm' sweep output dir : /scratch/$USER/vec_jobs/20250929-110600 +[2025-09-29 11:06:00,546][HYDRA] #0 : +trainer.learning_rate=0.01 +trainer.hidden_dim=64 compute=killarney/l40s_1x +[2025-09-29 11:06:00,546][HYDRA] #1 : +trainer.learning_rate=0.01 +trainer.hidden_dim=128 compute=killarney/l40s_1x +[2025-09-29 11:06:00,546][HYDRA] #2 : +trainer.learning_rate=0.001 +trainer.hidden_dim=64 compute=killarney/l40s_1x +[2025-09-29 11:06:00,546][HYDRA] #3 : +trainer.learning_rate=0.001 +trainer.hidden_dim=128 compute=killarney/l40s_1x +``` + ### Monitoring Jobs By default, Hydra and Submitit create the working directory at `~/vec_jobs/` (see `configs/_global.yaml`). Override it when needed with flags such as `paths.work_root=/scratch/$USER` or `work_dir=/scratch/$USER/vec_jobs/${experiment_name}`. ```bash # Check SLURM job status -squeue -u $USER +squeue --me # Inspect the latest work directory ls -1t ~/vec_jobs | head @@ -154,7 +185,7 @@ tail -f ~/vec_jobs/YYYYMMDD-HHMMSS/.submitit/*/stdout* ``` ## Checkpointing & Requeue -Checkpointing lets Submitit resubmit interrupted jobs (preemption, timeout, manual `scontrol requeue`) without restarting from scratch. The templates already subclass `submitit.helpers.Checkpointable`, so they ship with a default `checkpoint()` that returns `DelayedSubmission(self, *args, **kwargs)`. You simply need to persist enough training state to continue where you left off. +Checkpointing lets Submitit resubmit interrupted jobs (preemption, timeout, manual `scontrol requeue`) without restarting from scratch. The templates already subclass `submitit.helpers.Checkpointable`, so they ship with a default `checkpoint()` that returns `DelayedSubmission(self, *args, **kwargs)`. You simply need to persist enough training state to continue where you left off. See [mlp.single.train](src/mlp/single/train.py) for an example of a basic checkpointing implementation. Submitit’s official [checkpointing guide](https://github.com/facebookincubator/submitit/blob/main/docs/checkpointing.md) covers how the `checkpoint()` hook works under the hood and provides additional patterns (e.g., swapping callables, partial pickling) if you need more control. @@ -168,7 +199,6 @@ Submitit’s official [checkpointing guide](https://github.com/facebookincubator 3. Ensure your `checkpoint()` method returns a `DelayedSubmission` that recreates the callable with the same arguments. If you need custom behaviour (changing hyperparameters, skipping corrupt steps), instantiate a new callable and pass it to `DelayedSubmission` instead of `self`. 4. Test the flow by requeueing a running job (`scancel --signal=USR1 ` or Submitit's `job._interrupt(timeout=True)`) to confirm state is restored as expected. 
- ## Resources - Submitit: https://github.com/facebookincubator/submitit - Hydra Submitit launcher: https://hydra.cc/docs/plugins/submitit_launcher From 7b1482320d921bdee2fc39e59205b729c84b8cba Mon Sep 17 00:00:00 2001 From: Shawn Carere Date: Thu, 2 Oct 2025 10:16:06 -0400 Subject: [PATCH 3/4] addressed some comments --- templates/README.md | 14 +++++++------- templates/configs/user.yaml | 2 -- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/templates/README.md b/templates/README.md index a45aba9..dd003cf 100644 --- a/templates/README.md +++ b/templates/README.md @@ -63,7 +63,8 @@ user: # additional_parameters: # qos: m2 # example Bon Echo QoS ``` -**NOTE:** why is qos used as example of additional parameter here when it is an official launcher parameter that seems to be sourced from compute config? + +[//]: <> (why is qos used as example of additional parameter here when it is an official launcher parameter that seems to be sourced from compute config?) Uncomment and edit `additional_parameters` entries as needed. This field is solely for sbatch arguments not already available in the [Hydra Submitit Slurm Launcher Plugin](https://hydra.cc/docs/plugins/submitit_launcher/). Use CLI overrides for alternate accounts or QoS when launching jobs, for example `... user.slurm.account=ACCOUNT_B user.slurm.additional_parameters.qos=fast`. @@ -82,18 +83,17 @@ All launchers follow the same pattern: use `uv run python -m .launch` w uv run python -m .launch \ compute=/ \ requeue= \ - \ + \ + + \ --multirun ``` - ``: The module path to the template launch script (eg. `mlp.single`) - `compute=/`: chooses the Slurm resources defined under `templates/configs/compute/` (or a custom preset you add). - `requeue=`: toggles the Submitit requeue flag described in the checkpointing section. -- Additional Hydra overrides use `key=value` syntax; nested keys follow the YAML structure (e.g., `trainer.learning_rate=5e-4`). -- Prepend `+` to introduce new keys (not already present in config) at runtime, like `+trainer.notes=baseline_a`. -- Use of `--multirun` is required for the launcher to be picked up. - -[//]: <> (What does "picked up" mean when explaining --multirun flag?) +- Additional Hydra overrides use `key=value` syntax; nested keys follow the YAML structure (e.g., `compute.mem_gb=32`). +- Keys not already present in `_global.yaml` or it's dependencies (`user.yaml`, compute yamls) must be prepended with a `+`. This includes new keys as well as those merged in later such as the templates local `config.yaml` (eg. `trainer.learnin_rate`). +- Use of `--multirun` is required to use the submitit slurm launcher, even if you are only performing a single run. Otherwise the model will attempt to train locally. 
### Examples (single parameter set) diff --git a/templates/configs/user.yaml b/templates/configs/user.yaml index 756bb42..13e0cfc 100644 --- a/templates/configs/user.yaml +++ b/templates/configs/user.yaml @@ -1,5 +1,3 @@ user: slurm: account: vector - additional_parameters: - qos: m2 From bfd4fa8b40a96840edbb3ded666f0ac637ecbfca Mon Sep 17 00:00:00 2001 From: Shawn Carere Date: Thu, 2 Oct 2025 20:45:20 -0400 Subject: [PATCH 4/4] hot-fixed templates and updated docs --- pyproject.toml | 5 +- templates/README.md | 85 +++++++++++++------ templates/configs/__init__.py | 0 templates/configs/_global.yaml | 11 ++- .../starters/llm_text_classification.yaml | 4 - templates/src/__init__.py | 9 -- .../src/llm/text_classification/config.yaml | 8 ++ .../src/llm/text_classification/launch.py | 29 ++++--- .../src/llm/text_classification/train.py | 36 +++++--- templates/src/mlp/README.md | 2 +- templates/src/mlp/ddp/config.yaml | 8 ++ templates/src/mlp/ddp/launch.py | 30 ++++--- templates/src/mlp/ddp/train.py | 50 ++++++----- templates/src/mlp/single/config.yaml | 9 ++ templates/src/mlp/single/launch.py | 31 ++++--- templates/src/mlp/single/train.py | 45 ++++++---- .../mlp/single_not_checkpointable/config.yaml | 12 ++- .../mlp/single_not_checkpointable/launch.py | 32 +++---- .../mlp/single_not_checkpointable/train.py | 34 +++++--- templates/src/vlm/README.md | 2 +- .../src/vlm/image_captioning/config.yaml | 8 ++ templates/src/vlm/image_captioning/launch.py | 29 ++++--- templates/src/vlm/image_captioning/train.py | 46 +++++----- 23 files changed, 329 insertions(+), 196 deletions(-) create mode 100644 templates/configs/__init__.py delete mode 100644 templates/configs/starters/llm_text_classification.yaml delete mode 100644 templates/src/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 36d8be9..9458b61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,9 +3,12 @@ requires = ["setuptools>=65", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -where = ["templates/src"] +where = ["templates", "templates/src"] # Include configs and templates as packages include = ["*"] +[tool.setuptools.package-data] +"configs" = ["**/*.yaml"] # Make sure configs package includes the yaml configs + [project] name = "vec-playbook" version = "0.1.0" diff --git a/templates/README.md b/templates/README.md index dd003cf..0837c91 100644 --- a/templates/README.md +++ b/templates/README.md @@ -44,10 +44,12 @@ templates/ └── configs/ # Hydra + Submitit configs ``` -Each template directory is self-contained: it has a `launch.py`, a `train.py`, and a `config.yaml`. -The `configs/` directory defines Slurm presets and shared Hydra + Submitit settings. +Each template directory contains a `launch.py`, a `train.py`, and a `config.yaml`. +The `configs/` directory defines Slurm presets and shared Hydra + Submitit settings. -Hydra starts from `configs/_global.yaml` and pulls in the appropriate entries from `configs/user.yaml` and `configs/compute/*`. The launch script within each template then merges the template's own local `config.yaml` before forwarding the resolved configuration to Submitit; CLI overrides (e.g. `compute=killarney/h100_1x`) are applied in that final merge, so every launch script receives a single, fully-specified config that Submitit uses to submit or run locally. +The launch script contains the `hydra.main` decorator which points hydra to the templates local `config.yaml`. 
This `config.yaml` imports the `_global` config from the `configs/` directory, which in turn imports other preset configs. + +Most templates import the `_global.yaml` config from the `configs/` directory as a base experimental setup, and are therefore dependent on it's settings. The global config in turn imports other preset configs such as the `user.yaml` config and the compute configs. Modifying the `_global.yaml` file may break some of the other templates. Therefore be careful making changes to `_global.yaml` settings, if the settings do not need to be globally applied to all templates consider including them in the local config instead. Hydra takes the starting local config, populates it with the additional fields from all its dependencies, and provides that to submitit to launch a job. Submitit executes the function decorated by `hydra.main` as a slurm job. The fully specified config that was provided to slurm is passed to that function as an argument. The `_global.yaml` config contains the bulk of the autoconfiguration. Placeholders are used to automatically fill values with values from other configuration files. `hydra.launcher` arguments largely align with the CLI arguments available for the [sbatch](https://slurm.schedmd.com/sbatch.html) command. See [this](https://hydra.cc/docs/plugins/submitit_launcher/) page for the officialy available hydra slurm launcher parameters. Note that the majority of the parameters are sourced from the selected `compute` config. @@ -64,10 +66,10 @@ user: # qos: m2 # example Bon Echo QoS ``` -[//]: <> (why is qos used as example of additional parameter here when it is an official launcher parameter that seems to be sourced from compute config?) - Uncomment and edit `additional_parameters` entries as needed. This field is solely for sbatch arguments not already available in the [Hydra Submitit Slurm Launcher Plugin](https://hydra.cc/docs/plugins/submitit_launcher/). Use CLI overrides for alternate accounts or QoS when launching jobs, for example `... user.slurm.account=ACCOUNT_B user.slurm.additional_parameters.qos=fast`. +[//]: <> (Will specifying qos as an additional parameter overwrite the qos in compute setting?) + 2) Pick a compute preset to use in the next section: - `templates/configs/compute/bon_echo/*` (A40, A100) - `templates/configs/compute/killarney/*` (L40S, H100) @@ -75,7 +77,7 @@ Uncomment and edit `additional_parameters` entries as needed. This field is sole ## Running Templates -All launchers follow the same pattern: use `uv run python -m .launch` with Hydra overrides that select compute presets, requeue behaviour, and any template-specific hyperparameters. uv will automatically detect the virtual environment located in `.venv` of your CWD. The templates are automatically loaded as python modules by `uv`. If you add your own template you will have to sync the virtual environment using `uv sync`. +All launchers follow the same pattern: use `uv run python -m .launch` with Hydra overrides that select compute presets, requeue behaviour, and any template-specific hyperparameters. uv will automatically detect the virtual environment located in `.venv` of your CWD. The templates are automatically loaded as python modules by `uv`. If you add your own template you will have to sync the virtual environment using `uv sync`. 
### Command Pattern @@ -83,17 +85,17 @@ All launchers follow the same pattern: use `uv run python -m .launch` w uv run python -m .launch \ compute=/ \ requeue= \ - \ - + \ + \ + \ --multirun ``` - ``: The module path to the template launch script (eg. `mlp.single`) - `compute=/`: chooses the Slurm resources defined under `templates/configs/compute/` (or a custom preset you add). - `requeue=`: toggles the Submitit requeue flag described in the checkpointing section. -- Additional Hydra overrides use `key=value` syntax; nested keys follow the YAML structure (e.g., `compute.mem_gb=32`). -- Keys not already present in `_global.yaml` or it's dependencies (`user.yaml`, compute yamls) must be prepended with a `+`. This includes new keys as well as those merged in later such as the templates local `config.yaml` (eg. `trainer.learnin_rate`). -- Use of `--multirun` is required to use the submitit slurm launcher, even if you are only performing a single run. Otherwise the model will attempt to train locally. +- Additional config overrides use `key=value` syntax; nested keys follow the YAML structure (e.g., `compute.mem_gb=32`). +- Keys not already present in the local `config.yaml` or it's dependencies (`_global.yaml`, `user.yaml`, compute yamls) must be prepended with a `+`. This denotes the key as being new rather than an override. +- Use of `--multirun` is required to use the submitit slurm launcher, even if you are only performing a single run. Otherwise the model will attempt to train locally on your login node. ### Examples (single parameter set) @@ -107,7 +109,7 @@ uv run python -m mlp.single.launch compute=killarney/l40s_1x requeue=off --multi # Fine-tune a text classifier template with custom learning rate uv run python -m llm.text_classification.launch \ compute=killarney/l40s_1x \ - +trainer.learning_rate=5e-4 \ + trainer.learning_rate=5e-4 \ --multirun ``` @@ -117,11 +119,6 @@ Your output should look something like this: [2025-09-29 11:06:00,546][HYDRA] #0 : compute=killarney/l40s_1x ``` -[//]: <> (Why does learning_rate need the + prepended if its already in local config?) -[//]: <> (Perhaps a little more clarity on this) -[//]: <> (`+trainer.num_epochs=100` override did not work for mlp.single) -[//]: <> (multirun.yaml is long and confusing and still contains placeholders. Is there a way to save the final static config yaml?) - Hydra blocks until the job finishes (or fails). For long or interactive sessions, wrap the command in `tmux`, `screen`, or submit a wrapper script as shown below. ### Practical Patterns for Long Jobs @@ -141,20 +138,18 @@ uv run python -m llm.text_classification.launch compute=bon_echo/a40_1x --multir Hydra sweeps expand comma-separated value lists into Cartesian products and schedule each configuration as a separate Submitit job. Output directories are numbered based on Hydra's sweep index. -[//]: <> (Sweep seems to work, but checkpoints overwrite eachother i'm assuming? 
Hydra does not create subdirectories in outputs for sweep.l) - ```bash # Sweep learning rate and hidden size for the MLP template uv run python -m mlp.single.launch \ - +trainer.learning_rate=1e-2,1e-3 \ - +trainer.hidden_dim=64,128 \ + trainer.learning_rate=1e-2,1e-3 \ + trainer.hidden_dim=64,128 \ compute=bon_echo/a40_1x \ --multirun # Sweep batch size and LR for the VLM captioning template uv run python -m vlm.image_captioning.launch \ - +trainer.batch_size=8,16,32 \ - +trainer.learning_rate=1e-4,5e-5 \ + trainer.batch_size=8,16,32 \ + trainer.learning_rate=1e-4,5e-5 \ compute=killarney/h100_1x \ --multirun ``` @@ -171,7 +166,7 @@ Your output for a sweep should look something like this: ### Monitoring Jobs -By default, Hydra and Submitit create the working directory at `~/vec_jobs/` (see `configs/_global.yaml`). Override it when needed with flags such as `paths.work_root=/scratch/$USER` or `work_dir=/scratch/$USER/vec_jobs/${experiment_name}`. +By default, Hydra and Submitit create a `vec_jobs/` working directory in your scratch folder (`/scratch/$USER` on killarney, `/scratch/ssd004/scratch/u$USER` on bon-echo). Override it when needed with flags such as `paths.work_root=/scratch/$USER` or `paths.work_dir=/scratch/$USER/vec_jobs/${experiment_name}`. These are set in `configs/_global.yaml`. ```bash # Check SLURM job status @@ -189,12 +184,14 @@ Checkpointing lets Submitit resubmit interrupted jobs (preemption, timeout, manu Submitit’s official [checkpointing guide](https://github.com/facebookincubator/submitit/blob/main/docs/checkpointing.md) covers how the `checkpoint()` hook works under the hood and provides additional patterns (e.g., swapping callables, partial pickling) if you need more control. +Note that in order to prevent multirun jobs (such as parameter sweeps) from overwriting eachothers configs, the checkpoint output directory must be different for each run. We handle this in the launch script by using the `hydra.runtime.output_dir` to set a dynamic output dir in `cfg.paths.out_dir`. The runtime output dir will always be unique to each run. For multirun sweeps, subdirectories are automatically generated by hydra. You can configure hydra to customize how these subdirectoires are named but by default they're just monotonically increasing integers. + **Toggling requeue behaviour** - Defaults live in `configs/requeue/{on,off}.yaml`. Pick the version you want via `requeue=on` or `requeue=off` on the CLI. (`off` disables the Slurm `--requeue` flag.) - Global safeguards such as `max_num_timeout` come from `configs/_global.yaml`; adjust them if your workload needs more automatic retries. **Implementation checklist** -1. Save checkpoints regularly inside `cfg.work_dir` (e.g., `outputs/checkpoint-epoch-*`). Capture model weights, optimizer state, and any metadata you need to resume. +1. Save checkpoints regularly inside `cfg.paths.out_dir` (e.g., `outputs/checkpoint-epoch-*`). Capture model weights, optimizer state, and any metadata you need to resume. 2. On startup (`__call__`), look for the most recent checkpoint and restore state before training continues. The templates include helper methods (`_latest_checkpoint`) you can reuse or extend. 3. Ensure your `checkpoint()` method returns a `DelayedSubmission` that recreates the callable with the same arguments. If you need custom behaviour (changing hyperparameters, skipping corrupt steps), instantiate a new callable and pass it to `DelayedSubmission` instead of `self`. 4. 
Test the flow by requeueing a running job (`scancel --signal=USR1 ` or Submitit's `job._interrupt(timeout=True)`) to confirm state is restored as expected. @@ -202,3 +199,41 @@ Submitit’s official [checkpointing guide](https://github.com/facebookincubator ## Resources - Submitit: https://github.com/facebookincubator/submitit - Hydra Submitit launcher: https://hydra.cc/docs/plugins/submitit_launcher + +## Understanding Job Outputs + +After running a template, the output artifacts will be saved to the work_dir specified in `_global.yaml`. By default this is `$SCRATCH_DIR/vec_jobs/`. The directory structure for you're outputs will look something like this: + +``` +vec_jobs// +├── multirun.yaml # Template config used by hydra for all runs +├── submitit_logs/ +│ ├── / # One for each run (job submitted by hydra) +│ │ ├── __log.err # stderr +│ │ ├── __log.out # stdout +│ │ ├── __result.pkl +│ │ ├── _submission.sh # The sbatch script that was submitted for this job +│ │ └── _submitted.pkl +│ ... +│ └── / +│ ... +│ └── ... +│── / # This is the actual job/run output. One for each run +│ ├── launch.log # Only contains log messages, not stdout or stderr +│ ├── outputs/ # Contains model outputs saved to cfg.paths.out_dir (eg. checkpoints) +│ └── hydra_configs +│ ├── config.yaml # The final config passed to the function decorated by @hydra.main (for this run) +│ ├── overrides.yaml # CLI overrides that were used for this run +│ ├── hydra.yaml # The hydra settings that were used for this run (some placeholder values still present) +│ └── hydra_resolved.yaml # The hydra settings that were used for this run (with all placeholder values resolved) +│ +... +└── / + ... + └── ... +``` + +**Notes:** +- print messages will not be sent to launch.log, use a logger instead (see example templates) +- `multirun.yaml` and `hydra.yaml` will contain placeholder values (eg. `${oc.select:compute.mem_gb}`). These are used to fill in the values with values from other parts of the config or other configs included in the defaults. See hydra documentation for more detail. +- When doing a hyperparameter sweep, a run is performed for each unique combination of hyperparameters. Each run is run as a separate slurm job with a unique slurm ID. 
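
For reference, the logging pattern used by the templates (and captured in `launch.log`) is the standard module-level logger rather than `print`:

```python
import logging

logger = logging.getLogger(__name__)

# Unlike print(), these records go through Hydra's logging configuration
# and end up in the run's launch.log.
logger.info("epoch %d: loss=%.4f acc=%.2f%%", 10, 0.1234, 97.5)
```
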
diff --git a/templates/configs/__init__.py b/templates/configs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/templates/configs/_global.yaml b/templates/configs/_global.yaml index 35fc6b7..8c7fda3 100644 --- a/templates/configs/_global.yaml +++ b/templates/configs/_global.yaml @@ -9,14 +9,16 @@ experiment_name: ${now:%Y%m%d}-${now:%H%M%S} paths: work_root: ${oc.select:compute.work_root, /scratch/${oc.env:USER}} -work_dir: ${paths.work_root}/vec_jobs/${experiment_name} + work_dir: ${paths.work_root}/vec_jobs/${experiment_name} hydra: + output_subdir: hydra_configs run: - dir: ${work_dir} + dir: ${paths.work_dir} sweep: - dir: ${work_dir} + dir: ${paths.work_dir} launcher: + submitit_folder: ${hydra.sweep.dir}/submitit_logs/%j nodes: ${oc.select:compute.nodes,null} gpus_per_node: ${oc.select:compute.slurm.gpus_per_node, ${compute.gpus_per_node}} tasks_per_node: 1 @@ -28,4 +30,5 @@ hydra: qos: ${oc.select:compute.slurm.qos,null} account: ${user.slurm.account} max_num_timeout: 2 - additional_parameters: ${oc.select:user.slurm.additional_parameters,{}} + additional_parameters: ${oc.select:user.slurm.additional_parameters, {}} + diff --git a/templates/configs/starters/llm_text_classification.yaml b/templates/configs/starters/llm_text_classification.yaml deleted file mode 100644 index a08c887..0000000 --- a/templates/configs/starters/llm_text_classification.yaml +++ /dev/null @@ -1,4 +0,0 @@ -starter: - module: starters.llm.text_classification.runner - entry: Runner - config: starters/llm/text_classification/config.yaml diff --git a/templates/src/__init__.py b/templates/src/__init__.py deleted file mode 100644 index c28d660..0000000 --- a/templates/src/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Templates training ML models workloads on Vector cluster using Hydra and Submitit.""" - -# Give Submitit extra time for result pickles to land on slow filesystems. 
-import submitit - - -submitit.core.core.Job._results_timeout_s = max( - submitit.core.core.Job._results_timeout_s, 360 -) diff --git a/templates/src/llm/text_classification/config.yaml b/templates/src/llm/text_classification/config.yaml index f8c5538..6489798 100644 --- a/templates/src/llm/text_classification/config.yaml +++ b/templates/src/llm/text_classification/config.yaml @@ -1,3 +1,7 @@ +defaults: + - _global # Import global config settings + - _self_ # Must include the settings from this file + trainer: model_name: distilbert-base-uncased dataset_name: ag_news @@ -11,3 +15,7 @@ trainer: logging_steps: 50 save_steps: 100 eval_steps: 200 + +hydra: + searchpath: + - pkg://configs # Include configs from the configs package in the searchpath \ No newline at end of file diff --git a/templates/src/llm/text_classification/launch.py b/templates/src/llm/text_classification/launch.py index 983676d..802da09 100644 --- a/templates/src/llm/text_classification/launch.py +++ b/templates/src/llm/text_classification/launch.py @@ -1,29 +1,32 @@ """Launch script to run template with Hydra + Submitit.""" import os +import logging import hydra from omegaconf import DictConfig, OmegaConf from .train import TextClassificationTrainer +logger = logging.getLogger(__name__) -_CONFIG_PATH = os.path.normpath( - os.path.join(os.path.dirname(__file__), "../../../configs") -) - - -@hydra.main(config_path=_CONFIG_PATH, config_name="_global", version_base=None) +@hydra.main(config_path=".", config_name="config", version_base=None) def main(cfg: DictConfig): - """Hydra entrypoint that merges local config and runs the Trainer.""" - local_cfg = OmegaConf.load(os.path.join(os.path.dirname(__file__), "config.yaml")) + """Hydra entrypoint that updates config with out_dir, saves resolved hydra config and runs the Trainer.""" + # Turn of struct mode so that we can modify DictConfig OmegaConf.set_struct(cfg, False) - cfg = OmegaConf.merge(cfg, local_cfg) - if "trainer" in cfg: - trainer_cfg = cfg.trainer - cfg = OmegaConf.merge(cfg, trainer_cfg) - del cfg.trainer + # Add output_directory for current run + hydra_config = hydra.core.hydra_config.HydraConfig.get() + cfg.paths.out_dir = str(os.path.join(hydra_config.runtime.output_dir, "outputs")) + logger.info(f"Setting paths.out_dir to: {cfg.paths.out_dir}") + + # Save a resolved version of the hydra config + save_path = os.path.join(hydra_config.runtime.output_dir, hydra_config.output_subdir, "hydra_resolved.yaml") + logger.info(f"Resolving hydra config for this run and saving to: {save_path}") + OmegaConf.set_readonly(hydra_config, False) + OmegaConf.resolve(hydra_config) + OmegaConf.save(hydra_config, save_path) text_classification_trainer = TextClassificationTrainer() return text_classification_trainer(cfg) diff --git a/templates/src/llm/text_classification/train.py b/templates/src/llm/text_classification/train.py index 5701f12..62dd653 100644 --- a/templates/src/llm/text_classification/train.py +++ b/templates/src/llm/text_classification/train.py @@ -1,6 +1,7 @@ """Fine-tune a HF model for text classification with a basic loop.""" import os +import logging import submitit from datasets import load_dataset @@ -12,6 +13,9 @@ TrainingArguments, ) +from omegaconf import DictConfig, OmegaConf + +logger = logging.getLogger(__name__) class TextClassificationTrainer(submitit.helpers.Checkpointable): """Trainer for text classification.""" @@ -29,40 +33,44 @@ def _latest_checkpoint(self, out_dir): def __call__(self, cfg): """Train the model.""" - out_dir = os.path.join(cfg.work_dir, 
"outputs") + cfg : DictConfig = OmegaConf.create(cfg) # Ensure cfg is a DictConfig + + # Create output directory + out_dir = cfg.paths.out_dir os.makedirs(out_dir, exist_ok=True) + self.ckpt_dir = self._latest_checkpoint(out_dir) - model_name = getattr(cfg, "model_name", "distilbert-base-uncased") + model_name = OmegaConf.select(cfg, "trainer.model_name", "distilbert-base-uncased") ds = load_dataset("ag_news") tok = AutoTokenizer.from_pretrained(model_name, use_fast=True) def tok_fn(ex): return tok( - ex["text"], truncation=True, max_length=getattr(cfg, "max_length", 256) + ex["text"], truncation=True, max_length=OmegaConf.select(cfg, "trainer.max_length", 256) ) ds = ds.map(tok_fn, batched=True) collator = DataCollatorWithPadding(tokenizer=tok) model = AutoModelForSequenceClassification.from_pretrained( - model_name, num_labels=getattr(cfg, "num_labels", 4) + model_name, num_labels=OmegaConf.select(cfg, "trainer.num_labels", 4) ) args = TrainingArguments( output_dir=out_dir, overwrite_output_dir=True, - num_train_epochs=getattr(cfg, "num_train_epochs", 2), - per_device_train_batch_size=getattr(cfg, "per_device_train_batch_size", 16), - per_device_eval_batch_size=getattr(cfg, "per_device_eval_batch_size", 32), + num_train_epochs=OmegaConf.select(cfg, "trainer.num_train_epochs", default=2), + per_device_train_batch_size=OmegaConf.select(cfg, "trainer.per_device_train_batch_size", default=16), + per_device_eval_batch_size=OmegaConf.select(cfg, "trainer.per_device_eval_batch_size", default=32), eval_strategy="steps", - eval_steps=getattr(cfg, "eval_steps", 200), - logging_steps=getattr(cfg, "logging_steps", 50), - learning_rate=getattr(cfg, "learning_rate", 5e-5), - weight_decay=getattr(cfg, "weight_decay", 0.01), + eval_steps=OmegaConf.select(cfg, "trainer.eval_steps", default=200), + logging_steps=OmegaConf.select(cfg, "trainer.logging_steps", default=50), + learning_rate=OmegaConf.select(cfg, "trainer.learning_rate", default=5e-5), + weight_decay=OmegaConf.select(cfg, "trainer.weight_decay", default=0.01), save_strategy="steps", - save_steps=getattr(cfg, "save_steps", 100), - save_total_limit=getattr(cfg, "save_total_limit", 2), + save_steps=OmegaConf.select(cfg, "trainer.save_steps", default=100), + save_total_limit=OmegaConf.select(cfg, "trainer.save_total_limit", default=2), report_to=[], ) @@ -77,7 +85,7 @@ def tok_fn(ex): trainer.train(resume_from_checkpoint=self.ckpt_dir) metrics = trainer.evaluate() - print(metrics) + logger.info(metrics) return 0 def checkpoint(self, *args, **kwargs): diff --git a/templates/src/mlp/README.md b/templates/src/mlp/README.md index d5e628a..9531f27 100644 --- a/templates/src/mlp/README.md +++ b/templates/src/mlp/README.md @@ -18,6 +18,6 @@ uv run python -m mlp.single.launch compute=bon_echo/a40_1x requeue=on --multirun # Launch 2×A40 DDP training with a larger hidden layer uv run python -m mlp.ddp.launch \ compute=bon_echo/a40_2x \ - +trainer.hidden_dim=256 \ + trainer.hidden_dim=256 \ --multirun ``` diff --git a/templates/src/mlp/ddp/config.yaml b/templates/src/mlp/ddp/config.yaml index 6a1d981..4ce2f19 100644 --- a/templates/src/mlp/ddp/config.yaml +++ b/templates/src/mlp/ddp/config.yaml @@ -1,3 +1,7 @@ +defaults: + - _global # Import global config settings + - _self_ # Must include the settings from this file + trainer: input_dim: 10 hidden_dim: 64 @@ -6,3 +10,7 @@ trainer: learning_rate: 1e-3 num_epochs: 1000 seed: 42 + +hydra: + searchpath: + - pkg://configs # Include configs from the configs package in the searchpath diff --git 
a/templates/src/mlp/ddp/launch.py b/templates/src/mlp/ddp/launch.py index ce93bc2..bca9c36 100644 --- a/templates/src/mlp/ddp/launch.py +++ b/templates/src/mlp/ddp/launch.py @@ -1,30 +1,34 @@ """Launch script for DDP MLP training with Hydra + Submitit.""" import os +import logging import hydra from omegaconf import DictConfig, OmegaConf from .train import DDPMLPTrainer +logger = logging.getLogger(__name__) -_CONFIG_PATH = os.path.normpath( - os.path.join(os.path.dirname(__file__), "../../../configs") -) - - -@hydra.main(config_path=_CONFIG_PATH, config_name="_global", version_base=None) +@hydra.main(config_path=".", config_name="config", version_base=None) def main(cfg: DictConfig): - """Hydra entrypoint that merges local config and runs the Trainer.""" - local_cfg = OmegaConf.load(os.path.join(os.path.dirname(__file__), "config.yaml")) + """Hydra entrypoint that updates config with out_dir, saves resolved hydra config and runs the Trainer.""" + # Turn of struct mode so that we can modify DictConfig OmegaConf.set_struct(cfg, False) - cfg = OmegaConf.merge(cfg, local_cfg) - if "trainer" in cfg: - trainer_cfg = cfg.trainer - cfg = OmegaConf.merge(cfg, trainer_cfg) - del cfg.trainer + # Add output_directory for current run + hydra_config = hydra.core.hydra_config.HydraConfig.get() + cfg.paths.out_dir = str(os.path.join(hydra_config.runtime.output_dir, "outputs")) + logger.info(f"Setting paths.out_dir to: {cfg.paths.out_dir}") + + # Save a resolved version of the hydra config + save_path = os.path.join(hydra_config.runtime.output_dir, hydra_config.output_subdir, "hydra_resolved.yaml") + logger.info(f"Resolving hydra config for this run and saving to: {save_path}") + OmegaConf.set_readonly(hydra_config, False) + OmegaConf.resolve(hydra_config) + OmegaConf.save(hydra_config, save_path) + # Start trainer ddp_trainer = DDPMLPTrainer() return ddp_trainer(cfg) diff --git a/templates/src/mlp/ddp/train.py b/templates/src/mlp/ddp/train.py index 58b0d43..b95fcf2 100644 --- a/templates/src/mlp/ddp/train.py +++ b/templates/src/mlp/ddp/train.py @@ -1,12 +1,17 @@ """Distributed MLP training using PyTorch DDP.""" import os +import logging import submitit import torch import torch.distributed as dist from torch import nn, optim from torch.utils.data import DataLoader, DistributedSampler, TensorDataset +from omegaconf import DictConfig, OmegaConf + + +logger = logging.getLogger(__name__) def create_dummy_data( @@ -63,7 +68,7 @@ def _save_checkpoint(self, model, optimizer, epoch, out_dir, loss, accuracy, ran } torch.save(checkpoint, os.path.join(save_dir, "model.pt")) - print(f"Checkpoint saved at epoch {epoch}") + logger.info(f"Checkpoint saved at epoch {epoch}") def _setup_distributed(self, rank, world_size): """Initialize distributed training.""" @@ -76,9 +81,9 @@ def _setup_distributed(self, rank, world_size): def _initialize_device_and_model(self, cfg, local_rank): """Initialize device and model.""" - input_dim = getattr(cfg, "input_dim", 10) - hidden_dim = getattr(cfg, "hidden_dim", 64) - num_classes = getattr(cfg, "num_classes", 3) + input_dim = OmegaConf.select(cfg, "trainer.input_dim", default=10) + hidden_dim = OmegaConf.select(cfg, "trainer.hidden_dim", default=64) + num_classes = OmegaConf.select(cfg, "trainer.num_classes", default=3) # Setup device if torch.cuda.is_available(): @@ -100,9 +105,9 @@ def _initialize_device_and_model(self, cfg, local_rank): def _initialize_data_and_loader(self, cfg, world_size, rank): """Initialize dataset and dataloader with distributed sampler.""" - input_dim = 
getattr(cfg, "input_dim", 10) - num_classes = getattr(cfg, "num_classes", 3) - batch_size = getattr(cfg, "batch_size", 32) + input_dim = OmegaConf.select(cfg, "trainer.input_dim", default=10) + num_classes = OmegaConf.select(cfg, "trainer.num_classes", default=3) + batch_size = OmegaConf.select(cfg, "trainer.batch_size", default=32) dataset = create_dummy_data(1000, input_dim, num_classes) sampler = ( @@ -127,7 +132,7 @@ def _load_checkpoint_if_exists(self, model, optimizer, device, rank): checkpoint_path = os.path.join(self.ckpt_dir, "model.pt") if os.path.exists(checkpoint_path): if rank == 0: - print(f"Resuming from checkpoint: {self.ckpt_dir}") + logger.info(f"Resuming from checkpoint: {self.ckpt_dir}") checkpoint = torch.load(checkpoint_path, map_location=device) # Load model state (handle DDP wrapper) @@ -139,7 +144,7 @@ def _load_checkpoint_if_exists(self, model, optimizer, device, rank): optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) start_epoch = checkpoint["epoch"] + 1 if rank == 0: - print(f"Resumed from epoch {checkpoint['epoch']}") + logger.info(f"Resumed from epoch {checkpoint['epoch']}") return start_epoch def _train_epoch( @@ -188,14 +193,19 @@ def _train_epoch( def __call__(self, cfg): """Train the MLP model with DDP.""" - out_dir = os.path.join(cfg.work_dir, "outputs") + cfg : DictConfig = OmegaConf.create(cfg) # Ensure cfg is a DictConfig + + # Create output directory + out_dir = cfg.paths.out_dir os.makedirs(out_dir, exist_ok=True) + + # Get ckpt dir self.ckpt_dir = self._latest_checkpoint(out_dir) # Configuration - lr = getattr(cfg, "learning_rate", 1e-3) - num_epochs = getattr(cfg, "num_epochs", 1000) - seed = getattr(cfg, "seed", 42) + lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-3) + num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=1000) + seed = OmegaConf.select(cfg, "trainer.seed", default=42) # Get distributed training info from environment rank = int(os.environ.get("RANK", "0")) @@ -203,8 +213,8 @@ def __call__(self, cfg): world_size = int(os.environ.get("WORLD_SIZE", "1")) if rank == 0: - print(f"Starting DDP MLP training with seed {seed}") - print(f"World size: {world_size}, Local rank: {local_rank}") + logger.info(f"Starting DDP MLP training with seed {seed}") + logger.info(f"World size: {world_size}, Local rank: {local_rank}") # Set seed for reproducibility (same seed on all processes) torch.manual_seed(seed) @@ -218,7 +228,7 @@ def __call__(self, cfg): device, model = self._initialize_device_and_model(cfg, local_rank) if rank == 0: - print(f"Using device: {device}") + logger.info(f"Using device: {device}") # Wrap model with DDP if world_size > 1: @@ -236,7 +246,7 @@ def __call__(self, cfg): start_epoch = self._load_checkpoint_if_exists(model, optimizer, device, rank) if rank == 0: - print(f"Training from epoch {start_epoch} to {num_epochs}...") + logger.info(f"Training from epoch {start_epoch} to {num_epochs}...") # Training loop with DDP for epoch in range(start_epoch, num_epochs): @@ -252,11 +262,11 @@ def __call__(self, cfg): rank, ) - # Print metrics only on rank 0 + # Log metrics only on rank 0 if rank == 0: acc = 100.0 * correct / total avg_loss = loss_sum / len(loader) - print(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") + logger.info(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") if epoch % 100 == 0 or epoch == num_epochs - 1: if world_size > 1: @@ -266,7 +276,7 @@ def __call__(self, cfg): ) if rank == 0: - print("Training completed!") + logger.info("Training completed!") # Clean up 
distributed training if world_size > 1 and dist.is_initialized(): diff --git a/templates/src/mlp/single/config.yaml b/templates/src/mlp/single/config.yaml index ac43e77..f14aa75 100644 --- a/templates/src/mlp/single/config.yaml +++ b/templates/src/mlp/single/config.yaml @@ -1,3 +1,8 @@ +defaults: + - _global # Import global config settings + - _self_ # Must include the settings from this file + +# Local Trainer Defaults trainer: input_dim: 10 hidden_dim: 64 @@ -6,3 +11,7 @@ trainer: learning_rate: 1e-3 num_epochs: 1000 # Can be larger with checkpointing seed: 42 + +hydra: + searchpath: + - pkg://configs # Include configs from the configs package in the searchpath diff --git a/templates/src/mlp/single/launch.py b/templates/src/mlp/single/launch.py index 2668c7c..ab3e059 100644 --- a/templates/src/mlp/single/launch.py +++ b/templates/src/mlp/single/launch.py @@ -1,33 +1,36 @@ """Launch script for checkpointable MLP training with Hydra + Submitit.""" import os +import logging import hydra from omegaconf import DictConfig, OmegaConf from .train import CheckpointableMLPTrainer +logger = logging.getLogger(__name__) -_CONFIG_PATH = os.path.normpath( - os.path.join(os.path.dirname(__file__), "../../../configs") -) - - -@hydra.main(config_path=_CONFIG_PATH, config_name="_global", version_base=None) +@hydra.main(config_path=".", config_name="config", version_base=None) def main(cfg: DictConfig): - """Hydra entrypoint that merges local config and runs the Trainer.""" - local_cfg = OmegaConf.load(os.path.join(os.path.dirname(__file__), "config.yaml")) + """Hydra entrypoint that updates config with out_dir, saves resolved hydra config and runs the Trainer.""" + # Turn of struct mode so that we can modify DictConfig OmegaConf.set_struct(cfg, False) - cfg = OmegaConf.merge(cfg, local_cfg) - if "trainer" in cfg: - trainer_cfg = cfg.trainer - cfg = OmegaConf.merge(cfg, trainer_cfg) - del cfg.trainer + # Add output_directory for current run + hydra_config = hydra.core.hydra_config.HydraConfig.get() + cfg.paths.out_dir = str(os.path.join(hydra_config.runtime.output_dir, "outputs")) + logger.info(f"Setting paths.out_dir to: {cfg.paths.out_dir}") + # Save a resolved version of the hydra config + save_path = os.path.join(hydra_config.runtime.output_dir, hydra_config.output_subdir, "hydra_resolved.yaml") + logger.info(f"Resolving hydra config for this run and saving to: {save_path}") + OmegaConf.set_readonly(hydra_config, False) + OmegaConf.resolve(hydra_config) + OmegaConf.save(hydra_config, save_path) + + # Run the trainer with the run config checkpointable_trainer = CheckpointableMLPTrainer() return checkpointable_trainer(cfg) - if __name__ == "__main__": main() diff --git a/templates/src/mlp/single/train.py b/templates/src/mlp/single/train.py index c3508ae..098ef23 100644 --- a/templates/src/mlp/single/train.py +++ b/templates/src/mlp/single/train.py @@ -1,11 +1,15 @@ """Single-GPU MLP training with checkpointing.""" import os +import logging import submitit import torch from torch import nn, optim from torch.utils.data import DataLoader, TensorDataset +from omegaconf import OmegaConf, DictConfig + +logger = logging.getLogger(__name__) def create_dummy_data( @@ -52,26 +56,33 @@ def _save_checkpoint(self, model, optimizer, epoch, out_dir, loss, accuracy): } torch.save(checkpoint, os.path.join(save_dir, "model.pt")) - print(f"Checkpoint saved at epoch {epoch}") + logger.info(f"Checkpoint saved at epoch {epoch}") def __call__(self, cfg): """Train the MLP model.""" - out_dir = os.path.join(cfg.work_dir, 
"outputs") + cfg : DictConfig = OmegaConf.create(cfg) # Ensure cfg is a DictConfig + + # Create output directory + out_dir = cfg.paths.out_dir os.makedirs(out_dir, exist_ok=True) + + # Get ckpt dir self.ckpt_dir = self._latest_checkpoint(out_dir) - input_dim = getattr(cfg, "input_dim", 10) - hidden_dim = getattr(cfg, "hidden_dim", 64) - num_classes = getattr(cfg, "num_classes", 3) - batch_size = getattr(cfg, "batch_size", 32) - lr = getattr(cfg, "learning_rate", 1e-3) - num_epochs = getattr(cfg, "num_epochs", 1000) - seed = getattr(cfg, "seed", 42) - - print(f"Starting checkpointable MLP training with seed {seed}") + + # Get trainer config variables + input_dim = OmegaConf.select(cfg, "trainer.input_dim", default=10) + hidden_dim = OmegaConf.select(cfg, "trainer.hidden_dim", default=64) + num_classes = OmegaConf.select(cfg, "trainer.num_classes", default=3) + batch_size = OmegaConf.select(cfg, "trainer.batch_size", default=32) + lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-3) + num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=1000) + seed = OmegaConf.select(cfg, "trainer.seed", default=42) + + logger.info(f"Starting checkpointable MLP training with seed {seed}") torch.manual_seed(seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using device: {device}") + logger.info(f"Using device: {device}") model = nn.Sequential( nn.Linear(input_dim, hidden_dim), @@ -91,14 +102,14 @@ def __call__(self, cfg): if self.ckpt_dir and os.path.exists(self.ckpt_dir): checkpoint_path = os.path.join(self.ckpt_dir, "model.pt") if os.path.exists(checkpoint_path): - print(f"Resuming from checkpoint: {self.ckpt_dir}") + logger.info(f"Resuming from checkpoint: {self.ckpt_dir}") checkpoint = torch.load(checkpoint_path, map_location=device) model.load_state_dict(checkpoint["model_state_dict"]) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) start_epoch = checkpoint["epoch"] + 1 - print(f"Resumed from epoch {checkpoint['epoch']}") + logger.info(f"Resumed from epoch {checkpoint['epoch']}") - print(f"Training from epoch {start_epoch} to {num_epochs}...") + logger.info(f"Training from epoch {start_epoch} to {num_epochs}...") # Training loop with checkpointing for epoch in range(start_epoch, num_epochs): @@ -121,13 +132,13 @@ def __call__(self, cfg): acc = 100.0 * correct / total avg_loss = loss_sum / len(loader) - print(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") + logger.info(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") # Save checkpoint every 100 epochs if epoch % 100 == 0 or epoch == num_epochs - 1: self._save_checkpoint(model, optimizer, epoch, out_dir, avg_loss, acc) - print("Training completed!") + logger.info("Training completed!") return 0 def checkpoint(self, *args, **kwargs): diff --git a/templates/src/mlp/single_not_checkpointable/config.yaml b/templates/src/mlp/single_not_checkpointable/config.yaml index 7445421..6c18244 100644 --- a/templates/src/mlp/single_not_checkpointable/config.yaml +++ b/templates/src/mlp/single_not_checkpointable/config.yaml @@ -1,8 +1,18 @@ +defaults: + - _global # Import global config settings + - _self_ # Must include the settings from this file + +# Local Trainer Defaults trainer: input_dim: 10 hidden_dim: 64 num_classes: 3 batch_size: 32 learning_rate: 1e-3 - num_epochs: 100 # Smaller number since no checkpointing + num_epochs: 100 # Smaller number since no checkpointing seed: 42 + +hydra: + searchpath: + - pkg://configs # Include configs from the configs package in the searchpath 
+ diff --git a/templates/src/mlp/single_not_checkpointable/launch.py b/templates/src/mlp/single_not_checkpointable/launch.py index 6922743..85cbff6 100644 --- a/templates/src/mlp/single_not_checkpointable/launch.py +++ b/templates/src/mlp/single_not_checkpointable/launch.py @@ -1,6 +1,7 @@ """Launch script for simple MLP training (no checkpointing) with Hydra + Submitit.""" import os +import logging import hydra from omegaconf import DictConfig, OmegaConf @@ -8,26 +9,27 @@ from .train import SimpleMLPTrainer -_CONFIG_PATH = os.path.normpath( - os.path.join(os.path.dirname(__file__), "../../../configs") -) +logger = logging.getLogger(__name__) - -@hydra.main(config_path=_CONFIG_PATH, config_name="_global", version_base=None) +@hydra.main(config_path=".", config_name="config", version_base=None) def main(cfg: DictConfig): - """Hydra entrypoint that merges local config and runs the Trainer.""" - local_cfg = OmegaConf.load(os.path.join(os.path.dirname(__file__), "config.yaml")) - - # Disable struct mode to allow merging new keys + """Hydra entrypoint that updates config with out_dir, saves resolved hydra config and runs the Trainer.""" + # Turn of struct mode so that we can modify DictConfig OmegaConf.set_struct(cfg, False) - cfg = OmegaConf.merge(cfg, local_cfg) - # Flatten trainer config to top level for easy access - if "trainer" in cfg: - trainer_cfg = cfg.trainer - cfg = OmegaConf.merge(cfg, trainer_cfg) - del cfg.trainer + # Add output_directory for current run + hydra_config = hydra.core.hydra_config.HydraConfig.get() + cfg.paths.out_dir = str(os.path.join(hydra_config.runtime.output_dir, "outputs")) + logger.info(f"Setting paths.out_dir to: {cfg.paths.out_dir}") + + # Save a resolved version of the hydra config + save_path = os.path.join(hydra_config.runtime.output_dir, hydra_config.output_subdir, "hydra_resolved.yaml") + logger.info(f"Resolving hydra config for this run and saving to: {save_path}") + OmegaConf.set_readonly(hydra_config, False) + OmegaConf.resolve(hydra_config) + OmegaConf.save(hydra_config, save_path) + # Run Trainer simple_trainer = SimpleMLPTrainer() return simple_trainer(cfg) diff --git a/templates/src/mlp/single_not_checkpointable/train.py b/templates/src/mlp/single_not_checkpointable/train.py index 6da404c..07a9960 100644 --- a/templates/src/mlp/single_not_checkpointable/train.py +++ b/templates/src/mlp/single_not_checkpointable/train.py @@ -1,8 +1,13 @@ """Simple single-GPU MLP training (no checkpointing).""" +import logging import torch from torch import nn, optim from torch.utils.data import DataLoader, TensorDataset +from omegaconf import DictConfig, OmegaConf + + +logger = logging.getLogger(__name__) def create_dummy_data( @@ -23,19 +28,22 @@ def __init__(self): def __call__(self, cfg): """Train the MLP model.""" - input_dim = getattr(cfg, "input_dim", 10) - hidden_dim = getattr(cfg, "hidden_dim", 64) - num_classes = getattr(cfg, "num_classes", 3) - batch_size = getattr(cfg, "batch_size", 32) - lr = getattr(cfg, "learning_rate", 1e-3) - num_epochs = getattr(cfg, "num_epochs", 100) - seed = getattr(cfg, "seed", 42) - - print(f"Starting simple MLP training with seed {seed}") + cfg : DictConfig = OmegaConf.create(cfg) # Ensure cfg is a DictConfig + + # Get trainer config variables + input_dim = OmegaConf.select(cfg, "trainer.input_dim", default=10) + hidden_dim = OmegaConf.select(cfg, "trainer.hidden_dim", default=64) + num_classes = OmegaConf.select(cfg, "trainer.num_classes", default=3) + batch_size = OmegaConf.select(cfg, "trainer.batch_size", default=32) + 
lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-3) + num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=1000) + seed = OmegaConf.select(cfg, "trainer.seed", default=42) + + logger.info(f"Starting simple MLP training with seed {seed}") torch.manual_seed(seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Using device: {device}") + logger.info(f"Using device: {device}") model = nn.Sequential( nn.Linear(input_dim, hidden_dim), @@ -51,7 +59,7 @@ def __call__(self, cfg): optimizer = optim.Adam(model.parameters(), lr=lr) criterion = nn.CrossEntropyLoss() - print(f"Training for {num_epochs} epochs...") + logger.info(f"Training for {num_epochs} epochs...") for epoch in range(num_epochs): model.train() @@ -73,7 +81,7 @@ def __call__(self, cfg): acc = 100.0 * correct / total avg_loss = loss_sum / len(loader) - print(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") + logger.info(f"Epoch {epoch}: loss={avg_loss:.4f} acc={acc:.2f}%") - print("Training completed!") + logger.info("Training completed!") return 0 diff --git a/templates/src/vlm/README.md b/templates/src/vlm/README.md index 3776963..24a1878 100644 --- a/templates/src/vlm/README.md +++ b/templates/src/vlm/README.md @@ -11,6 +11,6 @@ These folders show how to fine-tune a vision-language model with Hydra + Submiti # Submit a GPU run on Bon Echo A40 uv run python -m vlm.image_captioning.launch \ compute=bon_echo/a40_1x \ - +trainer.batch_size=16 \ + trainer.batch_size=16 \ --multirun ``` diff --git a/templates/src/vlm/image_captioning/config.yaml b/templates/src/vlm/image_captioning/config.yaml index 0a1d025..651264f 100644 --- a/templates/src/vlm/image_captioning/config.yaml +++ b/templates/src/vlm/image_captioning/config.yaml @@ -1,3 +1,7 @@ +defaults: + - _global # Import global config settings + - _self_ # Must include the settings from this file + trainer: model_name: Salesforce/blip-image-captioning-base dataset_name: cifar10 @@ -6,3 +10,7 @@ trainer: learning_rate: 1e-5 print_every: 50 seed: 42 + +hydra: + searchpath: + - pkg://configs # Include configs from the configs package in the searchpath diff --git a/templates/src/vlm/image_captioning/launch.py b/templates/src/vlm/image_captioning/launch.py index 1fda42c..8460b6b 100644 --- a/templates/src/vlm/image_captioning/launch.py +++ b/templates/src/vlm/image_captioning/launch.py @@ -1,30 +1,35 @@ """Launch script to run VLM image captioning with Hydra + Submitit.""" import os +import logging import hydra from omegaconf import DictConfig, OmegaConf from .train import ImageCaptioningTrainer +logger = logging.getLogger(__name__) -_CONFIG_PATH = os.path.normpath( - os.path.join(os.path.dirname(__file__), "../../../configs") -) - -@hydra.main(config_path=_CONFIG_PATH, config_name="_global", version_base=None) +@hydra.main(config_path=".", config_name="config", version_base=None) def main(cfg: DictConfig): - """Hydra entrypoint that merges local config and runs the Trainer.""" - local_cfg = OmegaConf.load(os.path.join(os.path.dirname(__file__), "config.yaml")) + """Hydra entrypoint that updates config with out_dir, saves resolved hydra config and runs the Trainer.""" + # Turn of struct mode so that we can modify DictConfig OmegaConf.set_struct(cfg, False) - cfg = OmegaConf.merge(cfg, local_cfg) - if "trainer" in cfg: - trainer_cfg = cfg.trainer - cfg = OmegaConf.merge(cfg, trainer_cfg) - del cfg.trainer + # Add output_directory for current run + hydra_config = hydra.core.hydra_config.HydraConfig.get() + cfg.paths.out_dir = 
str(os.path.join(hydra_config.runtime.output_dir, "outputs")) + logger.info(f"Setting paths.out_dir to: {cfg.paths.out_dir}") + + # Save a resolved version of the hydra config + save_path = os.path.join(hydra_config.runtime.output_dir, hydra_config.output_subdir, "hydra_resolved.yaml") + logger.info(f"Resolving hydra config for this run and saving to: {save_path}") + OmegaConf.set_readonly(hydra_config, False) + OmegaConf.resolve(hydra_config) + OmegaConf.save(hydra_config, save_path) + # Run Trainer image_captioning_trainer = ImageCaptioningTrainer() return image_captioning_trainer(cfg) diff --git a/templates/src/vlm/image_captioning/train.py b/templates/src/vlm/image_captioning/train.py index 088363f..196bc7c 100644 --- a/templates/src/vlm/image_captioning/train.py +++ b/templates/src/vlm/image_captioning/train.py @@ -2,13 +2,16 @@ import os import random +import logging import submitit import torch from datasets import load_dataset from torch.utils.data import DataLoader from transformers import BlipForConditionalGeneration, BlipProcessor +from omegaconf import DictConfig, OmegaConf +logger = logging.getLogger(__name__) class ImageCaptioningTrainer(submitit.helpers.Checkpointable): """Trainer for VLM image captioning.""" @@ -33,14 +36,14 @@ def _latest_checkpoint(self, out_dir): def _setup_data(self, processor, cfg): """Set up dataset and dataloaders.""" - dataset_name = getattr(cfg, "dataset_name", "cifar10") - batch_size = getattr(cfg, "batch_size", 16) + dataset_name = OmegaConf.select(cfg, "trainer.dataset_name", "cifar10") + batch_size = OmegaConf.select(cfg, "trainer.batch_size", 16) - print(f"Loading dataset: {dataset_name}") + logger.info(f"Loading dataset: {dataset_name}") if dataset_name == "cifar10": ds = load_dataset("cifar10") - print(f"Dataset splits: {list(ds.keys())}") + logger.info(f"Dataset splits: {list(ds.keys())}") cifar_classes = [ "airplane", @@ -69,13 +72,13 @@ def add_captions(example): else: ds = load_dataset(dataset_name) - print(f"Dataset splits: {list(ds.keys())}") + logger.info(f"Dataset splits: {list(ds.keys())}") train_split = ds.get("train") or ds[list(ds.keys())[0]] eval_split = ds.get("validation") or ds.get("test") - print(f"Train split columns: {train_split.column_names}") - print(f"Sample train item: {train_split[0]}") + logger.info(f"Train split columns: {train_split.column_names}") + logger.info(f"Sample train item: {train_split[0]}") image_col = None text_col = None @@ -94,7 +97,7 @@ def add_captions(example): f"Could not find image and text columns. 
Available columns: {train_split.column_names}" ) - print(f"Using columns: image='{image_col}', text='{text_col}'") + logger.info(f"Using columns: image='{image_col}', text='{text_col}'") def collate(batch): images = [b[image_col] for b in batch] @@ -138,7 +141,7 @@ def _train_epoch( model.train() running_loss = 0.0 seen = 0 - print_every = getattr(cfg, "print_every", 100) + print_every = OmegaConf.select(cfg, "trainer.print_every", 100) for batch in train_loader: batch_device = { @@ -162,7 +165,7 @@ def _train_epoch( if print_every > 0 and (global_step % print_every == 0): avg_loss = running_loss / max(seen, 1) - print(f"Epoch {epoch} Step {global_step}: loss={avg_loss:.4f}") + logger.info(f"Epoch {epoch} Step {global_step}: loss={avg_loss:.4f}") running_loss = 0.0 seen = 0 @@ -194,7 +197,7 @@ def _evaluate_epoch(self, model, eval_loader, epoch, device): if total > 0: avg_loss = loss_sum / total - print(f"Epoch {epoch} eval: loss={avg_loss:.4f}") + logger.info(f"Epoch {epoch} eval: loss={avg_loss:.4f}") def _save_checkpoint(self, model, processor, epoch, out_dir): """Save model and processor checkpoint.""" @@ -205,18 +208,23 @@ def _save_checkpoint(self, model, processor, epoch, out_dir): def __call__(self, cfg): """Train the VLM model.""" - out_dir = os.path.join(cfg.work_dir, "outputs") + cfg : DictConfig = OmegaConf.create(cfg) # Ensure cfg is a DictConfig + + # Create output directory + out_dir = cfg.paths.out_dir os.makedirs(out_dir, exist_ok=True) + + # Get ckpt dir self.ckpt_dir = self._latest_checkpoint(out_dir) # Configuration - model_name = getattr(cfg, "model_name", "Salesforce/blip-image-captioning-base") - num_epochs = getattr(cfg, "num_epochs", 2) - lr = getattr(cfg, "learning_rate", 1e-5) - seed = getattr(cfg, "seed", 42) + model_name = OmegaConf.select(cfg, "trainer.model_name", "Salesforce/blip-image-captioning-base") + lr = OmegaConf.select(cfg, "trainer.learning_rate", default=1e-5) + num_epochs = OmegaConf.select(cfg, "trainer.num_epochs", default=2) + seed = OmegaConf.select(cfg, "trainer.seed", default=42) # Set seed - print(f"Starting VLM captioning training with seed {seed}") + logger.info(f"Starting VLM captioning training with seed {seed}") random.seed(seed) torch.manual_seed(seed) @@ -232,7 +240,7 @@ def __call__(self, cfg): # Resume from checkpoint if available start_epoch = 0 if self.ckpt_dir and os.path.exists(self.ckpt_dir): - print(f"Resuming from checkpoint: {self.ckpt_dir}") + logger.info(f"Resuming from checkpoint: {self.ckpt_dir}") model = BlipForConditionalGeneration.from_pretrained(self.ckpt_dir).to( device ) @@ -248,7 +256,7 @@ def __call__(self, cfg): self._evaluate_epoch(model, eval_loader, epoch, device) self._save_checkpoint(model, processor, epoch, out_dir) - print("Training completed!") + logger.info("Training completed!") return 0 def checkpoint(self, *args, **kwargs):
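Note (illustrative, not part of the patch): the trainers above now read hyperparameters with `OmegaConf.select(cfg, "trainer.<key>", default=...)` instead of `getattr` on a flattened config. A minimal sketch of how that lookup behaves, using made-up values:

```python
from omegaconf import OmegaConf

# Toy config standing in for the Hydra-composed run config.
cfg = OmegaConf.create({"trainer": {"input_dim": 10}})

# Key present: returns the configured value.
input_dim = OmegaConf.select(cfg, "trainer.input_dim", default=10)

# Key absent: falls back to the default instead of raising.
batch_size = OmegaConf.select(cfg, "trainer.batch_size", default=32)

print(input_dim, batch_size)  # 10 32
```

In current OmegaConf releases `default` is keyword-only, so the fallback must be passed as `default=...` rather than positionally.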