OpenNMT · alexis-allemann · Jan 11, 2024 · Jan 12, 2024 · Jan 19, 2024 · Jan 19, 2024
diff --git a/onmt/opts.py b/onmt/opts.py
@@ -1459,6 +1459,14 @@ def _add_train_general_opts(parser):
         "Default is every update, "
         "if -average_decay is set.",
     )
+    group.add(
+        "--max_consecutive_oom_errors",
+        "-max_consecutive_oom_errors",
+        type=int,
+        default=3,
+        help="Maximum consecutive out of memory errors before "
+        "shutting down training.",
+    )
 
     # learning rate
     group = parser.add_argument_group("Optimization- Rate")

diff --git a/onmt/train_single.py b/onmt/train_single.py
@@ -240,6 +240,7 @@ def main(opt, device_id):
         save_checkpoint_steps=opt.save_checkpoint_steps,
         valid_iter=valid_iter,
         valid_steps=opt.valid_steps,
+        max_consecutive_oom_errors=opt.max_consecutive_oom_errors,
     )
 
     if trainer.report_manager.tensorboard_writer is not None:

diff --git a/onmt/trainer.py b/onmt/trainer.py
@@ -9,15 +9,17 @@
           users of this library) for the strategy things we do.
 """
 
-import time
 import sys
-import torch
+import time
 import traceback
+
+import torch
+
 import onmt.utils
-from onmt.utils.loss import LossCompute
+from onmt.scorers import build_scorers, get_scorers_cls
 from onmt.utils.logging import logger
+from onmt.utils.loss import LossCompute
 from onmt.utils.scoring_utils import ScoringPreparator
-from onmt.scorers import get_scorers_cls, build_scorers
 
 
 def build_trainer(opt, device_id, model, vocabs, optim, model_saver=None):
@@ -149,6 +151,7 @@ def __init__(
         scoring_preparator,
         valid_scorers,
         optim,
+        max_consecutive_oom_errors,
         trunc_size=0,
         norm_method="sents",
         accum_count=[1],
@@ -177,6 +180,7 @@ def __init__(
         self.scoring_preparator = scoring_preparator
         self.valid_scorers = valid_scorers
         self.optim = optim
+        self.max_consecutive_oom_errors = max_consecutive_oom_errors
         self.trunc_size = trunc_size
         self.norm_method = norm_method
         self.accum_count_l = accum_count
@@ -197,6 +201,7 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.dropout_steps = dropout_steps
         self.zero_out_prompt_loss = zero_out_prompt_loss
+        self.current_nb_oom_errors = 0
 
         for i in range(len(self.accum_count_l)):
             assert self.accum_count_l[i] > 0
@@ -306,7 +311,6 @@ def train(
         torch.cuda.empty_cache()
 
         for i, (batches, normalization) in enumerate(self._accum_batches(train_iter)):
-
             step = self.optim.training_step
             # UPDATE DROPOUT
             self._maybe_update_dropout(step)
@@ -515,6 +519,7 @@ def _gradient_accumulation(
 
                     total_stats.update(batch_stats)
                     report_stats.update(batch_stats)
+                    self.current_nb_oom_errors = 0  # batch succeeded: reset OOM streak
 
                 except Exception as exc:
                     trace_content = traceback.format_exc()
@@ -524,7 +529,12 @@ def _gradient_accumulation(
                             self.optim.training_step,
                         )
                         torch.cuda.empty_cache()
-                        if self.n_gpu > 1 and self.parallel_mode == "tensor_parallel":
+                        self.current_nb_oom_errors += 1
+                        if (
+                            self.n_gpu > 1 and self.parallel_mode == "tensor_parallel"
+                        ) or (
+                            self.current_nb_oom_errors > self.max_consecutive_oom_errors
+                        ):
                             torch.distributed.destroy_process_group()
                             sys.exit()
                     else:
@@ -538,11 +548,12 @@ def _gradient_accumulation(
         # in case of multi step gradient accumulation,
         # update only after accum batches
         if self.n_gpu > 1 and self.parallel_mode == "data_parallel":
-            grads = [
-                p.grad.data
-                for p in self.model.parameters()
-                if p.requires_grad and p.grad is not None
-            ]
+            grads = []
+            for p in self.model.parameters():
+                if p.requires_grad:
+                    if p.grad is None:
+                        p.grad = torch.zeros_like(p)  # same dtype/device as p
+                    grads.append(p.grad.data)
             onmt.utils.distributed.all_reduce_and_rescale_tensors(
                 grads, float(self.n_gpu)
             )