
Commit 76a484d

LucasLLC authored and jeffdaily committed
[DCP] Removes Checkpoint Wrapped Prefix from state dict fqns (pytorch#118119)
Fixes pytorch#117399

~~Soliciting some early feedback here.~~
~~Do we happen to know if there are already some tests that cover this case, or would it make sense to add them? @fegin, @wz337~~

Edit: Added tests

Pull Request resolved: pytorch#118119
Approved by: https://github.com/fegin
1 parent 9afad97 commit 76a484d
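
For context, here is an illustrative sketch (not part of the commit) of where the prefix comes from: `apply_activation_checkpointing` inserts a wrapper module, so the wrapper name shows up in the names yielded by `named_parameters()`, which is what `_get_fqns` walks. The toy model and the exact key strings below are assumptions for illustration.

```python
# Illustrative sketch only; the toy model and exact key strings are assumptions.
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    apply_activation_checkpointing,
)

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
apply_activation_checkpointing(model)

# Expect names like "0._checkpoint_wrapped_module.weight" instead of "0.weight";
# before this commit, such wrapped names leaked into the FQNs DCP computed.
print(sorted(name for name, _ in model.named_parameters()))
```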

2 files changed: +23 −2 lines changed

test/distributed/checkpoint/test_state_dict.py

Lines changed: 16 additions & 0 deletions

@@ -11,6 +11,9 @@
 from torch.distributed._composable import fully_shard, replicate
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed._tensor import DTensor, init_device_mesh
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    apply_activation_checkpointing,
+)
 from torch.distributed.checkpoint.state_dict import (
     _patch_model_state_dict,
     _patch_optimizer_state_dict,
@@ -443,6 +446,19 @@ def is_cpu(v):
         self.assertEqual(mst, {})
         self.assertEqual(ost, {})
 
+    @with_comms
+    @skip_if_lt_x_gpu(1)
+    def test_activation_ckpt_fqns(self) -> None:
+        """Tests that activation checkpointing prefixes are removed from module names"""
+        model = CompositeParamModel(device=torch.device("cuda"))
+        original_keys = get_model_state_dict(model).keys()
+
+        apply_activation_checkpointing(model)
+        model = DDP(model)
+        new_keys = get_model_state_dict(model).keys()
+
+        self.assertEqual(original_keys, new_keys)
+
 
 if __name__ == "__main__":
     run_tests()
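
In short, the new test asserts that applying activation checkpointing and then wrapping the model in DDP leaves the keys returned by `get_model_state_dict` identical to those of the original, unwrapped model.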

torch/distributed/checkpoint/state_dict.py

Lines changed: 7 additions & 2 deletions

@@ -26,6 +26,9 @@
     _offload_state_dict_to_cpu,
 )
 from torch.distributed._tensor import DTensor
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    _CHECKPOINT_PREFIX,
+)
 from torch.distributed.fsdp import (
     FullOptimStateDictConfig,
     FullStateDictConfig,
@@ -145,7 +148,7 @@ def _get_fqns(model: nn.Module, name: str, skip_ddp_prefix: bool = True) -> FQNS
         The canonical FQNs based on the model traversal.
     """
     if "." not in name:
-        return {name}
+        return {name.replace(_CHECKPOINT_PREFIX, "")}
 
     obj_names = name.split(".")
     fqn_obj_names = []
@@ -162,6 +165,8 @@ def _get_fqns(model: nn.Module, name: str, skip_ddp_prefix: bool = True) -> FQNS
                 flat_param = getattr(curr_obj, FLAT_PARAM)
                 if prefix:
                     prefix = f"{prefix}."
+                # FSDP already handles removal of checkpoint prefix, so we can return
+                # directly
                 return {f"{prefix}{fqn}" for fqn in flat_param._fqns}
             curr_obj = getattr(curr_obj, FSDP_WRAPPED_MODULE)
             if curr_obj_name != FSDP_WRAPPED_MODULE:
@@ -171,7 +176,7 @@ def _get_fqns(model: nn.Module, name: str, skip_ddp_prefix: bool = True) -> FQNS
             fqn_obj_names.append(curr_obj_name)
             curr_obj = getattr(curr_obj, curr_obj_name)
 
-    return {".".join(fqn_obj_names)}
+    return {".".join(fqn_obj_names).replace(_CHECKPOINT_PREFIX, "")}
 
 
 def _verify_options(
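
For reference, a minimal sketch of what the two changed `return` statements do. The helper name is hypothetical, and the prefix value shown is an assumption based on `_CHECKPOINT_PREFIX` from `checkpoint_wrapper`.

```python
# Hypothetical helper mirroring the two changed return statements above.
# The prefix value is an assumption based on checkpoint_wrapper._CHECKPOINT_PREFIX.
_CHECKPOINT_PREFIX = "_checkpoint_wrapped_module."

def strip_checkpoint_prefix(fqn: str) -> str:
    # e.g. "l1._checkpoint_wrapped_module.weight" -> "l1.weight"
    return fqn.replace(_CHECKPOINT_PREFIX, "")

assert strip_checkpoint_prefix("l1._checkpoint_wrapped_module.weight") == "l1.weight"
assert strip_checkpoint_prefix("l1.weight") == "l1.weight"  # no-op when no wrapper is present
```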
