
Commit

generalize api (NVIDIA#168)
YizhouZ committed Jul 20, 2023
1 parent 69e3c6a commit 0e9b765
Showing 2 changed files with 16 additions and 6 deletions.
6 changes: 5 additions & 1 deletion megatron/model/gpt_model.py
@@ -19,7 +19,11 @@
 from .transformer import ParallelTransformerLayerPipe, LMHeadPipe
 from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
 
-from apex.normalization import MixedFusedRMSNorm
+try:
+    from apex.normalization import MixedFusedRMSNorm
+except ImportError:
+    MixedFusedRMSNorm = None
+
 
 def post_language_model_processing(lm_output, labels, logit_weights,
                                    parallel_output,
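
The hunk above turns the hard apex dependency into an optional one: if apex is not installed, MixedFusedRMSNorm is bound to None instead of failing at import time, and call sites are expected to check for None before using it. A minimal sketch of that pattern, assuming a hypothetical build_norm helper and a stock-PyTorch fallback (neither is part of this commit):

    # Optional-import pattern from this commit; the build_norm fallback
    # below is hypothetical, and only the try/except mirrors the change.
    import torch

    try:
        from apex.normalization import MixedFusedRMSNorm
    except ImportError:
        MixedFusedRMSNorm = None

    def build_norm(hidden_size, eps=1e-5):
        if MixedFusedRMSNorm is None:
            # apex unavailable: fall back to PyTorch's built-in LayerNorm.
            return torch.nn.LayerNorm(hidden_size, eps=eps)
        return MixedFusedRMSNorm(hidden_size, eps=eps)
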
16 changes: 11 additions & 5 deletions megatron/model/transformer.py
@@ -45,7 +45,12 @@
     flash_attn_builder = FlashAttentionBuilder().load()
 except TypeError:
     flash_attn_builder = None
-from apex.normalization import MixedFusedRMSNorm
+
+try:
+    from apex.normalization import MixedFusedRMSNorm
+except ImportError:
+    MixedFusedRMSNorm = None
+
 
 
 """ We use the following notation throughout this file:
@@ -383,10 +388,11 @@ def forward(self, q, k, v):
         """
 
         assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v)))
-        if get_accelerator().device_name() == 'cuda':
-            assert all((i.is_cuda for i in (q,k,v)))
-        else:
-            assert all((i.is_xpu for i in (q,k,v)))
+        assert all((get_accelerator().on_accelerator(i) for i in (q, k, v)))
+        # if get_accelerator().device_name() == 'cuda':
+        #     assert all((i.is_cuda for i in (q,k,v)))
+        # else:
+        #     assert all((i.is_xpu for i in (q,k,v)))
 
         batch_size, seqlen_q = q.shape[0], q.shape[1]
         seqlen_k = k.shape[1]
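
This hunk is the "generalize api" core of the commit: instead of branching on get_accelerator().device_name() and asserting backend-specific flags (is_cuda for CUDA, is_xpu for Intel XPU), the check delegates to DeepSpeed's accelerator abstraction, whose on_accelerator(tensor) reports whether a tensor lives on the active accelerator regardless of backend. A small sketch of the device-agnostic check, assuming DeepSpeed is installed (the helper name and tensor shapes are illustrative):

    # Device-agnostic tensor placement check via DeepSpeed's accelerator API.
    import torch
    from deepspeed.accelerator import get_accelerator

    def assert_on_accelerator(*tensors):
        # Passes on whichever backend DeepSpeed selected (CUDA, XPU, ...),
        # with no per-device branching.
        assert all(get_accelerator().on_accelerator(t) for t in tensors)

    device = get_accelerator().device_name()  # e.g. 'cuda' or 'xpu'
    q = torch.randn(2, 128, 16, 64, dtype=torch.float16, device=device)
    assert_on_accelerator(q)
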
