diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 323207df20371..efb5b4c9d098f 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -595,7 +595,10 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mean_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
-      {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"mean",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::BFLOAT16})},
       {"merged_adam", XPUKernelSet({phi::DataType::FLOAT32})},
       {"merged_momentum",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
index aeb51998f4d7a..29e9c7e0f8901 100644
--- a/paddle/phi/backends/xpu/xpu3_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -571,7 +571,10 @@ XPUOpMap& get_kl3_ops() {
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mean_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
-      {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"mean",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::BFLOAT16})},
       {"merged_momentum",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"mish_grad", XPUKernelSet({phi::DataType::FLOAT32})},
diff --git a/paddle/phi/kernels/funcs/tensor_formatter.cc b/paddle/phi/kernels/funcs/tensor_formatter.cc
index 16d3b38bced7c..7c4cd28fe20c7 100644
--- a/paddle/phi/kernels/funcs/tensor_formatter.cc
+++ b/paddle/phi/kernels/funcs/tensor_formatter.cc
@@ -107,6 +107,10 @@ std::string TensorFormatter::Format(const phi::DenseTensor& print_tensor,
     FormatData<int64_t>(print_tensor, log_stream);
   } else if (dtype == phi::DataType::BOOL) {
     FormatData<bool>(print_tensor, log_stream);
+  } else if (dtype == phi::DataType::FLOAT16) {
+    FormatData<phi::dtype::float16>(print_tensor, log_stream);
+  } else if (dtype == phi::DataType::BFLOAT16) {
+    FormatData<phi::dtype::bfloat16>(print_tensor, log_stream);
   } else {
     log_stream << " - data: unprintable type: " << dtype << std::endl;
   }
@@ -153,6 +157,10 @@ template void TensorFormatter::FormatData<int>(
     const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
 template void TensorFormatter::FormatData<int64_t>(
     const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
+template void TensorFormatter::FormatData<phi::dtype::float16>(
+    const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
+template void TensorFormatter::FormatData<phi::dtype::bfloat16>(
+    const phi::DenseTensor& print_tensor, std::stringstream& log_stream);
 
 }  // namespace funcs
 }  // namespace paddle
diff --git a/test/legacy_test/test_print_op.py b/test/legacy_test/test_print_op.py
index 95c1dd420626d..1ce1a08643210 100755
--- a/test/legacy_test/test_print_op.py
+++ b/test/legacy_test/test_print_op.py
@@ -15,6 +15,7 @@
 import unittest
 
 import numpy as np
+from op_test import convert_float_to_uint16
 from simple_nets import init_data, simple_fc_net
 
 import paddle
@@ -30,14 +31,17 @@ class TestPrintOpCPU(unittest.TestCase):
     def setUp(self):
+        self.dtype = 'float32'
         self.place = paddle.CPUPlace()
         self.x_tensor = base.core.LoDTensor()
-        tensor_np = np.random.random(size=(2, 3)).astype('float32')
+        tensor_np = np.random.random(size=(2, 3)).astype(self.dtype)
         self.x_tensor.set(tensor_np, self.place)
         self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
     def build_network(self, only_forward, **kargs):
-        x = paddle.static.data('x', shape=[-1, 3], dtype='float32', lod_level=1)
+        x = paddle.static.data(
+            'x', shape=[-1, 3], dtype=self.dtype, lod_level=1
+        )
         x.stop_gradient = False
         paddle.static.Print(input=x, **kargs)
         loss = paddle.mean(x)
@@ -77,7 +81,7 @@ def test_all_parameters(self):
         prog = paddle.static.Program()
         with paddle.static.program_guard(prog, paddle.static.Program()):
             x = paddle.static.data(
-                'x', shape=[-1, 3], dtype='float32', lod_level=1
+                'x', shape=[-1, 3], dtype=self.dtype, lod_level=1
             )
             x.stop_gradient = False
@@ -136,9 +140,36 @@ def test_errors(self):
 )
 class TestPrintOpGPU(TestPrintOpCPU):
     def setUp(self):
+        self.dtype = 'float32'
         self.place = paddle.CUDAPlace(0)
         self.x_tensor = base.core.LoDTensor()
-        tensor_np = np.random.random(size=(2, 3)).astype('float32')
+        tensor_np = np.random.random(size=(2, 3)).astype(self.dtype)
+        self.x_tensor.set(tensor_np, self.place)
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestPrintOpGPUFP16(TestPrintOpCPU):
+    def setUp(self):
+        self.dtype = 'float16'
+        self.place = paddle.CUDAPlace(0)
+        self.x_tensor = base.core.LoDTensor()
+        tensor_np = np.random.random(size=(2, 3)).astype(self.dtype)
+        self.x_tensor.set(tensor_np, self.place)
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestPrintOpGPUBFP16(TestPrintOpCPU):
+    def setUp(self):
+        self.dtype = 'bfloat16'
+        self.place = paddle.CUDAPlace(0)
+        self.x_tensor = base.core.LoDTensor()
+        tensor_np = convert_float_to_uint16(np.random.random(size=(2, 3)))
         self.x_tensor.set(tensor_np, self.place)
         self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
diff --git a/test/xpu/op_test_xpu.py b/test/xpu/op_test_xpu.py
index 7ea5359de5044..09ee428714bd6 100644
--- a/test/xpu/op_test_xpu.py
+++ b/test/xpu/op_test_xpu.py
@@ -183,8 +183,8 @@ def check_grad_with_place(
             if not core.is_float16_supported(place):
                 return
 
-        if self.dtype == np.float16:
-            max_relative_error = 1.0
+        if self.dtype == np.float16 or self.dtype == np.uint16:
+            max_relative_error = 0.1
         return super().check_grad_with_place(
             place,
             inputs_to_check,
diff --git a/test/xpu/test_adamw_op_xpu.py b/test/xpu/test_adamw_op_xpu.py
index 1a777f2d23578..8584360837d79 100644
--- a/test/xpu/test_adamw_op_xpu.py
+++ b/test/xpu/test_adamw_op_xpu.py
@@ -84,7 +84,7 @@ def setUp(self):
         # Test AdamW Op with supplied attributes
         self.op_type = "adamw"
         self.init_shape()
-        self.dtype = self.in_type_str
+        self.dtype = self.in_type
         param = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
         grad = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
         moment1 = np.random.uniform(-1, 1, self.shape).astype("float32")
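
For reference, a minimal usage sketch (not part of the patch) of the bfloat16 print path this diff enables. It mirrors the new TestPrintOpGPUBFP16 case above and assumes a CUDA-enabled build with the op_test helper convert_float_to_uint16 importable, as in the test file:

import numpy as np
import paddle
from paddle import base
from op_test import convert_float_to_uint16  # same test helper imported in the diff above

paddle.enable_static()
place = paddle.CUDAPlace(0)

main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, paddle.static.Program()):
    # Same graph as TestPrintOpCPU.build_network, but with a bfloat16 input.
    x = paddle.static.data('x', shape=[-1, 3], dtype='bfloat16', lod_level=1)
    paddle.static.Print(input=x)  # hits the new BFLOAT16 branch in TensorFormatter::Format
    loss = paddle.mean(x)

# bfloat16 feed data is prepared as uint16, as in TestPrintOpGPUBFP16.setUp.
x_tensor = base.core.LoDTensor()
x_tensor.set(convert_float_to_uint16(np.random.random(size=(2, 3))), place)
x_tensor.set_recursive_sequence_lengths([[1, 1]])

exe = paddle.static.Executor(place)
exe.run(main_prog, feed={'x': x_tensor}, fetch_list=[loss], return_numpy=False)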