add nan inf check

shaojiewang committed Mar 30, 2023
1 parent b50038f commit c2afd9f
Showing 6 changed files with 787 additions and 61 deletions.
72 changes: 49 additions & 23 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -24,9 +24,28 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
+DECLARE_int32(check_nan_inf_level);
+
 namespace paddle {
 namespace framework {
 namespace details {
+struct DebugTools {
+  DebugTools() {}
+  std::string path = "";
+};
+static DebugTools debug_nan_inf;
+
+void SetNanInfDebugPath(const std::string& nan_inf_path) {
+  debug_nan_inf.path = nan_inf_path;
+  VLOG(4) << "Set the log's path of debug tools : " << nan_inf_path;
+}
+
+std::string GetNanPath() {
+  if (debug_nan_inf.path.empty()) {
+    return "";
+  }
+  return debug_nan_inf.path + "/";
+}
 
 static std::once_flag white_list_init_flag;
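For orientation, here is a minimal usage sketch of the new debug-path helpers. Only SetNanInfDebugPath and GetNanPath come from this commit; the standalone caller and the reading that an empty result means "no log directory configured" are assumptions.

    #include <iostream>
    #include <string>

    // Declarations as added in nan_inf_utils_detail.cc; links against Paddle.
    namespace paddle { namespace framework { namespace details {
    void SetNanInfDebugPath(const std::string& nan_inf_path);
    std::string GetNanPath();
    }}}  // namespace paddle::framework::details

    int main() {
      using namespace paddle::framework::details;
      SetNanInfDebugPath("/tmp/nan_inf_logs");  // hypothetical directory
      // Prints "/tmp/nan_inf_logs/"; prints nothing when no path was set.
      std::cout << GetNanPath() << std::endl;
      return 0;
    }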

@@ -90,15 +109,15 @@ static void InitWhiteListFormEnv() {
   const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE");
   const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR");
 
-  if (op_type_skip != NULL) {
+  if (op_type_skip) {
     std::stringstream ss(op_type_skip);
     std::string op_type;
     while (std::getline(ss, op_type, ',')) {
       op_type_nan_inf_white_list().emplace(op_type);
     }
   }
 
-  if (op_role_skip != NULL) {
+  if (op_role_skip) {
     std::stringstream ss(op_role_skip);
     std::string op_role;
     while (std::getline(ss, op_role, ',')) {
@@ -113,7 +132,7 @@ static void InitWhiteListFormEnv() {
     }
   }
 
-  if (op_var_skip != NULL) {
+  if (op_var_skip) {
     std::stringstream ss(op_var_skip);
     std::string op_var;
     while (std::getline(ss, op_var, ',')) {
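The three blocks above share one comma-split idiom. A self-contained sketch of that pattern, with an illustrative helper name (ParseSkipList is not Paddle API):

    #include <cstdlib>
    #include <sstream>
    #include <string>
    #include <unordered_set>

    // Split a comma-separated environment variable into a skip set.
    static std::unordered_set<std::string> ParseSkipList(const char* env_name) {
      std::unordered_set<std::string> result;
      const char* value = std::getenv(env_name);  // nullptr when unset
      if (value) {
        std::stringstream ss(value);
        std::string item;
        while (std::getline(ss, item, ',')) {
          result.emplace(item);
        }
      }
      return result;
    }

    // e.g. with PADDLE_INF_NAN_SKIP_VAR="fc_out,softmax_out" (values illustrative):
    //   auto skip_vars = ParseSkipList("PADDLE_INF_NAN_SKIP_VAR");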
@@ -326,13 +345,13 @@ void TensorCheckerVisitor<phi::CPUContext>::apply(
   // use env strategy control in future, -1=print_all.
   int print_num = 3;
   CheckNanInf(
-      tensor_.data<T>(), tensor_.numel(), print_num, op_type_, var_name_);
+      tensor.data<T>(), tensor.numel(), print_num, op_type, var_name);
 }
 
 template <>
 void tensor_check<phi::CPUContext>(const std::string& op_type,
                                    const std::string& var_name,
-                                   const framework::Tensor& tensor,
+                                   const phi::DenseTensor& tensor,
                                    const platform::Place& place) {
   TensorCheckerVisitor<phi::CPUContext> vistor(
       op_type, var_name, tensor, place);
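CheckNanInf itself is outside this hunk; conceptually it scans the buffer and reports up to print_num offending values. A standalone sketch of that idea, not the actual Paddle implementation:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    template <typename T>
    void CheckNanInfSketch(const T* data, int64_t numel, int print_num,
                           const char* op_type, const char* var_name) {
      int found = 0;
      for (int64_t i = 0; i < numel; ++i) {
        const float v = static_cast<float>(data[i]);
        if (std::isnan(v) || std::isinf(v)) {
          if (found < print_num) {  // print only the first few hits
            std::printf("op=%s var=%s idx=%lld value=%f\n", op_type, var_name,
                        static_cast<long long>(i), v);
          }
          ++found;
        }
      }
      // The real check raises a fatal error on any hit (platform::errors::Fatal
      // appears later in this file); a sketch only counts and reports.
      if (found > 0) std::printf("found %d nan/inf elements\n", found);
    }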
@@ -348,9 +367,9 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          platform::errors::NotFound(
                              "Cannot find var: `%s` in op `%s`.", var_name, op_type));
 
-  const Tensor* tensor{nullptr};
-  if (var->IsType<framework::LoDTensor>()) {
-    tensor = &var->Get<framework::LoDTensor>();
+  const phi::DenseTensor* tensor{nullptr};
+  if (var->IsType<phi::DenseTensor>()) {
+    tensor = &var->Get<phi::DenseTensor>();
   } else if (var->IsType<phi::SelectedRows>()) {
     tensor = &var->Get<phi::SelectedRows>().value();
   } else {
@@ -371,7 +390,8 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
+        "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile "
+        "with GPU.",
         var_name));
 #endif
     return;
@@ -400,10 +420,13 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         flag,
         true,
         platform::errors::Fatal(
-            "Operator %s output Tensor %s contains Inf.", op_type, var_name));
+            "Operator %s output phi::DenseTensor %s contains Inf.",
+            op_type,
+            var_name));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Tensor[%s] use xpu place. PaddlePaddle must compile with XPU.",
+        "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
+        "with XPU.",
         var_name));
 #endif
     return;
@@ -414,7 +437,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     return;
   }
 
-  framework::LoDTensor cpu_tensor;
+  phi::DenseTensor cpu_tensor;
   cpu_tensor.Resize(tensor->dims());
   float* cpu_data = static_cast<float*>(
       cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
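Each device branch reduces to the same staging pattern: blocking copy to host, then the CPU scan. A hedged sketch assuming the usual Paddle headers; HostScanForNanInf is a hypothetical stand-in for the CPU-side check:

    void CheckOnHostSketch(const phi::DenseTensor& device_tensor,
                           const std::string& op_type,
                           const std::string& var_name) {
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(device_tensor.dims());
      cpu_tensor.mutable_data(platform::CPUPlace(), device_tensor.dtype());
      // TensorCopySync blocks until the host buffer is complete.
      framework::TensorCopySync(device_tensor, platform::CPUPlace(), &cpu_tensor);
      HostScanForNanInf(cpu_tensor, op_type, var_name);  // hypothetical helper
    }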
@@ -431,10 +454,13 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         flag,
         true,
         platform::errors::Fatal(
-            "Operator %s output Tensor %s contains Inf.", op_type, var_name));
+            "Operator %s output phi::DenseTensor %s contains Inf.",
+            op_type,
+            var_name));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Tensor[%s] use npu place. PaddlePaddle must compile with NPU.",
+        "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
+        "with NPU.",
         var_name));
 #endif
     return;
@@ -473,8 +499,8 @@ using NpuOpRunner = paddle::operators::NpuOpRunner;
 
 constexpr int FLOAT_STATUS_SIZE = 8;
 
-static framework::Tensor& npu_float_status() {
-  static framework::Tensor float_status;
+static phi::DenseTensor& npu_float_status() {
+  static phi::DenseTensor float_status;
   return float_status;
 }

@@ -494,7 +520,7 @@ void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
   flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
   NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
 
-  framework::Tensor tmp;
+  phi::DenseTensor tmp;
   tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
   NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
 }
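Read together with the NPUCheckOpHasNanOrInf hunk further down, the float-status flow appears to be: allocate the eight-float status buffer, clear it before the op runs, and read it back afterwards. A sketch of that presumed sequence, using only runner calls visible in this diff:

    template <typename Stream>
    void NpuFloatStatusLifecycleSketch(const platform::Place& place,
                                       Stream stream) {
      auto& flag = npu_float_status();  // process-wide status tensor
      flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
      NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);

      phi::DenseTensor tmp;  // placeholder; the runners update `flag` in place
      tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
      NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);

      // ... run the operator under inspection ...

      NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
    }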
@@ -503,9 +529,9 @@ void PrintNpuVarInfo(const std::string& op_type,
                      const std::string& var_name,
                      const framework::Variable* var,
                      const platform::Place& place) {
-  const Tensor* tensor{nullptr};
-  if (var->IsType<framework::LoDTensor>()) {
-    tensor = &var->Get<framework::LoDTensor>();
+  const phi::DenseTensor* tensor{nullptr};
+  if (var->IsType<phi::DenseTensor>()) {
+    tensor = &var->Get<phi::DenseTensor>();
   } else if (var->IsType<phi::SelectedRows>()) {
     tensor = &var->Get<phi::SelectedRows>().value();
   } else {
@@ -528,7 +554,7 @@ void PrintNpuVarInfo(const std::string& op_type,
   VLOG(10) << "begin check " << op_type << " var_name:" << var_name
            << ", place:" << tensor->place() << ", numel:" << tensor->numel();
 
-  framework::Tensor cpu_tensor;
+  phi::DenseTensor cpu_tensor;
   cpu_tensor.Resize(tensor->dims());
   cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
   framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
@@ -575,13 +601,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
   auto stream = dev_ctx->stream();
 
   auto& flag = npu_float_status();
-  Tensor tmp;
+  phi::DenseTensor tmp;
   tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
   // NPUGetFloatStatus updates data on input in-place.
   // tmp is only placeholder.
   NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
 
-  framework::Tensor cpu_tensor;
+  phi::DenseTensor cpu_tensor;
   auto cpu_place = platform::CPUPlace();
   float* cpu_data = static_cast<float*>(
       cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
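The hunk is truncated here, but the copied status presumably drives the verdict: once the eight floats reach the host, a nonzero entry would mean some kernel raised a floating-point exception. A hypothetical continuation; the actual decision logic is outside the visible diff:

    // Assumption: nonzero status entries signal that a NaN/Inf was produced.
    static bool FoundNanInfSketch(const float* cpu_data) {
      for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
        if (cpu_data[i] != 0.0f) {
          return true;
        }
      }
      return false;
    }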