add nan inf check

shaojiewang committed Mar 30, 2023
1 parent b50038f commit c2afd9f
Showing 6 changed files with 787 additions and 61 deletions.
72 changes: 49 additions & 23 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -24,9 +24,28 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
+DECLARE_int32(check_nan_inf_level);
+
 namespace paddle {
 namespace framework {
 namespace details {
+struct DebugTools {
+  DebugTools() {}
+  std::string path = "";
+};
+static DebugTools debug_nan_inf;
+
+void SetNanInfDebugPath(const std::string& nan_inf_path) {
+  debug_nan_inf.path = nan_inf_path;
+  VLOG(4) << "Set the log's path of debug tools : " << nan_inf_path;
+}
+
+std::string GetNanPath() {
+  if (debug_nan_inf.path.empty()) {
+    return "";
+  }
+  return debug_nan_inf.path + "/";
+}
 
 static std::once_flag white_list_init_flag;
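For orientation, here is a minimal usage sketch of the new debug-path helpers. Only SetNanInfDebugPath and GetNanPath come from this commit; the standalone caller and the reading that an empty result means "no log directory configured" are assumptions.

    #include <iostream>
    #include <string>

    // Declarations as added in nan_inf_utils_detail.cc; links against Paddle.
    namespace paddle { namespace framework { namespace details {
    void SetNanInfDebugPath(const std::string& nan_inf_path);
    std::string GetNanPath();
    }}}  // namespace paddle::framework::details

    int main() {
      using namespace paddle::framework::details;
      SetNanInfDebugPath("/tmp/nan_inf_logs");  // hypothetical directory
      // Prints "/tmp/nan_inf_logs/"; prints nothing when no path was set.
      std::cout << GetNanPath() << std::endl;
      return 0;
    }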

@@ -90,15 +109,15 @@ static void InitWhiteListFormEnv() {
   const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE");
   const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR");
 
-  if (op_type_skip != NULL) {
+  if (op_type_skip) {
     std::stringstream ss(op_type_skip);
     std::string op_type;
     while (std::getline(ss, op_type, ',')) {
       op_type_nan_inf_white_list().emplace(op_type);
     }
   }
 
-  if (op_role_skip != NULL) {
+  if (op_role_skip) {
     std::stringstream ss(op_role_skip);
     std::string op_role;
     while (std::getline(ss, op_role, ',')) {
@@ -113,7 +132,7 @@ static void InitWhiteListFormEnv() {
     }
   }
 
-  if (op_var_skip != NULL) {
+  if (op_var_skip) {
     std::stringstream ss(op_var_skip);
     std::string op_var;
     while (std::getline(ss, op_var, ',')) {
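The three blocks above share one comma-split idiom. A self-contained sketch of that pattern, with an illustrative helper name (ParseSkipList is not Paddle API):

    #include <cstdlib>
    #include <sstream>
    #include <string>
    #include <unordered_set>

    // Split a comma-separated environment variable into a skip set.
    static std::unordered_set<std::string> ParseSkipList(const char* env_name) {
      std::unordered_set<std::string> result;
      const char* value = std::getenv(env_name);  // nullptr when unset
      if (value) {
        std::stringstream ss(value);
        std::string item;
        while (std::getline(ss, item, ',')) {
          result.emplace(item);
        }
      }
      return result;
    }

    // e.g. with PADDLE_INF_NAN_SKIP_VAR="fc_out,softmax_out" (values illustrative):
    //   auto skip_vars = ParseSkipList("PADDLE_INF_NAN_SKIP_VAR");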
@@ -326,13 +345,13 @@ void TensorCheckerVisitor<phi::CPUContext>::apply(
   // use env strategy control in future, -1=print_all.
   int print_num = 3;
   CheckNanInf(
-      tensor_.data<T>(), tensor_.numel(), print_num, op_type_, var_name_);
+      tensor.data<T>(), tensor.numel(), print_num, op_type, var_name);
 }
 
 template <>
 void tensor_check<phi::CPUContext>(const std::string& op_type,
                                    const std::string& var_name,
-                                   const framework::Tensor& tensor,
+                                   const phi::DenseTensor& tensor,
                                    const platform::Place& place) {
   TensorCheckerVisitor<phi::CPUContext> vistor(
       op_type, var_name, tensor, place);
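CheckNanInf itself is outside this hunk; conceptually it scans the buffer and reports up to print_num offending values. A standalone sketch of that idea, not the actual Paddle implementation:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    template <typename T>
    void CheckNanInfSketch(const T* data, int64_t numel, int print_num,
                           const char* op_type, const char* var_name) {
      int found = 0;
      for (int64_t i = 0; i < numel; ++i) {
        const float v = static_cast<float>(data[i]);
        if (std::isnan(v) || std::isinf(v)) {
          if (found < print_num) {  // print only the first few hits
            std::printf("op=%s var=%s idx=%lld value=%f\n", op_type, var_name,
                        static_cast<long long>(i), v);
          }
          ++found;
        }
      }
      // The real check raises a fatal error on any hit (platform::errors::Fatal
      // appears later in this file); a sketch only counts and reports.
      if (found > 0) std::printf("found %d nan/inf elements\n", found);
    }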
@@ -348,9 +367,9 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          platform::errors::NotFound(
                              "Cannot find var: `%s` in op `%s`.", var_name, op_type));
 
-  const Tensor* tensor{nullptr};
-  if (var->IsType<framework::LoDTensor>()) {
-    tensor = &var->Get<framework::LoDTensor>();
+  const phi::DenseTensor* tensor{nullptr};
+  if (var->IsType<phi::DenseTensor>()) {
+    tensor = &var->Get<phi::DenseTensor>();
   } else if (var->IsType<phi::SelectedRows>()) {
     tensor = &var->Get<phi::SelectedRows>().value();
   } else {
@@ -371,7 +390,8 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
+        "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile "
+        "with GPU.",
         var_name));
 #endif
     return;
@@ -400,10 +420,13 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         flag,
         true,
         platform::errors::Fatal(
-            "Operator %s output Tensor %s contains Inf.", op_type, var_name));
+            "Operator %s output phi::DenseTensor %s contains Inf.",
+            op_type,
+            var_name));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Tensor[%s] use xpu place. PaddlePaddle must compile with XPU.",
+        "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
+        "with XPU.",
         var_name));
 #endif
     return;
@@ -414,7 +437,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
     return;
   }
 
-  framework::LoDTensor cpu_tensor;
+  phi::DenseTensor cpu_tensor;
   cpu_tensor.Resize(tensor->dims());
   float* cpu_data = static_cast<float*>(
       cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
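Each device branch reduces to the same staging pattern: blocking copy to host, then the CPU scan. A hedged sketch assuming the usual Paddle headers; HostScanForNanInf is a hypothetical stand-in for the CPU-side check:

    void CheckOnHostSketch(const phi::DenseTensor& device_tensor,
                           const std::string& op_type,
                           const std::string& var_name) {
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(device_tensor.dims());
      cpu_tensor.mutable_data(platform::CPUPlace(), device_tensor.dtype());
      // TensorCopySync blocks until the host buffer is complete.
      framework::TensorCopySync(device_tensor, platform::CPUPlace(), &cpu_tensor);
      HostScanForNanInf(cpu_tensor, op_type, var_name);  // hypothetical helper
    }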
@@ -431,10 +454,13 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         flag,
         true,
         platform::errors::Fatal(
-            "Operator %s output Tensor %s contains Inf.", op_type, var_name));
+            "Operator %s output phi::DenseTensor %s contains Inf.",
+            op_type,
+            var_name));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Tensor[%s] use npu place. PaddlePaddle must compile with NPU.",
+        "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
+        "with NPU.",
         var_name));
 #endif
     return;
@@ -473,8 +499,8 @@ using NpuOpRunner = paddle::operators::NpuOpRunner;
 
 constexpr int FLOAT_STATUS_SIZE = 8;
 
-static framework::Tensor& npu_float_status() {
-  static framework::Tensor float_status;
+static phi::DenseTensor& npu_float_status() {
+  static phi::DenseTensor float_status;
   return float_status;
 }

@@ -494,7 +520,7 @@ void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
   flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
   NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
 
-  framework::Tensor tmp;
+  phi::DenseTensor tmp;
   tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
   NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
 }
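Read together with the NPUCheckOpHasNanOrInf hunk further down, the float-status flow appears to be: allocate the eight-float status buffer, clear it before the op runs, and read it back afterwards. A sketch of that presumed sequence, using only runner calls visible in this diff:

    template <typename Stream>
    void NpuFloatStatusLifecycleSketch(const platform::Place& place,
                                       Stream stream) {
      auto& flag = npu_float_status();  // process-wide status tensor
      flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
      NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);

      phi::DenseTensor tmp;  // placeholder; the runners update `flag` in place
      tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
      NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);

      // ... run the operator under inspection ...

      NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
    }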
@@ -503,9 +529,9 @@ void PrintNpuVarInfo(const std::string& op_type,
                      const std::string& var_name,
                      const framework::Variable* var,
                      const platform::Place& place) {
-  const Tensor* tensor{nullptr};
-  if (var->IsType<framework::LoDTensor>()) {
-    tensor = &var->Get<framework::LoDTensor>();
+  const phi::DenseTensor* tensor{nullptr};
+  if (var->IsType<phi::DenseTensor>()) {
+    tensor = &var->Get<phi::DenseTensor>();
   } else if (var->IsType<phi::SelectedRows>()) {
     tensor = &var->Get<phi::SelectedRows>().value();
   } else {
@@ -528,7 +554,7 @@ void PrintNpuVarInfo(const std::string& op_type,
   VLOG(10) << "begin check " << op_type << " var_name:" << var_name
            << ", place:" << tensor->place() << ", numel:" << tensor->numel();
 
-  framework::Tensor cpu_tensor;
+  phi::DenseTensor cpu_tensor;
   cpu_tensor.Resize(tensor->dims());
   cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
   framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
@@ -575,13 +601,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
   auto stream = dev_ctx->stream();
 
   auto& flag = npu_float_status();
-  Tensor tmp;
+  phi::DenseTensor tmp;
   tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
   // NPUGetFloatStatus updates data on input in-place.
   // tmp is only placeholder.
   NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
 
-  framework::Tensor cpu_tensor;
+  phi::DenseTensor cpu_tensor;
   auto cpu_place = platform::CPUPlace();
   float* cpu_data = static_cast<float*>(
       cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
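The hunk is truncated here, but the copied status presumably drives the verdict: once the eight floats reach the host, a nonzero entry would mean some kernel raised a floating-point exception. A hypothetical continuation; the actual decision logic is outside the visible diff:

    // Assumption: nonzero status entries signal that a NaN/Inf was produced.
    static bool FoundNanInfSketch(const float* cpu_data) {
      for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
        if (cpu_data[i] != 0.0f) {
          return true;
        }
      }
      return false;
    }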