Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【Fix PIR Unittest No.75,219,230,279,362】Fix some test cast in PIR #64442

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ static const std::unordered_map<std::string, int>& role_str2int() {
return _role_str2int;
}

static std::unordered_set<std::string>& op_type_nan_inf_white_list() {
std::unordered_set<std::string>& op_type_nan_inf_white_list() {
static std::unordered_set<std::string> _op_type_nan_inf_white_list = {
"coalesce_tensor", /* This Op will alloc tensor, and may not init space */
};
return _op_type_nan_inf_white_list;
}

static std::unordered_map<std::string, std::vector<std::string>>&
std::unordered_map<std::string, std::vector<std::string>>&
op_var_nan_inf_white_list() {
static std::unordered_map<std::string, std::vector<std::string>>
_op_var_nan_inf_white_list = {
Expand All @@ -100,7 +100,7 @@ op_var_nan_inf_white_list() {
return _op_var_nan_inf_white_list;
}

static void InitWhiteListFormEnv() {
void InitWhiteListFormEnv() {
// op_type_skip and op_var_skip may be NULL.
// So need init static value in there, prevent thread competition.
// NOTE. role_str2int needn't do this for it only used in this func.
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/framework/details/nan_inf_utils_detail.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ void tensor_check(const std::string& op_type,
VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor);
}

void InitWhiteListFormEnv();
std::unordered_set<std::string>& op_type_nan_inf_white_list();
std::unordered_map<std::string, std::vector<std::string>>&
op_var_nan_inf_white_list();
} // namespace details
} // namespace framework
} // namespace paddle
99 changes: 99 additions & 0 deletions paddle/fluid/framework/new_executor/nan_inf_utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/nan_inf_utils.h"

#include "paddle/common/flags.h"
#include "paddle/fluid/framework/details/nan_inf_utils_detail.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#include "paddle/phi/core/selected_rows.h"

namespace paddle {
namespace framework {

static std::once_flag pir_white_list_init_flag;

void CheckTensorHasNanOrInf(InstructionBase* instruction,
const paddle::framework::Scope* scope,
ValueExecutionInfo* value_exe_info) {
std::call_once(pir_white_list_init_flag, details::InitWhiteListFormEnv);

std::string dialect_name = instruction->Operation()
->attributes()
.at("op_name")
.dyn_cast<pir::StrAttribute>()
.AsString();
auto api_name =
dialect_name.substr(dialect_name.find(".") + 1, dialect_name.size());
auto op_name = phi::TransToFluidOpName(api_name);

if (details::op_type_nan_inf_white_list().count(api_name) != 0) {
return;
}

for (auto iter : instruction->Outputs()) {
auto tensor_name = value_exe_info->GetVarName(iter.first);
bool need_check = true;
if (details::op_var_nan_inf_white_list().count(api_name) != 0) {
for (auto& white_vname :
details::op_var_nan_inf_white_list().at(api_name)) {
if (tensor_name.find(white_vname) != std::string::npos) {
need_check = false;
break;
}
}
}
if (!need_check) continue;

if (scope) {
const phi::DenseTensor* dense_tensor{nullptr};
Variable* var = scope->FindVar(tensor_name);
if (!var) {
VLOG(10) << "No var found for tensor_name: " << tensor_name;
continue;
}
if (var->IsType<phi::DenseTensor>()) {
dense_tensor = var->GetMutable<phi::DenseTensor>();
} else if (var->IsType<phi::SelectedRows>()) {
dense_tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
} else {
VLOG(10) << "Only DenseTensor,SelectedRows,DistTensor need to check, "
<< tensor_name << " is no need.";
break;
}

auto& place = dense_tensor->place();
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::framework::details::tensor_check<phi::GPUContext>(
api_name, tensor_name, *dense_tensor, place);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
tensor_name));
#endif
continue;
}
paddle::framework::details::tensor_check<phi::CPUContext>(
api_name, tensor_name, *dense_tensor, place);
}
}
}

} // namespace framework
} // namespace paddle
29 changes: 29 additions & 0 deletions paddle/fluid/framework/new_executor/nan_inf_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"

namespace paddle {
namespace framework {

void CheckTensorHasNanOrInf(InstructionBase* instruction,
const paddle::framework::Scope* scope,
ValueExecutionInfo* value_exe_info);

} // namespace framework
} // namespace paddle
4 changes: 4 additions & 0 deletions paddle/fluid/framework/new_executor/pir_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
#include "paddle/phi/core/distributed/nccl_comm_context.h"
COMMON_DECLARE_bool(dynamic_static_unified_comm);
#endif
#include "paddle/fluid/framework/new_executor/nan_inf_utils.h"

COMMON_DECLARE_bool(enable_pir_in_executor);
COMMON_DECLARE_bool(enable_pir_in_executor_trace_run);
Expand Down Expand Up @@ -1829,6 +1830,9 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) {
<< "): context wait and get last error";
#endif
}
if (FLAGS_check_nan_inf) {
CheckTensorHasNanOrInf(instr_node, scope_, value_exe_info_.get());
}
VLOG(2) << "\ndone: " << __func__ << " OP id:" << instr_node->Id()
<< " name:" << instr_node->Name() << " type:"
<< (instr_node->KernelType() == OpFuncType::kCpuSync
Expand Down
1 change: 0 additions & 1 deletion test/deprecated/legacy_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,6 @@ set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120)
set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300)
set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT
120)
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1077,3 +1077,4 @@ set_pir_tests_properties()

set_tests_properties(test_nadam_op PROPERTIES TIMEOUT 100)
set_tests_properties(test_radam_op PROPERTIES TIMEOUT 100)
set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120)
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,10 @@ def test_check_stack(self):
self.check_stack(" check_nan_inf_backward_stack.py")

def test_static_check_stack(self):
self.check_stack(" check_nan_inf_backward_static_stack.py")
if not paddle.framework.in_pir_mode() and not os.environ.get(
"FLAGS_enable_pir_api"
):
self.check_stack(" check_nan_inf_backward_static_stack.py")


class TestNanInfCheckResult(TestNanInfBase):
Expand Down