diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 0d340ab638b1a..83411a68f0847 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220327") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220331") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 17432a0c043f2..06b0583eddf24 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,9 +1,6 @@ add_subdirectory(collective) add_subdirectory(store) if(NOT WITH_PSCORE) - if(WITH_HETERPS) - add_subdirectory(ps) - endif() add_subdirectory(fleet_executor) return() endif() diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 79961cca85ae0..ec02406efc818 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -320,7 +320,7 @@ EagerReducer::EagerReducer( if (find_unused_vars_each_step_) { global_used_vars_ = paddle::experimental::empty( - ScalarArray({static_cast(tensors_.size())}), DataType::INT32, + IntArray({static_cast(tensors_.size())}), DataType::INT32, inner_place_); } } @@ -364,7 +364,7 @@ void EagerReducer::InitializeGroups( // process the dense gradient. InitializeDenseGroups(tensor_indices_, &group); group.dense_contents_ = paddle::experimental::empty( - ScalarArray({group.all_length_}), group.dtype_, inner_place_); + IntArray({group.all_length_}), group.dtype_, inner_place_); } // map tensors to this group by VariableLocator @@ -403,7 +403,7 @@ void EagerReducer::InitializeDenseGroups( p_group->length_.push_back(size); // for concat operator - p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->origin_shapes_.push_back(IntArray(tensor.shape())); p_group->dense_tensors_.push_back(phi::DenseTensor()); const auto &dtype = tensor.dtype(); diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index d3ffa8498a14b..848277f5fad4e 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -35,8 +35,8 @@ namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; using Scalar = paddle::experimental::ScalarBase; -using ScalarArray = - paddle::experimental::ScalarArrayBase; +using IntArray = + paddle::experimental::IntArrayBase; using Backend = paddle::experimental::Backend; std::vector> Eager_AssignGroupBySize( @@ -52,7 +52,7 @@ class EagerGroup { std::vector dense_tensors_; std::vector length_; int64_t all_length_{0}; - std::vector origin_shapes_; + std::vector origin_shapes_; // Global indices of participating tensors in the group std::vector tensor_indices_; diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt old mode 100644 new mode 100755 index ab6c2e2600274..b8de291072a1f --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -39,8 +39,8 @@ cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) cc_library(communicator SRCS communicator/communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS}) 
cc_library(ps_service SRCS ps_service/service.cc DEPS communicator client server boost ${RPC_DEPS}) -cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) +cc_library(heter_server SRCS heter_server.cc DEPS heter_client brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(ps_service/graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(graph_py_service SRCS ps_service/graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc old mode 100644 new mode 100755 index 9674717ffc24b..5a92afb297c7e --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -55,6 +55,8 @@ DEFINE_int32(pserver_sparse_merge_thread, 1, "pserver sparse merge thread num"); DEFINE_int32(pserver_sparse_table_shard_num, 1000, "sparse table shard for save & load"); +DEFINE_int32(heter_world_size, 100, "group size"); // configurable + namespace paddle { namespace framework { class Scope; @@ -1518,7 +1520,7 @@ void sparse_local_merge(ValueAccessor *accessor, float *merge_data, merge_data_shell[i] = merge_data + i; another_data_shell[i] = another_data + i; } - accessor->merge(merge_data_shell, another_data_shell, 1); + accessor->Merge(merge_data_shell, another_data_shell, 1); } int BrpcPsClient::push_sparse_async_shard_merge( @@ -1757,7 +1759,7 @@ void BrpcPsClient::push_dense_task_consume() { async_task]() -> int { auto &tmp_task_vec = *(async_task->data()); const float *merge_data = tmp_task_vec.data(); - accessor->merge(&total_send_data, &merge_data, + accessor->Merge(&total_send_data, &merge_data, total_send_data_size); #pragma optimize("", off) auto *debug_closure = closure; diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc index 0d7624baec580..2e77020c30751 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -206,7 +206,8 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, } auto res_data = butil::get_object>(); - res_data->resize(num * table->value_accesor()->select_size() / sizeof(float)); + res_data->resize(num * table->value_accesor()->GetTableInfo(SELECT_SIZE) / + sizeof(float)); TableContext table_context; table_context.value_type = Dense; table_context.pull_context.values = res_data->data(); @@ -385,7 +386,7 @@ int32_t BrpcPsService::pull_sparse(Table *table, CostTimer timer("pserver_server_pull_sparse"); uint32_t num = *(uint32_t *)(request.params(0).c_str()); - auto dim = table->value_accesor()->select_dim(); + auto dim = table->value_accesor()->GetTableInfo(SELECT_DIM); thread_local std::string req_buffer; req_buffer.reserve(req_buffer_size); diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index d6287cda6d443..4ca25dac826f0 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -13,18 +13,14 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/service/heter_client.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/string/split.h" - -DECLARE_int32(rpc_deadline); -DECLARE_int32(pserver_timeout_ms); namespace paddle { namespace distributed { -std::shared_ptr HeterClient::s_instance_ = NULL; -bool HeterClient::is_initialized_ = false; +std::shared_ptr HeterClient::s_instance_ = nullptr; int GetMicroId(const platform::DeviceContext& ctx, const framework::Scope* scope) { @@ -54,58 +50,21 @@ int GetMicroId(const platform::DeviceContext& ctx, return micro_id; } -void HeterClient::MainThread() { - while (running_) { - RpcProfilerControl(); - } -} - void HeterClient::Stop() { - running_ = false; - if (!is_initialized_) { - VLOG(3) << "HeterClient is not inited, do nothing"; - } else { - if (main_thread_) { - auto status = StopHeterWorker(); - status.wait(); - main_thread_->join(); - main_thread_.reset(nullptr); - } - VLOG(3) << "HeterClient Stop Done"; - } -} - -void HeterClient::FinalizeWorker() { - running_ = false; - if (!is_initialized_) { - VLOG(3) << "HeterClient is not inited, do nothing"; - } else { - if (main_thread_) { - main_thread_->join(); - main_thread_.reset(nullptr); - } - VLOG(3) << "HeterClient Stop Done"; - } + auto status = StopHeterWorker(); + status.wait(); } std::future HeterClient::StopHeterWorker() { return SendCmd(-1, PS_STOP_SERVER, {}); } -void HeterClient::RpcProfilerControl() { - if (trainer_id_ == 0) { - if (!do_server_profiler_ && platform::IsProfileEnabled()) { - // send profiler start flag - do_server_profiler_ = true; - auto start_status = StartProfiler(); - start_status.wait(); - } else if (do_server_profiler_ && !platform::IsProfileEnabled()) { - // send profiler end flag - auto stop_status = StopProfiler(); - stop_status.wait(); - do_server_profiler_ = false; - } - } +std::future HeterClient::StartProfiler() { + return SendCmd(-1, PS_START_PROFILER, {}); +} + +std::future HeterClient::StopProfiler() { + return SendCmd(-1, PS_STOP_PROFILER, {}); } void HeterClient::CreateClient2XpuConnection() { @@ -156,27 +115,24 @@ void HeterClient::SendAndRecvAsync( 1); const platform::DeviceContext* p_ctx = &ctx; const framework::Scope* p_scope = &scope; - const std::string message_name_val = message_name; const std::vector send_var_name_val = send_var_name; const std::vector recv_var_name_val = recv_var_name; - VLOG(3) << "BRPCClient::SendAndRecv Begin, message_name: " - << message_name_val; + VLOG(3) << "BRPCClient::SendAndRecv Begin, message_name: " << message_name; brpc::Channel* channel = nullptr; distributed::MultiVarMsg request; - OnHeterRpcDone* closure = new OnHeterRpcDone([p_ctx, p_scope](void* done) { + OnHeterRpcDone* closure = new OnHeterRpcDone([](void* done) { auto* closure = reinterpret_cast(done); PADDLE_ENFORCE_NE( closure->cntl.Failed(), true, platform::errors::Unimplemented( "HeterClient::SendAndRecv meets brpc error, error message is %s", closure->cntl.ErrorText())); - VLOG(4) << "call heter_worker success"; }); closure->cntl.set_timeout_ms(FLAGS_pserver_timeout_ms); auto& request_io_buffer = closure->cntl.request_attachment(); distributed::SerializeToMultiVarMsgAndIOBuf( - message_name_val, send_var_name_val, recv_var_name_val, *p_ctx, p_scope, + message_name, send_var_name_val, recv_var_name_val, *p_ctx, p_scope, &request, &request_io_buffer); int micro_id = GetMicroId(ctx, p_scope); @@ -188,6 +144,19 @@ void HeterClient::SendAndRecvAsync( } else if (mode == "backward") { int num = 
minibatch_id % previous_xpu_channels_.size(); channel = previous_xpu_channels_[num].get(); + } else if (mode == "send_to_switch") { + VLOG(4) << "calling switch service"; + // auto promise = std::make_shared>(); + // closure->add_promise(promise); + // std::future fut = promise->get_future(); + // int idx = 1; // for test + // LOG(INFO) << "xpu_channels_ size: " << xpu_channels_.size(); + // channel = xpu_channels_[idx].get(); // to adapt to the send_and_recv op + // ::paddle::distributed::PsService_Stub stub(channel); + // stub.SendToSwitch(&closure->cntl, &request, &closure->response, + // closure); fut.wait(); + VLOG(4) << "calling switch service done"; + return; + } ::paddle::distributed::PsService_Stub stub(channel); stub.SendAndRecvVariable(&closure->cntl, &request, &closure->response, @@ -229,13 +198,209 @@ std::future HeterClient::SendCmd( return fut; } -std::future HeterClient::StartProfiler() { - return SendCmd(-1, PS_START_PROFILER, {}); +int HeterClient::Send(const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& message_name, + const std::vector& send_var_names) { + const framework::Scope* p_scope = &scope; // note: it is const + OnHeterRpcDone* closure = new OnHeterRpcDone([](void* done) { + auto* closure = reinterpret_cast(done); + int ret = 0; + closure->set_promise_value(ret); + if (closure->cntl.Failed()) { + PADDLE_ENFORCE_NE( + closure->cntl.Failed(), true, + platform::errors::Unimplemented( + "HeterClient::SendToSwitch meets brpc error, error message is %s", + closure->cntl.ErrorText())); + } + }); + + closure->cntl.set_timeout_ms(FLAGS_pserver_timeout_ms); + auto& request_io_buffer = closure->cntl.request_attachment(); + + distributed::MultiVarMsg request; + // 1. set req message_name(string) + request.set_message_name(message_name); + + // 2. set req send_var_names() + for (auto& send_var_name : send_var_names) { + request.add_send_var_names(send_var_name); + } + + // 3. 
set req var_messages() + for (auto& send_var_name : send_var_names) { + auto* send_var_msg = request.add_var_messages(); + send_var_msg->set_varname(send_var_name); + framework::Variable* var = p_scope->FindVar(send_var_name); + butil::IOBuf temp_iobuf; + if (var->IsType()) { + SerializeLodTensor(var, ctx, send_var_msg, &temp_iobuf); + } else if (var->IsType()) { + SerializeSelectedRows(var, ctx, send_var_msg, &temp_iobuf); + } + request_io_buffer.append(temp_iobuf); + } + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + if (send_switch_channels_.empty()) { + LOG(ERROR) << "send_switch_channels_ is null, get xpu_channels_[0]"; + if (xpu_channels_.empty()) { + LOG(ERROR) << "xpu_channels_ is null"; + } + send_switch_channels_.push_back(xpu_channels_[0]); + } + brpc::Channel* channel = send_switch_channels_[0].get(); + // brpc::Channel* channel = xpu_channels_[0].get(); + ::paddle::distributed::PsService_Stub stub(channel); + stub.SendToSwitch(&closure->cntl, &request, &closure->ps_response, closure); + + VLOG(4) << "waiting SendToSwitch response result......"; + fut.wait(); + VLOG(4) << "Send done"; + return 0; } -std::future HeterClient::StopProfiler() { - return SendCmd(-1, PS_STOP_PROFILER, {}); +int HeterClient::Send(int group_id, const std::vector& var_names, + const std::vector& vars_len, void* data_ptr, + int64_t data_size) { + OnHeterRpcDone* closure = new OnHeterRpcDone([](void* done) { + auto* closure = reinterpret_cast(done); + int ret = 0; + closure->set_promise_value(ret); + if (closure->cntl.Failed()) { + LOG(ERROR) << "Send meets brpc error, err msg is %s" + << closure->cntl.ErrorText(); + } + }); + distributed::MultiVarMsg request; + closure->cntl.set_timeout_ms(FLAGS_pserver_timeout_ms); + std::string message_name = "send and save"; + request.set_message_name(message_name); + request.set_group_id(group_id); + for (auto& send_var_name : var_names) { + request.add_send_var_names(send_var_name); + } + for (auto var_len : vars_len) { + request.add_vars_len(var_len); + } + auto& request_buffer = closure->cntl.request_attachment(); + request_buffer.append(reinterpret_cast(data_ptr), + data_size * sizeof(float)); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + if (send_switch_channels_.empty()) { + LOG(ERROR) << "send_switch_channels_ is null, get xpu_channels_[0]"; + if (xpu_channels_.empty()) { + LOG(ERROR) << "xpu_channels_ is null"; + } + send_switch_channels_.push_back(xpu_channels_[0]); + } + brpc::Channel* channel = send_switch_channels_[0].get(); + ::paddle::distributed::PsService_Stub stub(channel); + stub.SendToSwitch(&closure->cntl, &request, &closure->ps_response, closure); + fut.wait(); + return 0; } -} // end namespace distributed +int HeterClient::Recv(const platform::DeviceContext& ctx, + framework::Scope& recv_scope, // NOLINT + const std::string& message_name, + const std::vector& recv_var_names) { + OnHeterRpcDone* closure = new OnHeterRpcDone([](void* done) { + auto* closure = reinterpret_cast(done); + VLOG(4) << "Recv service call done"; + int ret = 0; + closure->set_promise_value(ret); + if (closure->cntl.Failed()) { + VLOG(4) << "HeterClient::RecvFromSwitch meets " + "brpc error, error message is %s" + << closure->cntl.ErrorText(); + } + }); + + closure->cntl.set_timeout_ms(FLAGS_pserver_timeout_ms); + + distributed::MultiVarMsg request; + // 1. set req message_name(string) + request.set_message_name(message_name); + + // 2. 
set req recv_var_names() + for (auto& recv_var_name : recv_var_names) { + request.add_recv_var_names(recv_var_name); + } + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + if (recv_switch_channels_.empty()) { + LOG(ERROR) << "peer_switch_channels_ is null, get xpu_channels_[1]"; + if (xpu_channels_.size() < 2) { + LOG(ERROR) << "xpu_channels_ is null"; + } + recv_switch_channels_.push_back(xpu_channels_[1]); + } + brpc::Channel* channel = recv_switch_channels_[0].get(); + ::paddle::distributed::PsService_Stub stub(channel); + stub.RecvFromSwitch(&closure->cntl, &request, &closure->response, closure); + fut.wait(); + VLOG(4) << "RecvFromSwitch done"; + // save in worker + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::CPUPlace cpu_place; + auto& cpu_dev_ctx = *pool.Get(cpu_place); + auto& res_io_buffer = closure->cntl.response_attachment(); + VLOG(4) << "entering DeserializeFromMultiVarMsgAndIOBuf"; + distributed::DeserializeFromMultiVarMsgAndIOBuf( + closure->response, &res_io_buffer, cpu_dev_ctx, &recv_scope); + VLOG(4) << "Recv done"; + return 0; +} + +int HeterClient::Recv(int group_id, const std::vector& var_names, + void* data_ptr, int64_t data_size) { + OnHeterRpcDone* closure = new OnHeterRpcDone([](void* done) { + auto* closure = reinterpret_cast(done); + int ret = 0; + closure->set_promise_value(ret); + if (closure->cntl.Failed()) { + LOG(ERROR) << "Recv meets brpc error, err msg is %s" + << closure->cntl.ErrorText(); + } + }); + closure->cntl.set_timeout_ms(FLAGS_pserver_timeout_ms); + + distributed::MultiVarMsg request; + std::string message_name = "query and recv"; + request.set_message_name(message_name); + request.set_group_id(group_id); + + for (auto& recv_var_name : var_names) { + request.add_recv_var_names(recv_var_name); + } + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + if (recv_switch_channels_.empty()) { + LOG(ERROR) << "peer_switch_channels_ is null, get xpu_channels_[1]"; + if (xpu_channels_.size() < 2) { + LOG(ERROR) << "xpu_channels_ is null"; + } + recv_switch_channels_.push_back(xpu_channels_[1]); + } + brpc::Channel* channel = recv_switch_channels_[0].get(); + ::paddle::distributed::PsService_Stub stub(channel); + stub.RecvFromSwitch(&closure->cntl, &request, &closure->response, closure); + fut.wait(); + VLOG(4) << "RecvFromSwitch done"; + // save in worker + auto& res_io_buffer = closure->cntl.response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + io_buffer_itr.copy_and_forward(reinterpret_cast(data_ptr), + data_size * sizeof(float)); + VLOG(4) << "Recv done"; + return 0; +} +} // namespace distributed } // end namespace paddle diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h old mode 100644 new mode 100755 index 4f27ef75ea954..006f87ddf5b06 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -32,13 +32,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +#include "paddle/fluid/string/split.h" namespace paddle { namespace framework { class Scope; } // namespace framework } // namespace paddle - +DECLARE_int32(pserver_timeout_ms); namespace paddle { namespace distributed { @@ -51,24 +52,72 @@ class OnHeterRpcDone : public google::protobuf::Closure { public: explicit OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {} virtual ~OnHeterRpcDone() {} - void Run() { - std::unique_ptr self_guard(this); - handler_(this); + void Run() { handler_(this); } + + void add_promise(std::shared_ptr>& promise) { // NOLINT + _promises.push_back(promise); } + void set_promise_value(int value) { + for (auto& promise : _promises) { + promise->set_value(value); + } + } + int CheckResponse() { return 0; } + std::vector>> _promises; HeterRpcCallbackFunc handler_; + + MultiVariableMessage request; MultiVariableMessage response; + + PsResponseMessage ps_response; + brpc::Controller cntl; + // PsRequestMessage *request(size_t i) { return &_requests[i]; } + // PsResponseMessage *response(size_t i) { return &_responses[i]; } + // std::vector _requests; + // std::vector _responses; + // std::vector> _cntls; }; class HeterClient { public: virtual ~HeterClient() {} - HeterClient() { - running_ = true; - main_thread_.reset( - new std::thread(std::bind(&HeterClient::MainThread, this))); + void InitClientChannels(bool need_encrypt, + const std::vector& node_list, + int32_t peer_role) { + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "single"; + options.timeout_ms = FLAGS_pserver_timeout_ms; + std::vector>* client_channels = nullptr; + if (peer_role == PEER_ROLE_IS_SWITCH) { + options.ssl_options.enable = need_encrypt; + client_channels = &peer_switch_channels_; + } else if (peer_role == PEER_ROLE_IS_WORKER) { + client_channels = &peer_worker_channels_; + } else { + LOG(ERROR) << "init switch client failed, peer_role not valid"; + } + (*client_channels).resize(node_list.size()); + for (size_t i = 0; i < node_list.size(); ++i) { + (*client_channels)[i].reset(new brpc::Channel()); + if ((*client_channels)[i]->Init(node_list[i].c_str(), "", &options) != + 0) { + VLOG(0) << "client channel init failed! try again"; + auto ip_port = paddle::string::Split(node_list[i], ':'); + std::string ip = ip_port[0]; + int port = std::stoi(ip_port[1]); + std::string int_ip_port = GetIntTypeEndpoint(ip, port); + if ((*client_channels)[i]->Init(int_ip_port.c_str(), "", &options) != + 0) { + LOG(ERROR) << "client channel init failed! 
peer ip_port = " + << int_ip_port; + } + } + } + VLOG(4) << "InitClientChannels success"; } void CreateClient2XpuConnection(); @@ -80,14 +129,28 @@ class HeterClient { const std::vector& recv_var_name, const std::string& mode = "forward"); + int Send(int group_id, const std::vector& var_names, + const std::vector& vars_len, void* data_ptr, int64_t data_size); + + int Send(const platform::DeviceContext& ctx, const framework::Scope& scope, + const std::string& message_name, + const std::vector& send_var_names); + + int Recv(int group_id, const std::vector& var_names, + void* data_ptr, int64_t data_size); + + int Recv(const platform::DeviceContext& ctx, + framework::Scope& recv_scope, // NOLINT + const std::string& message_name, + const std::vector& recv_var_names); + // HeterClient singleton static std::shared_ptr GetInstance( const std::vector& endpoint, const std::vector& previous_endpoint, const int& trainer_id) { if (NULL == s_instance_) { - is_initialized_ = true; - s_instance_.reset(new paddle::distributed::HeterClient()); + s_instance_.reset(new HeterClient()); s_instance_->SetXpuList(endpoint); s_instance_->SetPreviousXpuList(previous_endpoint); s_instance_->SetTrainerID(trainer_id); @@ -96,13 +159,29 @@ class HeterClient { return s_instance_; } - void Stop(); + // switch client singleton + static HeterClient& GetSwitchInstance( + const std::vector& peer_endpoints, int32_t peer_role) { + static HeterClient switch_s_instance_; + if (peer_endpoints.empty()) { + VLOG(4) << "init switch client failed, null peer_endpoints"; + } + VLOG(4) << "peer role is: " << peer_role + << ", addr is: " << peer_endpoints[0]; + switch_s_instance_.SetPeerSwitchList(peer_endpoints); + switch_s_instance_.InitClientChannels(false, peer_endpoints, peer_role); + return switch_s_instance_; + } - void FinalizeWorker(); + void SetPeerSwitchList(const std::vector& peer_endpoints) { + peer_switch_list_ = peer_endpoints; + } - void MainThread(); + void SetPeerWorkerList(const std::vector& worker_endpoints) { + peer_worker_list_ = worker_endpoints; + } - void RpcProfilerControl(); + void Stop(); std::future SendCmd(uint32_t table_id, int cmd_id, const std::vector& params); @@ -124,20 +203,32 @@ class HeterClient { void SetTrainerID(const int& trainer_id) { trainer_id_ = trainer_id; } + public: + std::vector send_switch_list_; + std::vector recv_switch_list_; + + std::vector peer_switch_list_; + std::vector peer_worker_list_; + std::vector> send_switch_channels_; + std::vector> recv_switch_channels_; + + std::vector> peer_switch_channels_; + std::vector> peer_worker_channels_; + private: + HeterClient() {} + HeterClient& operator=(const HeterClient&); + HeterClient(const HeterClient&); + static std::shared_ptr s_instance_; - static bool is_initialized_; - std::unique_ptr main_thread_{nullptr}; std::vector> xpu_channels_; std::vector> previous_xpu_channels_; - DISABLE_COPY_AND_ASSIGN(HeterClient); + // DISABLE_COPY_AND_ASSIGN(HeterClient); std::vector xpu_list_; std::vector previous_xpu_list_; - bool running_ = false; int trainer_id_; - bool do_server_profiler_ = false; }; } // end namespace distributed diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index 01afed3f12375..e21bf093f1915 100644 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -13,21 +13,28 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/service/heter_server.h" + #include "paddle/fluid/string/split.h" namespace paddle { namespace distributed { +// DEFINE_string(cert_path, "./cert.pem", "cert.pem path"); +// DEFINE_string(key_path, "./key.pem", "key.pem path"); -std::shared_ptr HeterServer::s_instance_ = NULL; +std::shared_ptr HeterServer::s_instance_ = nullptr; void HeterServer::RegisterServiceHandler(std::string message_name, HeterServiceHandler func) { service_.RegisterServiceHandler(message_name, func); } -void HeterServer::StartHeterService() { +void HeterServer::StartHeterService(bool neeed_encrypt) { server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE); brpc::ServerOptions options; + if (neeed_encrypt) { + options.ssl_options.default_cert.certificate = "/cert.pem"; + options.ssl_options.default_cert.private_key = "/key.pem"; + } if (server_.Start(endpoint_.c_str(), &options) != 0) { VLOG(0) << "HeterServer start fail. Try again."; auto ip_port = paddle::string::Split(endpoint_, ':'); @@ -47,16 +54,50 @@ void HeterServer::StartHeterService() { ready_ = 1; } condition_ready_.notify_all(); + VLOG(4) << "stopped: " << stoped_ << ", ready_: " << ready_; std::unique_lock running_lock(mutex_); cv_.wait(running_lock, [&] { - VLOG(1) << "Heter Server is Stop? " << stoped_; + VLOG(4) << "Heter Server is Stop? " << stoped_; return stoped_; }); + VLOG(4) << "start service done"; } -void HeterServer::SetEndPoint(const std::string& endpoint) { - endpoint_ = endpoint; - service_.SetEndpoint(endpoint); +void HeterServer::StartHeterInterService(bool neeed_encrypt) { + server_inter_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE); + brpc::ServerOptions options; + if (neeed_encrypt) { + options.ssl_options.default_cert.certificate = "/cert.pem"; + options.ssl_options.default_cert.private_key = "/key.pem"; + } + if (server_inter_.Start(endpoint_inter_.c_str(), &options) != 0) { + VLOG(4) << "switch inter server start fail. Try again."; + auto ip_port = paddle::string::Split(endpoint_inter_, ':'); + std::string ip = ip_port[0]; + int port = std::stoi(ip_port[1]); + std::string int_ip_port = GetIntTypeEndpoint(ip, port); + if (server_inter_.Start(endpoint_inter_.c_str(), &options) != 0) { + LOG(ERROR) << "switch inter server start failed, ip_port= " + << int_ip_port; + } + } else { + VLOG(4) << "switch inter server server start success! listen on " + << endpoint_inter_; + } + + { + std::lock_guard lock(this->mutex_ready_); + stoped_ = false; + ready_ = 1; + } + condition_ready_.notify_all(); + VLOG(4) << "stopped: " << stoped_ << ", ready_: " << ready_; + std::unique_lock running_lock(mutex_); + cv_.wait(running_lock, [&] { + VLOG(4) << "Heter Server is Stop? 
" << stoped_; + return stoped_; + }); + VLOG(4) << "start service done"; } void HeterServer::SetFanin(const int& fan_in) { service_.SetFanin(fan_in); } @@ -64,35 +105,180 @@ void HeterServer::SetFanin(const int& fan_in) { service_.SetFanin(fan_in); } void HeterServer::WaitServerReady() { std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + while (!this->ready_) { + sleep(1); + } } -int32_t HeterService::stop_profiler(const PsRequestMessage& request, - PsResponseMessage& response, - brpc::Controller* cntl) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("heter_worker_%s_profile", endpoint_)); +int SendAndRecvVariableHandler::SaveInSwitchWithShard( + const MultiVarMsg* request, PsResponseMessage* response, + brpc::Controller* cntl) { + VLOG(4) << "entering SaveInSwitchWithShard"; + int32_t group_id = request->group_id(); + auto& local_shard = _local_shards[group_id]; + auto& request_io_buffer = cntl->request_attachment(); + butil::IOBufBytesIterator io_buffer_itr(request_io_buffer); + for (int idx = 0; idx < request->send_var_names_size(); idx++) { + const auto& var_name = request->send_var_names(idx); + const auto& var_len = request->vars_len(idx); + auto itr = local_shard.find(var_name); + if (itr != local_shard.end()) { + LOG(INFO) << "var: " << var_name << "has not been consumed!" + << "check again"; + WaitForVarsConsumed(group_id, var_name); + } + auto& value = local_shard[var_name]; + value.resize(var_len); + io_buffer_itr.copy_and_forward(reinterpret_cast(value.data()), + var_len * sizeof(float)); + VLOG(4) << "saved data in shards: "; + for (uint32_t i = 0; i < local_shard[var_name].size(); i++) { + VLOG(4) << *(local_shard[var_name].data() + i); + } + } + VLOG(4) << "SaveInSwitchWithShard success"; return 0; } -int32_t HeterService::start_profiler(const PsRequestMessage& request, - PsResponseMessage& response, - brpc::Controller* cntl) { - platform::EnableProfiler(platform::ProfilerState::kAll); +int SendAndRecvVariableHandler::QueryInSwitchWithShard( + const MultiVarMsg* request, MultiVarMsg* response, brpc::Controller* cntl) { + VLOG(4) << "entering QueryInSwitchWithShard"; + int32_t group_id = request->group_id(); + VLOG(4) << "group id: " << group_id; + auto& local_shard = _local_shards[group_id]; + auto& response_io_buffer = cntl->response_attachment(); + auto req_var_nums = request->recv_var_names_size(); + std::vector req_var_names(req_var_nums); + for (int var_idx = 0; var_idx < req_var_nums; ++var_idx) { + req_var_names[var_idx] = request->recv_var_names(var_idx); + } + auto msg_name = request->message_name(); + response->set_message_name(msg_name); + + for (auto& req_var_name : req_var_names) { + VLOG(4) << "req var name: " << req_var_name; + response->add_send_var_names(req_var_name); + auto itr = local_shard.find(req_var_name); + if (itr == local_shard.end()) { + LOG(INFO) << "var: " << req_var_name << " not found in shards"; + WaitForVarsProduced(group_id, req_var_name); + } + LOG(INFO) << "var: " << req_var_name << " found in shards"; + itr = local_shard.find(req_var_name); + auto& value = itr.value(); + response_io_buffer.append(value.data(), value.size() * sizeof(float)); + value.resize(0); // 标记位 + } + VLOG(4) << "heter server QueryInSwitchWithShard done"; return 0; } -int32_t HeterService::stop_heter_worker(const PsRequestMessage& request, - PsResponseMessage& response, - brpc::Controller* cntl) { - auto client_id = request.client_id(); - 
stop_cpu_worker_set_.insert(client_id); - if (stop_cpu_worker_set_.size() == fan_in_) { - is_exit_ = true; - VLOG(3) << "Stop heter Service done."; +int SendAndRecvVariableHandler::SaveInSwitchWithScope( + const MultiVarMsg* request, PsResponseMessage* response, + brpc::Controller* cntl) { + VLOG(4) << "entering SaveInSwitchWithScope"; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::CPUPlace cpu_place; + auto& cpu_dev_ctx = *pool.Get(cpu_place); + auto message_name = request->message_name(); + VLOG(4) << "message_name in heter server: " << message_name; + std::unique_lock lk(scope_mutex_); + auto local_scope = local_scope_ptr.get(); + if (!local_scope) { + LOG(ERROR) << "local_scope_ptr is null in SaveInSwitchWithScope"; + } + for (int idx = 0; idx < request->send_var_names_size(); idx++) { + const auto& msg = request->var_messages(idx); + std::string var_name = msg.varname(); + auto* var_exist_ptr = local_scope->FindVar(var_name); + if (!var_exist_ptr) { + VLOG(4) << "not find var: " << var_name << " in local_scope"; + } + vars_table[var_name] += 1; + VLOG(4) << "saved var_name: " << var_name + << ", cnt = " << vars_table[var_name]; + } + auto& request_io_buffer = cntl->request_attachment(); + distributed::DeserializeFromMultiVarMsgAndIOBuf(*request, &request_io_buffer, + cpu_dev_ctx, local_scope); + lk.unlock(); + while (true) { + int ret = 0; + for (int idx = 0; idx < request->send_var_names_size(); idx++) { + ret |= vars_table[request->var_messages(idx).varname()]; + } + if (!ret) { + VLOG(4) << "all saved vars consumed"; + break; + } + VLOG(4) << "waiting consume result......"; + sleep(1); } + VLOG(4) << "SaveInSwitchWithScope success"; return 0; } +int SendAndRecvVariableHandler::QueryInSwitchWithScope( + const MultiVarMsg* request, MultiVarMsg* response, brpc::Controller* cntl) { + VLOG(4) << "entering QueryInSwitchWithScope"; + auto local_scope = local_scope_ptr.get(); + if (!local_scope) { + LOG(INFO) << "local_scope is null"; + } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::CPUPlace cpu_place; + auto& cpu_dev_ctx = *pool.Get(cpu_place); + + // get req message_name & req_var_names + auto msg_name = request->message_name(); + auto req_var_nums = request->recv_var_names_size(); + std::vector req_var_names(req_var_nums); + for (int var_idx = 0; var_idx < req_var_nums; ++var_idx) { + req_var_names[var_idx] = request->recv_var_names(var_idx); + } + auto& response_io_buffer = cntl->response_attachment(); + + // 1. fill message_name(string) + response->set_message_name(msg_name); + + // 2. fill var_names(string) + for (auto& req_var_name : req_var_names) { + response->add_send_var_names(req_var_name); + } + + // 3. 
fill var_messages(VarMessage) + for (auto& req_var_name : req_var_names) { + LOG(INFO) << "query var_name: " << req_var_name; + auto* send_var_msg = response->add_var_messages(); + send_var_msg->set_varname(req_var_name); + + framework::Variable* var_ptr; + while (true) { + var_ptr = local_scope->FindVar(req_var_name); + if (!var_ptr) { + LOG(INFO) << "local_scope not find var: " << req_var_name; + } else { + break; + } + sleep(1); + } + butil::IOBuf temp_iobuf; + if (var_ptr->IsType()) { + SerializeLodTensor(var_ptr, cpu_dev_ctx, send_var_msg, &temp_iobuf); + } else if (var_ptr->IsType()) { + SerializeSelectedRows(var_ptr, cpu_dev_ctx, send_var_msg, &temp_iobuf); + } + response_io_buffer.append(temp_iobuf); + } + for (auto& req_var_name : req_var_names) { + std::unique_lock lk(scope_mutex_); + vars_table[req_var_name] -= 1; + VLOG(4) << "remained var: " << req_var_name + << ", cnt = " << vars_table[req_var_name]; + lk.unlock(); + } + VLOG(4) << "heter server QueryInSwitchWithScope done"; + return 0; +} } // end namespace distributed -} // end namespace paddle +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h index a14fb5f6cc04a..624e76112c7b0 100644 --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -22,11 +22,14 @@ limitations under the License. */ #include #include #include + #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/distributed/ps/service/brpc_utils.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" @@ -51,108 +54,37 @@ class Scope; } // namespace paddle DECLARE_double(eager_delete_tensor_gb); +DECLARE_int32(pserver_timeout_ms); +DECLARE_int32(heter_world_size); namespace paddle { namespace distributed { -using MultiVarMsg = ::paddle::distributed::MultiVariableMessage; -using VarMsg = ::paddle::distributed::VariableMessage; - -class HeterService; +using MultiVarMsg = MultiVariableMessage; +using VarMsg = VariableMessage; -typedef int32_t (HeterService::*serviceHandlerFunc)( +using serviceHandler = std::function; +using HeterServiceHandler = + std::function; -typedef std::function HeterRpcCallbackFunc; -typedef std::function - HeterServiceHandler; +using HeterRpcCallbackFunc = std::function; -class HeterService : public ::paddle::distributed::PsService { +class ServiceHandlerBase { public: - HeterService() { - _service_handler_map[PS_STOP_SERVER] = &HeterService::stop_heter_worker; - _service_handler_map[PS_START_PROFILER] = &HeterService::start_profiler; - _service_handler_map[PS_STOP_PROFILER] = &HeterService::stop_profiler; - } + ServiceHandlerBase() : dev_ctx_(nullptr), scope_(nullptr) {} - virtual ~HeterService() {} + virtual ~ServiceHandlerBase() {} - virtual void service(::google::protobuf::RpcController* controller, - const PsRequestMessage* request, - PsResponseMessage* response, - ::google::protobuf::Closure* done) { - brpc::ClosureGuard done_guard(done); - std::string log_label("ReceiveCmd-"); - - response->set_err_code(0); - response->set_err_msg(""); - brpc::Controller* cntl = static_cast(controller); - auto itr = _service_handler_map.find(request->cmd_id()); - if (itr == 
_service_handler_map.end()) { - std::string err_msg( - "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:"); - err_msg.append(std::to_string(request->cmd_id())); - return; - } - serviceHandlerFunc handler_func = itr->second; - int service_ret = (this->*handler_func)(*request, *response, cntl); - if (service_ret != 0) { - response->set_err_code(service_ret); - response->set_err_msg("server internal error"); - } - } - - void SendAndRecvVariable(::google::protobuf::RpcController* controller, - const MultiVarMsg* request, MultiVarMsg* response, - ::google::protobuf::Closure* done) { - brpc::ClosureGuard done_guard(done); - std::string message_name = request->message_name(); - auto itr = handler_map_.find(message_name); - brpc::Controller* cntl = static_cast(controller); - PADDLE_ENFORCE_NE( - itr, handler_map_.end(), - platform::errors::InvalidArgument( - "HeterService::SendAndRecvVariable Get illegal message_name: %s " - "which is not in HeterService::handler_map_", - message_name)); - itr->second(request, response, cntl); - } - - void RegisterServiceHandler(std::string message_name, - HeterServiceHandler func) { - handler_map_[message_name] = func; - } - - int32_t ForceExit() { - VLOG(3) << "heter service force exit"; - is_exit_ = true; - return 0; - } - - void SetEndpoint(const std::string& end_point) { endpoint_ = end_point; } - void SetFanin(const int& fan_in) { fan_in_ = fan_in; } - bool IsExit() { return is_exit_; } - - private: - int32_t stop_profiler(const PsRequestMessage& request, - PsResponseMessage& response, // NOLINT - brpc::Controller* cntl); - - int32_t start_profiler(const PsRequestMessage& request, - PsResponseMessage& response, // NOLINT - brpc::Controller* cntl); + void SetScope(const framework::Scope* scope) { scope_ = scope; } + void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - int32_t stop_heter_worker(const PsRequestMessage& request, - PsResponseMessage& response, // NOLINT - brpc::Controller* cntl); + virtual int Handle(const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) = 0; - private: - std::string endpoint_; - std::unordered_map handler_map_; - std::unordered_map _service_handler_map; - std::unordered_set stop_cpu_worker_set_; - int fan_in_; - bool is_exit_ = false; + protected: + const platform::DeviceContext* dev_ctx_; + const framework::Scope* scope_; }; using SharedMiniScope = @@ -163,31 +95,15 @@ using SharedTaskQueue = std::shared_ptr< std::unordered_map>>>>; -class HeterRequestHandler { - public: - HeterRequestHandler() : dev_ctx_(nullptr), scope_(nullptr) {} - - virtual ~HeterRequestHandler() {} - - void SetScope(const framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - - virtual int Handle(const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) = 0; - - protected: - const platform::DeviceContext* dev_ctx_; - const framework::Scope* scope_; -}; - -class RequestSendAndRecvHandler final : public HeterRequestHandler { +class SendAndRecvVariableHandler final : public ServiceHandlerBase { public: - RequestSendAndRecvHandler() { + SendAndRecvVariableHandler() { this->num_microbatch_ = 0; this->num_minibatch_ = 0; + _local_shards.reset(new shard_type[FLAGS_heter_world_size]); } - virtual ~RequestSendAndRecvHandler() {} + virtual ~SendAndRecvVariableHandler() {} void SetMiniScopes(SharedMiniScope mini_scopes) { mini_scopes_ = mini_scopes; @@ -209,11 +125,47 @@ class RequestSendAndRecvHandler final : 
public HeterRequestHandler { return (*task_queue_).size(); } + int SaveInSwitchWithScope(const MultiVarMsg* request, + PsResponseMessage* response, + brpc::Controller* cntl); + + void WaitForVarsConsumed(int32_t group_id, const std::string& var_name) { + auto& local_shard = _local_shards[group_id]; + while (local_shard.find(var_name) != local_shard.end()) { + if (local_shard[var_name].size() == 0) { + break; + } + VLOG(4) << "waiting consume result......"; + sleep(1); + } + return; + } + + void WaitForVarsProduced(int32_t group_id, const std::string& var_name) { + auto& local_shard = _local_shards[group_id]; + while (local_shard.find(var_name) == local_shard.end()) { + VLOG(4) << "waiting produce result......"; + sleep(1); + } + return; + } + + int SaveInSwitchWithShard(const MultiVarMsg* request, + PsResponseMessage* response, + brpc::Controller* cntl); + + int QueryInSwitchWithShard(const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl); + + int QueryInSwitchWithScope(const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl); + void SetTaskQueue(SharedTaskQueue task_queue) { task_queue_ = task_queue; } int Handle(const MultiVarMsg* request, MultiVarMsg* response, brpc::Controller* cntl) override { - platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle", + LOG(INFO) << "entered Handle"; + platform::RecordEvent record_event("SendAndRecvVariableHandler->Handle", platform::TracerEventType::Communication, 1); FLAGS_eager_delete_tensor_gb = -1; @@ -241,7 +193,6 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { auto* tensor = var->GetMutable(); auto data = reinterpret_cast(tensor->data()); auto micro_id = static_cast(data[0]); - int minibatch_index = micro_id / 10; int microbatch_index = micro_id % 10; @@ -249,10 +200,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { std::unique_lock lk(scope_mutex_); if ((*mini_scopes_).find(minibatch_index) != (*mini_scopes_).end()) { lk.unlock(); - // PADDLE_ENFORCE_EQ( - // (*mini_scopes_).find(minibatch_index) != (*mini_scopes_).end(), 1, - // platform::errors::InvalidArgument( - // "minibatch index should in current trainer")); + PADDLE_ENFORCE_EQ( (*micro_scopes_).find(minibatch_index) != (*micro_scopes_).end(), 1, platform::errors::InvalidArgument( @@ -282,6 +230,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { // blocking queue handles multi thread (*task_queue_)[minibatch_index]->Push( std::make_pair(message_name, microbatch_index)); + auto response_var_nums = request->recv_var_names_size(); std::vector response_var_names(response_var_nums), empty_var_names{}; @@ -295,6 +244,12 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { return 0; } + public: + using shard_type = SparseTableShard; + std::shared_ptr local_scope_ptr; // for switch + std::unordered_map vars_table; + std::unique_ptr _local_shards; + private: // share with HeterPipelineTrainer SharedMiniScope mini_scopes_{nullptr}; @@ -310,15 +265,254 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { SharedTaskQueue task_queue_; }; +class HeterService : public PsService { + public: + HeterService() { + _service_handler_map[PS_STOP_SERVER] = + std::bind(&HeterService::stop_heter_worker, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); + _service_handler_map[PS_START_PROFILER] = + std::bind(&HeterService::start_profiler, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); + 
_service_handler_map[PS_STOP_PROFILER] = + std::bind(&HeterService::stop_profiler, this, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); + + service_handler_.local_scope_ptr = + std::make_shared(); + } + + virtual ~HeterService() {} + + virtual void service(::google::protobuf::RpcController* controller, + const PsRequestMessage* request, + PsResponseMessage* response, + ::google::protobuf::Closure* done) { + brpc::ClosureGuard done_guard(done); + + response->set_err_code(0); + response->set_err_msg(""); + brpc::Controller* cntl = static_cast(controller); + auto itr = _service_handler_map.find(request->cmd_id()); + if (itr == _service_handler_map.end()) { + std::string err_msg( + "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:"); + err_msg.append(std::to_string(request->cmd_id())); + return; + } + serviceHandler handler = itr->second; + int service_ret = handler(*request, *response, cntl); + VLOG(4) << "handler in service ret: " << service_ret; + if (service_ret != 0) { + response->set_err_code(service_ret); + response->set_err_msg("server internal error"); + } + } + + virtual void SendAndRecvVariable( + ::google::protobuf::RpcController* controller, const MultiVarMsg* request, + MultiVarMsg* response, ::google::protobuf::Closure* done) { + // This object helps you to call done->Run() in RAII style. If you need + // to process the request asynchronously, pass done_guard.release(). + brpc::ClosureGuard done_guard(done); + std::string message_name = request->message_name(); + VLOG(0) << "SendAndRecvVariable message_name: " << message_name; + auto itr = handler_map_.find(message_name); + brpc::Controller* cntl = static_cast(controller); + LOG(INFO) << "SendAndRecvVariable(client addr) =" << cntl->remote_side(); + PADDLE_ENFORCE_NE( + itr, handler_map_.end(), + platform::errors::InvalidArgument( + "HeterService::SendAndRecvVariable Get illegal message_name: %s " + "which is not in HeterService::handler_map_", + message_name)); + itr->second(request, response, cntl); + // We don't want to call done->Run() here, release the guard. 
+ // done_guard.release(); + } + + virtual void RecvFromSwitch(::google::protobuf::RpcController* controller, + const MultiVarMsg* request, MultiVarMsg* response, + ::google::protobuf::Closure* done) { + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(controller); + // int ret = service_handler_.QueryInSwitchWithScope(request, response, + // cntl); + int ret = service_handler_.QueryInSwitchWithShard(request, response, cntl); + // std::string message_name = request->message_name(); + // auto itr = handler_map_.find(message_name); + // int ret = itr->second(request, response, cntl); + if (ret != 0) { + LOG(ERROR) << "QueryInSwitchWithScope failed!"; + } + // response->set_message_name(message_name); + } + + virtual void SendToSwitch(::google::protobuf::RpcController* controller, + const MultiVarMsg* request, + PsResponseMessage* response, + ::google::protobuf::Closure* done) { + VLOG(4) << "entering SendToSwitch"; + brpc::ClosureGuard done_guard(done); + auto& switch_client_ptr_ = + HeterClient::GetSwitchInstance(peer_endpoints_, PEER_ROLE_IS_SWITCH); + if (switch_client_ptr_.peer_switch_channels_.empty()) { + LOG(ERROR) << "switch_client_ptr_.peer_switch_channels_ null"; + } + brpc::Channel* channel = switch_client_ptr_.peer_switch_channels_[0].get(); + brpc::Controller* cntl = static_cast(controller); + // proxy: construct a new OnHeterRpcDone object (or reset it inside the OnHeterRpcDone class) + OnHeterRpcDone* closure2 = new OnHeterRpcDone([](void* done) { + auto* closure = reinterpret_cast(done); + int ret = closure->CheckResponse(); + closure->set_promise_value(ret); + if (closure->cntl.Failed()) { + PADDLE_ENFORCE_NE( + closure->cntl.Failed(), true, + platform::errors::Unimplemented( + "HeterClient::SendS2S meets brpc error, error message is %s", + closure->cntl.ErrorText())); + } + }); + auto& std_cntl = closure2->cntl; + std_cntl.set_timeout_ms(FLAGS_pserver_timeout_ms); + std_cntl.request_attachment().append(cntl->request_attachment().movable()); + + auto promise = std::make_shared>(); + closure2->add_promise(promise); + std::future fut = promise->get_future(); + // brpc::Controller std_cntl; + // std_cntl.request_attachment().append(cntl->request_attachment().movable()); + PsService_Stub stub(channel); + stub.SendS2S(&std_cntl, request, response, closure2); + cntl->response_attachment().append( + std_cntl.response_attachment().movable()); + fut.wait(); + VLOG(4) << "SendToSwitch done"; + } + + void SendS2S(::google::protobuf::RpcController* controller, + const MultiVarMsg* request, PsResponseMessage* response, + ::google::protobuf::Closure* done) { + VLOG(4) << "entering SendS2S"; + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(controller); + // int ret = service_handler_.SaveInSwitchWithScope(request, response, + // cntl); + int ret = service_handler_.SaveInSwitchWithShard(request, response, cntl); + // std::string message_name = request->message_name(); + // auto itr = handler_map_.find(message_name); + // if (itr == handler_map_.end()) { + // LOG(ERROR) << "can not find func handler"; + //} + // int ret = itr->second(request, response, cntl); + if (ret != 0) { + LOG(ERROR) << "SaveInSwitchWithScope failed"; + } + std::string err_msg = "ok"; + response->set_err_msg(err_msg.c_str()); + response->set_err_code(ret); + VLOG(4) << "heter server SendS2S done"; + } + + void SendToWorker(::google::protobuf::RpcController* controller, + const MultiVarMsg* request, PsResponseMessage* response, + ::google::protobuf::Closure* done) { + brpc::ClosureGuard 
done_guard(done); + brpc::Controller* cntl = static_cast(controller); + VLOG(4) << "SendToWorker(client addr) =" << cntl->remote_side(); + auto& switch_client_ptr_ = + HeterClient::GetSwitchInstance(peer_endpoints_, PEER_ROLE_IS_WORKER); + VLOG(4) << "in switch client, peer worker 0: " + << switch_client_ptr_.peer_worker_list_[0]; + brpc::Channel* channel = switch_client_ptr_.peer_worker_channels_[0].get(); + + auto* closure = reinterpret_cast(done); + PsService_Stub stub(channel); + stub.SendAndRecvVariable(controller, request, &closure->response, done); + // fill response content + std::string err_msg("pass to worker"); + response->set_err_msg(err_msg.c_str()); + response->set_err_code(0); + } + + void RegisterServiceHandler(std::string message_name, + HeterServiceHandler func) { + handler_map_[message_name] = func; + } + + void SetEndpoint(const std::string& end_point) { endpoint_ = end_point; } + + void SetInterEndpoint(const std::string& end_point) { + endpoint_inter_ = end_point; + } + + void SetPeerEndPoints(const std::vector& peer_endpoints) { + peer_endpoints_ = peer_endpoints; + } + + void SetFanin(const int& fan_in) { fan_in_ = fan_in; } + + void ForceExit() { + VLOG(3) << "heter service force exit"; + is_exit_ = true; + return; + } + + bool IsExit() { return is_exit_; } + + private: + int32_t stop_profiler(const PsRequestMessage& request, + PsResponseMessage& response, // NOLINT + brpc::Controller* cntl) { + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("heter_worker_%s_profile", endpoint_)); + return 0; + } + + int32_t start_profiler(const PsRequestMessage& request, + PsResponseMessage& response, // NOLINT + brpc::Controller* cntl) { + platform::EnableProfiler(platform::ProfilerState::kAll); + return 0; + } + + int32_t stop_heter_worker(const PsRequestMessage& request, + PsResponseMessage& response, // NOLINT + brpc::Controller* cntl) { + auto client_id = request.client_id(); + stop_cpu_worker_set_.insert(client_id); + if (stop_cpu_worker_set_.size() == fan_in_) { + is_exit_ = true; + } + return 0; + } + + private: + SendAndRecvVariableHandler service_handler_; + std::string endpoint_; + std::string endpoint_inter_; + // for switch + std::vector peer_endpoints_; + + std::unordered_map _service_handler_map; + std::unordered_map handler_map_; + std::unordered_set stop_cpu_worker_set_; + uint32_t fan_in_; + bool is_exit_ = false; +}; + class HeterServer { public: + HeterServer() : ready_(0) {} virtual ~HeterServer() {} - void Stop() { std::unique_lock lock(mutex_); if (stoped_ == true) return; - if (!IsExit()) service_.ForceExit(); - VLOG(3) << "HeterServer Stop()"; + if (!IsExit()) { + service_.ForceExit(); + } stoped_ = true; cv_.notify_all(); server_.Stop(1000); @@ -327,26 +521,42 @@ class HeterServer { bool IsStop() { std::unique_lock lock(mutex_); - if (stoped_ == true) - return true; - else - return false; + return stoped_; } bool IsExit() { return service_.IsExit(); } - HeterServer() : service_(), ready_(0) {} - void RegisterServiceHandler(std::string message_name, HeterServiceHandler func); - void StartHeterService(); + void StartHeterService(bool need_encrypt = false); + + void StartHeterInterService(bool need_encrypt = false); + + void SetEndPoint(const std::string& endpoint) { + this->endpoint_ = endpoint; + service_.SetEndpoint(endpoint); + } + + void SetLocalScope() { + request_handler_->local_scope_ptr = + std::make_shared(); + } + + void SetInterEndpoint(const std::string& endpoint) { + this->endpoint_inter_ = endpoint; + 
service_.SetInterEndpoint(endpoint); + } + + void SetPeerEndPoints(const std::vector& peer_endpoints) { + this->peer_endpoints_ = peer_endpoints; + service_.SetPeerEndPoints(peer_endpoints); + } - void SetEndPoint(const std::string& endpoint); void SetFanin(const int& fan_in); - void SetRequestHandler( - std::shared_ptr request_handler) { + void SetServiceHandler( + std::shared_ptr request_handler) { request_handler_ = request_handler; } @@ -381,11 +591,15 @@ class HeterServer { std::condition_variable condition_ready_; bool stoped_ = true; std::string endpoint_; + std::string endpoint_inter_; + // for switch + std::vector peer_endpoints_; protected: brpc::Server server_; + brpc::Server server_inter_; HeterService service_; - std::shared_ptr request_handler_; + std::shared_ptr request_handler_; DISABLE_COPY_AND_ASSIGN(HeterServer); std::mutex mutex_ready_; diff --git a/paddle/fluid/distributed/ps/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc index fd956b758de1a..27f2d88fdd9fa 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_client.cc @@ -46,8 +46,8 @@ int32_t PSClient::configure( auto *accessor = CREATE_PSCORE_CLASS( ValueAccessor, work_param.downpour_table_param(i).accessor().accessor_class()); - accessor->configure(work_param.downpour_table_param(i).accessor()); - accessor->initialize(); + accessor->Configure(work_param.downpour_table_param(i).accessor()); + accessor->Initialize(); _table_accessors[work_param.downpour_table_param(i).table_id()].reset( accessor); } diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc old mode 100755 new mode 100644 index fe5cbe682ea67..dbf47f0df4116 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -174,7 +174,8 @@ ::std::future PsLocalClient::pull_dense(Region* regions, auto* accessor = table_accessor(table_id); auto* table_ptr = table(table_id); - uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + uint32_t num_per_shard = + dense_dim_per_shard(accessor->GetTableInfo(FEA_DIM), 1); std::vector region_buffer; region_buffer.resize(num_per_shard); table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); @@ -219,7 +220,8 @@ ::std::future PsLocalClient::push_dense_param(const Region* regions, auto* table_ptr = table(table_id); std::vector region_buffer; - region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + region_buffer.resize(dense_dim_per_shard(accessor->GetTableInfo(FEA_DIM), 1), + 0); for (size_t i = 0, offset = 0; i < region_num; ++i) { uint32_t data_num = regions[i].size / sizeof(float); memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); @@ -252,7 +254,7 @@ ::std::future PsLocalClient::push_dense(const Region* regions, auto* table_ptr = table(table_id); std::vector region_buffer; - region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + region_buffer.resize(dense_dim_per_shard(accessor->GetTableInfo(FEA_DIM), 1)); size_t data_size = region_buffer.size(); for (size_t i = 0, offset = 0; i < region_num; ++i) { uint32_t data_num = regions[i].size / sizeof(float); diff --git a/paddle/fluid/distributed/ps/service/sendrecv.proto b/paddle/fluid/distributed/ps/service/sendrecv.proto old mode 100644 new mode 100755 index 6dfaff1ffa1df..580f411c28c07 --- a/paddle/fluid/distributed/ps/service/sendrecv.proto +++ 
b/paddle/fluid/distributed/ps/service/sendrecv.proto @@ -59,6 +59,12 @@ enum PsCmdID { PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE = 39; PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG = 40; + PEER_ROLE_IS_WORKER = 41; + PEER_ROLE_IS_SWITCH = 42; + PS_SAVE_WITH_SCOPE = 43; + PS_SAVE_WITH_SHARD = 44; + PS_QUERY_WITH_SCOPE = 45; + PS_QUERY_WITH_SHARD = 46; } message PsRequestMessage { @@ -117,9 +123,16 @@ message MultiVariableMessage { repeated string send_var_names = 2; repeated string recv_var_names = 3; repeated VariableMessage var_messages = 4; + optional bytes data = 5; + repeated int32 vars_len = 6; + optional int32 group_id = 7; }; service PsService { rpc service(PsRequestMessage) returns (PsResponseMessage); rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage); + rpc SendToWorker(MultiVariableMessage) returns (PsResponseMessage); + rpc SendToSwitch(MultiVariableMessage) returns (PsResponseMessage); + rpc SendS2S(MultiVariableMessage) returns (PsResponseMessage); + rpc RecvFromSwitch(MultiVariableMessage) returns (MultiVariableMessage); }; diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 207cc94b4cb15..efc1e604dc9d0 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -72,7 +72,7 @@ class ValueAccessor { ValueAccessor() {} virtual ~ValueAccessor() {} - virtual int configure(const TableAccessorParameter& parameter) { + virtual int Configure(const TableAccessorParameter& parameter) { _config = parameter; // data_convert结构体初始化 if (_config.table_accessor_save_param_size() != 0) { @@ -88,38 +88,15 @@ class ValueAccessor { } return 0; } - virtual int initialize() = 0; + virtual int Initialize() = 0; virtual void SetTableInfo(AccessorInfo& info) = 0; virtual size_t GetTableInfo(InfoKey key) = 0; - // value维度 - virtual size_t dim() = 0; - // value各个维度的size - virtual size_t dim_size(size_t dim) = 0; - // value各维度相加总size - virtual size_t size() = 0; - - // value中mf动态长度部分总size大小, sparse下生效 - virtual size_t mf_size() { return 0; } - virtual bool need_extend_mf(float* value) { return false; } - virtual bool has_mf(size_t size) { return false; } - // pull value维度 - virtual size_t select_dim() = 0; - // pull value各个维度的size - virtual size_t select_dim_size(size_t dim) = 0; - // pull value各维度相加总size - virtual size_t select_size() = 0; - // push value维度 - virtual size_t update_dim() = 0; - // push value各个维度的size - virtual size_t update_dim_size(size_t dim) = 0; - // push value各维度相加总size - virtual size_t update_size() = 0; - // fea total for dense - virtual size_t fea_dim() { return _config.fea_dim(); } + virtual bool NeedExtendMF(float* value) { return false; } + virtual bool HasMF(size_t size) { return false; } // converter for save - virtual std::string get_converter(int param) { + virtual std::string GetConverter(int param) { auto itr = _data_coverter_map.find(param); if (itr == _data_coverter_map.end()) { return ""; @@ -128,7 +105,7 @@ class ValueAccessor { } } // deconverter for load - virtual std::string get_deconverter(int param) { + virtual std::string GetDeconverter(int param) { auto itr = _data_coverter_map.find(param); if (itr == _data_coverter_map.end()) { return ""; @@ -137,47 +114,47 @@ class ValueAccessor { } } // 判断该value是否进行shrink - virtual bool shrink(float* value) = 0; + virtual bool Shrink(float* value) = 0; // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model - virtual bool save(float* value, int 
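The new vars_len and data fields in MultiVariableMessage look like a flattened multi-variable payload: one byte blob plus per-variable lengths. That reading is an assumption, not stated in the patch; below is a standalone sketch of such a packing scheme, with made-up helper names.

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <string>
    #include <vector>

    // Pack several float buffers into one byte blob plus a length list.
    void PackVars(const std::vector<std::vector<float>>& vars, std::string* data,
                  std::vector<int32_t>* vars_len) {
      for (const auto& v : vars) {
        vars_len->push_back(static_cast<int32_t>(v.size()));
        data->append(reinterpret_cast<const char*>(v.data()),
                     v.size() * sizeof(float));
      }
    }

    // Recover the individual buffers from the blob using the length list.
    std::vector<std::vector<float>> UnpackVars(const std::string& data,
                                               const std::vector<int32_t>& vars_len) {
      std::vector<std::vector<float>> out;
      size_t offset = 0;
      for (int32_t len : vars_len) {
        std::vector<float> v(len);
        std::memcpy(v.data(), data.data() + offset, len * sizeof(float));
        offset += len * sizeof(float);
        out.push_back(std::move(v));
      }
      return out;
    }

    int main() {
      std::string data;
      std::vector<int32_t> vars_len;
      PackVars({{1.f, 2.f}, {3.f, 4.f, 5.f}}, &data, &vars_len);
      auto vars = UnpackVars(data, vars_len);
      std::cout << vars[1][2] << std::endl;  // prints 5
      return 0;
    }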
param) = 0; + virtual bool Save(float* value, int param) = 0; // update delta_score and unseen_days after save - virtual void update_stat_after_save(float* value, int param) {} + virtual void UpdateStatAfterSave(float* value, int param) {} // keys不存在时,为values生成随机值 - virtual int32_t create(float** value, size_t num) = 0; - virtual bool create_value(int type, const float* value) { return true; } + virtual int32_t Create(float** value, size_t num) = 0; + virtual bool CreateValue(int type, const float* value) { return true; } // 从values中选取到select_values中 - virtual int32_t select(float** select_values, const float** values, + virtual int32_t Select(float** select_values, const float** values, size_t num) = 0; // 将update_values聚合到一起 - virtual int32_t merge(float** update_values, + virtual int32_t Merge(float** update_values, const float** other_update_values, size_t num) = 0; // 将update_values聚合到一起,通过it.next判定是否进入下一个key - // virtual int32_t merge(float** update_values, iterator it); + // virtual int32_t Merge(float** update_values, iterator it); // 将update_values更新应用到values中 - virtual int32_t update(float** values, const float** update_values, + virtual int32_t Update(float** values, const float** update_values, size_t num) = 0; // used to save model, will filter feature - virtual std::string parse_to_string(const float* value, int param) = 0; + virtual std::string ParseToString(const float* value, int param) = 0; // parse value from string, used to load model - virtual int32_t parse_from_string(const std::string& data, float* value) = 0; + virtual int32_t ParseFromString(const std::string& data, float* value) = 0; - virtual FsDataConverter converter(int param) { + virtual FsDataConverter Converter(int param) { FsDataConverter data_convert; - data_convert.converter = this->get_converter(param); - data_convert.deconverter = this->get_deconverter(param); + data_convert.converter = this->GetConverter(param); + data_convert.deconverter = this->GetDeconverter(param); return data_convert; } - virtual int set_weight(float** values, const float** update_values, - size_t num) { + virtual int SetWeight(float** values, const float** update_values, + size_t num) { return 0; } - virtual float get_field(float* value, const std::string& name) { return 0.0; } + virtual float GetField(float* value, const std::string& name) { return 0.0; } #define DEFINE_GET_INDEX(class, field) \ virtual int get_##field##_index() override { return class ::field##_index(); } diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc index a462fc50aeb72..caec575e33eef 100644 --- a/paddle/fluid/distributed/ps/table/common_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc @@ -232,9 +232,9 @@ int32_t CommonDenseTable::load(const std::string& path, int load_param = atoi(param.c_str()); FsChannelConfig channel_config; - channel_config.converter = _value_accesor->converter(load_param).converter; + channel_config.converter = _value_accesor->Converter(load_param).converter; channel_config.deconverter = - _value_accesor->converter(load_param).deconverter; + _value_accesor->Converter(load_param).deconverter; bool is_read_failed = false; int err_no = 0; int retry_num = 0; @@ -329,9 +329,9 @@ int32_t CommonDenseTable::save(const std::string& path, "%s/part-%03d", table_dir(path).c_str(), _shard_idx); } _afs_client.remove(channel_config.path); - channel_config.converter = _value_accesor->converter(save_param).converter; + channel_config.converter = 
_value_accesor->Converter(save_param).converter; channel_config.deconverter = - _value_accesor->converter(save_param).deconverter; + _value_accesor->Converter(save_param).deconverter; bool is_write_failed = false; std::vector> result_buffer_param( diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index ffb97914fb8c0..8380177963ed9 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -20,7 +20,7 @@ namespace paddle { namespace distributed { -int CtrCommonAccessor::initialize() { +int CtrCommonAccessor::Initialize() { auto name = _config.embed_sgd_param().name(); _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); @@ -39,73 +39,72 @@ int CtrCommonAccessor::initialize() { } void CtrCommonAccessor::SetTableInfo(AccessorInfo& info) { - info.dim = dim(); - info.size = size(); - info.select_dim = select_dim(); - info.select_size = select_size(); - info.update_dim = update_dim(); - info.update_size = update_size(); - info.mf_size = mf_size(); - info.fea_dim = fea_dim(); + info.dim = Dim(); + info.size = Size(); + info.select_dim = SelectDim(); + info.select_size = SelectSize(); + info.update_dim = UpdateDim(); + info.update_size = UpdateSize(); + info.mf_size = MFSize(); } size_t CtrCommonAccessor::GetTableInfo(InfoKey key) { switch (key) { case DIM: - return dim(); + return Dim(); case SIZE: - return size(); + return Size(); case SELECT_DIM: - return select_dim(); + return SelectDim(); case SELECT_SIZE: - return select_size(); + return SelectSize(); case UPDATE_DIM: - return update_dim(); + return UpdateDim(); case UPDATE_SIZE: - return update_size(); + return UpdateSize(); case MF_SIZE: - return mf_size(); - case FEA_DIM: - return fea_dim(); + return MFSize(); + default: + return 0; } return 0; } -size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } +size_t CtrCommonAccessor::Dim() { return common_feature_value.Dim(); } -size_t CtrCommonAccessor::dim_size(size_t dim) { +size_t CtrCommonAccessor::DimSize(size_t dim) { auto embedx_dim = _config.embedx_dim(); - return common_feature_value.dim_size(dim, embedx_dim); + return common_feature_value.DimSize(dim, embedx_dim); } -size_t CtrCommonAccessor::size() { return common_feature_value.size(); } +size_t CtrCommonAccessor::Size() { return common_feature_value.Size(); } -size_t CtrCommonAccessor::mf_size() { +size_t CtrCommonAccessor::MFSize() { return (_config.embedx_dim() + common_feature_value.embedx_sgd_dim) * sizeof(float); // embedx embedx_g2sum } // pull value -size_t CtrCommonAccessor::select_dim() { +size_t CtrCommonAccessor::SelectDim() { auto embedx_dim = _config.embedx_dim(); return 3 + embedx_dim; } -size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); } +size_t CtrCommonAccessor::SelectDimSize(size_t dim) { return sizeof(float); } -size_t CtrCommonAccessor::select_size() { return select_dim() * sizeof(float); } +size_t CtrCommonAccessor::SelectSize() { return SelectDim() * sizeof(float); } // push value -size_t CtrCommonAccessor::update_dim() { +size_t CtrCommonAccessor::UpdateDim() { auto embedx_dim = _config.embedx_dim(); return 4 + embedx_dim; } -size_t CtrCommonAccessor::update_dim_size(size_t dim) { return sizeof(float); } +size_t CtrCommonAccessor::UpdateDimSize(size_t dim) { return sizeof(float); } -size_t CtrCommonAccessor::update_size() { return update_dim() * sizeof(float); } 
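With the per-size virtuals removed from ValueAccessor, callers query dimensions through the single GetTableInfo(InfoKey) entry point, as the switch above does. A standalone model with stand-in enum values and dimensions:

    #include <cstddef>
    #include <iostream>

    enum ToyInfoKey { DIM, SIZE, SELECT_DIM, SELECT_SIZE, UPDATE_DIM, UPDATE_SIZE, MF_SIZE };

    class ToyAccessor {
     public:
      explicit ToyAccessor(size_t embedx_dim) : embedx_dim_(embedx_dim) {}
      size_t GetTableInfo(ToyInfoKey key) const {
        switch (key) {
          case SELECT_DIM:  return 3 + embedx_dim_;                       // pull value dims
          case SELECT_SIZE: return GetTableInfo(SELECT_DIM) * sizeof(float);
          case UPDATE_DIM:  return 4 + embedx_dim_;                       // push value dims
          case UPDATE_SIZE: return GetTableInfo(UPDATE_DIM) * sizeof(float);
          default:          return 0;  // unknown keys fall through to 0, as above
        }
      }

     private:
      size_t embedx_dim_;
    };

    int main() {
      ToyAccessor accessor(8);
      // callers size their buffers from the keyed query instead of select_size():
      std::cout << accessor.GetTableInfo(SELECT_SIZE) / sizeof(float) << std::endl;  // 11
      return 0;
    }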
+size_t CtrCommonAccessor::UpdateSize() { return UpdateDim() * sizeof(float); } -bool CtrCommonAccessor::shrink(float* value) { +bool CtrCommonAccessor::Shrink(float* value) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delete_after_unseen_days = @@ -113,12 +112,12 @@ bool CtrCommonAccessor::shrink(float* value) { auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); // time_decay first - common_feature_value.show(value) *= _show_click_decay_rate; - common_feature_value.click(value) *= _show_click_decay_rate; + common_feature_value.Show(value) *= _show_click_decay_rate; + common_feature_value.Click(value) *= _show_click_decay_rate; // shrink after - auto score = show_click_score(common_feature_value.show(value), - common_feature_value.click(value)); + auto score = show_click_score(common_feature_value.Show(value), + common_feature_value.Click(value)); auto unseen_days = common_feature_value.unseen_days(value); if (score < delete_threshold || unseen_days > delete_after_unseen_days) { return true; @@ -126,7 +125,7 @@ bool CtrCommonAccessor::shrink(float* value) { return false; } -bool CtrCommonAccessor::save(float* value, int param) { +bool CtrCommonAccessor::Save(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -142,8 +141,8 @@ bool CtrCommonAccessor::save(float* value, int param) { case 1: // save xbox base case 2: { - if (show_click_score(common_feature_value.show(value), - common_feature_value.click(value)) >= + if (show_click_score(common_feature_value.Show(value), + common_feature_value.Click(value)) >= base_threshold && common_feature_value.delta_score(value) >= delta_threshold && common_feature_value.unseen_days(value) <= delta_keep_days) { @@ -171,7 +170,7 @@ bool CtrCommonAccessor::save(float* value, int param) { } } -void CtrCommonAccessor::update_stat_after_save(float* value, int param) { +void CtrCommonAccessor::UpdateStatAfterSave(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -180,8 +179,8 @@ void CtrCommonAccessor::update_stat_after_save(float* value, int param) { } switch (param) { case 1: { - if (show_click_score(common_feature_value.show(value), - common_feature_value.click(value)) >= + if (show_click_score(common_feature_value.Show(value), + common_feature_value.Click(value)) >= base_threshold && common_feature_value.delta_score(value) >= delta_threshold && common_feature_value.unseen_days(value) <= delta_keep_days) { @@ -198,52 +197,52 @@ void CtrCommonAccessor::update_stat_after_save(float* value, int param) { } } -int32_t CtrCommonAccessor::create(float** values, size_t num) { +int32_t CtrCommonAccessor::Create(float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* value = values[value_item]; value[common_feature_value.unseen_days_index()] = 0; value[common_feature_value.delta_score_index()] = 0; - value[common_feature_value.show_index()] = 0; - value[common_feature_value.click_index()] = 0; - value[common_feature_value.slot_index()] = -1; + value[common_feature_value.ShowIndex()] = 0; + 
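The shrink decision above is: decay show and click, recompute the show-click score, then drop the feature if the score falls below delete_threshold or the feature has gone unseen too long. A standalone sketch with illustrative coefficients (the real values come from ctr_accessor_param):

    #include <iostream>

    float ShowClickScore(float show, float click, float nonclk_coeff, float click_coeff) {
      return (show - click) * nonclk_coeff + click * click_coeff;
    }

    bool ShouldShrink(float* show, float* click, int unseen_days) {
      const float decay = 0.98f;        // stand-in for _show_click_decay_rate
      const float nonclk_coeff = 0.1f;  // stand-in for ctr_accessor_param().nonclk_coeff()
      const float click_coeff = 1.0f;   // stand-in for ctr_accessor_param().click_coeff()
      const float delete_threshold = 0.8f;
      const int delete_after_unseen_days = 30;

      *show *= decay;                   // time decay first, as in Shrink()
      *click *= decay;
      float score = ShowClickScore(*show, *click, nonclk_coeff, click_coeff);
      return score < delete_threshold || unseen_days > delete_after_unseen_days;
    }

    int main() {
      float show = 10.0f, click = 1.0f;
      std::cout << ShouldShrink(&show, &click, 3) << std::endl;   // 0: keep
      std::cout << ShouldShrink(&show, &click, 60) << std::endl;  // 1: drop, too stale
      return 0;
    }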
value[common_feature_value.ClickIndex()] = 0; + value[common_feature_value.SlotIndex()] = -1; _embed_sgd_rule->init_value( - value + common_feature_value.embed_w_index(), + value + common_feature_value.Embed_W_Index(), value + common_feature_value.embed_g2sum_index()); _embedx_sgd_rule->init_value( - value + common_feature_value.embedx_w_index(), + value + common_feature_value.Embedx_W_Index(), value + common_feature_value.embedx_g2sum_index(), false); } return 0; } -bool CtrCommonAccessor::need_extend_mf(float* value) { - float show = value[common_feature_value.show_index()]; - float click = value[common_feature_value.click_index()]; +bool CtrCommonAccessor::NeedExtendMF(float* value) { + float show = value[common_feature_value.ShowIndex()]; + float click = value[common_feature_value.ClickIndex()]; float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + click * _config.ctr_accessor_param().click_coeff(); return score >= _config.embedx_threshold(); } -bool CtrCommonAccessor::has_mf(size_t size) { +bool CtrCommonAccessor::HasMF(size_t size) { return size > common_feature_value.embedx_g2sum_index(); } // from CommonFeatureValue to CtrCommonPullValue -int32_t CtrCommonAccessor::select(float** select_values, const float** values, +int32_t CtrCommonAccessor::Select(float** select_values, const float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* select_value = select_values[value_item]; const float* value = values[value_item]; - select_value[CtrCommonPullValue::show_index()] = - value[common_feature_value.show_index()]; - select_value[CtrCommonPullValue::click_index()] = - value[common_feature_value.click_index()]; - select_value[CtrCommonPullValue::embed_w_index()] = - value[common_feature_value.embed_w_index()]; - memcpy(select_value + CtrCommonPullValue::embedx_w_index(), - value + common_feature_value.embedx_w_index(), + select_value[CtrCommonPullValue::ShowIndex()] = + value[common_feature_value.ShowIndex()]; + select_value[CtrCommonPullValue::ClickIndex()] = + value[common_feature_value.ClickIndex()]; + select_value[CtrCommonPullValue::Embed_W_Index()] = + value[common_feature_value.Embed_W_Index()]; + memcpy(select_value + CtrCommonPullValue::Embedx_W_Index(), + value + common_feature_value.Embedx_W_Index(), embedx_dim * sizeof(float)); } return 0; @@ -252,16 +251,16 @@ int32_t CtrCommonAccessor::select(float** select_values, const float** values, // from CtrCommonPushValue to CtrCommonPushValue // first dim: item // second dim: field num -int32_t CtrCommonAccessor::merge(float** update_values, +int32_t CtrCommonAccessor::Merge(float** update_values, const float** other_update_values, size_t num) { auto embedx_dim = _config.embedx_dim(); - size_t total_dim = CtrCommonPushValue::dim(embedx_dim); + size_t total_dim = CtrCommonPushValue::Dim(embedx_dim); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; for (auto i = 0u; i < total_dim; ++i) { - if (i != CtrCommonPushValue::slot_index()) { + if (i != CtrCommonPushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } } @@ -272,43 +271,43 @@ int32_t CtrCommonAccessor::merge(float** update_values, // from CtrCommonPushValue to CommonFeatureValue // first dim: item // second dim: field num -int32_t CtrCommonAccessor::update(float** update_values, +int32_t CtrCommonAccessor::Update(float** update_values, 
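Merge above sums push values element-wise across workers but skips the slot field, which identifies the feature's slot rather than carrying a gradient. A standalone model with an illustrative layout:

    #include <iostream>
    #include <vector>

    void MergePush(std::vector<float>* update, const std::vector<float>& other,
                   size_t slot_index) {
      for (size_t i = 0; i < update->size(); ++i) {
        if (i != slot_index) {
          (*update)[i] += other[i];  // accumulate show, click, and gradients
        }
      }
    }

    int main() {
      std::vector<float> a = {/*slot*/ 7.f, /*show*/ 1.f, /*click*/ 0.f, /*g*/ 0.5f};
      std::vector<float> b = {/*slot*/ 7.f, /*show*/ 2.f, /*click*/ 1.f, /*g*/ -0.25f};
      MergePush(&a, b, /*slot_index=*/0);
      std::cout << a[0] << " " << a[1] << " " << a[3] << std::endl;  // 7 3 0.25
      return 0;
    }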
const float** push_values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* push_value = push_values[value_item]; - float push_show = push_value[CtrCommonPushValue::show_index()]; - float push_click = push_value[CtrCommonPushValue::click_index()]; - float slot = push_value[CtrCommonPushValue::slot_index()]; - update_value[common_feature_value.show_index()] += push_show; - update_value[common_feature_value.click_index()] += push_click; - update_value[common_feature_value.slot_index()] = slot; + float push_show = push_value[CtrCommonPushValue::ShowIndex()]; + float push_click = push_value[CtrCommonPushValue::ClickIndex()]; + float slot = push_value[CtrCommonPushValue::SlotIndex()]; + update_value[common_feature_value.ShowIndex()] += push_show; + update_value[common_feature_value.ClickIndex()] += push_click; + update_value[common_feature_value.SlotIndex()] = slot; update_value[common_feature_value.delta_score_index()] += (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + push_click * _config.ctr_accessor_param().click_coeff(); update_value[common_feature_value.unseen_days_index()] = 0; _embed_sgd_rule->update_value( - update_value + common_feature_value.embed_w_index(), + update_value + common_feature_value.Embed_W_Index(), update_value + common_feature_value.embed_g2sum_index(), - push_value + CtrCommonPushValue::embed_g_index()); + push_value + CtrCommonPushValue::Embed_G_Index()); _embedx_sgd_rule->update_value( - update_value + common_feature_value.embedx_w_index(), + update_value + common_feature_value.Embedx_W_Index(), update_value + common_feature_value.embedx_g2sum_index(), - push_value + CtrCommonPushValue::embedx_g_index()); + push_value + CtrCommonPushValue::Embedx_G_Index()); } return 0; } -bool CtrCommonAccessor::create_value(int stage, const float* value) { +bool CtrCommonAccessor::CreateValue(int stage, const float* value) { // stage == 0, pull // stage == 1, push if (stage == 0) { return true; } else if (stage == 1) { // operation - auto show = CtrCommonPushValue::show(const_cast(value)); - auto click = CtrCommonPushValue::click(const_cast(value)); + auto show = CtrCommonPushValue::Show(const_cast(value)); + auto click = CtrCommonPushValue::Click(const_cast(value)); auto score = show_click_score(show, click); if (score <= 0) { return false; @@ -329,34 +328,34 @@ float CtrCommonAccessor::show_click_score(float show, float click) { return (show - click) * nonclk_coeff + click * click_coeff; } -std::string CtrCommonAccessor::parse_to_string(const float* v, int param) { +std::string CtrCommonAccessor::ParseToString(const float* v, int param) { thread_local std::ostringstream os; os.clear(); os.str(""); os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " " << v[5]; for (int i = common_feature_value.embed_g2sum_index(); - i < common_feature_value.embedx_w_index(); i++) { + i < common_feature_value.Embedx_W_Index(); i++) { os << " " << v[i]; } - auto show = common_feature_value.show(const_cast(v)); - auto click = common_feature_value.click(const_cast(v)); + auto show = common_feature_value.Show(const_cast(v)); + auto click = common_feature_value.Click(const_cast(v)); auto score = show_click_score(show, click); if (score >= _config.embedx_threshold() && - param > common_feature_value.embedx_w_index()) { - for (auto i = common_feature_value.embedx_w_index(); - i < common_feature_value.dim(); ++i) { + param > 
common_feature_value.Embedx_W_Index()) { + for (auto i = common_feature_value.Embedx_W_Index(); + i < common_feature_value.Dim(); ++i) { os << " " << v[i]; } } return os.str(); } -int CtrCommonAccessor::parse_from_string(const std::string& str, float* value) { +int CtrCommonAccessor::ParseFromString(const std::string& str, float* value) { int embedx_dim = _config.embedx_dim(); _embedx_sgd_rule->init_value( - value + common_feature_value.embedx_w_index(), + value + common_feature_value.Embedx_W_Index(), value + common_feature_value.embedx_g2sum_index()); auto ret = paddle::string::str_to_float(str.data(), value); CHECK(ret >= 6) << "expect more than 6 real:" << ret; diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index a2121b21d9fe6..21dfc6a5c1c38 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -40,27 +40,27 @@ class CtrCommonAccessor : public ValueAccessor { std::float embedx_g2sum; */ - int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } - int dim_size(size_t dim, int embedx_dim) { return sizeof(float); } - int size() { return dim() * sizeof(float); } - int slot_index() { return 0; } - int unseen_days_index() { return slot_index() + 1; } + int Dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } + int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } + int Size() { return Dim() * sizeof(float); } + int SlotIndex() { return 0; } + int unseen_days_index() { return SlotIndex() + 1; } int delta_score_index() { return unseen_days_index() + 1; } - int show_index() { return delta_score_index() + 1; } - int click_index() { return show_index() + 1; } - int embed_w_index() { return click_index() + 1; } - int embed_g2sum_index() { return embed_w_index() + 1; } - int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; } - int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; } + int ShowIndex() { return delta_score_index() + 1; } + int ClickIndex() { return ShowIndex() + 1; } + int Embed_W_Index() { return ClickIndex() + 1; } + int embed_g2sum_index() { return Embed_W_Index() + 1; } + int Embedx_W_Index() { return embed_g2sum_index() + embed_sgd_dim; } + int embedx_g2sum_index() { return Embedx_W_Index() + embedx_dim; } float& unseen_days(float* val) { return val[unseen_days_index()]; } float& delta_score(float* val) { return val[delta_score_index()]; } - float& show(float* val) { return val[show_index()]; } - float& click(float* val) { return val[click_index()]; } - float& slot(float* val) { return val[slot_index()]; } - float& embed_w(float* val) { return val[embed_w_index()]; } + float& Show(float* val) { return val[ShowIndex()]; } + float& Click(float* val) { return val[ClickIndex()]; } + float& Slot(float* val) { return val[SlotIndex()]; } + float& EmbedW(float* val) { return val[Embed_W_Index()]; } float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; } - float& embedx_w(float* val) { return val[embedx_w_index()]; } + float& EmbedxW(float* val) { return val[Embedx_W_Index()]; } float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; } int embed_sgd_dim; @@ -77,31 +77,31 @@ class CtrCommonAccessor : public ValueAccessor { std::vector embedx_g; */ - static int dim(int embedx_dim) { return 4 + embedx_dim; } + static int Dim(int embedx_dim) { return 4 + embedx_dim; } - static int dim_size(int dim, int embedx_dim) { return sizeof(float); } - static int size(int embedx_dim) { 
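The index methods above define a flat float layout for CommonFeatureValue. The standalone sketch below reproduces that index arithmetic with stand-in names; the dimensions chosen in main() are only an example.

    #include <iostream>

    struct ToyCommonFeatureLayout {
      int embed_sgd_dim;
      int embedx_dim;
      int embedx_sgd_dim;

      int SlotIndex() const { return 0; }
      int UnseenDaysIndex() const { return SlotIndex() + 1; }
      int DeltaScoreIndex() const { return UnseenDaysIndex() + 1; }
      int ShowIndex() const { return DeltaScoreIndex() + 1; }
      int ClickIndex() const { return ShowIndex() + 1; }
      int EmbedWIndex() const { return ClickIndex() + 1; }
      int EmbedG2SumIndex() const { return EmbedWIndex() + 1; }
      int EmbedxWIndex() const { return EmbedG2SumIndex() + embed_sgd_dim; }
      int EmbedxG2SumIndex() const { return EmbedxWIndex() + embedx_dim; }
      int Dim() const { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; }
    };

    int main() {
      ToyCommonFeatureLayout layout{/*embed_sgd_dim=*/1, /*embedx_dim=*/8,
                                    /*embedx_sgd_dim=*/1};
      std::cout << layout.EmbedxWIndex() << " " << layout.Dim() << std::endl;  // 7 16
      return 0;
    }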
return dim(embedx_dim) * sizeof(float); } - static int slot_index() { return 0; } - static int show_index() { return CtrCommonPushValue::slot_index() + 1; } - static int click_index() { return CtrCommonPushValue::show_index() + 1; } - static int embed_g_index() { return CtrCommonPushValue::click_index() + 1; } - static int embedx_g_index() { - return CtrCommonPushValue::embed_g_index() + 1; + static int DimSize(int dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int SlotIndex() { return 0; } + static int ShowIndex() { return CtrCommonPushValue::SlotIndex() + 1; } + static int ClickIndex() { return CtrCommonPushValue::ShowIndex() + 1; } + static int Embed_G_Index() { return CtrCommonPushValue::ClickIndex() + 1; } + static int Embedx_G_Index() { + return CtrCommonPushValue::Embed_G_Index() + 1; } - static float& slot(float* val) { - return val[CtrCommonPushValue::slot_index()]; + static float& Slot(float* val) { + return val[CtrCommonPushValue::SlotIndex()]; } - static float& show(float* val) { - return val[CtrCommonPushValue::show_index()]; + static float& Show(float* val) { + return val[CtrCommonPushValue::ShowIndex()]; } - static float& click(float* val) { - return val[CtrCommonPushValue::click_index()]; + static float& Click(float* val) { + return val[CtrCommonPushValue::ClickIndex()]; } - static float& embed_g(float* val) { - return val[CtrCommonPushValue::embed_g_index()]; + static float& EmbedG(float* val) { + return val[CtrCommonPushValue::Embed_G_Index()]; } - static float* embedx_g(float* val) { - return val + CtrCommonPushValue::embedx_g_index(); + static float* EmbedxG(float* val) { + return val + CtrCommonPushValue::Embedx_G_Index(); } }; @@ -113,90 +113,90 @@ class CtrCommonAccessor : public ValueAccessor { std::vector embedx_w; */ - static int dim(int embedx_dim) { return 3 + embedx_dim; } - static int dim_size(size_t dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int show_index() { return 0; } - static int click_index() { return 1; } - static int embed_w_index() { return 2; } - static int embedx_w_index() { return 3; } - static float& show(float* val) { - return val[CtrCommonPullValue::show_index()]; + static int Dim(int embedx_dim) { return 3 + embedx_dim; } + static int DimSize(size_t dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int ShowIndex() { return 0; } + static int ClickIndex() { return 1; } + static int Embed_W_Index() { return 2; } + static int Embedx_W_Index() { return 3; } + static float& Show(float* val) { + return val[CtrCommonPullValue::ShowIndex()]; } - static float& click(float* val) { - return val[CtrCommonPullValue::click_index()]; + static float& Click(float* val) { + return val[CtrCommonPullValue::ClickIndex()]; } - static float& embed_w(float* val) { - return val[CtrCommonPullValue::embed_w_index()]; + static float& EmbedW(float* val) { + return val[CtrCommonPullValue::Embed_W_Index()]; } - static float* embedx_w(float* val) { - return val + CtrCommonPullValue::embedx_w_index(); + static float* EmbedxW(float* val) { + return val + CtrCommonPullValue::Embedx_W_Index(); } }; CtrCommonAccessor() {} - virtual int initialize(); + virtual int Initialize(); virtual ~CtrCommonAccessor() {} virtual void SetTableInfo(AccessorInfo& info); virtual size_t GetTableInfo(InfoKey key); // value维度 - virtual size_t dim(); + size_t Dim(); // 
value各个维度的size - virtual size_t dim_size(size_t dim); + size_t DimSize(size_t dim); // value各维度相加总size - virtual size_t size(); + size_t Size(); // value中mf动态长度部分总size大小, sparse下生效 - virtual size_t mf_size(); + size_t MFSize(); // pull value维度 - virtual size_t select_dim(); + size_t SelectDim(); // pull value各个维度的size - virtual size_t select_dim_size(size_t dim); + size_t SelectDimSize(size_t dim); // pull value各维度相加总size - virtual size_t select_size(); + size_t SelectSize(); // push value维度 - virtual size_t update_dim(); + size_t UpdateDim(); // push value各个维度的size - virtual size_t update_dim_size(size_t dim); + size_t UpdateDimSize(size_t dim); // push value各维度相加总size - virtual size_t update_size(); + size_t UpdateSize(); // 判断该value是否进行shrink - virtual bool shrink(float* value); + virtual bool Shrink(float* value); // 判断该value是否保存到ssd // virtual bool save_ssd(float* value); - virtual bool need_extend_mf(float* value); - virtual bool has_mf(size_t size); + virtual bool NeedExtendMF(float* value); + virtual bool HasMF(size_t size); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature // param = 1, save delta feature // param = 2, save xbox base feature - bool save(float* value, int param) override; + bool Save(float* value, int param) override; // update delta_score and unseen_days after save - void update_stat_after_save(float* value, int param) override; + void UpdateStatAfterSave(float* value, int param) override; // keys不存在时,为values生成随机值 // 要求value的内存由外部调用者分配完毕 - virtual int32_t create(float** value, size_t num); + virtual int32_t Create(float** value, size_t num); // 从values中选取到select_values中 - virtual int32_t select(float** select_values, const float** values, + virtual int32_t Select(float** select_values, const float** values, size_t num); // 将update_values聚合到一起 - virtual int32_t merge(float** update_values, + virtual int32_t Merge(float** update_values, const float** other_update_values, size_t num); // 将update_values聚合到一起,通过it.next判定是否进入下一个key - // virtual int32_t merge(float** update_values, iterator it); + // virtual int32_t Merge(float** update_values, iterator it); // 将update_values更新应用到values中 - virtual int32_t update(float** values, const float** update_values, + virtual int32_t Update(float** values, const float** update_values, size_t num); - std::string parse_to_string(const float* value, int param) override; - int32_t parse_from_string(const std::string& str, float* v) override; - virtual bool create_value(int type, const float* value); + std::string ParseToString(const float* value, int param) override; + int32_t ParseFromString(const std::string& str, float* v) override; + virtual bool CreateValue(int type, const float* value); // 这个接口目前只用来取show - float get_field(float* value, const std::string& name) override { + float GetField(float* value, const std::string& name) override { // CHECK(name == "show"); if (name == "show") { - return common_feature_value.show(value); + return common_feature_value.Show(value); } return 0.0; } diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 0e3df6e82521d..ed21a6dac317e 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -20,7 +20,7 @@ namespace paddle { namespace distributed { -int DownpourCtrDoubleAccessor::initialize() { +int DownpourCtrDoubleAccessor::Initialize() { auto name = _config.embed_sgd_param().name(); 
_embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); @@ -38,76 +38,75 @@ int DownpourCtrDoubleAccessor::initialize() { } void DownpourCtrDoubleAccessor::SetTableInfo(AccessorInfo& info) { - info.dim = dim(); - info.size = size(); - info.select_dim = select_dim(); - info.select_size = select_size(); - info.update_dim = update_dim(); - info.update_size = update_size(); - info.mf_size = mf_size(); - info.fea_dim = fea_dim(); + info.dim = Dim(); + info.size = Size(); + info.select_dim = SelectDim(); + info.select_size = SelectSize(); + info.update_dim = UpdateDim(); + info.update_size = UpdateSize(); + info.mf_size = MFSize(); } size_t DownpourCtrDoubleAccessor::GetTableInfo(InfoKey key) { switch (key) { case DIM: - return dim(); + return Dim(); case SIZE: - return size(); + return Size(); case SELECT_DIM: - return select_dim(); + return SelectDim(); case SELECT_SIZE: - return select_size(); + return SelectSize(); case UPDATE_DIM: - return update_dim(); + return UpdateDim(); case UPDATE_SIZE: - return update_size(); + return UpdateSize(); case MF_SIZE: - return mf_size(); - case FEA_DIM: - return fea_dim(); + return MFSize(); + default: + return 0; } return 0; } -size_t DownpourCtrDoubleAccessor::dim() { +size_t DownpourCtrDoubleAccessor::Dim() { auto embedx_dim = _config.embedx_dim(); - return DownpourCtrDoubleFeatureValue::dim(embedx_dim); + return DownpourCtrDoubleFeatureValue::Dim(embedx_dim); } -size_t DownpourCtrDoubleAccessor::dim_size(size_t dim) { +size_t DownpourCtrDoubleAccessor::DimSize(size_t dim) { auto embedx_dim = _config.embedx_dim(); - return DownpourCtrDoubleFeatureValue::dim_size(dim, embedx_dim); + return DownpourCtrDoubleFeatureValue::DimSize(dim, embedx_dim); } -size_t DownpourCtrDoubleAccessor::size() { +size_t DownpourCtrDoubleAccessor::Size() { auto embedx_dim = _config.embedx_dim(); - return DownpourCtrDoubleFeatureValue::size(embedx_dim); + return DownpourCtrDoubleFeatureValue::Size(embedx_dim); } -size_t DownpourCtrDoubleAccessor::mf_size() { +size_t DownpourCtrDoubleAccessor::MFSize() { return (_config.embedx_dim() + 1) * sizeof(float); // embedx embedx_g2sum } // pull value -size_t DownpourCtrDoubleAccessor::select_dim() { +size_t DownpourCtrDoubleAccessor::SelectDim() { auto embedx_dim = _config.embedx_dim(); return 3 + embedx_dim; } -size_t DownpourCtrDoubleAccessor::select_dim_size(size_t dim) { +size_t DownpourCtrDoubleAccessor::SelectDimSize(size_t dim) { return sizeof(float); } -size_t DownpourCtrDoubleAccessor::select_size() { - return select_dim() * sizeof(float); +size_t DownpourCtrDoubleAccessor::SelectSize() { + return SelectDim() * sizeof(float); } // push value -size_t DownpourCtrDoubleAccessor::update_dim() { +size_t DownpourCtrDoubleAccessor::UpdateDim() { auto embedx_dim = _config.embedx_dim(); return 4 + embedx_dim; } -size_t DownpourCtrDoubleAccessor::update_dim_size(size_t dim) { +size_t DownpourCtrDoubleAccessor::UpdateDimSize(size_t dim) { return sizeof(float); } -size_t DownpourCtrDoubleAccessor::update_size() { - return update_dim() * sizeof(float); +size_t DownpourCtrDoubleAccessor::UpdateSize() { + return UpdateDim() * sizeof(float); } -bool DownpourCtrDoubleAccessor::shrink(float* value) { +bool DownpourCtrDoubleAccessor::Shrink(float* value) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); // auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); @@ 
-117,11 +116,11 @@ bool DownpourCtrDoubleAccessor::shrink(float* value) { _config.ctr_accessor_param().delete_after_unseen_days(); auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); // time_decay first - DownpourCtrDoubleFeatureValue::show(value) *= _show_click_decay_rate; - DownpourCtrDoubleFeatureValue::click(value) *= _show_click_decay_rate; + DownpourCtrDoubleFeatureValue::Show(value) *= _show_click_decay_rate; + DownpourCtrDoubleFeatureValue::Click(value) *= _show_click_decay_rate; // shrink after - auto score = show_click_score(DownpourCtrDoubleFeatureValue::show(value), - DownpourCtrDoubleFeatureValue::click(value)); + auto score = show_click_score(DownpourCtrDoubleFeatureValue::Show(value), + DownpourCtrDoubleFeatureValue::Click(value)); auto unseen_days = DownpourCtrDoubleFeatureValue::unseen_days(value); if (score < delete_threshold || unseen_days > delete_after_unseen_days) { return true; @@ -139,16 +138,16 @@ bool DownpourCtrDoubleAccessor::save_ssd(float* value) { // float* value, int param, double global_cache_threshold) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); -// if (show_click_score(DownpourCtrDoubleFeatureValue::show(value), -// DownpourCtrDoubleFeatureValue::click(value)) >= base_threshold +// if (show_click_score(DownpourCtrDoubleFeatureValue::Show(value), +// DownpourCtrDoubleFeatureValue::Click(value)) >= base_threshold // && DownpourCtrDoubleFeatureValue::unseen_days(value) <= // delta_keep_days) { -// return DownpourCtrDoubleFeatureValue::show(value) > +// return DownpourCtrDoubleFeatureValue::Show(value) > // global_cache_threshold; // } // return false; // } -bool DownpourCtrDoubleAccessor::save(float* value, int param) { +bool DownpourCtrDoubleAccessor::Save(float* value, int param) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); // auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -167,8 +166,8 @@ bool DownpourCtrDoubleAccessor::save(float* value, int param) { case 1: // save xbox base case 2: { - if (show_click_score(DownpourCtrDoubleFeatureValue::show(value), - DownpourCtrDoubleFeatureValue::click(value)) >= + if (show_click_score(DownpourCtrDoubleFeatureValue::Show(value), + DownpourCtrDoubleFeatureValue::Click(value)) >= base_threshold && DownpourCtrDoubleFeatureValue::delta_score(value) >= delta_threshold && @@ -185,8 +184,8 @@ bool DownpourCtrDoubleAccessor::save(float* value, int param) { } // already decayed in shrink case 3: { - // DownpourCtrFeatureValue::show(value) *= _show_click_decay_rate; - // DownpourCtrFeatureValue::click(value) *= _show_click_decay_rate; + // DownpourCtrFeatureValue::Show(value) *= _show_click_decay_rate; + // DownpourCtrFeatureValue::Click(value) *= _show_click_decay_rate; // do this after save, because it must not be modified when retry // DownpourCtrDoubleFeatureValue::unseen_days(value)++; return true; @@ -196,8 +195,7 @@ bool DownpourCtrDoubleAccessor::save(float* value, int param) { }; } -void DownpourCtrDoubleAccessor::update_stat_after_save(float* value, - int param) { +void DownpourCtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -206,8 +204,8 
@@ void DownpourCtrDoubleAccessor::update_stat_after_save(float* value, } switch (param) { case 1: { - if (show_click_score(DownpourCtrDoubleFeatureValue::show(value), - DownpourCtrDoubleFeatureValue::click(value)) >= + if (show_click_score(DownpourCtrDoubleFeatureValue::Show(value), + DownpourCtrDoubleFeatureValue::Click(value)) >= base_threshold && DownpourCtrDoubleFeatureValue::delta_score(value) >= delta_threshold && @@ -226,29 +224,29 @@ void DownpourCtrDoubleAccessor::update_stat_after_save(float* value, }; } -int32_t DownpourCtrDoubleAccessor::create(float** values, size_t num) { +int32_t DownpourCtrDoubleAccessor::Create(float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* value = values[value_item]; value[DownpourCtrDoubleFeatureValue::unseen_days_index()] = 0; value[DownpourCtrDoubleFeatureValue::delta_score_index()] = 0; - *(double*)(value + DownpourCtrDoubleFeatureValue::show_index()) = 0; - *(double*)(value + DownpourCtrDoubleFeatureValue::click_index()) = 0; - value[DownpourCtrDoubleFeatureValue::slot_index()] = -1; + *(double*)(value + DownpourCtrDoubleFeatureValue::ShowIndex()) = 0; + *(double*)(value + DownpourCtrDoubleFeatureValue::ClickIndex()) = 0; + value[DownpourCtrDoubleFeatureValue::SlotIndex()] = -1; _embed_sgd_rule->init_value( - value + DownpourCtrDoubleFeatureValue::embed_w_index(), + value + DownpourCtrDoubleFeatureValue::Embed_W_Index(), value + DownpourCtrDoubleFeatureValue::embed_g2sum_index()); _embedx_sgd_rule->init_value( - value + DownpourCtrDoubleFeatureValue::embedx_w_index(), + value + DownpourCtrDoubleFeatureValue::Embedx_W_Index(), value + DownpourCtrDoubleFeatureValue::embedx_g2sum_index(), false); } return 0; } -bool DownpourCtrDoubleAccessor::need_extend_mf(float* value) { +bool DownpourCtrDoubleAccessor::NeedExtendMF(float* value) { auto show = - ((double*)(value + DownpourCtrDoubleFeatureValue::show_index()))[0]; + ((double*)(value + DownpourCtrDoubleFeatureValue::ShowIndex()))[0]; auto click = - ((double*)(value + DownpourCtrDoubleFeatureValue::click_index()))[0]; + ((double*)(value + DownpourCtrDoubleFeatureValue::ClickIndex()))[0]; // float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() auto score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + click * _config.ctr_accessor_param().click_coeff(); @@ -256,20 +254,20 @@ bool DownpourCtrDoubleAccessor::need_extend_mf(float* value) { return score >= _config.embedx_threshold(); } // from DownpourCtrFeatureValue to DownpourCtrPullValue -int32_t DownpourCtrDoubleAccessor::select(float** select_values, +int32_t DownpourCtrDoubleAccessor::Select(float** select_values, const float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* select_value = select_values[value_item]; float* value = const_cast(values[value_item]); - select_value[DownpourCtrDoublePullValue::show_index()] = - (float)*(double*)(value + DownpourCtrDoubleFeatureValue::show_index()); - select_value[DownpourCtrDoublePullValue::click_index()] = - (float)*(double*)(value + DownpourCtrDoubleFeatureValue::click_index()); - select_value[DownpourCtrDoublePullValue::embed_w_index()] = - value[DownpourCtrDoubleFeatureValue::embed_w_index()]; - memcpy(select_value + DownpourCtrDoublePullValue::embedx_w_index(), - value + DownpourCtrDoubleFeatureValue::embedx_w_index(), + select_value[DownpourCtrDoublePullValue::ShowIndex()] = + 
(float)*(double*)(value + DownpourCtrDoubleFeatureValue::ShowIndex()); + select_value[DownpourCtrDoublePullValue::ClickIndex()] = + (float)*(double*)(value + DownpourCtrDoubleFeatureValue::ClickIndex()); + select_value[DownpourCtrDoublePullValue::Embed_W_Index()] = + value[DownpourCtrDoubleFeatureValue::Embed_W_Index()]; + memcpy(select_value + DownpourCtrDoublePullValue::Embedx_W_Index(), + value + DownpourCtrDoubleFeatureValue::Embedx_W_Index(), embedx_dim * sizeof(float)); } return 0; @@ -277,23 +275,23 @@ int32_t DownpourCtrDoubleAccessor::select(float** select_values, // from DownpourCtrPushValue to DownpourCtrPushValue // first dim: item // second dim: field num -int32_t DownpourCtrDoubleAccessor::merge(float** update_values, +int32_t DownpourCtrDoubleAccessor::Merge(float** update_values, const float** other_update_values, size_t num) { auto embedx_dim = _config.embedx_dim(); - size_t total_dim = DownpourCtrDoublePushValue::dim(embedx_dim); + size_t total_dim = DownpourCtrDoublePushValue::Dim(embedx_dim); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; - /**(double*)(update_value + DownpourCtrDoublePushValue::show_index()) += - *(double*)(other_update_value + DownpourCtrDoublePushValue::show_index()); - *(double*)(update_value + DownpourCtrDoublePushValue::click_index()) += - *(double*)(other_update_value + DownpourCtrDoublePushValue::click_index()); + /**(double*)(update_value + DownpourCtrDoublePushValue::ShowIndex()) += + *(double*)(other_update_value + DownpourCtrDoublePushValue::ShowIndex()); + *(double*)(update_value + DownpourCtrDoublePushValue::ClickIndex()) += + *(double*)(other_update_value + DownpourCtrDoublePushValue::ClickIndex()); for (auto i = 3u; i < total_dim; ++i) { update_value[i] += other_update_value[i]; }*/ for (auto i = 0u; i < total_dim; ++i) { - if (i != DownpourCtrDoublePushValue::slot_index()) { + if (i != DownpourCtrDoublePushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } } @@ -303,21 +301,21 @@ int32_t DownpourCtrDoubleAccessor::merge(float** update_values, // from DownpourCtrPushValue to DownpourCtrFeatureValue // first dim: item // second dim: field num -int32_t DownpourCtrDoubleAccessor::update(float** update_values, +int32_t DownpourCtrDoubleAccessor::Update(float** update_values, const float** push_values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* push_value = push_values[value_item]; - float push_show = push_value[DownpourCtrDoublePushValue::show_index()]; - float push_click = push_value[DownpourCtrDoublePushValue::click_index()]; - float slot = push_value[DownpourCtrDoublePushValue::slot_index()]; - *(double*)(update_value + DownpourCtrDoubleFeatureValue::show_index()) += + float push_show = push_value[DownpourCtrDoublePushValue::ShowIndex()]; + float push_click = push_value[DownpourCtrDoublePushValue::ClickIndex()]; + float slot = push_value[DownpourCtrDoublePushValue::SlotIndex()]; + *(double*)(update_value + DownpourCtrDoubleFeatureValue::ShowIndex()) += (double)push_show; - *(double*)(update_value + DownpourCtrDoubleFeatureValue::click_index()) += + *(double*)(update_value + DownpourCtrDoubleFeatureValue::ClickIndex()) += (double)push_click; - update_value[DownpourCtrDoubleFeatureValue::slot_index()] = slot; + 
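Select above narrows show and click from doubles back to floats: the double accessor stores them as doubles inside the float array, so each occupies two consecutive float slots. A standalone sketch of that packing, using memcpy here to stay within strict aliasing (the patch code casts the pointer directly):

    #include <cstring>
    #include <iostream>

    int main() {
      float value[8] = {0.0f};
      const int show_index = 2;   // the double "show" occupies float slots 2 and 3
      const int click_index = 4;  // the double "click" occupies float slots 4 and 5

      double show = 123.5, click = 7.25;
      std::memcpy(value + show_index, &show, sizeof(double));
      std::memcpy(value + click_index, &click, sizeof(double));

      // a pull value wants plain floats, so the doubles are narrowed on the way out
      double stored_show = 0.0, stored_click = 0.0;
      std::memcpy(&stored_show, value + show_index, sizeof(double));
      std::memcpy(&stored_click, value + click_index, sizeof(double));
      std::cout << static_cast<float>(stored_show) << " "
                << static_cast<float>(stored_click) << std::endl;  // 123.5 7.25
      return 0;
    }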
update_value[DownpourCtrDoubleFeatureValue::SlotIndex()] = slot; update_value[DownpourCtrDoubleFeatureValue::delta_score_index()] += (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + push_click * _config.ctr_accessor_param().click_coeff(); @@ -325,24 +323,24 @@ int32_t DownpourCtrDoubleAccessor::update(float** update_values, // push_click * _config.ctr_accessor_param().click_coeff(); update_value[DownpourCtrDoubleFeatureValue::unseen_days_index()] = 0; _embed_sgd_rule->update_value( - update_value + DownpourCtrDoubleFeatureValue::embed_w_index(), + update_value + DownpourCtrDoubleFeatureValue::Embed_W_Index(), update_value + DownpourCtrDoubleFeatureValue::embed_g2sum_index(), - push_value + DownpourCtrDoublePushValue::embed_g_index(), push_show); + push_value + DownpourCtrDoublePushValue::Embed_G_Index(), push_show); _embedx_sgd_rule->update_value( - update_value + DownpourCtrDoubleFeatureValue::embedx_w_index(), + update_value + DownpourCtrDoubleFeatureValue::Embedx_W_Index(), update_value + DownpourCtrDoubleFeatureValue::embedx_g2sum_index(), - push_value + DownpourCtrDoublePushValue::embedx_g_index(), push_show); + push_value + DownpourCtrDoublePushValue::Embedx_G_Index(), push_show); } return 0; } -bool DownpourCtrDoubleAccessor::create_value(int stage, const float* value) { +bool DownpourCtrDoubleAccessor::CreateValue(int stage, const float* value) { // stage == 0, pull // stage == 1, push if (stage == 0) { return true; } else if (stage == 1) { - auto show = DownpourCtrDoublePushValue::show(const_cast(value)); - auto click = DownpourCtrDoublePushValue::click(const_cast(value)); + auto show = DownpourCtrDoublePushValue::Show(const_cast(value)); + auto click = DownpourCtrDoublePushValue::Click(const_cast(value)); auto score = show_click_score(show, click); if (score <= 0) { return false; @@ -363,16 +361,16 @@ double DownpourCtrDoubleAccessor::show_click_score(double show, double click) { auto click_coeff = _config.ctr_accessor_param().click_coeff(); return (show - click) * nonclk_coeff + click * click_coeff; } -std::string DownpourCtrDoubleAccessor::parse_to_string(const float* v, - int param_size) { +std::string DownpourCtrDoubleAccessor::ParseToString(const float* v, + int param_size) { thread_local std::ostringstream os; os.clear(); os.str(""); os << v[0] << " " << v[1] << " " << (float)((double*)(v + 2))[0] << " " << (float)((double*)(v + 4))[0] << " " << v[6] << " " << v[7] << " " << v[8]; - auto show = DownpourCtrDoubleFeatureValue::show(const_cast(v)); - auto click = DownpourCtrDoubleFeatureValue::click(const_cast(v)); + auto show = DownpourCtrDoubleFeatureValue::Show(const_cast(v)); + auto click = DownpourCtrDoubleFeatureValue::Click(const_cast(v)); auto score = show_click_score(show, click); if (score >= _config.embedx_threshold() && param_size > 9) { os << " " << v[9]; @@ -382,23 +380,23 @@ std::string DownpourCtrDoubleAccessor::parse_to_string(const float* v, } return os.str(); } -int DownpourCtrDoubleAccessor::parse_from_string(const std::string& str, - float* value) { +int DownpourCtrDoubleAccessor::ParseFromString(const std::string& str, + float* value) { int embedx_dim = _config.embedx_dim(); - float data_buff[dim() + 2]; + float data_buff[Dim() + 2]; float* data_buff_ptr = data_buff; _embedx_sgd_rule->init_value( - data_buff_ptr + DownpourCtrDoubleFeatureValue::embedx_w_index(), + data_buff_ptr + DownpourCtrDoubleFeatureValue::Embedx_W_Index(), data_buff_ptr + DownpourCtrDoubleFeatureValue::embedx_g2sum_index()); auto str_len = 
paddle::string::str_to_float(str.data(), data_buff_ptr); CHECK(str_len >= 6) << "expect more than 6 real:" << str_len; - int show_index = DownpourCtrDoubleFeatureValue::show_index(); - int click_index = DownpourCtrDoubleFeatureValue::click_index(); - int embed_w_index = DownpourCtrDoubleFeatureValue::embed_w_index(); + int show_index = DownpourCtrDoubleFeatureValue::ShowIndex(); + int click_index = DownpourCtrDoubleFeatureValue::ClickIndex(); + int embed_w_index = DownpourCtrDoubleFeatureValue::Embed_W_Index(); // no slot, embedx - int value_dim = dim(); + int value_dim = Dim(); int embedx_g2sum_index = DownpourCtrDoubleFeatureValue::embedx_g2sum_index(); - value[DownpourCtrDoubleFeatureValue::slot_index()] = -1; + value[DownpourCtrDoubleFeatureValue::SlotIndex()] = -1; // other case if (str_len == (value_dim - 1)) { // copy unseen_days..delta_score @@ -407,7 +405,7 @@ int DownpourCtrDoubleAccessor::parse_from_string(const std::string& str, *(double*)(value + show_index) = (double)data_buff_ptr[2]; *(double*)(value + click_index) = (double)data_buff_ptr[3]; // copy others - value[DownpourCtrDoubleFeatureValue::embed_w_index()] = data_buff_ptr[4]; + value[DownpourCtrDoubleFeatureValue::Embed_W_Index()] = data_buff_ptr[4]; value[DownpourCtrDoubleFeatureValue::embed_g2sum_index()] = data_buff_ptr[5]; memcpy(value + embedx_g2sum_index, data_buff_ptr + 6, diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index fb8b27ecfd985..29ddcbc86d7c7 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -38,36 +38,36 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { float embedx_g2sum; std::vector embedx_w; */ - static int dim(int embedx_dim) { return 8 + embedx_dim; } - static int dim_size(size_t dim, int embedx_dim) { return sizeof(float); } - static int size(int embedx_dim) { - return (dim(embedx_dim) + 2) * sizeof(float); + static int Dim(int embedx_dim) { return 8 + embedx_dim; } + static int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { + return (Dim(embedx_dim) + 2) * sizeof(float); } static int unseen_days_index() { return 0; } static int delta_score_index() { return DownpourCtrDoubleFeatureValue::unseen_days_index() + 1; } - static int show_index() { + static int ShowIndex() { return DownpourCtrDoubleFeatureValue::delta_score_index() + 1; } // show is double - static int click_index() { - return DownpourCtrDoubleFeatureValue::show_index() + 2; + static int ClickIndex() { + return DownpourCtrDoubleFeatureValue::ShowIndex() + 2; } // click is double - static int embed_w_index() { - return DownpourCtrDoubleFeatureValue::click_index() + 2; + static int Embed_W_Index() { + return DownpourCtrDoubleFeatureValue::ClickIndex() + 2; } static int embed_g2sum_index() { - return DownpourCtrDoubleFeatureValue::embed_w_index() + 1; + return DownpourCtrDoubleFeatureValue::Embed_W_Index() + 1; } - static int slot_index() { + static int SlotIndex() { return DownpourCtrDoubleFeatureValue::embed_g2sum_index() + 1; } static int embedx_g2sum_index() { - return DownpourCtrDoubleFeatureValue::slot_index() + 1; + return DownpourCtrDoubleFeatureValue::SlotIndex() + 1; } - static int embedx_w_index() { + static int Embedx_W_Index() { return DownpourCtrDoubleFeatureValue::embedx_g2sum_index() + 1; } static float& unseen_days(float* val) { @@ -76,17 +76,17 @@ class DownpourCtrDoubleAccessor : public 
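The header above defines Dim() as 8 + embedx_dim but Size() as (Dim() + 2) * sizeof(float), because show and click are stored as doubles and each needs one extra float slot. A standalone sketch of that arithmetic:

    #include <cstddef>
    #include <iostream>

    size_t DoubleValueDim(int embedx_dim) { return 8 + embedx_dim; }

    size_t DoubleValueBytes(int embedx_dim) {
      // +2 float slots for the widened show and click fields
      return (DoubleValueDim(embedx_dim) + 2) * sizeof(float);
    }

    int main() {
      std::cout << DoubleValueDim(8) << " " << DoubleValueBytes(8) << std::endl;  // 16 72
      return 0;
    }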
ValueAccessor { static float& delta_score(float* val) { return val[DownpourCtrDoubleFeatureValue::delta_score_index()]; } - static double& show(float* val) { - return ((double*)(val + DownpourCtrDoubleFeatureValue::show_index()))[0]; + static double& Show(float* val) { + return ((double*)(val + DownpourCtrDoubleFeatureValue::ShowIndex()))[0]; } - static double& click(float* val) { - return ((double*)(val + DownpourCtrDoubleFeatureValue::click_index()))[0]; + static double& Click(float* val) { + return ((double*)(val + DownpourCtrDoubleFeatureValue::ClickIndex()))[0]; } - static float& slot(float* val) { - return val[DownpourCtrDoubleFeatureValue::slot_index()]; + static float& Slot(float* val) { + return val[DownpourCtrDoubleFeatureValue::SlotIndex()]; } - static float& embed_w(float* val) { - return val[DownpourCtrDoubleFeatureValue::embed_w_index()]; + static float& EmbedW(float* val) { + return val[DownpourCtrDoubleFeatureValue::Embed_W_Index()]; } static float& embed_g2sum(float* val) { return val[DownpourCtrDoubleFeatureValue::embed_g2sum_index()]; @@ -94,8 +94,8 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { static float& embedx_g2sum(float* val) { return val[DownpourCtrDoubleFeatureValue::embedx_g2sum_index()]; } - static float* embedx_w(float* val) { - return (val + DownpourCtrDoubleFeatureValue::embedx_w_index()); + static float* EmbedxW(float* val) { + return (val + DownpourCtrDoubleFeatureValue::Embedx_W_Index()); } }; struct DownpourCtrDoublePushValue { @@ -106,36 +106,36 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { float embed_g; std::vector embedx_g; */ - static int dim(int embedx_dim) { return 4 + embedx_dim; } - static int dim_size(int dim, int embedx_dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int slot_index() { return 0; } - static int show_index() { - return DownpourCtrDoublePushValue::slot_index() + 1; + static int Dim(int embedx_dim) { return 4 + embedx_dim; } + static int DimSize(int dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int SlotIndex() { return 0; } + static int ShowIndex() { + return DownpourCtrDoublePushValue::SlotIndex() + 1; } - static int click_index() { - return DownpourCtrDoublePushValue::show_index() + 1; + static int ClickIndex() { + return DownpourCtrDoublePushValue::ShowIndex() + 1; } - static int embed_g_index() { - return DownpourCtrDoublePushValue::click_index() + 1; + static int Embed_G_Index() { + return DownpourCtrDoublePushValue::ClickIndex() + 1; } - static int embedx_g_index() { - return DownpourCtrDoublePushValue::embed_g_index() + 1; + static int Embedx_G_Index() { + return DownpourCtrDoublePushValue::Embed_G_Index() + 1; } - static float& slot(float* val) { - return val[DownpourCtrDoublePushValue::slot_index()]; + static float& Slot(float* val) { + return val[DownpourCtrDoublePushValue::SlotIndex()]; } - static float& show(float* val) { - return val[DownpourCtrDoublePushValue::show_index()]; + static float& Show(float* val) { + return val[DownpourCtrDoublePushValue::ShowIndex()]; } - static float& click(float* val) { - return val[DownpourCtrDoublePushValue::click_index()]; + static float& Click(float* val) { + return val[DownpourCtrDoublePushValue::ClickIndex()]; } - static float& embed_g(float* val) { - return val[DownpourCtrDoublePushValue::embed_g_index()]; + static float& EmbedG(float* val) { + return 
val[DownpourCtrDoublePushValue::Embed_G_Index()]; } - static float* embedx_g(float* val) { - return val + DownpourCtrDoublePushValue::embedx_g_index(); + static float* EmbedxG(float* val) { + return val + DownpourCtrDoublePushValue::Embedx_G_Index(); } }; struct DownpourCtrDoublePullValue { @@ -145,88 +145,88 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { float embed_w; std::vector embedx_w; */ - static int dim(int embedx_dim) { return 3 + embedx_dim; } - static int dim_size(size_t dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int show_index() { return 0; } - static int click_index() { return 1; } - static int embed_w_index() { return 2; } - static int embedx_w_index() { return 3; } - static float& show(float* val) { - return val[DownpourCtrDoublePullValue::show_index()]; + static int Dim(int embedx_dim) { return 3 + embedx_dim; } + static int DimSize(size_t dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int ShowIndex() { return 0; } + static int ClickIndex() { return 1; } + static int Embed_W_Index() { return 2; } + static int Embedx_W_Index() { return 3; } + static float& Show(float* val) { + return val[DownpourCtrDoublePullValue::ShowIndex()]; } - static float& click(float* val) { - return val[DownpourCtrDoublePullValue::click_index()]; + static float& Click(float* val) { + return val[DownpourCtrDoublePullValue::ClickIndex()]; } - static float& embed_w(float* val) { - return val[DownpourCtrDoublePullValue::embed_w_index()]; + static float& EmbedW(float* val) { + return val[DownpourCtrDoublePullValue::Embed_W_Index()]; } - static float* embedx_w(float* val) { - return val + DownpourCtrDoublePullValue::embedx_w_index(); + static float* EmbedxW(float* val) { + return val + DownpourCtrDoublePullValue::Embedx_W_Index(); } }; DownpourCtrDoubleAccessor() {} virtual ~DownpourCtrDoubleAccessor() {} - virtual int initialize(); + virtual int Initialize(); virtual void SetTableInfo(AccessorInfo& info); virtual size_t GetTableInfo(InfoKey key); // value维度 - virtual size_t dim(); + size_t Dim(); // value各个维度的size - virtual size_t dim_size(size_t dim); + size_t DimSize(size_t dim); // value各维度相加总size - virtual size_t size(); + size_t Size(); // value中mf动态长度部分总size大小, sparse下生效 - virtual size_t mf_size(); + size_t MFSize(); // pull value维度 - virtual size_t select_dim(); + size_t SelectDim(); // pull value各个维度的size - virtual size_t select_dim_size(size_t dim); + size_t SelectDimSize(size_t dim); // pull value各维度相加总size - virtual size_t select_size(); + size_t SelectSize(); // push value维度 - virtual size_t update_dim(); + size_t UpdateDim(); // push value各个维度的size - virtual size_t update_dim_size(size_t dim); + size_t UpdateDimSize(size_t dim); // push value各维度相加总size - virtual size_t update_size(); + size_t UpdateSize(); // 判断该value是否进行shrink - virtual bool shrink(float* value); - virtual bool need_extend_mf(float* value); + virtual bool Shrink(float* value); + virtual bool NeedExtendMF(float* value); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature // param = 1, save delta feature // param = 3, save all feature with time decay - virtual bool save(float* value, int param) override; + virtual bool Save(float* value, int param) override; // update delta_score and unseen_days after save - virtual void update_stat_after_save(float* value, int param) override; + virtual void 
UpdateStatAfterSave(float* value, int param) override; // 判断该value是否保存到ssd virtual bool save_ssd(float* value); // virtual bool save_cache(float* value, int param, double // global_cache_threshold) override; // keys不存在时,为values生成随机值 // 要求value的内存由外部调用者分配完毕 - virtual int32_t create(float** value, size_t num); + virtual int32_t Create(float** value, size_t num); // 从values中选取到select_values中 - virtual int32_t select(float** select_values, const float** values, + virtual int32_t Select(float** select_values, const float** values, size_t num); // 将update_values聚合到一起 - virtual int32_t merge(float** update_values, + virtual int32_t Merge(float** update_values, const float** other_update_values, size_t num); // 将update_values聚合到一起,通过it.next判定是否进入下一个key - // virtual int32_t merge(float** update_values, iterator it); + // virtual int32_t Merge(float** update_values, iterator it); // 将update_values更新应用到values中 - virtual int32_t update(float** values, const float** update_values, + virtual int32_t Update(float** values, const float** update_values, size_t num); - virtual std::string parse_to_string(const float* value, int param) override; - virtual int32_t parse_from_string(const std::string& str, float* v) override; - virtual bool create_value(int type, const float* value); + virtual std::string ParseToString(const float* value, int param) override; + virtual int32_t ParseFromString(const std::string& str, float* v) override; + virtual bool CreateValue(int type, const float* value); //这个接口目前只用来取show - virtual float get_field(float* value, const std::string& name) override { + virtual float GetField(float* value, const std::string& name) override { CHECK(name == "show"); if (name == "show") { - return (float)DownpourCtrDoubleFeatureValue::show(value); + return (float)DownpourCtrDoubleFeatureValue::Show(value); } return 0.0; } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc index 2fff81b1a4dc6..1140afd1c1e09 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc @@ -20,7 +20,7 @@ namespace paddle { namespace distributed { -int DownpourCtrAccessor::initialize() { +int DownpourCtrAccessor::Initialize() { auto name = _config.embed_sgd_param().name(); _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); @@ -38,86 +38,77 @@ int DownpourCtrAccessor::initialize() { } void DownpourCtrAccessor::SetTableInfo(AccessorInfo& info) { - info.dim = dim(); - info.size = size(); - info.select_dim = select_dim(); - info.select_size = select_size(); - info.update_dim = update_dim(); - info.update_size = update_size(); - info.mf_size = mf_size(); - info.fea_dim = fea_dim(); + info.dim = Dim(); + info.size = Size(); + info.select_dim = SelectDim(); + info.select_size = SelectSize(); + info.update_dim = UpdateDim(); + info.update_size = UpdateSize(); + info.mf_size = MFSize(); } size_t DownpourCtrAccessor::GetTableInfo(InfoKey key) { switch (key) { case DIM: - return dim(); + return Dim(); case SIZE: - return size(); + return Size(); case SELECT_DIM: - return select_dim(); + return SelectDim(); case SELECT_SIZE: - return select_size(); + return SelectSize(); case UPDATE_DIM: - return update_dim(); + return UpdateDim(); case UPDATE_SIZE: - return update_size(); + return UpdateSize(); case MF_SIZE: - return mf_size(); - case FEA_DIM: - return fea_dim(); + return MFSize(); + default: + return 
0; } return 0; } -size_t DownpourCtrAccessor::dim() { +size_t DownpourCtrAccessor::Dim() { auto embedx_dim = _config.embedx_dim(); - return DownpourCtrFeatureValue::dim(embedx_dim); + return DownpourCtrFeatureValue::Dim(embedx_dim); } -size_t DownpourCtrAccessor::dim_size(size_t dim) { +size_t DownpourCtrAccessor::DimSize(size_t dim) { auto embedx_dim = _config.embedx_dim(); - return DownpourCtrFeatureValue::dim_size(dim, embedx_dim); + return DownpourCtrFeatureValue::DimSize(dim, embedx_dim); } -size_t DownpourCtrAccessor::size() { +size_t DownpourCtrAccessor::Size() { auto embedx_dim = _config.embedx_dim(); - return DownpourCtrFeatureValue::size(embedx_dim); + return DownpourCtrFeatureValue::Size(embedx_dim); } -size_t DownpourCtrAccessor::mf_size() { +size_t DownpourCtrAccessor::MFSize() { return (_config.embedx_dim() + 1) * sizeof(float); // embedx embedx_g2sum } // pull value -size_t DownpourCtrAccessor::select_dim() { +size_t DownpourCtrAccessor::SelectDim() { auto embedx_dim = _config.embedx_dim(); return 3 + embedx_dim; } -size_t DownpourCtrAccessor::select_dim_size(size_t dim) { - return sizeof(float); -} +size_t DownpourCtrAccessor::SelectDimSize(size_t dim) { return sizeof(float); } -size_t DownpourCtrAccessor::select_size() { - return select_dim() * sizeof(float); -} +size_t DownpourCtrAccessor::SelectSize() { return SelectDim() * sizeof(float); } // push value -size_t DownpourCtrAccessor::update_dim() { +size_t DownpourCtrAccessor::UpdateDim() { auto embedx_dim = _config.embedx_dim(); return 4 + embedx_dim; } -size_t DownpourCtrAccessor::update_dim_size(size_t dim) { - return sizeof(float); -} +size_t DownpourCtrAccessor::UpdateDimSize(size_t dim) { return sizeof(float); } -size_t DownpourCtrAccessor::update_size() { - return update_dim() * sizeof(float); -} +size_t DownpourCtrAccessor::UpdateSize() { return UpdateDim() * sizeof(float); } -bool DownpourCtrAccessor::shrink(float* value) { +bool DownpourCtrAccessor::Shrink(float* value) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); // auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); @@ -134,9 +125,9 @@ bool DownpourCtrAccessor::shrink(float* value) { return true; } auto show_right = - DownpourCtrFeatureValue::show(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Show(value) * _time_decay_rates[day_diff]; auto click_right = - DownpourCtrFeatureValue::click(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Click(value) * _time_decay_rates[day_diff]; // shrink after auto score = show_click_score(show_right, click_right); @@ -175,15 +166,15 @@ bool DownpourCtrAccessor::save_ssd(float* value) { // auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); // auto unseen_days = DownpourCtrFeatureValue::unseen_days(value); // int16_t day_diff = _day_id - unseen_days; -// if (show_click_score(DownpourCtrFeatureValue::show(value), -// DownpourCtrFeatureValue::click(value)) >= base_threshold +// if (show_click_score(DownpourCtrFeatureValue::Show(value), +// DownpourCtrFeatureValue::Click(value)) >= base_threshold // && day_diff <= delta_keep_days) { -// return DownpourCtrFeatureValue::show(value) > global_cache_threshold; +// return DownpourCtrFeatureValue::Show(value) > global_cache_threshold; // } // return false; // } -bool DownpourCtrAccessor::save(float* value, int param) { +bool DownpourCtrAccessor::Save(float* value, int param) { // auto base_threshold = 
_config.ctr_accessor_param().base_threshold(); // auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); // auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -206,9 +197,9 @@ bool DownpourCtrAccessor::save(float* value, int param) { int16_t day_diff = _day_id - unseen_days; auto show_right = - DownpourCtrFeatureValue::show(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Show(value) * _time_decay_rates[day_diff]; auto click_right = - DownpourCtrFeatureValue::click(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Click(value) * _time_decay_rates[day_diff]; if (show_click_score(show_right, click_right) >= base_threshold && DownpourCtrFeatureValue::delta_score(value) >= delta_threshold && @@ -224,8 +215,8 @@ bool DownpourCtrAccessor::save(float* value, int param) { } // already decayed in shrink case 3: { - // DownpourCtrFeatureValue::show(value) *= _show_click_decay_rate; - // DownpourCtrFeatureValue::click(value) *= _show_click_decay_rate; + // DownpourCtrFeatureValue::Show(value) *= _show_click_decay_rate; + // DownpourCtrFeatureValue::Click(value) *= _show_click_decay_rate; // do this after save, because it must not be modified when retry // DownpourCtrFeatureValue::unseen_days(value)++; return true; @@ -235,7 +226,7 @@ bool DownpourCtrAccessor::save(float* value, int param) { }; } -void DownpourCtrAccessor::update_stat_after_save(float* value, int param) { +void DownpourCtrAccessor::UpdateStatAfterSave(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -247,9 +238,9 @@ void DownpourCtrAccessor::update_stat_after_save(float* value, int param) { auto unseen_days = DownpourCtrFeatureValue::unseen_days(value); int16_t day_diff = _day_id - unseen_days; auto show_right = - DownpourCtrFeatureValue::show(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Show(value) * _time_decay_rates[day_diff]; auto click_right = - DownpourCtrFeatureValue::click(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Click(value) * _time_decay_rates[day_diff]; if (show_click_score(show_right, click_right) >= base_threshold && DownpourCtrFeatureValue::delta_score(value) >= delta_threshold && @@ -268,28 +259,28 @@ void DownpourCtrAccessor::update_stat_after_save(float* value, int param) { }; } -int32_t DownpourCtrAccessor::create(float** values, size_t num) { +int32_t DownpourCtrAccessor::Create(float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* value = values[value_item]; value[DownpourCtrFeatureValue::unseen_days_index()] = 0; value[DownpourCtrFeatureValue::delta_score_index()] = 0; - value[DownpourCtrFeatureValue::show_index()] = 0; - value[DownpourCtrFeatureValue::click_index()] = 0; - value[DownpourCtrFeatureValue::slot_index()] = -1; + value[DownpourCtrFeatureValue::ShowIndex()] = 0; + value[DownpourCtrFeatureValue::ClickIndex()] = 0; + value[DownpourCtrFeatureValue::SlotIndex()] = -1; _embed_sgd_rule->init_value( - value + DownpourCtrFeatureValue::embed_w_index(), + value + DownpourCtrFeatureValue::Embed_W_Index(), value + DownpourCtrFeatureValue::embed_g2sum_index(), true); _embedx_sgd_rule->init_value( - value + DownpourCtrFeatureValue::embedx_w_index(), + value + DownpourCtrFeatureValue::Embedx_W_Index(), value + 
DownpourCtrFeatureValue::embedx_g2sum_index()); } return 0; } -bool DownpourCtrAccessor::need_extend_mf(float* value) { - float show = value[DownpourCtrFeatureValue::show_index()]; - float click = value[DownpourCtrFeatureValue::click_index()]; +bool DownpourCtrAccessor::NeedExtendMF(float* value) { + float show = value[DownpourCtrFeatureValue::ShowIndex()]; + float click = value[DownpourCtrFeatureValue::ClickIndex()]; // float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + click * _config.ctr_accessor_param().click_coeff(); @@ -297,25 +288,25 @@ bool DownpourCtrAccessor::need_extend_mf(float* value) { return score >= _config.embedx_threshold(); } -bool DownpourCtrAccessor::has_mf(size_t size) { +bool DownpourCtrAccessor::HasMF(size_t size) { return size > DownpourCtrFeatureValue::embedx_g2sum_index(); } // from DownpourCtrFeatureValue to DownpourCtrPullValue -int32_t DownpourCtrAccessor::select(float** select_values, const float** values, +int32_t DownpourCtrAccessor::Select(float** select_values, const float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* select_value = select_values[value_item]; float* value = const_cast(values[value_item]); - select_value[DownpourCtrPullValue::show_index()] = - value[DownpourCtrFeatureValue::show_index()]; - select_value[DownpourCtrPullValue::click_index()] = - value[DownpourCtrFeatureValue::click_index()]; - select_value[DownpourCtrPullValue::embed_w_index()] = - value[DownpourCtrFeatureValue::embed_w_index()]; - memcpy(select_value + DownpourCtrPullValue::embedx_w_index(), - value + DownpourCtrFeatureValue::embedx_w_index(), + select_value[DownpourCtrPullValue::ShowIndex()] = + value[DownpourCtrFeatureValue::ShowIndex()]; + select_value[DownpourCtrPullValue::ClickIndex()] = + value[DownpourCtrFeatureValue::ClickIndex()]; + select_value[DownpourCtrPullValue::Embed_W_Index()] = + value[DownpourCtrFeatureValue::Embed_W_Index()]; + memcpy(select_value + DownpourCtrPullValue::Embedx_W_Index(), + value + DownpourCtrFeatureValue::Embedx_W_Index(), embedx_dim * sizeof(float)); } return 0; @@ -324,16 +315,16 @@ int32_t DownpourCtrAccessor::select(float** select_values, const float** values, // from DownpourCtrPushValue to DownpourCtrPushValue // first dim: item // second dim: field num -int32_t DownpourCtrAccessor::merge(float** update_values, +int32_t DownpourCtrAccessor::Merge(float** update_values, const float** other_update_values, size_t num) { auto embedx_dim = _config.embedx_dim(); - size_t total_dim = DownpourCtrPushValue::dim(embedx_dim); + size_t total_dim = DownpourCtrPushValue::Dim(embedx_dim); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; for (auto i = 0u; i < total_dim; ++i) { - if (i != DownpourCtrPushValue::slot_index()) { + if (i != DownpourCtrPushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } } @@ -344,18 +335,18 @@ int32_t DownpourCtrAccessor::merge(float** update_values, // from DownpourCtrPushValue to DownpourCtrFeatureValue // first dim: item // second dim: field num -int32_t DownpourCtrAccessor::update(float** update_values, +int32_t DownpourCtrAccessor::Update(float** update_values, const float** push_values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; 
++value_item) { float* update_value = update_values[value_item]; const float* push_value = push_values[value_item]; - float push_show = push_value[DownpourCtrPushValue::show_index()]; - float push_click = push_value[DownpourCtrPushValue::click_index()]; - float slot = push_value[DownpourCtrPushValue::slot_index()]; - update_value[DownpourCtrFeatureValue::show_index()] += push_show; - update_value[DownpourCtrFeatureValue::click_index()] += push_click; - update_value[DownpourCtrFeatureValue::slot_index()] = slot; + float push_show = push_value[DownpourCtrPushValue::ShowIndex()]; + float push_click = push_value[DownpourCtrPushValue::ClickIndex()]; + float slot = push_value[DownpourCtrPushValue::SlotIndex()]; + update_value[DownpourCtrFeatureValue::ShowIndex()] += push_show; + update_value[DownpourCtrFeatureValue::ClickIndex()] += push_click; + update_value[DownpourCtrFeatureValue::SlotIndex()] = slot; update_value[DownpourCtrFeatureValue::delta_score_index()] += (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + push_click * _config.ctr_accessor_param().click_coeff(); @@ -363,25 +354,25 @@ int32_t DownpourCtrAccessor::update(float** update_values, // push_click * _config.ctr_accessor_param().click_coeff(); update_value[DownpourCtrFeatureValue::unseen_days_index()] = 0; _embed_sgd_rule->update_value( - update_value + DownpourCtrFeatureValue::embed_w_index(), + update_value + DownpourCtrFeatureValue::Embed_W_Index(), update_value + DownpourCtrFeatureValue::embed_g2sum_index(), - push_value + DownpourCtrPushValue::embed_g_index(), push_show); + push_value + DownpourCtrPushValue::Embed_G_Index(), push_show); _embedx_sgd_rule->update_value( - update_value + DownpourCtrFeatureValue::embedx_w_index(), + update_value + DownpourCtrFeatureValue::Embedx_W_Index(), update_value + DownpourCtrFeatureValue::embedx_g2sum_index(), - push_value + DownpourCtrPushValue::embedx_g_index(), push_show); + push_value + DownpourCtrPushValue::Embedx_G_Index(), push_show); } return 0; } -bool DownpourCtrAccessor::create_value(int stage, const float* value) { +bool DownpourCtrAccessor::CreateValue(int stage, const float* value) { // stage == 0, pull // stage == 1, push if (stage == 0) { return true; } else if (stage == 1) { - auto show = DownpourCtrPushValue::show(const_cast(value)); - auto click = DownpourCtrPushValue::click(const_cast(value)); + auto show = DownpourCtrPushValue::Show(const_cast(value)); + auto click = DownpourCtrPushValue::Click(const_cast(value)); auto score = show_click_score(show, click); if (score <= 0) { return false; @@ -404,15 +395,14 @@ float DownpourCtrAccessor::show_click_score(float show, float click) { return (show - click) * nonclk_coeff + click * click_coeff; } -std::string DownpourCtrAccessor::parse_to_string(const float* v, - int param_size) { +std::string DownpourCtrAccessor::ParseToString(const float* v, int param_size) { thread_local std::ostringstream os; os.clear(); os.str(""); os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " " << v[5] << " " << v[6]; - auto show = DownpourCtrFeatureValue::show(const_cast(v)); - auto click = DownpourCtrFeatureValue::click(const_cast(v)); + auto show = DownpourCtrFeatureValue::Show(const_cast(v)); + auto click = DownpourCtrFeatureValue::Click(const_cast(v)); auto score = show_click_score(show, click); if (score >= _config.embedx_threshold() && param_size > 7) { os << " " << v[7]; @@ -423,22 +413,21 @@ std::string DownpourCtrAccessor::parse_to_string(const float* v, return os.str(); } -int 
DownpourCtrAccessor::parse_from_string(const std::string& str, - float* value) { +int DownpourCtrAccessor::ParseFromString(const std::string& str, float* value) { int embedx_dim = _config.embedx_dim(); - float data_buff[dim()]; + float data_buff[Dim()]; float* data_buff_ptr = data_buff; _embedx_sgd_rule->init_value( - data_buff_ptr + DownpourCtrFeatureValue::embedx_w_index(), + data_buff_ptr + DownpourCtrFeatureValue::Embedx_W_Index(), data_buff_ptr + DownpourCtrFeatureValue::embedx_g2sum_index()); auto str_len = paddle::string::str_to_float(str.data(), data_buff_ptr); CHECK(str_len >= 6) << "expect more than 6 real:" << str_len; // no slot, embedx - int value_dim = dim(); + int value_dim = Dim(); int embedx_g2sum_index = DownpourCtrFeatureValue::embedx_g2sum_index(); - value[DownpourCtrFeatureValue::slot_index()] = -1; + value[DownpourCtrFeatureValue::SlotIndex()] = -1; // other case if (str_len == (value_dim - 1)) { memcpy(value, data_buff_ptr, (embedx_g2sum_index - 1) * sizeof(float)); @@ -494,8 +483,8 @@ void DownpourCtrAccessor::update_time_decay(float* value, if (day_diff >= _config.ctr_accessor_param().delete_after_unseen_days()) { return; } - DownpourCtrFeatureValue::show(value) *= _time_decay_rates[day_diff]; - DownpourCtrFeatureValue::click(value) *= _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Show(value) *= _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Click(value) *= _time_decay_rates[day_diff]; if (is_update_seen_day) { DownpourCtrFeatureValue::unseen_days(value) = _day_id; } diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h index 6ff6c0438310e..de1f080f42e1f 100644 --- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h @@ -42,32 +42,30 @@ class DownpourCtrAccessor : public ValueAccessor { std::vector embedx_w; */ - static int dim(int embedx_dim) { return 8 + embedx_dim; } - static int dim_size(size_t dim, int embedx_dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } + static int Dim(int embedx_dim) { return 8 + embedx_dim; } + static int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } static int unseen_days_index() { return 0; } static int delta_score_index() { return DownpourCtrFeatureValue::unseen_days_index() + 1; } - static int show_index() { + static int ShowIndex() { return DownpourCtrFeatureValue::delta_score_index() + 1; } - static int click_index() { - return DownpourCtrFeatureValue::show_index() + 1; - } - static int embed_w_index() { - return DownpourCtrFeatureValue::click_index() + 1; + static int ClickIndex() { return DownpourCtrFeatureValue::ShowIndex() + 1; } + static int Embed_W_Index() { + return DownpourCtrFeatureValue::ClickIndex() + 1; } static int embed_g2sum_index() { - return DownpourCtrFeatureValue::embed_w_index() + 1; + return DownpourCtrFeatureValue::Embed_W_Index() + 1; } - static int slot_index() { + static int SlotIndex() { return DownpourCtrFeatureValue::embed_g2sum_index() + 1; } static int embedx_g2sum_index() { - return DownpourCtrFeatureValue::slot_index() + 1; + return DownpourCtrFeatureValue::SlotIndex() + 1; } - static int embedx_w_index() { + static int Embedx_W_Index() { return DownpourCtrFeatureValue::embedx_g2sum_index() + 1; } static float& unseen_days(float* val) { @@ -76,17 +74,17 @@ class DownpourCtrAccessor : 
public ValueAccessor { static float& delta_score(float* val) { return val[DownpourCtrFeatureValue::delta_score_index()]; } - static float& show(float* val) { - return val[DownpourCtrFeatureValue::show_index()]; + static float& Show(float* val) { + return val[DownpourCtrFeatureValue::ShowIndex()]; } - static float& click(float* val) { - return val[DownpourCtrFeatureValue::click_index()]; + static float& Click(float* val) { + return val[DownpourCtrFeatureValue::ClickIndex()]; } - static float& slot(float* val) { - return val[DownpourCtrFeatureValue::slot_index()]; + static float& Slot(float* val) { + return val[DownpourCtrFeatureValue::SlotIndex()]; } - static float& embed_w(float* val) { - return val[DownpourCtrFeatureValue::embed_w_index()]; + static float& EmbedW(float* val) { + return val[DownpourCtrFeatureValue::Embed_W_Index()]; } static float& embed_g2sum(float* val) { return val[DownpourCtrFeatureValue::embed_g2sum_index()]; @@ -94,8 +92,8 @@ class DownpourCtrAccessor : public ValueAccessor { static float& embedx_g2sum(float* val) { return val[DownpourCtrFeatureValue::embedx_g2sum_index()]; } - static float* embedx_w(float* val) { - return (val + DownpourCtrFeatureValue::embedx_w_index()); + static float* EmbedxW(float* val) { + return (val + DownpourCtrFeatureValue::Embedx_W_Index()); } }; @@ -108,24 +106,24 @@ class DownpourCtrAccessor : public ValueAccessor { std::vector embedx_g; */ - static int dim(int embedx_dim) { return 4 + embedx_dim; } + static int Dim(int embedx_dim) { return 4 + embedx_dim; } - static int dim_size(int dim, int embedx_dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int slot_index() { return 0; } - static int show_index() { return DownpourCtrPushValue::slot_index() + 1; } - static int click_index() { return DownpourCtrPushValue::show_index() + 1; } - static int embed_g_index() { - return DownpourCtrPushValue::click_index() + 1; - } - static int embedx_g_index() { - return DownpourCtrPushValue::embed_g_index() + 1; - } - static float& slot(float* val) { return val[0]; } - static float& show(float* val) { return val[1]; } - static float& click(float* val) { return val[2]; } - static float& embed_g(float* val) { return val[3]; } - static float* embedx_g(float* val) { return val + 4; } + static int DimSize(int dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int SlotIndex() { return 0; } + static int ShowIndex() { return DownpourCtrPushValue::SlotIndex() + 1; } + static int ClickIndex() { return DownpourCtrPushValue::ShowIndex() + 1; } + static int Embed_G_Index() { + return DownpourCtrPushValue::ClickIndex() + 1; + } + static int Embedx_G_Index() { + return DownpourCtrPushValue::Embed_G_Index() + 1; + } + static float& Slot(float* val) { return val[0]; } + static float& Show(float* val) { return val[1]; } + static float& Click(float* val) { return val[2]; } + static float& EmbedG(float* val) { return val[3]; } + static float* EmbedxG(float* val) { return val + 4; } }; struct DownpourCtrPullValue { @@ -136,95 +134,95 @@ class DownpourCtrAccessor : public ValueAccessor { std::vector embedx_w; */ - static int dim(int embedx_dim) { return 3 + embedx_dim; } - static int dim_size(size_t dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int show_index() { return 0; } - static int click_index() { return 1; } - static int embed_w_index() { return 2; 
} - static int embedx_w_index() { return 3; } - static float& show(float* val) { - return val[DownpourCtrPullValue::show_index()]; + static int Dim(int embedx_dim) { return 3 + embedx_dim; } + static int DimSize(size_t dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int ShowIndex() { return 0; } + static int ClickIndex() { return 1; } + static int Embed_W_Index() { return 2; } + static int Embedx_W_Index() { return 3; } + static float& Show(float* val) { + return val[DownpourCtrPullValue::ShowIndex()]; } - static float& click(float* val) { - return val[DownpourCtrPullValue::click_index()]; + static float& Click(float* val) { + return val[DownpourCtrPullValue::ClickIndex()]; } - static float& embed_w(float* val) { - return val[DownpourCtrPullValue::embed_w_index()]; + static float& EmbedW(float* val) { + return val[DownpourCtrPullValue::Embed_W_Index()]; } - static float* embedx_w(float* val) { - return val + DownpourCtrPullValue::embedx_w_index(); + static float* EmbedxW(float* val) { + return val + DownpourCtrPullValue::Embedx_W_Index(); } }; DownpourCtrAccessor() {} virtual ~DownpourCtrAccessor() {} - virtual int initialize(); + virtual int Initialize(); virtual void SetTableInfo(AccessorInfo& info); virtual size_t GetTableInfo(InfoKey key); // value维度 - virtual size_t dim(); + size_t Dim(); // value各个维度的size - virtual size_t dim_size(size_t dim); + size_t DimSize(size_t dim); // value各维度相加总size - virtual size_t size(); + size_t Size(); // value中mf动态长度部分总size大小, sparse下生效 - virtual size_t mf_size(); + size_t MFSize(); // pull value维度 - virtual size_t select_dim(); + size_t SelectDim(); // pull value各个维度的size - virtual size_t select_dim_size(size_t dim); + size_t SelectDimSize(size_t dim); // pull value各维度相加总size - virtual size_t select_size(); + size_t SelectSize(); // push value维度 - virtual size_t update_dim(); + size_t UpdateDim(); // push value各个维度的size - virtual size_t update_dim_size(size_t dim); + size_t UpdateDimSize(size_t dim); // push value各维度相加总size - virtual size_t update_size(); + size_t UpdateSize(); // 判断该value是否进行shrink - virtual bool shrink(float* value); + virtual bool Shrink(float* value); // 判断该value是否保存到ssd virtual bool save_ssd(float* value); - virtual bool need_extend_mf(float* value); - virtual bool has_mf(size_t size); + virtual bool NeedExtendMF(float* value); + virtual bool HasMF(size_t size); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature // param = 1, save delta feature // param = 3, save all feature with time decay - virtual bool save(float* value, int param) override; + virtual bool Save(float* value, int param) override; // update delta_score and unseen_days after save - virtual void update_stat_after_save(float* value, int param) override; + virtual void UpdateStatAfterSave(float* value, int param) override; // virtual bool save_cache(float* value, int param, double // global_cache_threshold) override; // keys不存在时,为values生成随机值 // 要求value的内存由外部调用者分配完毕 - virtual int32_t create(float** value, size_t num); + virtual int32_t Create(float** value, size_t num); // 从values中选取到select_values中 - virtual int32_t select(float** select_values, const float** values, + virtual int32_t Select(float** select_values, const float** values, size_t num); // 将update_values聚合到一起 - virtual int32_t merge(float** update_values, + virtual int32_t Merge(float** update_values, const float** other_update_values, size_t num); // 
将update_values聚合到一起,通过it.next判定是否进入下一个key - // virtual int32_t merge(float** update_values, iterator it); + // virtual int32_t Merge(float** update_values, iterator it); // 将update_values更新应用到values中 - virtual int32_t update(float** values, const float** update_values, + virtual int32_t Update(float** values, const float** update_values, size_t num); - virtual std::string parse_to_string(const float* value, int param) override; - virtual int32_t parse_from_string(const std::string& str, float* v) override; - virtual bool create_value(int type, const float* value); + virtual std::string ParseToString(const float* value, int param) override; + virtual int32_t ParseFromString(const std::string& str, float* v) override; + virtual bool CreateValue(int type, const float* value); //这个接口目前只用来取show - virtual float get_field(float* value, const std::string& name) override { + virtual float GetField(float* value, const std::string& name) override { CHECK(name == "show"); if (name == "show") { auto unseen_days = DownpourCtrFeatureValue::unseen_days(value); int16_t day_diff = _day_id - unseen_days; auto show_right = - DownpourCtrFeatureValue::show(value) * _time_decay_rates[day_diff]; + DownpourCtrFeatureValue::Show(value) * _time_decay_rates[day_diff]; return (float)show_right; } return 0.0; diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 3f5c484eab825..61ea2f8f2007e 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -99,9 +99,9 @@ int32_t MemorySparseTable::load(const std::string& path, channel_config.path = file_list[file_start_idx + i]; VLOG(1) << "MemorySparseTable::load begin load " << channel_config.path << " into local shard " << i; - channel_config.converter = _value_accesor->converter(load_param).converter; + channel_config.converter = _value_accesor->Converter(load_param).converter; channel_config.deconverter = - _value_accesor->converter(load_param).deconverter; + _value_accesor->Converter(load_param).deconverter; bool is_read_failed = false; int retry_num = 0; @@ -119,8 +119,7 @@ int32_t MemorySparseTable::load(const std::string& path, uint64_t key = std::strtoul(line_data.data(), &end, 10); auto& value = shard[key]; value.resize(feature_value_size); - int parse_size = - _value_accesor->parse_from_string(++end, value.data()); + int parse_size = _value_accesor->ParseFromString(++end, value.data()); value.resize(parse_size); // for debug @@ -196,8 +195,7 @@ int32_t MemorySparseTable::load_local_fs(const std::string& path, uint64_t key = std::strtoul(line_data.data(), &end, 10); auto& value = shard[key]; value.resize(feature_value_size); - int parse_size = - _value_accesor->parse_from_string(++end, value.data()); + int parse_size = _value_accesor->ParseFromString(++end, value.data()); value.resize(parse_size); } file.close(); @@ -253,9 +251,9 @@ int32_t MemorySparseTable::save(const std::string& dirname, paddle::string::format_string("%s/part-%03d-%05d", table_path.c_str(), _shard_idx, file_start_idx + i); } - channel_config.converter = _value_accesor->converter(save_param).converter; + channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = - _value_accesor->converter(save_param).deconverter; + _value_accesor->Converter(save_param).deconverter; bool is_write_failed = false; int feasign_size = 0; int retry_num = 0; @@ -268,8 +266,8 @@ int32_t MemorySparseTable::save(const 
std::string& dirname, auto write_channel = _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); for (auto it = shard.begin(); it != shard.end(); ++it) { - if (_value_accesor->save(it.value().data(), save_param)) { - std::string format_value = _value_accesor->parse_to_string( + if (_value_accesor->Save(it.value().data(), save_param)) { + std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); if (0 != write_channel->write_line(paddle::string::format_string( @@ -302,7 +300,7 @@ int32_t MemorySparseTable::save(const std::string& dirname, } while (is_write_failed); feasign_size_all += feasign_size; for (auto it = shard.begin(); it != shard.end(); ++it) { - _value_accesor->update_stat_after_save(it.value().data(), save_param); + _value_accesor->UpdateStatAfterSave(it.value().data(), save_param); } LOG(INFO) << "MemorySparseTable save prefix success, path: " << channel_config.path; @@ -334,9 +332,9 @@ int32_t MemorySparseTable::save_local_fs(const std::string& dirname, std::ofstream os; os.open(file_name); for (auto it = shard.begin(); it != shard.end(); ++it) { - if (_value_accesor->save(it.value().data(), save_param)) { - std::string format_value = _value_accesor->parse_to_string( - it.value().data(), it.value().size()); + if (_value_accesor->Save(it.value().data(), save_param)) { + std::string format_value = + _value_accesor->ParseToString(it.value().data(), it.value().size()); std::string out_line = paddle::string::format_string( "%lu %s\n", it.key(), format_value.c_str()); // VLOG(2) << out_line.c_str(); @@ -370,7 +368,7 @@ int64_t MemorySparseTable::local_mf_size() { auto& local_shard = _local_shards[shard_id]; for (auto it = local_shard.begin(); it != local_shard.end(); ++it) { - if (_value_accesor->has_mf(it.value().size())) { + if (_value_accesor->HasMF(it.value().size())) { size_arr[shard_id] += 1; } } @@ -453,7 +451,7 @@ int32_t MemorySparseTable::pull_sparse(float* pull_values, auto& feature_value = local_shard[key]; feature_value.resize(data_size); float* data_ptr = feature_value.data(); - _value_accesor->create(&data_buffer_ptr, 1); + _value_accesor->Create(&data_buffer_ptr, 1); memcpy(data_ptr, data_buffer_ptr, data_size * sizeof(float)); } @@ -467,7 +465,7 @@ int32_t MemorySparseTable::pull_sparse(float* pull_values, } auto offset = keys[i].second; float* select_data = pull_values + select_value_size * offset; - _value_accesor->select(&select_data, + _value_accesor->Select(&select_data, (const float**)&data_buffer_ptr, 1); } @@ -484,8 +482,8 @@ int32_t MemorySparseTable::pull_sparse(float* pull_values, int32_t MemorySparseTable::pull_sparse_ptr(char** pull_values, const uint64_t* keys, size_t num) { CostTimer timer("pscore_sparse_select_all"); - size_t value_size = _value_accesor->size() / sizeof(float); - size_t mf_value_size = _value_accesor->mf_size() / sizeof(float); + size_t value_size = _value_accesor->GetTableInfo(SIZE) / sizeof(float); + size_t mf_value_size = _value_accesor->GetTableInfo(MF_SIZE) / sizeof(float); std::vector> tasks(_real_local_shard_num); std::vector>> task_keys( @@ -514,7 +512,7 @@ int32_t MemorySparseTable::pull_sparse_ptr(char** pull_values, auto& feature_value = local_shard[key]; feature_value.resize(data_size); float* data_ptr = feature_value.data(); - _value_accesor->create(&data_buffer_ptr, 1); + _value_accesor->Create(&data_buffer_ptr, 1); memcpy(data_ptr, data_buffer_ptr, data_size * sizeof(float)); ret = &feature_value; } else { @@ -564,13 +562,13 @@ int32_t MemorySparseTable::push_sparse(const 
uint64_t* keys, auto itr = local_shard.find(key); if (itr == local_shard.end()) { if (FLAGS_pserver_enable_create_feasign_randomly && - !_value_accesor->create_value(1, update_data)) { + !_value_accesor->CreateValue(1, update_data)) { continue; } auto value_size = value_col - mf_value_col; auto& feature_value = local_shard[key]; feature_value.resize(value_size); - _value_accesor->create(&data_buffer_ptr, 1); + _value_accesor->Create(&data_buffer_ptr, 1); memcpy(feature_value.data(), data_buffer_ptr, value_size * sizeof(float)); itr = local_shard.find(key); @@ -581,16 +579,16 @@ int32_t MemorySparseTable::push_sparse(const uint64_t* keys, size_t value_size = feature_value.size(); if (value_size == value_col) { // 已拓展到最大size, 则就地update - _value_accesor->update(&value_data, &update_data, 1); + _value_accesor->Update(&value_data, &update_data, 1); } else { // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 memcpy(data_buffer_ptr, value_data, value_size * sizeof(float)); - _value_accesor->update(&data_buffer_ptr, &update_data, 1); + _value_accesor->Update(&data_buffer_ptr, &update_data, 1); - if (_value_accesor->need_extend_mf(data_buffer)) { + if (_value_accesor->NeedExtendMF(data_buffer)) { feature_value.resize(value_col); value_data = feature_value.data(); - _value_accesor->create(&value_data, 1); + _value_accesor->Create(&value_data, 1); } memcpy(value_data, data_buffer_ptr, value_size * sizeof(float)); } @@ -641,13 +639,13 @@ int32_t MemorySparseTable::_push_sparse(const uint64_t* keys, auto itr = local_shard.find(key); if (itr == local_shard.end()) { if (FLAGS_pserver_enable_create_feasign_randomly && - !_value_accesor->create_value(1, update_data)) { + !_value_accesor->CreateValue(1, update_data)) { continue; } auto value_size = value_col - mf_value_col; auto& feature_value = local_shard[key]; feature_value.resize(value_size); - _value_accesor->create(&data_buffer_ptr, 1); + _value_accesor->Create(&data_buffer_ptr, 1); memcpy(feature_value.data(), data_buffer_ptr, value_size * sizeof(float)); itr = local_shard.find(key); @@ -656,15 +654,15 @@ int32_t MemorySparseTable::_push_sparse(const uint64_t* keys, float* value_data = feature_value.data(); size_t value_size = feature_value.size(); if (value_size == value_col) { // 已拓展到最大size, 则就地update - _value_accesor->update(&value_data, &update_data, 1); + _value_accesor->Update(&value_data, &update_data, 1); } else { // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 memcpy(data_buffer_ptr, value_data, value_size * sizeof(float)); - _value_accesor->update(&data_buffer_ptr, &update_data, 1); - if (_value_accesor->need_extend_mf(data_buffer)) { + _value_accesor->Update(&data_buffer_ptr, &update_data, 1); + if (_value_accesor->NeedExtendMF(data_buffer)) { feature_value.resize(value_col); value_data = feature_value.data(); - _value_accesor->create(&value_data, 1); + _value_accesor->Create(&value_data, 1); } memcpy(value_data, data_buffer_ptr, value_size * sizeof(float)); } @@ -688,7 +686,7 @@ int32_t MemorySparseTable::shrink(const std::string& param) { // shrink auto& shard = _local_shards[shard_id]; for (auto it = shard.begin(); it != shard.end();) { - if (_value_accesor->shrink(it.value().data())) { + if (_value_accesor->Shrink(it.value().data())) { it = shard.erase(it); } else { ++it; diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index 651ff9d00e49a..511b36389aaee 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ 
-20,7 +20,7 @@ namespace paddle { namespace distributed { -int SparseAccessor::initialize() { +int SparseAccessor::Initialize() { auto name = _config.embed_sgd_param().name(); _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); @@ -39,73 +39,72 @@ int SparseAccessor::initialize() { } void SparseAccessor::SetTableInfo(AccessorInfo& info) { - info.dim = dim(); - info.size = size(); - info.select_dim = select_dim(); - info.select_size = select_size(); - info.update_dim = update_dim(); - info.update_size = update_size(); - info.mf_size = mf_size(); - info.fea_dim = fea_dim(); + info.dim = Dim(); + info.size = Size(); + info.select_dim = SelectDim(); + info.select_size = SelectSize(); + info.update_dim = UpdateDim(); + info.update_size = UpdateSize(); + info.mf_size = MFSize(); } size_t SparseAccessor::GetTableInfo(InfoKey key) { switch (key) { case DIM: - return dim(); + return Dim(); case SIZE: - return size(); + return Size(); case SELECT_DIM: - return select_dim(); + return SelectDim(); case SELECT_SIZE: - return select_size(); + return SelectSize(); case UPDATE_DIM: - return update_dim(); + return UpdateDim(); case UPDATE_SIZE: - return update_size(); + return UpdateSize(); case MF_SIZE: - return mf_size(); - case FEA_DIM: - return fea_dim(); + return MFSize(); + default: + return 0; } return 0; } -size_t SparseAccessor::dim() { return sparse_feature_value.dim(); } +size_t SparseAccessor::Dim() { return sparse_feature_value.Dim(); } -size_t SparseAccessor::dim_size(size_t dim) { +size_t SparseAccessor::DimSize(size_t dim) { auto embedx_dim = _config.embedx_dim(); - return sparse_feature_value.dim_size(dim, embedx_dim); + return sparse_feature_value.DimSize(dim, embedx_dim); } -size_t SparseAccessor::size() { return sparse_feature_value.size(); } +size_t SparseAccessor::Size() { return sparse_feature_value.Size(); } -size_t SparseAccessor::mf_size() { +size_t SparseAccessor::MFSize() { return (_config.embedx_dim() + sparse_feature_value.embedx_sgd_dim) * sizeof(float); // embedx embedx_g2sum } // pull value -size_t SparseAccessor::select_dim() { +size_t SparseAccessor::SelectDim() { auto embedx_dim = _config.embedx_dim(); return 1 + embedx_dim; } -size_t SparseAccessor::select_dim_size(size_t dim) { return sizeof(float); } +size_t SparseAccessor::SelectDimSize(size_t dim) { return sizeof(float); } -size_t SparseAccessor::select_size() { return select_dim() * sizeof(float); } +size_t SparseAccessor::SelectSize() { return SelectDim() * sizeof(float); } // push value -size_t SparseAccessor::update_dim() { +size_t SparseAccessor::UpdateDim() { auto embedx_dim = _config.embedx_dim(); return 4 + embedx_dim; } -size_t SparseAccessor::update_dim_size(size_t dim) { return sizeof(float); } +size_t SparseAccessor::UpdateDimSize(size_t dim) { return sizeof(float); } -size_t SparseAccessor::update_size() { return update_dim() * sizeof(float); } +size_t SparseAccessor::UpdateSize() { return UpdateDim() * sizeof(float); } -bool SparseAccessor::shrink(float* value) { +bool SparseAccessor::Shrink(float* value) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delete_after_unseen_days = @@ -113,12 +112,12 @@ bool SparseAccessor::shrink(float* value) { auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); // time_decay first - sparse_feature_value.show(value) *= _show_click_decay_rate; - sparse_feature_value.click(value) *= 
_show_click_decay_rate; + sparse_feature_value.Show(value) *= _show_click_decay_rate; + sparse_feature_value.Click(value) *= _show_click_decay_rate; // shrink after - auto score = show_click_score(sparse_feature_value.show(value), - sparse_feature_value.click(value)); + auto score = show_click_score(sparse_feature_value.Show(value), + sparse_feature_value.Click(value)); auto unseen_days = sparse_feature_value.unseen_days(value); if (score < delete_threshold || unseen_days > delete_after_unseen_days) { return true; @@ -126,7 +125,7 @@ bool SparseAccessor::shrink(float* value) { return false; } -bool SparseAccessor::save(float* value, int param) { +bool SparseAccessor::Save(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -142,8 +141,8 @@ bool SparseAccessor::save(float* value, int param) { case 1: // save xbox base case 2: { - if (show_click_score(sparse_feature_value.show(value), - sparse_feature_value.click(value)) >= + if (show_click_score(sparse_feature_value.Show(value), + sparse_feature_value.Click(value)) >= base_threshold && sparse_feature_value.delta_score(value) >= delta_threshold && sparse_feature_value.unseen_days(value) <= delta_keep_days) { @@ -171,7 +170,7 @@ bool SparseAccessor::save(float* value, int param) { } } -void SparseAccessor::update_stat_after_save(float* value, int param) { +void SparseAccessor::UpdateStatAfterSave(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -180,8 +179,8 @@ void SparseAccessor::update_stat_after_save(float* value, int param) { } switch (param) { case 1: { - if (show_click_score(sparse_feature_value.show(value), - sparse_feature_value.click(value)) >= + if (show_click_score(sparse_feature_value.Show(value), + sparse_feature_value.Click(value)) >= base_threshold && sparse_feature_value.delta_score(value) >= delta_threshold && sparse_feature_value.unseen_days(value) <= delta_keep_days) { @@ -198,48 +197,48 @@ void SparseAccessor::update_stat_after_save(float* value, int param) { } } -int32_t SparseAccessor::create(float** values, size_t num) { +int32_t SparseAccessor::Create(float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* value = values[value_item]; value[sparse_feature_value.unseen_days_index()] = 0; value[sparse_feature_value.delta_score_index()] = 0; - value[sparse_feature_value.show_index()] = 0; - value[sparse_feature_value.click_index()] = 0; - value[sparse_feature_value.slot_index()] = -1; + value[sparse_feature_value.ShowIndex()] = 0; + value[sparse_feature_value.ClickIndex()] = 0; + value[sparse_feature_value.SlotIndex()] = -1; _embed_sgd_rule->init_value( - value + sparse_feature_value.embed_w_index(), + value + sparse_feature_value.Embed_W_Index(), value + sparse_feature_value.embed_g2sum_index()); _embedx_sgd_rule->init_value( - value + sparse_feature_value.embedx_w_index(), + value + sparse_feature_value.Embedx_W_Index(), value + sparse_feature_value.embedx_g2sum_index(), false); } return 0; } -bool SparseAccessor::need_extend_mf(float* value) { - float show = value[sparse_feature_value.show_index()]; - float click = value[sparse_feature_value.click_index()]; +bool 
SparseAccessor::NeedExtendMF(float* value) { + float show = value[sparse_feature_value.ShowIndex()]; + float click = value[sparse_feature_value.ClickIndex()]; float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + click * _config.ctr_accessor_param().click_coeff(); return score >= _config.embedx_threshold(); } -bool SparseAccessor::has_mf(size_t size) { +bool SparseAccessor::HasMF(size_t size) { return size > sparse_feature_value.embedx_g2sum_index(); } // from SparseFeatureValue to SparsePullValue -int32_t SparseAccessor::select(float** select_values, const float** values, +int32_t SparseAccessor::Select(float** select_values, const float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* select_value = select_values[value_item]; const float* value = values[value_item]; - select_value[SparsePullValue::embed_w_index()] = - value[sparse_feature_value.embed_w_index()]; - memcpy(select_value + SparsePullValue::embedx_w_index(), - value + sparse_feature_value.embedx_w_index(), + select_value[SparsePullValue::Embed_W_Index()] = + value[sparse_feature_value.Embed_W_Index()]; + memcpy(select_value + SparsePullValue::Embedx_W_Index(), + value + sparse_feature_value.Embedx_W_Index(), embedx_dim * sizeof(float)); } return 0; @@ -248,15 +247,15 @@ int32_t SparseAccessor::select(float** select_values, const float** values, // from SparsePushValue to SparsePushValue // first dim: item // second dim: field num -int32_t SparseAccessor::merge(float** update_values, +int32_t SparseAccessor::Merge(float** update_values, const float** other_update_values, size_t num) { auto embedx_dim = _config.embedx_dim(); - size_t total_dim = SparsePushValue::dim(embedx_dim); + size_t total_dim = SparsePushValue::Dim(embedx_dim); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; for (auto i = 0u; i < total_dim; ++i) { - if (i != SparsePushValue::slot_index()) { + if (i != SparsePushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } } @@ -267,43 +266,43 @@ int32_t SparseAccessor::merge(float** update_values, // from SparsePushValue to SparseFeatureValue // first dim: item // second dim: field num -int32_t SparseAccessor::update(float** update_values, const float** push_values, +int32_t SparseAccessor::Update(float** update_values, const float** push_values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* push_value = push_values[value_item]; - float push_show = push_value[SparsePushValue::show_index()]; - float push_click = push_value[SparsePushValue::click_index()]; - float slot = push_value[SparsePushValue::slot_index()]; - update_value[sparse_feature_value.show_index()] += push_show; - update_value[sparse_feature_value.click_index()] += push_click; - update_value[sparse_feature_value.slot_index()] = slot; + float push_show = push_value[SparsePushValue::ShowIndex()]; + float push_click = push_value[SparsePushValue::ClickIndex()]; + float slot = push_value[SparsePushValue::SlotIndex()]; + update_value[sparse_feature_value.ShowIndex()] += push_show; + update_value[sparse_feature_value.ClickIndex()] += push_click; + update_value[sparse_feature_value.SlotIndex()] = slot; update_value[sparse_feature_value.delta_score_index()] += (push_show - push_click) * 
_config.ctr_accessor_param().nonclk_coeff() + push_click * _config.ctr_accessor_param().click_coeff(); update_value[sparse_feature_value.unseen_days_index()] = 0; _embed_sgd_rule->update_value( - update_value + sparse_feature_value.embed_w_index(), + update_value + sparse_feature_value.Embed_W_Index(), update_value + sparse_feature_value.embed_g2sum_index(), - push_value + SparsePushValue::embed_g_index()); + push_value + SparsePushValue::Embed_G_Index()); _embedx_sgd_rule->update_value( - update_value + sparse_feature_value.embedx_w_index(), + update_value + sparse_feature_value.Embedx_W_Index(), update_value + sparse_feature_value.embedx_g2sum_index(), - push_value + SparsePushValue::embedx_g_index()); + push_value + SparsePushValue::Embedx_G_Index()); } return 0; } -bool SparseAccessor::create_value(int stage, const float* value) { +bool SparseAccessor::CreateValue(int stage, const float* value) { // stage == 0, pull // stage == 1, push if (stage == 0) { return true; } else if (stage == 1) { // operation - auto show = SparsePushValue::show(const_cast(value)); - auto click = SparsePushValue::click(const_cast(value)); + auto show = SparsePushValue::Show(const_cast(value)); + auto click = SparsePushValue::Click(const_cast(value)); auto score = show_click_score(show, click); if (score <= 0) { return false; @@ -324,34 +323,34 @@ float SparseAccessor::show_click_score(float show, float click) { return (show - click) * nonclk_coeff + click * click_coeff; } -std::string SparseAccessor::parse_to_string(const float* v, int param) { +std::string SparseAccessor::ParseToString(const float* v, int param) { thread_local std::ostringstream os; os.clear(); os.str(""); os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " " << v[5]; for (int i = sparse_feature_value.embed_g2sum_index(); - i < sparse_feature_value.embedx_w_index(); i++) { + i < sparse_feature_value.Embedx_W_Index(); i++) { os << " " << v[i]; } - auto show = sparse_feature_value.show(const_cast(v)); - auto click = sparse_feature_value.click(const_cast(v)); + auto show = sparse_feature_value.Show(const_cast(v)); + auto click = sparse_feature_value.Click(const_cast(v)); auto score = show_click_score(show, click); if (score >= _config.embedx_threshold() && - param > sparse_feature_value.embedx_w_index()) { - for (auto i = sparse_feature_value.embedx_w_index(); - i < sparse_feature_value.dim(); ++i) { + param > sparse_feature_value.Embedx_W_Index()) { + for (auto i = sparse_feature_value.Embedx_W_Index(); + i < sparse_feature_value.Dim(); ++i) { os << " " << v[i]; } } return os.str(); } -int SparseAccessor::parse_from_string(const std::string& str, float* value) { +int SparseAccessor::ParseFromString(const std::string& str, float* value) { int embedx_dim = _config.embedx_dim(); _embedx_sgd_rule->init_value( - value + sparse_feature_value.embedx_w_index(), + value + sparse_feature_value.Embedx_W_Index(), value + sparse_feature_value.embedx_g2sum_index()); auto ret = paddle::string::str_to_float(str.data(), value); CHECK(ret >= 6) << "expect more than 6 real:" << ret; diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h index cdc4c1dc6200e..b11acff6aaaa3 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.h +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h @@ -40,27 +40,27 @@ class SparseAccessor : public ValueAccessor { std::float embedx_g2sum; */ - int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } - int dim_size(size_t 
dim, int embedx_dim) { return sizeof(float); } - int size() { return dim() * sizeof(float); } - int slot_index() { return 0; } - int unseen_days_index() { return slot_index() + 1; } + int Dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } + int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } + int Size() { return Dim() * sizeof(float); } + int SlotIndex() { return 0; } + int unseen_days_index() { return SlotIndex() + 1; } int delta_score_index() { return unseen_days_index() + 1; } - int show_index() { return delta_score_index() + 1; } - int click_index() { return show_index() + 1; } - int embed_w_index() { return click_index() + 1; } - int embed_g2sum_index() { return embed_w_index() + 1; } - int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; } - int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; } + int ShowIndex() { return delta_score_index() + 1; } + int ClickIndex() { return ShowIndex() + 1; } + int Embed_W_Index() { return ClickIndex() + 1; } + int embed_g2sum_index() { return Embed_W_Index() + 1; } + int Embedx_W_Index() { return embed_g2sum_index() + embed_sgd_dim; } + int embedx_g2sum_index() { return Embedx_W_Index() + embedx_dim; } float& unseen_days(float* val) { return val[unseen_days_index()]; } float& delta_score(float* val) { return val[delta_score_index()]; } - float& show(float* val) { return val[show_index()]; } - float& click(float* val) { return val[click_index()]; } - float& slot(float* val) { return val[slot_index()]; } - float& embed_w(float* val) { return val[embed_w_index()]; } + float& Show(float* val) { return val[ShowIndex()]; } + float& Click(float* val) { return val[ClickIndex()]; } + float& Slot(float* val) { return val[SlotIndex()]; } + float& EmbedW(float* val) { return val[Embed_W_Index()]; } float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; } - float& embedx_w(float* val) { return val[embedx_w_index()]; } + float& EmbedxW(float* val) { return val[Embedx_W_Index()]; } float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; } int embed_sgd_dim; @@ -77,29 +77,25 @@ class SparseAccessor : public ValueAccessor { std::vector embedx_g; */ - static int dim(int embedx_dim) { return 4 + embedx_dim; } - - static int dim_size(int dim, int embedx_dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int slot_index() { return 0; } - static int show_index() { return SparsePushValue::slot_index() + 1; } - static int click_index() { return SparsePushValue::show_index() + 1; } - static int embed_g_index() { return SparsePushValue::click_index() + 1; } - static int embedx_g_index() { return SparsePushValue::embed_g_index() + 1; } - static float& slot(float* val) { - return val[SparsePushValue::slot_index()]; + static int Dim(int embedx_dim) { return 4 + embedx_dim; } + + static int DimSize(int dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int SlotIndex() { return 0; } + static int ShowIndex() { return SparsePushValue::SlotIndex() + 1; } + static int ClickIndex() { return SparsePushValue::ShowIndex() + 1; } + static int Embed_G_Index() { return SparsePushValue::ClickIndex() + 1; } + static int Embedx_G_Index() { return SparsePushValue::Embed_G_Index() + 1; } + static float& Slot(float* val) { return val[SparsePushValue::SlotIndex()]; } + static float& Show(float* val) { return val[SparsePushValue::ShowIndex()]; } + static float& 
Click(float* val) { + return val[SparsePushValue::ClickIndex()]; } - static float& show(float* val) { - return val[SparsePushValue::show_index()]; + static float& EmbedG(float* val) { + return val[SparsePushValue::Embed_G_Index()]; } - static float& click(float* val) { - return val[SparsePushValue::click_index()]; - } - static float& embed_g(float* val) { - return val[SparsePushValue::embed_g_index()]; - } - static float* embedx_g(float* val) { - return val + SparsePushValue::embedx_g_index(); + static float* EmbedxG(float* val) { + return val + SparsePushValue::Embedx_G_Index(); } }; @@ -109,82 +105,82 @@ class SparseAccessor : public ValueAccessor { std::vector embedx_w; */ - static int dim(int embedx_dim) { return 1 + embedx_dim; } - static int dim_size(size_t dim) { return sizeof(float); } - static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } - static int embed_w_index() { return 0; } - static int embedx_w_index() { return 1; } - static float& embed_w(float* val) { - return val[SparsePullValue::embed_w_index()]; + static int Dim(int embedx_dim) { return 1 + embedx_dim; } + static int DimSize(size_t dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int Embed_W_Index() { return 0; } + static int Embedx_W_Index() { return 1; } + static float& EmbedW(float* val) { + return val[SparsePullValue::Embed_W_Index()]; } - static float* embedx_w(float* val) { - return val + SparsePullValue::embedx_w_index(); + static float* EmbedxW(float* val) { + return val + SparsePullValue::Embedx_W_Index(); } }; SparseAccessor() {} - virtual int initialize(); + virtual int Initialize(); virtual void SetTableInfo(AccessorInfo& info); virtual size_t GetTableInfo(InfoKey key); virtual ~SparseAccessor() {} // value维度 - virtual size_t dim(); + size_t Dim(); // value各个维度的size - virtual size_t dim_size(size_t dim); + size_t DimSize(size_t dim); // value各维度相加总size - virtual size_t size(); + size_t Size(); // value中mf动态长度部分总size大小, sparse下生效 - virtual size_t mf_size(); + size_t MFSize(); // pull value维度 - virtual size_t select_dim(); + size_t SelectDim(); // pull value各个维度的size - virtual size_t select_dim_size(size_t dim); + size_t SelectDimSize(size_t dim); // pull value各维度相加总size - virtual size_t select_size(); + size_t SelectSize(); // push value维度 - virtual size_t update_dim(); + size_t UpdateDim(); // push value各个维度的size - virtual size_t update_dim_size(size_t dim); + size_t UpdateDimSize(size_t dim); // push value各维度相加总size - virtual size_t update_size(); + size_t UpdateSize(); // 判断该value是否进行shrink - virtual bool shrink(float* value); + virtual bool Shrink(float* value); // 判断该value是否保存到ssd // virtual bool save_ssd(float* value); - virtual bool need_extend_mf(float* value); - virtual bool has_mf(size_t size); + virtual bool NeedExtendMF(float* value); + virtual bool HasMF(size_t size); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature // param = 1, save delta feature // param = 2, save xbox base feature - bool save(float* value, int param) override; + bool Save(float* value, int param) override; // update delta_score and unseen_days after save - void update_stat_after_save(float* value, int param) override; + void UpdateStatAfterSave(float* value, int param) override; // keys不存在时,为values生成随机值 // 要求value的内存由外部调用者分配完毕 - virtual int32_t create(float** value, size_t num); + virtual int32_t Create(float** value, size_t num); // 从values中选取到select_values中 - virtual int32_t 
select(float** select_values, const float** values, + virtual int32_t Select(float** select_values, const float** values, size_t num); // 将update_values聚合到一起 - virtual int32_t merge(float** update_values, + virtual int32_t Merge(float** update_values, const float** other_update_values, size_t num); // 将update_values聚合到一起,通过it.next判定是否进入下一个key - // virtual int32_t merge(float** update_values, iterator it); + // virtual int32_t Merge(float** update_values, iterator it); // 将update_values更新应用到values中 - virtual int32_t update(float** values, const float** update_values, + virtual int32_t Update(float** values, const float** update_values, size_t num); - std::string parse_to_string(const float* value, int param) override; - int32_t parse_from_string(const std::string& str, float* v) override; - virtual bool create_value(int type, const float* value); + std::string ParseToString(const float* value, int param) override; + int32_t ParseFromString(const std::string& str, float* v) override; + virtual bool CreateValue(int type, const float* value); // 这个接口目前只用来取show - float get_field(float* value, const std::string& name) override { + float GetField(float* value, const std::string& name) override { // CHECK(name == "show"); if (name == "show") { - return sparse_feature_value.show(value); + return sparse_feature_value.Show(value); } return 0.0; } diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index 6faa3e2632e28..99790606f0b31 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -97,7 +97,7 @@ int32_t Table::initialize_accessor() { << ", accessor_name:" << _config.accessor().accessor_class(); return -1; } - if (accessor->configure(_config.accessor()) || accessor->initialize() != 0) { + if (accessor->Configure(_config.accessor()) || accessor->Initialize() != 0) { LOG(ERROR) << " accessor initialize failed, table_id:" << _config.table_id() << ", accessor_name:" << _config.accessor().accessor_class(); return -1; diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 77014141783c3..43b791b6ac03b 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -18,86 +18,70 @@ namespace paddle { namespace distributed { -int CommMergeAccessor::initialize() { return 0; } +int CommMergeAccessor::Initialize() { return 0; } void CommMergeAccessor::SetTableInfo(AccessorInfo &info) { - info.dim = dim(); - info.size = size(); - info.select_dim = select_dim(); - info.select_size = select_size(); - info.update_dim = update_dim(); - info.update_size = update_size(); - info.mf_size = mf_size(); + info.select_dim = SelectDim(); + info.select_size = SelectSize(); + info.update_dim = UpdateDim(); + info.update_size = UpdateSize(); info.fea_dim = fea_dim(); } size_t CommMergeAccessor::GetTableInfo(InfoKey key) { switch (key) { - case DIM: - return dim(); - case SIZE: - return size(); case SELECT_DIM: - return select_dim(); + return SelectDim(); case SELECT_SIZE: - return select_size(); + return SelectSize(); case UPDATE_DIM: - return update_dim(); + return UpdateDim(); case UPDATE_SIZE: - return update_size(); - case MF_SIZE: - return mf_size(); + return UpdateSize(); case FEA_DIM: return fea_dim(); + default: + return 0; } return 0; } -// value 维度 -size_t CommMergeAccessor::dim() { return 0; } - -// value 各个维度的size -size_t CommMergeAccessor::dim_size(size_t dim) { return 0; } - 
-// value 各维度相加总size -size_t CommMergeAccessor::size() { return 0; } - // pull value 维度 -size_t CommMergeAccessor::select_dim() { return _config.embedx_dim(); } +size_t CommMergeAccessor::SelectDim() { return _config.embedx_dim(); } // pull value 各个维度的size -size_t CommMergeAccessor::select_dim_size(size_t dim) { return sizeof(float); } +size_t CommMergeAccessor::SelectDimSize(size_t dim) { return sizeof(float); } // pull value 各维度相加总size -size_t CommMergeAccessor::select_size() { return select_dim() * sizeof(float); } +size_t CommMergeAccessor::SelectSize() { return SelectDim() * sizeof(float); } // push value 维度 -size_t CommMergeAccessor::update_dim() { return _config.embedx_dim(); } +size_t CommMergeAccessor::UpdateDim() { return _config.embedx_dim(); } // push value 各个维度的size -size_t CommMergeAccessor::update_dim_size(size_t dim) { return sizeof(float); } +size_t CommMergeAccessor::UpdateDimSize(size_t dim) { return sizeof(float); } // push value 各维度相加总size -size_t CommMergeAccessor::update_size() { return update_dim() * sizeof(float); } +size_t CommMergeAccessor::UpdateSize() { return UpdateDim() * sizeof(float); } // 判断该value 是否进行shrink -bool CommMergeAccessor::shrink(float * /*value*/) { return false; } +bool CommMergeAccessor::Shrink(float * /*value*/) { return false; } // 判断该value 是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model -bool CommMergeAccessor::save(float * /*value*/, int /*param*/) { return true; } +bool CommMergeAccessor::Save(float * /*value*/, int /*param*/) { return true; } // keys不存在时,为values生成随机值 -int32_t CommMergeAccessor::create(float **value, size_t num) { return 0; } +int32_t CommMergeAccessor::Create(float **value, size_t num) { return 0; } // 从values中选取到select_values中 -int32_t CommMergeAccessor::select(float **select_values, const float **values, +int32_t CommMergeAccessor::Select(float **select_values, const float **values, size_t num) { return 0; } // 将update_values聚合到一起 -int32_t CommMergeAccessor::merge(float **update_values, +int32_t CommMergeAccessor::Merge(float **update_values, const float **other_update_values, size_t num) { Eigen::Map u_mat(update_values[0], 1, num); @@ -109,13 +93,13 @@ int32_t CommMergeAccessor::merge(float **update_values, // 将update_values聚合到一起,通过it.next判定是否进入下一个key // int32_t merge(float** update_values, iterator it); // 将update_values更新应用到values中 -int32_t CommMergeAccessor::update(float **values, const float **update_values, +int32_t CommMergeAccessor::Update(float **values, const float **update_values, size_t num) { return 0; } -int CommMergeAccessor::set_weight(float **values, const float **update_values, - size_t num) { +int CommMergeAccessor::SetWeight(float **values, const float **update_values, + size_t num) { return 0; } diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 6f5b69a392bc5..1b454fe0c734b 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -29,53 +29,49 @@ class CommMergeAccessor : public ValueAccessor { public: CommMergeAccessor() {} virtual ~CommMergeAccessor() {} - virtual int initialize(); + virtual int Initialize(); virtual void SetTableInfo(AccessorInfo &info); virtual size_t GetTableInfo(InfoKey key); // value维度 - virtual size_t dim(); - // value各个维度的size - virtual size_t dim_size(size_t dim); - // value各维度相加总size - virtual size_t size(); // pull value维度 - virtual size_t select_dim(); + size_t SelectDim(); // pull value各个维度的size - virtual 
size_t select_dim_size(size_t dim); + size_t SelectDimSize(size_t dim); // pull value各维度相加总size - virtual size_t select_size(); + size_t SelectSize(); // push value维度 - virtual size_t update_dim(); + size_t UpdateDim(); // push value各个维度的size - virtual size_t update_dim_size(size_t dim); + size_t UpdateDimSize(size_t dim); // push value各维度相加总size - virtual size_t update_size(); + size_t UpdateSize(); + size_t fea_dim() { return _config.fea_dim(); } // 判断该value是否进行shrink - virtual bool shrink(float * /*value*/); + virtual bool Shrink(float * /*value*/); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model - virtual bool save(float * /*value*/, int /*param*/); + virtual bool Save(float * /*value*/, int /*param*/); // keys不存在时,为values生成随机值 - virtual int32_t create(float **value, size_t num); + virtual int32_t Create(float **value, size_t num); // 从values中选取到select_values中 - virtual int32_t select(float **select_values, const float **values, + virtual int32_t Select(float **select_values, const float **values, size_t num); // 将update_values聚合到一起 - virtual int32_t merge(float **update_values, + virtual int32_t Merge(float **update_values, const float **other_update_values, size_t num); // 将update_values聚合到一起,通过it.next判定是否进入下一个key - // virtual int32_t merge(float** update_values, iterator it); + // virtual int32_t Merge(float** update_values, iterator it); // 将update_values更新应用到values中 - virtual int32_t update(float **values, const float **update_values, + virtual int32_t Update(float **values, const float **update_values, size_t num); - virtual int set_weight(float **values, const float **update_values, - size_t num); - virtual std::string parse_to_string(const float *value, int param) { + virtual int SetWeight(float **values, const float **update_values, + size_t num); + virtual std::string ParseToString(const float *value, int param) { return ""; } - virtual int parse_from_string(const std::string &str, float *v) { return 0; } + virtual int ParseFromString(const std::string &str, float *v) { return 0; } }; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc index 835b1a361573d..8d9d0abd2394c 100644 --- a/paddle/fluid/distributed/test/ctr_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -67,49 +67,49 @@ TableAccessorParameter gen_param() { TEST(downpour_feature_value_accessor_test, test_shrink) { TableAccessorParameter parameter = gen_param(); CtrCommonAccessor* acc = new CtrCommonAccessor(); - ASSERT_EQ(acc->configure(parameter), 0); - ASSERT_EQ(acc->initialize(), 0); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); VLOG(3) << "size of struct: " << acc->common_feature_value.embed_sgd_dim << " " << acc->common_feature_value.embedx_dim << " " << acc->common_feature_value.embedx_sgd_dim << " " - << acc->common_feature_value.dim() << "\n"; + << acc->common_feature_value.Dim() << "\n"; - float* value = new float[acc->dim()]; - for (auto i = 0u; i < acc->dim(); ++i) { + float* value = new float[acc->Dim()]; + for (auto i = 0u; i < acc->Dim(); ++i) { value[i] = i * 1.0; } - ASSERT_TRUE(!acc->shrink(value)); + ASSERT_TRUE(!acc->Shrink(value)); // set unseen_days too long value[1] = 1000; // set delta score too small value[2] = 0.001; - ASSERT_TRUE(acc->shrink(value)); + ASSERT_TRUE(acc->Shrink(value)); } TEST(downpour_feature_value_accessor_test, test_save) { TableAccessorParameter parameter = gen_param(); 
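// NOTE (editor annotation, not part of the original patch): a minimal, hypothetical sketch of how the
// renamed CamelCase accessor API exercised in these tests is typically driven, assuming the
// CtrCommonAccessor configured from gen_param() above. The `param` argument of Save() follows the
// convention stated in the accessor comments and in the assertions below: 0 = save all features,
// 1 = save delta features, 2 = save base (xbox base) features.
//
//   CtrCommonAccessor accessor;
//   accessor.Configure(parameter);               // was configure()
//   accessor.Initialize();                       // was initialize()
//   std::vector<float> val(accessor.Dim());      // was dim()
//   accessor.Save(val.data(), /*param=*/0);      // save all features
//   accessor.Save(val.data(), /*param=*/1);      // save delta features
//   accessor.Save(val.data(), /*param=*/2);      // save base features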
CtrCommonAccessor* acc = new CtrCommonAccessor(); - ASSERT_EQ(acc->configure(parameter), 0); - ASSERT_EQ(acc->initialize(), 0); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); - float* value = new float[acc->dim()]; - for (auto i = 0u; i < acc->dim(); ++i) { + float* value = new float[acc->Dim()]; + for (auto i = 0u; i < acc->Dim(); ++i) { value[i] = i * 1.0; } // save all feature - ASSERT_TRUE(acc->save(value, 0)); + ASSERT_TRUE(acc->Save(value, 0)); // save delta feature - ASSERT_TRUE(acc->save(value, 1)); + ASSERT_TRUE(acc->Save(value, 1)); // save base feature with time decay - ASSERT_TRUE(acc->save(value, 2)); + ASSERT_TRUE(acc->Save(value, 2)); VLOG(3) << "test_save:"; - for (auto i = 0u; i < acc->dim(); ++i) { + for (auto i = 0u; i < acc->Dim(); ++i) { VLOG(3) << value[i]; } } @@ -117,8 +117,8 @@ TEST(downpour_feature_value_accessor_test, test_save) { TEST(downpour_feature_value_accessor_test, test_create) { TableAccessorParameter parameter = gen_param(); CtrCommonAccessor* acc = new CtrCommonAccessor(); - ASSERT_EQ(acc->configure(parameter), 0); - ASSERT_EQ(acc->initialize(), 0); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); const int field_size = 7 + 8; const int item_size = 10; @@ -127,7 +127,7 @@ TEST(downpour_feature_value_accessor_test, test_create) { for (auto i = 0u; i < item_size; ++i) { value[i] = new float[field_size]; } - ASSERT_EQ(acc->create(value, item_size), 0); + ASSERT_EQ(acc->Create(value, item_size), 0); for (auto i = 0u; i < item_size; ++i) { for (auto j = 0u; j < field_size; ++j) { @@ -141,11 +141,11 @@ TEST(downpour_feature_value_accessor_test, test_create) { TEST(downpour_feature_value_accessor_test, test_update) { TableAccessorParameter parameter = gen_param(); CtrCommonAccessor* acc = new CtrCommonAccessor(); - ASSERT_EQ(acc->configure(parameter), 0); - ASSERT_EQ(acc->initialize(), 0); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); - VLOG(3) << "dim: " << acc->common_feature_value.dim() << "\n"; - VLOG(3) << "update_dim: " << acc->update_dim() << "\n"; + VLOG(3) << "dim: " << acc->common_feature_value.Dim() << "\n"; + VLOG(3) << "update_dim: " << acc->GetTableInfo(UPDATE_DIM) << "\n"; const int field_size = 7 + 8; const int item_size = 10; @@ -162,8 +162,8 @@ TEST(downpour_feature_value_accessor_test, test_update) { typedef const float* const_float_ptr; const_float_ptr* grad = new const_float_ptr[item_size]; for (auto i = 0u; i < item_size; ++i) { - float* p = new float[acc->update_dim()]; - for (auto j = 0u; j < acc->update_dim(); ++j) { + float* p = new float[acc->GetTableInfo(UPDATE_DIM)]; + for (auto j = 0u; j < acc->GetTableInfo(UPDATE_DIM); ++j) { p[j] = i; } grad[i] = p; @@ -251,14 +251,14 @@ TEST(downpour_feature_value_accessor_test, test_update) { acc->_embedx_sgd_rule->update_value(&v.embedx_w[0], &v.embedx_g2sum[0], &push_v.embedx_g[0]); - float* ptr = new float[acc->dim()]; + float* ptr = new float[acc->Dim()]; v.to_array(ptr, parameter.embedx_dim()); exp_value.push_back(ptr); } - acc->update(value, grad, item_size); + acc->Update(value, grad, item_size); for (auto i = 0u; i < item_size; ++i) { - for (auto j = 0u; j < acc->dim(); ++j) { + for (auto j = 0u; j < acc->Dim(); ++j) { VLOG(3) << value[i][j] << ":" << exp_value[i][j] << " "; ASSERT_FLOAT_EQ(value[i][j], exp_value[i][j]); } @@ -268,8 +268,8 @@ TEST(downpour_feature_value_accessor_test, test_update) { TEST(downpour_feature_value_accessor_test, test_show_click_score) { TableAccessorParameter 
parameter = gen_param(); CtrCommonAccessor* acc = new CtrCommonAccessor(); - ASSERT_EQ(acc->configure(parameter), 0); - ASSERT_EQ(acc->initialize(), 0); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); float show = 10; float click = 6; @@ -279,8 +279,8 @@ TEST(downpour_feature_value_accessor_test, test_show_click_score) { TEST(downpour_feature_value_accessor_test, test_string_related) { TableAccessorParameter parameter = gen_param(); CtrCommonAccessor* acc = new CtrCommonAccessor(); - ASSERT_EQ(acc->configure(parameter), 0); - ASSERT_EQ(acc->initialize(), 0); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); const int field_size = 15; float* value = new float[field_size]; @@ -288,12 +288,12 @@ TEST(downpour_feature_value_accessor_test, test_string_related) { value[i] = i; } - auto str = acc->parse_to_string(value, 0); + auto str = acc->ParseToString(value, 0); VLOG(3) << str << std::endl; str = "0 1 2 3 4 5 6"; - ASSERT_NE(acc->parse_from_string(str, value), 0); + ASSERT_NE(acc->ParseFromString(str, value), 0); // make sure init_zero=true for (auto i = 7; i < 15; ++i) { diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 9039cf8eba95a..f5bdbcd968452 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1824,7 +1824,7 @@ static std::pair GenerateForwardFunctionContents( // Bump inplace version of inplace tensor. auto inplace_input_name = inplace_map[output_name]; const char* FWD_OUT_TENSOR_TEMPLATE = - " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n" " %s.bump_inplace_version();\n" " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " "Strategy.\";\n"; @@ -2803,7 +2803,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { // Inplace Function Generator. // `sum` op has duplicate input. Don't consider adding inplace strategy // for `sum` in temporary. 
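// NOTE (editor annotation, not part of the original patch): the hunk below generalizes the
// hard-coded `op_type != "sum"` check into a lookup in `special_inplace_op_set`, i.e. a set of
// operators excluded from inplace-strategy generation (presumably containing at least "sum",
// which has duplicate inputs per the comment above). A minimal sketch of the pattern, with the
// set contents assumed for illustration only:
//
//   static const std::unordered_set<std::string> special_inplace_op_set = {"sum"};
//   if (infer_inplace && !special_inplace_op_set.count(op_type)) {
//     // build inplace_map only for ops that are not in the skip set
//   }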
- if (op_type != "sum" && infer_inplace) { + if (infer_inplace && !special_inplace_op_set.count(op_type)) { auto in_to_outs = infer_inplace(true); for (auto& inplace_pair : in_to_outs) { inplace_map[inplace_pair.second] = inplace_pair.first; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 6ad0aa3dff25a..e16bcb187f85a 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -44,7 +44,7 @@ 'Scalar(int64_t)' : 'paddle::experimental::Scalar', 'Scalar(float)' : 'paddle::experimental::Scalar', 'Scalar(double)' : 'paddle::experimental::Scalar', - 'ScalarArray' : 'paddle::experimental::ScalarArray' + 'IntArray' : 'paddle::experimental::IntArray' } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 63eb1ee46a822..463c50658cd32 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -45,7 +45,7 @@ def SkipAPIGeneration(forward_api_name): "std::vector": "CastPyArg2Float64s", "std::vector": "CastPyArg2Strings", "paddle::experimental::Scalar": "CastPyArg2Scalar", - "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", + "paddle::experimental::IntArray": "CastPyArg2IntArray", "paddle::experimental::Place": "CastPyArg2Place", "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -140,7 +140,7 @@ def FindParsingFunctionFromAttributeType(atype): #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/strings_api.h" #include "paddle/fluid/pybind/op_function_common.h" diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 5f3dfe8e513ed..22266ff386293 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -19,6 +19,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" @@ -124,29 +125,32 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, return; } + phi::DenseTensor* dense_tensor = nullptr; // Record TensorMeta if (phi::DenseTensor::classof(fwd_out.impl().get())) { // Only Copy Meta - phi::DenseTensor* dense_tensor = - static_cast(fwd_out.impl().get()); - - PADDLE_ENFORCE_NE( - dense_tensor->meta().dtype, phi::DataType::UNDEFINED, - paddle::platform::errors::Fatal( - "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," - "which is illegal.")); - - meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_out.inner_place()); - - if (paddle::framework::IsComplexType( - paddle::framework::TransToProtoVarType(dense_tensor->type()))) { - need_complex_to_real_ = true; - } + dense_tensor = static_cast(fwd_out.impl().get()); + } else if (phi::SparseCooTensor::classof(fwd_out.impl().get())) { + phi::SparseCooTensor* coo_tensor = + static_cast(fwd_out.impl().get()); + dense_tensor = 
coo_tensor->mutable_non_zero_elements(); } else { VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " "non-DenseTensor argument."; } + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + + meta.SetTensorMeta(dense_tensor->meta()); + meta.SetPlace(fwd_out.inner_place()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } } void GradNodeBase::SetGradInMeta( diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index d734fd51efdfb..dc4cf379390f1 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -94,9 +94,9 @@ class TensorWrapper { return paddle::experimental::Tensor(); } - // if it's full_reserved just return the full copy of tensor check_inplace_version(); + // if it's full_reserved just return the full copy of tensor paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; if (!full_reserved_) { std::shared_ptr new_grad_node = grad_node; @@ -122,10 +122,10 @@ class TensorWrapper { static_cast(intermidiate_tensor_.impl().get()); auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); - uint32_t current_inplace_version = - inplace_version_counter.CurrentVersion(); + uint32_t wrapper_version_snapshot = inplace_version_snapshot_; + uint32_t tensor_version = inplace_version_counter.CurrentVersion(); PADDLE_ENFORCE_EQ( - current_inplace_version, inplace_version_snapshot_, + tensor_version, wrapper_version_snapshot, paddle::platform::errors::PermissionDenied( "Tensor '%s' used in gradient computation has been " "modified by an inplace operation. " @@ -133,14 +133,14 @@ class TensorWrapper { "Please fix your code to void calling an inplace operator " "after using the Tensor which will used in gradient " "computation.", - intermidiate_tensor_.name(), current_inplace_version, - inplace_version_snapshot_)); - VLOG(6) << " The inplace_version_snapshot_ of Tensor '" - << intermidiate_tensor_.name() << "' is [ " - << inplace_version_snapshot_ << " ]"; - VLOG(6) << " The current_inplace_version of Tensor '" + intermidiate_tensor_.name(), tensor_version, + wrapper_version_snapshot)); + VLOG(6) << " The wrapper_version_snapshot of Tensor '" << intermidiate_tensor_.name() << "' is [ " - << current_inplace_version << " ]"; + << wrapper_version_snapshot << " ]"; + VLOG(6) << " The tensor_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " << tensor_version + << " ]"; } } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 5328033fc749b..dfbc96a9db836 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -271,27 +271,6 @@ void EagerUtils::HandleViewBetweenInputAndOutput( } } -void EagerUtils::ModifyInplaceInput( - const std::shared_ptr& inplace_variable, - paddle::experimental::Tensor* inplace_tensor) { - // Only modify the meta information of the inplace tensor, because - // EagerVariable cannot modify Tensor's meta information after inplace - // op (such as ``reshape``) is executed. - PADDLE_ENFORCE_NOT_NULL(inplace_tensor, - paddle::platform::errors::Fatal( - "Inplace Tensor is null and cannot be modified. 
" - "We are tring to Modify Inplace Input from its " - "shared_ptr, this error may indicate the inplace " - " input is nullptr")); - if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { - phi::DenseTensor* variable_dense_tensor = - static_cast(inplace_variable->GetTensorBase().get()); - phi::DenseTensor* tensor_dense_tensor = - static_cast(inplace_tensor->impl().get()); - tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); - } -} - std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 4c3f5c88e4c93..beb46d876c4a1 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -203,9 +203,6 @@ class EagerUtils { static std::vector> CreateVars( const size_t num); // Construct Tensor From var - static void ModifyInplaceInput( - const std::shared_ptr& inplace_variable, - paddle::experimental::Tensor* inplace_tensor); static std::vector GetOutputs( const std::vector>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 09ced6bd0d5ce..e92e160c7ae3b 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -300,7 +300,7 @@ if(WITH_DISTRIBUTE) lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto timer monitor heter_service_proto fleet_executor ${BRPC_DEP}) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") @@ -320,7 +320,7 @@ if(WITH_DISTRIBUTE) index_sampler index_wrapper sampler index_dataset_proto lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index ead6dd7e6898d..983208c0608ae 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,12 +10,14 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) - nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) - nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) - #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps) - # ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) - # 
target_link_libraries(test_sample_rate graph_gpu_ps) + if(WITH_PSCORE) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) + nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) + #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps) + # ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) + # target_link_libraries(test_sample_rate graph_gpu_ps) + endif() ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index cb34c52e52672..ecc5fbdcf945d 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/op_utils.h" @@ -363,12 +363,12 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; - if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { - // When attr is a vector_tensor or tensor, transform it to ScalarArray + if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { + // When attr is a vector_tensor or tensor, transform it to IntArray if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); if (ctx->IsRuntime()) { - // If is in runtime, we will get tensor's value for ScalarArray + // If is in runtime, we will get tensor's value for IntArray // and push it into attrs std::vector vars; vars.reserve(infershape_inputs.size()); @@ -377,13 +377,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } if (infershape_inputs.size() != 1) { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiScalarArrayFromVarList(vars))); + std::move(experimental::MakePhiIntArrayFromVarList(vars))); } else { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiScalarArrayFromVar(*vars[0]))); + std::move(experimental::MakePhiIntArrayFromVar(*vars[0]))); } } else { - // If is not in runtime, we will set default value(-1) for ScalarArray + // If is not in runtime, we will set default value(-1) for IntArray std::vector vars; vars.reserve(infershape_inputs.size()); for (size_t i = 0; i < infershape_inputs.size(); ++i) { @@ -400,7 +400,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, if (num_ele <= 0) { PADDLE_THROW(platform::errors::Unimplemented( - "Invalid number for construct phi::ScalarArray, expected " + "Invalid number for construct phi::IntArray, expected " "number > 0, but actually is %d. 
", num_ele)); } @@ -408,7 +408,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } else { num_ele = vars.size(); } - phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); + phi::IntArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); infer_meta_context.EmplaceBackAttr(std::move(tensor_attr)); } @@ -417,18 +417,18 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( - phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( - phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { infer_meta_context.EmplaceBackAttr( - phi::ScalarArray({BOOST_GET_CONST(int, attr)})); + phi::IntArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to ScalarArray when " + "Unsupported cast op attribute `%s` to IntArray when " "construct InferMetaContext.", attr_name)); } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index e30dd21fc5c0e..a2f9d90406736 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -501,7 +501,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } // for debug nan/inf - if (FLAGS_check_nan_inf) { + if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { VLOG(4) << "Check nan/inf"; framework::details::CheckOpHasNanOrInf( *op, *global_scope_, @@ -542,10 +542,12 @@ void InterpreterCore::ExecuteInstructionList( if (exception_holder_.Type() != "EOF") { async_work_queue_->Cancel(); } + VLOG(4) << "Cancel ok"; PADDLE_ENFORCE_EQ( main_thread_blocker_.Clear(), 0, platform::errors::PreconditionNotMet( "main_thread_blocker_.Clear() return -1, clear failed")); + VLOG(4) << "clear ok"; exception_holder_.ReThrow(); } } @@ -637,15 +639,18 @@ void InterpreterCore::RunInstructionAsync( auto* op = instr_node.OpBase(); platform::RecordEvent instruction_event( op->Type(), platform::TracerEventType::Operator, 1); - interpreter::WaitEvent(instr_node, place_); try { + interpreter::WaitEvent(instr_node, place_); + RunInstruction(instr_node); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RecordStreamForGC(instr_node); #endif CheckGC(instr_node, atomic_var_ref); + + interpreter::RecordEvent(instr_node, place_); } catch (platform::EnforceNotMet& ex) { framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); @@ -677,8 +682,6 @@ void InterpreterCore::RunInstructionAsync( } } - interpreter::RecordEvent(instr_node, place_); - RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref); } } diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 8a3b40bbd76ef..b5670565e2a64 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -35,7 +35,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(sigmoid); 
USE_OP_ITSELF(tanh); USE_OP_ITSELF(elementwise_mul); -USE_OP(softmax_with_cross_entropy); +USE_OP_ITSELF(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum_grad); @@ -83,6 +83,8 @@ PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(cross_entropy_with_softmax, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(cross_entropy_with_softmax_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT); DECLARE_double(eager_delete_tensor_gb); diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h index ffdddc39a31e3..98ed2c1ffc4b3 100644 --- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h +++ b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h @@ -60,18 +60,51 @@ class ThreadDataRegistry { } private: - // types +// types +// Lock types +#if defined(__clang__) || defined(__GNUC__) // CLANG or GCC +#ifndef __APPLE__ +#if __cplusplus >= 201703L + using LockType = std::shared_mutex; + using SharedLockGuardType = std::shared_lock; +#elif __cplusplus >= 201402L using LockType = std::shared_timed_mutex; + using SharedLockGuardType = std::shared_lock; +#else + using LockType = std::mutex; + using SharedLockGuardType = std::lock_guard; +#endif +// Special case : mac. https://github.com/facebook/react-native/issues/31250 +#else + using LockType = std::mutex; + using SharedLockGuardType = std::lock_guard; +#endif +#elif defined(_MSC_VER) // MSVC +#if _MSVC_LANG >= 201703L + using LockType = std::shared_mutex; + using SharedLockGuardType = std::shared_lock; +#elif _MSVC_LANG >= 201402L + using LockType = std::shared_timed_mutex; + using SharedLockGuardType = std::shared_lock; +#else + using LockType = std::mutex; + using SharedLockGuardType = std::lock_guard; +#endif +#else // other compilers + using LockType = std::mutex; + using SharedLockGuardType = std::lock_guard; +#endif + class ThreadDataHolder; class ThreadDataRegistryImpl { public: void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) { - std::lock_guard lock(lock_); + std::lock_guard guard(lock_); tid_map_[tid] = tls_obj; } void UnregisterData(uint64_t tid) { - std::lock_guard lock(lock_); + std::lock_guard guard(lock_); tid_map_.erase(tid); } @@ -79,7 +112,7 @@ class ThreadDataRegistry { std::is_copy_constructible::value>> std::unordered_map GetAllThreadDataByValue() { std::unordered_map data_copy; - std::shared_lock lock(lock_); + SharedLockGuardType guard(lock_); data_copy.reserve(tid_map_.size()); for (auto& kv : tid_map_) { data_copy.emplace(kv.first, kv.second->GetData()); @@ -90,7 +123,7 @@ class ThreadDataRegistry { std::unordered_map> GetAllThreadDataByRef() { std::unordered_map> data_ref; - std::shared_lock lock(lock_); + SharedLockGuardType guard(lock_); data_ref.reserve(tid_map_.size()); for (auto& kv : tid_map_) { data_ref.emplace(kv.first, std::ref(kv.second->GetData())); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index cf2a36cde1f1f..19fa0f66739ce 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -33,8 +33,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/ops/compat/signatures.h" @@ -1120,6 +1120,56 @@ static void CheckTensorNANOrInf(const std::string& op_type, op_type, name)); } +bool OperatorWithKernel::SupportGPU() const { + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), phi_kernels.end(), + [](phi::KernelKeyMap::const_reference kern_pair) { + return kern_pair.first.backend() == phi::Backend::GPU; + }); + if (has_phi_kernel) { + return true; + } else { + auto kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); + if (kernel_iter == OperatorWithKernel::AllOpKernels().end()) { + return false; + } else { + auto& op_kernels = kernel_iter->second; + return std::any_of( + op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_gpu_place(kern_pair.first.place_); + }); + } + } +} + +bool OperatorWithKernel::SupportNPU() const { + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), phi_kernels.end(), + [](phi::KernelKeyMap::const_reference kern_pair) { + return kern_pair.first.backend() == phi::Backend::NPU; + }); + if (has_phi_kernel) { + return true; + } else { + auto kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); + if (kernel_iter == OperatorWithKernel::AllOpKernels().end()) { + return false; + } else { + auto& op_kernels = kernel_iter->second; + return std::any_of( + op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_npu_place(kern_pair.first.place_); + }); + } + } +} + bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); @@ -1550,6 +1600,17 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (kernel_iter == kernels.end() && + platform::is_custom_place(expected_kernel_key.place_)) { + VLOG(3) << "missing " << expected_kernel_key.place_.GetDeviceType() + << " kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( @@ -2106,7 +2167,11 @@ void OperatorWithKernel::BuildPhiKernelContext( typeid(paddle::optional)) || input_defs[i].type_index == std::type_index( - typeid(paddle::optional)))) { + typeid(paddle::optional)) || + input_defs[i].type_index == + std::type_index( + typeid(paddle::optional< + const std::vector>)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), @@ -2198,24 +2263,24 @@ void OperatorWithKernel::BuildPhiKernelContext( VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_defs[i].type_index 
== std::type_index(typeid(phi::ScalarArray))) { + if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { auto attr_iter = Attrs().find(attr_names[i]); if (attr_iter != Attrs().end()) { // shape is in the attribute if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr(std::move(phi::ScalarArray( + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( BOOST_GET_CONST(std::vector, attr_iter->second)))); } else if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr(std::move(phi::ScalarArray( + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( BOOST_GET_CONST(std::vector, attr_iter->second)))); } else if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(int32_t))) { - pt_kernel_context->EmplaceBackAttr(std::move(phi::ScalarArray( - &BOOST_GET_CONST(int32_t, attr_iter->second), 1))); + pt_kernel_context->EmplaceBackAttr(std::move( + phi::IntArray(&BOOST_GET_CONST(int32_t, attr_iter->second), 1))); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to ScalarArray when " + "Unsupported cast op attribute `%s` to IntArray when " "construct KernelContext.", attr_names[i])); } @@ -2223,10 +2288,10 @@ void OperatorWithKernel::BuildPhiKernelContext( auto& ins_vector = ctx.inputs.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePhiScalarArrayFromVar(*ins_vector.front()))); + experimental::MakePhiIntArrayFromVar(*ins_vector.front()))); } else { // ShapeTensorList - pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePhiScalarArrayFromVarList(ins_vector))); + pt_kernel_context->EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVarList(ins_vector))); } } } else if (attr_defs[i].type_index == @@ -2368,6 +2433,10 @@ void OperatorWithKernel::BuildPhiKernelContext( std::type_index(typeid(std::vector))) { pt_kernel_context->EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ce22f09944778..f7fc83f1d6d30 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -560,39 +560,10 @@ class OperatorWithKernel : public OperatorBase { return g_all_op_kernels; } - bool SupportGPU() const override { - auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( - phi::TransToPhiKernelName(type_)); - auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), - [](phi::KernelKeyMap::const_reference kern_pair) { - return kern_pair.first.backend() == phi::Backend::GPU; - }); - if (has_phi_kernel) { - return true; - } else { - auto kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); - if (kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - return false; - } else { - auto& op_kernels = kernel_iter->second; - return std::any_of( - op_kernels.begin(), op_kernels.end(), - [](OpKernelMap::const_reference kern_pair) { - return platform::is_gpu_place(kern_pair.first.place_); - }); - } - } - } + bool SupportGPU() const override; + + bool SupportNPU() 
const override; - bool SupportNPU() const override { - // TODO(zhiqiu): support phi if needed? - auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); - return std::any_of(op_kernels.begin(), op_kernels.end(), - [](OpKernelMap::const_reference kern_pair) { - return platform::is_npu_place(kern_pair.first.place_); - }); - } bool SupportMLU() const override { // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 14997dd961013..8e6f082da1026 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -87,7 +87,7 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( } else if (kernel_type.library_type_ == LibraryType::kKP) { backend = phi::Backend::KPS; } else { - // do + // do nothing } paddle::experimental::DataLayout layout = kernel_type.data_layout_; paddle::experimental::DataType dtype = @@ -102,7 +102,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, if (platform::is_xpu_place(expected_kernel_key.place_) || paddle::platform::is_in_xpu_black_list(op.Type())) { VLOG(3) << "phi missing XPU kernel: " << op.Type() - << "phipected_kernel_key:" << expected_kernel_key + << ", phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -111,7 +111,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expected_kernel_key.place_)) { VLOG(3) << "phi missing NPU kernel: " << op.Type() - << "phipected_kernel_key:" << expected_kernel_key + << ", phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -120,7 +120,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(expected_kernel_key.place_)) { VLOG(3) << "phi missing MLU kernel: " << op.Type() - << "phipected_kernel_key:" << expected_kernel_key + << ", phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -128,8 +128,18 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_IPU if (platform::is_ipu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing IPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing IPU kernel: " << op.Type() + << ", phipected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), + kernel_key.dtype()); + } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(expected_kernel_key.place_)) { + VLOG(3) << "phi missing " << expected_kernel_key.place_.GetDeviceType() + << " kernel: " << op.Type() + << ", phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index ffac264b51d50..077dd54bc9fa5 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -19,8 +19,8 @@ #include 
"paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/utils/small_vector.h" #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" @@ -191,12 +191,23 @@ PreparedOp PrepareImpl(const NameVarMap& ins, bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { + auto expected_kernel_key_library_type = + expected_kernel_key.library_type_; expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - VLOG(3) << "modify XPU KP kernel: " << op.Type() + VLOG(3) << "modifing XPU KP kernel: " << op.Type() << ", using_kernel_key:" << expected_kernel_key; + phi::KernelKey try_pt_kernel_key = + TransOpKernelTypeToPhiKernelKey(expected_kernel_key); + if (!phi::KernelFactory::Instance().IsSelectKernelValid( + pt_kernel_name, try_pt_kernel_key)) { + expected_kernel_key.library_type_ = expected_kernel_key_library_type; + VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed " + << expected_kernel_key; + } } } #endif + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); @@ -227,6 +238,20 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& all_op_kernels = op.AllOpKernels(); auto kernels_iter = all_op_kernels.find(op.Type()); +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + paddle::platform::is_xpu_place(expected_kernel_key.place_) && + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_xpu_place(expected_kernel_key.place_) && + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + } +#endif + if ((kernels_iter == all_op_kernels.end() || kernels_iter->second.find(expected_kernel_key) == kernels_iter->second.end()) @@ -255,6 +280,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, platform::errors::NotFound( "There are no kernels which are registered in the %s operator.", op.Type())); + auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); @@ -271,18 +297,12 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); if (use_xpu_kp_kernel_rt) { VLOG(3) << "xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { VLOG(3) << "xpu_kp using debug mode "; } - bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; kernel_iter = kernels.find(expected_kernel_key); @@ -464,6 +484,11 @@ static void PreparedOpRunPtImpl( pt_kernel(&pt_kernel_context); } + if (FLAGS_check_nan_inf) { + framework::details::CheckOpHasNanOrInfInDygraph( + op.Type(), outs, dev_ctx->GetPlace()); + } + if (FLAGS_benchmark) { dev_ctx->Wait(); #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 9daac181d57de..04d0b4ca7a5db 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -272,6 +272,14 @@ void BuildDygraphPhiKernelContext( auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); continue; + } else if (input_defs[i].type_index == + std::type_index( + typeid(paddle::optional< + const std::vector>))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; } else { PADDLE_THROW(phi::errors::NotFound( "Can not find input variable '%s' for %s OP, please check whether " @@ -361,26 +369,26 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { + if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(std::move( - phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(std::move( - phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(int64_t))) { kernel_ctx->EmplaceBackAttr( - std::move(phi::ScalarArray(&BOOST_GET_CONST(int64_t, attr), 1))); + std::move(phi::IntArray(&BOOST_GET_CONST(int64_t, attr), 1))); } else if (std::type_index(attr.type()) == std::type_index(typeid(int32_t))) { kernel_ctx->EmplaceBackAttr( - std::move(phi::ScalarArray(&BOOST_GET_CONST(int32_t, attr), 1))); + std::move(phi::IntArray(&BOOST_GET_CONST(int32_t, attr), 1))); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); @@ -395,15 +403,15 @@ void BuildDygraphPhiKernelContext( auto& ins_vector = ins.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePhiScalarArrayFromVar(ins_vector[0]->Var()))); + experimental::MakePhiIntArrayFromVar(ins_vector[0]->Var()))); } else { // ShapeTensorList std::vector variables; variables.reserve(ins_vector.size()); for (const auto& var_base : ins_vector) { variables.push_back(var_base->MutableVar()); } - kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePhiScalarArrayFromVarList(variables))); + kernel_ctx->EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVarList(variables))); } } } else if (attr_defs[i].type_index == @@ -545,6 +553,9 @@ void BuildDygraphPhiKernelContext( std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc 
b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index 2570325c24abc..8f7e51009223a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -156,6 +156,8 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { config.EnableMKLDNN(); + config.pass_builder()->AppendPass("fc_mkldnn_pass"); + config.pass_builder()->AppendPass("fc_act_mkldnn_fuse_pass"); } std::vector> outputs; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7619767123f84..f4dfb76884f17 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -354,8 +354,7 @@ class AllocatorFacadePrivate { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - bool HasCUDAAllocator(const platform::CUDAPlace& place, - const gpuStream_t& stream) { + bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { return false; @@ -366,7 +365,7 @@ class AllocatorFacadePrivate { } const std::shared_ptr& GetAllocator( - const platform::CUDAPlace& place, const gpuStream_t& stream, + const platform::CUDAPlace& place, gpuStream_t stream, bool create_if_not_found = false) { if (LIKELY(!IsCUDAGraphCapturing())) { if (stream == GetDefaultStream(place)) { @@ -407,14 +406,13 @@ class AllocatorFacadePrivate { return iter->second; } - const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const { + gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) const { const std::shared_ptr& allocator = GetDefaultStreamSafeCUDAAllocator(place); return allocator->GetDefaultStream(); } - void SetDefaultStream(const platform::CUDAPlace& place, - const gpuStream_t& stream) { + void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream) { const std::shared_ptr& allocator = GetDefaultStreamSafeCUDAAllocator(place); allocator->SetDefaultStream(stream); @@ -424,7 +422,7 @@ class AllocatorFacadePrivate { } void RecordStream(std::shared_ptr allocation, - const gpuStream_t& stream) { + gpuStream_t stream) { std::shared_ptr stream_safe_cuda_allocation = std::dynamic_pointer_cast(allocation); if (stream_safe_cuda_allocation != nullptr) { @@ -434,7 +432,7 @@ class AllocatorFacadePrivate { } } - const gpuStream_t GetStream( + gpuStream_t GetStream( const std::shared_ptr& allocation) const { const std::shared_ptr stream_safe_cuda_allocation = @@ -1044,7 +1042,7 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, - const gpuStream_t& stream) { + gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1055,12 +1053,12 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, } void AllocatorFacade::RecordStream(std::shared_ptr allocation, - const gpuStream_t& stream) { + gpuStream_t stream) { GetPrivate()->RecordStream(allocation, stream); } const std::shared_ptr& AllocatorFacade::GetAllocator( - const platform::Place& place, const gpuStream_t& stream) { + const platform::Place& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); if (!m->IsStreamSafeCUDAAllocatorUsed()) { @@ -1075,13 +1073,13 @@ const std::shared_ptr& 
AllocatorFacade::GetAllocator( return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } -const gpuStream_t AllocatorFacade::GetStream( +gpuStream_t AllocatorFacade::GetStream( const std::shared_ptr& allocation) const { return GetPrivate()->GetStream(allocation); } void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place, - const gpuStream_t& stream) { + gpuStream_t stream) { if (m_->IsStreamSafeCUDAAllocatorUsed()) { m_->SetDefaultStream(place, stream); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index d5c1e7c908c79..1dea50edccf2e 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -80,15 +80,12 @@ class AllocatorFacade { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. - uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); - void RecordStream(std::shared_ptr allocation, - const gpuStream_t& stream); + uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); + void RecordStream(std::shared_ptr allocation, gpuStream_t stream); const std::shared_ptr& GetAllocator(const platform::Place& place, - const gpuStream_t& stream); - const gpuStream_t GetStream( - const std::shared_ptr& allocation) const; - void SetDefaultStream(const platform::CUDAPlace& place, - const gpuStream_t& stream); + gpuStream_t stream); + gpuStream_t GetStream(const std::shared_ptr& allocation) const; + void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream); #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 7e47d35176bac..82233fd4fe821 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -33,7 +33,7 @@ StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( owning_stream_(std::move(owning_stream)), allocator_(allocator->shared_from_this()) {} -void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { +void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { return; @@ -90,7 +90,7 @@ bool StreamSafeCUDAAllocation::CanBeFreed() { return true; } -const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { +gpuStream_t StreamSafeCUDAAllocation::GetOwningStream() const { return owning_stream_; } @@ -102,7 +102,7 @@ void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() { } void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( - const gpuStream_t& stream) { + gpuStream_t stream) { gpuEvent_t record_event; auto it = outstanding_event_map_.find(stream); if (it == outstanding_event_map_.end()) { @@ -154,11 +154,11 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } -const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const { +gpuStream_t StreamSafeCUDAAllocator::GetDefaultStream() const { return default_stream_; } -void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) { +void StreamSafeCUDAAllocator::SetDefaultStream(gpuStream_t stream) { default_stream_ = stream; } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h 
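Note on the allocator hunks above: gpuStream_t (cudaStream_t / hipStream_t) is an opaque, pointer-sized handle, so the facade now takes and returns it by value rather than by const reference. The post-change signatures look roughly like the sketch below; the shared_ptr element type is assumed to be phi::Allocation, since the diff's template arguments were lost in extraction.

    // Sketch of the by-value stream signatures; copying the handle is cheap
    // and avoids holding references to temporaries.
    uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream);
    void RecordStream(std::shared_ptr<phi::Allocation> allocation,  // element type assumed
                      gpuStream_t stream);
    gpuStream_t GetStream(const std::shared_ptr<phi::Allocation>& allocation);
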
b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 65af32c701b75..32d3896e66bbf 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -39,13 +39,13 @@ class StreamSafeCUDAAllocation : public Allocation { gpuStream_t owning_stream, StreamSafeCUDAAllocator *allocator); - void RecordStream(const gpuStream_t &stream); + void RecordStream(gpuStream_t stream); bool CanBeFreed(); - const gpuStream_t &GetOwningStream() const; + gpuStream_t GetOwningStream() const; private: void RecordGraphCapturingStreams(); - void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream); + void RecordStreamWithNoGraphCapturing(gpuStream_t stream); DecoratedAllocationPtr underlying_allocation_; std::set graph_capturing_stream_set_; std::map outstanding_event_map_; @@ -66,8 +66,8 @@ class StreamSafeCUDAAllocator ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; - const gpuStream_t &GetDefaultStream() const; - void SetDefaultStream(const gpuStream_t &stream); + gpuStream_t GetDefaultStream() const; + void SetDefaultStream(gpuStream_t stream); protected: phi::Allocation *AllocateImpl(size_t size) override; diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index f3de317dd1df5..50180b4b6a1a6 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,17 +57,16 @@ void* GetBasePtr(const std::shared_ptr& allocation) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { +uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } -void RecordStream(std::shared_ptr allocation, - const gpuStream_t& stream) { +void RecordStream(std::shared_ptr allocation, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().RecordStream(allocation, stream); } -const gpuStream_t GetStream(const std::shared_ptr& allocation) { +gpuStream_t GetStream(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetStream(allocation); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index e6d910579ba95..796bdcf0ec2f6 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -50,13 +50,11 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern uint64_t Release(const platform::CUDAPlace& place, - const gpuStream_t& stream); +extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); -void RecordStream(std::shared_ptr allocation, - const gpuStream_t& stream); +void RecordStream(std::shared_ptr allocation, gpuStream_t stream); -const gpuStream_t GetStream(const std::shared_ptr& allocation); +gpuStream_t GetStream(const std::shared_ptr& allocation); #endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 71bcb4e201541..b9517e1cc863c 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -166,7 +166,7 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = 
paddle::operators; diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 4b5a18141d5aa..d0d7b7694fc3a 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -9,11 +9,15 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/interpolate_v2_op.h" #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -722,64 +726,51 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, // not // compatible with interp_op, so a new one is added in paddle2.0 namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_interp_v2, BilinearInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(nearest_interp_v2, NearestInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trilinear_interp_v2, + TrilinearInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bicubic_interp_v2, BicubicInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(linear_interp_v2, LinearInterpInferShapeFunctor, + PD_INFER_META(phi::InterpolateInferMeta)); + REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + BilinearInterpInferShapeFunctor); REGISTER_OPERATOR(bilinear_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); REGISTER_OPERATOR(nearest_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + NearestInterpInferShapeFunctor); REGISTER_OPERATOR(nearest_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); REGISTER_OPERATOR(trilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + TrilinearInterpInferShapeFunctor); REGISTER_OPERATOR(trilinear_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); REGISTER_OPERATOR(bicubic_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + BicubicInterpInferShapeFunctor); REGISTER_OPERATOR(bicubic_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(bilinear_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(bilinear_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); -REGISTER_OP_CPU_KERNEL(nearest_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(nearest_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); -REGISTER_OP_CPU_KERNEL(trilinear_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - 
ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(trilinear_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); REGISTER_OPERATOR(linear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, ops::InterpolateV2GradMaker, - ops::InterpolateV2GradMaker); + ops::InterpolateV2GradMaker, + LinearInterpInferShapeFunctor); REGISTER_OPERATOR(linear_interp_v2_grad, ops::InterpolateV2OpGrad, ops::InterpolateV2GradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(linear_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(linear_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); -REGISTER_OP_CPU_KERNEL(bicubic_interp_v2, ops::InterpolateV2Kernel, - ops::InterpolateV2Kernel); -REGISTER_OP_CPU_KERNEL(bicubic_interp_v2_grad, - ops::InterpolateV2GradKernel, - ops::InterpolateV2GradKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu deleted file mode 100644 index cd297c53f89a0..0000000000000 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ /dev/null @@ -1,2210 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/interpolate_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/phi/kernels/funcs/math_cuda_utils.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using platform::FastDivMod; -using DataLayout = framework::DataLayout; - -static inline int GetLastPow2(int n) { - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(1, n - (n >> 1)); -} - -inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( - const platform::CUDADeviceContext& context, int num_img, int height, - int width) { - const int kThreadsPerBlock = 256; - int max_threads_per_block = context.GetMaxThreadsPerBlock(); // 1024 - int max_threads = std::min(kThreadsPerBlock, max_threads_per_block); - - int block_x = std::min(GetLastPow2(width), max_threads); - int block_y = std::min(GetLastPow2(height), max_threads / block_x); - int block_z = std::min(num_img, max_threads / block_x / block_y); - - auto max_grid_dim = context.GetCUDAMaxGridDimSize(); - int grid_x = std::min(max_grid_dim[0], platform::DivUp(width, block_x)); - int grid_y = std::min(max_grid_dim[1], platform::DivUp(height, block_y)); - int grid_z = - std::min(max_grid_dim[2], platform::DivUp(num_img, block_z * 4)); - - const int capability = context.GetComputeCapability(); - platform::GpuLaunchConfig config; - config.compute_capability = capability; - config.thread_per_block = dim3(block_x, block_y, block_z); - config.block_per_grid = dim3(grid_x, grid_y, grid_z); - return config; -} - -template 
-__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, - const int in_img_x) { - src_x = (src_x > 0) ? src_x : 0.f; - *in_img_idx = static_cast(src_x); - *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; - *lambda1 = src_x - *in_img_idx; - *lambda2 = 1.f - *lambda1; -} - -struct FastDivModForInterpolate { - public: - FastDivMod channels_div; - FastDivMod output_w_div; - FastDivMod output_wc_div; - - explicit HOSTDEVICE FastDivModForInterpolate(const int channels, - const int output_w, - const int outout_wc) - : channels_div(FastDivMod(channels)), - output_w_div(FastDivMod(output_w)), - output_wc_div(FastDivMod(outout_wc)) {} -}; - -template -__global__ void KeNearestNeighborInterpNCHWFw( - const T* in, const size_t in_img_h, const size_t in_img_w, T* out, - const size_t out_img_h, const size_t out_img_w, const size_t nc, - const float ratio_h, const float ratio_w, const bool align_corners) { - int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; - int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; - int nc_id = threadIdx.z + blockIdx.z * blockDim.z; - int nc_stride = blockDim.z * gridDim.z; - - // nearest_sampling by multiple read in_addr and write to out_addr - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - - int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; - int in_index_stride = nc_stride * in_img_h * in_img_w; - - int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; - int out_index_stride = nc_stride * out_img_h * out_img_w; - - // prevent from multiple threads writing - if (out_img_idx < out_img_w && out_img_idy < out_img_h) { - while (nc_id < nc) { - out[out_index] = in[in_index]; - in_index += in_index_stride; - out_index += out_index_stride; - nc_id += nc_stride; - } - } -} - -template -__global__ void KeNearestNeighborInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, FastDivModForInterpolate divmods) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_img_size = in_img_h * in_img_w; - int out_img_size = out_img_h * out_img_w; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? 
static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } -} - -template -__global__ void KeNearestNeighbor3DInterpFw( - const T* in, const size_t in_img_d, const size_t in_img_h, - const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const float ratio_d, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; // ncdhw - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = (align_corners) - ? static_cast(ratio_d * out_img_idt + 0.5) - : static_cast(ratio_d * out_img_idt); - - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - if (data_layout == DataLayout::kNCHW) { - out[tid] = in[out_id_h * input_w + channel_id * in_img_size + - in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + - in_img_idx]; - } else { - out[tid] = in[out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - } -} - -template -__global__ void KeNearestNeighborInterpNCHWBw( - T* in, const size_t in_img_h, const size_t in_img_w, const T* out, - const size_t out_img_h, const size_t out_img_w, const size_t nc, - const float ratio_h, const float ratio_w, const bool align_corners) { - int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; - int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; - int nc_id = threadIdx.z + blockIdx.z * blockDim.z; - int nc_stride = blockDim.z * gridDim.z; - - // nearest_sampling by multiple read in_addr and write to out_addr - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - int in_img_idy = (align_corners) - ? 
static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - - int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; - int in_index_stride = nc_stride * in_img_h * in_img_w; - - int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; - int out_index_stride = nc_stride * out_img_h * out_img_w; - - // prevent from multiple threads writing - if (out_img_idx < out_img_w && out_img_idy < out_img_h) { - while (nc_id < nc) { - T* in_pos = &in[in_index]; - const T out_pos = out[out_index]; - platform::CudaAtomicAdd(in_pos, out_pos); - in_index += in_index_stride; - out_index += out_index_stride; - nc_id += nc_stride; - } - } -} - -template -__global__ void KeNearestNeighborInterpBw( - T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, - const size_t input_w, const T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, FastDivModForInterpolate divmods) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_img_size = in_img_h * in_img_w; - int out_img_size = out_img_h * out_img_w; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? 
static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - - const T out_pos = out[tid]; - platform::CudaAtomicAdd(in_pos, out_pos); - } -} - -template -__global__ void KeNearestNeighbor3DInterpBw( - T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, const T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const float ratio_d, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = (align_corners) - ? static_cast(ratio_d * out_img_idt + 0.5) - : static_cast(ratio_d * out_img_idt); - int in_img_idy = (align_corners) - ? static_cast(ratio_h * out_img_idy + 0.5) - : static_cast(ratio_h * out_img_idy); - int in_img_idx = (align_corners) - ? static_cast(ratio_w * out_img_idx + 0.5) - : static_cast(ratio_w * out_img_idx); - - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + - in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - const T out_pos = out[out_id_h * output_w + out_id_w]; - platform::CudaAtomicAdd(in_pos, out_pos); - } -} - -template -__global__ void KeLinearInterpFw(const T* in, const size_t in_img_w, - const size_t input_w, T* out, - const size_t out_img_w, const size_t output_h, - const size_t output_w, - const size_t num_channels, const float ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idx = tid % out_img_w; - } else { - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idx = align_flag - ? 
static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) - : static_cast(ratio_w * out_img_idx); - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id - - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - if (data_layout == DataLayout::kNCHW) { - const T* in_pos = - &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx]; - // linear interpolation - out[out_id_h * output_w + out_id_w] = - w2lambda * in_pos[0] + w1lambda * in_pos[w_id]; - - } else { - const T* in_pos = - &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; - // linear interpolation - out[out_id_h * output_w + out_id_w] = - w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]; - } - } -} - -template -__global__ void KeLinearInterpBw(T* in, const size_t in_img_w, - const size_t input_w, const T* out, - const size_t out_img_w, const size_t output_h, - const size_t output_w, - const size_t num_channels, const T ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idx = tid % out_img_w; - } else { - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - : ratio_w * out_img_idx; - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id - - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; - } - const T* out_pos = &out[out_id_w]; - - if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); - } else { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); - } - } -} - -template -__global__ void KeBilinearInterpNCHWFw(const T* in, const size_t in_img_h, - const size_t in_img_w, T* out, - const size_t out_img_h, - const size_t out_img_w, const size_t nc, - const float ratio_h, const float ratio_w, - const T align_type_value) { - int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; - int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; - int nc_id = threadIdx.z + blockIdx.z * blockDim.z; - int nc_stride = blockDim.z * gridDim.z; - - int in_img_idx, in_img_idy, h_id, w_id; - T h1lambda, w1lambda, h2lambda, w2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_img_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_img_h); - - int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; - int in_index_stride = nc_stride * in_img_h * in_img_w; - - int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; - int out_index_stride = nc_stride * out_img_h * out_img_w; - - // prevent from multiple threads writing - if (out_img_idx < out_img_w && out_img_idy < out_img_h) { - while (nc_id < nc) { - const T* in_pos = &in[in_index]; - out[out_index] = - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + - h1lambda * (w2lambda * in_pos[h_id * in_img_w] + - w1lambda * in_pos[h_id * in_img_w + w_id]); - - in_index += in_index_stride; - out_index += out_index_stride; - nc_id += nc_stride; - } - } -} - -template -__global__ void KeBilinearInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const T align_type_value, FastDivModForInterpolate divmods) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idx, in_img_idy, h_id, w_id; - T h1lambda, w1lambda, h2lambda, w2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - 
PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_img_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_img_h); - - // bilinear interpolation - const T* in_pos = - &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - out[tid] = - h2lambda * - (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + - h1lambda * - (w2lambda * in_pos[h_id * in_img_w * num_channels] + - w1lambda * - in_pos[h_id * in_img_w * num_channels + w_id * num_channels]); - } -} - -/* Calculate the minimum of partial elements in a block */ -template -__inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block, - unsigned mask) { - __shared__ T shared[WARP_SIZE]; - __shared__ T shared_last_val; - __shared__ int shared_last_idx; - int lane = threadIdx.x & 0x1f; - int wid = threadIdx.x >> 5; - int threshold = (threads_num_in_block & (-WARP_SIZE)); - - if (threadIdx.x < threshold) { - shared_last_idx = (threshold >> 5) - 1; - val = phi::funcs::warpReduceMin(val, mask); - if (lane == 0) { - shared[wid] = val; - } - } else { - shared_last_val = std::numeric_limits::max(); - platform::CudaAtomicMin(&shared_last_val, val); - shared[wid] = shared_last_val; - shared_last_idx = wid; - } - __syncthreads(); - - if (threadIdx.x < threshold) { - val = (lane <= shared_last_idx) ? shared[lane] - : std::numeric_limits::max(); - val = phi::funcs::warpReduceMin(val, mask); - shared_last_val = val; - } - __syncthreads(); - if (threadIdx.x >= threshold) { - val = shared_last_val; - } - return val; -} - -template -__global__ void KeBilinearInterpBwShareMemory( - T* in, const int in_h, const int in_w, const T* __restrict__ out, - const int out_h, const int out_w, const int n, const int num_channels, - float ratio_h, float ratio_w, const T align_type_value, bool is_nchw) { - __shared__ T s_data[2][1024]; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; - int nthreads = n * out_chw; - - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - // top_left_index is just input_index. 
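// [Editor's sketch, not part of the deleted .cu file] The angle-bracket
// arguments in this hunk were stripped during extraction ("static_cast(...)",
// "template", kernel launch parameters). The helper called just above computes
// the base input index, the 0/1 step to the right neighbour, and the two
// linear weights; a host-side equivalent with the likely <int>/<float>
// arguments restored (function name introduced here for illustration):
inline void LinearInterpIndexWeights(float src_x, int in_img_x,
                                     int* in_img_idx, int* x_id,
                                     float* lambda1, float* lambda2) {
  src_x = (src_x > 0.f) ? src_x : 0.f;            // clamp to the left border
  *in_img_idx = static_cast<int>(src_x);          // left neighbour index
  *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0;   // step to the right neighbour
  *lambda1 = src_x - *in_img_idx;                 // weight of the right neighbour
  *lambda2 = 1.f - *lambda1;                      // weight of the left neighbour
}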
- int input_index = out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx; - int top_right_index = input_index + w_id; - int bot_left_index = input_index + h_id * in_w; - int bot_right_index = input_index + h_id * in_w + w_id; - int in_top_min_index, in_bot_min_index; - - s_data[0][threadIdx.x] = 0.f; - s_data[1][threadIdx.x] = 0.f; - int remain = nthreads - (tid & (-blockDim.x)); - int in_top_max_index = - phi::funcs::blockReduceMax(top_right_index, FINAL_MASK); - int in_bot_max_index = - phi::funcs::blockReduceMax(bot_right_index, FINAL_MASK); - - if (remain > blockDim.x) { - in_top_min_index = phi::funcs::blockReduceMin(input_index, FINAL_MASK); - in_bot_min_index = phi::funcs::blockReduceMin(bot_left_index, FINAL_MASK); - } else { - in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK); - in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK); - } - int upper_limit_share_idx = (in_top_max_index - in_top_min_index) > - (in_bot_max_index - in_bot_min_index) - ? (in_top_max_index - in_top_min_index) - : (in_bot_max_index - in_bot_min_index); - if (h_id != 0) { - platform::CudaAtomicAdd(&s_data[0][input_index - in_top_min_index], - h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&s_data[1][bot_right_index - in_bot_min_index], - h1lambda * w1lambda * value); - } else { - platform::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], - (h2lambda + h1lambda) * w1lambda * value); - platform::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], - (h1lambda + h2lambda) * w2lambda * value); - } - __syncthreads(); - - if (threadIdx.x <= upper_limit_share_idx) { - platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], - s_data[0][threadIdx.x]); - platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], - s_data[1][threadIdx.x]); - } - } -} - -__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, - const int width, const int h, - const int w) { - return (nc * height + h) * width + w; -} - -template -__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, - const int out_h, const int out_w, - const int n, const int num_channels, - float ratio_h, float ratio_w, - const T* __restrict__ out, - const T align_type_value) { - int index = threadIdx.x + blockDim.x * blockIdx.x; - int stride = blockDim.x * gridDim.x; - int num_out = n * num_channels * out_h * out_w; - int num_in = n * num_channels * in_h * in_w; - - for (; index < num_out; index += stride) { - int index_tmp = index; - int w2 = index_tmp % out_w; - index_tmp /= out_w; - int h2 = index_tmp % out_h; - int nc = index_tmp / out_h; - - int h1, y_id; - T h1lambda, h0lambda; - T src_y = ratio_h * (h2 + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, - src_y, in_h); - int w1, x_id; - T w1lambda, w0lambda; - T src_x = ratio_w * (w2 + align_type_value) - align_type_value; - PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, - src_x, in_w); - - T d2val = out[index]; - - platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), - h0lambda * w0lambda * d2val); - platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), - h0lambda * w1lambda * d2val); - platform::CudaAtomicAdd(in + GetInputIndex(nc, 
in_h, in_w, h1 + y_id, w1), - h1lambda * w0lambda * d2val); - platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), - h1lambda * w1lambda * d2val); - } -} - -template -__global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, - const T* __restrict__ out, const int out_h, - const int out_w, const int n, - const int out_chw, const int num_channels, - float ratio_h, float ratio_w, - const T align_type_value, - FastDivModForInterpolate divmods) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int in_chw = in_h * in_w * num_channels; - int nthreads = n * out_chw; - - for (; tid < nthreads; tid += stride) { - auto out_id_divmod = divmods.output_w_div.Divmod(tid); - int out_id_h = out_id_divmod.val[0]; - int out_id_w = out_id_divmod.val[1]; - - int channel_id = divmods.channels_div.Divmod(tid).val[1]; - auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); - int out_img_idy = outimg_id_divmod.val[0]; - int out_img_idx = - divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T value = out[tid]; - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } -} - -template -__global__ void KeTrilinearInterpFw( - const T* in, const size_t in_img_d, const size_t in_img_h, - const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const float ratio_d, const float ratio_h, const float ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = align_flag - ? 
static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) - : static_cast(ratio_d * out_img_idt); - in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; - int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; - T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; - src_d = (src_d > 0) ? src_d : 0; - T d1lambda = - align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; - T d2lambda = 1.f - d1lambda; - - int in_img_idy = align_flag - ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) - : static_cast(ratio_h * out_img_idy); - in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; - src_h = (src_h > 0) ? src_h : 0; - T h1lambda = - align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - - int in_img_idx = align_flag - ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) - : static_cast(ratio_w * out_img_idx); - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - if (data_layout == DataLayout::kNCHW) { - int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + - (in_img_idt * in_img_h + in_img_idy) * in_img_w + - in_img_idx; - const T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; - const T* in_pos2 = &in[in_pos2_idx]; - - // trilinear interpolation - out[out_id_h * output_w + out_id_w] = - d2lambda * - (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) + - h1lambda * (w2lambda * in_pos1[h_id * in_img_w] + - w1lambda * in_pos1[h_id * in_img_w + w_id])) + - d1lambda * - (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) + - h1lambda * (w2lambda * in_pos2[h_id * in_img_w] + - w1lambda * in_pos2[h_id * in_img_w + w_id])); - - } else { - int in_pos1_idx = out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id; - const T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; - const T* in_pos2 = &in[in_pos2_idx]; - - // trilinear interpolation - out[out_id_h * output_w + out_id_w] = - d2lambda * - (h2lambda * (w2lambda * in_pos1[0] + - w1lambda * in_pos1[w_id * num_channels]) + - h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] + - w1lambda * in_pos1[h_id * in_img_w * num_channels + - w_id * num_channels])) + - d1lambda * - (h2lambda * (w2lambda * in_pos2[0] + - w1lambda * in_pos2[w_id * num_channels]) + - h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] + - w1lambda * in_pos2[h_id * in_img_w * num_channels + - w_id * num_channels])); - } - } -} - -template -__global__ void KeTrilinearInterpBw( - T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, const T* out, - const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, - const size_t output_h, const size_t output_w, const size_t num_channels, - const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners, - const int align_mode, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && 
!align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idt, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; - out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; - out_img_idx = tid % out_img_w; - } else { - out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); - out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / - (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - int in_img_idt = align_flag - ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) - : static_cast(ratio_d * out_img_idt); - in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; - int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; - T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; - src_d = (src_d > 0) ? src_d : 0; - T d1lambda = - align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; - T d2lambda = 1.f - d1lambda; - - int in_img_idy = align_flag - ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) - : static_cast(ratio_h * out_img_idy); - in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; - src_h = (src_h > 0) ? src_h : 0; - T h1lambda = - align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - - int in_img_idx = align_flag - ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) - : static_cast(ratio_w * out_img_idx); - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; - - if (data_layout == DataLayout::kNCHW) { - int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + - (in_img_idt * in_img_h + in_img_idy) * in_img_w + - in_img_idx; - T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; - T* in_pos2 = &in[in_pos2_idx]; - - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - - // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); - } else { - int in_pos1_idx = out_id_h * input_w + - in_img_idt * in_img_h * in_img_w * num_channels + - in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id; - T* in_pos1 = &in[in_pos1_idx]; - int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; - T* in_pos2 = &in[in_pos2_idx]; - - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - - // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], - d1lambda * h1lambda * w1lambda * out_pos[0]); - } - } -} - -template -__device__ __forceinline__ static T Kecubic_interp(const T x0, const T x1, - const T x2, const T x3, - T t) { - T coeffs[4]; - T a = -0.75; - T x_1 = t; - T x_2 = 1.0 - t; - coeffs[0] = cubic_convolution2(x_1 + 1.0, a); - coeffs[1] = cubic_convolution1(x_1, a); - coeffs[2] = cubic_convolution1(x_2, a); - coeffs[3] = cubic_convolution2(x_2 + 1.0, a); - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; -} - -template -__global__ void KeBicubicInterpFw( - const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * 
gridDim.x; - - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idy, out_img_idx; - - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - T in_img_idy = align_corners - ? static_cast(ratio_h * out_img_idy) - : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); - int input_y = floorf(in_img_idy); - const T y_t = in_img_idy - input_y; - - T in_img_idx = align_corners - ? static_cast(ratio_w * out_img_idx) - : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); - int input_x = floorf(in_img_idx); - const T x_t = in_img_idx - input_x; - - T coefficients[4]; - const T* in_pos_0; - const T* in_pos_1; - const T* in_pos_2; - const T* in_pos_3; - int access_x_0; - if (data_layout == DataLayout::kNCHW) { - for (int k = 0; k < 4; k++) { - int access_y = - max(min(input_y - 1 + k, static_cast(in_img_h - 1)), 0); - access_x_0 = max(min(input_x - 1, static_cast(in_img_w - 1)), 0); - int access_x_1 = - max(min(input_x + 0, static_cast(in_img_w - 1)), 0); - int access_x_2 = - max(min(input_x + 1, static_cast(in_img_w - 1)), 0); - int access_x_3 = - max(min(input_x + 2, static_cast(in_img_w - 1)), 0); - - in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_0]; - in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_1]; - in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_2]; - in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x_3]; - - coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], - in_pos_2[0], in_pos_3[0], x_t); - } - - out[out_id_h * output_w + out_id_w] = - Kecubic_interp(coefficients[0], coefficients[1], coefficients[2], - coefficients[3], y_t); - - } else { - for (int k = 0; k < 4; k++) { - int access_y = - max(min(input_y - 1 + k, static_cast((in_img_h - 1))), 0); - int access_x_0 = - max(min(input_x - 1, static_cast((in_img_w - 1))), 0); - int access_x_1 = - max(min(input_x + 0, static_cast((in_img_w - 1))), 0); - int access_x_2 = - max(min(input_x + 1, static_cast((in_img_w - 1))), 0); - int access_x_3 = - max(min(input_x + 2, static_cast((in_img_w - 1))), 0); - - const T* in_pos_0 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_0 * num_channels + channel_id]; - const T* in_pos_1 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_1 * num_channels + channel_id]; - const T* in_pos_2 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_2 * num_channels + channel_id]; - const T* in_pos_3 = - &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x_3 * num_channels + channel_id]; - - coefficients[k] = Kecubic_interp(in_pos_0[0], in_pos_1[0], in_pos_2[0], - in_pos_3[0], x_t); - } - - out[out_id_h * output_w + out_id_w] = - static_cast(Kecubic_interp(coefficients[0], coefficients[1], - coefficients[2], coefficients[3], y_t)); - } - } -} - -template -__global__ void KeBicubicInterpBw( - T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, - const 
size_t input_w, const T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; - - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } - - T in_img_idy = align_corners - ? static_cast(ratio_h * out_img_idy) - : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); - int input_y = floorf(in_img_idy); - const T y_t = in_img_idy - input_y; - - T in_img_idx = align_corners - ? static_cast(ratio_w * out_img_idx) - : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); - int input_x = floorf(in_img_idx); - - const T x_t = in_img_idx - input_x; - - T x_coeffs[4]; - T y_coeffs[4]; - - get_cubic_upsample_coefficients(x_coeffs, x_t); - get_cubic_upsample_coefficients(y_coeffs, y_t); - - const T* out_pos = &out[out_id_h * output_w + out_id_w]; - T* in_pos; - - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - int access_y = max(min(static_cast(input_y - 1 + j), - static_cast(in_img_h - 1)), - 0); - int access_x = max(min(static_cast(input_x - 1 + i), - static_cast(in_img_w - 1)), - 0); - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - access_y * in_img_w + access_x]; - } else { - in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + - access_x * num_channels + channel_id]; - } - platform::CudaAtomicAdd(&in_pos[0], - (out_pos[0] * y_coeffs[j] * x_coeffs[i])); - } - } - } -} - -template -static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - auto* input_data = input.data(); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1; - if (list_new_shape_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_shape_tensor); - out_w = new_size[0]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - 
platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_w = size_data[0]; - } - } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_w}; - } else { - dim_out = {n, out_w, c}; - } - auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1.0) / (out_w - 1.0) - : static_cast(new_scale_w); - } - - int64_t in_cw = c * in_w; - int64_t out_cw = c * out_w; - auto pixelNum = n * out_cw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("linear" == interp_method) { - KeLinearInterpFw<<>>( - input_data, in_w, in_cw, output_data, out_w, n, out_cw, c, ratio_w, - align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - auto* input_data = input.data(); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1; - float scale_h = -1; - if (list_new_shape_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_shape_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_w = scale[1]; - scale_h = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - 
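The forward paths above resolve the output extent from several optional inputs. Roughly: a SizeTensor list wins outright; otherwise a scale (from the Scale tensor or the scale attribute) rescales the input extent, and an OutSize tensor, if fed, overrides the result. (The backward paths further below consult the same inputs in a slightly different order, with SizeTensor applied last.) A compact sketch of the forward-path precedence, with hypothetical names and the error checks omitted:

#include <vector>

int ResolveOutW(int attr_out_w, int in_w,
                const std::vector<int>* size_tensor,  // "SizeTensor", optional
                const float* scale,                   // "Scale" tensor or attr, optional
                const int* out_size) {                // "OutSize", optional
  if (size_tensor && !size_tensor->empty()) return (*size_tensor)[0];
  int out_w = attr_out_w;
  if (scale && *scale > 0.f) out_w = static_cast<int>(in_w * (*scale));
  if (out_size) out_w = *out_size;  // OutSize overrides the scaled value
  return out_w;
}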
platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_w > 0. && scale_h > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; - } - } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_hw = in_h * in_w; - int64_t out_hw = out_h * out_w; - int64_t in_chw = c * in_hw; - int64_t out_chw = c * out_hw; - - auto pixelNum = n * out_chw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("nearest" == interp_method) { - if (data_layout == DataLayout::kNCHW) { - // get launch 3D config - int nc = n * c; - platform::GpuLaunchConfig config_3d = - GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); - KeNearestNeighborInterpNCHWFw< - T><<>>( - input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, - ratio_w, align_corners); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, interp_divmods); - } - } else if ("bilinear" == interp_method) { - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - const T align_type_value = (align_mode == 0 && !align_corners) ? 
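The ratio_h and ratio_w values computed above are the output-to-input step sizes. A small sketch of that rule, assuming the same semantics as the code (zero when the output extent is 1, corner-aligned mapping with align_corners, otherwise 1/scale or in/out):

float ComputeRatio(int in_size, int out_size, float scale, bool align_corners) {
  if (out_size <= 1) return 0.f;
  if (align_corners) {
    // first and last pixels of input and output coincide
    return static_cast<float>(in_size - 1) / (out_size - 1);
  }
  return (scale > 0.f) ? 1.f / scale
                       : static_cast<float>(in_size) / out_size;
}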
0.5f : 0; - if (data_layout == DataLayout::kNCHW) { - // get launch 3D config - int nc = n * c; - platform::GpuLaunchConfig config_3d = - GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); - KeBilinearInterpNCHWFw< - T><<>>( - input_data, in_h, in_w, output_data, out_h, out_w, nc, ratio_h, - ratio_w, align_type_value); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeBilinearInterpFw<<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); - } - } else if ("bicubic" == interp_method) { -#ifdef __HIPCC__ - constexpr int thread_per_block = 256; -#else - constexpr int thread_per_block = 512; -#endif - KeBicubicInterpFw<<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); - } -} - -template -static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - auto* input_data = input.data(); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1; - float scale_d = -1; - float scale_h = -1; - if (list_new_shape_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_shape_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in 
Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_d = size_data[0]; - out_h = size_data[1]; - out_w = size_data[2]; - } - } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_d, out_h, out_w}; - } else { - dim_out = {n, out_d, out_h, out_w, c}; - } - auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_dhw = in_d * in_h * in_w; - int64_t out_dhw = out_d * out_h * out_w; - int64_t in_cdhw = c * in_dhw; - int64_t out_cdhw = c * out_dhw; - - auto pixelNum = n * out_cdhw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("trilinear" == interp_method) { - KeTrilinearInterpFw<<>>( - input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, - out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - align_mode, data_layout); - } else if ("nearest" == interp_method) { - KeNearestNeighbor3DInterpFw< - T><<>>( - input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, - out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - data_layout); - } -} - -template -static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_w = size_data[0]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_w = new_size[0]; - } - - auto* output_grad_data = output_grad.data(); - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_w}; - } else { - dim_grad = {n, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
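Each launcher above sizes a 1-D grid over n * c * (spatial) output elements and relies on the kernels' grid-stride loops to cover any remainder. A generic sketch of that sizing pattern follows; it is not platform::GetGpuLaunchConfig1D itself, and the thread/block caps here are arbitrary placeholders.

struct LaunchConfig1D { int blocks; int threads; };

LaunchConfig1D MakeLaunchConfig1D(long long pixel_num, int max_threads = 256,
                                  long long max_blocks = 65535) {
  int threads = max_threads;
  long long blocks = (pixel_num + threads - 1) / threads;  // one thread per element
  if (blocks > max_blocks) blocks = max_blocks;  // grid-stride loop handles the rest
  return {static_cast<int>(blocks), threads};
}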
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - int64_t in_cw = c * in_w; - int64_t out_cw = c * out_w; - auto pixelNum = n * out_cw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("linear" == interp_method) { - KeLinearInterpBw<<>>( - input_grad_data, in_w, in_cw, output_grad_data, out_w, n, out_cw, c, - ratio_w, align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_w = scale[1]; - scale_h = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_w > 0. && scale_h > 0.) 
{ - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } - - auto* output_grad_data = output_grad.data(); - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_hw = in_h * in_w; - int64_t out_hw = out_h * out_w; - int64_t in_chw = c * in_hw; - int64_t out_chw = c * out_hw; - auto pixelNum = n * out_chw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("nearest" == interp_method) { - if (data_layout == DataLayout::kNCHW) { - // get launch 3D config - int nc = n * c; - platform::GpuLaunchConfig config_3d = - GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); - KeNearestNeighborInterpNCHWBw< - T><<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, nc, - ratio_h, ratio_w, align_corners); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, - interp_divmods); - } - } else if ("bilinear" == interp_method) { - const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; - bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; - bool optimize_flag = false; -#ifndef __HIPCC__ - optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) - ? true - : ((in_h == 1 && in_w == 1) ? 
true : false); -#endif - - if (optimize_flag & is_nchw) { - KeBilinearInterpBwShareMemory< - T><<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); - } else if (!optimize_flag & is_nchw) { - // - const int num_kernels = n * c * out_h * out_w; - const int num_threads = - std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); - KeBilinearInterpNCHWBw< - T><<>>( - input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, - output_grad_data, align_type_value); - } else { - int64_t cw = c * out_w; - auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); - KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); - } - } else if ("bicubic" == interp_method) { -#ifdef __HIPCC__ - constexpr int thread_per_block = 256; -#else - constexpr int thread_per_block = 512; -#endif - KeBicubicInterpBw<<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); - } -} - -template -static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, - const Tensor& output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in Attr(scale) of Operator(interpolate) " - 
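The bilinear backward branch above chooses between three kernels. A sketch of that heuristic as a standalone helper (hypothetical names): the shared-memory path is only taken for NCHW when the output is at least 64x larger than the input in both spatial dimensions, or when the input is a single pixel, and it is disabled on HIP builds.

enum class BilinearBwPath { kShareMemoryNCHW, kPlainNCHW, kGeneric };

BilinearBwPath PickBilinearBwPath(int in_h, int in_w, int out_h, int out_w,
                                  bool is_nchw, bool is_hip) {
  bool optimize = false;
  if (!is_hip) {
    optimize = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) ||
               (in_h == 1 && in_w == 1);
  }
  if (optimize && is_nchw) return BilinearBwPath::kShareMemoryNCHW;
  if (!optimize && is_nchw) return BilinearBwPath::kPlainNCHW;
  return BilinearBwPath::kGeneric;
}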
"should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_d = size_data[0]; - out_h = size_data[1]; - out_w = size_data[2]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } - - auto* output_grad_data = output_grad.data(); - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_d, in_h, in_w}; - } else { - dim_grad = {n, in_d, in_h, in_w, c}; - } - auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - int64_t in_dhw = in_d * in_h * in_w; - int64_t out_dhw = out_d * out_h * out_w; - int64_t in_cdhw = c * in_dhw; - int64_t out_cdhw = c * out_dhw; - - auto pixelNum = n * out_cdhw; - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - - if ("trilinear" == interp_method) { - KeTrilinearInterpBw<<>>( - input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, - out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - align_mode, data_layout); - } else if ("nearest" == interp_method) { - KeNearestNeighbor3DInterpBw< - T><<>>( - input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, - out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, - data_layout); - } -} - -template -class InterpolateOpV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::NotFound("This kernel only runs on GPU device.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - if (input_dims.size() == 3) { // 1D interpolation - Interpolate1DCUDAFwd(ctx, *input, output); - } else if (input_dims.size() == 4) { // 2D interpolation - Interpolate2DCUDAFwd(ctx, *input, output); - } else if (input_dims.size() == 5) { // 3D interpolation - Interpolate3DCUDAFwd(ctx, *input, output); - } - } -}; - -template -class InterpolateV2GradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::NotFound("This kernel only runs on GPU device.")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - - auto output_grad_dims = output_grad->dims(); - if (output_grad_dims.size() == 3) { // 1D interpolation - Interpolate1DCUDABwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 4) { // 2D interpolation - Interpolate2DCUDABwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 5) { // 3D interpolation - Interpolate3DCUDABwd(ctx, input_grad, *output_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(bilinear_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(nearest_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(nearest_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(trilinear_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(linear_interp_v2, ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); 
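The V2 CUDA kernels above dispatch purely on tensor rank: rank 3 means 1-D, rank 4 means 2-D and rank 5 means 3-D interpolation, with the batch and channel dimensions included in the count. A small sketch of that rule:

#include <stdexcept>

enum class InterpRank { k1D, k2D, k3D };

InterpRank DispatchByRank(int tensor_rank) {
  switch (tensor_rank) {
    case 3: return InterpRank::k1D;  // e.g. NCW
    case 4: return InterpRank::k2D;  // e.g. NCHW
    case 5: return InterpRank::k3D;  // e.g. NCDHW
    default:
      throw std::invalid_argument("interpolate expects a rank-3, -4 or -5 tensor");
  }
}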
-REGISTER_OP_CUDA_KERNEL(linear_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel, - ops::InterpolateOpV2CUDAKernel); -REGISTER_OP_CUDA_KERNEL(bicubic_interp_v2_grad, - ops::InterpolateV2GradOpCUDAKernel, - ops::InterpolateV2GradOpCUDAKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h deleted file mode 100644 index f99d3f6c32442..0000000000000 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ /dev/null @@ -1,1618 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; - -inline std::vector get_new_shape( - const std::vector& list_new_shape_tensor) { - // get tensor from - std::vector vec_new_shape; - for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { - auto tensor = list_new_shape_tensor[i]; - PADDLE_ENFORCE_EQ(tensor->dims(), phi::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of dimension tensor should be [1]," - "but received d%.", - tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_new_shape.push_back(static_cast(*temp.data())); - } else { - vec_new_shape.push_back(static_cast(*tensor->data())); - } - } - - return vec_new_shape; -} - -template -inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { - std::vector vec_new_data; - auto* new_data = new_data_tensor->data(); - framework::Tensor cpu_starts_tensor; - if (platform::is_gpu_place(new_data_tensor->place())) { - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - new_data = cpu_starts_tensor.data(); - } -#ifdef PADDLE_WITH_ASCEND_CL - if (platform::is_npu_place(new_data_tensor->place())) { - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - new_data = cpu_starts_tensor.data(); - } -#endif -#ifdef PADDLE_WITH_XPU - if (platform::is_xpu_place(new_data_tensor->place())) { - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - new_data = cpu_starts_tensor.data(); - } -#endif - vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); - return vec_new_data; -} - -inline void ExtractNCDWH(const framework::DDim& dims, - const DataLayout& data_layout, int* N, int* C, int* D, - int* H, int* W) { - *N = dims[0]; - - if (dims.size() == 3) { - *C = data_layout == DataLayout::kNCHW ? 
dims[1] : dims[2]; - *D = 1; - *H = 1; - *W = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - } else if (dims.size() == 4) { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3]; - *D = 1; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4]; - *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; - *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3]; - } -} - -template -static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, - const float ratio_h, const float ratio_w, - const int n, const int c, - const int out_h, const int out_w, - const bool align_corners, - const DataLayout& data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - for (int k = 0; k < out_h; k++) { // loop for images - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - output_t(i, j, k, l) = input_t(i, j, in_k, in_l); - } else { - output_t(i, k, l, j) = input_t(i, in_k, in_l, j); - } - } - } - } - } -} - -template -static void NearestNeighbor3DInterpolate( - const Tensor& input, Tensor* output, const float ratio_d, - const float ratio_h, const float ratio_w, const int n, const int c, - const int out_d, const int out_h, const int out_w, const bool align_corners, - const DataLayout& data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - for (int d = 0; d < out_d; d++) { // loop for images - int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) - : static_cast(ratio_d * d); - for (int k = 0; k < out_h; k++) { - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - output_t(i, j, d, k, l) = input_t(i, j, in_d, in_k, in_l); - } else { // NDHWC - output_t(i, d, k, l, j) = input_t(i, in_d, in_k, in_l, j); - } - } - } - } - } - } -} - -template -static void LinearInterpolation(const Tensor& input, Tensor* output, - const float ratio_w, const int in_w, - const int n, const int c, const int out_w, - const bool align_corners, const bool align_mode, - const DataLayout data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; // w - int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id - - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? 
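The CPU nearest-neighbour routines above map each output index to a source index by rounding when align_corners is set and truncating otherwise. As a one-line sketch (the coordinates here are always non-negative, so truncation behaves like floor):

int NearestSrcIndex(int dst_index, float ratio, bool align_corners) {
  return align_corners ? static_cast<int>(ratio * dst_index + 0.5f)
                       : static_cast<int>(ratio * dst_index);
}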
idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; // w1lambda - float d_e = 1.f - d_w; // w2lambda - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(3) -#endif - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - for (int l = 0; l < out_w; l++) { - // linear interpolation - T out_t; - if (data_layout == DataLayout::kNCHW) { - out_t = input_t(i, j, vx_w[l]) * vd_e[l] + - input_t(i, j, vx_e[l]) * vd_w[l]; - output_t(i, j, l) = out_t; - } else { - out_t = input_t(i, vx_w[l], j) * vd_e[l] + - input_t(i, vx_e[l], j) * vd_w[l]; - output_t(i, l, j) = out_t; - } - } - } - } -} - -template -static void LinearInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, const float ratio_w, - const int in_w, const int n, const int c, - const int out_w, const bool align_corners, - const int align_mode, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - bool align_flag = (align_mode == 0 && !align_corners); - for (int l = 0; l < out_w; l++) { - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; // w - int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id - - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; // w1lambda - float d_e = 1.f - d_w; // w2lambda - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - // linear interpolation grad - if (data_layout == DataLayout::kNCHW) { - const T grad = output_grad_t(i, j, l); - input_grad_t(i, j, x_w) += static_cast(grad * d_e); - input_grad_t(i, j, x_e) += static_cast(grad * d_w); - } else { - const T grad = output_grad_t(i, l, j); - input_grad_t(i, x_w, j) += static_cast(grad * d_e); - input_grad_t(i, x_e, j) += static_cast(grad * d_w); - } - } - } - } -} - -template -static void BilinearInterpolation(const Tensor& input, Tensor* output, - const float ratio_h, const float ratio_w, - const int in_h, const int in_w, const int n, - const int c, const int out_h, const int out_w, - const bool align_corners, - const bool align_mode, - const DataLayout data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int k = 0; k < out_h; k++) { - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? 
idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - { - vy_n[k] = y_n; - vy_s[k] = y_s; - vd_n[k] = d_n; - vd_s[k] = d_s; - } - } - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(4) -#endif - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - for (int k = 0; k < out_h; k++) { // loop for images - for (int l = 0; l < out_w; l++) { - // bilinear interpolation - T out_t; - if (data_layout == DataLayout::kNCHW) { - out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + - input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + - input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + - input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; - output_t(i, j, k, l) = out_t; - - } else { - out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] + - input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] + - input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] + - input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l]; - output_t(i, k, l, j) = out_t; - } - } - } - } - } -} - -template -static void TrilinearInterpolation( - const Tensor& input, Tensor* output, const float ratio_d, - const float ratio_h, const float ratio_w, const int in_d, const int in_h, - const int in_w, const int n, const int c, const int out_d, const int out_h, - const int out_w, const bool align_corners, const bool align_mode, - const DataLayout& data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vt_f, vt_b; - std::vector vd_f, vd_b; - vt_f.reserve(out_d); - vt_b.reserve(out_d); - vd_f.reserve(out_d); - vd_b.reserve(out_d); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int j = 0; j < out_d; j++) { - int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) - : static_cast(ratio_d * j); - t_f = (t_f > 0) ? t_f : 0; - int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); - float idx_src_t = ratio_d * (j + 0.5) - 0.5; - idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; - float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; - float d_b = 1.f - d_f; - { - vt_f[j] = t_f; - vt_b[j] = t_b; - vd_f[j] = d_f; - vd_b[j] = d_b; - } - } - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int k = 0; k < out_h; k++) { - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? 
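BilinearInterpolation above precomputes the neighbour indices and the fractional weights per row (d_n/d_s) and per column (d_w/d_e), then blends the four corners. A sketch of just that blend; each corner is weighted by the opposite pair of fractions, so the four weights sum to 1. Names are illustrative.

template <typename T>
T BilinearBlend(T nw, T ne, T sw, T se,            // corner values
                float d_n, float d_s, float d_w, float d_e) {
  // nw = value at (y_n, x_w), se = value at (y_s, x_e), etc.
  return nw * d_s * d_e + sw * d_n * d_e + ne * d_s * d_w + se * d_n * d_w;
}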
idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - { - vy_n[k] = y_n; - vy_s[k] = y_s; - vd_n[k] = d_n; - vd_s[k] = d_s; - } - } - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(5) -#endif - for (int b = 0; b < n; b++) { // loop for batches - for (int i = 0; i < c; i++) { // loop for channels - for (int j = 0; j < out_d; j++) { // loop for D, H, W - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - // trilinear interpolation - if (data_layout == DataLayout::kNCHW) { - T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] * - vd_s[k] * vd_e[l] + - input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] * - vd_s[k] * vd_w[l] + - input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] * - vd_n[k] * vd_e[l] + - input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] * - vd_n[k] * vd_w[l] + - input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] * - vd_s[k] * vd_e[l] + - input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] * - vd_s[k] * vd_w[l] + - input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] * - vd_n[k] * vd_e[l] + - input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] * - vd_n[k] * vd_w[l]; - output_t(b, i, j, k, l) = out_t; - } else { - T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] * - vd_s[k] * vd_e[l] + - input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] * - vd_s[k] * vd_w[l] + - input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] * - vd_n[k] * vd_e[l] + - input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] * - vd_n[k] * vd_w[l] + - input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] * - vd_s[k] * vd_e[l] + - input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] * - vd_s[k] * vd_w[l] + - input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] * - vd_n[k] * vd_e[l] + - input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] * - vd_n[k] * vd_w[l]; - output_t(b, j, k, l, i) = out_t; - } - } - } - } - } - } -} - -template -HOSTDEVICE inline T cubic_convolution1(T x, T A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; -} - -template -HOSTDEVICE inline T cubic_convolution2(T x, T A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; -} - -template -HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) { - T A = -0.75; - - T x1 = t; - coeffs[0] = cubic_convolution2(x1 + 1.0, A); - coeffs[1] = cubic_convolution1(x1, A); - - // opposite coefficients - T x2 = 1.0 - t; - coeffs[2] = cubic_convolution1(x2, A); - coeffs[3] = cubic_convolution2(x2 + 1.0, A); -} - -template -static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) { - T coeffs[4]; - get_cubic_upsample_coefficients(coeffs, t); - - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; -} - -template -static void BicubicInterpolation(const Tensor& input, Tensor* output, - const float ratio_h, const float ratio_w, - const int in_h, const int in_w, const int 
n, - const int c, const int out_h, const int out_w, - const bool align_corners, - const DataLayout data_layout) { - auto input_t = EigenTensor::From(input); - auto output_t = EigenTensor::From(*output); - - for (int k = 0; k < out_h; k++) { // loop for images - T y_n = align_corners ? static_cast(ratio_h * k) - : static_cast(ratio_h * (k + 0.5) - 0.5); - int input_y = floorf(y_n); - const T y_t = y_n - input_y; - - for (int l = 0; l < out_w; l++) { - T x_n = align_corners ? static_cast(ratio_w * l) - : static_cast(ratio_w * (l + 0.5) - 0.5); - int input_x = floorf(x_n); - const T x_t = x_n - input_x; - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - T coefficients[4]; - // interp 4 times in x direction - for (int ii = 0; ii < 4; ii++) { - int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1), - static_cast(0)); - int access_x_0 = - std::max(std::min(input_x - 1, in_w - 1), static_cast(0)); - int access_x_1 = - std::max(std::min(input_x + 0, in_w - 1), static_cast(0)); - int access_x_2 = - std::max(std::min(input_x + 1, in_w - 1), static_cast(0)); - int access_x_3 = - std::max(std::min(input_x + 2, in_w - 1), static_cast(0)); - if (data_layout == DataLayout::kNCHW) { - coefficients[ii] = - cubic_interp(input_t(i, j, access_y, access_x_0), - input_t(i, j, access_y, access_x_1), - input_t(i, j, access_y, access_x_2), - input_t(i, j, access_y, access_x_3), x_t); - } else { - coefficients[ii] = - cubic_interp(input_t(i, access_y, access_x_0, j), - input_t(i, access_y, access_x_1, j), - input_t(i, access_y, access_x_2, j), - input_t(i, access_y, access_x_3, j), x_t); - } - } - - // interp y direction - if (data_layout == DataLayout::kNCHW) { - output_t(i, j, k, l) = - cubic_interp(coefficients[0], coefficients[1], - coefficients[2], coefficients[3], y_t); - } else { - output_t(i, k, l, j) = - cubic_interp(coefficients[0], coefficients[1], - coefficients[2], coefficients[3], y_t); - } - } - } - } - } -} - -template -static void NearestNeighborInterpolateGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_h, - const float ratio_w, const int n, const int c, const int out_h, - const int out_w, const bool align_corners, const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int k = 0; k < out_h; k++) { // loop for images - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); - } else { - input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j); - } - } - } - } - } -} - -template -static void NearestNeighbor3DInterpolateGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_d, - const float ratio_h, const float ratio_w, const int n, const int c, - const int out_d, const int out_h, const int out_w, const bool align_corners, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int d = 0; d < out_d; d++) { - int in_d = (align_corners) ? 
static_cast(ratio_d * d + 0.5) - : static_cast(ratio_d * d); - for (int k = 0; k < out_h; k++) { // loop for images - int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) - : static_cast(ratio_h * k); - - for (int l = 0; l < out_w; l++) { - int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) - : static_cast(ratio_w * l); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - if (data_layout == DataLayout::kNCHW) { - input_grad_t(i, j, in_d, in_k, in_l) += - output_grad_t(i, j, d, k, l); - } else { - input_grad_t(i, in_d, in_k, in_l, j) += - output_grad_t(i, d, k, l, j); - } - } - } - } - } - } -} - -template -static void BilinearInterpolationGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_h, - const float ratio_w, const int in_h, const int in_w, const int n, - const int c, const int out_h, const int out_w, const bool align_corners, - const int align_mode, const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - bool align_flag = (align_mode == 0 && !align_corners); - for (int k = 0; k < out_h; k++) { // loop for images - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - - for (int l = 0; l < out_w; l++) { - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - // bilinear interpolation grad - if (data_layout == DataLayout::kNCHW) { - const T grad = output_grad_t(i, j, k, l); - input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); - input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); - input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); - input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); - } else { - const T grad = output_grad_t(i, k, l, j); - input_grad_t(i, y_n, x_w, j) += static_cast(grad * d_s * d_e); - input_grad_t(i, y_s, x_w, j) += static_cast(grad * d_n * d_e); - input_grad_t(i, y_n, x_e, j) += static_cast(grad * d_s * d_w); - input_grad_t(i, y_s, x_e, j) += static_cast(grad * d_n * d_w); - } - } - } - } - } -} - -template -static void TrilinearInterpolationGrad( - const Tensor& output_grad, Tensor* input_grad, const float ratio_d, - const float ratio_h, const float ratio_w, const int in_d, const int in_h, - const int in_w, const int n, const int c, const int out_d, const int out_h, - const int out_w, const bool align_corners, const int align_mode, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - bool align_flag = (align_mode == 0 && !align_corners); - for (int j = 0; j < out_d; j++) { // loop for D - int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) - : static_cast(ratio_d * j); - t_f = (t_f > 0) ? 
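BilinearInterpolationGrad above routes each output gradient back to the same four corners with the same weights as the forward blend sketched earlier (plain += on CPU, CudaAtomicAdd in the GPU kernels). A minimal sketch with hypothetical names:

void BilinearScatterGrad(float* nw, float* ne, float* sw, float* se,
                         float grad, float d_n, float d_s, float d_w, float d_e) {
  *nw += grad * d_s * d_e;  // (y_n, x_w)
  *sw += grad * d_n * d_e;  // (y_s, x_w)
  *ne += grad * d_s * d_w;  // (y_n, x_e)
  *se += grad * d_n * d_w;  // (y_s, x_e)
}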
t_f : 0; - int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); - float idx_src_t = ratio_d * (j + 0.5) - 0.5; - idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; - float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; - float d_b = 1.f - d_f; - - for (int k = 0; k < out_h; k++) { // loop for H - int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - y_n = (y_n > 0) ? y_n : 0; - int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; - float d_s = 1.f - d_n; - - for (int l = 0; l < out_w; l++) { // loop for W - int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - - for (int b = 0; b < n; b++) { // loop for batches - for (int i = 0; i < c; i++) { // loop for channels - // trilinear interpolation grad - if (data_layout == DataLayout::kNCHW) { - const T grad = output_grad_t(b, i, j, k, l); - input_grad_t(b, i, t_f, y_n, x_w) += - static_cast(grad * d_b * d_s * d_e); - input_grad_t(b, i, t_f, y_n, x_e) += - static_cast(grad * d_b * d_s * d_w); - input_grad_t(b, i, t_f, y_s, x_w) += - static_cast(grad * d_b * d_n * d_e); - input_grad_t(b, i, t_f, y_s, x_e) += - static_cast(grad * d_b * d_n * d_w); - input_grad_t(b, i, t_b, y_n, x_w) += - static_cast(grad * d_f * d_s * d_e); - input_grad_t(b, i, t_b, y_n, x_e) += - static_cast(grad * d_f * d_s * d_w); - input_grad_t(b, i, t_b, y_s, x_w) += - static_cast(grad * d_f * d_n * d_e); - input_grad_t(b, i, t_b, y_s, x_e) += - static_cast(grad * d_f * d_n * d_w); - } else { - const T grad = output_grad_t(b, j, k, l, i); - input_grad_t(b, t_f, y_n, x_w, i) += - static_cast(grad * d_b * d_s * d_e); - input_grad_t(b, t_f, y_n, x_e, i) += - static_cast(grad * d_b * d_s * d_w); - input_grad_t(b, t_f, y_s, x_w, i) += - static_cast(grad * d_b * d_n * d_e); - input_grad_t(b, t_f, y_s, x_e, i) += - static_cast(grad * d_b * d_n * d_w); - input_grad_t(b, t_b, y_n, x_w, i) += - static_cast(grad * d_f * d_s * d_e); - input_grad_t(b, t_b, y_n, x_e, i) += - static_cast(grad * d_f * d_s * d_w); - input_grad_t(b, t_b, y_s, x_w, i) += - static_cast(grad * d_f * d_n * d_e); - input_grad_t(b, t_b, y_s, x_e, i) += - static_cast(grad * d_f * d_n * d_w); - } - } - } - } - } - } -} - -template -static void BicubicInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, const float ratio_h, - const float ratio_w, const int in_h, - const int in_w, const int n, const int c, - const int out_h, const int out_w, - const bool align_corners, - const DataLayout data_layout) { - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int k = 0; k < out_h; k++) { // loop for images - T y_n = align_corners ? static_cast(ratio_h * k) - : static_cast(ratio_h * (k + 0.5) - 0.5); - int input_y = floorf(y_n); - T y_t = y_n - input_y; - - for (int l = 0; l < out_w; l++) { - T x_n = align_corners ? 
static_cast(ratio_w * l) - : static_cast(ratio_w * (l + 0.5) - 0.5); - int input_x = floorf(x_n); - T x_t = x_n - input_x; - - T x_coeffs[4]; - T y_coeffs[4]; - - get_cubic_upsample_coefficients(x_coeffs, x_t); - get_cubic_upsample_coefficients(y_coeffs, y_t); - - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - // bicubic interpolation grad - for (int ii = 0; ii < 4; ii++) { - for (int jj = 0; jj < 4; jj++) { - int access_x = std::max(std::min(input_x - 1 + ii, in_w - 1), - static_cast(0)); - int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1), - static_cast(0)); - if (data_layout == DataLayout::kNCHW) { - T grad = output_grad_t(i, j, k, l); - input_grad_t(i, j, access_y, access_x) += - grad * y_coeffs[jj] * x_coeffs[ii]; - } else { - T grad = output_grad_t(i, k, l, j); - input_grad_t(i, access_y, access_x, j) += - grad * y_coeffs[jj] * x_coeffs[ii]; - } - } - } - } - } - } - } -} - -template -static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - float scale_w = -1.; - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_w = new_size[0]; - } else { - // float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_w = out_size_data[0]; - } - } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_w}; - } else { - dim_out = {n, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
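// The 4x4 scatter in BicubicInterpolationGrad above is separable: one set of
// four weights per axis, produced by get_cubic_upsample_coefficients for the
// fractional offset t in [0, 1). A sketch of those weights, assuming the
// usual Keys cubic convolution kernel with a = -0.75 (the constant itself is
// not visible in this diff, so treat it as an assumption):
inline void CubicCoeffsSketch(float coeffs[4], float t) {
  const float a = -0.75f;                    // assumed kernel constant
  auto w_inner = [a](float x) {              // kernel for |x| <= 1
    return ((a + 2.0f) * x - (a + 3.0f)) * x * x + 1.0f;
  };
  auto w_outer = [a](float x) {              // kernel for 1 < |x| < 2
    return ((a * x - 5.0f * a) * x + 8.0f * a) * x - 4.0f * a;
  };
  coeffs[0] = w_outer(t + 1.0f);             // tap at offset -1
  coeffs[1] = w_inner(t);                    // tap at offset  0
  coeffs[2] = w_inner(1.0f - t);             // tap at offset +1
  coeffs[3] = w_outer(2.0f - t);             // tap at offset +2
  // The four taps sum to 1, so the scattered gradient mass is preserved.
}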
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - if ("linear" == interp_method) { - LinearInterpolation(input, output, ratio_w, in_w, n, c, out_w, - align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? 
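// The ratio_h / ratio_w computation above (and repeated in every
// Interpolate{1,2,3}DCPUFwd/Bwd in this file) follows one rule; pulled into a
// hypothetical helper for clarity:
inline float ComputeRatioSketch(int in_size, int out_size, float scale,
                                bool align_corners) {
  if (out_size <= 1) return 0.f;  // degenerate axis, as in the code above
  if (align_corners) {
    // first and last samples of input and output coincide
    return static_cast<float>(in_size - 1) / (out_size - 1);
  }
  // otherwise invert the user-provided scale, or fall back to in / out
  return (scale > 0) ? 1.f / scale : static_cast<float>(in_size) / out_size;
}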
static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("bilinear" == interp_method) { - BilinearInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, - out_h, out_w, align_corners, align_mode, - data_layout); - } else if ("nearest" == interp_method) { - NearestNeighborInterpolate(input, output, ratio_h, ratio_w, n, c, out_h, - out_w, align_corners, data_layout); - } else if ("bicubic" == interp_method) { - BicubicInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, - out_h, out_w, align_corners, data_layout); - } -} - -template -static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, Tensor* output) { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; - - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_w > 0. && scale_h > 0. && scale_d > 0.) 
{ - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_d = out_size_data[0]; - out_h = out_size_data[1]; - out_w = out_size_data[2]; - } - } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_d, out_h, out_w}; - } else { - dim_out = {n, out_d, out_h, out_w, c}; - } - - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(input, ctx.GetPlace(), output); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
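// Interpolate{1,2,3}DCPUFwd above all resolve the output size with the same
// priority: SizeTensor (if fed) wins outright; otherwise the Scale tensor or
// scale attribute derives the size from the input, and OutSize (if fed)
// overrides that result. Expressed as a hypothetical helper for one axis:
#include <vector>

inline int ResolveOutWSketch(int attr_out_w, int in_w,
                             const std::vector<int>* size_tensor,  // "SizeTensor"
                             const std::vector<int>* out_size,     // "OutSize"
                             float scale_w) {                      // "Scale" / attr
  if (size_tensor && !size_tensor->empty()) return (*size_tensor)[0];
  int out_w = attr_out_w;                             // attribute is the default
  if (scale_w > 0.f) out_w = static_cast<int>(in_w * scale_w);
  if (out_size && !out_size->empty()) out_w = (*out_size)[0];
  return out_w;  // the kernels then check out_w > 0 via PADDLE_ENFORCE_GT
}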
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("trilinear" == interp_method) { - TrilinearInterpolation(input, output, ratio_d, ratio_h, ratio_w, in_d, - in_h, in_w, n, c, out_d, out_h, out_w, - align_corners, align_mode, data_layout); - } else if ("nearest" == interp_method) { - NearestNeighbor3DInterpolate(input, output, ratio_d, ratio_h, ratio_w, n, - c, out_d, out_h, out_w, align_corners, - data_layout); - } -} - -template -static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor& output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_w = ctx.Attr("out_w"); - float scale_w = -1.0; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } else { - if (scale.size() > 0) { - scale_w = scale[0]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - } - } - if (scale_w > 0.) { - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_w = out_size_data[0]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_w = new_size[0]; - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_w}; - } else { - dim_grad = {n, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_w = 0.f; - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
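// The backward kernels above and below share one preamble: zero-fill the
// input gradient (the interpolation grads accumulate with +=), and when the
// output shape equals the input shape the op is the identity, so the gradient
// is copied straight through. With plain containers (illustrative only):
#include <vector>

inline void InterpGradPreambleSketch(const std::vector<float>& out_grad,
                                     std::vector<float>* in_grad,
                                     int in_numel, bool same_shape) {
  in_grad->assign(in_numel, 0.f);
  if (same_shape) {        // e.g. in_w == out_w (and in_h, in_d where present)
    *in_grad = out_grad;   // identity mapping: pass the gradient straight back
    return;
  }
  // ...otherwise fall through to Linear/Bilinear/Trilinear/BicubicInterpolationGrad.
}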
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - if ("linear" == interp_method) { - LinearInterpolationGrad(output_grad, input_grad, ratio_w, in_w, n, c, - out_w, align_corners, align_mode, data_layout); - } -} - -template -static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor& output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_w = scale_data[0]; - scale_h = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_h = new_size[0]; - out_w = new_size[1]; - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. 
/ scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("bilinear" == interp_method) { - BilinearInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, - in_h, in_w, n, c, out_h, out_w, align_corners, - align_mode, data_layout); - } else if ("nearest" == interp_method) { - NearestNeighborInterpolateGrad(output_grad, input_grad, ratio_h, ratio_w, - n, c, out_h, out_w, align_corners, - data_layout); - } else if ("bicubic" == interp_method) { - BicubicInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, in_h, - in_w, n, c, out_h, out_w, align_corners, - data_layout); - } -} - -template -static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - int out_d = ctx.Attr("out_d"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_d = scale_data[0]; - scale_h = scale_data[1]; - scale_w = scale_data[2]; - } else { - scale_d = scale_data[0]; - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in input 'Scale' Tensor of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } else { - if (scale.size() > 1) { - scale_d = scale[0]; - scale_h = scale[1]; - scale_w = scale[2]; - PADDLE_ENFORCE_EQ( - scale_w > 0, true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - PADDLE_ENFORCE_EQ( - scale_d > 0, true, - platform::errors::InvalidArgument( - "The scale_d in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_d)); - } - } - if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ - out_d = static_cast(in_d * scale_d); - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); - out_d = out_size_data[0]; - out_h = out_size_data[1]; - out_w = out_size_data[2]; - } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - // have size tensor - auto new_size = get_new_shape(list_new_size_tensor); - out_d = new_size[0]; - out_h = new_size[1]; - out_w = new_size[2]; - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_d, in_h, in_w}; - } else { - dim_grad = {n, in_d, in_h, in_w, c}; - } - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - if (in_d == out_d && in_h == out_h && in_w == out_w) { - framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); - return; - } - - float ratio_d = 0.f; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_d > 1) { - float new_scale_d = 0.f; - new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) - : static_cast(in_d) / out_d; - ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(new_scale_d); - } - if (out_h > 1) { - float new_scale_h = 0.f; - new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) - : static_cast(in_h) / out_h; - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(new_scale_h); - } - if (out_w > 1) { - float new_scale_w = 0.f; - new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) - : static_cast(in_w) / out_w; - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(new_scale_w); - } - - if ("trilinear" == interp_method) { - TrilinearInterpolationGrad( - output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n, - c, out_d, out_h, out_w, align_corners, align_mode, data_layout); - } else if ("nearest" == interp_method) { - NearestNeighbor3DInterpolateGrad(output_grad, input_grad, ratio_d, - ratio_h, ratio_w, n, c, out_d, out_h, - out_w, align_corners, data_layout); - } -} - -template -class InterpolateV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - if (input_dims.size() == 3) { // 1D interpolation - Interpolate1DCPUFwd(ctx, *input, output); - } else if (input_dims.size() == 4) { // 2D interpolation - Interpolate2DCPUFwd(ctx, *input, output); - } else if (input_dims.size() == 5) { // 3D interpolation - Interpolate3DCPUFwd(ctx, *input, output); - } - } -}; - -template -class InterpolateV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - - auto output_grad_dims = output_grad->dims(); - if (output_grad_dims.size() == 3) { // 1D interpolation grad - Interpolate1DCPUBwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 4) { // 2D interpolation grad - Interpolate2DCPUBwd(ctx, input_grad, *output_grad); - } else if (output_grad_dims.size() == 5) { // 3D interpolation grad - Interpolate3DCPUBwd(ctx, input_grad, *output_grad); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index bf29c2aabb801..615b5ea142b58 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + namespace paddle { namespace operators { @@ -401,7 +403,8 @@ class InterpolateV2NPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -431,14 +434,15 @@ class InterpolateV2NPUKernel : public framework::OpKernel { out_w = output_w[0]; } else if (ctx.HasInput("OutSize")) { auto out_size = ctx.Input("OutSize"); - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } else { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -538,7 +542,8 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -567,14 +572,15 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { out_w = output_w[0]; } else if (ctx.HasInput("OutSize")) { auto out_size = ctx.Input("OutSize"); - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } else { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 850dbe025b9cb..9cbfc95158348 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -14,8 +14,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/interpolate_v2_op.h" - +#include "paddle/phi/kernels/funcs/interpolate_function.h" #ifdef PADDLE_WITH_XPU namespace paddle { @@ -57,7 +56,8 @@ class InterpolateV2XPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -78,7 +78,8 @@ class InterpolateV2XPUKernel : public framework::OpKernel { auto 
scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -107,7 +108,8 @@ class InterpolateV2XPUKernel : public framework::OpKernel { } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = + phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } @@ -169,7 +171,8 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; - ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + phi::funcs::ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, + &in_w); auto interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); @@ -190,7 +193,8 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor(scale_tensor); + auto scale_data = + phi::funcs::get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -219,7 +223,8 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - auto out_size_data = get_new_data_from_tensor(out_size); + auto out_size_data = + phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index 0b0584608a300..cb2f59182c111 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -14,6 +14,7 @@ limitations under the License. 
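// The NPU and XPU kernels above now call phi::funcs::ExtractNCDWH instead of
// the local helper from interpolate_v2_op.h. A sketch of the behaviour being
// relied on (the phi implementation itself is not shown in this diff, so the
// details below are an assumption): split a 3-D/4-D/5-D shape into N, C, D,
// H, W according to the layout, padding missing spatial dims with 1.
#include <cstdint>
#include <vector>

inline void ExtractNCDWHSketch(const std::vector<int64_t>& dims, bool nchw,
                               int* n, int* c, int* d, int* h, int* w) {
  *n = static_cast<int>(dims[0]);
  if (dims.size() == 3) {            // NCW / NWC
    *c = static_cast<int>(nchw ? dims[1] : dims[2]);
    *d = 1; *h = 1;
    *w = static_cast<int>(nchw ? dims[2] : dims[1]);
  } else if (dims.size() == 4) {     // NCHW / NHWC
    *c = static_cast<int>(nchw ? dims[1] : dims[3]);
    *d = 1;
    *h = static_cast<int>(nchw ? dims[2] : dims[1]);
    *w = static_cast<int>(nchw ? dims[3] : dims[2]);
  } else {                           // NCDHW / NDHWC
    *c = static_cast<int>(nchw ? dims[1] : dims[4]);
    *d = static_cast<int>(nchw ? dims[2] : dims[1]);
    *h = static_cast<int>(nchw ? dims[3] : dims[2]);
    *w = static_cast<int>(nchw ? dims[4] : dims[3]);
  }
}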
*/ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace platform { @@ -89,38 +90,38 @@ struct HardLabelCrossEntropyCPUFunctorImpl { const int axis_dim_; }; -template -class CrossEntropyFunctor { - public: - void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, - const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel, - const int ignore_index, const int axis_dim) { - if (softLabel) { - const int batch_size = prob->dims()[0]; - const int num_classes = prob->dims()[1]; - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - auto in = EigenMatrix::From(*prob); - auto lbl = EigenMatrix::From(*labels); - auto loss = EigenMatrix::From(*out); - - loss.device(*ctx.eigen_device()) = - -((lbl * in.log().unaryExpr(math::TolerableValue())) - .reshape(batch_axis_remain) - .sum(Eigen::DSizes(1))); - } else { - HardLabelCrossEntropyCPUFunctorImpl functor_impl( - out, prob, labels, ignore_index, axis_dim); - framework::VisitIntDataType( - framework::TransToProtoVarType(labels->dtype()), functor_impl); - } +template +void CrossEntropyFunctor::operator()( + const DeviceContext& ctx, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, + const bool softLabel, const int ignore_index, const int axis_dim) { + if (softLabel) { + const int batch_size = prob->dims()[0]; + const int num_classes = prob->dims()[1]; + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + auto in = EigenMatrix::From(*prob); + auto lbl = EigenMatrix::From(*labels); + auto loss = EigenMatrix::From(*out); + + loss.device(*ctx.eigen_device()) = + -((lbl * in.log().unaryExpr(math::TolerableValue())) + .reshape(batch_axis_remain) + .sum(Eigen::DSizes(1))); + } else { + HardLabelCrossEntropyCPUFunctorImpl functor_impl(out, prob, labels, + ignore_index, axis_dim); + framework::VisitIntDataType(framework::TransToProtoVarType(labels->dtype()), + functor_impl); } -}; +} template class CrossEntropyFunctor; template class CrossEntropyFunctor; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 829ac9fb55964..80e06d4b7f688 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -93,46 +94,48 @@ struct HardLabelCrossEntropyCUDAFunctorImpl { gpuStream_t stream_; }; -template -class CrossEntropyFunctor { - public: - void operator()(const platform::CUDADeviceContext& ctx, - framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel, - const int ignore_index, const int axis_dim) { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(ctx.GetPlace()); - - int batch_size = prob->dims()[0]; - int class_num = prob->dims()[1]; +template +void CrossEntropyFunctor::operator()( + const DeviceContext& ctx, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, + const bool softLabel, const int ignore_index, const int axis_dim) { + const T* prob_data = prob->data(); + T* loss_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = prob->dims()[0]; + int class_num = prob->dims()[1]; #ifdef __HIPCC__ - constexpr int kMaxBlockDim = 256; + constexpr int kMaxBlockDim = 256; #else - constexpr int kMaxBlockDim = 512; + constexpr int kMaxBlockDim = 512; #endif - if (softLabel) { - const T* label_data = labels->data(); - int block = class_num > kMaxBlockDim - ? kMaxBlockDim - : pow(2, static_cast(std::log2(class_num))); - - SoftCrossEntropyKernel<<>>( - loss_data, prob_data, label_data, class_num); - } else { - HardLabelCrossEntropyCUDAFunctorImpl functor( - loss_data, prob_data, labels->data(), batch_size, class_num, - ignore_index, kMaxBlockDim, ctx.stream()); - framework::VisitDataType(framework::TransToProtoVarType(labels->dtype()), - functor); - } + if (softLabel) { + const T* label_data = labels->data(); + int block = class_num > kMaxBlockDim + ? 
kMaxBlockDim + : pow(2, static_cast(std::log2(class_num))); + + SoftCrossEntropyKernel<<>>( + loss_data, prob_data, label_data, class_num); + } else { + HardLabelCrossEntropyCUDAFunctorImpl functor( + loss_data, prob_data, labels->data(), batch_size, class_num, + ignore_index, kMaxBlockDim, ctx.stream()); + framework::VisitDataType(framework::TransToProtoVarType(labels->dtype()), + functor); } -}; +} template class CrossEntropyFunctor; template class CrossEntropyFunctor; template class CrossEntropyFunctor; + +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 83b124902ebb7..e960dc8a60832 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -29,9 +29,9 @@ using DataLayout = platform::DataLayout; template using CudnnDataType = platform::CudnnDataType; -template -void SoftmaxCUDNNFunctor::operator()( - const platform::CUDADeviceContext& context, const framework::Tensor* X, +template +void SoftmaxCUDNNFunctor::operator()( + const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor xDesc; @@ -69,9 +69,9 @@ void SoftmaxCUDNNFunctor::operator()( #endif } -template -void SoftmaxGradCUDNNFunctor::operator()( - const platform::CUDADeviceContext& context, const framework::Tensor* Y, +template +void SoftmaxGradCUDNNFunctor::operator()( + const DeviceContext& context, const framework::Tensor* Y, const framework::Tensor* YGrad, framework::Tensor* XGrad) { // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor yDesc; @@ -116,19 +116,31 @@ void SoftmaxGradCUDNNFunctor::operator()( #endif } -template class SoftmaxCUDNNFunctor; -template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; #if CUDNN_VERSION_MIN(8, 1, 0) -template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; #endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP -template class SoftmaxCUDNNFunctor; -template class SoftmaxGradCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; #endif template class SoftmaxFunctor +template class SoftmaxCUDNNFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y); + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); }; -template +template class SoftmaxGradCUDNNFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor* Y, const framework::Tensor* y_grad, - framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const framework::Tensor* Y, + const 
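// The pattern applied above to CrossEntropyFunctor and SoftmaxCUDNNFunctor:
// instead of a class hard-wired to platform::CUDADeviceContext /
// CPUDeviceContext, the functor is templated on the context type and
// explicitly instantiated for both the legacy fluid contexts and the new phi
// contexts, so callers on either side of the migration can use it. The types
// below are stand-ins, purely illustrative:
struct LegacyContextSketch {};   // stands in for platform::CPUDeviceContext
struct PhiContextSketch {};      // stands in for phi::CPUContext / phi::GPUContext

template <typename DeviceContext, typename T>
struct FunctorSketch {
  void operator()(const DeviceContext& /*ctx*/, const T* in, T* out,
                  int n) const {
    for (int i = 0; i < n; ++i) out[i] = in[i];  // placeholder body
  }
};

// One definition, several explicit instantiations, mirroring
// cross_entropy.cc/.cu and softmax.cu above:
template struct FunctorSketch<LegacyContextSketch, float>;
template struct FunctorSketch<LegacyContextSketch, double>;
template struct FunctorSketch<PhiContextSketch, float>;
template struct FunctorSketch<PhiContextSketch, double>;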
framework::Tensor* y_grad, framework::Tensor* x_grad); }; #endif diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt old mode 100644 new mode 100755 index baf82a9df31cb..863370540da82 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -6,9 +6,9 @@ include(operators) set(DISTRIBUTE_DEPS "") -list(APPEND DISTRIBUTE_DEPS fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy device_context) +list(APPEND DISTRIBUTE_DEPS executor fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy device_context) -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=parentheses") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS @@ -37,3 +37,6 @@ cc_test(send_and_recv_gpu_test SRCS send_and_recv_op_gpu_test.cc DEPS executor s set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) + +#set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +#cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) diff --git a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc new file mode 100644 index 0000000000000..2340f443c49fb --- /dev/null +++ b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc @@ -0,0 +1,247 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#if defined PADDLE_WITH_PSCORE +#include + +#include +#include +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" +#include "paddle/fluid/distributed/ps/service/heter_server.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::distributed; + +using MultiVarMsg = ::paddle::distributed::MultiVariableMessage; + +void CreateVarsOnScope(framework::Scope* scope) { + auto var1 = scope->Var("w"); + var1->GetMutable(); + auto var2 = scope->Var("x"); + var2->GetMutable(); +} + +void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope); + + auto w = scope->Var("w")->GetMutable(); + auto w_value = w->mutable_value(); + w_value->Resize({rows_numel, 10}); + for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); + + auto ptr = w_value->mutable_data(*place); + + for (int64_t i = 0; i < w_value->numel(); ++i) { + ptr[i] = static_cast(i / 10); + } + + auto x_var = scope->Var("x")->GetMutable(); + float* x_ptr = + x_var->mutable_data(framework::DDim({1, rows_numel}), *place); + for (int64_t i = 0; i < rows_numel; ++i) { + x_ptr[i] = 1.0; + } +} + +void StartSwitchServer( + std::shared_ptr& switch_server_ptr, // NOLINT + std::vector endpoints, + std::vector peer_endpoints) { + switch_server_ptr->SetPeerEndPoints(peer_endpoints); + switch_server_ptr->SetEndPoint(endpoints[0]); + /* + std::shared_ptr b_req_handler; + b_req_handler.reset(new distributed::SendAndRecvVariableHandler()); + switch_server_ptr->SetServiceHandler(b_req_handler); + + switch_server_ptr->SetLocalScope(); + + switch_server_ptr->RegisterServiceHandler( + std::to_string(distributed::PS_SAVE_WITH_SCOPE), + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { + return b_req_handler->SaveInSwitchWithScope(request, response, cntl); + }); + + switch_server_ptr->RegisterServiceHandler(std::to_string(distributed::PS_SAVE_WITH_SHARD), + [&](const MultiVarMsg* request, MultiVarMsg* + response, + brpc::Controller* cntl) -> int { + return b_req_handler->SaveInSwitchWithShard( + request, response, cntl); + }); + + switch_server_ptr->RegisterServiceHandler(std::to_string(distributed::PS_QUERY_WITH_SCOPE), + [&](const MultiVarMsg* request, MultiVarMsg* + response, + brpc::Controller* cntl) -> int { + return b_req_handler->QueryInSwitchWithScope( + request, response, cntl); + }); + + switch_server_ptr->RegisterServiceHandler(std::to_string(distributed::PS_QUERY_WITH_SHARD), + [&](const MultiVarMsg* request, MultiVarMsg* + response, + brpc::Controller* cntl) -> int { + return b_req_handler->QueryInSwitchWithShard( + request, response, cntl); + }); + */ + switch_server_ptr->StartHeterService(false); +} + +void StartSwitchInterServer( + std::shared_ptr& switch_server_ptr, // NOLINT + std::vector endpoints, + std::vector peer_endpoints) { + switch_server_ptr->SetPeerEndPoints(peer_endpoints); + switch_server_ptr->SetInterEndpoint(endpoints[1]); + switch_server_ptr->StartHeterInterService(false); +} + +TEST(HETERSENDANDRECV, CPU) { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + + // 启动 switch server A & B + std::string switch_a_endpoint("127.0.0.1:6000"); + std::string switch_a_endpoint_inter("127.0.0.1:6100"); + std::string switch_b_endpoint_inter("127.0.0.1:7100"); + 
std::string switch_b_endpoint("127.0.0.1:7000"); + + std::shared_ptr switch_server_ptr_a = + std::make_shared(); + std::vector end_points{switch_a_endpoint}; + std::vector peer_endpoints{switch_b_endpoint_inter}; + std::thread switch_server_a_thread(StartSwitchServer, + std::ref(switch_server_ptr_a), end_points, + peer_endpoints); + switch_server_ptr_a->WaitServerReady(); + + std::shared_ptr switch_server_ptr_b = + std::make_shared(); + end_points = {switch_b_endpoint, switch_b_endpoint_inter}; + peer_endpoints = {}; + std::thread switch_server_b_thread(StartSwitchServer, + std::ref(switch_server_ptr_b), end_points, + peer_endpoints); + switch_server_ptr_b->WaitServerReady(); + + end_points = {switch_b_endpoint, switch_b_endpoint_inter}; + peer_endpoints = {}; + std::thread switch_server_b_thread_inter(StartSwitchInterServer, + std::ref(switch_server_ptr_b), + end_points, peer_endpoints); + switch_server_ptr_b->WaitServerReady(); + + // 获取 client 实例 + std::shared_ptr heter_client_ptr_ = + distributed::HeterClient::GetInstance( + {switch_a_endpoint, switch_b_endpoint}, {}, 0); + + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + framework::Executor exe(place); + + framework::ProgramDesc program; + exe.Prepare(program, 0); // solve undefined symbol: tensor_table.cc + std::shared_ptr send_scope_ptr = + std::make_shared(); + int64_t rows_numel = 10; + InitTensorsOnClient(send_scope_ptr.get(), &place, rows_numel); + LOG(INFO) << "InitTensorsOnClient done"; + + auto send_async = [&]() -> void { + /* + //std::string message_name = + std::to_string(distributed::PS_SAVE_WITH_SCOPE); + std::string message_name = "send and save"; + std::vector send_var_names{"w", "x"}; + int ret = heter_client_ptr_->Send(ctx, *send_scope_ptr, message_name, + send_var_names); + if (!ret) { + LOG(ERROR) << ">>>> worker send success"; + } + */ + ///* + std::vector vars_len{2, 4}; + std::vector values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + int64_t data_size = 6; + std::vector send_var_names{"w", "x"}; + int group_id = 0; + int ret = heter_client_ptr_->Send(group_id, send_var_names, vars_len, + values.data(), data_size); + if (!ret) { + LOG(INFO) << ">>>> worker send success"; + } + //*/ + }; + std::thread send_thread(send_async); + /* + std::string message_name = std::to_string(distributed::PS_QUERY_WITH_SCOPE); + std::vector recv_var_names{"w", "x"}; + std::shared_ptr recv_scope_ptr = + std::make_shared(); + int ret = heter_client_ptr_->Recv(ctx, *recv_scope_ptr, message_name, + recv_var_names); + if (!ret && recv_scope_ptr->FindVar("w") && recv_scope_ptr->FindVar("x")) { + LOG(INFO) << ">>>> worker recv success"; + } else { + LOG(INFO) << "worker recv failed"; + } + */ + ///* + int group_id = 0; + std::vector recv_var_names{"w", "x"}; + std::vector values; + int data_size = 6; + values.resize(data_size); + int ret = heter_client_ptr_->Recv(group_id, recv_var_names, values.data(), + data_size); + if (!ret) { + VLOG(4) << "queried data is: "; + for (auto f : values) { + VLOG(4) << f << " "; + } + LOG(INFO) << ">>>> worker recv success"; + } + //*/ + + send_thread.join(); + + switch_server_ptr_a->Stop(); + LOG(INFO) << "switch server A stopped"; + + switch_server_ptr_b->Stop(); + LOG(INFO) << "switch server B stopped"; + + switch_server_a_thread.join(); + LOG(INFO) << "switch_server_a_thread joined"; + + switch_server_b_thread.join(); + LOG(INFO) << "switch_server_b_thread joined"; + + switch_server_b_thread_inter.join(); + LOG(INFO) << "switch_server_b_thread_inter joined"; +} +#endif diff --git 
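// Condensed from TEST(HETERSENDANDRECV, CPU) above: the minimal client-side
// call sequence for the switch-based send/recv path. Endpoints, names and
// sizes are the test's own values; the element type of the length vector is
// assumed to be int64_t (it is not visible in this diff), so read this as a
// sketch rather than a drop-in unit.
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps/service/heter_client.h"

inline void HeterClientSendRecvSketch() {
  namespace distributed = paddle::distributed;
  auto client = distributed::HeterClient::GetInstance(
      {"127.0.0.1:6000", "127.0.0.1:7000"}, {}, 0);  // switch A & B endpoints

  const int group_id = 0;
  std::vector<std::string> names{"w", "x"};
  std::vector<int64_t> lens{2, 4};               // per-variable lengths (assumed type)
  std::vector<float> values{1, 2, 3, 4, 5, 6};   // flat payload, 6 floats

  // Push the flat buffer to the switch servers, then pull it back.
  client->Send(group_id, names, lens, values.data(), values.size());
  std::vector<float> recv(values.size());
  client->Recv(group_id, names, recv.data(), recv.size());
}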
a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 2c443e8c63cbe..2df0d7526a3d3 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -88,21 +88,20 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { for (size_t blkid = 1; blkid < num_blocks; ++blkid) { block_list.push_back(blkid); } - for (size_t i = 0; i < block_list.size(); ++i) { auto blkid = block_list[i]; auto it = message_to_block_id.find_value(blkid); - rpc_service_->RegisterServiceHandler( + heter_server_->RegisterServiceHandler( it->first, [&](const MultiVarMsg *request, MultiVarMsg *response, brpc::Controller *cntl) -> int { - return request_send_and_recv_handler_->Handle(request, response, - cntl); + return send_and_recv_variable_handler_->Handle(request, response, + cntl); }); } while (true) { - if (rpc_service_->IsExit() || rpc_service_->IsStop()) { - rpc_service_->Stop(); + if (heter_server_->IsExit() || heter_server_->IsStop()) { + heter_server_->Stop(); VLOG(0) << "get exit. rpc_processor stop!"; break; } @@ -110,8 +109,9 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { } // while(true) } -void RunServer(std::shared_ptr service) { - service->StartHeterService(); +void RunServer( + std::shared_ptr heter_server_ptr) { + heter_server_ptr->StartHeterService(); } void HeterListenAndServOp::RunImpl(const framework::Scope &scope, @@ -126,16 +126,16 @@ void HeterListenAndServOp::RunImpl(const framework::Scope &scope, auto fan_in = Attr("fanin"); auto inputs = Inputs("X"); - PADDLE_ENFORCE_EQ(rpc_service_, nullptr, + PADDLE_ENFORCE_EQ(heter_server_, nullptr, platform::errors::PreconditionNotMet( "RPC service has been created unexpectedly.")); std::string endpoint = Attr("endpoint"); VLOG(4) << "pserver_id: " << pserver_id << ", end_point:" << endpoint; - rpc_service_ = distributed::HeterServer::GetInstance(); - rpc_service_->SetEndPoint(endpoint); - rpc_service_->SetFanin(fan_in); + heter_server_ = distributed::HeterServer::GetInstance(); + heter_server_->SetEndPoint(endpoint); + heter_server_->SetFanin(fan_in); auto optimize_blocks = Attr>("optimize_blocks"); @@ -146,20 +146,18 @@ void HeterListenAndServOp::RunImpl(const framework::Scope &scope, auto *program = optimize_blocks[0]->Program(); - request_send_and_recv_handler_.reset( - new distributed::RequestSendAndRecvHandler()); - request_send_and_recv_handler_->SetScope(&scope); - request_send_and_recv_handler_->SetDevCtx(&dev_ctx); - rpc_service_->SetRequestHandler(request_send_and_recv_handler_); + send_and_recv_variable_handler_.reset( + new distributed::SendAndRecvVariableHandler()); + send_and_recv_variable_handler_->SetScope(&scope); + send_and_recv_variable_handler_->SetDevCtx(&dev_ctx); + heter_server_->SetServiceHandler(send_and_recv_variable_handler_); VLOG(2) << "RunAsyncLoop"; - auto message_to_block_id_str = - Attr>("message_to_block_id"); // start the server listening after all member initialized. 
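// Condensed from HeterListenAndServOp::RunImpl above and the tests below: the
// server-side wiring after the rename from rpc_service_ /
// RequestSendAndRecvHandler to heter_server_ / SendAndRecvVariableHandler.
// The calls are the ones visible in this diff; the header set and exact
// parameter types are abridged assumptions, so read this as a sketch rather
// than a drop-in unit.
#include <memory>
#include <string>
#include <thread>
#include "paddle/fluid/distributed/ps/service/heter_server.h"

inline void HeterServerWiringSketch(const std::string& endpoint,
                                    const paddle::framework::Scope* scope,
                                    paddle::platform::DeviceContext* dev_ctx) {
  namespace distributed = paddle::distributed;
  auto heter_server = distributed::HeterServer::GetInstance();
  heter_server->SetEndPoint(endpoint);

  auto handler = std::make_shared<distributed::SendAndRecvVariableHandler>();
  handler->SetScope(scope);
  handler->SetDevCtx(dev_ctx);
  heter_server->SetServiceHandler(handler);

  // Every message name gets a callback that forwards to the handler.
  heter_server->RegisterServiceHandler(
      "x", [handler](const distributed::MultiVariableMessage* request,
                     distributed::MultiVariableMessage* response,
                     brpc::Controller* cntl) -> int {
        return handler->Handle(request, response, cntl);
      });

  // Serve on a background thread; callers block on WaitServerReady().
  std::thread server_thread([heter_server] { heter_server->StartHeterService(); });
  heter_server->WaitServerReady();

  // ... exchange data via HeterClient, then shut down:
  heter_server->Stop();
  server_thread.join();
}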
- server_thread_.reset(new std::thread(RunServer, rpc_service_)); + server_thread_.reset(new std::thread(RunServer, heter_server_)); VLOG(3) << "wait server thread to become ready..."; - rpc_service_->WaitServerReady(); + heter_server_->WaitServerReady(); RunAsyncLoop(program); VLOG(3) << "Wait for Server_thread_ stop"; (server_thread_.get())->join(); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h old mode 100644 new mode 100755 index 2d2d8abe70627..3ecff083b00c7 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h @@ -34,7 +34,7 @@ limitations under the License. */ namespace paddle { namespace distributed { -class HeterRequestHandler; +class ServiceHandlerBase; class HeterServer; } // namespace distributed } // namespace paddle @@ -82,10 +82,10 @@ class HeterListenAndServOp : public framework::OperatorBase { const platform::Place& dev_place) const override; protected: - mutable std::shared_ptr rpc_service_; + mutable std::shared_ptr heter_server_; mutable std::shared_ptr server_thread_; - mutable std::shared_ptr - request_send_and_recv_handler_; + mutable std::shared_ptr + send_and_recv_variable_handler_; }; } // namespace operators diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index b024fe76b0972..ab2fcba51062f 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -142,7 +142,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, CreateVarsOnScope(scope, place); } -void StartHeterServer(std::string endpoint) { +void RunHeterServerOp(std::string endpoint) { framework::ProgramDesc program; framework::Scope scope; platform::CPUPlace place; @@ -167,10 +167,10 @@ TEST(HETER_LISTEN_AND_SERV, CPU) { std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; FLAGS_eager_delete_tensor_gb = -1; - std::thread server_thread(StartHeterServer, endpoint); + std::thread server_thread(RunHeterServerOp, endpoint); sleep(1); - auto b_rpc_service = distributed::HeterServer::GetInstance(); - b_rpc_service->WaitServerReady(); + auto heter_server_ptr_ = distributed::HeterServer::GetInstance(); + heter_server_ptr_->WaitServerReady(); using MicroScope = std::unordered_map>>; using MiniScope = std::unordered_map; @@ -185,8 +185,8 @@ TEST(HETER_LISTEN_AND_SERV, CPU) { (*micro_scope).push_back(micro_scope_0); (*micro_scope).push_back(micro_scope_1); (*micro_scopes)[0] = micro_scope; - b_rpc_service->SetMicroBatchScopes(micro_scopes); - b_rpc_service->SetMiniBatchScopes(mini_scopes); + heter_server_ptr_->SetMicroBatchScopes(micro_scopes); + heter_server_ptr_->SetMiniBatchScopes(mini_scopes); using TaskQueue = std::unordered_map>>(); - b_rpc_service->SetTaskQueue(task_queue_); + heter_server_ptr_->SetTaskQueue(task_queue_); LOG(INFO) << "before HeterClient::GetInstance"; - distributed::HeterClient* rpc_client = + distributed::HeterClient* heter_client_ptr_ = distributed::HeterClient::GetInstance({endpoint}, {previous_endpoint}, 0) .get(); - PADDLE_ENFORCE_NE(rpc_client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; platform::CPUDeviceContext ctx(place); @@ -224,8 +220,8 @@ TEST(HETER_LISTEN_AND_SERV, CPU) { 
std::vector recv_var = {}; LOG(INFO) << "before SendAndRecvAsync"; - rpc_client->SendAndRecvAsync(ctx, *scope, in_var_name, send_var, recv_var, - "forward"); + heter_client_ptr_->SendAndRecvAsync(ctx, *scope, in_var_name, send_var, + recv_var, "forward"); auto task = (*task_queue_)[0]->Pop(); PADDLE_ENFORCE_EQ( task.first, "x", @@ -234,15 +230,15 @@ TEST(HETER_LISTEN_AND_SERV, CPU) { InitTensorsOnClient2((*micro_scope)[1], &place, rows_numel); LOG(INFO) << "before SendAndRecvAsync 2"; - rpc_client->SendAndRecvAsync(ctx, *((*micro_scope)[1]), in_var_name, send_var, - recv_var, "backward"); + heter_client_ptr_->SendAndRecvAsync(ctx, *((*micro_scope)[1]), in_var_name, + send_var, recv_var, "backward"); auto task2 = (*task_queue_)[0]->Pop(); PADDLE_ENFORCE_EQ( task2.first, "x", platform::errors::InvalidArgument( "Recv message and Send message name not match, Check your Code")); - rpc_client->Stop(); + heter_client_ptr_->Stop(); LOG(INFO) << "end server Stop"; server_thread.join(); LOG(INFO) << "end server thread join"; diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 6ab4204b2f9df..d4ee00d10a50b 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -34,8 +34,6 @@ using VarMsg = ::paddle::distributed::VariableMessage; USE_OP_ITSELF(scale); -std::shared_ptr b_rpc_service; - std::string get_ip_port() { std::mt19937 rng; rng.seed(std::random_device()()); @@ -171,31 +169,32 @@ void StartSendAndRecvServer(std::string endpoint) { InitTensorsOnServer(&scope, &place, 10); LOG(INFO) << "end InitTensorsOnServer"; - std::shared_ptr b_req_handler; - b_req_handler.reset(new distributed::RequestSendAndRecvHandler()); + std::shared_ptr b_req_handler; + b_req_handler.reset(new distributed::SendAndRecvVariableHandler()); LOG(INFO) << "before SetDevCtx"; b_req_handler->SetDevCtx(&ctx); LOG(INFO) << "before SetScope"; b_req_handler->SetScope(&scope); LOG(INFO) << "before HeterServer::GetInstance"; - b_rpc_service = distributed::HeterServer::GetInstance(); - b_rpc_service->SetEndPoint(endpoint); + std::shared_ptr heter_server_ptr_ = + distributed::HeterServer::GetInstance(); + heter_server_ptr_->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; - b_rpc_service->RegisterServiceHandler( + heter_server_ptr_->RegisterServiceHandler( in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); - b_rpc_service->RegisterServiceHandler( + heter_server_ptr_->RegisterServiceHandler( in_var_name2, [&](const MultiVarMsg* request, MultiVarMsg* response, brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); - b_rpc_service->SetRequestHandler(b_req_handler); + heter_server_ptr_->SetServiceHandler(b_req_handler); LOG(INFO) << "before HeterServer::RunServer"; - RunServer(b_rpc_service); - // std::thread server_thread(std::bind(RunServer, b_rpc_service)); + RunServer(heter_server_ptr_); + // std::thread server_thread(std::bind(RunServer, heter_server_ptr_)); // server_thread.join(); } @@ -206,9 +205,10 @@ TEST(SENDANDRECV, CPU) { std::string endpoint = get_ip_port(); std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; - b_rpc_service = distributed::HeterServer::GetInstance(); + std::shared_ptr heter_server_ptr_ = + distributed::HeterServer::GetInstance(); std::thread 
server_thread(StartSendAndRecvServer, endpoint); - b_rpc_service->WaitServerReady(); + heter_server_ptr_->WaitServerReady(); using MicroScope = std::unordered_map>>; using MiniScope = std::unordered_map; @@ -223,8 +223,8 @@ TEST(SENDANDRECV, CPU) { (*micro_scope).push_back(micro_scope_0); (*micro_scope).push_back(micro_scope_1); (*micro_scopes)[0] = micro_scope; - b_rpc_service->SetMicroBatchScopes(micro_scopes); - b_rpc_service->SetMiniBatchScopes(mini_scopes); + heter_server_ptr_->SetMicroBatchScopes(micro_scopes); + heter_server_ptr_->SetMiniBatchScopes(mini_scopes); using TaskQueue = std::unordered_map>>(); - b_rpc_service->SetTaskQueue(task_queue_); + heter_server_ptr_->SetTaskQueue(task_queue_); LOG(INFO) << "before HeterClient::GetInstance"; - distributed::HeterClient* rpc_client = + distributed::HeterClient* heter_client_ptr_ = distributed::HeterClient::GetInstance({endpoint}, {previous_endpoint}, 0) .get(); - PADDLE_ENFORCE_NE(rpc_client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; platform::CPUDeviceContext ctx(place); @@ -262,8 +258,8 @@ TEST(SENDANDRECV, CPU) { std::vector recv_var = {}; LOG(INFO) << "before SendAndRecvAsync"; - rpc_client->SendAndRecvAsync(ctx, *scope, in_var_name, send_var, recv_var, - "forward"); + heter_client_ptr_->SendAndRecvAsync(ctx, *scope, in_var_name, send_var, + recv_var, "forward"); LOG(INFO) << "client wait for Pop"; auto task = (*task_queue_)[0]->Pop(); @@ -276,8 +272,8 @@ TEST(SENDANDRECV, CPU) { InitTensorsOnClient2((*micro_scope)[1], &place, rows_numel); LOG(INFO) << "before SendAndRecvAsync 2"; std::string in_var_name2("y"); - rpc_client->SendAndRecvAsync(ctx, *((*micro_scope)[1]), in_var_name2, - send_var, recv_var, "backward"); + heter_client_ptr_->SendAndRecvAsync(ctx, *((*micro_scope)[1]), in_var_name2, + send_var, recv_var, "backward"); LOG(INFO) << "after SendAndRecvAsync 2"; auto task2 = (*task_queue_)[0]->Pop(); @@ -286,8 +282,7 @@ TEST(SENDANDRECV, CPU) { platform::errors::InvalidArgument( "Recv message and Send message name not match, Check your Code")); - rpc_client->FinalizeWorker(); - b_rpc_service->Stop(); + heter_server_ptr_->Stop(); LOG(INFO) << "end server Stop"; server_thread.join(); LOG(INFO) << "end server thread join"; diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc old mode 100644 new mode 100755 index 26da0d3696fdf..7c25d38d1ebad --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -36,8 +36,6 @@ using VarMsg = ::paddle::distributed::VariableMessage; USE_OP_ITSELF(scale); USE_OP(send_and_recv); -std::shared_ptr b_rpc_service; - std::string get_ip_port() { std::mt19937 rng; rng.seed(std::random_device()()); @@ -148,14 +146,15 @@ void StartSendAndRecvServer(std::string endpoint) { InitTensorsOnServer(&scope, &place, 10); LOG(INFO) << "end InitTensorsOnServer"; - std::shared_ptr b_req_handler; - b_req_handler.reset(new distributed::RequestSendAndRecvHandler()); + std::shared_ptr b_req_handler; + b_req_handler.reset(new distributed::SendAndRecvVariableHandler()); LOG(INFO) << "before SetDevCtx"; b_req_handler->SetDevCtx(&ctx); LOG(INFO) << "before SetScope"; b_req_handler->SetScope(&scope); LOG(INFO) << "before HeterServer::GetInstance"; - b_rpc_service = distributed::HeterServer::GetInstance(); + std::shared_ptr b_rpc_service = + 
distributed::HeterServer::GetInstance(); b_rpc_service->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; b_rpc_service->RegisterServiceHandler( @@ -164,7 +163,7 @@ void StartSendAndRecvServer(std::string endpoint) { return b_req_handler->Handle(request, response, cntl); }); - b_rpc_service->SetRequestHandler(b_req_handler); + b_rpc_service->SetServiceHandler(b_req_handler); LOG(INFO) << "before HeterServer::RunServer"; RunServer(b_rpc_service); @@ -179,7 +178,8 @@ TEST(SENDANDRECV, CPU) { std::string endpoint = get_ip_port(); std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; - b_rpc_service = distributed::HeterServer::GetInstance(); + std::shared_ptr b_rpc_service = + distributed::HeterServer::GetInstance(); std::thread server_thread(StartSendAndRecvServer, endpoint); b_rpc_service->WaitServerReady(); using MicroScope = @@ -292,7 +292,6 @@ TEST(SENDANDRECV, CPU) { platform::errors::InvalidArgument( "Recv message and Send message name not match, Check your Code")); - rpc_client->FinalizeWorker(); b_rpc_service->Stop(); LOG(INFO) << "end server Stop"; server_thread.join(); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index a5e292a05e1ff..4054846460b07 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -167,8 +167,8 @@ void StartSendAndRecvServer(std::string endpoint) { InitTensorsOnServer(&scope, &place, 10); LOG(INFO) << "end InitTensorsOnServer"; - std::shared_ptr b_req_handler; - b_req_handler.reset(new distributed::RequestSendAndRecvHandler()); + std::shared_ptr b_req_handler; + b_req_handler.reset(new distributed::SendAndRecvVariableHandler()); LOG(INFO) << "before SetDevCtx"; b_req_handler->SetDevCtx(&ctx); LOG(INFO) << "before SetScope"; @@ -183,7 +183,7 @@ void StartSendAndRecvServer(std::string endpoint) { return b_req_handler->Handle(request, response, cntl); }); - b_rpc_service2->SetRequestHandler(b_req_handler); + b_rpc_service2->SetServiceHandler(b_req_handler); LOG(INFO) << "before HeterServer::RunServer"; RunServer(b_rpc_service2); @@ -228,13 +228,11 @@ TEST(SENDANDRECV, GPU) { b_rpc_service2->SetTaskQueue(task_queue_); LOG(INFO) << "before HeterClient::GetInstance"; - distributed::HeterClient* rpc_client = - distributed::HeterClient::GetInstance({endpoint}, {previous_endpoint}, 0) - .get(); - - PADDLE_ENFORCE_NE(rpc_client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); + std::shared_ptr heter_client_ptr_ = + distributed::HeterClient::GetInstance({endpoint}, {previous_endpoint}, 0); + if (heter_client_ptr_ == nullptr) { + LOG(ERROR) << "heter_client_ptr_ is null"; + } framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; @@ -316,7 +314,6 @@ TEST(SENDANDRECV, GPU) { platform::errors::InvalidArgument( "Recv message and Send message name not match, Check your Code")); - rpc_client->FinalizeWorker(); b_rpc_service2->Stop(); LOG(INFO) << "end server Stop"; server_thread.join(); diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc index 3c2fe8b9e5d9f..ddfbdbace054d 100644 --- a/paddle/fluid/operators/range_op.cc +++ b/paddle/fluid/operators/range_op.cc @@ -14,6 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/range_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -22,51 +26,6 @@ class RangeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("Start")) { - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ( - s_dims.size(), 1, - platform::errors::InvalidArgument( - "The dim of the shape of Input(Start) should be 1, but got %d", - s_dims.size())); - - PADDLE_ENFORCE_EQ(s_dims[0], 1, - platform::errors::InvalidArgument( - "The first dim of the shape of Input(Start) should " - "be 1, but got %d", - s_dims[0])); - } - if (ctx->HasInput("End")) { - auto e_dims = ctx->GetInputDim("End"); - PADDLE_ENFORCE_EQ( - e_dims.size(), 1, - platform::errors::InvalidArgument( - "The dim of the shape of Input(End) should be 1, but got %d", - e_dims.size())); - - PADDLE_ENFORCE_EQ(e_dims[0], 1, platform::errors::InvalidArgument( - "The first dim of the shape of " - "Input(End) should be 1, but got %d", - e_dims[0])); - } - if (ctx->HasInput("Step")) { - auto step_dims = ctx->GetInputDim("Step"); - PADDLE_ENFORCE_EQ( - step_dims.size(), 1, - platform::errors::InvalidArgument( - "The dim of the shape of Input(Step) should be 1, but got %d", - step_dims.size())); - - PADDLE_ENFORCE_EQ(step_dims[0], 1, - platform::errors::InvalidArgument( - "The first dim of the shape of Input(Step) should " - "be 1, but got %d", - step_dims[0])); - } - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -101,7 +60,7 @@ class RangeOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker); -REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel, - ops::CPURangeKernel, ops::CPURangeKernel, - ops::CPURangeKernel); +DECLARE_INFER_SHAPE_FUNCTOR(range, RangeInferMetaFunctor, + PD_INFER_META(phi::RangeInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker, + RangeInferMetaFunctor); diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu deleted file mode 100644 index 1b1d41ae4c5c7..0000000000000 --- a/paddle/fluid/operators/range_op.cu +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
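Note on the range_op changes above: the hand-written InferShape and the fluid CPU/CUDA kernels are removed, and the output length is now produced by phi::RangeInferMeta via DECLARE_INFER_SHAPE_FUNCTOR. A minimal standalone sketch of that size computation follows; RangeSize is an illustrative name (not a Paddle API) and assumes the floating-point branch of the helper the deleted kernels called.

```cpp
// Standalone sketch (not Paddle code): how many elements range(start, end,
// step) produces. phi::RangeInferMeta now owns this logic in the real code.
#include <cmath>
#include <cstdint>
#include <cstdio>

int64_t RangeSize(double start, double end, double step) {
  // Count of values in [start, end) visited with stride `step`.
  return static_cast<int64_t>(std::ceil(std::abs((end - start) / step)));
}

int main() {
  // range(0, 10, 3) -> {0, 3, 6, 9}
  std::printf("%lld\n", static_cast<long long>(RangeSize(0.0, 10.0, 3.0)));
  return 0;
}
```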
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/range_op.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -template -__global__ void RangeKernel(T start, T step, int64_t size, T* out) { - CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } -} - -template -class CUDARangeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); - - T start = GetValue(start_t); - T end = GetValue(end_t); - T step = GetValue(step_t); - - int64_t size = 0; - GetSize(start, end, step, &size); - out->Resize(phi::make_ddim({size})); - T* out_data = out->mutable_data(context.GetPlace()); - - auto stream = context.cuda_device_context().stream(); - int block = std::min(size, static_cast(256)); - int grid = (size + block - 1) / block; - RangeKernel<<>>(start, step, size, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel, - ops::CUDARangeKernel, - ops::CUDARangeKernel, - ops::CUDARangeKernel); diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index c7e91ba35dee1..ac32170d93957 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -30,7 +30,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(range); +USE_OP_ITSELF(range); USE_OP_DEVICE_KERNEL(range, NPU); template diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 4a4210845ca08..0befc873ed696 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ // only can include the headers in paddle/phi/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" @@ -354,7 +354,7 @@ class ReshapeKernel { auto *shape_tensor = ctx.HasInput("Shape") ? 
ctx.Input("Shape") : nullptr; - phi::ScalarArray pt_scalar_shape; + phi::IntArray pt_scalar_shape; if (list_new_shape_tensor.size() > 0) { // have shape tensor std::vector pt_vec_shape; @@ -369,7 +369,7 @@ class ReshapeKernel { pt_vec_shape.push_back(*tensor); } } - pt_scalar_shape = phi::ScalarArray(pt_vec_shape); + pt_scalar_shape = phi::IntArray(pt_vec_shape); } else if (shape_tensor) { phi::DenseTensor pt_shape; if (platform::is_gpu_place(shape_tensor->place()) || @@ -381,10 +381,10 @@ class ReshapeKernel { } else { pt_shape = *shape_tensor; } - pt_scalar_shape = phi::ScalarArray(pt_shape); + pt_scalar_shape = phi::IntArray(pt_shape); } else { auto &shape_attr = ctx.Attr>("shape"); - pt_scalar_shape = phi::ScalarArray(shape_attr); + pt_scalar_shape = phi::IntArray(shape_attr); } if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index 183f83dbae7c3..2dee4e889f739 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" @@ -21,9 +22,7 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; - using TensorList = std::vector; - template void reset_parameter_vector(const std::vector& raw_params_vec, const int& num_layers, const bool& is_bidirec, @@ -51,54 +50,89 @@ void reset_parameter_vector(const std::vector& raw_params_vec, } } +template +void RunLSTMLayer(const framework::ExecutionContext& ctx, int seq_len, + int batch_size, int xdim, int hidden_size, const T* x, T* y, + const T* init_h, const T* init_c, T* last_h, T* last_c, + int state_offset, const std::vector& seq_len_tensor, + const std::vector& param_list, T* i_f_g_o, T* c, + bool is_bidirect, int layer_idx, int offset) { + bool is_reverse = false; + if (is_bidirect) { + layer_idx = 2 * layer_idx + offset; + if (offset > 0) { + is_reverse = true; + } + } + auto w_x = param_list[0 + offset * 4]; + auto w_h = param_list[1 + offset * 4]; + auto b_x = param_list[2 + offset * 4]; + auto b_h = param_list[3 + offset * 4]; + + auto h_0 = init_h + layer_idx * state_offset; + auto c_0 = init_c + layer_idx * state_offset; + auto last_h_ptr = last_h + layer_idx * state_offset; + auto last_c_ptr = last_c + layer_idx * state_offset; + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_train( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, + reinterpret_cast(y), reinterpret_cast(last_h_ptr), + reinterpret_cast(last_c_ptr), batch_size, xdim, hidden_size, seq_len, + seq_len_tensor, is_reverse, nullptr, nullptr, nullptr, nullptr, + reinterpret_cast(i_f_g_o), reinterpret_cast(c), + xpu::Activation_t::TANH, xpu::Activation_t::SIGMOID); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "lstm_train"); +} + template class RnnXPUKernel : public framework::OpKernel { public: 
void Compute(const framework::ExecutionContext& ctx) const override { + // Input auto* input = ctx.Input("Input"); auto pre_state = ctx.MultiInput("PreState"); auto weight_list = ctx.MultiInput("WeightList"); + bool has_seq_length = ctx.HasInput("SequenceLength"); + // Output auto state = ctx.MultiOutput("State"); auto* output = ctx.Output("Out"); + auto* dropout_mask = ctx.Output("DropoutState"); auto* reserve_data = ctx.Output("Reserve"); + // Attrbutes const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); const int& hidden_size = ctx.Attr("hidden_size"); const std::string& mode = ctx.Attr("mode"); - bool has_seq_length = ctx.HasInput("SequenceLength"); const Tensor* sequence_length = nullptr; if (has_seq_length) { sequence_length = ctx.Input("SequenceLength"); } + if (dropout_mask->IsInitialized()) { + if (dropout_mask->numel() != output->numel()) dropout_mask->clear(); + } + dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); + PADDLE_ENFORCE_EQ( mode, "LSTM", platform::errors::InvalidArgument( "XPU only support LSTM mode now, current mode is %s", mode)); - PADDLE_ENFORCE_EQ(is_bidirec, false, - platform::errors::InvalidArgument( - "XPU only support unidirectional LSTM now")); - - PADDLE_ENFORCE_EQ( - num_layers, 1, - platform::errors::InvalidArgument( - "XPU only support 1 layer LSTM now, current layer num is %s", - num_layers)); - auto init_h = pre_state[0]; auto init_c = pre_state[1]; auto last_h = state[0]; auto last_c = state[1]; // check shape - int seq_len = input->dims()[0]; - int batch_size = input->dims()[1]; - int input_dim = input->dims()[2]; + const int& seq_len = input->dims()[0]; // time_step + const int& batch_size = input->dims()[1]; + const int& input_dim = input->dims()[2]; + const int& direction_num = is_bidirec ? 
2 : 1; PADDLE_ENFORCE_EQ( - init_h->dims()[0], num_layers, + init_h->dims()[0], num_layers * direction_num, platform::errors::InvalidArgument("The num_layers of in RNN layer must" " be the same as first dim of init " "hidden, but received num_layers:%d," @@ -106,13 +140,13 @@ class RnnXPUKernel : public framework::OpKernel { num_layers, init_h->dims()[0])); PADDLE_ENFORCE_EQ( - init_c->dims()[0], num_layers, + init_c->dims()[0], num_layers * direction_num, platform::errors::InvalidArgument( "The num_layers of in RNN layer must" " be the same as first dim of cell state hidden, but received" " num_layers:%d, dim:%d", num_layers, init_c->dims()[0])); - + // weightlist std::vector> parameter_lists; parameter_lists.resize(num_layers); reset_parameter_vector(weight_list, num_layers, is_bidirec, @@ -122,41 +156,106 @@ class RnnXPUKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); last_h->mutable_data(ctx.GetPlace()); last_c->mutable_data(ctx.GetPlace()); - reserve_data->Resize({seq_len * batch_size * hidden_size * 5}); - reserve_data->mutable_data(ctx.GetPlace()); + reserve_data->Resize( + {num_layers * direction_num * seq_len * batch_size * hidden_size * 5}); + reserve_data->mutable_data(ctx.GetPlace()); + Tensor internal_output_1_tensor, internal_output_2_tensor; + T* internal_output_1_ptr = nullptr; + T* internal_output_2_ptr = nullptr; + if (num_layers >= 2) { + internal_output_1_tensor.Resize(output->dims()); + internal_output_1_ptr = + internal_output_1_tensor.mutable_data(ctx.GetPlace()); + } + if (num_layers >= 3) { + internal_output_2_tensor.Resize(output->dims()); + internal_output_2_ptr = + internal_output_2_tensor.mutable_data(ctx.GetPlace()); + } // get ptr from tensor auto x = input->data(); - auto h_0 = init_h->data(); - auto c_0 = init_c->data(); - auto w_x = parameter_lists[0][0]; - auto w_h = parameter_lists[0][1]; - auto b_x = parameter_lists[0][2]; - auto b_h = parameter_lists[0][3]; + auto init_h_ptr = init_h->data(); + auto init_c_ptr = init_c->data(); auto y = output->data(); auto last_h_ptr = last_h->data(); auto last_c_ptr = last_c->data(); auto i_f_g_o = reserve_data->data(); - auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + auto c = + i_f_g_o + + num_layers * direction_num * seq_len * batch_size * hidden_size * 4; std::vector seq_len_tensor(batch_size, seq_len); if (has_seq_length) { seq_len_tensor = operators::GetDataFromTensor(sequence_length); } - // run kernel auto& dev_ctx = ctx.template device_context(); - int r = xpu::lstm_train( - dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, - (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, - reinterpret_cast(y), reinterpret_cast(last_h_ptr), - reinterpret_cast(last_c_ptr), batch_size, input_dim, hidden_size, - seq_len, seq_len_tensor, nullptr, nullptr, nullptr, nullptr, - reinterpret_cast(i_f_g_o), reinterpret_cast(c)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("RnnXPU(lstm) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + int state_offset = pre_state[0]->dims()[1] * pre_state[0]->dims()[2]; + + for (int i = 0; i < num_layers; i++) { + const T* cur_input_ptr = nullptr; + int cur_xdim = -1; + i_f_g_o += i * direction_num * seq_len * batch_size * hidden_size * 4; + c += i * direction_num * seq_len * batch_size * hidden_size; + + if (i == 0) { + cur_input_ptr = x; + cur_xdim = input_dim; + } else if (i % 2 != 0) { + cur_input_ptr = internal_output_1_ptr; + cur_xdim = is_bidirec ? 
2 * hidden_size : hidden_size; + } else { + cur_input_ptr = internal_output_2_ptr; + cur_xdim = is_bidirec ? 2 * hidden_size : hidden_size; + } + + T* cur_output_ptr = nullptr; + if (i == num_layers - 1) { + cur_output_ptr = y; + } else if (i % 2 != 0) { + cur_output_ptr = internal_output_2_ptr; + } else { + cur_output_ptr = internal_output_1_ptr; + } + + if (is_bidirec) { + std::vector output_vec(2); + std::vector output_ptr_vec(2); + for (int k = 0; k < 2; ++k) { + output_vec[k].Resize({seq_len, batch_size, output->dims()[2] / 2}); + output_ptr_vec[k] = output_vec[k].mutable_data(ctx.GetPlace()); + } + RunLSTMLayer( + ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, + output_ptr_vec[0], init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, + state_offset, seq_len_tensor, parameter_lists[i], i_f_g_o, c, + is_bidirec, i, 0); + + T* bw_i_f_g_o = i_f_g_o + seq_len * batch_size * hidden_size * 4; + T* bw_c = c + seq_len * batch_size * hidden_size; + RunLSTMLayer( + ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, + output_ptr_vec[1], init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, + state_offset, seq_len_tensor, parameter_lists[i], bw_i_f_g_o, bw_c, + is_bidirec, i, 1); + + // concat + int r = xpu::concat( + dev_ctx.x_context(), {output_ptr_vec[0], output_ptr_vec[1]}, + cur_output_ptr, {{seq_len, batch_size, hidden_size}, + {seq_len, batch_size, hidden_size}}, + 2); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat"); + xpu_wait(dev_ctx.x_context()->xpu_stream); + } else { + RunLSTMLayer( + ctx, seq_len, batch_size, cur_xdim, hidden_size, cur_input_ptr, + cur_output_ptr, init_h_ptr, init_c_ptr, last_h_ptr, last_c_ptr, + state_offset, seq_len_tensor, parameter_lists[i], i_f_g_o, c, + is_bidirec, i, 0); + } + } } }; @@ -221,7 +320,6 @@ class RnnXPUGradKernel : public framework::OpKernel { int seq_len = input->dims()[0]; int batch_size = input->dims()[1]; int input_dim = input->dims()[2]; - PADDLE_ENFORCE_EQ( init_h->dims()[0], num_layers, platform::errors::InvalidArgument("The num_layers of in RNN layer must" diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 57064301d7afb..976c10d0f433f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -58,7 +58,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { phi::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxCUDNNFunctor()( + math::SoftmaxCUDNNFunctor()( ctx.template device_context(), &x_i, &out_i); } @@ -93,7 +93,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradCUDNNFunctor()( + math::SoftmaxGradCUDNNFunctor()( ctx.template device_context(), &out_i, &out_grad_i, &x_grad_i); } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 6f0881e9fc98f..22b592c1eb62a 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
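Note on the rnn_op_xpu.cc rework above: Reserve is now sized for every layer and direction, holding four gate buffers (i, f, g, o) followed by one cell-state buffer per layer/direction. The sketch below only restates that sizing; ReserveNumel is an illustrative helper, not a Paddle API.

```cpp
// Sketch of the Reserve buffer sizing used by the reworked XPU RNN kernel:
// factor 5 = 4 gate buffers (i_f_g_o) + 1 cell-state buffer (c).
#include <cstdint>
#include <cstdio>

int64_t ReserveNumel(int num_layers, int direction_num, int seq_len,
                     int batch_size, int hidden_size) {
  return static_cast<int64_t>(num_layers) * direction_num * seq_len *
         batch_size * hidden_size * 5;  // 4 (i_f_g_o) + 1 (c)
}

int main() {
  // 2-layer bidirectional LSTM, seq_len=16, batch=8, hidden=32.
  std::printf("%lld\n",
              static_cast<long long>(ReserveNumel(2, 2, 16, 8, 32)));
  return 0;
}
```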
*/ -#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -335,12 +336,6 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, REGISTER_OPERATOR(softmax_with_cross_entropy_grad, ops::SoftmaxWithCrossEntropyOpGrad, ops::SoftmaxWithCrossEntropyGradInplaceInferer); -REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyKernel, - ops::SoftmaxWithCrossEntropyKernel); -REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradKernel, - ops::SoftmaxWithCrossEntropyGradKernel); REGISTER_OP_VERSION(softmax_with_cross_entropy) #if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h deleted file mode 100644 index 4b875cbf5841f..0000000000000 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ /dev/null @@ -1,318 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct SoftmaxWithCrossEntropyFunctor { - public: - SoftmaxWithCrossEntropyFunctor(const framework::ExecutionContext& context, - const framework::Tensor& labels, - const bool soft_label, const Visitor& visitor) - : context_(context), - labels_(labels), - soft_label_(soft_label), - visitor_(visitor) {} - - template - void apply() const { - visitor_.template Apply(context_, labels_, soft_label_); - } - - private: - const framework::ExecutionContext& context_; - const framework::Tensor& labels_; - const bool soft_label_; - const Visitor& visitor_; -}; - -template -static void RunSoftmaxWithCrossEntropyFunctor( - const framework::ExecutionContext& context, const Visitor& visitor) { - const auto* labels = context.Input("Label"); - const bool soft_label = context.Attr("soft_label"); - SoftmaxWithCrossEntropyFunctor functor(context, *labels, - soft_label, visitor); - auto dtype = framework::TransToProtoVarType(labels->dtype()); - if (soft_label) { - PADDLE_ENFORCE_EQ( - dtype, framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument("The Input(Label) should be with the " - "same data type as Input(Logits).")); - functor.template apply(); - } else { - framework::VisitIntDataType(dtype, functor); - } -} - -template -class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) 
const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(context.GetPlace()), true, - platform::errors::Unimplemented("This kernel only runs on CPU.")); - const bool use_softmax = context.Attr("use_softmax"); - const Tensor* labels = context.Input("Label"); - const bool soft_label = context.Attr("soft_label"); - - // do not with softmax op, and input is softmax - if (!use_softmax) { - const Tensor* softmax = context.Input("Logits"); - Tensor* softmax_out = context.Output("Softmax"); - Tensor* loss = context.Output("Loss"); - const int rank = softmax->dims().size(); - const int axis = - phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = softmax->dims()[axis]; - - PADDLE_ENFORCE_GT( - axis_dim, 0, - platform::errors::InvalidArgument( - "The axis dimention should be larger than 0, but received " - "axis dimention is %d.", - axis_dim)); - - softmax_out->mutable_data(context.GetPlace()); - loss->mutable_data(context.GetPlace()); - - const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); - - PADDLE_ENFORCE_GT( - n, 0, platform::errors::InvalidArgument( - "The size of axis should be larger than 0, but received " - "SizeToAxis of softmax is %d.", - n)); - - const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); - - Tensor softmax_2d, labels_2d, loss_2d, softmax_out_2d; - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n}); - loss_2d.ShareDataWith(*loss).Resize({n, d / axis_dim}); - softmax_out_2d.ShareDataWith(*softmax_out).Resize({n, d}); - - auto& dev_ctx = - context.template device_context(); - - math::CrossEntropyFunctor()( - dev_ctx, &loss_2d, &softmax_2d, &labels_2d, soft_label, - context.Attr("ignore_index"), axis_dim); - - // cause of input is softmax - // copy to output softmax, directly - framework::TensorCopy(*softmax, context.GetPlace(), - context.device_context(), softmax_out); - - return; - } - - const Tensor* logits = context.Input("Logits"); - Tensor* softmax = context.Output("Softmax"); - Tensor* loss = context.Output("Loss"); - - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = logits->dims()[axis]; - PADDLE_ENFORCE_GT( - axis_dim, 0, - platform::errors::InvalidArgument( - "The axis dimention should be larger than 0, but received " - "axis dimention is %d.", - axis_dim)); - - softmax->mutable_data(context.GetPlace()); - loss->mutable_data(context.GetPlace()); - - const int n = phi::funcs::SizeToAxis(axis, logits->dims()); - PADDLE_ENFORCE_GT( - n, 0, platform::errors::InvalidArgument( - "The size of axis should be larger than 0, but received " - "SizeToAxis of logits is %d.", - n)); - - const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); - Tensor logits_2d, softmax_2d, labels_2d, loss_2d; - logits_2d.ShareDataWith(*logits).Resize({n, d}); - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n}); - loss_2d.ShareDataWith(*loss).Resize({n, d / axis_dim}); - - auto& dev_ctx = - context.template device_context(); - math::SoftmaxFunctor()( - dev_ctx, axis_dim, &logits_2d, &softmax_2d); - math::CrossEntropyFunctor()( - dev_ctx, &loss_2d, &softmax_2d, &labels_2d, soft_label, - context.Attr("ignore_index"), axis_dim); - } -}; - -template -class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - 
RunSoftmaxWithCrossEntropyFunctor(context, *this); - } - - template - static void Apply(const framework::ExecutionContext& context, - const framework::Tensor& labels, const bool soft_label) { - const Tensor* out_grad = - context.Input(framework::GradVarName("Loss")); - Tensor* logit_grad = - context.Output(framework::GradVarName("Logits")); - const Tensor* softmax = context.Input("Softmax"); - const bool use_softmax = context.Attr("use_softmax"); - if (logit_grad != softmax || !use_softmax) { - framework::TensorCopy(*softmax, context.GetPlace(), - context.device_context(), logit_grad); - } - auto ignore_index = context.Attr("ignore_index"); - - const int rank = logit_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = logit_grad->dims()[axis]; - PADDLE_ENFORCE_GT( - axis_dim, 0, - platform::errors::InvalidArgument( - "The axis dimention should be larger than 0, but received " - "axis dimention is %d.", - axis_dim)); - - const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); - PADDLE_ENFORCE_GT( - n, 0, platform::errors::InvalidArgument( - "The size of axis should be larger than 0, but received " - "SizeToAxis of logit_grad is %d.", - n)); - - const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); - Tensor logit_grad_2d, labels_2d, out_grad_2d; - logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); - labels_2d.ShareDataWith(labels).Resize({n, labels.numel() / n}); - out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim}); - auto out_grad_mat = framework::EigenMatrix::From(out_grad_2d); - auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); - auto& place = *context.template device_context() - .eigen_device(); - if (!use_softmax) { - // use_softmax step1 - if (soft_label) { - auto lbl_mat = framework::EigenMatrix::From(labels_2d); - logit_grad_mat.device(place) = - (-lbl_mat / logit_grad_mat); // for each sample ,i is sample id - logit_grad_mat.device(place) = - out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * - logit_grad_mat; - } else { - // use_softmax step2 - const auto* label_data = labels.template data(); - T* logit_grad_data = logit_grad->template data(); - const T* out_grad_data = out_grad->template data(); - const int remain = d / axis_dim; - for (int i = 0; i < n; ++i) { // for each sample_1_dim - for (int j = 0; j < remain; j++) { // for each sample_other_dims - int idx = i * remain + j; // this sample's label_idx. 
for 1d case, - // remain=1 and j=0, so, idx = i - auto lbl = static_cast(label_data[idx]); - if (lbl == ignore_index) { - for (int k = 0; k < axis_dim; ++k) { // for each class id's label - logit_grad_data[i * d + k * remain + j] = 0; - } - } else { - // only for this sample's label_idx, the label is 1, others is 0, - // so, only compute this label_idx's class - logit_grad_data[i * d + lbl * remain + j] = - (-1 / logit_grad_data[i * d + lbl * remain + j]) * - out_grad_data[idx]; - for (int k = 0; k < axis_dim; ++k) { // for each class id's label - if (k != - label_data[idx]) { // label_data[idx]: this sample's label - logit_grad_data[i * d + k * remain + j] = 0; - } - } - } - } - } - } - return; - } - // for use_softmax=False, continue - - if (soft_label) { - // when soft_label = True, ignore_index is not supported - auto lbl_mat = framework::EigenMatrix::From(labels_2d); - logit_grad_mat.device(place) = - out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * - (logit_grad_mat - lbl_mat); // for each sample ,i is sample id - // 1) compute dy/dx by p_j - y_j or P-Y, where j is class id, - // P=logit_grad_mat[i] is all class's probs, Y=lbl_mat[i] is - // all class's labels - // 2) compute dy * dy/dx by Chain rule, dy=out_grad_mat[i] - // for high dims, e.g. (n,c) or (n,d1,...,dm, c), compute grad by matrix - // operation - - } else { - logit_grad_mat.device(place) = - logit_grad_mat * // element_wise multiply - out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)); - - const auto* label_data = labels.template data(); - T* logit_grad_data = logit_grad->template data(); - const T* out_grad_data = out_grad->template data(); - const int remain = d / axis_dim; - for (int i = 0; i < n; ++i) { // for each sample_1_dim - for (int j = 0; j < remain; j++) { // for each sample_other_dims - int idx = i * remain + j; // this sample's label_idx. for 1d case, - // remain=1 and j=0, so, idx = i - auto lbl = static_cast(label_data[idx]); - if (lbl == ignore_index) { - for (int k = 0; k < axis_dim; ++k) { // for each class id's label - logit_grad_data[i * d + k * remain + j] = 0; - } - } else { - // only for this sample's label_idx, the label is 1, others is 0, - // so, only compute this label_idx's class - // for 1d case, remain=1 and j=0, so, [i * d + label_data[idx] * - // remain + j] = [i * d + label_data[idx]] - // let idx_x = i * d + label_data[idx] * remain + j, - // logit_grad_data[idx_x] = logit_grad_data[idx_x] - - // out_grad_data[idx] - // note: logit_grad_mat = logit_grad_mat * out_grad_mat - // so: logit_grad_data[idx_x] = (logit_grad_data[idx_x] - 1) * - // out_grad_data[idx] - // means: dy/dp * dy= ( p - y ) * dy - - logit_grad_data[i * d + lbl * remain + j] -= out_grad_data[idx]; - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 34650c2e06245..7056bcd4f76bc 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
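Note on the deleted softmax_with_cross_entropy_op.h above: its CPU grad kernel computed, for hard labels on the use_softmax path, dLogits = (softmax - one_hot(label)) * dLoss, i.e. (p - y) * dy. A minimal standalone sketch of that formula is below; HardLabelGrad is an illustrative name, and the phi kernels now provide the real implementation.

```cpp
// Minimal sketch (not Paddle code) of the hard-label gradient:
// grad[k] = (p[k] - y[k]) * dloss, with y one-hot at `label`.
#include <cstdio>
#include <vector>

std::vector<float> HardLabelGrad(const std::vector<float>& softmax, int label,
                                 float dloss) {
  std::vector<float> grad(softmax.size());
  for (int k = 0; k < static_cast<int>(softmax.size()); ++k) {
    const float y = (k == label) ? 1.0f : 0.0f;
    grad[k] = (softmax[k] - y) * dloss;  // (p - y) * dy
  }
  return grad;
}

int main() {
  const auto g = HardLabelGrad({0.7f, 0.2f, 0.1f}, /*label=*/0, /*dloss=*/1.0f);
  std::printf("%.2f %.2f %.2f\n", g[0], g[1], g[2]);  // -0.30 0.20 0.10
  return 0;
}
```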
*/ -#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index 1f1fbea090c13..f64d9e022a1ad 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" - #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index b5514525f5981..c07467a9b0ba3 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -12,20 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #ifdef PADDLE_WITH_XPU #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" #include "xpu/refactor/math.h" #include "xpu/refactor/nn.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index af03ed668e8c8..a9fa78c4e4943 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/stack_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace plat = paddle::platform; namespace ops = paddle::operators; @@ -26,52 +29,6 @@ class StackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0, - platform::errors::InvalidArgument( - "Number of Inputs(X) must be larger than 0, but" - " received value is:%d.", - ctx->Inputs("X").size())); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true, - platform::errors::InvalidArgument( - "Output(Y) of stack_op should not be null.")); - - auto input_dims = ctx->GetInputsDim("X"); - for (size_t i = 1; i < input_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], - platform::errors::InvalidArgument( - "Dims of all Inputs(X) must be the same, but" - " received input %d dim is:%d not equal to input 0" - " dim:%d.", - i, input_dims[i], input_dims[0])); - } - - // Only lod of X[0] would be shared with Y - ctx->ShareLoD("X", /*->*/ "Y"); - - int axis = ctx->Attrs().Get("axis"); - int rank = input_dims[0].size(); - PADDLE_ENFORCE_GE( - axis, -(rank + 1), - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " - "but received axis is:%d.", - rank, axis)); - - PADDLE_ENFORCE_LT( - axis, rank + 1, - platform::errors::InvalidArgument( - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " - "but received axis is:%d", - rank, axis)); - - if (axis < 0) axis += (rank + 1); - - auto vec = phi::vectorize(input_dims[0]); - vec.insert(vec.begin() + axis, input_dims.size()); - ctx->SetOutputDim("Y", phi::make_ddim(vec)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -168,21 +125,10 @@ class StackGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(stack, StackInferMetaFunctor, + PD_INFER_META(phi::StackInferMeta)); REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, ops::StackGradOpMaker, - ops::StackGradOpMaker); + ops::StackGradOpMaker, + StackInferMetaFunctor); REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); - -REGISTER_OP_CPU_KERNEL( - stack, ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel, - ops::StackKernel); - -REGISTER_OP_CPU_KERNEL( - stack_grad, ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel, - ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu deleted file mode 100644 index a56dd6aef4f66..0000000000000 --- a/paddle/fluid/operators/stack_op.cu +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -namespace paddle { -namespace operators { - -template -__global__ void StackCUDAKernel(T** input_ptrs, int split_size, int rows, - int cols, T* __restrict__ output) { - IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x; - - for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) { - IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y; - - IntType split = grid_x / split_size; - const T* input_ptr = input_ptrs[split]; - IntType col_offset = grid_x % split_size; -#pragma unroll - for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) { - output[grid_y * cols + grid_x] = - input_ptr[grid_y * split_size + col_offset]; - } - } -} - -template -class StackGPUKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto* y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) { - x_datas[i] = x[i]->data(); - } - - auto& dev_ctx = ctx.template device_context(); - auto tmp_x_data = memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); - memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), platform::CPUPlace(), - reinterpret_cast(x_datas.data()), - x_datas.size() * sizeof(T*), dev_ctx.stream()); - - // Split x dim from axis to matrix - int x_row = 1, x_col = 1; - for (int i = 0; i < axis; ++i) { - x_row *= x[0]->dims()[i]; - } - x_col = x[0]->numel() / x_row; - int out_col = x_col * n; - - auto config = GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); - - if (y->numel() < std::numeric_limits::max()) { - StackCUDAKernel<<>>( - reinterpret_cast(tmp_x_data->ptr()), x_col, x_row, out_col, - y_data); - } else { - StackCUDAKernel<<>>( - reinterpret_cast(tmp_x_data->ptr()), x_col, x_row, out_col, - y_data); - } - } -}; - -template -__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, - int pre_dim_size, int split_dim_size, - int suf_dim_size, int num_split, - T** output_ptrs) { - assert(blockDim.y == 1); - assert(blockDim.z == 1); - // In this case they are equal - assert(split_dim_size % num_split == 0); - - IntType size = pre_dim_size * split_dim_size * suf_dim_size; - IntType each_dim_size = split_dim_size / num_split; - - for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; - offset += blockDim.x * gridDim.x) { - IntType i = offset / (split_dim_size * suf_dim_size); - IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; - IntType k = offset % suf_dim_size; - - T* output = output_ptrs[j / each_dim_size]; - if (output == nullptr) { - return; - } - IntType output_ind = i * each_dim_size * suf_dim_size + - (j % each_dim_size) * suf_dim_size + k; - *(output + output_ind) = input[offset]; - } -} - -template -class StackGradGPUKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = 
ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - - int n = dy->dims()[axis]; - PADDLE_ENFORCE_EQ(n, dx.size(), - platform::errors::InvalidArgument( - "Output dx size should be equal to n, but" - " received n is:%d dx size is:%d.", - n, dx.size())); - - // dx is output, so save each data address, then copy each dy into dx_data - std::vector outputs(n); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - for (size_t j = 0; j < dx.size(); ++j) { - if (dx[j] == nullptr) { - outputs[j] = nullptr; - } - if (out_var_names[j] != framework::kEmptyVarName && - dx[j]->numel() != 0UL) { - T* ptr = dx[j]->mutable_data(ctx.GetPlace()); - outputs[j] = ptr; - } else { - outputs[j] = nullptr; - } - } - auto dy_data = dy->data(); - // each dx should have same shape - int dy_pre = 1, dy_suf = 1; - auto dy_dims = dy->dims(); - int split_dim = n; - for (int i = 0; i < axis; ++i) { - dy_pre *= dy_dims[i]; - } - dy_suf = dy->numel() / (split_dim * dy_pre); - - auto& dev_ctx = ctx.template device_context(); - auto tmp_out_data = memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); - memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs.data()), - outputs.size() * sizeof(T*), dev_ctx.stream()); - - auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf); - - if (dy->numel() < std::numeric_limits::max()) { - UnStackHelperCUDAKernel< - T, int32_t><<>>( - dy_data, dy_pre, split_dim, dy_suf, split_dim, - reinterpret_cast(tmp_out_data->ptr())); - } else { - UnStackHelperCUDAKernel< - T, int64_t><<>>( - dy_data, dy_pre, split_dim, dy_suf, split_dim, - reinterpret_cast(tmp_out_data->ptr())); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(stack, ops::StackGPUKernel, - ops::StackGPUKernel, ops::StackGPUKernel, - ops::StackGPUKernel, - ops::StackGPUKernel, - ops::StackGPUKernel); - -REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel, - ops::StackGradGPUKernel); diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h deleted file mode 100644 index 03d5324528930..0000000000000 --- a/paddle/fluid/operators/stack_op.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct StackGradFunctor { - HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) - : dx_(dx), dy_(dy), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; - } - - private: - VecDxType dx_; - const T *dy_; - int n_; - int post_; -}; - -template -static inline void StackGradFunctorForRange(const DeviceContext &ctx, - const VecDxType &dx, const T *dy, - int total_num, int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackGradFunctor(dx, dy, n, post)); -} - -template -class StackKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput("X"); - auto *y = ctx.Output("Y"); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto *y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); - - int pre = 1, post = 1; - auto &dim = x[0]->dims(); - for (auto i = 0; i < axis; ++i) pre *= dim[i]; - for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; - - auto x_data_arr = x_datas.data(); - - size_t x_offset = 0; - size_t y_offset = 0; - for (int i = 0; i < pre; i++) { - for (int j = 0; j < n; j++) { - std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, - post * sizeof(T)); - y_offset += post; - } - x_offset += post; - } - } -}; - -template -class StackGradKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int n = dy->dims()[axis]; - std::vector dx_datas(n); // NOLINT - - for (int i = 0; i < n; i++) { - if (dx[i] == nullptr) { - dx_datas[i] = nullptr; - } else { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); - } - } - auto dy_data = dy->data(); - int pre = 1; - for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; - int total_num = dy->numel(); - int post = total_num / (n * pre); - auto &dev_ctx = ctx.template device_context(); - auto dx_data_arr = dx_datas.data(); - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 3a6e5b2aca4b8..9d4ef0ffa20e2 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
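Note on the deleted stack_op.h above: its CPU forward viewed each input as (pre, post) blocks and interleaved the n inputs along the new axis. The sketch below reproduces that copy pattern for illustration only; StackCPU is not a Paddle API, and phi's stack kernel is the actual replacement.

```cpp
// Standalone sketch of the old CPU stack forward: interleave n equally-shaped
// inputs, each flattened to pre*post elements, along a new axis.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

std::vector<float> StackCPU(const std::vector<const float*>& xs, int pre,
                            int post) {
  const int n = static_cast<int>(xs.size());
  std::vector<float> y(static_cast<std::size_t>(pre) * n * post);
  std::size_t y_off = 0, x_off = 0;
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      std::copy(xs[j] + x_off, xs[j] + x_off + post, y.data() + y_off);
      y_off += post;
    }
    x_off += post;
  }
  return y;
}

int main() {
  const float a[] = {1, 2, 3, 4}, b[] = {5, 6, 7, 8};  // two (2, 2) inputs
  const auto y = StackCPU({a, b}, /*pre=*/2, /*post=*/2);  // stack at axis 1
  for (float v : y) std::printf("%g ", v);  // 1 2 5 6 3 4 7 8
  std::printf("\n");
  return 0;
}
```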
*/ -#include "paddle/fluid/operators/stack_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index c5a20ed4d1c89..baaa2b4884ce3 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -13,9 +13,9 @@ // limitations under the License. #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/stack_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index 0ff7d654fc29d..6f092bbef067e 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -228,7 +228,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(StridedSliceOpGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(strided_slice, StridedSliceInferShape, - PD_INFER_META(phi::StridedSliceInferMeta)); + PD_INFER_META(phi::StridedSliceRawInferMeta)); REGISTER_OPERATOR(strided_slice, ops::StridedSliceOp, ops::StridedSliceOpMaker, ops::StridedSliceOpGradMaker, diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 9044a881b738d..5c103e088b559 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -13,7 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_op.h" -#include "paddle/fluid/framework/op_version_registry.h" +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,62 +29,54 @@ class UniqueOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - auto in_dims = ctx->GetInputDim("X"); - if (!ctx->Attrs().Get("is_sorted")) { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - PADDLE_ENFORCE_EQ(in_dims.size(), 1, - platform::errors::InvalidArgument( - "The Input(X) should be 1-D Tensor, " - "But now the dims of Input(X) is %d.", - in_dims.size())); - - ctx->SetOutputDim("Out", {-1}); - ctx->SetOutputDim("Index", in_dims); - return; - } bool return_index = ctx->Attrs().Get("return_index"); bool return_inverse = ctx->Attrs().Get("return_inverse"); bool return_counts = ctx->Attrs().Get("return_counts"); auto axis_vec = ctx->Attrs().Get>("axis"); + auto data_type = + static_cast(static_cast( + ctx->Attrs().Get("dtype"))); + + // Construct MetaTensor for InferMeta Func + using CompatMetaTensor = framework::CompatMetaTensor; + CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime()); + CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime()); + std::unique_ptr indices(nullptr); + std::unique_ptr index(nullptr); + std::unique_ptr counts(nullptr); if (return_index) { OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); + indices = + std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Indices")[0], 
ctx->IsRuntime()))); } if (return_inverse) { OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + index = std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); } if (return_counts) { OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); + counts = std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime()))); } - - if (axis_vec.empty()) { - ctx->SetOutputDim("Out", {-1}); - if (return_inverse) { - ctx->SetOutputDim("Index", {phi::product(in_dims)}); - } + bool is_sorted = ctx->Attrs().Get("is_sorted"); + if (is_sorted) { + phi::UniqueInferMeta(x, return_index, return_inverse, return_counts, + axis_vec, data_type, &out, indices.get(), + index.get(), counts.get()); } else { - int axis = axis_vec[0]; - if (axis < 0) { - axis += in_dims.size(); - } - PADDLE_ENFORCE_LT( - axis, in_dims.size(), - platform::errors::InvalidArgument("The axis(%d) should be less than " - "the dimension size(%d) of x.", - axis, in_dims.size())); - auto out_dims = in_dims; - out_dims[axis] = -1; - ctx->SetOutputDim("Out", out_dims); - if (return_inverse) { - ctx->SetOutputDim("Index", {in_dims[axis]}); + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); + if (index == nullptr) { + index = + std::move(std::unique_ptr(new CompatMetaTensor( + ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); } - } - if (return_index) { - ctx->SetOutputDim("Indices", {-1}); - } - if (return_counts) { - ctx->SetOutputDim("Counts", {-1}); + phi::UniqueRawInferMeta(x, return_index, return_inverse, return_counts, + axis_vec, data_type, is_sorted, &out, + indices.get(), index.get(), counts.get()); } } @@ -152,40 +148,5 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker); -REGISTER_OP_CPU_KERNEL( - unique, ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel); -REGISTER_OP_VERSION(unique) - .AddCheckpoint( - R"ROC( - Upgrade unique, add 2 outputs [Indices, Counts] and 5 attribute - [return_index, return_inverse, return_counts, axis, is_sorted]. - )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewOutput("Indices", - "The indices of the input tensor that result in the " - "unique tensor.") - .NewOutput("Counts", "The counts for each unique element.") - .NewAttr("return_index", - "If True, also return the indices of the input" - " tensor that result in the unique Tensor.", - false) - .NewAttr("return_inverse", - "If True, also return the indices for where elements" - " in the original input ended up in the returned unique " - "tensor.", - false) - .NewAttr("return_counts", - "If True, also return the counts for each unique element.", - false) - .NewAttr("axis", - "The axis to apply unique. If None, the input will be " - "flattened.", - std::vector{}) - .NewAttr("is_sorted", - "If True, the unique elements of X are in ascending order." - "Otherwise, the unique elements are not sorted.", - false)); diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu deleted file mode 100644 index 871274c999c8b..0000000000000 --- a/paddle/fluid/operators/unique_op.cu +++ /dev/null @@ -1,474 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() -#include "paddle/fluid/operators/unique_op.h" // TransComute() - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Binary function 'less than' -template -struct LessThan { - int col; - const InT* in_trans_data; - - LessThan(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs < rhs) { - return true; - } else if (lhs > rhs) { - return false; - } - } - return false; - } -}; - -// Binary function 'equal_to' -template -struct BinaryEqual { - int64_t col; - const InT* in_trans_data; - - BinaryEqual(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs != rhs) { - return false; - } - } - return true; - } -}; - -// Binary function 'not_equal_to' -template -struct BinaryNotEqual { - int64_t col; - const InT* in_trans_data; - - BinaryNotEqual(int64_t _col, const InT* _in_trans_data) - : col(_col), in_trans_data(_in_trans_data) {} - - __device__ bool operator()(int64_t a, int64_t b) const { - for (int64_t i = 0; i < col; ++i) { - InT lhs = in_trans_data[i + a * col]; - InT rhs = in_trans_data[i + b * col]; - if (lhs != rhs) { - return true; - } - } - return false; - } -}; - -// index_select() function for Tensor -template -void IndexSelect(const framework::ExecutionContext& context, - const Tensor& input, const Tensor& index, Tensor* output, - int dim) { - auto input_dim = input.dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = output->dims(); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto input_width = slice_size * input_dim[dim]; - auto output_width = slice_size * output_dim[dim]; - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= input_dim[i]; - } - - auto index_size = index.dims()[0]; - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - std::vector out_vec(output->numel()); - - for (int i = 0; i < index_size; i++) { - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - input_dim[dim], index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], input_dim[dim], - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. 
Please check input " - "value.", - input_dim[dim], index_vec[i])); - } - - for (auto i = 0; i < outer_nums; i++) { - auto input_start_offset = i * input_width; - auto output_start_offset = i * output_width; - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_vec[j]; - for (auto k = 0; k < slice_size; k++) { - out_vec[output_start_offset + j * slice_size + k] = - input_vec[input_start_offset + index_value * slice_size + k]; - } - } - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(output_dim); -} - -// The core logic of computing Unique for a flattend Tensor -template -static void UniqueFlattendCUDATensor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t num_input) { - // 0. Prepration - Tensor in_hat; - framework::TensorCopy(in, context.GetPlace(), &in_hat); - auto in_data_hat = in_hat.mutable_data(context.GetPlace()); - - Tensor* sorted_indices = context.Output("Indices"); - sorted_indices->Resize(phi::make_ddim({num_input})); - auto sorted_indices_data = - sorted_indices->mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, sorted_indices_data, - sorted_indices_data + num_input); - thrust::sort_by_key(thrust::device, in_data_hat, in_data_hat + num_input, - sorted_indices_data); - - // 1. Calculate op result: 'out' - Tensor range; - range.Resize(phi::make_ddim({num_input + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, - range_data_ptr + num_input + 1); - framework::TensorCopy(in_hat, context.GetPlace(), out); - int num_out; - auto out_data = out->mutable_data(context.GetPlace()); - num_out = thrust::unique_by_key(thrust::device, out_data, - out_data + num_input, range_data_ptr, equal) - .first - - out_data; - out->Resize(phi::make_ddim({num_out})); - - // 3. Calculate inverse index: 'inverse' - if (return_inverse) { - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({num_input})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, in_data_hat, - in_data_hat + num_input, inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + num_input, inv_loc_data_ptr); - thrust::scatter(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + num_input, sorted_indices_data, - inverse_data); - } - - // 2. Calculate sorted index: 'sorted_indices' - if (return_index) { - Tensor indices; - indices.Resize(phi::make_ddim({num_input})); - auto indices_data_ptr = indices.mutable_data(context.GetPlace()); - thrust::copy(thrust::device, in_data_hat, in_data_hat + num_input, - indices_data_ptr); - thrust::unique_by_key(thrust::device, indices_data_ptr, - indices_data_ptr + num_input, sorted_indices_data, - equal); - sorted_indices->Resize(phi::make_ddim({num_out})); - } - - // 4. 
Calculate 'counts' - if (return_counts) { - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, range_data_ptr + 1, - range_data_ptr + num_out + 1, count_data); - } -} - -// The logic of compute unique with axis required, it's a little different -// from above function -template -static void ComputeUniqueDims(const framework::ExecutionContext& context, - Tensor* sorted_indices, - IndexT* sorted_indices_data, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t row) { - // 1. inverse indices: 'inverse' - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({row})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, sorted_indices_data, - sorted_indices_data + row, inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, - inv_loc_data_ptr + row, inv_loc_data_ptr); - thrust::scatter(thrust::device, inv_loc_data_ptr, inv_loc_data_ptr + row, - sorted_indices_data, inverse_data); - - // 2. sorted indices - Tensor range; - range.Resize(phi::make_ddim({row + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); - int num_out; - num_out = - thrust::unique_by_key(thrust::device, sorted_indices_data, - sorted_indices_data + row, range_data_ptr, equal) - .first - - sorted_indices_data; - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); - - // 3. counts: 'counts' - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - thrust::fill(thrust::device, count_data, count_data + row, 0); - thrust::adjacent_difference(thrust::device, range_data_ptr + 1, - range_data_ptr + row + 1, count_data); -} - -// Calculate unique when 'axis' is set -template -static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, int axis) { - // 1. Transpose & reshape - // Transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.cuda_device_context(); - TransCompute(in.dims().size(), // num of dims - dev_ctx, // device - in, // original Tensor - &in_trans, // Tensor after reshape - permute); // index of axis - - // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); - in_trans.Resize(in_trans_flat_dims); - - // now 'in_trans' is 2D - int64_t col = in_trans.dims()[1]; - int64_t row = in_trans.dims()[0]; - const InT* in_trans_data = in_trans.data(); - - Tensor* sorted_indices = context.Output("Indices"); - sorted_indices->Resize(phi::make_ddim({row})); - auto sorted_indices_data = - sorted_indices->mutable_data(context.GetPlace()); - - // 2. Calculate 'sorted_indices', 'inverse', 'counts' - // Init index and sort - thrust::sequence(thrust::device, sorted_indices_data, - sorted_indices_data + row); - thrust::sort(thrust::device, sorted_indices_data, sorted_indices_data + row, - LessThan(col, in_trans_data)); - ComputeUniqueDims( - context, sorted_indices, sorted_indices_data, out, return_index, - return_inverse, return_counts, BinaryEqual(col, in_trans_data), - BinaryNotEqual(col, in_trans_data), row); - - // 3. Select indices and reshape back to get 'out' - Tensor out_trans; - std::vector out_trans_dims_vec = in_trans_dims_vec; - out_trans_dims_vec[0] = sorted_indices->numel(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); - - IndexSelect(context, in_trans, *sorted_indices, &out_trans, 0); - - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - std::vector out_trans_unbind = Unbind(out_trans); - math::ConcatFunctor concat_functor; - concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); - TransCompute(out_trans.dims().size(), dev_ctx, out_trans, - out, permute); -} - -// functor for processing a flattend Tensor -template -struct UniqueFlattendCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const bool return_index_; - const bool return_inverse_; - const bool return_counts_; - - UniqueFlattendCUDAFunctor(const framework::ExecutionContext& context, - const Tensor& in, Tensor* out, bool return_index, - bool return_inverse, bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_index_(return_index), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueFlattendCUDATensor( - ctx_, in_, out_, return_index_, return_inverse_, return_counts_, - thrust::equal_to(), thrust::not_equal_to(), in_.numel()); - } -}; - -// functor for processing a multi-dimentional Tensor -template -struct UniqueDimsCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const int axis_; - const bool return_index_; - const bool return_inverse_; - const bool return_counts_; - - UniqueDimsCUDAFunctor(const 
framework::ExecutionContext& context, - const Tensor& in, Tensor* out, const int axis, - bool return_index, bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - axis_(axis), - return_index_(return_index), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueDimsCUDATensor( - ctx_, in_, out_, return_index_, return_inverse_, return_counts_, axis_); - } -}; - -// Unique_op CUDA implementation. -template -class UniqueKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel() + 1, INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - - std::vector axis_vec = context.Attr>("axis"); - bool return_index = context.Attr("return_index"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - // if 'axis' is not required, flatten the Tensor. - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueFlattendCUDAFunctor( - context, *x, out, return_index, return_inverse, return_counts)); - } else { - // 'axis' is required. - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, UniqueDimsCUDAFunctor( - context, *x, out, axis, return_index, return_inverse, - return_counts)); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unique, ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel, - ops::UniqueKernel); diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 96320202b73fb..8c8684bf4b035 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unstack_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,43 +27,6 @@ namespace operators { class UnStackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "UnStack"); - int axis = ctx->Attrs().Get("axis"); - int num = ctx->Attrs().Get("num"); - auto x_dim = ctx->GetInputDim("X"); - int rank = x_dim.size(); - PADDLE_ENFORCE_GE(axis, -rank, - platform::errors::InvalidArgument( - "The attribute axis is out of range, it must be " - "inside [-rank, rank), where rank = %d", - rank)); - PADDLE_ENFORCE_LT(axis, rank, - platform::errors::InvalidArgument( - "The attribute axis is out of range, it must be " - "inside [-rank, rank), where rank = %d", - rank)); - if (axis < 0) axis += rank; - - PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), - platform::errors::InvalidArgument( - "Number of Outputs(Y) is wrong. Got %d , but it must " - "equal to attribute num which is %d.", - ctx->Outputs("Y").size(), static_cast(num))); - if (x_dim[axis] > 0) { - PADDLE_ENFORCE_EQ( - num, x_dim[axis], - platform::errors::InvalidArgument( - "The number of attribute num is not equal to the length of the " - "%d axis of Input(X). Expect %d but got %d.", - axis, x_dim[axis], num)); - } - auto vec = phi::vectorize(x_dim); - vec.erase(vec.begin() + axis); - ctx->SetOutputsDim("Y", std::vector( // NOLINT - x_dim[axis], phi::make_ddim(vec))); - } }; class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { @@ -141,20 +106,12 @@ class UnStackGradOp : public framework::OperatorWithKernel { namespace plat = paddle::platform; namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(unstack, UnStackInferMetaFunctor, + PD_INFER_META(phi::UnStackInferMeta)); + REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, ops::UnStackGradOpMaker, - ops::UnStackGradOpMaker); + ops::UnStackGradOpMaker, + UnStackInferMetaFunctor); REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp); - -REGISTER_OP_CPU_KERNEL(unstack, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel); - -REGISTER_OP_CPU_KERNEL(unstack_grad, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.cu b/paddle/fluid/operators/unstack_op.cu deleted file mode 100644 index b591898a4d7aa..0000000000000 --- a/paddle/fluid/operators/unstack_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unstack_op.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unstack, ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel, - ops::UnStackKernel); - -REGISTER_OP_CUDA_KERNEL( - unstack_grad, ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel, - ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h deleted file mode 100644 index 413470e3db5d4..0000000000000 --- a/paddle/fluid/operators/unstack_op.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#include -#endif - -namespace paddle { -namespace operators { - -template -struct StackFunctor { - HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) - : x_(x), y_(y), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - y_[idx] = x_[which_x][x_index]; - } - - private: - VecXType x_; - T *y_; - int n_; - int post_; -}; - -template -struct StackGradFunctor { - HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) - : dx_(dx), dy_(dy), n_(n), post_(post) {} - - HOSTDEVICE void operator()(int idx) { - int i = idx / (n_ * post_); - int which_x = idx / post_ - i * n_; - int x_index = i * post_ + idx % post_; - dx_[which_x][x_index] = dy_[idx]; - } - - private: - VecDxType dx_; - const T *dy_; - int n_; - int post_; -}; - -template -static inline void StackFunctorForRange(const DeviceContext &ctx, - const VecXType &x, T *y, int total_num, - int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackFunctor(x, y, n, post)); -} - -template -static inline void StackGradFunctorForRange(const DeviceContext &ctx, - const VecDxType &dx, const T *dy, - int total_num, int n, int post) { - platform::ForRange for_range(ctx, total_num); - for_range(StackGradFunctor(dx, dy, n, post)); -} - -template -class UnStackGradKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); - - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - - int n = static_cast(x.size()); - auto *y_data = y->mutable_data(ctx.GetPlace()); - std::vector x_datas(n); - for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); - - int pre = 1; - int post = 1; - auto &dim = x[0]->dims(); - for (auto i = 0; i < axis; ++i) pre *= dim[i]; - for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; - -#if 
defined(__NVCC__) || defined(__HIPCC__) - int total_num = pre * n * post; - auto &dev_ctx = ctx.template device_context(); - - thrust::device_vector device_x_vec(x_datas); - auto x_data_arr = device_x_vec.data().get(); - - StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); - - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#else - auto x_data_arr = x_datas.data(); - - size_t x_offset = 0; - size_t y_offset = 0; - for (int i = 0; i < pre; i++) { - for (int j = 0; j < n; j++) { - std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, - post * sizeof(T)); - y_offset += post; - } - x_offset += post; - } -#endif - } -}; - -template -class UnStackKernel : public framework::OpKernel { - using Tensor = framework::LoDTensor; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - - int n = dy->dims()[axis]; - std::vector dx_datas(n); // NOLINT - for (int i = 0; i < n; i++) { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); - } - auto dy_data = dy->data(); - if (dy->numel() == 0) return; - int pre = 1; - for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; - int total_num = dy->numel(); - int post = total_num / (n * pre); - - auto &dev_ctx = ctx.template device_context(); -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector device_dx_vec(dx_datas); - auto dx_data_arr = device_dx_vec.data().get(); -#else - auto dx_data_arr = dx_datas.data(); -#endif - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); -#if defined(__NVCC__) || defined(__HIPCC__) - // Wait() must be called because device_dx_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc index fb88566e3426c..c55ec1fcf9044 100644 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ b/paddle/fluid/operators/unstack_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unstack_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 5edab707e7e3a..897183f2cf589 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -295,6 +295,7 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 685e20aef2591..06d88be9bc8cc 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -433,7 +433,7 @@ GenerateOpFunctions() { std::map inplace_map; // `sum` op has duplicate input. Don't consider adding inplace strategy // for `sum` in temporary. - if (op_type != "sum" && infer_inplace) { + if (infer_inplace && !special_inplace_op_set.count(op_type)) { // Inplace OP: op_type_. // The inplace OP needs a new implementation method. auto in_to_outs = infer_inplace(true); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index af89861d151be..bee3e27a55167 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -950,9 +950,10 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, return paddle::experimental::Scalar(1.0); } -paddle::experimental::ScalarArray CastPyArg2ScalarArray( - PyObject* obj, const std::string& op_type, ssize_t arg_pos) { - // In case of ScalarArray, only two possible PyObjects: +paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + // In case of IntArray, only two possible PyObjects: // 1. list of int // 2. Tensor if (obj == Py_None) { @@ -968,12 +969,12 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( auto type_name = std::string(type->tp_name); if (type_name == "list" || type_name == "tuple") { std::vector value = CastPyArg2Ints(obj, op_type, arg_pos); - return paddle::experimental::ScalarArray(value); + return paddle::experimental::IntArray(value); } else if (type_name == "paddle.Tensor") { paddle::experimental::Tensor& value = GetTensorFromPyObject( op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/); - return paddle::experimental::ScalarArray(value); + return paddle::experimental::IntArray(value); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -983,8 +984,8 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } - // Fake a ScalarArray - return paddle::experimental::ScalarArray({1}); + // Fake a IntArray + return paddle::experimental::IntArray({1}); } paddle::framework::Scope* CastPyArg2ScopePtr(PyObject* obj) { diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 15d289d7bc37d..bd78342e21f4b 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -13,8 +13,8 @@ limitations under the License. 
*/ #include #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "pybind11/pybind11.h" @@ -150,8 +150,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); -paddle::experimental::ScalarArray CastPyArg2ScalarArray( - PyObject* obj, const std::string& op_type, ssize_t arg_pos); +paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); paddle::experimental::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2bfc16c7d5b0f..10c8a90ae0a36 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -36,6 +36,8 @@ std::map> op_ins_map = { {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, {"label_smooth", {"X", "PriorDist"}}, {"assign", {"X"}}, + {"crop", {"X", "Y", "Offsets"}}, + {"crop_tensor", {"X", "Shape", "Offsets"}}, {"reshape2", {"X", "Shape"}}, {"expand", {"X", "ExpandTimes"}}, {"slice", @@ -55,6 +57,7 @@ std::map> op_ins_map = { {"repeat_interleave", {"X", "RepeatsTensor"}}, {"roi_pool", {"X", "ROIs", "RoisNum"}}, {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"prroi_pool", {"X", "ROIs", "BatchRoINums"}}, {"psroi_pool", {"X", "ROIs", "RoisNum"}}, {"collect_fpn_proposals", {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, @@ -219,6 +222,7 @@ std::map> op_passing_outs_map = { {"c_reduce", {"Out"}}, {"c_scatter", {"Out"}}, {"barrier", {"Out"}}, + {"assign", {"Out"}}, {"fake_quantize_dequantize_moving_average_abs_max", {"Out", "OutScale", "OutAccum", "OutState"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, @@ -246,3 +250,12 @@ std::map> view_op_map = { {"reshape2", {"X", "Out"}}, {"flatten_contiguous_range", {"X", "Out"}}, }; + +// NOTE(pangyoki): Special inplace ops that are not supported in temporary. +// The input and output of some inplace ops are special, such as +// duplicate input. These inplace ops have no usage scenarios and +// are not supported in temporary. 
+std::set special_inplace_op_set = { + "sum", // `sum` op has duplicate input + "assign", // output of `assign` op is in `op_passing_outs_map` +}; diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index b0f56f020f486..ecd118818099d 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -38,9 +38,9 @@ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" @@ -90,7 +90,7 @@ using ValueVariantType = std::vector, std::vector<::phi::DenseTensor*>, paddle::experimental::ScalarBase<::phi::DenseTensor>, - paddle::experimental::ScalarArrayBase<::phi::DenseTensor>, + paddle::experimental::IntArrayBase<::phi::DenseTensor>, std::vector<::phi::MetaTensor*>, ::phi::MetaConfig, paddle::experimental::Backend, diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 154b84670aaf9..4e0a4729916b3 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -32,9 +32,9 @@ limitations under the License. */ // phi common headers #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" // original custom op headers #include "paddle/phi/api/ext/dispatch.h" diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 1f6c37aa090e8..152873fe41072 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -64,7 +64,7 @@ Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { } std::vector split_impl(const Tensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); @@ -107,13 +107,13 @@ std::vector split_impl(const Tensor& x, using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, - const phi::ScalarArray&, + const phi::IntArray&, const phi::Scalar&, std::vector&); auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_x, - phi::ScalarArray(num_or_sections), + phi::IntArray(num_or_sections), phi::Scalar(axis), dense_outs); diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 48eda2d954647..b2f5a074d9288 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -15,9 +15,9 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" namespace paddle { namespace experimental { @@ -25,7 +25,7 @@ namespace experimental { Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); std::vector split_impl(const Tensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis); } // namespace experimental diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 25b74e7fe31b9..be545ac9ce2f7 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" // TODO(chenweihang): split Key, Kernel, Factory into diff files #include "paddle/phi/core/kernel_factory.h" @@ -40,8 +42,10 @@ std::size_t CountLeadingZeros(uint64_t val); phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend); enum class KernelType { - DENSE_TENSOR_KENREL, // kernel for DenseTensor - SELECTED_ROWS_KENREL // kernel for SelectedRows + DENSE_TENSOR_KENREL, // kernel for DenseTensor + SELECTED_ROWS_KENREL, // kernel for SelectedRows + SPARSE_COO_KERNEL, // kernel for SparseCooTensor + SPARSE_CSR_KERNEL // kernel for SparseCsrTensor }; // TODO(chenweihang): support DataLayout and DataType selected @@ -130,6 +134,10 @@ struct KernelTypeParser : ArgsIterator { void operator()(const Tensor& x) { if (phi::SelectedRows::classof(x.impl().get())) { kernel_type = KernelType::SELECTED_ROWS_KENREL; + } else if (phi::SparseCooTensor::classof(x.impl().get())) { + kernel_type = KernelType::SPARSE_COO_KERNEL; + } else if (phi::SparseCsrTensor::classof(x.impl().get())) { + kernel_type = KernelType::SPARSE_CSR_KERNEL; } } diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index dde9980d0b951..c4c77ab93790d 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/ext_compat_utils.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" @@ -177,6 +177,40 @@ void Tensor::copy_(const Tensor &src, target_place, blocking, static_cast(impl_.get())); + } else if (kernel_type == KernelType::SPARSE_COO_KERNEL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sparse_coo", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SparseCooTensor &, + phi::Place, + bool, + phi::SparseCooTensor *); + this->set_impl(std::make_shared()); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else if (kernel_type == KernelType::SPARSE_CSR_KERNEL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sparse_csr", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SparseCsrTensor &, + phi::Place, + bool, + phi::SparseCsrTensor *); + this->set_impl(std::make_shared()); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); } else { PADDLE_THROW(phi::errors::InvalidArgument( "We currently only support dense tensor copy for now and if u need to " diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 3d183ea7fee8b..5a6f1b1a7ee0c 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -62,35 +62,34 @@ phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { } } -phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src) { +phi::IntArray MakePhiIntArray(const paddle::framework::Tensor& src) { return {src}; } -phi::ScalarArray MakePhiScalarArrayFromVar( - const framework::Variable& variable) { +phi::IntArray MakePhiIntArrayFromVar(const framework::Variable& variable) { auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - return MakePhiScalarArray(tmp_tensor); + return MakePhiIntArray(tmp_tensor); } else { - return MakePhiScalarArray(tensor); + return MakePhiIntArray(tensor); } } else { PADDLE_THROW(platform::errors::Unimplemented( - "Unsupport casting input `%s` type to ScalarArray when call pt " + "Unsupport casting input `%s` type to IntArray when call pt " "kernel.", framework::ToTypeName(variable.Type()))); } } -// TODO(chentianyu03): Inplace with ScalarArray constructor -phi::ScalarArray MakePhiScalarArrayFromVarList( +// TODO(chentianyu03): Inplace with IntArray constructor +phi::IntArray MakePhiIntArrayFromVarList( const std::vector& variable_list) { if (variable_list.size() == 0) { - return phi::ScalarArray(); + return phi::IntArray(); } auto expected_place = 
phi::TransToPhiPlace(phi::Backend::CPU); @@ -137,7 +136,7 @@ phi::ScalarArray MakePhiScalarArrayFromVarList( } } - phi::ScalarArray result{vector_data}; + phi::IntArray result{vector_data}; result.SetFromTensor(true); return result; diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 64df59c1a2a2d..00199da1280e8 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_factory.h" @@ -33,13 +33,13 @@ namespace experimental { std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src); -phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src); +phi::IntArray MakePhiIntArray(const paddle::framework::Tensor& src); phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable); -phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); +phi::IntArray MakePhiIntArrayFromVar(const framework::Variable& variable); -phi::ScalarArray MakePhiScalarArrayFromVarList( +phi::IntArray MakePhiIntArrayFromVarList( const std::vector& variable_list); } // namespace experimental diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 41bc6bb47c160..ea54083e8179b 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -179,6 +179,43 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, return config; } +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context, + int num_img, + int height, + int width) { + const int kThreadsPerBlock = 256; + int max_threads_per_block = context.GetMaxThreadsPerBlock(); // 1024 + int max_threads = std::min(kThreadsPerBlock, max_threads_per_block); + + int block_x = std::min(GetLastPow2(width), max_threads); + int block_y = std::min(GetLastPow2(height), max_threads / block_x); + int block_z = std::min(num_img, max_threads / block_x / block_y); + + auto max_grid_dim = context.GetCUDAMaxGridDimSize(); + int grid_x = + std::min(max_grid_dim[0], backends::gpu::DivUp(width, block_x)); + int grid_y = + std::min(max_grid_dim[1], backends::gpu::DivUp(height, block_y)); + int grid_z = std::min(max_grid_dim[2], + backends::gpu::DivUp(num_img, block_z * 4)); + + const int capability = context.GetComputeCapability(); + GpuLaunchConfig config; + config.compute_capability = capability; + config.thread_per_block = dim3(block_x, block_y, block_z); + config.block_per_grid = dim3(grid_x, grid_y, grid_z); + return config; +} + } // namespace gpu } // namespace backends } // namespace phi diff --git a/paddle/phi/common/scalar_array.h b/paddle/phi/common/int_array.h similarity index 77% rename from paddle/phi/common/scalar_array.h rename to paddle/phi/common/int_array.h index 39284095961a7..490d7dabd4007 100644 --- a/paddle/phi/common/scalar_array.h +++ b/paddle/phi/common/int_array.h @@ -21,25 +21,25 @@ namespace paddle { namespace experimental { template 
-class ScalarArrayBase { +class IntArrayBase { public: // Constructor support implicit - ScalarArrayBase() = default; + IntArrayBase() = default; - ScalarArrayBase(const std::vector& vec) : array_(vec) {} // NOLINT + IntArrayBase(const std::vector& vec) : array_(vec) {} // NOLINT - ScalarArrayBase(const std::vector& vec) { // NOLINT + IntArrayBase(const std::vector& vec) { // NOLINT array_.insert(array_.begin(), vec.begin(), vec.end()); } - ScalarArrayBase(std::initializer_list array_list) + IntArrayBase(std::initializer_list array_list) : array_(array_list) {} - ScalarArrayBase(const int64_t* date_value, int64_t n) { + IntArrayBase(const int64_t* date_value, int64_t n) { AssignData(date_value, n); } - ScalarArrayBase(const int32_t* date_value, int64_t n) { + IntArrayBase(const int32_t* date_value, int64_t n) { AssignData(date_value, n); } @@ -48,7 +48,7 @@ class ScalarArrayBase { void SetFromTensor(bool val) { is_from_tensor_ = val; } // The Tensor must have one dim - ScalarArrayBase(const T& tensor) { // NOLINT + IntArrayBase(const T& tensor) { // NOLINT is_from_tensor_ = true; size_t n = tensor.numel(); array_.reserve(n); @@ -61,7 +61,7 @@ class ScalarArrayBase { break; default: PD_THROW( - "Data type error. Currently, The data type of ScalarArrayBase " + "Data type error. Currently, The data type of IntArrayBase " "only supports Tensor with int32 and int64, " "but now received `", tensor.dtype(), @@ -70,7 +70,7 @@ class ScalarArrayBase { } // The Tensor in vec must have only one element - ScalarArrayBase(const std::vector& tensor_list) { // NOLINT + IntArrayBase(const std::vector& tensor_list) { // NOLINT is_from_tensor_ = true; for (size_t i = 0; i < tensor_list.size(); ++i) { @@ -84,7 +84,7 @@ class ScalarArrayBase { break; default: PD_THROW( - "Data type error. Currently, The data type of ScalarArrayBase " + "Data type error. 
Currently, The data type of IntArrayBase " "only supports Tensor with int32 and int64, " "but now received `", data_type, @@ -94,8 +94,7 @@ class ScalarArrayBase { } template - ScalarArrayBase(const ScalarArrayBase& other) - : array_(other.GetData()) {} + IntArrayBase(const IntArrayBase& other) : array_(other.GetData()) {} const std::vector& GetData() const { return array_; } @@ -120,8 +119,8 @@ class ScalarArrayBase { bool is_from_tensor_{false}; }; -using ScalarArray = - paddle::experimental::ScalarArrayBase; +using IntArray = + paddle::experimental::IntArrayBase; } // namespace experimental } // namespace paddle @@ -129,6 +128,6 @@ using ScalarArray = namespace phi { class DenseTensor; -using ScalarArray = paddle::experimental::ScalarArrayBase; +using IntArray = paddle::experimental::IntArrayBase; } // namespace phi diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 667cee10675d8..cc9c2caa88991 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -33,6 +33,8 @@ Backend TransToPhiBackend(const phi::Place& place) { return Backend::GPU; } else if (allocation_type == phi::AllocationType::XPU) { return Backend::XPU; + } else if (allocation_type == phi::AllocationType::NPU) { + return Backend::NPU; } else if (allocation_type == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index 671ba2ec7dc25..0496d727e8d3b 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -87,6 +87,23 @@ std::vector InferMetaContext::InputsBetween(size_t start, return result; } +paddle::optional> +InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { + const auto& first = inputs_.at(start); + + if (first) { + std::vector result; + result.reserve(end - start); + + for (size_t i = start; i < end; ++i) { + result.push_back(inputs_.at(i).get()); + } + + return paddle::optional>(result); + } + return paddle::optional>(paddle::none); +} + MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { return outputs_.at(idx).get(); } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 9c351ce9063ec..fad437f82c331 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -19,8 +19,8 @@ limitations under the License. */ #include #include +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/macros.h" #include "paddle/phi/core/meta_tensor.h" @@ -54,6 +54,8 @@ class InferMetaContext { const MetaTensor& InputAt(size_t idx) const; paddle::optional OptionalInputAt(size_t idx) const; std::vector InputsBetween(size_t start, size_t end) const; + paddle::optional> + OptionalInputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetween(size_t start, size_t end); @@ -174,6 +176,26 @@ struct InferMetaFnImpl { } }; + template + struct InferMetaFnCallHelper< + paddle::optional>, + Tail...> { + template + static void Call(InferMetaContext* ctx, PreviousArgs&... 
pargs) { + static_assert(attr_idx == 0, + "InferMeta's Input should appear before Attributes."); + static_assert(out_idx == 0, + "InferMeta's Input should appear before Outputs."); + const std::pair range = ctx->InputRangeAt(in_idx); + paddle::optional> arg = + ctx->OptionalInputsBetween(range.first, range.second); + InferMetaFnCallHelper< + Tail...>::template Call(ctx, + pargs..., + arg); + } + }; + // TODO(chenweihang): support other attr type later PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(bool); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); @@ -192,7 +214,7 @@ struct InferMetaFnImpl { PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout); PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const Scalar&); - PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const ScalarArray&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const IntArray&); // TODO(chenweihang): support vector input later diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index d3ca1ffc61c42..ab4e044e62537 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -97,6 +97,22 @@ class KernelContext { return v; } + template + paddle::optional> OptionalInputsBetween( + size_t start, size_t end) { + const auto& first = inputs_.at(start); + + if (first) { + std::vector v; + for (size_t i = start; i < end; ++i) { + auto* t = static_cast(inputs_.at(i)); + v.emplace_back(t); + } + return paddle::optional>(v); + } + return paddle::optional>(paddle::none); + } + template TensorType* MutableOutputAt(size_t idx) { return static_cast(outputs_.at(idx)); diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index ba41e082ab912..81c43764fee9e 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -59,6 +59,21 @@ KernelKeyMap KernelFactory::SelectKernelMap( return iter->second; } +bool KernelFactory::IsSelectKernelValid(const std::string& kernel_name, + const KernelKey& kernel_key) const { + auto iter = kernels_.find(kernel_name); + PADDLE_ENFORCE_NE( + iter, + kernels_.end(), + phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); + + auto kernel_iter = iter->second.find(kernel_key); + if (kernel_iter == iter->second.end()) { + return false; + } + return true; +} + const Kernel& KernelFactory::SelectKernelOrThrowError( const std::string& kernel_name, const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index e502b9cb3e025..6c098c75a0eda 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -245,6 +245,9 @@ class KernelFactory { DataLayout layout, DataType dtype) const; + bool IsSelectKernelValid(const std::string& kernel_name, + const KernelKey& kernel_key) const; + Kernel SelectKernel(const std::string& kernel_name, const KernelKey& kernel_key) const; diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index fac4b1e82792f..b18fd9e05f92f 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -81,6 +81,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional< + const std::vector>))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if 
(arg_type == std::type_index(typeid( paddle::optional))) { args_def->AppendInput(default_key.backend(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 642dc0b4c830e..55574ea03ab4a 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -18,8 +18,8 @@ #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" @@ -126,6 +126,30 @@ namespace phi { } \ } +#define PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_MULTI_INPUT(tensor_type) \ + template \ + struct KernelCallHelper< \ + paddle::optional>, \ + Tail...> { \ + template \ + static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::pair range = ctx->InputRangeAt(in_idx); \ + paddle::optional> arg = \ + ctx->OptionalInputsBetween(range.first, range.second); \ + KernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + #define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ template \ struct KernelCallHelper { \ @@ -224,6 +248,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_MULTI_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -250,7 +275,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const IntArray&); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::string&); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index ceaebe4e35b71..7d4261ef82972 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -16,6 +16,11 @@ limitations under the License. */ namespace phi { +SparseCooTensor::SparseCooTensor() { + DenseTensor non_zero_indices, non_zero_elements; + this->SetMember(non_zero_indices, non_zero_elements, {1}, true); +} + SparseCooTensor::SparseCooTensor(const DenseTensor& non_zero_indices, const DenseTensor& non_zero_elements, const DDim& dims) diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index ca3290f33e61e..ec43c5d62179b 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -30,6 +30,7 @@ namespace phi { class SparseCooTensor : public TensorBase, public TypeInfoTraits { public: + SparseCooTensor(); /// \brief Create the sparse coo tensor /// \param non_zero_indices The indices of non zero elements in original dense /// tensor. 
@@ -145,6 +146,8 @@ class SparseCooTensor : public TensorBase, void* AllocateFrom(Allocator* allocator, DataType dtype, size_t requested_size = 0) override; + + /// \brief set the dims of original dense tensor void set_dims(const DDim& dims) { this->dims_ = dims; } private: diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index cbf5f941b665d..ab9717a564eb5 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -16,6 +16,14 @@ limitations under the License. */ namespace phi { +SparseCsrTensor::SparseCsrTensor() { + DenseTensor crows, cols, values; + this->non_zero_crows_ = crows; + this->non_zero_cols_ = cols; + this->non_zero_elements_ = values; + this->dims_ = phi::make_ddim({1, 1}); +} + inline void check_shape(const DDim& dims) { bool valid = dims.size() == 2 || dims.size() == 3; diff --git a/paddle/phi/core/sparse_csr_tensor.h b/paddle/phi/core/sparse_csr_tensor.h index 8a9de7a841422..7e14cad242d12 100644 --- a/paddle/phi/core/sparse_csr_tensor.h +++ b/paddle/phi/core/sparse_csr_tensor.h @@ -33,6 +33,7 @@ class CompatibleDenseTensorUtils; class SparseCsrTensor : public TensorBase, public TypeInfoTraits { public: + SparseCsrTensor(); /// \brief Because sparse csr tensor is a resource handle, we provide a /// default /// move constructor to support move semantics. @@ -143,6 +144,9 @@ class SparseCsrTensor : public TensorBase, /// return a mutable pointer of non_zero_elements. DenseTensor* mutable_non_zero_elements() { return &non_zero_elements_; } + /// \brief set the dims of original dense tensor + void set_dims(const DDim& dims) { this->dims_ = dims; } + private: // save the compressed rows information of non zero elements DenseTensor non_zero_crows_; diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index a190b222f86ac..9ef8e8a356c7a 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -44,6 +44,10 @@ namespace phi { _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX128); +#define _PhiForEachDataTypeTiny_(callback) \ + _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64); + template inline void VisitDataType(phi::DataType type, Visitor visitor) { #define PhiVisitDataTypeCallback(cpp_type, data_type) \ @@ -59,4 +63,21 @@ inline void VisitDataType(phi::DataType type, Visitor visitor) { PADDLE_THROW(phi::errors::Unimplemented( "Not supported phi::DataType(%d) as data type.", static_cast(type))); } + +template +inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) { +#define PhiVisitDataTypeCallbackTiny(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _PhiForEachDataTypeTiny_(PhiVisitDataTypeCallbackTiny); +#undef PhiVisitDataTypeCallbackTiny + PADDLE_THROW(phi::errors::Unimplemented( + "Not supported phi::DataType(%d) as data type.", static_cast(type))); +} + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index f196744c0411e..1f6cf1a6882d8 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -890,6 +890,506 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +static void Interpolate1DInferShapeCheck( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const 
std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); + + PADDLE_ENFORCE_EQ("linear", + interp_method, + phi::errors::InvalidArgument( + "Interpolation method can only be \"linear\" when" + "Input(X) dimension is 3, but got method = %s .", + interp_method)); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE( + dim_x[i], + 0, + phi::errors::InvalidArgument("The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, + dim_x[i])); + } + if (size_tensor && size_tensor->size() > 0) { + // top prority size + auto inputs_name = size_tensor.get(); + PADDLE_ENFORCE_EQ( + inputs_name.size(), + 1, + phi::errors::InvalidArgument( + "Input(SizeTensor)'size of Op(interpolate) must be 1. " + "Attr(out_shape)'s length must be 1 for 3-D input tensor, but got " + "size = %d .", + inputs_name.size())); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_w}; + } else { + dim_out = {dim_x[0], out_w, dim_x[2]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); + + return; + } + + int out_w_tmp; + if (scale_tensor) { + auto scale_tensor_dim = scale_tensor->dims(); + PADDLE_ENFORCE_EQ( + scale_tensor_dim.size(), + 1, + phi::errors::InvalidArgument( + "Scale's dimension size must be 1, but got dimension = %d .", + scale_tensor_dim.size())); + PADDLE_ENFORCE_EQ(scale_tensor_dim[0], + 1, + phi::errors::InvalidArgument( + "Scale's shape must be 1, but got shape = %d .", + scale_tensor_dim[0])); + out_w_tmp = -1; + } else { + if (scale.size() > 0) { + float scale_w = -1; + scale_w = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + phi::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + if (scale_w > 0.) { + // round down + out_w_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_w) + : static_cast(dim_x[1] * scale_w)); + // protect when input shape is -1 + out_w_tmp = out_w_tmp > 0 ? 
out_w_tmp : -1; + } + } else { + out_w_tmp = out_w; + } + } + + if (out_size && config.is_runtime) { + auto out_size_dim = out_size->dims(); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), + 1, + phi::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got dimention = %d .", + out_size_dim.size())); + PADDLE_ENFORCE_EQ( + out_size_dim[0], + 1, + phi::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); + + // dims will be seted in kernel + output->set_dtype(x.dtype()); + output->share_lod(x); + return; + } + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_w_tmp}; + } else { + dim_out = {dim_x[0], out_w_tmp, dim_x[2]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); +} + +static void Interpolate2DInferShapeCheck( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); + + PADDLE_ENFORCE( + "bilinear" == interp_method || "nearest" == interp_method || + "bicubic" == interp_method, + phi::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" or \"nearest\" when " + "Input(X) dimension is 4, but got method = %s.", + interp_method)); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE( + dim_x[i], + 0, + phi::errors::InvalidArgument("The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, + dim_x[i])); + } + + if (size_tensor && size_tensor->size()) { + // top prority size + auto inputs_name = size_tensor.get(); + PADDLE_ENFORCE_EQ( + inputs_name.size(), + 2, + phi::errors::InvalidArgument( + "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input " + "tensor, but got size = %d .", + inputs_name.size())); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h, out_w}; + } else { + dim_out = {dim_x[0], out_h, out_w, dim_x[3]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); + + return; + } + + int out_h_tmp, out_w_tmp; + if (scale_tensor) { + auto scale_tensor_dim = scale_tensor->dims(); + PADDLE_ENFORCE_EQ( + scale_tensor_dim.size(), + 1, + phi::errors::InvalidArgument( + "Scale's dimension size must be 1, but got dimension = %d .", + scale_tensor_dim.size())); + PADDLE_ENFORCE_EQ(scale_tensor_dim[0] == 2 || scale_tensor_dim[0] == 1, + true, + phi::errors::InvalidArgument( + "Scale's shape must be 2 or 1, but got shape = %d .", + scale_tensor_dim[0])); + out_h_tmp = -1; + out_w_tmp = -1; + } else { + if (scale.size() > 0) { + float scale_h = -1; + float scale_w = -1; + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + phi::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + phi::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + if (scale_h > 0. && scale_w > 0.) 
{ + // round down + out_h_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_h) + : static_cast(dim_x[1] * scale_h)); + out_w_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale_w) + : static_cast(dim_x[2] * scale_w)); + // protect when input shape is -1 + out_h_tmp = out_h_tmp > 0 ? out_h_tmp : -1; + out_w_tmp = out_w_tmp > 0 ? out_w_tmp : -1; + } + } else { + out_h_tmp = out_h; + out_w_tmp = out_w; + } + } + + if (out_size && config.is_runtime) { + auto out_size_dim = out_size->dims(); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), + 1, + phi::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got dimension = %d .", + out_size_dim.size())); + PADDLE_ENFORCE_EQ( + out_size_dim[0], + 2, + phi::errors::InvalidArgument( + "OutSize's dim[0] must be 2, but got dimention = %d .", + out_size_dim[0])); + // dims will be seted in kernel + output->set_dtype(x.dtype()); + output->share_lod(x); + return; + } + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h_tmp, out_w_tmp}; + } else { + dim_out = {dim_x[0], out_h_tmp, out_w_tmp, dim_x[3]}; + } + + output->set_dims(dim_out); + output->set_dtype(x.dtype()); +} + +static void Interpolate3DInferShapeCheck( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); + + PADDLE_ENFORCE("nearest" == interp_method || "trilinear" == interp_method, + phi::errors::InvalidArgument( + "Interpolation method can only be \"trilinear\" or " + "\"nearest\" when Input(X) " + "dimension is 5, but got method = %s .", + interp_method)); + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE( + dim_x[i], + 0, + phi::errors::InvalidArgument("The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, + dim_x[i])); + } + + if (size_tensor && size_tensor->size() > 0) { + // top prority size + auto inputs_name = size_tensor.get(); + PADDLE_ENFORCE_EQ( + inputs_name.size(), + 3, + phi::errors::InvalidArgument( + "Input(SizeTensor)'s size of Op(interpolate) must be 3. 
" + "Attr(out_shape)'s length must be 3 for 5-D input " + "tensor, but got size = %d .", + inputs_name.size())); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w}; + } else { + dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); + return; + } + + int out_d_tmp, out_h_tmp, out_w_tmp; + if (scale_tensor) { + auto scale_tensor_dim = scale_tensor->dims(); + PADDLE_ENFORCE_EQ( + scale_tensor_dim.size(), + 1, + phi::errors::InvalidArgument( + "Scale's dimension size must be 1, but got size = %d .", + scale_tensor_dim.size())); + PADDLE_ENFORCE_EQ(scale_tensor_dim[0] == 3 || scale_tensor_dim[0] == 1, + true, + phi::errors::InvalidArgument( + "Scale's shape must be 3 or 1, but got shape = %d .", + scale_tensor_dim[0])); + out_d_tmp = -1; + out_h_tmp = -1; + out_w_tmp = -1; + } else { + if (scale.size() > 0) { + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + phi::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + phi::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + phi::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { + // round down + out_d_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale_d) + : static_cast(dim_x[1] * scale_d)); + out_h_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale_h) + : static_cast(dim_x[2] * scale_h)); + out_w_tmp = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[4] * scale_w) + : static_cast(dim_x[3] * scale_w)); + // protect when input shape is -1 + out_d_tmp = out_d_tmp > 0 ? out_d_tmp : -1; + out_h_tmp = out_h_tmp > 0 ? out_h_tmp : -1; + out_w_tmp = out_w_tmp > 0 ? 
out_w_tmp : -1; + } + } else { + out_d_tmp = out_d; + out_h_tmp = out_h; + out_w_tmp = out_w; + } + } + + if (out_size && config.is_runtime) { + auto out_size_dim = out_size->dims(); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), + 1, + phi::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got size is %d.", + out_size_dim.size())); + PADDLE_ENFORCE_EQ(out_size_dim[0], + 3, + phi::errors::InvalidArgument( + "OutSize's dim[0] must be 3, but got size is %d.", + out_size_dim[0])); + // dims will be seted in kernel + output->set_dtype(x.dtype()); + output->share_lod(x); + return; + } + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d_tmp, out_h_tmp, out_w_tmp}; + } else { + dim_out = {dim_x[0], out_d_tmp, out_h_tmp, out_w_tmp, dim_x[4]}; + } + output->set_dims(dim_out); + output->set_dtype(x.dtype()); +} + +void InterpolateInferMeta( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config) { + auto dim_x = x.dims(); // NCHW format + PADDLE_ENFORCE( + dim_x.size() == 3 || dim_x.size() == 4 || dim_x.size() == 5, + phi::errors::Unimplemented( + "Input(X) dimension must be 3, 4 or 5, but got dimension = %d .", + dim_x.size())); + if (dim_x.size() == 3) { + // shape check for 1D interpolate for input tensor shape NCHW + Interpolate1DInferShapeCheck(x, + out_size, + size_tensor, + scale_tensor, + data_layout_str, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output, + config); + } else if (dim_x.size() == 4) { + // shape check for 2D interpolate for input tensor shape NCHW + Interpolate2DInferShapeCheck(x, + out_size, + size_tensor, + scale_tensor, + data_layout_str, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output, + config); + } else { // dim_x.size() == 5 + // shape check for 3D interpolate for input tensor shape NCDHW + Interpolate3DInferShapeCheck(x, + out_size, + size_tensor, + scale_tensor, + data_layout_str, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output, + config); + } +} + void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { auto inputs_dims = GetMetaTensorsDim(x); @@ -1167,6 +1667,52 @@ void RnnInferMeta(const MetaTensor& x, } } +void StackInferMeta(const std::vector& x, + int axis, + MetaTensor* out) { + PADDLE_ENFORCE_GT(x.size(), + 0UL, + phi::errors::InvalidArgument( + "Number of Inputs(x) must be larger than 0, but" + " received value is:%d.", + x.size())); + const auto& input_dims = GetMetaTensorsDim(x); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], + input_dims[0], + phi::errors::InvalidArgument( + "Dims of all Inputs(X) must be the same, but" + " received input %d dim is:%d not equal to input 0" + " dim:%d.", + i, + input_dims[i], + input_dims[0])); + } + int rank = input_dims[0].size(); + PADDLE_ENFORCE_GE( + axis, + -(rank + 1), + phi::errors::InvalidArgument( + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " + "but received axis is:%d.", + rank, + axis)); + PADDLE_ENFORCE_LT( + axis, + rank + 1, + phi::errors::InvalidArgument( + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d, " + "but received axis is:%d", + rank, + 
axis)); + if (axis < 0) axis += (rank + 1); + auto vec = phi::vectorize(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + out->set_dims(phi::make_ddim(vec)); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const paddle::optional logits_length, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 6abbf1c0ef478..b748d898c1e4e 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -199,6 +199,22 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, MetaTensor* pre_out, MetaTensor* w_out); +void InterpolateInferMeta( + const MetaTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + MetaTensor* output, + MetaConfig config = MetaConfig()); + void MultiDotInferMeta(const std::vector& x, MetaTensor* out); void MultiplexInferMeta(const std::vector& ins, @@ -231,6 +247,10 @@ void RnnInferMeta(const MetaTensor& x, std::vector state, MetaTensor* reserve); +void StackInferMeta(const std::vector& x, + int axis, + MetaTensor* out); + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const paddle::optional logits_length, diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 6c3bbe654faec..4a11d24a9868b 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -23,9 +23,7 @@ void AssignValueInferMeta(const std::vector& shape, out->set_dtype(dtype); } -void CreateInferMeta(const ScalarArray& shape, - DataType dtype, - MetaTensor* out) { +void CreateInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out) { CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); } @@ -48,7 +46,7 @@ void EyeInferMeta(int64_t num_rows, out->set_dtype(dtype); } -void GaussianRandomInferMeta(const ScalarArray& shape, +void GaussianRandomInferMeta(const IntArray& shape, float mean, float std, int seed, diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index d72e92654cdaa..4c9eb0b62a74e 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -34,7 +34,7 @@ void AssignValueInferMeta(const std::vector& shape, DataType dtype, MetaTensor* out); -void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); +void CreateInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out); void CreateInferMetaBase(const std::vector& shape, DataType dtype, @@ -46,7 +46,7 @@ void EyeInferMeta(int64_t num_rows, DataType dtype, MetaTensor* out); -void GaussianRandomInferMeta(const ScalarArray& shape, +void GaussianRandomInferMeta(const IntArray& shape, float mean, float std, int seed, diff --git a/paddle/phi/infermeta/strings/nullary.cc b/paddle/phi/infermeta/strings/nullary.cc index 807a5a9bf80a0..c2428a2ff3ae9 100644 --- a/paddle/phi/infermeta/strings/nullary.cc +++ b/paddle/phi/infermeta/strings/nullary.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace phi { namespace strings { -void CreateInferMeta(const ScalarArray& shape, MetaTensor* out) { +void CreateInferMeta(const IntArray& shape, MetaTensor* out) { const auto& out_dims = phi::make_ddim(shape.GetData()); out->set_dims(out_dims); out->set_dtype(DataType::PSTRING); diff --git a/paddle/phi/infermeta/strings/nullary.h b/paddle/phi/infermeta/strings/nullary.h index 513792ffff37d..8fbcc63b2ae5d 100644 --- a/paddle/phi/infermeta/strings/nullary.h +++ b/paddle/phi/infermeta/strings/nullary.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/tensor_meta.h" @@ -22,7 +22,7 @@ namespace phi { namespace strings { void CreateInferMeta(const std::vector& shape, MetaTensor* out); -void CreateInferMeta(const ScalarArray& shape, MetaTensor* out); +void CreateInferMeta(const IntArray& shape, MetaTensor* out); } // namespace strings } // namespace phi diff --git a/paddle/phi/infermeta/strings/unary.h b/paddle/phi/infermeta/strings/unary.h index fe942db6c9f3a..13b94ec1ace78 100644 --- a/paddle/phi/infermeta/strings/unary.h +++ b/paddle/phi/infermeta/strings/unary.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once // See Note [ Why still include the fluid headers? ] -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 0376d4e79e00d..582dcb0137894 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -345,6 +345,56 @@ void PutAlongAxisInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RangeInferMeta(const MetaTensor& start, + const MetaTensor& end, + const MetaTensor& step, + MetaTensor* out) { + auto start_dims = start.dims(); + auto end_dims = end.dims(); + auto step_dims = step.dims(); + PADDLE_ENFORCE_EQ( + start_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(Start) should be 1, but got %d", + start_dims.size())); + + PADDLE_ENFORCE_EQ(start_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dim of the shape of Input(Start) should " + "be 1, but got %d", + start_dims[0])); + PADDLE_ENFORCE_EQ( + end_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(End) should be 1, but got %d", + end_dims.size())); + + PADDLE_ENFORCE_EQ( + end_dims[0], + 1, + phi::errors::InvalidArgument("The first dim of the shape of " + "Input(End) should be 1, but got %d", + end_dims[0])); + PADDLE_ENFORCE_EQ( + step_dims.size(), + 1, + phi::errors::InvalidArgument( + "The dim of the shape of Input(Step) should be 1, but got %d", + step_dims.size())); + + PADDLE_ENFORCE_EQ(step_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dim of the shape of Input(Step) should " + "be 1, but got %d", + step_dims[0])); + out->set_dims({-1}); + out->set_dtype(start.dtype()); +} + void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 30fdb4e612c8b..c18dde42f1ed2 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -81,6 +81,11 @@ void PutAlongAxisInferMeta(const MetaTensor& x, const std::string& reduce, MetaTensor* out); 
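The RangeInferMeta added above can only verify that Start, End and Step are shape-[1] tensors; the number of output elements depends on their runtime values, so the output shape is deliberately left as {-1} and resolved inside the kernel. Roughly, the kernel derives the length as ceil(|end - start| / |step|); a hedged sketch of that computation, with a hypothetical helper name, is shown below.

#include <cmath>
#include <cstdint>

// Sketch only: run-time output length for a range-style kernel.
// At infer-meta time start/end/step are tensor values and unknown,
// which is why RangeInferMeta sets the output dim to -1.
template <typename T>
int64_t ComputeRangeSize(T start, T end, T step) {
  return static_cast<int64_t>(
      std::ceil(std::abs((end - start) / static_cast<double>(step))));
}

For example, start = 1, end = 10, step = 2 yields 5 elements, a count that only becomes known once the three tensors are actually read.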
+void RangeInferMeta(const MetaTensor& start, + const MetaTensor& end, + const MetaTensor& step, + MetaTensor* out); + void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bbeb14363e84e..6bf7a36b06534 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1173,7 +1173,7 @@ void PadInferMeta(const MetaTensor& input, } void Pad3dInferMeta(const MetaTensor& x, - const ScalarArray& paddings_scalar_array, + const IntArray& paddings_int_array, const std::string& mode, float value, const std::string& data_format, @@ -1189,21 +1189,21 @@ void Pad3dInferMeta(const MetaTensor& x, std::vector out_dims(x_dim.size()); out_dims[0] = x_dim[0]; - if (paddings_scalar_array.FromTensor()) { + if (paddings_int_array.FromTensor()) { if (config.is_runtime) { PADDLE_ENFORCE_EQ( - paddings_scalar_array.GetData().size(), + paddings_int_array.GetData().size(), 6, errors::InvalidArgument("Shape of Input(Paddings) should be equal to " "[6], but received [%d].", - paddings_scalar_array.GetData().size())); + paddings_int_array.GetData().size())); } out_dims[1] = x_dim[1]; out_dims[2] = x_dim[2]; out_dims[3] = x_dim[3]; out_dims[4] = x_dim[4]; } else { - auto paddings = paddings_scalar_array.GetData(); + auto paddings = paddings_int_array.GetData(); PADDLE_ENFORCE_EQ( paddings.size(), @@ -1592,7 +1592,7 @@ void ReduceInferMetaBase(const MetaTensor& x, } void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, + const IntArray& shape, MetaTensor* out, MetaConfig config) { auto& shape_data = shape.GetData(); @@ -1612,7 +1612,7 @@ void ReshapeInferMeta(const MetaTensor& x, } void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, + const IntArray& shape, MetaTensor* out, MetaTensor* xshape, MetaConfig config) { @@ -1659,7 +1659,7 @@ void ReverseInferMeta(const MetaTensor& x, } void RollInferMeta(const MetaTensor& x, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, MetaTensor* out) { auto shifts_data = shifts.GetData(); @@ -1758,7 +1758,7 @@ void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) { } void SplitInferMeta(const MetaTensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis, std::vector out, MetaConfig config) { @@ -1922,15 +1922,15 @@ void SqueezeInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void StridedSliceInferMeta(const MetaTensor& x, - const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, - const std::vector& infer_flags, - const std::vector& decrease_axis, - MetaTensor* out, - MetaConfig config) { +void StridedSliceRawInferMeta(const MetaTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + const std::vector& infer_flags, + const std::vector& decrease_axis, + MetaTensor* out, + MetaConfig config) { auto in_dims = x.dims(); PADDLE_ENFORCE_LT( in_dims.size(), @@ -1968,7 +1968,7 @@ void StridedSliceInferMeta(const MetaTensor& x, } auto tensor_input = false; - auto HasInput = [](const ScalarArray& arr) { return arr.FromTensor(); }; + auto HasInput = [](const IntArray& arr) { return arr.FromTensor(); }; if (HasInput(starts) || HasInput(ends) || HasInput(strides)) { tensor_input = true; } @@ -2052,6 +2052,19 @@ void StridedSliceInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void 
StridedSliceInferMeta(const MetaTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + MetaTensor* out, + MetaConfig config) { + std::vector infer_flags(axes.size(), 1); + std::vector decrease_axis; + StridedSliceRawInferMeta( + x, axes, starts, ends, strides, infer_flags, decrease_axis, out, config); +} + /* Why not use SumRawInferMeta directly? Because we need make InferMetaFunction's args follow the design of api.yaml */ @@ -2090,7 +2103,7 @@ void SumRawInferMeta(const MetaTensor& x, } void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, + const IntArray& repeat_times, MetaTensor* out, MetaConfig config) { #define MAX_RANK_SUPPORTED 6 @@ -2552,8 +2565,87 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void UniqueInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts) { + bool is_sorted = true; + UniqueRawInferMeta(x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +void UniqueRawInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts) { + if (!is_sorted) { + PADDLE_ENFORCE_EQ( + x.dims().size(), + 1, + phi::errors::InvalidArgument("The Input(X) should be 1-D Tensor, " + "But now the dims of Input(X) is %d.", + x.dims().size())); + out->set_dims(phi::make_ddim({-1})); + index->set_dims(x.dims()); + return; + } + + if (axis.empty()) { + out->set_dims(phi::make_ddim({-1})); + if (return_inverse) { + index->set_dims(phi::make_ddim({phi::product(x.dims())})); + } + } else { + int axis_value = axis[0]; + if (axis_value < 0) { + axis_value += x.dims().size(); + } + PADDLE_ENFORCE_LT( + axis_value, + x.dims().size(), + phi::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis_value, + x.dims().size())); + auto out_dims = x.dims(); + out_dims[axis_value] = -1; + out->set_dims(out_dims); + if (return_inverse) { + index->set_dims(phi::make_ddim({x.dims()[axis_value]})); + } + } + if (return_index) { + indices->set_dims(phi::make_ddim({-1})); + } + if (return_counts) { + counts->set_dims(phi::make_ddim({-1})); + } +} + void UnsqueezeInferMeta(const MetaTensor& x, - const ScalarArray& axes, + const IntArray& axes, MetaTensor* xshape, MetaTensor* out, MetaConfig config) { @@ -2595,6 +2687,53 @@ void UnsqueezeInferMeta(const MetaTensor& x, xshape->set_dtype(x.dtype()); } +void UnStackInferMeta(const MetaTensor& x, + int axis, + int num, + std::vector outs) { + auto x_dim = x.dims(); + int rank = x_dim.size(); + PADDLE_ENFORCE_GE(axis, + -rank, + phi::errors::InvalidArgument( + "The attribute axis is out of range, it must be " + "inside [-rank, rank), where rank = %d", + rank)); + PADDLE_ENFORCE_LT(axis, + rank, + phi::errors::InvalidArgument( + "The attribute axis is out of range, it must be " + "inside [-rank, rank), where rank = %d", + rank)); + if (axis < 0) axis += rank; + + size_t output_count = outs.size(); + PADDLE_ENFORCE_EQ(output_count, + static_cast(num), + phi::errors::InvalidArgument( + "Number of Outputs(Y) is wrong. 
Got %d , but it must " + "equal to attribute num which is %d.", + output_count, + static_cast(num))); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ( + num, + x_dim[axis], + phi::errors::InvalidArgument( + "The number of attribute num is not equal to the length of the " + "%d axis of Input(X). Expect %d but got %d.", + axis, + x_dim[axis], + num)); + } + auto vec = phi::vectorize(x_dim); + vec.erase(vec.begin() + axis); + for (size_t i = 0; i < output_count; i++) { + outs[i]->set_dims(phi::make_ddim(vec)); + outs[i]->set_dtype(x.dtype()); + } +} + void OneHotRawInferMeta(const MetaTensor& x, int32_t depth, DataType dtype, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index ea902e0d98eca..54f70d8d55405 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once // See Note [ Why still include the fluid headers? ] +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -185,7 +185,7 @@ void PadInferMeta(const MetaTensor& input, MetaConfig config = MetaConfig()); void Pad3dInferMeta(const MetaTensor& x, - const ScalarArray& paddings, + const IntArray& paddings, const std::string& mode, float value, const std::string& data_format, @@ -238,12 +238,12 @@ void ReduceInferMetaBase(const MetaTensor& x, MetaTensor* out); void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, + const IntArray& shape, MetaTensor* out, MetaConfig config = MetaConfig()); void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, + const IntArray& shape, MetaTensor* out, MetaTensor* xshape, MetaConfig config = MetaConfig()); @@ -253,7 +253,7 @@ void ReverseInferMeta(const MetaTensor& x, MetaTensor* out); void RollInferMeta(const MetaTensor& x, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, MetaTensor* out); @@ -274,7 +274,7 @@ void SizeInferMeta(const MetaTensor& input, MetaTensor* out); void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out); void SplitInferMeta(const MetaTensor& x_meta, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis, std::vector out, MetaConfig config = MetaConfig()); @@ -284,13 +284,21 @@ void SqueezeInferMeta(const MetaTensor& x, MetaTensor* xshape, MetaTensor* out); +void StridedSliceRawInferMeta(const MetaTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + const std::vector& infer_flags, + const std::vector& decrease_axis, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void StridedSliceInferMeta(const MetaTensor& x, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, - const std::vector& infer_flags, - const std::vector& decrease_axis, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -308,7 +316,7 @@ void SumRawInferMeta(const MetaTensor& x, MetaTensor* out); void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, + const IntArray& repeat_times, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -360,12 +368,40 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void UniqueInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool 
return_counts, + const std::vector& axis, + DataType dtype, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts); + +void UniqueRawInferMeta(const MetaTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + MetaTensor* out, + MetaTensor* indices, + MetaTensor* index, + MetaTensor* counts); + void UnsqueezeInferMeta(const MetaTensor& x, - const ScalarArray& axes, + const IntArray& axes, MetaTensor* xshape, MetaTensor* out, MetaConfig config = MetaConfig()); +void UnStackInferMeta(const MetaTensor& x, + int axis, + int num, + std::vector outs); + void OneHotRawInferMeta(const MetaTensor& x, int32_t depth, DataType dtype, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index b0d762d00ecf9..d4b832cef0bd2 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,7 +27,7 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel +set(MANUAL_BUILD_KERNELS cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel @@ -35,8 +35,10 @@ set(MANUAL_BUILD_KERNELS adam_kernel adamw_kernel deformable_conv_kernel deforma triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) +kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) +kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) @@ -57,7 +59,6 @@ kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} concat_and_split_functor lstm_compute gru_compute) kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc sequence_padding sequence_scale) diff 
--git a/paddle/phi/kernels/copy_kernel.h b/paddle/phi/kernels/copy_kernel.h index 95df29f7e653a..21b59d8d11b8d 100644 --- a/paddle/phi/kernels/copy_kernel.h +++ b/paddle/phi/kernels/copy_kernel.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_csr_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc new file mode 100644 index 0000000000000..d4a632b5e6ece --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc @@ -0,0 +1,226 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +// TODO(chenweihang): move dispatch.h into phi/core +#include "paddle/phi/api/ext/dispatch.h" + +namespace phi { + +template +void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* logits_grad) { + const DenseTensor* out_grad = &loss_grad; + DenseTensor* logit_grad = logits_grad; + + if (logit_grad != &softmax || !use_softmax) { + phi::Copy(dev_ctx, softmax, dev_ctx.GetPlace(), false, logit_grad); + } + + const int rank = logit_grad->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logit_grad->dims()[axis_v]; + PADDLE_ENFORCE_GT( + axis_dim, + 0, + phi::errors::InvalidArgument( + "The axis dimention should be larger than 0, but received " + "axis dimention is %d.", + axis_dim)); + + const int n = phi::funcs::SizeToAxis(axis_v, logit_grad->dims()); + PADDLE_ENFORCE_GT( + n, + 0, + phi::errors::InvalidArgument( + "The size of axis should be larger than 0, but received " + "SizeToAxis of logit_grad is %d.", + n)); + + const int d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); + DenseTensor logit_grad_2d(*logit_grad); + logit_grad_2d.Resize({n, d}); + DenseTensor labels_2d(label); + labels_2d.Resize({n, label.numel() / n}); + DenseTensor out_grad_2d(*out_grad); + out_grad_2d.Resize({n, d / axis_dim}); + + auto out_grad_mat = EigenMatrix::From(out_grad_2d); + auto logit_grad_mat = EigenMatrix::From(logit_grad_2d); + auto& place = *dev_ctx.eigen_device(); + + if (!use_softmax) { + // use_softmax step1 + if (soft_label) { + auto lbl_mat = EigenMatrix::From(labels_2d); + logit_grad_mat.device(place) = + (-lbl_mat / logit_grad_mat); // for each sample ,i is sample id + logit_grad_mat.device(place) = + out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * + logit_grad_mat; + } else { + // use_softmax step2 + const auto* 
label_data = label.data(); + T* logit_grad_data = logit_grad->data(); + const T* out_grad_data = out_grad->data(); + const int remain = d / axis_dim; + for (int i = 0; i < n; ++i) { // for each sample_1_dim + for (int j = 0; j < remain; j++) { // for each sample_other_dims + int idx = i * remain + j; // this sample's label_idx. for 1d case, + // remain=1 and j=0, so, idx = i + auto lbl = static_cast(label_data[idx]); + if (lbl == ignore_index) { + for (int k = 0; k < axis_dim; ++k) { // for each class id's label + logit_grad_data[i * d + k * remain + j] = 0; + } + } else { + // only for this sample's label_idx, the label is 1, others is 0, + // so, only compute this label_idx's class + logit_grad_data[i * d + lbl * remain + j] = + (-1 / logit_grad_data[i * d + lbl * remain + j]) * + out_grad_data[idx]; + for (int k = 0; k < axis_dim; ++k) { // for each class id's label + if (k != + label_data[idx]) { // label_data[idx]: this sample's label + logit_grad_data[i * d + k * remain + j] = 0; + } + } + } + } + } + } + return; + } + // for use_softmax=False, continue + + if (soft_label) { + // when soft_label = True, ignore_index is not supported + auto lbl_mat = EigenMatrix::From(labels_2d); + logit_grad_mat.device(place) = + out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * + (logit_grad_mat - lbl_mat); + // for each sample, i is sample id + // 1) compute dy/dx by p_j - y_j or P-Y, where j is class id, + // P=logit_grad_mat[i] is all class's probs, Y=lbl_mat[i] is + // all class's label + // 2) compute dy * dy/dx by Chain rule, dy=out_grad_mat[i] + // for high dims, e.g. (n,c) or (n,d1,...,dm, c), compute grad by matrix + // operation + + } else { + logit_grad_mat.device(place) = + logit_grad_mat * // element_wise multiply + out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)); + + const auto* label_data = label.data(); + T* logit_grad_data = logit_grad->data(); + const T* out_grad_data = out_grad->data(); + const int remain = d / axis_dim; + for (int i = 0; i < n; ++i) { // for each sample_1_dim + for (int j = 0; j < remain; j++) { // for each sample_other_dims + int idx = i * remain + j; // this sample's label_idx. 
for 1d case, + // remain=1 and j=0, so, idx = i + auto lbl = static_cast(label_data[idx]); + if (lbl == ignore_index) { + for (int k = 0; k < axis_dim; ++k) { // for each class id's label + logit_grad_data[i * d + k * remain + j] = 0; + } + } else { + // only for this sample's label_idx, the label is 1, others is 0, + // so, only compute this label_idx's class + // for 1d case, remain=1 and j=0, so, [i * d + label_data[idx] * + // remain + j] = [i * d + label_data[idx]] + // let idx_x = i * d + label_data[idx] * remain + j, + // logit_grad_data[idx_x] = logit_grad_data[idx_x] - + // out_grad_data[idx] + // note: logit_grad_mat = logit_grad_mat * out_grad_mat + // so: logit_grad_data[idx_x] = (logit_grad_data[idx_x] - 1) * + // out_grad_data[idx] + // means: dy/dp * dy= ( p - y ) * dy + + logit_grad_data[i * d + lbl * remain + j] -= out_grad_data[idx]; + } + } + } + } +} + +template +void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* logits_grad) { + auto dtype = label.dtype(); + if (soft_label) { + PADDLE_ENFORCE_EQ( + dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument("The Input(Label) should be with the " + "same data type as kernel data type.")); + CrossEntropyWithSoftmaxGradCPUKernel(dev_ctx, + label, + softmax, + loss_grad, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + logits_grad); + } else { + PD_DISPATCH_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxGradCPUKernel", ([&] { + CrossEntropyWithSoftmaxGradCPUKernel(dev_ctx, + label, + softmax, + loss_grad, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + logits_grad); + })); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, + CPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc new file mode 100644 index 0000000000000..c684fb416eaab --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/cross_entropy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/fluid/operators/math/cross_entropy.h" + +namespace phi { + +template +void CrossEntropy(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + bool soft_label, + int ignore_index, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = x.dims()[axis_v]; + + PADDLE_ENFORCE_GT( + axis_dim, + 0, + phi::errors::InvalidArgument( + "The axis dimention should be larger than 0, but received " + "axis dimention is %d.", + axis_dim)); + + dev_ctx.template Alloc(out); + + const int n = phi::funcs::SizeToAxis(axis_v, x.dims()); + PADDLE_ENFORCE_GT( + n, + 0, + phi::errors::InvalidArgument( + "The size of axis should be larger than 0, but received " + "SizeToAxis of softmax is %d.", + n)); + + const int d = phi::funcs::SizeFromAxis(axis_v, x.dims()); + + DenseTensor x_2d(x); + x_2d.Resize({n, d}); + DenseTensor label_2d(label); + label_2d.Resize({n, label.numel() / n}); + DenseTensor out_2d(*out); + out_2d.Resize({n, d / axis_dim}); + + paddle::operators::math::CrossEntropyFunctor()( + dev_ctx, &out_2d, &x_2d, &label_2d, soft_label, ignore_index, axis_dim); +} + +template +void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* softmax, + DenseTensor* loss) { + // do not with softmax op, and input is softmax + if (!use_softmax) { + CrossEntropy( + dev_ctx, logits, label, soft_label, ignore_index, axis, loss); + // cause of input is softmax, copy to output softmax, directly + phi::Copy(dev_ctx, logits, dev_ctx.GetPlace(), false, softmax); + return; + } + + phi::SoftmaxKernel(dev_ctx, logits, axis, softmax); + CrossEntropy( + dev_ctx, *softmax, label, soft_label, ignore_index, axis, loss); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cross_entropy_with_softmax, + CPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 556de3adcf498..0b76425a659a0 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -31,7 +31,7 @@ void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { template void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val, DataType dtype, DenseTensor* out) { diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc index 7e336f18bf80a..348d24b534e3e 100644 --- a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc @@ -23,7 +23,7 @@ namespace phi { template void GaussianRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc index 006711ceef75e..d060e8c9b2837 100644 --- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc +++ 
b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -68,9 +68,9 @@ void IndexSampleGradInner(const Context& context, template void IndexSampleGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& out_grad, DenseTensor* x_grad) { auto index_type = index.dtype(); bool index_type_match = diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc new file mode 100644 index 0000000000000..550439a5251db --- /dev/null +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -0,0 +1,1067 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +static void LinearInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_w, + const int in_w, + const int n, + const int c, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; // w + int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id + + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; // w1lambda + float d_e = 1.f - d_w; // w2lambda + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // linear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, l); + input_grad_t(i, j, x_w) += static_cast(grad * d_e); + input_grad_t(i, j, x_e) += static_cast(grad * d_w); + } else { + const T grad = output_grad_t(i, l, j); + input_grad_t(i, x_w, j) += static_cast(grad * d_e); + input_grad_t(i, x_e, j) += static_cast(grad * d_w); + } + } + } + } +} + +template +static void BilinearInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int k = 0; k < out_h; k++) { // loop for images + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); + input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); + input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); + input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); + } else { + const T grad = output_grad_t(i, k, l, j); + input_grad_t(i, y_n, x_w, j) += static_cast(grad * d_s * d_e); + input_grad_t(i, y_s, x_w, j) += static_cast(grad * d_n * d_e); + input_grad_t(i, y_n, x_e, j) += static_cast(grad * d_s * d_w); + input_grad_t(i, y_s, x_e, j) += static_cast(grad * d_n * d_w); + } + } + } + } + } +} + +template +static void NearestNeighborInterpolateGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? 
static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + } else { + input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j); + } + } + } + } + } +} + +template +static void BicubicInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int k = 0; k < out_h; k++) { // loop for images + T y_n = align_corners ? static_cast(ratio_h * k) + : static_cast(ratio_h * (k + 0.5) - 0.5); + int input_y = floorf(y_n); + T y_t = y_n - input_y; + + for (int l = 0; l < out_w; l++) { + T x_n = align_corners ? static_cast(ratio_w * l) + : static_cast(ratio_w * (l + 0.5) - 0.5); + int input_x = floorf(x_n); + T x_t = x_n - input_x; + + T x_coeffs[4]; + T y_coeffs[4]; + + funcs::get_cubic_upsample_coefficients(x_coeffs, x_t); + funcs::get_cubic_upsample_coefficients(y_coeffs, y_t); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + // bicubic interpolation grad + for (int ii = 0; ii < 4; ii++) { + for (int jj = 0; jj < 4; jj++) { + int access_x = std::max(std::min(input_x - 1 + ii, in_w - 1), + static_cast(0)); + int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1), + static_cast(0)); + if (data_layout == DataLayout::kNCHW) { + T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, access_y, access_x) += + grad * y_coeffs[jj] * x_coeffs[ii]; + } else { + T grad = output_grad_t(i, k, l, j); + input_grad_t(i, access_y, access_x, j) += + grad * y_coeffs[jj] * x_coeffs[ii]; + } + } + } + } + } + } + } +} + +template +static void TrilinearInterpolationGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int in_d, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int j = 0; j < out_d; j++) { // loop for D + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + + for (int k = 0; k < out_h; k++) { // loop for H + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { // loop for W + int x_w = align_flag ? 
static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + // trilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(b, i, j, k, l); + input_grad_t(b, i, t_f, y_n, x_w) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, i, t_f, y_n, x_e) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, i, t_f, y_s, x_w) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, i, t_f, y_s, x_e) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, i, t_b, y_n, x_w) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, i, t_b, y_n, x_e) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, i, t_b, y_s, x_w) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, i, t_b, y_s, x_e) += + static_cast(grad * d_f * d_n * d_w); + } else { + const T grad = output_grad_t(b, j, k, l, i); + input_grad_t(b, t_f, y_n, x_w, i) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, t_f, y_n, x_e, i) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, t_f, y_s, x_w, i) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, t_f, y_s, x_e, i) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, t_b, y_n, x_w, i) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, t_b, y_n, x_e, i) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, t_b, y_s, x_w, i) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, t_b, y_s, x_e, i) += + static_cast(grad * d_f * d_n * d_w); + } + } + } + } + } + } +} + +template +static void NearestNeighbor3DInterpolateGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int d = 0; d < out_d; d++) { + int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) + : static_cast(ratio_d * d); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? 
static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_d, in_k, in_l) += + output_grad_t(i, j, d, k, l); + } else { + input_grad_t(i, in_d, in_k, in_l, j) += + output_grad_t(i, d, k, l, j); + } + } + } + } + } + } +} + +template +static void Interpolate1DCPUBwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1.0; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) { + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_w = out_size_data[0]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } + + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_w}; + } else { + dim_grad = {n, in_w, c}; + } + + input_grad->Resize(dim_grad); + dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + if ("linear" == interp_method) { + LinearInterpolationGrad(output_grad, + input_grad, + ratio_w, + in_w, + n, + c, + out_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCPUBwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_w = scale_data[0]; + scale_h = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } + + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + + input_grad->Resize(dim_grad); + dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("bilinear" == interp_method) { + BilinearInterpolationGrad(output_grad, + input_grad, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolateGrad(output_grad, + input_grad, + ratio_h, + ratio_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } else if ("bicubic" == interp_method) { + BicubicInterpolationGrad(output_grad, + input_grad, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCPUBwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } + if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + input_grad->Resize(dim_grad); + dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("trilinear" == interp_method) { + TrilinearInterpolationGrad(output_grad, + input_grad, + ratio_d, + ratio_h, + ratio_w, + in_d, + in_h, + in_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighbor3DInterpolateGrad(output_grad, + input_grad, + ratio_d, + ratio_h, + ratio_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +void InterpolateGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + auto output_grad_dims = output_grad.dims(); + if (output_grad_dims.size() == 3) { // 1D interpolation grad + Interpolate1DCPUBwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } else if (output_grad_dims.size() == 4) { // 2D interpolation grad + Interpolate2DCPUBwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + + } else if (output_grad_dims.size() == 5) { // 3D interpolation grad + Interpolate3DCPUBwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } +} + +template +void BilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& 
x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void NearestInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void TrilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void LinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void BicubicInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::BilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(nearest_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::NearestInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(trilinear_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::TrilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(linear_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::LinearInterpGradKernel, + float, + double) {} 
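// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the grad kernels above scatter
// each output-gradient value back onto the input cells that produced it, using
// the same lambda weights as the forward pass. The standalone function below
// shows that scatter for a single-channel H x W plane under the half-pixel
// mapping (align_corners = false, align_mode = 0); ScatterBilinearGrad and its
// arguments are illustrative names only, not phi API.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <vector>

void ScatterBilinearGrad(std::vector<float>* din, int in_h, int in_w,
                         float dout, int k, int l,
                         float ratio_h, float ratio_w) {
  float src_y = std::max(ratio_h * (k + 0.5f) - 0.5f, 0.f);
  float src_x = std::max(ratio_w * (l + 0.5f) - 0.5f, 0.f);
  int y_n = static_cast<int>(src_y);          // top neighbor row
  int x_w = static_cast<int>(src_x);          // left neighbor column
  int y_s = std::min(y_n + 1, in_h - 1);      // bottom neighbor row
  int x_e = std::min(x_w + 1, in_w - 1);      // right neighbor column
  float d_n = src_y - y_n, d_s = 1.f - d_n;   // vertical lambdas
  float d_w = src_x - x_w, d_e = 1.f - d_w;   // horizontal lambdas
  std::vector<float>& g = *din;
  g[y_n * in_w + x_w] += dout * d_s * d_e;
  g[y_s * in_w + x_w] += dout * d_n * d_e;
  g[y_n * in_w + x_e] += dout * d_s * d_w;
  g[y_s * in_w + x_e] += dout * d_n * d_w;
}
// Summed over every output position (k, l), this reproduces
// BilinearInterpolationGrad for one image and one channel; the kernel above
// additionally loops over batch, channel, and both NCHW/NHWC layouts.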
+PD_REGISTER_KERNEL(bicubic_interp_v2_grad, + CPU, + ALL_LAYOUT, + phi::BicubicInterpGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc new file mode 100644 index 0000000000000..da9a54748f06f --- /dev/null +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -0,0 +1,1225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + +namespace phi { + +template +static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) { + T coeffs[4]; + funcs::get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +template +static void LinearInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_w, + const int in_w, + const int n, + const int c, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; // w + int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w; // w_id + + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; // w1lambda + float d_e = 1.f - d_w; // w2lambda + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + for (int l = 0; l < out_w; l++) { + // linear interpolation + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vx_w[l]) * vd_e[l] + + input_t(i, j, vx_e[l]) * vd_w[l]; + output_t(i, j, l) = out_t; + } else { + out_t = input_t(i, vx_w[l], j) * vd_e[l] + + input_t(i, vx_e[l], j) * vd_w[l]; + output_t(i, l, j) = out_t; + } + } + } + } +} + +template +static void BilinearInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(4) +#endif + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + for (int k = 0; k < out_h; k++) { // loop for images + for (int l = 0; l < out_w; l++) { + // bilinear interpolation + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + + input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + + input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + + input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; + output_t(i, j, k, l) = out_t; + + } else { + out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] + + input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] + + input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] + + input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l]; + output_t(i, k, l, j) = out_t; + } + } + } + } + } +} + +template +static void NearestNeighborInterpolate(const DenseTensor& input, + DenseTensor* output, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int k = 0; k < out_h; k++) { // loop for images + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + } else { + output_t(i, k, l, j) = input_t(i, in_k, in_l, j); + } + } + } + } + } +} + +template +static void BicubicInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_h, + const float ratio_w, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + + for (int k = 0; k < out_h; k++) { // loop for images + T y_n = align_corners ? static_cast(ratio_h * k) + : static_cast(ratio_h * (k + 0.5) - 0.5); + int input_y = floorf(y_n); + const T y_t = y_n - input_y; + + for (int l = 0; l < out_w; l++) { + T x_n = align_corners ? 
static_cast(ratio_w * l) + : static_cast(ratio_w * (l + 0.5) - 0.5); + int input_x = floorf(x_n); + const T x_t = x_n - input_x; + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + T coefficients[4]; + // interp 4 times in x direction + for (int ii = 0; ii < 4; ii++) { + int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1), + static_cast(0)); + int access_x_0 = + std::max(std::min(input_x - 1, in_w - 1), static_cast(0)); + int access_x_1 = + std::max(std::min(input_x + 0, in_w - 1), static_cast(0)); + int access_x_2 = + std::max(std::min(input_x + 1, in_w - 1), static_cast(0)); + int access_x_3 = + std::max(std::min(input_x + 2, in_w - 1), static_cast(0)); + if (data_layout == DataLayout::kNCHW) { + coefficients[ii] = + cubic_interp(input_t(i, j, access_y, access_x_0), + input_t(i, j, access_y, access_x_1), + input_t(i, j, access_y, access_x_2), + input_t(i, j, access_y, access_x_3), + x_t); + } else { + coefficients[ii] = + cubic_interp(input_t(i, access_y, access_x_0, j), + input_t(i, access_y, access_x_1, j), + input_t(i, access_y, access_x_2, j), + input_t(i, access_y, access_x_3, j), + x_t); + } + } + + // interp y direction + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = cubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t); + } else { + output_t(i, k, l, j) = cubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t); + } + } + } + } + } +} + +template +static void TrilinearInterpolation(const DenseTensor& input, + DenseTensor* output, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int in_d, + const int in_h, + const int in_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const int align_mode, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vt_f, vt_b; + std::vector vd_f, vd_b; + vt_f.reserve(out_d); + vt_b.reserve(out_d); + vd_f.reserve(out_d); + vd_b.reserve(out_d); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int j = 0; j < out_d; j++) { + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + { + vt_f[j] = t_f; + vt_b[j] = t_b; + vd_f[j] = d_f; + vd_b[j] = d_b; + } + } + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? 
idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(5) +#endif + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + for (int j = 0; j < out_d; j++) { // loop for D, H, W + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + // trilinear interpolation + if (data_layout == DataLayout::kNCHW) { + T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, i, j, k, l) = out_t; + } else { + T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, j, k, l, i) = out_t; + } + } + } + } + } + } +} + +template +static void NearestNeighbor3DInterpolate(const DenseTensor& input, + DenseTensor* output, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const int n, + const int c, + const int out_d, + const int out_h, + const int out_w, + const bool align_corners, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + for (int d = 0; d < out_d; d++) { // loop for images + int in_d = (align_corners) ? static_cast(ratio_d * d + 0.5) + : static_cast(ratio_d * d); + for (int k = 0; k < out_h; k++) { + int in_k = (align_corners) ? static_cast(ratio_h * k + 0.5) + : static_cast(ratio_h * k); + + for (int l = 0; l < out_w; l++) { + int in_l = (align_corners) ? 
static_cast(ratio_w * l + 0.5) + : static_cast(ratio_w * l); + + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, d, k, l) = input_t(i, j, in_d, in_k, in_l); + } else { // NDHWC + output_t(i, d, k, l, j) = input_t(i, in_d, in_k, in_l, j); + } + } + } + } + } + } +} + +template +static void Interpolate1DCPUFwd( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1.; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) { + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_w = out_size_data[0]; + } + } + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_w}; + } else { + dim_out = {n, out_w, c}; + } + output->Resize(dim_out); + dev_ctx.template Alloc(output); + + if (in_w == out_w) { + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + if ("linear" == interp_method) { + LinearInterpolation(x, + output, + ratio_w, + in_w, + n, + c, + out_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCPUFwd( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_h = -1; + float scale_w = -1; + + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->Resize(dim_out); + dev_ctx.template Alloc(output); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("bilinear" == interp_method) { + BilinearInterpolation(x, + output, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolate(x, + output, + ratio_h, + ratio_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } else if ("bicubic" == interp_method) { + BicubicInterpolation(x, + output, + ratio_h, + ratio_w, + in_h, + in_w, + n, + c, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCPUFwd( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } + if (scale_w > 0. && scale_h > 0. && scale_d > 0.) 
{ + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + } + PADDLE_ENFORCE_GT( + out_d, + 0, + errors::InvalidArgument("out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + + output->Resize(dim_out); + dev_ctx.template Alloc(output); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + if ("trilinear" == interp_method) { + TrilinearInterpolation(x, + output, + ratio_d, + ratio_h, + ratio_w, + in_d, + in_h, + in_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighbor3DInterpolate(x, + output, + ratio_d, + ratio_h, + ratio_w, + n, + c, + out_d, + out_h, + out_w, + align_corners, + data_layout); + } +} + +template +void InterpolateKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto input_dims = x.dims(); + if (input_dims.size() == 3) { // 1D interpolation + Interpolate1DCPUFwd(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + + } else if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCPUFwd(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCPUFwd(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } +} + +template +void BilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void NearestInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void TrilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void LinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int 
out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void BicubicInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2, + CPU, + ALL_LAYOUT, + phi::BilinearInterpKernel, + float, + double, + uint8_t) {} +PD_REGISTER_KERNEL(nearest_interp_v2, + CPU, + ALL_LAYOUT, + phi::NearestInterpKernel, + float, + double, + int, + int64_t, + uint8_t) {} +PD_REGISTER_KERNEL(trilinear_interp_v2, + CPU, + ALL_LAYOUT, + phi::TrilinearInterpKernel, + float, + double, + uint8_t) {} +PD_REGISTER_KERNEL(linear_interp_v2, + CPU, + ALL_LAYOUT, + phi::LinearInterpKernel, + float, + double, + uint8_t) {} +PD_REGISTER_KERNEL(bicubic_interp_v2, + CPU, + ALL_LAYOUT, + phi::BicubicInterpKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index 7fe41e686af8c..bbb08f0616776 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -21,9 +21,9 @@ namespace phi { template void MaskedSelectGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& mask, + const DenseTensor& out_grad, DenseTensor* x_grad) { auto* mask_data = mask.data(); auto* input_data = out_grad.data(); diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc index e7d74759f516a..5b859b6ec270e 100644 --- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -121,8 +121,8 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - const DenseTensor& total_weight, paddle::optional weight, + const DenseTensor& total_weight, const DenseTensor& d_out, int64_t ignore_index, const std::string& reduction, diff --git a/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc index b1adb3e206da9..1b73acfa01509 100644 --- a/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc @@ -362,7 +362,7 @@ template void Pad3dGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& paddings, + const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, diff --git a/paddle/phi/kernels/cpu/pad3d_kernel.cc b/paddle/phi/kernels/cpu/pad3d_kernel.cc index 68bd92168364d..aa44491a2a967 100644 --- a/paddle/phi/kernels/cpu/pad3d_kernel.cc +++ b/paddle/phi/kernels/cpu/pad3d_kernel.cc @@ -379,7 +379,7 @@ void Pad3DNDHWC(const T* in_data, template void Pad3dKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& paddings, + const 
IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, diff --git a/paddle/phi/kernels/cpu/randint_kernel.cc b/paddle/phi/kernels/cpu/randint_kernel.cc index feb418949ba40..2c7433f5d2100 100644 --- a/paddle/phi/kernels/cpu/randint_kernel.cc +++ b/paddle/phi/kernels/cpu/randint_kernel.cc @@ -25,7 +25,7 @@ template void RandintRawKernel(const Context& dev_ctx, int low, int high, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, int seed, DenseTensor* out) { @@ -49,7 +49,7 @@ template void RandintKernel(const Context& dev_ctx, int low, int high, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, DenseTensor* out) { RandintRawKernel(dev_ctx, low, high, shape, dtype, 0, out); diff --git a/paddle/phi/kernels/cpu/range_kernel.cc b/paddle/phi/kernels/cpu/range_kernel.cc new file mode 100644 index 0000000000000..8731696f61760 --- /dev/null +++ b/paddle/phi/kernels/cpu/range_kernel.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/range_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/range_function.h" + +namespace phi { + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out) { + T start_value = start.data()[0]; + T end_value = end.data()[0]; + T step_value = step.data()[0]; + int64_t size = 0; + phi::funcs::GetSize(start_value, end_value, step_value, &size); + out->Resize(phi::make_ddim({size})); + T* out_data = dev_ctx.template Alloc(out); + T value = start_value; + for (int64_t i = 0; i < size; ++i) { + out_data[i] = value; + value += step_value; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + range, CPU, ALL_LAYOUT, phi::RangeKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/roll_grad_kernel.cc b/paddle/phi/kernels/cpu/roll_grad_kernel.cc index b0d0c0663e4a2..b3bd27fca12a6 100644 --- a/paddle/phi/kernels/cpu/roll_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roll_grad_kernel.cc @@ -24,7 +24,7 @@ template void RollGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, DenseTensor* x_grad) { std::vector out_vec; diff --git a/paddle/phi/kernels/cpu/roll_kernel.cc b/paddle/phi/kernels/cpu/roll_kernel.cc index 25b64ef257dfb..67eb80304de58 100644 --- a/paddle/phi/kernels/cpu/roll_kernel.cc +++ b/paddle/phi/kernels/cpu/roll_kernel.cc @@ -25,7 +25,7 @@ namespace phi { template void RollKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, DenseTensor* out) { std::vector out_vec; diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h index 924e71aff31f3..e2d96b896ac6a 100644 --- 
a/paddle/phi/kernels/cpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index ea8e2702c19d6..56d872922490a 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -25,7 +25,7 @@ namespace phi { template void SplitKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { // need to infershape output diff --git a/paddle/phi/kernels/cpu/stack_grad_kernel.cc b/paddle/phi/kernels/cpu/stack_grad_kernel.cc new file mode 100644 index 0000000000000..018705333e962 --- /dev/null +++ b/paddle/phi/kernels/cpu/stack_grad_kernel.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/stack_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" + +namespace phi { + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad) { + if (axis < 0) axis += out.dims().size(); + int n = out.dims()[axis]; + std::vector dx_datas(n); // NOLINT + + for (int i = 0; i < n; i++) { + if (x_grad[i] == nullptr) { + dx_datas[i] = nullptr; + } else { + dx_datas[i] = dev_ctx.template Alloc(x_grad[i]); + } + } + auto dy_data = out.data(); + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= out.dims()[i]; + int total_num = out.numel(); + int post = total_num / (n * pre); + auto dx_data_arr = dx_datas.data(); + phi::funcs::StackGradFunctorForRange( + dev_ctx, dx_data_arr, dy_data, total_num, n, post); +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack_grad, + CPU, + ALL_LAYOUT, + phi::StackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/stack_kernel.cc b/paddle/phi/kernels/cpu/stack_kernel.cc new file mode 100644 index 0000000000000..5eb1cf061be2b --- /dev/null +++ b/paddle/phi/kernels/cpu/stack_kernel.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + int n = static_cast(x.size()); + T* y_data = dev_ctx.template Alloc(out); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto& dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, + CPU, + ALL_LAYOUT, + phi::StackKernel, + float, + double, + int, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/strided_slice_grad_kernel.cc b/paddle/phi/kernels/cpu/strided_slice_grad_kernel.cc index cdc5534d63c08..e6c812cf6bd5a 100644 --- a/paddle/phi/kernels/cpu/strided_slice_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/strided_slice_grad_kernel.cc @@ -19,10 +19,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h" -PD_REGISTER_KERNEL(strided_slice_grad, +PD_REGISTER_KERNEL(strided_slice_raw_grad, CPU, ALL_LAYOUT, - phi::StridedSliceGradKernel, + phi::StridedSliceRawGradKernel, bool, int, int64_t, diff --git a/paddle/phi/kernels/cpu/strided_slice_kernel.cc b/paddle/phi/kernels/cpu/strided_slice_kernel.cc index f34a3301fcb42..d0aa7b2f4cee6 100644 --- a/paddle/phi/kernels/cpu/strided_slice_kernel.cc +++ b/paddle/phi/kernels/cpu/strided_slice_kernel.cc @@ -19,10 +19,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/strided_slice_kernel_impl.h" -PD_REGISTER_KERNEL(strided_slice, +PD_REGISTER_KERNEL(strided_slice_raw, CPU, ALL_LAYOUT, - phi::StridedSliceKernel, + phi::StridedSliceRawKernel, bool, int, int64_t, diff --git a/paddle/phi/kernels/cpu/top_k_grad_kernel.cc b/paddle/phi/kernels/cpu/top_k_grad_kernel.cc index 582ee1157cce8..e44f85fb6c0fb 100644 --- a/paddle/phi/kernels/cpu/top_k_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/top_k_grad_kernel.cc @@ -51,17 +51,17 @@ static void FullTopKAssign(const Type& input_height, template void TopkGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& indices, - int k, + const DenseTensor& out_grad, + const Scalar& k_scalar, int axis, bool largest, bool sorted, DenseTensor* x_grad) { const auto& in_dims = x.dims(); const auto& out_dims = indices.dims(); - + int k = k_scalar.to(); // axis < 0, get the real axis axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc index c91e7475f5b7c..f2af19792615c 100644 --- a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -41,14 +41,14 @@ void TriangularSolveKernel(const Context& dev_ctx, int y_bst_ndim = y_bst_dims_vec.size(); // Tensor broadcast to 'out' and temp 'x_bst' - ScalarArray x_bst_dims(x_bst_dims_vec); + IntArray x_bst_dims(x_bst_dims_vec); DenseTensor x_bst = phi::Empty(dev_ctx, x_bst_dims); const T* x_bst_data = x_bst.data(); ExpandKernel(dev_ctx, x, x_bst_dims, &x_bst); out->Resize(phi::make_ddim(y_bst_dims_vec)); T* out_data = dev_ctx.template Alloc(out); - ScalarArray y_bst_dims(y_bst_dims_vec); + IntArray y_bst_dims(y_bst_dims_vec); ExpandKernel(dev_ctx, y, y_bst_dims, out); // Calculate use blas library diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc index 8ec1d9683e15a..91a6903418230 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -44,7 +44,7 @@ inline void UniformRealDistribution(phi::dtype::bfloat16 *data, template void UniformRandomRawKernel(const Context &dev_ctx, - const ScalarArray &shape, + const IntArray &shape, DataType dtype, float min, float max, @@ -86,7 +86,7 @@ void UniformRandomRawKernel(const Context &dev_ctx, template void UniformRandomKernel(const Context &dev_ctx, - const ScalarArray &shape, + const IntArray &shape, DataType dtype, float min, float max, diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc new file mode 100644 index 0000000000000..853b401315d22 --- /dev/null +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/unique_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +template +void UniqueKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + bool is_sorted = true; + UniqueRawKernel(context, + x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +template +void UniqueRawKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + if (dtype == phi::DataType::INT32) { + PADDLE_ENFORCE_LE( + x.numel(), + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + if (!is_sorted) { + phi::VisitDataType( + dtype, + phi::funcs::UniqueOpFunctor(context, out, index, &x)); + return; + } + + if (x.numel() == 0) { + context.template Alloc(out); + return; + } + if (axis.empty()) { + phi::VisitDataTypeTiny( + dtype, + phi::funcs::UniqueFlattendTensorFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); + } else { + int axis_value = axis[0]; + phi::VisitDataTypeTiny( + dtype, + phi::funcs::UniqueDimFunctor(context, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique, + CPU, + ALL_LAYOUT, + phi::UniqueKernel, + float, + double, + int32_t, + int64_t) {} + +PD_REGISTER_KERNEL(unique_raw, + CPU, + ALL_LAYOUT, + phi::UniqueRawKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc new file mode 100644 index 0000000000000..9c2dce808dca7 --- /dev/null +++ b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/unstack_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack_grad, + CPU, + ALL_LAYOUT, + phi::UnStackGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unstack_kernel.cc b/paddle/phi/kernels/cpu/unstack_kernel.cc new file mode 100644 index 0000000000000..3d233e9ec405f --- /dev/null +++ b/paddle/phi/kernels/cpu/unstack_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/unstack_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" + +PD_REGISTER_KERNEL( + unstack, CPU, ALL_LAYOUT, phi::UnStackKernel, float, double, int, int64_t) { +} diff --git a/paddle/phi/kernels/cross_entropy_grad_kernel.h b/paddle/phi/kernels/cross_entropy_grad_kernel.h new file mode 100644 index 0000000000000..ae4b0436c93ca --- /dev/null +++ b/paddle/phi/kernels/cross_entropy_grad_kernel.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* logits_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/cross_entropy_kernel.h b/paddle/phi/kernels/cross_entropy_kernel.h new file mode 100644 index 0000000000000..621c5f3666213 --- /dev/null +++ b/paddle/phi/kernels/cross_entropy_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// The deformed product of operator iterative upgrade, there is no strict 2.0 +// API corresponding to it! In 2.0 API paddle.nn.functional.cross_entropy, +// use_softmax has become an optional argument, which may be called +// CrossEntropyWithSoftmax more accurately, here we keep this kernel arguments +// same as original OpMaker, and if need a CrossEntropyKernel like +// paddle.nn.functional.cross_entropy, we can reuse this kernel +template +void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* softmax, + DenseTensor* loss); + +} // namespace phi diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 6e5f15fe1692b..e547e0ea1318d 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -22,7 +22,7 @@ namespace phi { template void EmptyKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); diff --git a/paddle/phi/kernels/empty_kernel.h b/paddle/phi/kernels/empty_kernel.h index f66f4419fd7f5..163179e578d9a 100644 --- a/paddle/phi/kernels/empty_kernel.h +++ b/paddle/phi/kernels/empty_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/infermeta/nullary.h" @@ -24,7 +24,7 @@ namespace phi { template void EmptyKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, DenseTensor* out); @@ -43,7 +43,7 @@ DenseTensor Empty(const Context& dev_ctx, DenseTensorMeta&& meta) { } template -DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { +DenseTensor Empty(const Context& dev_ctx, const IntArray& shape) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); DataType dtype = paddle::experimental::CppTypeToDataType::Type(); diff --git a/paddle/phi/kernels/expand_grad_kernel.h b/paddle/phi/kernels/expand_grad_kernel.h index d35b4b663e0fe..a734498b9870d 100644 --- a/paddle/phi/kernels/expand_grad_kernel.h +++ b/paddle/phi/kernels/expand_grad_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" @@ -24,7 +24,7 @@ template void ExpandGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* in_grad); } // namespace phi diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h index fb5a0112ffcf7..3b44c46e4dd7c 100644 --- a/paddle/phi/kernels/expand_kernel.h +++ b/paddle/phi/kernels/expand_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" @@ -23,7 +23,7 @@ namespace phi { template void ExpandKernel(const Context& ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, 
DenseTensor* out); } // namepsace phi diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index df82e651a0b26..d5785f2eedafa 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -16,8 +16,8 @@ #include +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/nullary.h" @@ -27,7 +27,7 @@ namespace phi { template void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val, DataType dtype, DenseTensor* out); @@ -53,7 +53,7 @@ void FullBatchSizeLikeKernel(const Context& dev_ctx, template void Full(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val, DenseTensor* out) { FullKernel(dev_ctx, @@ -65,7 +65,7 @@ void Full(const Context& dev_ctx, template DenseTensor Full(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index 9382b03cf9368..d71a61f107a7a 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include #include "paddle/phi/core/hostdevice.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h new file mode 100644 index 0000000000000..453f9ea87c7cc --- /dev/null +++ b/paddle/phi/kernels/funcs/interpolate_function.h @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
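The new interpolate_function.h gathers shared helpers for the interpolate kernels; ExtractNCDWH in particular maps a 3-D/4-D/5-D shape plus a data layout onto N, C, D, H, W, filling the missing spatial dimensions with 1. A hedged sketch of the expected mapping (the helper function name and the example shape are invented for illustration):

#include "paddle/phi/kernels/funcs/interpolate_function.h"

// Sketch only, not part of the patch.
void ExtractNCDWHSketch() {
  // A 4-D NCHW shape: batch 8, 3 channels, 32 x 64 spatial size.
  phi::DDim dims = phi::make_ddim({8, 3, 32, 64});
  int n = 0, c = 0, d = 0, h = 0, w = 0;
  phi::funcs::ExtractNCDWH(dims, phi::DataLayout::kNCHW, &n, &c, &d, &h, &w);
  // Expected: n == 8, c == 3, d == 1 (no depth dimension), h == 32, w == 64.
}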
+ +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/fluid/platform/fast_divmod.h" +#endif + +namespace phi { +namespace funcs { + +template +HOSTDEVICE inline T CubicConvolution1(T x, T A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +HOSTDEVICE inline T CubicConvolution2(T x, T A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) { + T A = -0.75; + + T x1 = t; + coeffs[0] = CubicConvolution2(x1 + 1.0, A); + coeffs[1] = CubicConvolution1(x1, A); + + // opposite coefficients + T x2 = 1.0 - t; + coeffs[2] = CubicConvolution1(x2, A); + coeffs[3] = CubicConvolution2(x2 + 1.0, A); +} + +inline void ExtractNCDWH(const DDim& dims, + const DataLayout& data_layout, + int* N, + int* C, + int* D, + int* H, + int* W) { + *N = dims[0]; + + if (dims.size() == 3) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[2]; + *D = 1; + *H = 1; + *W = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + } else if (dims.size() == 4) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3]; + *D = 1; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4]; + *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3]; + } +} + +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + PADDLE_ENFORCE_EQ( + tensor->dims(), + phi::make_ddim({1}), + errors::InvalidArgument("The shape of dimension tensor should be [1]," + "but received d%.", + tensor->dims())); + if (paddle::platform::is_gpu_place(tensor->place())) { + DenseTensor temp; + paddle::framework::TensorCopySync( + *tensor, paddle::platform::CPUPlace(), &temp); + vec_new_shape.push_back(static_cast(*temp.data())); + } else { + vec_new_shape.push_back(static_cast(*tensor->data())); + } + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor( + const DenseTensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + DenseTensor cpu_starts_tensor; + if (paddle::platform::is_gpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync( + *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#ifdef PADDLE_WITH_ASCEND_CL + if (paddle::platform::is_npu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync( + *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#endif +#ifdef PADDLE_WITH_XPU + if (paddle::platform::is_xpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync( + *new_data_tensor, paddle::platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#endif + vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); + return vec_new_data; +} + +#if defined(__NVCC__) || defined(__HIPCC__) +using paddle::platform::FastDivMod; 
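// FastDivMod precomputes a magic multiplier and shift for a fixed divisor so
// that device code can replace integer "/" and "%" (comparatively expensive on
// CUDA/HIP) with multiplies and shifts. The struct below simply bundles the
// three divisors the GPU interpolate kernels need when decoding a flattened
// output index in the NHWC path: channels, output width, and (presumably)
// output width * channels for the "outout_wc" constructor argument.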
+ +struct FastDivModForInterpolate { + public: + FastDivMod channels_div; + FastDivMod output_w_div; + FastDivMod output_wc_div; + + explicit HOSTDEVICE FastDivModForInterpolate(const int channels, + const int output_w, + const int outout_wc) + : channels_div(FastDivMod(channels)), + output_w_div(FastDivMod(output_w)), + output_wc_div(FastDivMod(outout_wc)) {} +}; + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/range_function.h b/paddle/phi/kernels/funcs/range_function.h new file mode 100644 index 0000000000000..5ace32f46ace1 --- /dev/null +++ b/paddle/phi/kernels/funcs/range_function.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +template +void GetSize(T start, T end, T step, int64_t* size) { + PADDLE_ENFORCE_NE( + step, + 0, + phi::errors::InvalidArgument("The step of range op should not be 0.")); + + if (start < end) { + PADDLE_ENFORCE_GT( + step, + 0, + phi::errors::InvalidArgument( + "The step should be greater than 0 while start < end.")); + } + + if (start > end) { + PADDLE_ENFORCE_LT(step, + 0, + phi::errors::InvalidArgument( + "The step should be less than 0 while start > end.")); + } + + *size = std::is_integral::value + ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step)) + : std::ceil(std::abs((end - start) / step)); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 16e00414ad772..193b9f614c9d5 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -414,7 +414,7 @@ void SelectKernel(const KPDevice &dev_ctx, // 1.2 alloc tmp data for CoutBlock const int size_count_block = need_grids + 1; std::vector dims_vec = {size_count_block * 2}; - ScalarArray dims_array(dims_vec); + IntArray dims_array(dims_vec); DenseTensor count_mem = phi::Empty(dev_ctx, dims_array); CT *count_data = count_mem.data(); // 1.3 launch CountKernl diff --git a/paddle/phi/kernels/funcs/stack_functor.h b/paddle/phi/kernels/funcs/stack_functor.h new file mode 100644 index 0000000000000..68379c27058ad --- /dev/null +++ b/paddle/phi/kernels/funcs/stack_functor.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +namespace funcs { + +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; + } + + private: + VecXType x_; + T *y_; + int n_; + int post_; +}; + +template +struct StackGradFunctor { + HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) + : dx_(dx), dy_(dy), n_(n), post_(post) {} + + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; + } + + private: + VecDxType dx_; + const T *dy_; + int n_; + int post_; +}; + +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, + T *y, + int total_num, + int n, + int post) { + phi::funcs::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + +template +static inline void StackGradFunctorForRange(const DeviceContext &ctx, + const VecDxType &dx, + const T *dy, + int total_num, + int n, + int post) { + phi::funcs::ForRange for_range(ctx, total_num); + for_range(StackGradFunctor(dx, dy, n, post)); +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/strided_slice.h b/paddle/phi/kernels/funcs/strided_slice.h index 38a611ba26e22..8eebfc7caa795 100644 --- a/paddle/phi/kernels/funcs/strided_slice.h +++ b/paddle/phi/kernels/funcs/strided_slice.h @@ -178,9 +178,9 @@ template void StridedSliceCompute(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, DenseTensor* out) { @@ -295,9 +295,9 @@ template void StridedSliceCompute(const Context& dev_ctx, const std::vector& x, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, std::vector out) { @@ -446,9 +446,9 @@ void StridedSliceGradCompute(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, DenseTensor* x_grad) { @@ -530,9 +530,9 @@ void StridedSliceGradCompute(const Context& dev_ctx, const std::vector& x, const std::vector& out_grad, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, std::vector x_grad) { diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h new file mode 100644 index 0000000000000..2bb51cdab65c6 --- /dev/null +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -0,0 +1,426 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +template +struct UniqueOpFunctor { + const Context& context_; + DenseTensor* out_; + DenseTensor* index_; + const DenseTensor* in_; + DenseTensor* count_; + + UniqueOpFunctor(const Context& context, + DenseTensor* out, + DenseTensor* index, + const DenseTensor* in, + DenseTensor* count = nullptr) + : context_(context), out_(out), index_(index), in_(in), count_(count) {} + + template + void apply() const { + auto* in_data = in_->data(); + auto* index_data = context_.template Alloc(index_); + + int64_t j = 0; + + // TODO(fangzeyang): Should optimize performance here. + std::unordered_map dict; + std::vector uniq; + + PADDLE_ENFORCE_LT( + in_->numel(), + pow(2, 31), + phi::errors::InvalidArgument( + "The num of Input(X) elements should be less then INT_MAX, " + "but received num is %d.", + in_->numel())); + + for (auto i = 0; i < in_->numel(); i++) { + auto it = dict.find(in_data[i]); + if (it == dict.end()) { + dict.emplace(std::make_pair(in_data[i], j)); + uniq.emplace_back(in_data[i]); + index_data[i] = static_cast(j); + j++; + } else { + index_data[i] = static_cast(it->second); + } + } + + if (count_ != nullptr) { + // Resize the count tensor dims to allocate the memory + count_->Resize(phi::make_ddim({static_cast(uniq.size())})); + IndexT* count_data = context_.template Alloc(count_); + // init count_data to 0 + memset(count_data, 0, uniq.size() * sizeof(IndexT)); + + const auto& index_type = index_->dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds %s, " + "but desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT64)))); + + if (index_type == DataType::INT32) { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } else { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } + } + + out_->Resize(phi::make_ddim({static_cast(uniq.size())})); + auto* out_data = context_.template Alloc(out_); + std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); + } +}; + +static std::vector Unbind(const DenseTensor& in) { + int64_t size = in.dims()[0]; + std::vector tensors(size); + for (int64_t i = 0; i < size; ++i) { + tensors[i] = 
in.Slice(i, i + 1); + } + return tensors; +} + +template +static bool Equal(const DenseTensor& a, const DenseTensor& b) { + if (a.numel() != b.numel()) { + return false; + } + for (int64_t i = 0; i < a.numel(); ++i) { + if (a.data()[i] != b.data()[i]) { + return false; + } + } + return true; +} + +template +static void UniqueFlattendTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) { + const InT* in_data = in.data(); + std::set unique(in_data, in_data + in.numel()); + out->Resize(phi::make_ddim({static_cast(unique.size())})); + auto* out_data = context.template Alloc(out); + std::copy(unique.begin(), unique.end(), out_data); + + if (return_index) { + indices->Resize(phi::make_ddim({out->numel()})); + auto indices_data = context.template Alloc(indices); + std::unordered_map indices_map; + indices_map.reserve(out->numel()); + for (int64_t i = 0; i < in.numel(); ++i) { + if (indices_map.find(in_data[i]) != indices_map.end()) continue; + indices_map[in_data[i]] = i; + } + for (int64_t i = 0; i < out->numel(); ++i) { + indices_data[i] = indices_map[out_data[i]]; + } + } + + if (return_inverse) { + index->Resize(phi::make_ddim({in.numel()})); + auto inverse_data = context.template Alloc(index); + std::unordered_map inverse_map; + inverse_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + inverse_map[out_data[i]] = i; + } + for (int64_t i = 0; i < in.numel(); ++i) { + inverse_data[i] = inverse_map[in_data[i]]; + } + } + + if (return_counts) { + count->Resize(phi::make_ddim({out->numel()})); + auto count_data = context.template Alloc(count); + std::unordered_map counts_map; + counts_map.reserve(out->numel()); + for (int64_t i = 0; i < out->numel(); ++i) { + counts_map[out_data[i]] = 0; + } + for (int64_t i = 0; i < in.numel(); i++) { + counts_map[in_data[i]] += 1; + } + for (int64_t i = 0; i < out->numel(); i++) { + count_data[i] = counts_map[out_data[i]]; + } + } +} + +template +static ForwardIt UniqueDimImpl(const Context& context, + ForwardIt first, + ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec, + std::vector* indices_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + (*indices_vec)[0] = sorted_indices_vec[0]; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + (*indices_vec)[idx_result] = sorted_indices_vec[idx_first]; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueDim(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts, + int axis) { + // transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + phi::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + TransCompute(in.dims().size(), context, in, &in_trans, permute); + // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + phi::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // sort indices + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + std::sort(sorted_indices_vec.begin(), + sorted_indices_vec.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + // sort tensor according to indices + DenseTensor input_sorted; + input_sorted.Resize(in_trans_dims); + context.template Alloc(&input_sorted); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + std::vector indices_vec(sorted_indices_vec.size(), 0); + auto last = UniqueDimImpl::iterator, InT>( + context, + input_unbind.begin(), + input_unbind.end(), + sorted_indices_vec, + &inverse_vec, + &counts_vec, + &indices_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + indices_vec.erase(indices_vec.begin() + input_unbind.size(), + indices_vec.end()); + + phi::funcs::ConcatFunctor concat_functor; + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + concat_functor(context, input_unbind, 0, &out_trans); + TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + + if (return_inverse) { + paddle::framework::TensorFromVector(inverse_vec, context, index); + } + + if (return_counts) { + paddle::framework::TensorFromVector(counts_vec, context, count); + } + + if (return_index) { + paddle::framework::TensorFromVector(indices_vec, context, indices); + } +} + +template +struct UniqueFlattendTensorFunctor { + const Context& ctx_; /* */ + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* count_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendTensorFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* 
count, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + count_(count), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendTensor(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_); + } +}; + +template +struct UniqueDimFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* count_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + const int axis, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + count_(count), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDim(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_, + axis_); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gaussian_random_kernel.h b/paddle/phi/kernels/gaussian_random_kernel.h index 2903d80d22d46..7424ad484a1fd 100644 --- a/paddle/phi/kernels/gaussian_random_kernel.h +++ b/paddle/phi/kernels/gaussian_random_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" @@ -22,7 +22,7 @@ namespace phi { template void GaussianRandomKernel(const Context& ctx, - const ScalarArray& shape, + const IntArray& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu new file mode 100644 index 0000000000000..215b94c52b395 --- /dev/null +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -0,0 +1,294 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
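  For reference, with p = softmax(logits) and hard label y, the kernel below
  computes d(loss)/d(logit_j) = (p_j - 1[j == y]) * loss_grad for every class
  j, and writes an all-zero gradient for rows whose label equals ignore_index.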
*/ + +#include "paddle/phi/kernels/cross_entropy_grad_kernel.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +// TODO(chenweihang): move dispatch.h into phi/core +#include "paddle/phi/api/ext/dispatch.h" + +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +template +__global__ void SoftLabelCrossEntropyGradientKernel(T* logit_grad, + const T* loss_grad, + const T* labels, + const int n, + const int d, + const int remain) { + int ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < n * d) { + int idx_n = ids / d; + int idx_remain = ids % remain; + int idx_loss = idx_n * remain + idx_remain; + logit_grad[ids] = loss_grad[idx_loss] * (-labels[ids] / logit_grad[ids]); + } +} + +template +__global__ void HardLabelCrossEntropyGradientKernel(T* logit_grad, + const LabelT* labels, + const int n, + const int d, + const int remain, + const int ignore_index) { + CUDA_KERNEL_LOOP(index, n * remain) { + int idx_n = index / remain; + int idx_remain = index % remain; + int tmp = static_cast(labels[index]); + int idx = idx_n * d + tmp * remain + idx_remain; + if (ignore_index != tmp) { + logit_grad[idx] = -static_cast(1.) / logit_grad[idx]; + } + } +} + +template +__global__ void ScaleCrossEntropyGradient(T* logit_grad, + const T* loss_grad, + const int num, + const int d, + const int remain, + const LabelT* labels, + const int ignore_index) { + CUDA_KERNEL_LOOP(index, num) { + int idx_n = index / d; + int idx_remain = index % remain; + int idx_lbl = idx_n * remain + idx_remain; + int k = (index % d) / remain; + auto lbl = static_cast(labels[idx_lbl]); + if (lbl == ignore_index || lbl != k) { + logit_grad[index] = static_cast(0.); + } else { + logit_grad[index] *= loss_grad[idx_lbl]; + } + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* logit_grad, + const T* loss_grad, + const T* labels, + const int64_t n, + const int64_t d, + const int64_t remain) { + int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; + if (ids < n * d) { + int64_t idx_n = ids / d; + int64_t idx_remain = ids % remain; + int64_t idx_loss = idx_n * remain + idx_remain; + logit_grad[ids] = loss_grad[idx_loss] * (logit_grad[ids] - labels[ids]); + } +} + +/* + Wrapper of softmax with cross entropy grad hard label. 
+*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabel(T* logits_grad, + const T* loss_grad, + const T* softmax, + const LabelT* labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + if (idx < n * dim * d) { + auto lbl = static_cast(labels[ids]); + if (lbl == ignore_index) { + logits_grad[idx] = static_cast(0.0); + } else if (lbl == idx_dim) { + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; + } else { + logits_grad[idx] = softmax[idx] * loss_grad[ids]; + } + } +} + +template +void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* logits_grad) { + PADDLE_ENFORCE_EQ( + dev_ctx.GetPlace().GetType(), + phi::AllocationType::GPU, + phi::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); + const T* loss_grad_data = loss_grad.data(); + DenseTensor* logit_grad = logits_grad; + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != &softmax && (!use_softmax || soft_label)); + if (copy_flag) { + phi::Copy(dev_ctx, softmax, dev_ctx.GetPlace(), false, logit_grad); + logit_grad_data = logit_grad->data(); + } else { + logit_grad_data = dev_ctx.template Alloc(logit_grad); + } + + const int rank = logit_grad->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logit_grad->dims()[axis_v]; + + const int64_t n = phi::funcs::SizeToAxis(axis_v, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); + const int64_t remain = d / axis_dim; + +#ifdef __HIPCC__ + int block = 256; +#else + int block = 512; +#endif + auto stream = dev_ctx.stream(); + + // do not with softmax op, and input is softmax + if (!use_softmax) { + if (soft_label) { + int grid = (n * d + block - 1) / block; + const T* label_data = label.data(); + SoftLabelCrossEntropyGradientKernel<<>>( + logit_grad_data, loss_grad_data, label_data, n, d, remain); + } else { + DenseTensor logits_grad_2d(*logit_grad); + logits_grad_2d.Resize({n, d}); + int grid = (n * remain + block - 1) / block; + const auto* label_data = label.data(); + HardLabelCrossEntropyGradientKernel<<>>( + logit_grad_data, label_data, n, d, remain, ignore_index); + int num = n * d; + grid = (num + block - 1) / block; + ScaleCrossEntropyGradient<<>>( + logit_grad_data, + loss_grad_data, + num, + d, + remain, + label_data, + ignore_index); + } + + return; + } + + // with softmax, continue + + if (soft_label) { + int64_t grid = (n * d + block - 1) / block; + const T* label_data = label.data(); + SoftCrossEntropyGradientKernel<<>>( + logit_grad_data, loss_grad_data, label_data, n, d, remain); + } else { + const T* softmax_data = softmax.data(); + const auto* label_data = label.data(); + int grid = (n * d + block - 1) / block; + SoftmaxWithCrossEntropyGradHardLabel<<>>( + logit_grad_data, + loss_grad_data, + softmax_data, + label_data, + n, + d / remain, + remain, + ignore_index); + } +} + +template +void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool soft_label, 
+ bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* logits_grad) { + auto dtype = label.dtype(); + if (soft_label) { + PADDLE_ENFORCE_EQ( + dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument("The Input(Label) should be with the " + "same data type as kernel data type.")); + CrossEntropyWithSoftmaxGradGPUKernel(dev_ctx, + label, + softmax, + loss_grad, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + logits_grad); + } else { + PD_DISPATCH_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxGradGPUKernel", ([&] { + CrossEntropyWithSoftmaxGradGPUKernel(dev_ctx, + label, + softmax, + loss_grad, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + logits_grad); + })); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, + GPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu similarity index 58% rename from paddle/fluid/operators/softmax_with_cross_entropy_op.cu rename to paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 41545a1ca20b2..055706cffd41e 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -1,13 +1,19 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_kernel.h" + #ifdef __NVCC__ #include "cub/cub.cuh" #endif @@ -15,39 +21,43 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/amp/fp16_type_traits.h" + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +// TODO(chenweihang): move dispatch.h into phi/core +#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" -namespace paddle { -namespace operators { +namespace phi { #define ALIGN_BYTES 16 -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using DataLayout = platform::DataLayout; -using Tensor = framework::Tensor; -namespace kps = phi::kps; +enum class SoftmaxMode { kSoftmax, kLogSoftmax, kCrossEntropy }; // Wrapper of log function. Use log(float32) for float16 template static __device__ __forceinline__ T Log(T x) { - using AccT = typename details::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; AccT logx = std::log(static_cast(x)); - return math::TolerableValue()(static_cast(logx)); + return paddle::operators::math::TolerableValue()(static_cast(logx)); } // Wrapper of exp function. Use exp(float32) for float16 template static __device__ __forceinline__ T Exp(T x) { - using AccT = typename details::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; AccT expx = std::exp(static_cast(x)); - return math::TolerableValue()(static_cast(expx)); + return paddle::operators::math::TolerableValue()(static_cast(expx)); } template @@ -62,22 +72,114 @@ struct ExpAddFunctor { Tx max; }; -// log2(value) -static inline int Log2Ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} +/* + Cross entropy soft label with dynamic size on axis (log2_elements is + varibale). + - if the input is softmax,compute loss with softmax + - if the input is log_softmax, compute loss with log_softmax and update + softmax +*/ +template +__global__ void CrossEntropySoftLabel(T* loss, + T* softmaxwrt, + const T* softmax, + const T* labels, + const int n, + const int dim, + const int d, + int log2_elements) { + const int kDimCeil = 1 << log2_elements; + const int kVSize = sizeof(VecT) / sizeof(T); -enum class SoftmaxMode { kSoftmax, kLogSoftmax, kCrossEntropy }; +#ifdef __HIPCC__ + const int kThreadPerBlock = 256; +#else + const int kThreadPerBlock = 512; +#endif + const int kBatchPerBlock = 1; + const int kWarpSize = 32; // (dim < 32) ? dim : 32; + const int kBatchSize = 1; + const int kThreadPerBatch = kThreadPerBlock / kBatchPerBlock; + const int kWarpPerBatch = kThreadPerBatch / kWarpSize; + + const int kIterations = (dim + kThreadPerBatch - 1) / kThreadPerBatch; + const int kIterationsV = (kIterations >= kVSize) ? 
(kIterations / kVSize) : 1; + + const int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; + + T sum[kBatchSize]{static_cast(0.0)}; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + int ids = first_batch + i; + if (ids >= n * d) break; + int idx_n = ids / d; + int idx_d = ids % d; +#pragma unroll + for (int it = 0; it < kIterations; ++it) { + int idx_dim = it * kThreadPerBatch + threadIdx.x; + int idx = idx_n * dim * d + idx_dim * d + idx_d; + + if (idx_n < n && idx_dim < dim) { + VecT softmaxdata; + if (InLogMode) { + softmaxdata = reinterpret_cast(&softmaxwrt[idx])[0]; + } else { + softmaxdata = reinterpret_cast(&softmax[idx])[0]; + } + VecT labelsdata = reinterpret_cast(&labels[idx])[0]; + T* softmaxptr = reinterpret_cast(&softmaxdata); + T* labelsptr = reinterpret_cast(&labelsdata); +#pragma unroll + for (int s = 0; s < kVSize; s++) { + if (InLogMode) { + sum[i] -= softmaxptr[s] * labelsptr[s]; + softmaxptr[s] = Exp(softmaxptr[s]); + } else { + sum[i] -= Log(softmaxptr[s]) * labelsptr[s]; + } + } + if (InLogMode) { + reinterpret_cast(&softmaxwrt[idx])[0] = softmaxdata; + } + } + } + } + phi::WarpReduceSum(sum); + __syncthreads(); + + __shared__ T sumshare[kWarpPerBatch][kBatchPerBlock][kBatchSize]; + if (threadIdx.x % kWarpSize == 0) { +#pragma unroll + for (int i = 0; i < kBatchSize; i++) { + sumshare[threadIdx.x / kWarpSize][threadIdx.y][i] = sum[i]; + } + } + __syncthreads(); + + // write + if (threadIdx.x == 0) { + for (int i = 0; i < kBatchSize; i++) { + int ids = first_batch + i; + if (ids < n * d) { + loss[ids] = sumshare[0][threadIdx.y][i]; + for (int s = 1; s < kWarpPerBatch; s++) { + loss[ids] += sumshare[s][threadIdx.y][i]; + } + } + } + } +} /* Hard label cross entropy. */ template -__global__ void CrossEntropyHardLabel(T* loss, const T* softmax, - const LabelT* labels, const int n, - const int dim, const int d, +__global__ void CrossEntropyHardLabel(T* loss, + const T* softmax, + const LabelT* labels, + const int n, + const int dim, + const int d, const int ignore_idx) { int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = ids / d; @@ -111,9 +213,12 @@ __global__ void CrossEntropyHardLabel(T* loss, const T* softmax, Output: loss and exp(input) */ template -__global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, - const LabelT* labels, const int n, - const int dim, const int d, +__global__ void CrossEntropyExpHardLabel(T* loss, + T* softmax, + const LabelT* labels, + const int n, + const int dim, + const int d, const int ignore_idx) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); @@ -146,308 +251,64 @@ __global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, } } -/* - Core function of softmax with cross entropy forward - - softmax, SoftmaxMode=kSoftmax - - log softmax, SoftmaxMode=kLogSoftmax - - softmax with cross entropy hard label, SoftmaxMode=kCrossEntropy - The computation includes - - Compute max value: maxvalue_{i} = max_j src_{i,j} - - Compute sum of exp: s_{i} = sum_{j}{e^{src_{i,j} - maxvalue_{i}}} - - Compute: softmax_{i,j} = e^{src_{i,j} - maxvalue_{i}} / s_{i} - - Compute: logsoftmax_{i,j} = src_{i,j} - maxvalue_{i} - log(s_{i}) - - Compute: loss_{i} = -logsoftmax[i,label[i]] (Hard label) - This computation results from following formula: - softmax_{i,j} = e^{src_{i,j}} / sum_{j}{e^{src_{i,j}}} - = e^{src_{i,j} - maxvalue_{i}} - / sum_{j}{e^{src_{i,j} - maxvalue_{i}}} - = e^{src_{i,j} - maxvalue_{i}} / s_{i} - logsoftmax_{i,j} = log(softmax_{i,j}) - = src_{i,j} - 
maxvalue_{i} - log(s_{i}) - One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). - For reduction max (sum), firstly compute max (sum) to one warp, then use - shuffle api to compute max (sum) in one warp. -*/ -template -__global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, - const LabelT* label, const int batch_size, - const int stride, const int element_count, - const int ignore_index) { - constexpr int kDimCeil = 1 << Log2Elements; - constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; - constexpr int kVSize = sizeof(VecT) / sizeof(T); - constexpr int kIterations = kDimCeil / kWarpSize; - constexpr int kIterationsV = - (kIterations >= kVSize) ? (kIterations / kVSize) : 1; - constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; - - int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; +template +__device__ __forceinline__ AccT ThreadReduce(const T* input, + int size, + const int offset, + AccT init, + ReduceFunctor reducer) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + AccT val = init; - // max index to read - int idx_max_v[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; i++) { - int idx_max = ((i + first_batch) < batch_size) ? element_count : 0; - idx_max_v[i] = idx_max / kVSize; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + val = reducer(val, input[tid]); + } + size -= blockDim.x; + input += blockDim.x; } + int remain = size % (VecSize * blockDim.x); - // read data from global memory - AccT srcdata[kBatchSize][kIterationsV][kVSize]; + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { -// read data to srcdata: - KVSize==1, - KVSize>1 -#pragma unroll - for (int it = 0; it < kIterationsV; ++it) { - int src_idx = threadIdx.x + it * kWarpSize; - if (kVSize == 1) { - if (src_idx < idx_max_v[i]) { - srcdata[i][it][0] = - static_cast(src[(first_batch + i) * stride + src_idx]); - } else { - srcdata[i][it][0] = -std::numeric_limits::infinity(); - } - } else { - const VecT* src_v = - reinterpret_cast(&src[(first_batch + i) * stride]); - if (src_idx < idx_max_v[i]) { - VecT srctmp = src_v[src_idx]; - const T* srcinptr = reinterpret_cast(&srctmp); -#pragma unroll - for (int s = 0; s < kVSize; s++) { - srcdata[i][it][s] = static_cast(srcinptr[s]); - } - } else { -#pragma unroll - for (int s = 0; s < kVSize; s++) { - srcdata[i][it][s] = -std::numeric_limits::infinity(); - } - } - } - } - } + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; - // compute max value: maxvalue_{i} = max_j src_{i,j} - AccT max_value[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - // it = 0 - AccT valmax = srcdata[i][0][0]; #pragma unroll - for (int s = 1; s < kVSize; ++s) { - valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; + for (int i = 0; i < VecSize; ++i) { + val = reducer(val, ins[i]); } - max_value[i] = valmax; + } -// it = 1, 2, ... -#pragma unroll - for (int it = 1; it < kIterationsV; ++it) { - AccT valmax = srcdata[i][it][0]; -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; - } - max_value[i] = (max_value[i] > valmax) ? 
max_value[i] : valmax; - } + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + val = reducer(val, input[tid]); } - phi::WarpReduceMax(max_value); + return val; +} - // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } - AccT sum[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - // it = 0 - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); - } else { - srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); - sum[i] = srcdata[i][0][0]; - } -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); +template +__device__ __forceinline__ void ComputeLoss(T* loss, + const T loss_value, + const int label_id, + const int64_t label_value, + const int tid, + const int vec_size, + const int offset, + const int ignore_index) { + int loss_id = vec_size * tid + offset; + if (IgnoreIndex) { + if (label_value == loss_id) { + if (label_value == ignore_index) { + loss[label_id] = static_cast(0.0f); } else { - srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); - sum[i] += srcdata[i][0][s]; - } - } - -// it = 1, 2, ... -#pragma unroll - for (int it = 1; it < kIterationsV; ++it) { -#pragma unroll - for (int s = 0; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); - } else { - srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); - sum[i] += srcdata[i][it][s]; - } - } - } - } - phi::WarpReduceSum(sum); - -// write data -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] = std::log(sum[i]); - } - -#pragma unroll - for (int it = 0; it < kIterationsV; ++it) { - int idx = threadIdx.x + it * kWarpSize; - if (kVSize == 1) { // kVSize==1 - if (idx < idx_max_v[i]) { - if (mode == SoftmaxMode::kLogSoftmax) { // log softmax - softmax[(first_batch + i) * stride + idx] = - srcdata[i][it][0] - max_value[i] - sum[i]; - // softmax with cross entropy hard label - } else if (mode == SoftmaxMode::kCrossEntropy) { - AccT logsoftmax = srcdata[i][it][0] - max_value[i] - sum[i]; - // softmax - softmax[(first_batch + i) * stride + idx] = std::exp(logsoftmax); - // label - int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; - auto lbl = static_cast(label[first_batch + i]); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx) { - if (lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { - // IgnoreIndex is false - if (lbl >= 0 && lbl < element_count) { - if (lbl == loss_idx) { - loss[first_batch + i] = -logsoftmax; - } - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { // softmax - softmax[(first_batch + i) * stride + idx] = - srcdata[i][it][0] / sum[i]; - } - } else { - break; - } - } else { // KVSize>1 - VecT* softmax_v = - reinterpret_cast(&softmax[(first_batch + i) * stride]); - VecT tmpdata; - T* tmpptr = reinterpret_cast(&tmpdata); -#pragma unroll - for (int s = 0; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax) { // log softmax - tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; - // softmax with cross entropy hard label - } else if (mode == 
SoftmaxMode::kCrossEntropy) { - AccT logsoftmax = srcdata[i][it][s] - max_value[i] - sum[i]; - // softmax - tmpptr[s] = std::exp(logsoftmax); - // label - int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; - auto lbl = static_cast(label[first_batch + i]); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx && lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } - } else { - // IgnoreIndex is false - if (lbl >= 0 && lbl < element_count) { - if (lbl == loss_idx) { - loss[first_batch + i] = -logsoftmax; - } - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { // softmax - tmpptr[s] = srcdata[i][it][s] / sum[i]; - } - } - if (idx < idx_max_v[i]) { - softmax_v[idx] = tmpdata; - } else { - break; - } - } - } - } -} - -#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ - case Log2Elements: \ - WarpSoftmaxForward<<>>( \ - loss, softmax, src, label, batch_size, stride, element_count, \ - ignore_index); \ - break; - -/* - Wrapper of softmax with cross entropy forward hard label. -*/ -template -void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, - const LabelT* label, const int batch_size, - const int stride, const int element_count, - const int ignore_index, gpuStream_t stream) { - using AccT = typename details::MPTypeTrait::Type; - - // use 128 threads per block to maximimize gpu utilization - const int log2_elements = static_cast(Log2Ceil(element_count)); - const int kDimCeil = 1 << log2_elements; - int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; - int batches_per_warp = (kDimCeil <= 128) ? 2 : 1; - constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); - int batches_per_block = warps_per_block * batches_per_warp; - int blocks = (batch_size + batches_per_block - 1) / batches_per_block; - dim3 threads(kWarpSize, warps_per_block, 1); - - switch (log2_elements) { - SOFTMAX_WARP_FORWARD_CASE(0, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(1, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(2, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(3, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(4, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(5, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(6, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(7, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(8, LabelT, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(9, LabelT, T, AccT); - default: - break; - } -} - -template -__device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, - const int label_id, - const int64_t label_value, - const int tid, const int vec_size, - const int offset, - const int ignore_index) { - int loss_id = vec_size * tid + offset; - if (IgnoreIndex) { - if (label_value == loss_id) { - if (label_value == ignore_index) { - loss[label_id] = static_cast(0.0f); - } else { - loss[label_id] = loss_value; + loss[label_id] = loss_value; } } } else { @@ -457,51 +318,19 @@ __device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, } } -template -__device__ __forceinline__ AccT ThreadReduce(const T* input, int size, - const int offset, AccT init, - ReduceFunctor reducer) { - using VecT = kps::details::VectorType; - int tid = threadIdx.x; - AccT val = init; - - if (offset > 0) { - input -= offset; - size += offset; - if (tid >= offset) { - val = reducer(val, input[tid]); - } - size -= blockDim.x; - input += blockDim.x; - } - int remain = size % (VecSize * blockDim.x); - - T ins[VecSize]; - VecT* ins_vec = reinterpret_cast(&ins); - - // vector part - for (; VecSize 
* tid < (size - remain); tid += blockDim.x) { - *ins_vec = reinterpret_cast(input)[tid]; - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - val = reducer(val, ins[i]); - } - } - - // scalar part - tid = size - remain + threadIdx.x; - for (; tid < size; tid += blockDim.x) { - val = reducer(val, input[tid]); - } - return val; -} - -template __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( - T* loss, T* softmax, const T* logits, const LabelT* label, int size, - const int offset, const phi::LogSoftmaxForwardFunctor& func, + T* loss, + T* softmax, + const T* logits, + const LabelT* label, + int size, + const int offset, + const phi::LogSoftmaxForwardFunctor& func, const int ignore_index) { using VecT = kps::details::VectorType; int tid = threadIdx.x; @@ -520,9 +349,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( softmax[tid] = static_cast(std::exp(log_softmax)); // loss if (label_valid) { - ComputeLoss(loss, static_cast(-log_softmax), - label_id, label_value, tid, 1, - loss_id_offset, ignore_index); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } } size -= blockDim.x; @@ -550,9 +384,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( // loss if (label_valid) { - ComputeLoss(loss, static_cast(-log_softmax), - label_id, label_value, tid, VecSize, - loss_id_offset + i, ignore_index); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + loss_id_offset + i, + ignore_index); } } @@ -568,8 +407,13 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( // loss if (label_valid) { - ComputeLoss(loss, static_cast(-log_softmax), label_id, - label_value, tid, 1, loss_id_offset, + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, ignore_index); } } @@ -580,11 +424,19 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( } } -template __device__ __forceinline__ void ScalarSoftmaxForwardImpl( - T* loss, T* softmax, const T* logits, const LabelT* label, const int size, - const phi::LogSoftmaxForwardFunctor& func, const int ignore_index) { + T* loss, + T* softmax, + const T* logits, + const LabelT* label, + const int size, + const phi::LogSoftmaxForwardFunctor& func, + const int ignore_index) { int tid = threadIdx.x; int remain = size % (VecSize * blockDim.x); int label_id = blockIdx.x; @@ -605,8 +457,13 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); // loss if (label_valid) { - ComputeLoss(loss, static_cast(-log_softmax), - label_id, label_value, tid, VecSize, i, + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + i, ignore_index); } } @@ -618,8 +475,14 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( softmax[tid] = static_cast(std::exp(log_softmax)); // loss if (label_valid) { - ComputeLoss(loss, static_cast(-log_softmax), label_id, - label_value, tid, 1, 0, ignore_index); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + 0, + ignore_index); } } @@ -629,11 +492,17 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( } } -template -__global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, +__global__ void VectorizedSoftmaxForward(T* loss, + T* softmax, + const T* logits, const LabelT* label, - const int high_dim, const int mid_dim, + const int high_dim, + const 
int mid_dim, const int ignore_index) { using VecT = kps::details::VectorType; @@ -646,14 +515,20 @@ __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, // 1. reduce max AccT max = ThreadReduce>( - logits, mid_dim, input_offset, -std::numeric_limits::infinity(), + logits, + mid_dim, + input_offset, + -std::numeric_limits::infinity(), kps::MaxFunctor()); max = kps::details::BlockXReduce>( max, kps::MaxFunctor()); // 2. reduce sum AccT sum = ThreadReduce>( - logits, mid_dim, input_offset, static_cast(0), + logits, + mid_dim, + input_offset, + static_cast(0), ExpAddFunctor(max)); sum = kps::details::BlockXReduce>( sum, kps::AddFunctor()); @@ -662,7 +537,13 @@ __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, phi::LogSoftmaxForwardFunctor func(max, sum); if (input_offset == output_offset) { VectorizedSoftmaxForwardImpl( - loss, softmax, logits, label, mid_dim, input_offset, func, + loss, + softmax, + logits, + label, + mid_dim, + input_offset, + func, ignore_index); } else { ScalarSoftmaxForwardImpl( @@ -670,229 +551,26 @@ __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, } } -template -void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, - const LabelT* label, const int high_dim, - const int mid_dim, const int ignore_index, - gpuStream_t stream) { - using AccT = typename details::MPTypeTrait::Type; - constexpr int vec_size = sizeof(float4) / sizeof(T); - const int max_num_threads = 1024; - int max_block_size = std::min(mid_dim / vec_size, max_num_threads); - if (vec_size > 1) { - max_block_size /= 2; - } - - int block_size = 1; - while (block_size < max_block_size) { - block_size *= 2; - } - block_size = std::max(block_size, kps::details::kWarpSize); - dim3 grids(high_dim); - dim3 blocks(block_size); - VectorizedSoftmaxForward<<>>( - loss, softmax, logits, label, high_dim, mid_dim, ignore_index); -} - /* - Wrapper of softmax with cross entropy hard label. - - SwitchWarpSoftmaxForward for small size when axis == -1 - - LaunchVectorizedSoftmaxForward for large size when axis == -1 - - cudnn function for axis != -1 +Core function of softmax with cross entropy forward soft label. +The computation includes + - Compute maximum of batch: maxvalue_{i} = max_j src_{i,j} + - Compute sum of exp batch: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } + - Compute: sum of - sum_{j}{ label_{i,j} * (src_{i,j} - maxvalue_{i} - +log(sum[i]))} +One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). +For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle +api to compute max (sum) in one warp. 
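+  Per-row result: loss_{i} = - sum_{j}{ label_{i,j} * (src_{i,j} - maxvalue_{i} - log(s_{i})) },
+  i.e. the soft-label cross entropy evaluated on the log-softmax.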
*/ -template -static void SoftmaxWithCrossEntropyHardLabel( - const platform::CUDADeviceContext& ctx, int rank, int axis, - const T* logits_data, const LabelT* labels_data, T* loss_data, - T* softmax_data, int N, int dim, int D, const int ignore_index) { - auto stream = ctx.stream(); - constexpr int max_dim = 320; - if (D == 1) { - if (dim <= max_dim) { // small size - const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, dim, - ignore_index, stream); - } else { // large size - LaunchVectorizedSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, - ignore_index, stream); - } - } else { - ScopedTensorDescriptor desc; - std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); -#else - cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); -#endif - - auto handle = ctx.cudnn_handle(); - -#ifdef PADDLE_WITH_HIP - auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE - : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), descp, logits_data, - platform::CudnnDataType::kZero(), descp, softmax_data, - MIOPEN_SOFTMAX_LOG, mode)); -#else - auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE - : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - descp, logits_data, platform::CudnnDataType::kZero(), descp, - softmax_data)); -#endif - int threads = 128; - int blocks = (N * dim * D + threads - 1) / threads; - // compute cross entropy, input is log softmax - CrossEntropyExpHardLabel<<>>( - loss_data, softmax_data, labels_data, N, dim, D, ignore_index); - } -} - -/* - Wrapper of softmax with cross entropy grad hard label. -*/ -template -__global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, - const int64_t n, const int64_t dim, const int64_t d, - const int ignore_index) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = idx / (d * dim); - int64_t idx_dim = (idx / d) % dim; - int64_t idx_d = idx % d; - int64_t ids = idx_n * d + idx_d; - - if (idx < n * dim * d) { - auto lbl = static_cast(labels[ids]); - if (lbl == ignore_index) { - logits_grad[idx] = static_cast(0.0); - } else if (lbl == idx_dim) { - logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; - } else { - logits_grad[idx] = softmax[idx] * loss_grad[ids]; - } - } -} - -/* - Cross entropy soft label with dynamic size on axis (log2_elements is - varibale). - - if the input is softmax,compute loss with softmax - - if the input is log_softmax, compute loss with log_softmax and update - softmax -*/ -template -__global__ void CrossEntropySoftLabel(T* loss, T* softmaxwrt, const T* softmax, - const T* labels, const int n, - const int dim, const int d, - int log2_elements) { - const int kDimCeil = 1 << log2_elements; - const int kVSize = sizeof(VecT) / sizeof(T); - -#ifdef __HIPCC__ - const int kThreadPerBlock = 256; -#else - const int kThreadPerBlock = 512; -#endif - const int kBatchPerBlock = 1; - const int kWarpSize = 32; // (dim < 32) ? 
dim : 32; - const int kBatchSize = 1; - const int kThreadPerBatch = kThreadPerBlock / kBatchPerBlock; - const int kWarpPerBatch = kThreadPerBatch / kWarpSize; - - const int kIterations = (dim + kThreadPerBatch - 1) / kThreadPerBatch; - const int kIterationsV = (kIterations >= kVSize) ? (kIterations / kVSize) : 1; - - const int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; - - T sum[kBatchSize]{static_cast(0.0)}; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - int ids = first_batch + i; - if (ids >= n * d) break; - int idx_n = ids / d; - int idx_d = ids % d; -#pragma unroll - for (int it = 0; it < kIterations; ++it) { - int idx_dim = it * kThreadPerBatch + threadIdx.x; - int idx = idx_n * dim * d + idx_dim * d + idx_d; - - if (idx_n < n && idx_dim < dim) { - VecT softmaxdata; - if (InLogMode) { - softmaxdata = reinterpret_cast(&softmaxwrt[idx])[0]; - } else { - softmaxdata = reinterpret_cast(&softmax[idx])[0]; - } - VecT labelsdata = reinterpret_cast(&labels[idx])[0]; - T* softmaxptr = reinterpret_cast(&softmaxdata); - T* labelsptr = reinterpret_cast(&labelsdata); -#pragma unroll - for (int s = 0; s < kVSize; s++) { - if (InLogMode) { - sum[i] -= softmaxptr[s] * labelsptr[s]; - softmaxptr[s] = Exp(softmaxptr[s]); - } else { - sum[i] -= Log(softmaxptr[s]) * labelsptr[s]; - } - } - if (InLogMode) { - reinterpret_cast(&softmaxwrt[idx])[0] = softmaxdata; - } - } - } - } - phi::WarpReduceSum(sum); - __syncthreads(); - - __shared__ T sumshare[kWarpPerBatch][kBatchPerBlock][kBatchSize]; - if (threadIdx.x % kWarpSize == 0) { -#pragma unroll - for (int i = 0; i < kBatchSize; i++) { - sumshare[threadIdx.x / kWarpSize][threadIdx.y][i] = sum[i]; - } - } - __syncthreads(); - - // write - if (threadIdx.x == 0) { - for (int i = 0; i < kBatchSize; i++) { - int ids = first_batch + i; - if (ids < n * d) { - loss[ids] = sumshare[0][threadIdx.y][i]; - for (int s = 1; s < kWarpPerBatch; s++) { - loss[ids] += sumshare[s][threadIdx.y][i]; - } - } - } - } -} - -/* -Core function of softmax with cross entropy forward soft label. -The computation includes - - Compute maximum of batch: maxvalue_{i} = max_j src_{i,j} - - Compute sum of exp batch: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } - - Compute: sum of - sum_{j}{ label_{i,j} * (src_{i,j} - maxvalue_{i} - -log(sum[i]))} -One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). -For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle -api to compute max (sum) in one warp. -*/ -template -__global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, - const T* label, - const int batch_size, - const int stride, - const int element_count) { - const bool LogMode = true; +template +__global__ void WarpSoftmaxForwardSoftLabel(T* loss, + T* softmax, + const T* src, + const T* label, + const int batch_size, + const int stride, + const int element_count) { + const bool LogMode = true; constexpr int kDimCeil = 1 << Log2Elements; constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; @@ -1030,7 +708,9 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, #define SOFTMAX_WARP_FORWARD_SOFT_CASE(Log2Elements, VecT, AccT) \ case Log2Elements: \ - WarpSoftmaxForwardSoftLabel<<>>( \ loss, softmax, src, label, batch_size, stride, element_count); \ break; @@ -1039,13 +719,18 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, Wrapper of softmax with cross entropy forward soft label. 
*/ template -void SwitchWarpSoftmaxForwardSoftLabel(const int blocks, const dim3 threads, - gpuStream_t stream, T* loss, T* softmax, - const T* src, const T* label, - const int batch_size, const int stride, +void SwitchWarpSoftmaxForwardSoftLabel(const int blocks, + const dim3 threads, + gpuStream_t stream, + T* loss, + T* softmax, + const T* src, + const T* label, + const int batch_size, + const int stride, const int element_count, const int log2_elements) { - using AccT = typename details::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; switch (log2_elements) { SOFTMAX_WARP_FORWARD_SOFT_CASE(0, T, AccT); SOFTMAX_WARP_FORWARD_SOFT_CASE(1, T, AccT); @@ -1063,10 +748,16 @@ void SwitchWarpSoftmaxForwardSoftLabel(const int blocks, const dim3 threads, } template -static void SoftmaxWithCrossEntropySoftLabel( - const platform::CUDADeviceContext& ctx, const int rank, const int axis, - const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, - int N, int dim, int D) { +static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, + const int rank, + const int axis, + const T* logits_data, + const T* labels_data, + T* softmax_data, + T* loss_data, + int N, + int dim, + int D) { #ifdef __HIPCC__ constexpr int kMaxBlockDim = 256; #else @@ -1081,7 +772,7 @@ static void SoftmaxWithCrossEntropySoftLabel( const int kDimLog2 = static_cast(Log2Ceil(dim)); const int kDimCeil = 1 << kDimLog2; - auto stream = ctx.stream(); + auto stream = dev_ctx.stream(); if (D == 1 && dim <= max_dim) { int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; @@ -1094,35 +785,55 @@ static void SoftmaxWithCrossEntropySoftLabel( int blocks = (N + batches_per_block - 1) / batches_per_block; dim3 threads(kWarpSize, warps_per_block, 1); - SwitchWarpSoftmaxForwardSoftLabel(blocks, threads, stream, loss_data, - softmax_data, logits_data, labels_data, - N, dim, dim, kDimLog2); + SwitchWarpSoftmaxForwardSoftLabel(blocks, + threads, + stream, + loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + dim, + kDimLog2); } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; + GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #else cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); #endif - auto handle = ctx.cudnn_handle(); + auto handle = dev_ctx.cudnn_handle(); #ifdef PADDLE_WITH_HIP auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), descp, logits_data, - platform::CudnnDataType::kZero(), descp, softmax_data, - MIOPEN_SOFTMAX_LOG, mode)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + descp, + logits_data, + paddle::platform::CudnnDataType::kZero(), + descp, + softmax_data, + MIOPEN_SOFTMAX_LOG, + mode)); #else auto mode = axis == rank - 1 ? 
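+    // With the {N, dim, D, 1} NCHW descriptor, INSTANCE mode reduces over the whole dim x D slice (the last-axis case, where D == 1), while CHANNEL mode applies the softmax over dim independently for each of the D inner positions.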
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - descp, logits_data, platform::CudnnDataType::kZero(), descp, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( + handle, + CUDNN_SOFTMAX_LOG, + mode, + paddle::platform::CudnnDataType::kOne(), + descp, + logits_data, + paddle::platform::CudnnDataType::kZero(), + descp, softmax_data)); #endif @@ -1143,351 +854,712 @@ static void SoftmaxWithCrossEntropySoftLabel( } } -template -__global__ void SoftCrossEntropyGradientKernel(T* logit_grad, - const T* loss_grad, - const T* labels, const int64_t n, - const int64_t d, - const int64_t remain) { - int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; - if (ids < n * d) { - int64_t idx_n = ids / d; - int64_t idx_remain = ids % remain; - int64_t idx_loss = idx_n * remain + idx_remain; - logit_grad[ids] = loss_grad[idx_loss] * (logit_grad[ids] - labels[ids]); - } -} +/* + Core function of softmax with cross entropy forward + - softmax, SoftmaxMode=kSoftmax + - log softmax, SoftmaxMode=kLogSoftmax + - softmax with cross entropy hard label, SoftmaxMode=kCrossEntropy + The computation includes + - Compute max value: maxvalue_{i} = max_j src_{i,j} + - Compute sum of exp: s_{i} = sum_{j}{e^{src_{i,j} - maxvalue_{i}}} + - Compute: softmax_{i,j} = e^{src_{i,j} - maxvalue_{i}} / s_{i} + - Compute: logsoftmax_{i,j} = src_{i,j} - maxvalue_{i} - log(s_{i}) + - Compute: loss_{i} = -logsoftmax[i,label[i]] (Hard label) + This computation results from following formula: + softmax_{i,j} = e^{src_{i,j}} / sum_{j}{e^{src_{i,j}}} + = e^{src_{i,j} - maxvalue_{i}} + / sum_{j}{e^{src_{i,j} - maxvalue_{i}}} + = e^{src_{i,j} - maxvalue_{i}} / s_{i} + logsoftmax_{i,j} = log(softmax_{i,j}) + = src_{i,j} - maxvalue_{i} - log(s_{i}) + One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). + For reduction max (sum), firstly compute max (sum) to one warp, then use + shuffle api to compute max (sum) in one warp. +*/ +template +__global__ void WarpSoftmaxForward(T* loss, + T* softmax, + const T* src, + const LabelT* label, + const int batch_size, + const int stride, + const int element_count, + const int ignore_index) { + constexpr int kDimCeil = 1 << Log2Elements; + constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; + constexpr int kVSize = sizeof(VecT) / sizeof(T); + constexpr int kIterations = kDimCeil / kWarpSize; + constexpr int kIterationsV = + (kIterations >= kVSize) ? (kIterations / kVSize) : 1; + constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; -template -__global__ void SoftLabelCrossEntropyGradientKernel(T* logit_grad, - const T* loss_grad, - const T* labels, - const int n, const int d, - const int remain) { - int ids = blockIdx.x * blockDim.x + threadIdx.x; - if (ids < n * d) { - int idx_n = ids / d; - int idx_remain = ids % remain; - int idx_loss = idx_n * remain + idx_remain; - logit_grad[ids] = loss_grad[idx_loss] * (-labels[ids] / logit_grad[ids]); + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; + + // max index to read + int idx_max_v[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; i++) { + int idx_max = ((i + first_batch) < batch_size) ? 
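+    // idx_max_v[i] is the per-row read bound in VecT units; rows beyond batch_size get a bound of 0 and are skipped.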
element_count : 0; + idx_max_v[i] = idx_max / kVSize; } -} -template -__global__ void HardLabelCrossEntropyGradientKernel(T* logit_grad, - const LabelT* labels, - const int n, const int d, - const int remain, - const int ignore_index) { - CUDA_KERNEL_LOOP(index, n * remain) { - int idx_n = index / remain; - int idx_remain = index % remain; - int tmp = static_cast(labels[index]); - int idx = idx_n * d + tmp * remain + idx_remain; - if (ignore_index != tmp) { - logit_grad[idx] = -static_cast(1.) / logit_grad[idx]; + // read data from global memory + AccT srcdata[kBatchSize][kIterationsV][kVSize]; + +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { +// read data to srcdata: - KVSize==1, - KVSize>1 +#pragma unroll + for (int it = 0; it < kIterationsV; ++it) { + int src_idx = threadIdx.x + it * kWarpSize; + if (kVSize == 1) { + if (src_idx < idx_max_v[i]) { + srcdata[i][it][0] = + static_cast(src[(first_batch + i) * stride + src_idx]); + } else { + srcdata[i][it][0] = -std::numeric_limits::infinity(); + } + } else { + const VecT* src_v = + reinterpret_cast(&src[(first_batch + i) * stride]); + if (src_idx < idx_max_v[i]) { + VecT srctmp = src_v[src_idx]; + const T* srcinptr = reinterpret_cast(&srctmp); +#pragma unroll + for (int s = 0; s < kVSize; s++) { + srcdata[i][it][s] = static_cast(srcinptr[s]); + } + } else { +#pragma unroll + for (int s = 0; s < kVSize; s++) { + srcdata[i][it][s] = -std::numeric_limits::infinity(); + } + } + } } } -} -template -__global__ void ScaleCrossEntropyGradient(T* logit_grad, const T* loss_grad, - const int num, const int d, - const int remain, - const LabelT* labels, - const int ignore_index) { - CUDA_KERNEL_LOOP(index, num) { - int idx_n = index / d; - int idx_remain = index % remain; - int idx_lbl = idx_n * remain + idx_remain; - int k = (index % d) / remain; - auto lbl = static_cast(labels[idx_lbl]); - if (lbl == ignore_index || lbl != k) { - logit_grad[index] = static_cast(0.); - } else { - logit_grad[index] *= loss_grad[idx_lbl]; + // compute max value: maxvalue_{i} = max_j src_{i,j} + AccT max_value[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + // it = 0 + AccT valmax = srcdata[i][0][0]; +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; } - } -} + max_value[i] = valmax; -template -class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - RunSoftmaxWithCrossEntropyFunctor(context, *this); +// it = 1, 2, ... +#pragma unroll + for (int it = 1; it < kIterationsV; ++it) { + AccT valmax = srcdata[i][it][0]; +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; + } + max_value[i] = (max_value[i] > valmax) ? 
max_value[i] : valmax; + } } + phi::WarpReduceMax(max_value); - template - static void Apply(const framework::ExecutionContext& context, - const framework::Tensor& labels, const bool soft_label) { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::Unavailable("softmax_with_cross_entropy operator's " - "CUDA kernel only runs on GPU device.")); - const bool use_softmax = context.Attr("use_softmax"); - - // do not with softmax op, and input is softmax - if (!use_softmax) { - const Tensor* softmax = context.Input("Logits"); - Tensor* softmax_out = context.Output("Softmax"); - Tensor* loss = context.Output("Loss"); - - const int rank = softmax->dims().size(); - const int axis = - phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - const int axis_dim = softmax->dims()[axis]; - - const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); - const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); - - auto* softmax_out_data = - softmax_out->template mutable_data(context.GetPlace()); - auto* loss_data = loss->template mutable_data(context.GetPlace()); - - phi::funcs::SetConstant set_constant; - set_constant(context.cuda_device_context(), loss, static_cast(0)); - if (axis_dim == 1) { - set_constant(context.cuda_device_context(), softmax_out, - static_cast(1)); - return; + // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } + AccT sum[kBatchSize]; +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + // it = 0 + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); + } else { + srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); + sum[i] = srcdata[i][0][0]; + } +#pragma unroll + for (int s = 1; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); + } else { + srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); + sum[i] += srcdata[i][0][s]; } + } - auto ignore_index = context.Attr("ignore_index"); - - Tensor softmax_2d, labels_2d, loss_2d, softmax_out_2d; - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - labels_2d.ShareDataWith(labels).Resize({n, labels.numel() / n}); - loss_2d.ShareDataWith(*loss).Resize({n, 1}); - softmax_out_2d.ShareDataWith(*softmax_out).Resize({n, d}); - - // math::CrossEntropyFunctor support axis is the last - if (axis == -1) { - math::CrossEntropyFunctor()( - context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d, - soft_label, ignore_index, axis_dim); - return; +// it = 1, 2, ... 
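+    // Accumulate exp(src - max) over the remaining vector iterations; in kSoftmax mode the exponentials are also kept in srcdata for the final normalisation.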
+#pragma unroll + for (int it = 1; it < kIterationsV; ++it) { +#pragma unroll + for (int s = 0; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); + } else { + srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); + sum[i] += srcdata[i][it][s]; + } } + } + } + phi::WarpReduceSum(sum); - // if axis is not the last, we need a new impliment - if (soft_label) { - auto* logits_data = softmax->template data(); - auto* labels_data = labels.template data(); +// write data +#pragma unroll + for (int i = 0; i < kBatchSize; ++i) { + if (mode == SoftmaxMode::kLogSoftmax || + mode == SoftmaxMode::kCrossEntropy) { + sum[i] = std::log(sum[i]); + } - const int kDimLog2 = static_cast(Log2Ceil(axis_dim)); - const int kDimCeil = 1 << kDimLog2; -#ifdef __HIPCC__ - int kThreadPerBlock = 256; -#else - int kThreadPerBlock = 512; -#endif - int kBatchPerBlock = 1; - int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock; - dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1); - - CrossEntropySoftLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, NULL, logits_data, labels_data, n, axis_dim, - d / axis_dim, kDimLog2); - } else { // HardLabel - auto* logits_data = softmax->template data(); - auto* labels_data = labels.template data(); - int threads = 128; - int blocks = (n * d / axis_dim + threads - 1) / threads; - if (ignore_index >= 0 && ignore_index < axis_dim) { - CrossEntropyHardLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, - ignore_index); +#pragma unroll + for (int it = 0; it < kIterationsV; ++it) { + int idx = threadIdx.x + it * kWarpSize; + if (kVSize == 1) { // kVSize==1 + if (idx < idx_max_v[i]) { + if (mode == SoftmaxMode::kLogSoftmax) { // log softmax + softmax[(first_batch + i) * stride + idx] = + srcdata[i][it][0] - max_value[i] - sum[i]; + // softmax with cross entropy hard label + } else if (mode == SoftmaxMode::kCrossEntropy) { + AccT logsoftmax = srcdata[i][it][0] - max_value[i] - sum[i]; + // softmax + softmax[(first_batch + i) * stride + idx] = std::exp(logsoftmax); + // label + int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; + auto lbl = static_cast(label[first_batch + i]); + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (lbl == loss_idx) { + if (lbl != ignore_index) { + loss[first_batch + i] = -logsoftmax; + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { + // IgnoreIndex is false + if (lbl >= 0 && lbl < element_count) { + if (lbl == loss_idx) { + loss[first_batch + i] = -logsoftmax; + } + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { // softmax + softmax[(first_batch + i) * stride + idx] = + srcdata[i][it][0] / sum[i]; + } + } else { + break; + } + } else { // KVSize>1 + VecT* softmax_v = + reinterpret_cast(&softmax[(first_batch + i) * stride]); + VecT tmpdata; + T* tmpptr = reinterpret_cast(&tmpdata); +#pragma unroll + for (int s = 0; s < kVSize; ++s) { + if (mode == SoftmaxMode::kLogSoftmax) { // log softmax + tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; + // softmax with cross entropy hard label + } else if (mode == SoftmaxMode::kCrossEntropy) { + AccT logsoftmax = srcdata[i][it][s] - max_value[i] - sum[i]; + // softmax + tmpptr[s] = std::exp(logsoftmax); + // label + int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; + auto lbl = 
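+            // Hard-label loss: only the thread holding the labelled class writes -logsoftmax; an out-of-range label yields a loss of 0, and with IgnoreIndex the ignore_index label does as well.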
static_cast(label[first_batch + i]); + if (IgnoreIndex == true) { + // IgnoreIndex is true + if (lbl == loss_idx && lbl != ignore_index) { + loss[first_batch + i] = -logsoftmax; + } + } else { + // IgnoreIndex is false + if (lbl >= 0 && lbl < element_count) { + if (lbl == loss_idx) { + loss[first_batch + i] = -logsoftmax; + } + } else { + loss[first_batch + i] = static_cast(0.0); + } + } + } else { // softmax + tmpptr[s] = srcdata[i][it][s] / sum[i]; + } + } + if (idx < idx_max_v[i]) { + softmax_v[idx] = tmpdata; } else { - CrossEntropyHardLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, - ignore_index); + break; } } - - // cause of input is softmax - // copy to output softmax, directly - framework::TensorCopy(*softmax, context.GetPlace(), - context.device_context(), softmax_out); - - return; } + } +} - const Tensor* logits = context.Input("Logits"); - Tensor* softmax = context.Output("Softmax"); - Tensor* loss = context.Output("Loss"); +#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ + case Log2Elements: \ + WarpSoftmaxForward<<>>( \ + loss, \ + softmax, \ + src, \ + label, \ + batch_size, \ + stride, \ + element_count, \ + ignore_index); \ + break; - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = logits->dims()[axis]; +/* + Wrapper of softmax with cross entropy forward hard label. +*/ +template +void SwitchWarpSoftmaxForward(T* loss, + T* softmax, + const T* src, + const LabelT* label, + const int batch_size, + const int stride, + const int element_count, + const int ignore_index, + gpuStream_t stream) { + using AccT = typename dtype::MPTypeTrait::Type; - const int64_t n = phi::funcs::SizeToAxis(axis, logits->dims()); - const int64_t d = phi::funcs::SizeFromAxis(axis, logits->dims()); + // use 128 threads per block to maximimize gpu utilization + const int log2_elements = static_cast(Log2Ceil(element_count)); + const int kDimCeil = 1 << log2_elements; + int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; + int batches_per_warp = (kDimCeil <= 128) ? 
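+  // Launch shape: 128 threads per block, one warp of kWarpSize (<= 32) lanes per row, and two rows per warp when the padded dim fits in 128 elements. For example, element_count = 100 gives kDimCeil = 128, kWarpSize = 32, 4 warps per block and 8 rows per block.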
2 : 1; + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / kWarpSize); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_size + batches_per_block - 1) / batches_per_block; + dim3 threads(kWarpSize, warps_per_block, 1); - auto* softmax_data = softmax->template mutable_data(context.GetPlace()); - auto* loss_data = loss->template mutable_data(context.GetPlace()); + switch (log2_elements) { + SOFTMAX_WARP_FORWARD_CASE(0, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(1, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(2, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(3, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(4, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(5, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(6, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(7, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(8, LabelT, T, AccT); + SOFTMAX_WARP_FORWARD_CASE(9, LabelT, T, AccT); + default: + break; + } +} - if (axis_dim == 1) { - phi::funcs::SetConstant set_constant; - set_constant(context.cuda_device_context(), softmax, static_cast(1)); - set_constant(context.cuda_device_context(), loss, static_cast(0)); - return; - } +template +void LaunchVectorizedSoftmaxForward(T* loss, + T* softmax, + const T* logits, + const LabelT* label, + const int high_dim, + const int mid_dim, + const int ignore_index, + gpuStream_t stream) { + using AccT = typename dtype::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(mid_dim / vec_size, max_num_threads); + if (vec_size > 1) { + max_block_size /= 2; + } - auto ignore_index = context.Attr("ignore_index"); + int block_size = 1; + while (block_size < max_block_size) { + block_size *= 2; + } + block_size = std::max(block_size, kps::details::kWarpSize); + dim3 grids(high_dim); + dim3 blocks(block_size); + VectorizedSoftmaxForward<<>>( + loss, softmax, logits, label, high_dim, mid_dim, ignore_index); +} - if (soft_label) { - auto* logits_data = logits->template data(); - auto* labels_data = labels.template data(); - SoftmaxWithCrossEntropySoftLabel( - context.cuda_device_context(), rank, axis, logits_data, labels_data, - softmax_data, loss_data, n, axis_dim, d / axis_dim); - } else { - if (!context.Attr("numeric_stable_mode")) { - // CUDNN kernel only suppoer 2-D tensor and perfome softmax on last dim - Tensor logits_2d, softmax_2d, labels_2d, loss_2d; - logits_2d.ShareDataWith(*logits).Resize({n, d}); - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - labels_2d.ShareDataWith(labels).Resize({n, labels.numel() / n}); - loss_2d.ShareDataWith(*loss).Resize({n, 1}); - math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), - &logits_2d, &softmax_2d); - math::CrossEntropyFunctor()( - context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d, - false, ignore_index, axis_dim); - } else { - auto* logits_data = logits->template data(); - auto* labels_data = labels.template data(); - if (ignore_index >= 0 && ignore_index < axis_dim) { - SoftmaxWithCrossEntropyHardLabel( - context.cuda_device_context(), rank, axis, logits_data, - labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, - ignore_index); - } else { - SoftmaxWithCrossEntropyHardLabel( - context.cuda_device_context(), rank, axis, logits_data, - labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, - ignore_index); - } - } +/* + Wrapper of softmax with cross entropy hard label. 
+ - SwitchWarpSoftmaxForward for small size when axis == -1 + - LaunchVectorizedSoftmaxForward for large size when axis == -1 + - cudnn function for axis != -1 +*/ +template +static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, + int rank, + int axis, + const T* logits_data, + const LabelT* labels_data, + T* loss_data, + T* softmax_data, + int N, + int dim, + int D, + const int ignore_index) { + auto stream = dev_ctx.stream(); + constexpr int max_dim = 320; + if (D == 1) { + if (dim <= max_dim) { // small size + const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; + SwitchWarpSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + dim, + ignore_index, + stream); + } else { // large size + LaunchVectorizedSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + ignore_index, + stream); } - } -}; + } else { + ScopedTensorDescriptor desc; + std::vector tensor_dims = {N, dim, D, 1}; + GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#else + cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); +#endif -template -class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - RunSoftmaxWithCrossEntropyFunctor(context, *this); + auto handle = dev_ctx.cudnn_handle(); + +#ifdef PADDLE_WITH_HIP + auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE + : MIOPEN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + descp, + logits_data, + paddle::platform::CudnnDataType::kZero(), + descp, + softmax_data, + MIOPEN_SOFTMAX_LOG, + mode)); +#else + auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE + : CUDNN_SOFTMAX_MODE_CHANNEL; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( + handle, + CUDNN_SOFTMAX_LOG, + mode, + paddle::platform::CudnnDataType::kOne(), + descp, + logits_data, + paddle::platform::CudnnDataType::kZero(), + descp, + softmax_data)); +#endif + int threads = 128; + int blocks = (N * dim * D + threads - 1) / threads; + // compute cross entropy, input is log softmax + CrossEntropyExpHardLabel<<>>( + loss_data, softmax_data, labels_data, N, dim, D, ignore_index); } +} - template - static void Apply(const framework::ExecutionContext& context, - const framework::Tensor& labels, const bool soft_label) { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::Unavailable("softmax_with_cross_entropy operator's " - "CUDA kernel only runs on GPU device.")); - const T* loss_grad_data = - context.Input(framework::GradVarName("Loss")) - ->template data(); - Tensor* logit_grad = - context.Output(framework::GradVarName("Logits")); - const Tensor* softmax = context.Input("Softmax"); - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = context.Attr("use_softmax"); - - T* logit_grad_data = nullptr; - bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); - if (copy_flag) { - framework::TensorCopy(*softmax, context.GetPlace(), - context.device_context(), logit_grad); - logit_grad_data = logit_grad->template data(); - } else { - logit_grad_data = - logit_grad->template mutable_data(context.GetPlace()); +template +void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* softmax, + DenseTensor* loss) { + PADDLE_ENFORCE_EQ( + dev_ctx.GetPlace().GetType(), + AllocationType::GPU, + phi::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); + + // do not with softmax op, and input is softmax + if (!use_softmax) { + DenseTensor* softmax_out = softmax; + const DenseTensor* softmax = &logits; + const DenseTensor& labels = label; + + const int rank = softmax->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + const int axis_dim = softmax->dims()[axis_v]; + + const int n = phi::funcs::SizeToAxis(axis_v, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis_v, softmax->dims()); + + auto* softmax_out_data = dev_ctx.template Alloc(softmax_out); + auto* loss_data = dev_ctx.template Alloc(loss); + + phi::funcs::SetConstant set_constant; + set_constant(dev_ctx, loss, static_cast(0)); + if (axis_dim == 1) { + set_constant(dev_ctx, softmax_out, static_cast(1)); + return; } - const int rank = logit_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = logit_grad->dims()[axis]; + DenseTensor softmax_2d(*softmax); + softmax_2d.Resize({n, d}); + DenseTensor labels_2d(labels); + labels_2d.Resize({n, labels.numel() / n}); + DenseTensor loss_2d(*loss); + loss_2d.Resize({n, 1}); + DenseTensor softmax_out_2d(*softmax_out); + softmax_out_2d.Resize({n, d}); + + // math::CrossEntropyFunctor support axis is the last + if (axis_v == -1) { + paddle::operators::math::CrossEntropyFunctor()( + dev_ctx, + &loss_2d, + &softmax_2d, + &labels_2d, + soft_label, + ignore_index, + axis_dim); + return; + } - const int64_t n = 
phi::funcs::SizeToAxis(axis, logit_grad->dims()); - const int64_t d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); - const int64_t remain = d / axis_dim; + // if axis is not the last, we need a new impliment + if (soft_label) { + auto* logits_data = softmax->data(); + auto* labels_data = labels.data(); + const int kDimLog2 = static_cast(Log2Ceil(axis_dim)); + const int kDimCeil = 1 << kDimLog2; #ifdef __HIPCC__ - int block = 256; + int kThreadPerBlock = 256; #else - int block = 512; + int kThreadPerBlock = 512; #endif - - // do not with softmax op, and input is softmax - if (!use_softmax) { - if (soft_label) { - int grid = (n * d + block - 1) / block; - const T* label_data = labels.template data(); - SoftLabelCrossEntropyGradientKernel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d, remain); + int kBatchPerBlock = 1; + int blocks = (n * d + kBatchPerBlock - 1) / kBatchPerBlock; + dim3 threads(kThreadPerBlock / kBatchPerBlock, kBatchPerBlock, 1); + + CrossEntropySoftLabel<<>>( + loss_data, + NULL, + logits_data, + labels_data, + n, + axis_dim, + d / axis_dim, + kDimLog2); + } else { // HardLabel + auto* logits_data = softmax->data(); + auto* labels_data = labels.data(); + int threads = 128; + int blocks = (n * d / axis_dim + threads - 1) / threads; + if (ignore_index >= 0 && ignore_index < axis_dim) { + CrossEntropyHardLabel<<>>( + loss_data, + logits_data, + labels_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } else { - Tensor logits_grad_2d; - logits_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); - int grid = (n * remain + block - 1) / block; - const auto* label_data = labels.template data(); - HardLabelCrossEntropyGradientKernel<<>>( - logit_grad_data, label_data, n, d, remain, ignore_index); - int num = n * d; - grid = (num + block - 1) / block; - ScaleCrossEntropyGradient<<>>( - logit_grad_data, loss_grad_data, num, d, remain, label_data, + CrossEntropyHardLabel<<>>( + loss_data, + logits_data, + labels_data, + n, + axis_dim, + d / axis_dim, ignore_index); } - - return; } - // with softmax, continue + // cause of input is softmax + // copy to output softmax, directly + phi::Copy( + dev_ctx, *softmax, dev_ctx.GetPlace(), false, softmax_out); - if (soft_label) { - int64_t grid = (n * d + block - 1) / block; - const T* label_data = labels.template data(); - SoftCrossEntropyGradientKernel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d, remain); + return; + } + + const int rank = logits.dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logits.dims()[axis_v]; + + const int64_t n = phi::funcs::SizeToAxis(axis_v, logits.dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis_v, logits.dims()); + + auto* softmax_data = dev_ctx.template Alloc(softmax); + auto* loss_data = dev_ctx.template Alloc(loss); + + if (axis_dim == 1) { + phi::funcs::SetConstant set_constant; + set_constant(dev_ctx, softmax, static_cast(1)); + set_constant(dev_ctx, loss, static_cast(0)); + return; + } + + if (soft_label) { + auto* logits_data = logits.data(); + auto* labels_data = label.data(); + SoftmaxWithCrossEntropySoftLabel(dev_ctx, + rank, + axis_v, + logits_data, + labels_data, + softmax_data, + loss_data, + n, + axis_dim, + d / axis_dim); + } else { + if (!numeric_stable_mode) { + // CUDNN kernel only suppoer 2-D tensor and perfome softmax on last dim + DenseTensor logits_2d(logits); + logits_2d.Resize({n, d}); + DenseTensor softmax_2d(*softmax); + softmax_2d.Resize({n, d}); + DenseTensor labels_2d(label); + labels_2d.Resize({n, 
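+      // These 2-D tensors are intended as shallow views (the same effect ShareDataWith gave before): they reuse the original allocations and only reshape the metadata to {n, d} (and {n, 1} for the loss) for the cuDNN softmax + cross-entropy path.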
label.numel() / n}); + DenseTensor loss_2d(*loss); + loss_2d.Resize({n, 1}); + paddle::operators::math::SoftmaxCUDNNFunctor()( + dev_ctx, &logits_2d, &softmax_2d); + paddle::operators::math::CrossEntropyFunctor()( + dev_ctx, + &loss_2d, + &softmax_2d, + &labels_2d, + false, + ignore_index, + axis_dim); } else { - const T* softmax_data = softmax->template data(); - const auto* label_data = labels.template data(); - int grid = (n * d + block - 1) / block; - SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, softmax_data, label_data, n, - d / remain, remain, ignore_index); + auto* logits_data = logits.data(); + auto* labels_data = label.data(); + if (ignore_index >= 0 && ignore_index < axis_dim) { + SoftmaxWithCrossEntropyHardLabel(dev_ctx, + rank, + axis_v, + logits_data, + labels_data, + loss_data, + softmax_data, + n, + axis_dim, + d / axis_dim, + ignore_index); + } else { + SoftmaxWithCrossEntropyHardLabel(dev_ctx, + rank, + axis_v, + logits_data, + labels_data, + loss_data, + softmax_data, + n, + axis_dim, + d / axis_dim, + ignore_index); + } } } -}; +} + +template +void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + bool soft_label, + bool use_softmax, + bool numeric_stable_mode, + int ignore_index, + int axis, + DenseTensor* softmax, + DenseTensor* loss) { + auto dtype = label.dtype(); + if (soft_label) { + PADDLE_ENFORCE_EQ( + dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument("The Input(Label) should be with the " + "same data type as Input(Logits).")); + CrossEntropyWithSoftmaxCUDAKernel(dev_ctx, + logits, + label, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + softmax, + loss); + } else { + PD_DISPATCH_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxCUDAKernel", ([&] { + CrossEntropyWithSoftmaxCUDAKernel(dev_ctx, + logits, + label, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + softmax, + loss); + })); + } +} -} // namespace operators -} // namespace paddle +} // namespace phi -namespace ops = paddle::operators; #ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +PD_REGISTER_KERNEL(cross_entropy_with_softmax, + GPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxKernel, + float, + phi::dtype::float16) {} #else -REGISTER_OP_CUDA_KERNEL( - softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +PD_REGISTER_KERNEL(cross_entropy_with_softmax, + GPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxKernel, + float, + double, + phi::dtype::float16) {} #endif diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 852d209ee0185..79b71b95d9ee8 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -35,7 +35,7 @@ struct FullFuctor { template void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const 
Scalar& val, DataType dtype, DenseTensor* out) { diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index e2fe2190c1ce0..e159e5916cff2 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -58,7 +58,7 @@ struct GaussianGenerator { template void GaussianRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index 8b1ef964124d7..669ae11543950 100644 --- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -36,7 +36,7 @@ void LimitGridDim(const Context& ctx, dim3* grid_dim) { #define PREDEFINED_BLOCK_SIZE_X 512 #define PREDEFINED_BLOCK_SIZE 1024 #define MIN(a, b) ((a) < (b) ? (a) : (b)) -}; +} // namespace template __global__ void IndexSampleGrad(const IndexT* index, @@ -67,9 +67,9 @@ __global__ void IndexSampleGrad(const IndexT* index, template void IndexSampleGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& out_grad, DenseTensor* x_grad) { const T* output_grad_data = out_grad.data(); T* input_grad_data = ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu new file mode 100644 index 0000000000000..73334d9c38aa3 --- /dev/null +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -0,0 +1,1601 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( + int* in_img_idx, + int* x_id, + T* lambda1, + T* lambda2, + T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 
1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; +} + +template +__global__ void KeLinearInterpBw(T* in, + const size_t in_img_w, + const size_t input_w, + const T* out, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const T ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idx = tid % out_img_w; + } else { + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 + : ratio_w * out_img_idx; + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id + + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; + } + const T* out_pos = &out[out_id_w]; + + if (data_layout == DataLayout::kNCHW) { + paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + } else { + paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + w1lambda * out_pos[0]); + } + } +} + +template +__global__ void KeNearestNeighborInterpNCHWBw(T* in, + const size_t in_img_h, + const size_t in_img_w, + const T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t nc, + const float ratio_h, + const float ratio_w, + const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + T* in_pos = &in[in_index]; + const T out_pos = out[out_index]; + paddle::platform::CudaAtomicAdd(in_pos, out_pos); + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + +template +__global__ void KeNearestNeighborInterpBw( + T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + funcs::FastDivModForInterpolate divmods) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + const T out_pos = out[tid]; + paddle::platform::CudaAtomicAdd(in_pos, out_pos); + } +} + +/* Calculate the minimum of partial elements in a block */ +template +__inline__ __device__ T PartialBlockMin(T val, + size_t threads_num_in_block, + unsigned mask) { + __shared__ T shared[WARP_SIZE]; + __shared__ T shared_last_val; + __shared__ int shared_last_idx; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + int threshold = (threads_num_in_block & (-WARP_SIZE)); + + if (threadIdx.x < threshold) { + shared_last_idx = (threshold >> 5) - 1; + val = phi::funcs::warpReduceMin(val, mask); + if (lane == 0) { + shared[wid] = val; + } + } else { + shared_last_val = std::numeric_limits::max(); + paddle::platform::CudaAtomicMin(&shared_last_val, val); + shared[wid] = shared_last_val; + shared_last_idx = wid; + } + __syncthreads(); + + if (threadIdx.x < threshold) { + val = (lane <= shared_last_idx) ? 
shared[lane] + : std::numeric_limits::max(); + val = phi::funcs::warpReduceMin(val, mask); + shared_last_val = val; + } + __syncthreads(); + if (threadIdx.x >= threshold) { + val = shared_last_val; + } + return val; +} + +template +__global__ void KeBilinearInterpBwShareMemory(T* in, + const int in_h, + const int in_w, + const T* __restrict__ out, + const int out_h, + const int out_w, + const int n, + const int num_channels, + float ratio_h, + float ratio_w, + const T align_type_value, + bool is_nchw) { + __shared__ T s_data[2][1024]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_chw = in_h * in_w * num_channels; + int out_chw = num_channels * out_h * out_w; + int nthreads = n * out_chw; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / out_chw; + int out_id_w = tid % out_chw; + const int in_img_size = in_h * in_w; + const int out_img_size = out_h * out_w; + T value = out[out_id_h * out_chw + out_id_w]; + + int channel_id = out_id_w / out_img_size; + int out_img_idy = (out_id_w % out_img_size) / out_w; + int out_img_idx = tid % out_w; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); + + // top_left_index is just input_index. + int input_index = out_id_h * in_chw + channel_id * in_img_size + + in_img_idy * in_w + in_img_idx; + int top_right_index = input_index + w_id; + int bot_left_index = input_index + h_id * in_w; + int bot_right_index = input_index + h_id * in_w + w_id; + int in_top_min_index, in_bot_min_index; + + s_data[0][threadIdx.x] = 0.f; + s_data[1][threadIdx.x] = 0.f; + int remain = nthreads - (tid & (-blockDim.x)); + int in_top_max_index = + phi::funcs::blockReduceMax(top_right_index, FINAL_MASK); + int in_bot_max_index = + phi::funcs::blockReduceMax(bot_right_index, FINAL_MASK); + + if (remain > blockDim.x) { + in_top_min_index = phi::funcs::blockReduceMin(input_index, FINAL_MASK); + in_bot_min_index = phi::funcs::blockReduceMin(bot_left_index, FINAL_MASK); + } else { + in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK); + in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK); + } + int upper_limit_share_idx = (in_top_max_index - in_top_min_index) > + (in_bot_max_index - in_bot_min_index) + ? 
(in_top_max_index - in_top_min_index) + : (in_bot_max_index - in_bot_min_index); + if (h_id != 0) { + paddle::platform::CudaAtomicAdd( + &s_data[0][input_index - in_top_min_index], + h2lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[0][top_right_index - in_top_min_index], + h2lambda * w1lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[1][bot_left_index - in_bot_min_index], + h1lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[1][bot_right_index - in_bot_min_index], + h1lambda * w1lambda * value); + } else { + paddle::platform::CudaAtomicAdd( + &s_data[0][top_right_index - in_top_min_index], + (h2lambda + h1lambda) * w1lambda * value); + paddle::platform::CudaAtomicAdd( + &s_data[1][bot_left_index - in_bot_min_index], + (h1lambda + h2lambda) * w2lambda * value); + } + __syncthreads(); + + if (threadIdx.x <= upper_limit_share_idx) { + paddle::platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], + s_data[0][threadIdx.x]); + paddle::platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], + s_data[1][threadIdx.x]); + } + } +} + +__device__ __forceinline__ int GetInputIndex(const size_t nc, + const int height, + const int width, + const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, + const int in_h, + const int in_w, + const int out_h, + const int out_w, + const int n, + const int num_channels, + float ratio_h, + float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &h1, &y_id, &h1lambda, &h0lambda, src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex( + &w1, &x_id, &w1lambda, &w0lambda, src_x, in_w); + + T d2val = out[index]; + + paddle::platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + paddle::platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + paddle::platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + paddle::platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + +template +__global__ void KeBilinearInterpBw(T* in, + const int in_h, + const int in_w, + const T* __restrict__ out, + const int out_h, + const int out_w, + const int n, + const int out_chw, + const int num_channels, + float ratio_h, + float ratio_w, + const T align_type_value, + funcs::FastDivModForInterpolate divmods) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_chw = in_h * in_w * num_channels; + int nthreads = n * out_chw; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = 
divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + paddle::platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + paddle::platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + paddle::platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); + } +} + +template +__global__ void KeBicubicInterpBw(T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + T in_img_idy = align_corners + ? static_cast(ratio_h * out_img_idy) + : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); + int input_y = floorf(in_img_idy); + const T y_t = in_img_idy - input_y; + + T in_img_idx = align_corners + ? 
static_cast(ratio_w * out_img_idx) + : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); + int input_x = floorf(in_img_idx); + + const T x_t = in_img_idx - input_x; + + T x_coeffs[4]; + T y_coeffs[4]; + + funcs::get_cubic_upsample_coefficients(x_coeffs, x_t); + funcs::get_cubic_upsample_coefficients(y_coeffs, y_t); + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + T* in_pos; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + int access_y = max(min(static_cast(input_y - 1 + j), + static_cast(in_img_h - 1)), + 0); + int access_x = max(min(static_cast(input_x - 1 + i), + static_cast(in_img_w - 1)), + 0); + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x]; + } else { + in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x * num_channels + channel_id]; + } + paddle::platform::CudaAtomicAdd( + &in_pos[0], (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + } + } + } +} + +template +__global__ void KeTrilinearInterpBw(T* in, + const size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const T ratio_d, + const T ratio_h, + const T ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? 
in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + paddle::platform::CudaAtomicAdd( + &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[w_id], d2lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[w_id], d1lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + paddle::platform::CudaAtomicAdd( + &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], + d2lambda * h1lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + paddle::platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } + } +} + +template +__global__ void KeNearestNeighbor3DInterpBw(T* in, + const size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + const T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * 
gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = (align_corners) + ? static_cast(ratio_d * out_img_idt + 0.5) + : static_cast(ratio_d * out_img_idt); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + const T out_pos = out[out_id_h * output_w + out_id_w]; + paddle::platform::CudaAtomicAdd(in_pos, out_pos); + } +} + +template +static void Interpolate1DCUDABwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) 
{ + out_w = static_cast(in_w * scale_w); + } + + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_w = size_data[0]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } + + auto* output_grad_data = output_grad.data(); + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_w}; + } else { + dim_grad = {n, in_w, c}; + } + input_grad->Resize(dim_grad); + auto* input_grad_data = dev_ctx.template Alloc(input_grad); + + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + int64_t in_cw = c * in_w; + int64_t out_cw = c * out_w; + auto pixelNum = n * out_cw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("linear" == interp_method) { + KeLinearInterpBw<<>>(input_grad_data, + in_w, + in_cw, + output_grad_data, + out_w, + n, + out_cw, + c, + ratio_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCUDABwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_w = scale[1]; + scale_h = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_w > 0. && scale_h > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } + + auto* output_grad_data = output_grad.data(); + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + input_grad->Resize(dim_grad); + auto* input_grad_data = dev_ctx.template Alloc(input_grad); + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_hw = in_h * in_w; + int64_t out_hw = out_h * out_w; + int64_t in_chw = c * in_hw; + int64_t out_chw = c * out_hw; + auto pixelNum = n * out_chw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("nearest" == interp_method) { + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + backends::gpu::GpuLaunchConfig config_3d = + backends::gpu::GetGpuLaunchConfig3D(dev_ctx, nc, out_h, out_w); + KeNearestNeighborInterpNCHWBw<<>>(input_grad_data, + in_h, + in_w, + output_grad_data, + out_h, + out_w, + nc, + ratio_h, + ratio_w, + align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpBw<<>>(input_grad_data, + in_h, + in_w, + n, + in_chw, + output_grad_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + interp_divmods); + } + } else if ("bilinear" == interp_method) { + const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; + bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; + bool optimize_flag = false; +#ifndef __HIPCC__ + optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) + ? true + : ((in_h == 1 && in_w == 1) ? 
true : false); +#endif + + if (optimize_flag & is_nchw) { + KeBilinearInterpBwShareMemory<<>>(input_grad_data, + in_h, + in_w, + output_grad_data, + out_h, + out_w, + n, + c, + ratio_h, + ratio_w, + align_type_value, + is_nchw); + } else if (!optimize_flag & is_nchw) { + const int num_kernels = n * c * out_h * out_w; + const int num_threads = std::min(dev_ctx.GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>(input_grad_data, + in_h, + in_w, + out_h, + out_w, + n, + c, + ratio_h, + ratio_w, + output_grad_data, + align_type_value); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeBilinearInterpBw<<>>(input_grad_data, + in_h, + in_w, + output_grad_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_type_value, + interp_divmods); + } + } else if ("bicubic" == interp_method) { +#ifdef __HIPCC__ + constexpr int thread_per_block = 256; +#else + constexpr int thread_per_block = 512; +#endif + KeBicubicInterpBw< + T><<>>( + input_grad_data, + in_h, + in_w, + n, + in_chw, + output_grad_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCUDABwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* input_grad) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } + if 
(scale_d > 0. && scale_h > 0. && scale_w > 0.) { + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + auto* output_grad_data = output_grad.data(); + phi::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + input_grad->Resize(dim_grad); + auto* input_grad_data = dev_ctx.template Alloc(input_grad); + phi::funcs::SetConstant zero; + zero(dev_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(output_grad, dev_ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_dhw = in_d * in_h * in_w; + int64_t out_dhw = out_d * out_h * out_w; + int64_t in_cdhw = c * in_dhw; + int64_t out_cdhw = c * out_dhw; + + auto pixelNum = n * out_cdhw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("trilinear" == interp_method) { + KeTrilinearInterpBw<<>>(input_grad_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_grad_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + KeNearestNeighbor3DInterpBw<<>>(input_grad_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_grad_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +void InterpolateGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& output_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + auto output_grad_dims = output_grad.dims(); + if (output_grad_dims.size() == 3) { // 1D interpolation grad + Interpolate1DCUDABwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } else if (output_grad_dims.size() == 4) { // 2D interpolation grad + Interpolate2DCUDABwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + + } else if (output_grad_dims.size() == 5) { // 3D interpolation grad + Interpolate3DCUDABwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + output_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); + } +} + +template +void BilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void NearestInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void TrilinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + 
paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void LinearInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +template +void BicubicInterpGradKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad) { + InterpolateGradKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + out_grad, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::BilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(nearest_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::NearestInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(trilinear_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::TrilinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(linear_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::LinearInterpGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(bicubic_interp_v2_grad, + GPU, + ALL_LAYOUT, + phi::BicubicInterpGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu new file mode 100644 index 0000000000000..6e609aa11674e --- /dev/null +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -0,0 +1,1479 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/interpolate_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + +namespace phi { +using paddle::platform::FastDivMod; + +template +__forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( + int* in_img_idx, + int* x_id, + T* lambda1, + T* lambda2, + T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; +} + +template +__global__ void KeLinearInterpFw(const T* in, + const size_t in_img_w, + const size_t input_w, + T* out, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idx = tid % out_img_w; + } else { + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id + + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = + &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx]; + // linear interpolation + out[out_id_h * output_w + out_id_w] = + w2lambda * in_pos[0] + w1lambda * in_pos[w_id]; + + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idx * num_channels + channel_id]; + // linear interpolation + out[out_id_h * output_w + out_id_w] = + w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]; + } + } +} + +template +__global__ void KeNearestNeighborInterpNCHWFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t nc, + const float ratio_h, + const float ratio_w, + const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + out[out_index] = in[in_index]; + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + +template +__global__ void KeNearestNeighborInterpFw( + const T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + funcs::FastDivModForInterpolate divmods) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } +} + +template +__global__ void KeBilinearInterpFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const T align_type_value, + funcs::FastDivModForInterpolate divmods) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, h_id, w_id; + T h1lambda, w1lambda, h2lambda, w2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_img_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_img_h); + + // bilinear interpolation + const T* in_pos = + &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + out[tid] = + h2lambda * + (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * + (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * + in_pos[h_id * in_img_w * num_channels + w_id * num_channels]); + } +} + +template +__global__ void KeBilinearInterpNCHWFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t nc, + const float ratio_h, + const float ratio_w, + const T align_type_value) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + int in_img_idx, in_img_idy, h_id, w_id; + T h1lambda, w1lambda, h2lambda, w2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex( + &in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_img_w); + PreCalculatorForLinearInterpInputIndex( + &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_img_h); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + const T* in_pos = &in[in_index]; + out[out_index] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + 
w1lambda * in_pos[h_id * in_img_w + w_id]); + + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + +template +__device__ __forceinline__ static T Kecubic_interp( + const T x0, const T x1, const T x2, const T x3, T t) { + T coeffs[4]; + T a = -0.75; + T x_1 = t; + T x_2 = 1.0 - t; + coeffs[0] = funcs::CubicConvolution2(x_1 + 1.0, a); + coeffs[1] = funcs::CubicConvolution1(x_1, a); + coeffs[2] = funcs::CubicConvolution1(x_2, a); + coeffs[3] = funcs::CubicConvolution2(x_2 + 1.0, a); + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +template +__global__ void KeBicubicInterpFw(const T* in, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idy, out_img_idx; + + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + T in_img_idy = align_corners + ? static_cast(ratio_h * out_img_idy) + : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); + int input_y = floorf(in_img_idy); + const T y_t = in_img_idy - input_y; + + T in_img_idx = align_corners + ? 
static_cast(ratio_w * out_img_idx) + : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); + int input_x = floorf(in_img_idx); + const T x_t = in_img_idx - input_x; + + T coefficients[4]; + const T* in_pos_0; + const T* in_pos_1; + const T* in_pos_2; + const T* in_pos_3; + int access_x_0; + if (data_layout == DataLayout::kNCHW) { + for (int k = 0; k < 4; k++) { + int access_y = + max(min(input_y - 1 + k, static_cast(in_img_h - 1)), 0); + access_x_0 = max(min(input_x - 1, static_cast(in_img_w - 1)), 0); + int access_x_1 = + max(min(input_x + 0, static_cast(in_img_w - 1)), 0); + int access_x_2 = + max(min(input_x + 1, static_cast(in_img_w - 1)), 0); + int access_x_3 = + max(min(input_x + 2, static_cast(in_img_w - 1)), 0); + + in_pos_0 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_0]; + in_pos_1 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_1]; + in_pos_2 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_2]; + in_pos_3 = &in[out_id_h * input_w + channel_id * in_img_size + + access_y * in_img_w + access_x_3]; + + coefficients[k] = Kecubic_interp( + in_pos_0[0], in_pos_1[0], in_pos_2[0], in_pos_3[0], x_t); + } + + out[out_id_h * output_w + out_id_w] = Kecubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t); + + } else { + for (int k = 0; k < 4; k++) { + int access_y = + max(min(input_y - 1 + k, static_cast((in_img_h - 1))), 0); + int access_x_0 = + max(min(input_x - 1, static_cast((in_img_w - 1))), 0); + int access_x_1 = + max(min(input_x + 0, static_cast((in_img_w - 1))), 0); + int access_x_2 = + max(min(input_x + 1, static_cast((in_img_w - 1))), 0); + int access_x_3 = + max(min(input_x + 2, static_cast((in_img_w - 1))), 0); + + const T* in_pos_0 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_0 * num_channels + channel_id]; + const T* in_pos_1 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_1 * num_channels + channel_id]; + const T* in_pos_2 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_2 * num_channels + channel_id]; + const T* in_pos_3 = + &in[out_id_h * input_w + access_y * in_img_w * num_channels + + access_x_3 * num_channels + channel_id]; + + coefficients[k] = Kecubic_interp( + in_pos_0[0], in_pos_1[0], in_pos_2[0], in_pos_3[0], x_t); + } + + out[out_id_h * output_w + out_id_w] = + static_cast(Kecubic_interp(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + y_t)); + } + } +} + +template +__global__ void KeTrilinearInterpFw(const T* in, + const size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if 
(data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; + + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w] + + w1lambda * in_pos1[h_id * in_img_w + w_id])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w] + + w1lambda * in_pos2[h_id * in_img_w + w_id])); + + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + + w1lambda * in_pos1[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] + + w1lambda * in_pos1[h_id * in_img_w * num_channels + + w_id * num_channels])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + + w1lambda * in_pos2[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] + + w1lambda * in_pos2[h_id * in_img_w * num_channels + + w_id * num_channels])); + } + } +} + +template +__global__ void KeNearestNeighbor3DInterpFw(const T* in, + const 
size_t in_img_d, + const size_t in_img_h, + const size_t in_img_w, + const size_t input_h, + const size_t input_w, + T* out, + const size_t out_img_d, + const size_t out_img_h, + const size_t out_img_w, + const size_t output_h, + const size_t output_w, + const size_t num_channels, + const float ratio_d, + const float ratio_h, + const float ratio_w, + const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; // ncdhw + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + int in_img_idt = (align_corners) + ? static_cast(ratio_d * out_img_idt + 0.5) + : static_cast(ratio_d * out_img_idt); + + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idt * in_img_h * in_img_w + in_img_idy * in_img_w + + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + } +} + +template +static void Interpolate1DCUDAFwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto* input_data = input.data(); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_w = new_size[0]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale_w = scale_data[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } else { + if (scale.size() > 0) { + scale_w = scale[0]; + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + } + } + if (scale_w > 0.) 
{ + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_w = size_data[0]; + } + } + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_w}; + } else { + dim_out = {n, out_w, c}; + } + output->Resize(dim_out); + auto output_data = dev_ctx.template Alloc(output); + + if (in_w == out_w) { + paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output); + return; + } + + float ratio_w = 0.f; + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1.0) / (out_w - 1.0) + : static_cast(new_scale_w); + } + + int64_t in_cw = c * in_w; + int64_t out_cw = c * out_w; + auto pixelNum = n * out_cw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("linear" == interp_method) { + KeLinearInterpFw<<>>(input_data, + in_w, + in_cw, + output_data, + out_w, + n, + out_cw, + c, + ratio_w, + align_corners, + align_mode, + data_layout); + } +} + +template +static void Interpolate2DCUDAFwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto* input_data = input.data(); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + float scale_h = -1; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_w = scale[1]; + scale_h = scale[0]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_w > 0. && scale_h > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + } + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->Resize(dim_out); + auto output_data = dev_ctx.template Alloc(output); + + if (in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_hw = in_h * in_w; + int64_t out_hw = out_h * out_w; + int64_t in_chw = c * in_hw; + int64_t out_chw = c * out_hw; + + auto pixelNum = n * out_chw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("nearest" == interp_method) { + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + backends::gpu::GpuLaunchConfig config_3d = + backends::gpu::GetGpuLaunchConfig3D(dev_ctx, nc, out_h, out_w); + KeNearestNeighborInterpNCHWFw<<>>(input_data, + in_h, + in_w, + output_data, + out_h, + out_w, + nc, + ratio_h, + ratio_w, + align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpFw<<>>(input_data, + in_h, + in_w, + n, + in_chw, + output_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + interp_divmods); + } + } else if ("bilinear" == interp_method) { + dim3 thread_num = config.thread_per_block; +#ifdef WITH_NV_JETSON + if (config.compute_capability == 53 || config.compute_capability == 62) { + thread_num = 512; + } +#endif + const T align_type_value = (align_mode == 0 && !align_corners) ? 
0.5f : 0; + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + backends::gpu::GpuLaunchConfig config_3d = + backends::gpu::GetGpuLaunchConfig3D(dev_ctx, nc, out_h, out_w); + KeBilinearInterpNCHWFw<<>>(input_data, + in_h, + in_w, + output_data, + out_h, + out_w, + nc, + ratio_h, + ratio_w, + align_type_value); + } else { + int64_t cw = c * out_w; + auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw); + KeBilinearInterpFw< + T><<>>( + input_data, + in_h, + in_w, + n, + in_chw, + output_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_type_value, + interp_divmods); + } + } else if ("bicubic" == interp_method) { +#ifdef __HIPCC__ + constexpr int thread_per_block = 256; +#else + constexpr int thread_per_block = 512; +#endif + KeBicubicInterpFw< + T><<>>( + input_data, + in_h, + in_w, + n, + in_chw, + output_data, + out_h, + out_w, + n, + out_chw, + c, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +static void Interpolate3DCUDAFwd( + const Context& dev_ctx, + const DenseTensor& input, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout_str, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto* input_data = input.data(); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + float scale_w = -1; + float scale_d = -1; + float scale_h = -1; + if (size_tensor && size_tensor->size() > 0) { + // have size tensor + auto new_size = funcs::get_new_shape(size_tensor.get()); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + if (scale_data.size() > 1) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } else { + if (scale.size() > 1) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, + true, + errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, + true, + errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, + true, + errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); + } + } 
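+  // Shape resolution in this branch: the scale values (taken from the Scale
+  // tensor or the scale attribute) determine out_d/out_h/out_w first, and an
+  // explicit OutSize tensor, when provided, overrides them below.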
+ if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { + out_d = static_cast(in_d * scale_d); + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + if (out_size) { + DenseTensor sizes; + paddle::framework::TensorCopySync( + *out_size, paddle::platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + } + PADDLE_ENFORCE_GT( + out_d, + 0, + errors::InvalidArgument("out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, + 0, + errors::InvalidArgument("out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, + 0, + errors::InvalidArgument("out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + + phi::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + output->Resize(dim_out); + auto output_data = dev_ctx.template Alloc(output); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + paddle::framework::TensorCopy(input, dev_ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(new_scale_d); + } + if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(new_scale_h); + } + if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(new_scale_w); + } + + int64_t in_dhw = in_d * in_h * in_w; + int64_t out_dhw = out_d * out_h * out_w; + int64_t in_cdhw = c * in_dhw; + int64_t out_cdhw = c * out_dhw; + + auto pixelNum = n * out_cdhw; + + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pixelNum); + + if ("trilinear" == interp_method) { + KeTrilinearInterpFw<<>>(input_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + align_mode, + data_layout); + } else if ("nearest" == interp_method) { + KeNearestNeighbor3DInterpFw<<>>(input_data, + in_d, + in_h, + in_w, + n, + in_cdhw, + output_data, + out_d, + out_h, + out_w, + n, + out_cdhw, + c, + ratio_d, + ratio_h, + ratio_w, + align_corners, + data_layout); + } +} + +template +void InterpolateKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + auto input_dims = x.dims(); + if (input_dims.size() == 3) { // 1D interpolation + Interpolate1DCUDAFwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } else if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCUDAFwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCUDAFwd(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); + } +} + +template +void BilinearInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void NearestInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void TrilinearInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* 
output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void LinearInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +template +void BicubicInterpKernel( + const Context& dev_ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(dev_ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + align_corners, + align_mode, + output); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bilinear_interp_v2, + GPU, + ALL_LAYOUT, + phi::BilinearInterpKernel, + float, + double, + int) {} +PD_REGISTER_KERNEL(nearest_interp_v2, + GPU, + ALL_LAYOUT, + phi::NearestInterpKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_KERNEL(trilinear_interp_v2, + GPU, + ALL_LAYOUT, + phi::TrilinearInterpKernel, + float, + double, + int) {} +PD_REGISTER_KERNEL(linear_interp_v2, + GPU, + ALL_LAYOUT, + phi::LinearInterpKernel, + float, + double, + int) {} +PD_REGISTER_KERNEL(bicubic_interp_v2, + GPU, + ALL_LAYOUT, + phi::BicubicInterpKernel, + float, + double, + int) {} diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu index 5a4ce3a2679b9..171baab5513e4 100644 --- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu @@ -44,9 +44,9 @@ struct MaskedSelectGradFunctor { template void MaskedSelectGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& mask, + const DenseTensor& out_grad, DenseTensor* x_grad) { auto mask_size = mask.numel(); dev_ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index 752a91fa48198..ee5f843b18a90 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -243,8 +243,7 @@ void MultinomialKernel(const Context& dev_ctx, dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out); } else { std::vector out_dim_vec = vectorize(out->dims()); - DenseTensor value = - Empty(dev_ctx, ScalarArray(out_dim_vec)); + DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); TopkKernel( dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out); } diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu index 9a2d9c6e479aa..43106ec1d863f 100644 --- a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu @@ -23,8 +23,8 @@ template void NllLossGradKernel(const Context& dev_ctx, const 
DenseTensor& x, const DenseTensor& labels, - const DenseTensor& total_weight, paddle::optional weight, + const DenseTensor& total_weight, const DenseTensor& dout, int64_t ignore_index, const std::string& reduction, diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index 5ca8f3d73dade..8f4af0a450890 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -330,7 +330,7 @@ template void Pad3dGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& paddings, + const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index 8f7cf716e79cf..d1b1d70667673 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -327,7 +327,7 @@ __global__ void Pad3DCircularNDHWC(const int nthreads, template void Pad3dKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& paddings, + const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index d4cbd5c73feae..0188505002268 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -31,7 +31,7 @@ template void RandintRawKernel(const Context& dev_ctx, int low, int high, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, int seed, DenseTensor* out) { @@ -74,7 +74,7 @@ template void RandintKernel(const Context& dev_ctx, int low, int high, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, DenseTensor* out) { RandintRawKernel(dev_ctx, low, high, shape, dtype, 0, out); diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 92948bf47c934..678b580beca2f 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -89,12 +89,12 @@ void RandpermRawKernel( RandintKernel(dev_ctx, std::numeric_limits::min(), std::numeric_limits::max(), - ScalarArray({n}), + IntArray({n}), phi::DataType::INT32, &key); - DenseTensor key_out = Empty(dev_ctx, ScalarArray({n})); + DenseTensor key_out = Empty(dev_ctx, IntArray({n})); - DenseTensor range = Empty(dev_ctx, ScalarArray({n})); + DenseTensor range = Empty(dev_ctx, IntArray({n})); T* range_data = range.data(); funcs::ForRange for_range(dev_ctx, n); for_range([range_data] __device__(size_t idx) { diff --git a/paddle/phi/kernels/gpu/range_kernel.cu b/paddle/phi/kernels/gpu/range_kernel.cu new file mode 100644 index 0000000000000..65d9b45efbcdd --- /dev/null +++ b/paddle/phi/kernels/gpu/range_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
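+
+// range_kernel.cu: GPU implementation of the phi `range` kernel. The scalar
+// start/end/step tensors are read on the host (their backends are pinned to
+// CPU in the registration below), the output length is computed on the CPU,
+// and a grid-stride CUDA kernel fills the output values on the GPU.
+// Illustrative example, assuming the usual arange semantics: start = 0,
+// end = 10, step = 3 gives size = 4 and the values {0, 3, 6, 9}.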
+ +#include "paddle/phi/kernels/range_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/range_function.h" + +namespace phi { + +template +__global__ void Range(T start, T step, int64_t size, T* out) { + CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out) { + T start_value = start.data()[0]; + T end_value = end.data()[0]; + T step_value = step.data()[0]; + + int64_t size = 0; + phi::funcs::GetSize(start_value, end_value, step_value, &size); + out->Resize(phi::make_ddim({size})); + T* out_data = dev_ctx.template Alloc(out); + + auto stream = dev_ctx.stream(); + int block = std::min(size, static_cast(256)); + int grid = (size + block - 1) / block; + Range<<>>(start_value, step_value, size, out_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + range, GPU, ALL_LAYOUT, phi::RangeKernel, float, double, int64_t, int) { + kernel->InputAt(0).SetBackend(phi::Backend::CPU); + kernel->InputAt(1).SetBackend(phi::Backend::CPU); + kernel->InputAt(2).SetBackend(phi::Backend::CPU); +} diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 93e9e81882c9e..82e0fa72ab076 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -26,7 +26,7 @@ template void RollGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, DenseTensor* x_grad) { auto* in_data = out_grad.data(); diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index 1543335d3a0c5..5d3584e4f44c1 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -26,7 +26,7 @@ using paddle::platform::PADDLE_CUDA_NUM_THREADS; template void RollKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, DenseTensor* out) { auto* in_data = x.data(); diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index 83c2ec4b6e99d..73b64ce970319 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -24,7 +24,7 @@ namespace phi { template void SplitKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { // need to infershape output diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu new file mode 100644 index 0000000000000..9b754e22692af --- /dev/null +++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/stack_grad_kernel.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, + int pre_dim_size, + int split_dim_size, + int suf_dim_size, + int num_split, + T** output_ptrs) { + assert(blockDim.y == 1); + assert(blockDim.z == 1); + // In this case they are equal + assert(split_dim_size % num_split == 0); + + IntType size = pre_dim_size * split_dim_size * suf_dim_size; + IntType each_dim_size = split_dim_size / num_split; + + for (IntType offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; + offset += blockDim.x * gridDim.x) { + IntType i = offset / (split_dim_size * suf_dim_size); + IntType j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; + IntType k = offset % suf_dim_size; + + T* output = output_ptrs[j / each_dim_size]; + if (output == nullptr) { + return; + } + IntType output_ind = i * each_dim_size * suf_dim_size + + (j % each_dim_size) * suf_dim_size + k; + *(output + output_ind) = input[offset]; + } +} + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad) { + if (axis < 0) axis += out.dims().size(); + + int n = out.dims()[axis]; + PADDLE_ENFORCE_EQ(n, + x_grad.size(), + phi::errors::InvalidArgument( + "Output x_grad size should be equal to n, but" + " received n is:%d x_grad size is:%d.", + n, + x_grad.size())); + + // x_grad is output, so save each data address, then copy each dy into dx_data + std::vector outputs(n); + for (size_t j = 0; j < x_grad.size(); ++j) { + if (x_grad[j] == nullptr) { + outputs[j] = nullptr; + continue; + } + if (x_grad[j]->numel() != 0UL) { + T* ptr = dev_ctx.template Alloc(x_grad[j]); + outputs[j] = ptr; + } else { + outputs[j] = nullptr; + } + } + auto dy_data = out.data(); + // each x_grad should have same shape + int dy_pre = 1, dy_suf = 1; + auto dy_dims = out.dims(); + int split_dim = n; + for (int i = 0; i < axis; ++i) { + dy_pre *= dy_dims[i]; + } + dy_suf = out.numel() / (split_dim * dy_pre); + + auto tmp_out_data = + paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_out_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(outputs.data()), + outputs.size() * sizeof(T*), + dev_ctx.stream()); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, dy_pre * split_dim * dy_suf); + + if (out.numel() < std::numeric_limits::max()) { + UnStackHelperCUDAKernel<<>>( + dy_data, + dy_pre, + split_dim, + dy_suf, + split_dim, + reinterpret_cast(tmp_out_data->ptr())); + } else { + UnStackHelperCUDAKernel<<>>( + dy_data, + dy_pre, + split_dim, + dy_suf, + split_dim, + reinterpret_cast(tmp_out_data->ptr())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack_grad, + GPU, + ALL_LAYOUT, + phi::StackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu new file mode 100644 index 0000000000000..cc7d136c95293 --- /dev/null +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void StackCUDAKernel(T** input_ptrs, + int split_size, + int rows, + int cols, + T* __restrict__ output) { + IntType grid_x = blockIdx.x * blockDim.x + threadIdx.x; + + for (; grid_x < cols; grid_x += blockDim.x * gridDim.x) { + IntType grid_y = blockIdx.y * blockDim.y + threadIdx.y; + + IntType split = grid_x / split_size; + const T* input_ptr = input_ptrs[split]; + IntType col_offset = grid_x % split_size; +#pragma unroll + for (; grid_y < rows; grid_y += blockDim.y * gridDim.y) { + output[grid_y * cols + grid_x] = + input_ptr[grid_y * split_size + col_offset]; + } + } +} + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + T* y_data = dev_ctx.template Alloc(out); + std::vector x_datas(n); + for (int i = 0; i < n; i++) { + x_datas[i] = x[i]->data(); + } + + auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_x_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(x_datas.data()), + x_datas.size() * sizeof(T*), + dev_ctx.stream()); + + // Split x dim from axis to matrix + int x_row = 1, x_col = 1; + for (int i = 0; i < axis; ++i) { + x_row *= x[0]->dims()[i]; + } + x_col = x[0]->numel() / x_row; + int out_col = x_col * n; + + auto config = + phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); + + if (out->numel() < std::numeric_limits::max()) { + StackCUDAKernel<<>>( + reinterpret_cast(tmp_x_data->ptr()), + x_col, + x_row, + out_col, + y_data); + } else { + StackCUDAKernel<<>>( + reinterpret_cast(tmp_x_data->ptr()), + x_col, + x_row, + out_col, + y_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, + GPU, + ALL_LAYOUT, + phi::StackKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu b/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu index 5f31d488533a6..90d9f1d986577 100644 --- a/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/strided_slice_grad_kernel.cu @@ -19,10 +19,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h" -PD_REGISTER_KERNEL(strided_slice_grad, +PD_REGISTER_KERNEL(strided_slice_raw_grad, GPU, ALL_LAYOUT, - phi::StridedSliceGradKernel, + phi::StridedSliceRawGradKernel, bool, int, int64_t, diff --git a/paddle/phi/kernels/gpu/strided_slice_kernel.cu b/paddle/phi/kernels/gpu/strided_slice_kernel.cu index ff10718edb323..716150ff47dea 100644 --- a/paddle/phi/kernels/gpu/strided_slice_kernel.cu 
+++ b/paddle/phi/kernels/gpu/strided_slice_kernel.cu @@ -19,10 +19,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/strided_slice_kernel_impl.h" -PD_REGISTER_KERNEL(strided_slice, +PD_REGISTER_KERNEL(strided_slice_raw, GPU, ALL_LAYOUT, - phi::StridedSliceKernel, + phi::StridedSliceRawKernel, bool, int, int64_t, diff --git a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu index b0b45223489e9..32c5fc0006f4c 100644 --- a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu @@ -25,10 +25,10 @@ namespace ops = paddle::operators; template void TopkGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& indices, - int k, + const DenseTensor& out_grad, + const Scalar& k_scalar, int axis, bool largest, bool sorted, @@ -36,6 +36,8 @@ void TopkGradKernel(const Context& dev_ctx, const auto& in_dims = x.dims(); const auto& out_dims = indices.dims(); + int k = k_scalar.to(); + // get the real the axis and the k if (axis < 0) { axis += in_dims.size(); diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index f137d8e1c2603..a48afeb2c796b 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -45,14 +45,14 @@ void TriangularSolveKernel(const Context& dev_ctx, int y_bst_ndim = y_bst_dims_vec.size(); // Tensor broadcast to 'out' and temp 'x_bst' - ScalarArray x_bst_dims(x_bst_dims_vec); + IntArray x_bst_dims(x_bst_dims_vec); DenseTensor x_bst = phi::Empty(dev_ctx, x_bst_dims); const T* x_bst_data = x_bst.data(); ExpandKernel(dev_ctx, x, x_bst_dims, &x_bst); out->Resize(phi::make_ddim(y_bst_dims_vec)); T* out_data = dev_ctx.template Alloc(out); - ScalarArray y_bst_dims(y_bst_dims_vec); + IntArray y_bst_dims(y_bst_dims_vec); ExpandKernel(dev_ctx, y, y_bst_dims, out); // calculate use cublas library diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index cdab9faf6aafe..2cabde0bbf942 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -93,7 +93,7 @@ struct UniformGeneratorOffset { template void UniformRandomRawKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, @@ -140,7 +140,7 @@ void UniformRandomRawKernel(const Context& dev_ctx, template void UniformRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu new file mode 100644 index 0000000000000..c09730ba76a91 --- /dev/null +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -0,0 +1,615 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
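+
+// unique_kernel.cu: GPU implementation of phi `unique` / `unique_raw` built on
+// Thrust. The flattened path sorts the values with sort_by_key (carrying the
+// original positions along), then uses unique_by_key, adjacent_difference,
+// inclusive_scan and scatter to produce the unique values, the inverse index
+// and the counts. When `axis` is given, the tensor is transposed so that
+// `axis` becomes dim 0, reshaped to 2-D, and whole rows are compared instead
+// of scalars. Illustrative example for the flattened, sorted case:
+// x = {2, 1, 2, 3} gives out = {1, 2, 3}, inverse index = {1, 0, 1, 2},
+// counts = {1, 2, 1}.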
+ +#include "paddle/phi/kernels/unique_kernel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +// Binary function 'less than' +template +struct LessThan { + int col; + const InT* in_trans_data; + + LessThan(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + } +}; + +// Binary function 'equal_to' +template +struct BinaryEqual { + int64_t col; + const InT* in_trans_data; + + BinaryEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return false; + } + } + return true; + } +}; + +// Binary function 'not_equal_to' +template +struct BinaryNotEqual { + int64_t col; + const InT* in_trans_data; + + BinaryNotEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return true; + } + } + return false; + } +}; + +// index_select() function for DenseTensor +template +void IndexSelect(const Context& context, + const DenseTensor& input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + std::vector out_vec(output->numel()); + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + input_dim[dim], + index_vec[i])); + } + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_vec[j]; + for (auto k = 0; k < slice_size; k++) { + out_vec[output_start_offset + j * slice_size + k] = + input_vec[input_start_offset + index_value * slice_size + k]; + } + } + } + context.template Alloc(output); + paddle::framework::TensorFromVector(out_vec, context, output); + output->Resize(output_dim); +} + +// The core logic of computing Unique for a flattend DenseTensor +template +static void UniqueFlattendCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t num_input) { + // 0. Prepration + DenseTensor in_hat; + phi::Copy(context, in, context.GetPlace(), false, &in_hat); + auto* in_data_hat = context.template Alloc(&in_hat); + + indices->Resize(phi::make_ddim({num_input})); + auto* indices_data = context.template Alloc(indices); + + thrust::sequence(thrust::device, indices_data, indices_data + num_input); + thrust::sort_by_key( + thrust::device, in_data_hat, in_data_hat + num_input, indices_data); + + // 1. Calculate op result: 'out' + DenseTensor range; + range.Resize(phi::make_ddim({num_input + 1})); + auto* range_data_ptr = context.template Alloc(&range); + thrust::sequence( + thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + phi::Copy(context, in_hat, context.GetPlace(), false, out); + int num_out; + auto out_data = context.template Alloc(out); + num_out = + thrust::unique_by_key( + thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + .first - + out_data; + out->Resize(phi::make_ddim({num_out})); + + // 3. Calculate inverse index: 'inverse' + if (return_inverse) { + index->Resize(phi::make_ddim({num_input})); + auto* inverse_data = context.template Alloc(index); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({num_input})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + in_data_hat, + in_data_hat + num_input, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + indices_data, + inverse_data); + } + + // 2. Calculate sorted index: 'indices' + if (return_index) { + DenseTensor tmp_indices; + tmp_indices.Resize(phi::make_ddim({num_input})); + auto* tmp_indices_data_ptr = context.template Alloc(&tmp_indices); + thrust::copy(thrust::device, + in_data_hat, + in_data_hat + num_input, + tmp_indices_data_ptr); + thrust::unique_by_key(thrust::device, + tmp_indices_data_ptr, + tmp_indices_data_ptr + num_input, + indices_data, + equal); + indices->Resize(phi::make_ddim({num_out})); + } + + // 4. 
Calculate 'counts' + if (return_counts) { + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + // init 'count_data' as 0 + thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = num_input; + thrust::adjacent_difference(thrust::device, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); + } +} + +// The logic of compute unique with axis required, it's a little different +// from above function +template +static void ComputeUniqueDims(const Context& context, + DenseTensor* sorted_indices, + IndexT* sorted_indices_data, + DenseTensor* out, + DenseTensor* inverse, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t row) { + // 1. inverse indices: 'inverse' + inverse->Resize(phi::make_ddim({row})); + auto* inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({row})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + sorted_indices_data, + inverse_data); + + // 2. sorted indices + DenseTensor range; + range.Resize(phi::make_ddim({row + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + int num_out; + num_out = thrust::unique_by_key(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + range_data_ptr, + equal) + .first - + sorted_indices_data; + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(phi::make_ddim({num_out})); + + // 3. counts: 'counts' + counts->Resize(phi::make_ddim({num_out})); + auto* count_data = context.template Alloc(counts); + thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::adjacent_difference( + thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); +} + +// Calculate unique when 'axis' is set +template +static void UniqueDimsCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int axis) { + // 1. Transpose & reshape + // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + auto in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), // num of dims + context, // device + in, // original DenseTensor + &in_trans, // DenseTensor after reshape + permute); // index of axis + + // Reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] + auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // now 'in_trans' is 2D + int64_t col = in_trans.dims()[1]; + int64_t row = in_trans.dims()[0]; + const InT* in_trans_data = in_trans.data(); + + indices->Resize(phi::make_ddim({row})); + auto* sorted_indices_data = context.template Alloc(indices); + + // 2. Calculate 'indices', 'inverse', 'counts' + // Init index and sort + thrust::sequence( + thrust::device, sorted_indices_data, sorted_indices_data + row); + thrust::sort(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + LessThan(col, in_trans_data)); + ComputeUniqueDims( + context, + indices, + sorted_indices_data, + out, + index, + counts, + return_index, + return_inverse, + return_counts, + BinaryEqual(col, in_trans_data), + BinaryNotEqual(col, in_trans_data), + row); + + // 3. Select indices and reshape back to get 'out' + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = indices->numel(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + + IndexSelect(context, in_trans, *indices, &out_trans, 0); + + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); + phi::funcs::ConcatFunctor concat_functor; + concat_functor(context, out_trans_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); +} + +// functor for processing a flattend DenseTensor +template +struct UniqueFlattendCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* counts_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueFlattendCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + counts_(counts), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueFlattendCUDATensor(ctx_, + in_, + out_, + indices_, + index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + thrust::equal_to(), + thrust::not_equal_to(), + in_.numel()); + } +}; + +// functor for processing a multi-dimentional DenseTensor +template +struct UniqueDimsCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + DenseTensor* indices_; + DenseTensor* index_; + DenseTensor* counts_; + const int axis_; + const bool return_index_; + const bool return_inverse_; + const bool return_counts_; + + UniqueDimsCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + const int axis, + bool return_index, + bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + indices_(indices), + index_(index), + counts_(counts), + axis_(axis), + return_index_(return_index), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueDimsCUDATensor(ctx_, + in_, + out_, + indices_, + 
index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + axis_); + } +}; + +template +void UniqueRawKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + if (dtype == phi::DataType::INT32) { + PADDLE_ENFORCE_LE( + x.numel() + 1, + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + // if 'axis' is not required, flatten the DenseTensor. + if (axis.empty()) { + phi::VisitDataTypeTiny( + dtype, + UniqueFlattendCUDAFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); + } else { + // 'axis' is required. + int axis_value = axis[0]; + phi::VisitDataTypeTiny(dtype, + UniqueDimsCUDAFunctor(context, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); + } +} + +template +void UniqueKernel(const Context& context, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts) { + bool is_sorted = true; + UniqueRawKernel(context, + x, + return_index, + return_inverse, + return_counts, + axis, + dtype, + is_sorted, + out, + indices, + index, + counts); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + unique, GPU, ALL_LAYOUT, phi::UniqueKernel, float, double, int64_t, int) {} + +PD_REGISTER_KERNEL(unique_raw, + GPU, + ALL_LAYOUT, + phi::UniqueRawKernel, + float, + double, + int64_t, + int) {} diff --git a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu new file mode 100644 index 0000000000000..b7c349de0df32 --- /dev/null +++ b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unstack_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack_grad, + GPU, + ALL_LAYOUT, + phi::UnStackGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/unstack_kernel.cu b/paddle/phi/kernels/gpu/unstack_kernel.cu new file mode 100644 index 0000000000000..f147f4c0f0edf --- /dev/null +++ b/paddle/phi/kernels/gpu/unstack_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unstack_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" + +PD_REGISTER_KERNEL(unstack, + GPU, + ALL_LAYOUT, + phi::UnStackKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/histogram_kernel.h b/paddle/phi/kernels/histogram_kernel.h index b6b4593361dad..0020f7b0435da 100644 --- a/paddle/phi/kernels/histogram_kernel.h +++ b/paddle/phi/kernels/histogram_kernel.h @@ -18,11 +18,11 @@ namespace phi { template -void HistogramSelectKernel(const Context& dev_ctx, - const DenseTensor& input, - int64_t bins, - int min, - int max, - DenseTensor* out); +void HistogramKernel(const Context& dev_ctx, + const DenseTensor& input, + int64_t bins, + int min, + int max, + DenseTensor* output); } // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index e3ea10705d24e..8c37091ef1b54 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -46,8 +46,8 @@ void CholeskySolveGradKernel(const Context& dev_ctx, std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = funcs::MatrixGetBroadcastDims(x, y); - ScalarArray x_bst_dims(x_bst_dims_vec); - ScalarArray y_bst_dims(y_bst_dims_vec); + IntArray x_bst_dims(x_bst_dims_vec); + IntArray y_bst_dims(y_bst_dims_vec); // Tensor broadcast to temp 'y_bst' DenseTensor y_bst = phi::Empty(dev_ctx, y_bst_dims); diff --git a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h index 16ceb776f1a98..c039d11635ba2 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h @@ -49,8 +49,8 @@ void CholeskySolveKernel(const Context& dev_ctx, std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = funcs::MatrixGetBroadcastDims(x, y); - ScalarArray x_bst_dims(x_bst_dims_vec); - ScalarArray y_bst_dims(y_bst_dims_vec); + IntArray x_bst_dims(x_bst_dims_vec); + IntArray y_bst_dims(y_bst_dims_vec); DenseTensor y_bst = phi::Empty(dev_ctx, y_bst_dims); ExpandKernel(dev_ctx, y, y_bst_dims, &y_bst); @@ -79,8 +79,7 @@ void CholeskySolveKernel(const Context& dev_ctx, int N = static_cast(x_bst_dims_vec[x_bst_ndim - 1]); int batchsize = product(phi::slice_ddim(x_bst.dims(), 0, x_bst_ndim - 2)); - DenseTensor info = - phi::Empty(dev_ctx, ScalarArray({batchsize})); + DenseTensor info = phi::Empty(dev_ctx, IntArray({batchsize})); int* info_data = info.data(); CholeskySolveFunctor functor; diff --git a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h index 766f91cd22e1f..a4fc7157eeaf8 100644 --- a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h @@ -50,7 +50,7 @@ template void ExpandGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& out_grad, 
- const ScalarArray& shape, + const IntArray& shape, DenseTensor* in_grad) { auto expand_shape = shape.GetData(); auto x_dims = x.dims(); diff --git a/paddle/phi/kernels/impl/expand_kernel_impl.h b/paddle/phi/kernels/impl/expand_kernel_impl.h index 4d287771be297..54fd1100ab332 100644 --- a/paddle/phi/kernels/impl/expand_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_kernel_impl.h @@ -27,7 +27,7 @@ using Tensor = DenseTensor; template void Expand(const Context& ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* out) { auto in_dims = x.dims(); auto expand_shape = shape.GetData(); @@ -107,7 +107,7 @@ void Expand(const Context& ctx, template void ExpandKernel(const Context& ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* out) { auto rank = x.dims().size(); PADDLE_ENFORCE_GE( diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h index 0e39c0a726bf4..96660c7084be6 100644 --- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -48,9 +48,9 @@ inline void GetOffsets(const DDim& big_dim, template void SetValueGradImpl(const Context& dev_ctx, const DenseTensor& out_grad, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, @@ -249,9 +249,9 @@ void SetValueGradImpl(const Context& dev_ctx, template void SetValueGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h index cbe94efb43908..229dcf671f993 100644 --- a/paddle/phi/kernels/impl/set_value_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -73,9 +73,9 @@ template void SetValueImpl(const Context& dev_ctx, const DenseTensor& in, const DenseTensor& value, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, @@ -134,9 +134,9 @@ void SetValueImpl(const Context& dev_ctx, Copy(dev_ctx, in, place, false, out); DenseTensor slice_tensor = - Empty(dev_ctx, ScalarArray{slice_dims.Get(), slice_dims.size()}); + Empty(dev_ctx, IntArray{slice_dims.Get(), slice_dims.size()}); DenseTensor pad_tensor = - Empty(dev_ctx, ScalarArray{in_dims.Get(), in_dims.size()}); + Empty(dev_ctx, IntArray{in_dims.Get(), in_dims.size()}); auto pad_e = EigenTensor::From(pad_tensor, in_dims); auto out_e = EigenTensor::From(*out); @@ -211,9 +211,9 @@ template void 
SetTensorValueKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& value, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, @@ -302,9 +302,9 @@ void SetTensorValueKernel(const Context& dev_ctx, template void SetValueKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, diff --git a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h index 1dbb5bd142c52..a5c67a32553c9 100644 --- a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h @@ -274,8 +274,8 @@ void SliceGradRawKernel(const Context& ctx, const DenseTensor& input, const DenseTensor& out_grad, const std::vector& axes, - const ScalarArray& starts_arr, - const ScalarArray& ends_arr, + const IntArray& starts_arr, + const IntArray& ends_arr, const std::vector& infer_flags, const std::vector& decrease_axis, DenseTensor* input_grad) { diff --git a/paddle/phi/kernels/impl/slice_kernel_impl.h b/paddle/phi/kernels/impl/slice_kernel_impl.h index 5c127358e8eee..b855ef43aa7ee 100644 --- a/paddle/phi/kernels/impl/slice_kernel_impl.h +++ b/paddle/phi/kernels/impl/slice_kernel_impl.h @@ -110,8 +110,8 @@ template void SliceRawKernel(const Context& ctx, const DenseTensor& input, const std::vector& axes, - const ScalarArray& starts_arr, - const ScalarArray& ends_arr, + const IntArray& starts_arr, + const IntArray& ends_arr, const std::vector& infer_flags, const std::vector& decrease_axis, DenseTensor* out) { diff --git a/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h index 1d75b32a5f21d..95780682c98dd 100644 --- a/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/strided_slice_grad_kernel_impl.h @@ -20,16 +20,16 @@ namespace phi { template -void StridedSliceGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* x_grad) { +void StridedSliceRawGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* x_grad) { int rank = x.dims().size(); #define SLICE_CASE(Rank) \ case Rank: \ @@ -62,9 +62,9 @@ void StridedSliceArrayGradKernel( const std::vector& x, const std::vector& out_grad, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, std::vector x_grad) { diff --git a/paddle/phi/kernels/impl/strided_slice_kernel_impl.h b/paddle/phi/kernels/impl/strided_slice_kernel_impl.h index f98ac1aedcf17..81e6d5056267a 100644 --- 
a/paddle/phi/kernels/impl/strided_slice_kernel_impl.h +++ b/paddle/phi/kernels/impl/strided_slice_kernel_impl.h @@ -20,15 +20,15 @@ namespace phi { template -void StridedSliceKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* out) { +void StridedSliceRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* out) { int rank = x.dims().size(); #define SLICE_CASE(Rank) \ case Rank: \ @@ -58,9 +58,9 @@ template void StridedSliceArrayKernel(const Context& dev_ctx, const std::vector& x, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, std::vector out) { diff --git a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h index b373855eee50a..9e56e50534d19 100644 --- a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h @@ -53,7 +53,7 @@ template void TileGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& repeat_times, + const IntArray& repeat_times, DenseTensor* x_grad) { auto x_dims = x.dims(); auto vec_x_dims = phi::vectorize(x_dims); diff --git a/paddle/phi/kernels/impl/tile_kernel_impl.h b/paddle/phi/kernels/impl/tile_kernel_impl.h index bafbbde4e680d..d19a6a7800671 100644 --- a/paddle/phi/kernels/impl/tile_kernel_impl.h +++ b/paddle/phi/kernels/impl/tile_kernel_impl.h @@ -85,7 +85,7 @@ void Tile(const Context& dev_ctx, template void TileKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& repeat_times, + const IntArray& repeat_times, DenseTensor* out) { auto rank = x.dims().size(); auto& repeat_times_data = repeat_times.GetData(); diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h index 044adb0230cac..30f2d5a05cdc0 100644 --- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -44,7 +44,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, std::tie(x_bst_dims_vec, y_bst_dims_vec) = funcs::MatrixGetBroadcastDims(x, y); - ScalarArray y_bst_dims_array(y_bst_dims_vec); + IntArray y_bst_dims_array(y_bst_dims_vec); DenseTensor dy_bst = phi::Empty(dev_ctx, y_bst_dims_array); if (dy) { // calculate x's conjugate for complex @@ -71,7 +71,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, } } - ScalarArray x_bst_dims_array(x_bst_dims_vec); + IntArray x_bst_dims_array(x_bst_dims_vec); DenseTensor dx_bst = phi::Empty(dev_ctx, x_bst_dims_array); if (dx) { // calculate x's conjugate for complex diff --git a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h index 884fa26df451c..58efff16a5a5a 100644 --- a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h @@ -21,7 +21,7 @@ namespace phi { template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& 
axes, + const IntArray& axes, DenseTensor* xshape, DenseTensor* out) { auto x_dims = x.dims(); diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h new file mode 100644 index 0000000000000..0576742e349a8 --- /dev/null +++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#endif +namespace phi { + +template +void UnStackGradKernel(const Context &dev_ctx, + const std::vector &x, + int axis, + DenseTensor *x_grad) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto *x_grad_data = dev_ctx.template Alloc(x_grad); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1; + int post = 1; + auto &dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + +#if defined(__NVCC__) || defined(__HIPCC__) + int total_num = pre * n * post; + + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); + + phi::funcs::StackFunctorForRange( + dev_ctx, x_data_arr, x_grad_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#else + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +#endif +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/unstack_kernel_impl.h b/paddle/phi/kernels/impl/unstack_kernel_impl.h new file mode 100644 index 0000000000000..030f4a62c6e00 --- /dev/null +++ b/paddle/phi/kernels/impl/unstack_kernel_impl.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/stack_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include +#endif + +namespace phi { + +template +void UnStackKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + int num, + std::vector outs) { + auto *dy = &x; + auto dx = outs; + if (axis < 0) axis += dy->dims().size(); + + int n = dy->dims()[axis]; + std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { + dx_datas[i] = dev_ctx.template Alloc(dx[i]); + } + auto dy_data = dy->data(); + if (dy->numel() == 0) return; + int pre = 1; + for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; + int total_num = dy->numel(); + int post = total_num / (n * pre); + +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector device_dx_vec(dx_datas); + auto dx_data_arr = device_dx_vec.data().get(); +#else + auto dx_data_arr = dx_datas.data(); +#endif + phi::funcs::StackGradFunctorForRange( + dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#if defined(__NVCC__) || defined(__HIPCC__) + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif +} + +} // namespace phi diff --git a/paddle/phi/kernels/index_sample_grad_kernel.h b/paddle/phi/kernels/index_sample_grad_kernel.h index 5c6e101f1b43d..2b66076ee0a2b 100644 --- a/paddle/phi/kernels/index_sample_grad_kernel.h +++ b/paddle/phi/kernels/index_sample_grad_kernel.h @@ -20,9 +20,9 @@ namespace phi { template void IndexSampleGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& index, + const DenseTensor& out_grad, DenseTensor* in_grad); } // namespace phi diff --git a/paddle/phi/kernels/interpolate_grad_kernel.h b/paddle/phi/kernels/interpolate_grad_kernel.h new file mode 100644 index 0000000000000..59d2dddd87007 --- /dev/null +++ b/paddle/phi/kernels/interpolate_grad_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearInterpGradKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const DenseTensor& out_grad, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/interpolate_kernel.h b/paddle/phi/kernels/interpolate_kernel.h new file mode 100644 index 0000000000000..4623657f5a594 --- /dev/null +++ b/paddle/phi/kernels/interpolate_kernel.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void NearestInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void TrilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void LinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +template +void BicubicInterpKernel( + const Context& ctx, + const DenseTensor& x, + paddle::optional out_size, + paddle::optional> size_tensor, + + paddle::optional scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output); + +} // namespace phi diff --git a/paddle/phi/kernels/masked_select_grad_kernel.h b/paddle/phi/kernels/masked_select_grad_kernel.h index f9db1fcd2acc7..db7d105093d2a 100644 --- a/paddle/phi/kernels/masked_select_grad_kernel.h +++ b/paddle/phi/kernels/masked_select_grad_kernel.h @@ -19,9 +19,9 @@ namespace phi { template void MaskedSelectGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& mask, + const DenseTensor& out_grad, DenseTensor* x_grad); } // namspace phi diff --git a/paddle/phi/kernels/nll_loss_grad_kernel.h b/paddle/phi/kernels/nll_loss_grad_kernel.h index 127dc2f961f10..c06f0726899ee 100644 --- a/paddle/phi/kernels/nll_loss_grad_kernel.h +++ b/paddle/phi/kernels/nll_loss_grad_kernel.h @@ -22,8 +22,8 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& label, - const DenseTensor& total_weight, paddle::optional weight, + const DenseTensor& total_weight, const DenseTensor& d_out, int64_t ignore_index, const std::string& reduction, diff --git a/paddle/phi/kernels/pad3d_grad_kernel.h 
b/paddle/phi/kernels/pad3d_grad_kernel.h index 38f1e5335e8c2..bbad50f4d83bd 100644 --- a/paddle/phi/kernels/pad3d_grad_kernel.h +++ b/paddle/phi/kernels/pad3d_grad_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -23,7 +23,7 @@ template void Pad3dGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& paddings, + const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, diff --git a/paddle/phi/kernels/pad3d_kernel.h b/paddle/phi/kernels/pad3d_kernel.h index d8876c3e7bc74..1589ff854ec23 100644 --- a/paddle/phi/kernels/pad3d_kernel.h +++ b/paddle/phi/kernels/pad3d_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -22,7 +22,7 @@ namespace phi { template void Pad3dKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& paddings, + const IntArray& paddings, const std::string& mode, float pad_value, const std::string& data_format, diff --git a/paddle/phi/kernels/randint_kernel.h b/paddle/phi/kernels/randint_kernel.h index bfefc628614fb..85d440e305635 100644 --- a/paddle/phi/kernels/randint_kernel.h +++ b/paddle/phi/kernels/randint_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -23,7 +23,7 @@ template void RandintKernel(const Context& dev_ctx, int low, int high, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, DenseTensor* out); @@ -31,7 +31,7 @@ template void RandintRawKernel(const Context& dev_ctx, int low, int high, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, int seed, DenseTensor* out); diff --git a/paddle/pten/kernels/slice_kernel.h b/paddle/phi/kernels/range_kernel.h similarity index 63% rename from paddle/pten/kernels/slice_kernel.h rename to paddle/phi/kernels/range_kernel.h index ff27824b9e676..c76308193ae5e 100644 --- a/paddle/pten/kernels/slice_kernel.h +++ b/paddle/phi/kernels/range_kernel.h @@ -14,19 +14,15 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { template -void SliceRawKernel(const Context& ctx, - const DenseTensor& input, - const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* out); +void RangeKernel(const Context& dev_ctx, + const DenseTensor& start, + const DenseTensor& end, + const DenseTensor& step, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 12a75a838058a..efcad999b447d 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -24,7 +24,7 @@ namespace phi { template void ReshapeKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* out) { MetaTensor meta_out(out); InferMetaFromVecValue(x, shape.GetData(), &meta_out); @@ -44,7 +44,7 @@ void ReshapeKernel(const Context& dev_ctx, template void ReshapeWithXShape(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* out, DenseTensor* 
xshape) { ReshapeKernel(dev_ctx, x, shape, out); diff --git a/paddle/phi/kernels/reshape_kernel.h b/paddle/phi/kernels/reshape_kernel.h index 11b19766a918b..88b1bd9587140 100644 --- a/paddle/phi/kernels/reshape_kernel.h +++ b/paddle/phi/kernels/reshape_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -24,13 +24,13 @@ namespace phi { template void ReshapeKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* out); template void ReshapeWithXShape(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shape, + const IntArray& shape, DenseTensor* out, DenseTensor* xshape); @@ -41,7 +41,7 @@ DenseTensor Reshape(const Context& dev_ctx, DenseTensor dense_out; MetaTensor meta_out(&dense_out); InferMetaFromVecValue(x, shape, &meta_out); - ReshapeKernel(dev_ctx, x, ScalarArray(shape), &dense_out); + ReshapeKernel(dev_ctx, x, IntArray(shape), &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/roll_grad_kernel.h b/paddle/phi/kernels/roll_grad_kernel.h index 331f3626e5657..82e9c0249f7af 100644 --- a/paddle/phi/kernels/roll_grad_kernel.h +++ b/paddle/phi/kernels/roll_grad_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -23,7 +23,7 @@ template void RollGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, DenseTensor* x_grad); diff --git a/paddle/phi/kernels/roll_kernel.h b/paddle/phi/kernels/roll_kernel.h index 56f32174a4c00..4e0c41b8b8de8 100644 --- a/paddle/phi/kernels/roll_kernel.h +++ b/paddle/phi/kernels/roll_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -22,7 +22,7 @@ namespace phi { template void RollKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& shifts, + const IntArray& shifts, const std::vector& axis, DenseTensor* out); diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index 39fd009cd6586..03cd7fed411f3 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -29,7 +29,7 @@ namespace sr { template void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val, DataType dtype, SelectedRows* out) { diff --git a/paddle/phi/kernels/selected_rows/full_kernel.h b/paddle/phi/kernels/selected_rows/full_kernel.h index d84ddcc0d3f63..d4b1859fdfcfb 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.h +++ b/paddle/phi/kernels/selected_rows/full_kernel.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/selected_rows.h" namespace phi { @@ -23,7 +23,7 @@ namespace sr { template void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val, DataType dtype, SelectedRows* out); diff --git 
a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc index b3dd1d1b7d2a0..ce2bbc533f703 100644 --- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc @@ -24,7 +24,7 @@ namespace sr { template void UniformRandomRawKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, @@ -47,7 +47,7 @@ void UniformRandomRawKernel(const Context& dev_ctx, template void UniformRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.h b/paddle/phi/kernels/selected_rows/uniform_random_kernel.h index aee7a4c7aaf62..d0a9084b40b37 100644 --- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.h +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/selected_rows.h" namespace phi { @@ -22,7 +22,7 @@ namespace sr { template void UniformRandomRawKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, @@ -34,7 +34,7 @@ void UniformRandomRawKernel(const Context& dev_ctx, template void UniformRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h index 6a028b0c8dc50..3ee95d7628193 100644 --- a/paddle/phi/kernels/set_value_grad_kernel.h +++ b/paddle/phi/kernels/set_value_grad_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -22,9 +22,9 @@ namespace phi { template void SetValueGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, diff --git a/paddle/phi/kernels/set_value_kernel.h b/paddle/phi/kernels/set_value_kernel.h index 271691b1a3596..69fd88f02e852 100644 --- a/paddle/phi/kernels/set_value_kernel.h +++ b/paddle/phi/kernels/set_value_kernel.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" @@ -25,9 +25,9 @@ template void SetTensorValueKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& value, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, @@ -36,9 +36,9 @@ void SetTensorValueKernel(const Context& dev_ctx, template void SetValueKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& steps, + const IntArray& starts, + const IntArray& ends, + const IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, 
const std::vector& none_axes, diff --git a/paddle/phi/kernels/slice_grad_kernel.h b/paddle/phi/kernels/slice_grad_kernel.h index a7ee9ffde4eb0..a74b432c2b1b9 100644 --- a/paddle/phi/kernels/slice_grad_kernel.h +++ b/paddle/phi/kernels/slice_grad_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -24,8 +24,8 @@ void SliceGradRawKernel(const Context& ctx, const DenseTensor& input, const DenseTensor& out_grad, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, + const IntArray& starts, + const IntArray& ends, const std::vector& infer_flags, const std::vector& decrease_axis, DenseTensor* input_grad); diff --git a/paddle/phi/kernels/slice_kernel.h b/paddle/phi/kernels/slice_kernel.h index ff27824b9e676..c2a96312cdd5e 100644 --- a/paddle/phi/kernels/slice_kernel.h +++ b/paddle/phi/kernels/slice_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -23,8 +23,8 @@ template void SliceRawKernel(const Context& ctx, const DenseTensor& input, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, + const IntArray& starts, + const IntArray& ends, const std::vector& infer_flags, const std::vector& decrease_axis, DenseTensor* out); diff --git a/paddle/phi/kernels/sparse/copy_kernel.cc b/paddle/phi/kernels/sparse/copy_kernel.cc new file mode 100644 index 0000000000000..705c19e020c84 --- /dev/null +++ b/paddle/phi/kernels/sparse/copy_kernel.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/copy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { +namespace sparse { + +template +void CopyCoo(const Context& dev_ctx, + const SparseCooTensor& src, + Place dst_place, + bool blocking, + SparseCooTensor* dst) { + phi::Copy(dev_ctx, + src.non_zero_indices(), + dst_place, + blocking, + dst->mutable_non_zero_indices()); + + phi::Copy(dev_ctx, + src.non_zero_elements(), + dst_place, + blocking, + dst->mutable_non_zero_elements()); + dst->set_dims(src.dims()); +} + +template +void CopyCsr(const Context& dev_ctx, + const SparseCsrTensor& src, + Place dst_place, + bool blocking, + SparseCsrTensor* dst) { + phi::Copy(dev_ctx, + src.non_zero_crows(), + dst_place, + blocking, + dst->mutable_non_zero_crows()); + + phi::Copy(dev_ctx, + src.non_zero_cols(), + dst_place, + blocking, + dst->mutable_non_zero_cols()); + + phi::Copy(dev_ctx, + src.non_zero_elements(), + dst_place, + blocking, + dst->mutable_non_zero_elements()); + dst->set_dims(src.dims()); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL(copy_sparse_coo, + CPU, + ALL_LAYOUT, + phi::sparse::CopyCoo, + ALL_DTYPE) {} + +PD_REGISTER_GENERAL_KERNEL(copy_sparse_csr, + CPU, + ALL_LAYOUT, + phi::sparse::CopyCsr, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(copy_sparse_coo, + GPU, + ALL_LAYOUT, + phi::sparse::CopyCoo, + ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(copy_sparse_csr, + GPU, + ALL_LAYOUT, + phi::sparse::CopyCsr, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/sparse/copy_kernel.h b/paddle/phi/kernels/sparse/copy_kernel.h new file mode 100644 index 0000000000000..a43621a4dfeed --- /dev/null +++ b/paddle/phi/kernels/sparse/copy_kernel.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void CopyCoo(const Context& dev_ctx, + const SparseCooTensor& src, + Place dst_place, + bool blocking, + SparseCooTensor* dst); + +template +void CopyCsr(const Context& dev_ctx, + const SparseCsrTensor& src, + Place dst_place, + bool blocking, + SparseCsrTensor* dst); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 93a335e2f1c35..4ea93f4ad5aaf 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -153,8 +153,9 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_non_zero_num, out_channels}, x.layout()); + DenseTensorMeta values_meta(x.dtype(), + {out_non_zero_num, out_channels}, + x.non_zero_elements().layout()); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); int* out_indices_ptr = out_indices.data(); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 50e95ee0b8a48..21dd24b5a9904 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -121,7 +121,8 @@ void SparseCsrToCooKernel(const Context& dev_ctx, const auto place = dev_ctx.GetPlace(); DenseTensorMeta indices_meta( DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout()); + DenseTensorMeta values_meta( + x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); phi::DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta)); int64_t* coo_indices = indices.mutable_data(place); @@ -174,7 +175,8 @@ void SparseCooToCsrKernel(const Context& dev_ctx, DenseTensorMeta crows_meta( DataType::INT64, {batchs * (rows + 1)}, DataLayout::NCHW); DenseTensorMeta cols_meta(DataType::INT64, {non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout()); + DenseTensorMeta values_meta( + x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); phi::DenseTensor non_zero_crows( phi::make_intrusive(place), std::move(crows_meta)); diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 5b928817f64d7..a512a60b94ff8 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -349,7 +349,10 @@ int ProductRuleBook(const Context& dev_ctx, int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; const int rulebook_rows = 3; const int rulebook_cols = kernel_size * non_zero_num; - rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); + DenseTensorMeta rulebook_meta( + DataType::INT32, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + rulebook->set_meta(rulebook_meta); + dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); int* 
rulebook_ptr = rulebook->data(); const auto x_dims = x.dims(); @@ -608,8 +611,9 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout()); + DenseTensorMeta values_meta(x.dtype(), + {out_non_zero_num, kernel_sizes[4]}, + x.non_zero_elements().layout()); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 8048180e425ea..1451ef45356af 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -257,7 +257,8 @@ void SparseCsrToCooKernel(const Context& dev_ctx, const auto place = dev_ctx.GetPlace(); DenseTensorMeta indices_meta( DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout()); + DenseTensorMeta values_meta( + x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); DenseTensorMeta offsets_meta(DataType::INT32, {batchs}, DataLayout::NCHW); DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta)); DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta)); @@ -385,7 +386,8 @@ void SparseCooToCsrKernel(const Context& dev_ctx, DenseTensorMeta crows_meta( DataType::INT64, {batchs * (rows + 1)}, DataLayout::NCHW); DenseTensorMeta cols_meta(DataType::INT64, {non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout()); + DenseTensorMeta values_meta( + x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); phi::DenseTensor non_zero_crows( phi::make_intrusive(place), std::move(crows_meta)); diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index e42b25e60c422..6baac241426c7 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -16,8 +16,8 @@ #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -26,14 +26,14 @@ namespace phi { template void SplitKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis, std::vector out); template std::vector Split(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& num_or_sections, + const IntArray& num_or_sections, const Scalar& axis) { size_t out_number; if (num_or_sections.GetData().size() == 1) { diff --git a/paddle/phi/kernels/stack_grad_kernel.h b/paddle/phi/kernels/stack_grad_kernel.h new file mode 100644 index 0000000000000..32451e606f26a --- /dev/null +++ b/paddle/phi/kernels/stack_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void StackGradKernel(const Context& dev_ctx, + const DenseTensor& out, + int axis, + std::vector x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/stack_kernel.h b/paddle/phi/kernels/stack_kernel.h new file mode 100644 index 0000000000000..679c74063080e --- /dev/null +++ b/paddle/phi/kernels/stack_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void StackKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/strided_slice_grad_kernel.cc b/paddle/phi/kernels/strided_slice_grad_kernel.cc new file mode 100644 index 0000000000000..38dd360ea66c2 --- /dev/null +++ b/paddle/phi/kernels/strided_slice_grad_kernel.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/strided_slice_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void StridedSliceGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + DenseTensor* x_grad) { + std::vector infer_flags(axes.size(), 1); + std::vector decrease_axis; + StridedSliceRawGradKernel(dev_ctx, + x, + out_grad, + axes, + starts, + ends, + strides, + infer_flags, + decrease_axis, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(strided_slice_grad, + CPU, + ALL_LAYOUT, + phi::StridedSliceGradKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(strided_slice_grad, + GPU, + ALL_LAYOUT, + phi::StridedSliceGradKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} +#endif diff --git a/paddle/phi/kernels/strided_slice_grad_kernel.h b/paddle/phi/kernels/strided_slice_grad_kernel.h index f753402e49833..21d01310b662f 100644 --- a/paddle/phi/kernels/strided_slice_grad_kernel.h +++ b/paddle/phi/kernels/strided_slice_grad_kernel.h @@ -14,21 +14,31 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { +template +void StridedSliceRawGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* x_grad); + template void StridedSliceGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, - const std::vector& infer_flags, - const std::vector& decrease_axis, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, DenseTensor* x_grad); template @@ -37,9 +47,9 @@ void StridedSliceArrayGradKernel( const std::vector& x, const std::vector& out_grad, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, std::vector x_grad); diff --git a/paddle/phi/kernels/strided_slice_kernel.cc b/paddle/phi/kernels/strided_slice_kernel.cc new file mode 100644 index 0000000000000..547d574cd78d0 --- /dev/null +++ b/paddle/phi/kernels/strided_slice_kernel.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/strided_slice_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void StridedSliceKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + DenseTensor* out) { + std::vector infer_flags(axes.size(), 1); + std::vector decrease_axis; + StridedSliceRawKernel( + dev_ctx, x, axes, starts, ends, strides, infer_flags, decrease_axis, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(strided_slice, + CPU, + ALL_LAYOUT, + phi::StridedSliceKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(strided_slice, + GPU, + ALL_LAYOUT, + phi::StridedSliceKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} +#endif diff --git a/paddle/phi/kernels/strided_slice_kernel.h b/paddle/phi/kernels/strided_slice_kernel.h index f23d1c04d5da3..2c8b373bf03a8 100644 --- a/paddle/phi/kernels/strided_slice_kernel.h +++ b/paddle/phi/kernels/strided_slice_kernel.h @@ -14,29 +14,38 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { +template +void StridedSliceRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* out); + template void StridedSliceKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, - const std::vector& infer_flags, - const std::vector& decrease_axis, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, DenseTensor* out); template void StridedSliceArrayKernel(const Context& dev_ctx, const std::vector& x, const std::vector& axes, - const ScalarArray& starts, - const ScalarArray& ends, - const ScalarArray& strides, + const IntArray& starts, + const IntArray& ends, + const IntArray& strides, const std::vector& infer_flags, const std::vector& decrease_axis, std::vector out); diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.cc b/paddle/phi/kernels/strings/strings_empty_kernel.cc index b8df19fdba8cf..433d3ad0a95f6 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.cc +++ b/paddle/phi/kernels/strings/strings_empty_kernel.cc @@ -22,7 +22,7 @@ namespace strings { template void EmptyKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, StringTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.h b/paddle/phi/kernels/strings/strings_empty_kernel.h index 7f416375f6b43..1add1963614d8 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.h +++ b/paddle/phi/kernels/strings/strings_empty_kernel.h @@ -16,7 +16,7 @@ #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/nullary.h" #include "paddle/phi/infermeta/strings/unary.h" @@ -26,7 +26,7 @@ namespace strings { template void EmptyKernel(const 
Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, StringTensor* out); template @@ -48,7 +48,7 @@ StringTensor Empty(const Context& dev_ctx) { } template -StringTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { +StringTensor Empty(const Context& dev_ctx, const IntArray& shape) { StringTensor string_out; MetaTensor meta_out(&string_out); phi::strings::CreateInferMeta(shape, &meta_out); diff --git a/paddle/phi/kernels/tile_grad_kernel.h b/paddle/phi/kernels/tile_grad_kernel.h index 830276c28e053..d40a0f4dfce7b 100644 --- a/paddle/phi/kernels/tile_grad_kernel.h +++ b/paddle/phi/kernels/tile_grad_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #define MAX_RANK_SUPPORTED 6 @@ -25,7 +25,7 @@ template void TileGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, - const ScalarArray& repeat_times, + const IntArray& repeat_times, DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/tile_kernel.h b/paddle/phi/kernels/tile_kernel.h index 924d0149fe345..32e3685e8569e 100644 --- a/paddle/phi/kernels/tile_kernel.h +++ b/paddle/phi/kernels/tile_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #define MAX_RANK_SUPPORTED 6 @@ -24,7 +24,7 @@ namespace phi { template void TileKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& repeat_times, + const IntArray& repeat_times, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/top_k_grad_kernel.h b/paddle/phi/kernels/top_k_grad_kernel.h index f577b982c575d..e4fde92dad8fd 100644 --- a/paddle/phi/kernels/top_k_grad_kernel.h +++ b/paddle/phi/kernels/top_k_grad_kernel.h @@ -14,16 +14,17 @@ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { template void TopkGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& indices, - int k, + const DenseTensor& out_grad, + const Scalar& k, int axis, bool largest, bool sorted, diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index f8547ced41934..2781b79520a5d 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -17,7 +17,7 @@ #include #include -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/infermeta/nullary.h" diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h index 36ce4c3f9eef5..03eca83db03ac 100644 --- a/paddle/phi/kernels/uniform_random_kernel.h +++ b/paddle/phi/kernels/uniform_random_kernel.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" @@ -22,7 +22,7 @@ namespace phi { template void UniformRandomRawKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, @@ -34,7 +34,7 @@ void UniformRandomRawKernel(const Context& dev_ctx, template void UniformRandomKernel(const Context& dev_ctx, - const 
ScalarArray& shape, + const IntArray& shape, DataType dtype, float min, float max, diff --git a/paddle/phi/kernels/unique_kernel.h b/paddle/phi/kernels/unique_kernel.h new file mode 100644 index 0000000000000..353570c8e7da3 --- /dev/null +++ b/paddle/phi/kernels/unique_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UniqueKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts); + +template +void UniqueRawKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_index, + bool return_inverse, + bool return_counts, + const std::vector& axis, + DataType dtype, + bool is_sorted, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts); + +} // namespace phi diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h index 8f818a1b49042..d18bde1c2efab 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.h +++ b/paddle/phi/kernels/unsqueeze_kernel.h @@ -15,7 +15,7 @@ #pragma once -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -23,7 +23,7 @@ namespace phi { template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, - const ScalarArray& axes, + const IntArray& axes, DenseTensor* xshape, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/unstack_grad_kernel.h b/paddle/phi/kernels/unstack_grad_kernel.h new file mode 100644 index 0000000000000..de0e3004d8038 --- /dev/null +++ b/paddle/phi/kernels/unstack_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
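Note on the new unique_kernel.h above: UniqueKernel appears to expose the sorted path, while UniqueRawKernel carries the extra is_sorted flag used by the legacy operator (the unique_sig mapping later in this diff picks between them on that attribute). The sketch below illustrates, with plain STL stand-ins rather than DenseTensor, what the four outputs are assumed to mean; this mirrors numpy.unique-style semantics and is not copied from the phi implementation.

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Stand-in for the four outputs declared by UniqueKernel.
struct UniqueResult {
  std::vector<int> out;          // sorted unique values          ("Out")
  std::vector<int64_t> indices;  // first occurrence in x         ("Indices")
  std::vector<int64_t> index;    // inverse: x[i]==out[index[i]]  ("Index")
  std::vector<int64_t> counts;   // occurrences per unique value  ("Counts")
};

UniqueResult SortedUnique(const std::vector<int>& x) {
  UniqueResult r;
  std::map<int, int64_t> first_pos;  // value -> earliest position in x
  for (int64_t i = 0; i < static_cast<int64_t>(x.size()); ++i) {
    first_pos.emplace(x[i], i);      // emplace keeps the first position seen
  }
  std::map<int, int64_t> slot;       // value -> position in r.out
  for (const auto& kv : first_pos) {
    slot[kv.first] = static_cast<int64_t>(r.out.size());
    r.out.push_back(kv.first);
    r.indices.push_back(kv.second);
    r.counts.push_back(0);
  }
  for (int v : x) {
    const int64_t s = slot[v];
    r.index.push_back(s);
    ++r.counts[s];
  }
  return r;
}

int main() {
  const auto r = SortedUnique({3, 1, 3, 2, 1});
  for (int v : r.out) std::cout << v << ' ';  // prints: 1 2 3
  std::cout << '\n';
  return 0;
}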
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UnStackGradKernel(const Context& dev_ctx, + const std::vector& x, + int axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/unstack_kernel.h b/paddle/phi/kernels/unstack_kernel.h new file mode 100644 index 0000000000000..0494aa6327c21 --- /dev/null +++ b/paddle/phi/kernels/unstack_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UnStackKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + int num, + std::vector outs); + +} // namespace phi diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index d43126d56e88c..6668ae39cbdbe 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -55,7 +55,7 @@ void FullValueXPU(const Context& dev_ctx, DenseTensor* tensor, VType val) { template void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, + const IntArray& shape, const Scalar& val, DataType dtype, DenseTensor* out) { diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc index 0d2aed68a72a5..3b7e3f063d6c1 100644 --- a/paddle/phi/ops/compat/index_sample_sig.cc +++ b/paddle/phi/ops/compat/index_sample_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature IndexSampleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("index_sample_grad", - {GradVarName("Out"), "X", "Index"}, + {"X", "Index", GradVarName("Out")}, {}, {GradVarName("X")}); } diff --git a/paddle/phi/ops/compat/interpolate_sig.cc b/paddle/phi/ops/compat/interpolate_sig.cc new file mode 100644 index 0000000000000..ba0e971e4ab00 --- /dev/null +++ b/paddle/phi/ops/compat/interpolate_sig.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BilinearInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bilinear_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature NearestInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("nearest_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} +KernelSignature TrilinearInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("trilinear_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature LinearInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("linear_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature BicubicInterpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bicubic_interp_v2", + {"X", "OutSize", "SizeTensor", "Scale"}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {"Out"}); +} + +KernelSignature BilinearInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "bilinear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +KernelSignature NearestInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "nearest_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} +KernelSignature TrilinearInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "trilinear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +KernelSignature LinearInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "linear_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +KernelSignature BicubicInterpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "bicubic_interp_v2_grad", + {"X", "OutSize", "SizeTensor", "Scale", GradVarName("Out")}, + {"data_layout", + "out_d", + "out_h", + "out_w", + "scale", + "interp_method", + "align_corners", + "align_mode"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2, + phi::BilinearInterpOpArgumentMapping); 
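The five forward mappings above differ only in the phi kernel name; the input, attribute, and output lists are identical. Purely as an illustration (not part of this diff, and using a simplified stand-in for phi::KernelSignature), the shared lists could be written once:

#include <string>
#include <vector>

// Simplified stand-in for phi::KernelSignature, for illustration only.
struct SimpleSignature {
  std::string kernel_name;
  std::vector<std::string> inputs;
  std::vector<std::string> attrs;
  std::vector<std::string> outputs;
};

// All interp_v2 forward ops share these inputs/attrs/outputs; only the
// kernel name differs (bilinear/nearest/trilinear/linear/bicubic).
SimpleSignature MakeInterpSignature(const std::string& kernel_name) {
  return {kernel_name,
          {"X", "OutSize", "SizeTensor", "Scale"},
          {"data_layout", "out_d", "out_h", "out_w", "scale",
           "interp_method", "align_corners", "align_mode"},
          {"Out"}};
}

// e.g. MakeInterpSignature("bilinear_interp_v2") reproduces the result of
// BilinearInterpOpArgumentMapping above.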
+PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2, + phi::NearestInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2, + phi::TrilinearInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2, + phi::LinearInterpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2, + phi::BicubicInterpOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2_grad, + phi::BilinearInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2_grad, + phi::NearestInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2_grad, + phi::TrilinearInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2_grad, + phi::LinearInterpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2_grad, + phi::BicubicInterpGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/masked_select_sig.cc b/paddle/phi/ops/compat/masked_select_sig.cc index 8083b123bcff5..ec0eb90315bc1 100644 --- a/paddle/phi/ops/compat/masked_select_sig.cc +++ b/paddle/phi/ops/compat/masked_select_sig.cc @@ -24,7 +24,7 @@ KernelSignature MaskedSelectOpArgumentMapping( KernelSignature MaskedSelectGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("masked_select_grad", - {GradVarName("Y"), "X", "Mask"}, + {"X", "Mask", GradVarName("Y")}, {}, {GradVarName("X")}); } diff --git a/paddle/phi/ops/compat/nll_loss_sig.cc b/paddle/phi/ops/compat/nll_loss_sig.cc index f274d7f77c5c0..87a060ce7a672 100644 --- a/paddle/phi/ops/compat/nll_loss_sig.cc +++ b/paddle/phi/ops/compat/nll_loss_sig.cc @@ -29,7 +29,7 @@ KernelSignature NllLossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "nll_loss_grad", - {"X", "Label", "Total_weight", "Weight", GradVarName("Out")}, + {"X", "Label", "Weight", "Total_weight", GradVarName("Out")}, {"ignore_index", "reduction"}, {GradVarName("X")}); } diff --git a/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc b/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc new file mode 100644 index 0000000000000..9cfc5ded90a49 --- /dev/null +++ b/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
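On the grad argument-mapping reorders above (masked_select_grad and nll_loss_grad here, top_k_grad later in this diff): the input list must match the parameter order of the corresponding phi grad kernel, which now takes the forward inputs first and the incoming gradient last. A minimal sketch of that convention, assuming masked_select_grad scatters the selected gradient back through the mask (simplified types, not the real MaskedSelectGradKernel):

#include <cstddef>
#include <vector>

using Tensor = std::vector<float>;
using Mask = std::vector<bool>;

// Parameter order mirrors the mapping {"X", "Mask", GradVarName("Y")}:
// forward inputs first, incoming gradient last, output gradient written
// through the final pointer argument.
void MaskedSelectGrad(const Tensor& x, const Mask& mask,
                      const Tensor& y_grad, Tensor* x_grad) {
  x_grad->assign(x.size(), 0.0f);
  std::size_t j = 0;  // walks the compacted gradient of the selected values
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (mask[i]) {
      (*x_grad)[i] = y_grad[j++];
    }
  }
}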
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SoftmaxWithCrossEntropyOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cross_entropy_with_softmax", + {"Logits", "Label"}, + {"soft_label", + "use_softmax", + "numeric_stable_mode", + "ignore_index", + "axis"}, + {"Softmax", "Loss"}); +} + +KernelSignature SoftmaxWithCrossEntropyGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cross_entropy_with_softmax_grad", + {"Label", "Softmax", GradVarName("Loss")}, + {"soft_label", + "use_softmax", + "numeric_stable_mode", + "ignore_index", + "axis"}, + {GradVarName("Logits")}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy, + cross_entropy_with_softmax); +PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy_grad, + cross_entropy_with_softmax_grad); + +PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy, + phi::SoftmaxWithCrossEntropyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy_grad, + phi::SoftmaxWithCrossEntropyGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/stack_sig.cc b/paddle/phi/ops/compat/stack_sig.cc new file mode 100644 index 0000000000000..97768eb89026e --- /dev/null +++ b/paddle/phi/ops/compat/stack_sig.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "stack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/strided_slice_sig.cc b/paddle/phi/ops/compat/strided_slice_sig.cc index 70ce2e3e07ce9..9fb70af0dea51 100644 --- a/paddle/phi/ops/compat/strided_slice_sig.cc +++ b/paddle/phi/ops/compat/strided_slice_sig.cc @@ -57,14 +57,14 @@ KernelSignature StridedSliceOpArgumentMapping( "decrease_axis"}; paddle::SmallVector outputs = {"Out"}; - std::string op_type; + std::string kernel_name; if (ctx.IsDenseTensorVectorInput("Input")) { - op_type = "strided_slice_array"; + kernel_name = "strided_slice_array"; } else { - op_type = "strided_slice"; + kernel_name = "strided_slice_raw"; } // NOTE(dev): Use this to avoid regularization. - KernelSignature sig(op_type, inputs, attrs, outputs); + KernelSignature sig(kernel_name, inputs, attrs, outputs); return sig; } @@ -106,15 +106,15 @@ KernelSignature StridedSliceGradOpArgumentMapping( "decrease_axis"}; paddle::SmallVector outputs = {GradVarName("Input")}; - std::string op_type; + std::string kernel_name; if (ctx.IsDenseTensorVectorInput("Input")) { - op_type = "strided_slice_array_grad"; + kernel_name = "strided_slice_array_grad"; } else { - op_type = "strided_slice_grad"; + kernel_name = "strided_slice_raw_grad"; } // NOTE(dev): Use this to avoid regularization. 
- KernelSignature sig(op_type, inputs, attrs, outputs); + KernelSignature sig(kernel_name, inputs, attrs, outputs); return sig; } @@ -132,573 +132,273 @@ NOTE: The following codes are for 'get_compat_kernel_signature.py' ############################ Forward ############################ -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "EndsTensor", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "EndsTensor", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "EndsTensor", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "EndsTensorList", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "EndsTensorList", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "EndsTensorList", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "ends", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "ends", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensor", "ends", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "EndsTensor", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "EndsTensor", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "EndsTensor", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "EndsTensorList", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "EndsTensorList", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "EndsTensorList", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", 
"ends", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "ends", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "StartsTensorList", "ends", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "EndsTensor", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "EndsTensor", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "EndsTensor", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "EndsTensorList", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "EndsTensorList", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "EndsTensorList", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "ends", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "ends", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice}", {"Input"}, +return KernelSignature("strided_slice_raw", {"Input"}, {"axes", "starts", "ends", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "EndsTensor", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "EndsTensor", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "EndsTensor", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "EndsTensorList", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "EndsTensorList", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "EndsTensorList", "starts","infer_flags", 
"decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "ends", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "ends", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensor", "ends", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "EndsTensor", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "EndsTensor", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "EndsTensor", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "EndsTensorList", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "EndsTensorList", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "EndsTensorList", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "ends", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "ends", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "StartsTensorList", "ends", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "EndsTensor", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "EndsTensor", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "EndsTensor", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "EndsTensorList", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return 
KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "EndsTensorList", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "EndsTensorList", "starts","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "ends", "StartsTensor","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "ends", "StartsTensorList","infer_flags", "decrease_axis"}, {"Out"}); -return KernelSignature("{strided_slice_array}", {"Input"}, +return KernelSignature("strided_slice_array", {"Input"}, {"axes", "starts", "ends", "starts","infer_flags", "decrease_axis"}, {"Out"}); - -############################ Backward ############################ - - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensor", -"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensor", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensor", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensorList", -"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensorList", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensorList", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "ends", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "ends", "StartsTensorList","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensor", "ends", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensor", -"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensor", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensor", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensorList", 
-"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensorList", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensorList", -"starts","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "ends", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "ends", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "StartsTensorList", "ends", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "EndsTensor", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "EndsTensor", "StartsTensorList","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "EndsTensor", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "EndsTensorList", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "EndsTensorList", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "EndsTensorList", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "ends", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "ends", "StartsTensorList","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_grad}", {"Input", GradVarName("Out")}, - {"axes", "starts", "ends", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensor", -"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensor", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensor", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensorList", 
-"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensorList", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "EndsTensorList", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "ends", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "ends", "StartsTensorList","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensor", "ends", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensor", -"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensor", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensor", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensorList", -"StartsTensor","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensorList", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "EndsTensorList", -"starts","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "ends", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "ends", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "StartsTensorList", "ends", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "EndsTensor", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "EndsTensor", "StartsTensorList","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "EndsTensor", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - 
-return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "EndsTensorList", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "EndsTensorList", -"StartsTensorList","infer_flags", "decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "EndsTensorList", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "ends", "StartsTensor","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "ends", "StartsTensorList","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); - -return KernelSignature("{strided_slice_array_grad}", {"Input", -GradVarName("Out")}, - {"axes", "starts", "ends", "starts","infer_flags", -"decrease_axis"}, - {GradVarName("Input")}); */ diff --git a/paddle/phi/ops/compat/top_k_sig.cc b/paddle/phi/ops/compat/top_k_sig.cc index 9bf922b3d1b58..8488a18e34ce1 100644 --- a/paddle/phi/ops/compat/top_k_sig.cc +++ b/paddle/phi/ops/compat/top_k_sig.cc @@ -29,7 +29,7 @@ KernelSignature TopkOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature TopkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("top_k_grad", - {GradVarName("Out"), "X", "Indices"}, + {"X", "Indices", GradVarName("Out")}, {"k", "axis", "largest", "sorted"}, {GradVarName("X")}); } diff --git a/paddle/phi/ops/compat/unique_sig.cc b/paddle/phi/ops/compat/unique_sig.cc new file mode 100644 index 0000000000000..2a7ba543012f3 --- /dev/null +++ b/paddle/phi/ops/compat/unique_sig.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) { + bool is_sorted = paddle::any_cast(ctx.Attr("is_sorted")); + if (is_sorted) { + return KernelSignature( + "unique", + {"X"}, + {"return_index", "return_inverse", "return_counts", "axis", "dtype"}, + {"Out", "Indices", "Index", "Counts"}); + } else { + return KernelSignature("unique_raw", + {"X"}, + {"return_index", + "return_inverse", + "return_counts", + "axis", + "dtype", + "is_sorted"}, + {"Out", "Indices", "Index", "Counts"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unique, phi::UniqueOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unstack_sig.cc b/paddle/phi/ops/compat/unstack_sig.cc new file mode 100644 index 0000000000000..41d7fc120a9ef --- /dev/null +++ b/paddle/phi/ops/compat/unstack_sig.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UnStackGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "unstack_grad", {GradVarName("Y")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnStackGradOpArgumentMapping); diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index 8e92c9fc3aa5d..190fef3d94657 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -20,8 +20,8 @@ #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index 6fe34a6891a35..07530f70b7ab5 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_registry.h" @@ -52,7 +52,7 @@ void FakeDot(const Context& dev_ctx, phi::dtype::float16 fake_attr_f16, phi::DataType fake_attr_dtype, const phi::Scalar& fake_attr_scalar, - const phi::ScalarArray& fake_attr_scalar_array, + const phi::IntArray& fake_attr_int_array, const std::vector& fake_attr_int64_vec, const std::vector& fake_attr_int_vec, phi::DenseTensor* out, @@ -253,7 +253,7 @@ TEST(CustomKernel, custom_kernel_dot) { paddle::framework::LoDTensor tmp_tensor; tmp_tensor.mutable_data({1}, phi::TransToPhiPlace(backend)); phi::Scalar fake_attr_scalar{tmp_tensor}; - phi::ScalarArray fake_attr_scalar_array; + phi::IntArray fake_attr_int_array; std::vector fake_attr_int64_vec; std::vector fake_attr_int_vec; @@ -265,7 +265,7 @@ TEST(CustomKernel, custom_kernel_dot) { kernel_context.EmplaceBackAttr(fake_attr_f16); kernel_context.EmplaceBackAttr(fake_attr_dtype); kernel_context.EmplaceBackAttr(fake_attr_scalar); - kernel_context.EmplaceBackAttr(fake_attr_scalar_array); + kernel_context.EmplaceBackAttr(fake_attr_int_array); kernel_context.EmplaceBackAttr(fake_attr_int64_vec); kernel_context.EmplaceBackAttr(fake_attr_int_vec); diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index 399112d09c2ad..c90e2f3dbcded 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ -100,7 +100,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) { phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); - ScalarArray num_or_sections{2, 2}; + IntArray num_or_sections{2, 2}; Scalar axis{0}; ctx.EmplaceBackAttr(num_or_sections); ctx.EmplaceBackAttr(axis); diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 4800e1402ba56..5e6b097ad367b 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -218,11 +218,8 @@ void TestConv3dBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices2, 0); - DenseTensor h_features_tensor = phi::Empty( - dev_ctx_cpu, - DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), - {d_out.nnz()}, - d_out.layout())); + DenseTensor h_features_tensor = + phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, d_out.non_zero_elements(), @@ -243,15 +240,11 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm); - DenseTensor h_features_grad = phi::Empty( - dev_ctx_cpu, - DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); + DenseTensor h_features_grad = phi::EmptyLike(dev_ctx_cpu, grads[0]); phi::Copy(dev_ctx_gpu, grads[0], phi::CPUPlace(), true, &h_features_grad); f_verify(h_features_grad.data(), features_grad); - DenseTensor h_kernel_grad = phi::Empty( - dev_ctx_cpu, - DenseTensorMeta(grads[1].dtype(), grads[1].dims(), grads[1].layout())); + DenseTensor h_kernel_grad = phi::EmptyLike(dev_ctx_cpu, grads[1]); phi::Copy(dev_ctx_gpu, grads[1], phi::CPUPlace(), true, &h_kernel_grad); f_verify(h_kernel_grad.data(), kernel_grad); } diff --git 
a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 27673704168c9..80b3392a611b0 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -56,6 +56,10 @@ void TestMaxPoolBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx_cpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); dev_ctx_cpu.Init(); const int in_channels = x_dims[4]; @@ -138,11 +142,8 @@ void TestMaxPoolBase(const std::vector& indices, phi::Copy( dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); - DenseTensor d_features_tensor = phi::Empty( - dev_ctx_gpu, - DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), - {non_zero_num, in_channels}, - DataLayout::NHWC)); + DenseTensor d_features_tensor = + phi::EmptyLike(dev_ctx_gpu, features_tensor); phi::Copy( dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); @@ -178,11 +179,8 @@ void TestMaxPoolBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices2, 0); - DenseTensor h_features_tensor = phi::Empty( - dev_ctx_cpu, - DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), - {d_out.nnz()}, - d_out.layout())); + DenseTensor h_features_tensor = + phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, d_out.non_zero_elements(), @@ -198,9 +196,7 @@ void TestMaxPoolBase(const std::vector& indices, d_out, d_out.non_zero_elements(), kernel_sizes); - DenseTensor h_features_grad = phi::Empty( - dev_ctx_cpu, - DenseTensorMeta(x_grad.dtype(), x_grad.dims(), x_grad.layout())); + DenseTensor h_features_grad = phi::EmptyLike(dev_ctx_cpu, x_grad); phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad); f_verify(h_features_grad.data(), features_grad); } diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 4092922d01322..c4127527b390d 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -327,8 +327,12 @@ set PreferredToolArchitecture=x64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% -if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 -set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH% +if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.2 +set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR:/=\%\bin;%CUDA_TOOLKIT_ROOT_DIR:/=\%\libnvvp;%PATH% + +rem CUDA_TOOLKIT_ROOT_DIR in cmake must use / rather than \ +set TENSORRT_ROOT=%TENSORRT_ROOT:\=/% +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR:\=/% rem install ninja if GENERATOR is Ninja if %GENERATOR% == "Ninja" ( @@ -427,14 +431,16 @@ echo cmake .. 
-G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ +-DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ +-DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" goto:eof :cmake_error @@ -699,7 +705,8 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> %work_dir%\win_cmake.sh +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% ^ +-DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" >> %work_dir%\win_cmake.sh %cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU% diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index b9b2e87aeba92..d1220e4537582 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2778,7 +2778,8 @@ function exec_samplecode_test() { if [ "$1" = "cpu" ] ; then python sampcd_processor.py cpu; example_error=$? elif [ "$1" = "gpu" ] ; then - python sampcd_processor.py --threads=16 gpu; example_error=$? + SAMPLE_CODE_EXEC_THREADS=${SAMPLE_CODE_EXEC_THREADS:-2} + python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} gpu; example_error=$? 
fi if [ "$example_error" != "0" ];then echo "Code instance execution failed" >&2 diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index c2f6896ab6c04..30b8cc1538590 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -142,4 +142,4 @@ def detect_device(self): if __name__ == '__main__': d = Device.parse_device() - print(d.get_selected_flag()) + print(d.get_selected_devices()) diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index bbcb7c81d6e65..3763bac041451 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -93,7 +93,7 @@ def build_pod(self): "PADDLE_RANK_IN_NODE": str(i), } if self.pod.replicas == 1: - e.update({selected_dev_key: selected_dev_list}) + e.update({selected_dev_key: ",".join(selected_dev_list)}) else: e.update({selected_dev_key: selected_dev_list[i]}) self.add_container(envs=e, log_tag=i) @@ -134,7 +134,7 @@ def run(self): if ok: self.job.replicas = replicas else: - self.ctx.logger.warnning("peer not ready {}".format(self.job)) + self.ctx.logger.warning("peer not ready {}".format(self.job)) break self.ctx.logger.debug("Run {}".format(self.job)) diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index 7105cae9024f2..1f43b6ce04bac 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -162,7 +162,7 @@ def logs(self, fn=None, offset=0, whence=1, lines=1000): if idx > lines: break finally: - return self._log_handler.tell() + return def tail(self, length=3000): if not self._log_handler: diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 00daaf986bfa0..007aaeb4fed67 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -16,7 +16,7 @@ import os import paddle.fluid as fluid -import paddle.distributed.fleet as fleet +from paddle.distributed import fleet from paddle.fluid import core from paddle.distributed.ps.utils.public import * from paddle.fluid.framework import Program @@ -26,7 +26,7 @@ from paddle.fluid.framework import Variable, Parameter from paddle.distributed.fleet.runtime.runtime_base import RuntimeBase from paddle.distributed.fleet.base.private_helper_function import wait_server_ready -import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2 +from paddle.distributed.fleet.proto import the_one_ps_pb2 from paddle.fluid.communicator import Communicator, HeterClient from google.protobuf import text_format @@ -518,7 +518,7 @@ def _set(self, table_proto): table_proto.table_id = self.idx table_proto.table_class = 'BarrierTable' table_proto.shard_num = 256 - table_proto.type = ps_pb2.PS_OTHER_TABLE + table_proto.type = the_one_ps_pb2.PS_OTHER_TABLE table_proto.accessor.accessor_class = "CommMergeAccessor" table_proto.accessor.fea_dim = 0 @@ -544,7 +544,7 @@ def __init__(self, idx, tensor_dict, role_maker): def _set(self, table_proto): table_proto.table_id = self.idx - table_proto.type = ps_pb2.PS_OTHER_TABLE + table_proto.type = the_one_ps_pb2.PS_OTHER_TABLE table_proto.table_class = self.tensor_dict.get("tensor_table_class", '') table_proto.accessor.accessor_class = "CommMergeAccessor" @@ -573,7 +573,7 @@ def _set(self, 
table_proto): return table_proto.table_id = ctx.table_id() table_proto.table_class = self.table_class - table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.type = the_one_ps_pb2.PS_SPARSE_TABLE table_proto.shard_num = self.shard_num self.common.table_name = self.context['grad_name_to_param_name'][ @@ -632,7 +632,7 @@ def _set(self, table_proto): return table_proto.table_id = ctx.table_id() table_proto.table_class = self.table_class - table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.type = the_one_ps_pb2.PS_SPARSE_TABLE table_proto.shard_num = self.shard_num table_proto.accessor.accessor_class = 'CommMergeAccessor' @@ -664,7 +664,7 @@ def _set(self, table_proto): table_proto.table_id = ctx.table_id() - table_proto.type = ps_pb2.PS_DENSE_TABLE + table_proto.type = the_one_ps_pb2.PS_DENSE_TABLE table_proto.table_class = "CommonDenseTable" table_proto.shard_num = 256 @@ -748,7 +748,7 @@ def __init__(self, context): self.service = self._get_service() self.fs_client = self._get_fs_client() - self.ps_desc = ps_pb2.PSParameter() + self.ps_desc = the_one_ps_pb2.PSParameter() def _get_tensor_tables(self): program_idx = 0 @@ -806,7 +806,7 @@ def build_server_desc(self): table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( ) table._set(table_proto) - if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: + if table_proto.type == the_one_ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: self.sparse_table_maps[ table_proto.common.table_name] = table_proto.table_id diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index fde42e35e0739..c73ea8b5b0e1a 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -530,21 +530,44 @@ def fused_seqpool_cvm(input, use_cvm=True, cvm_offset=2): """ - **Embedding Sequence pool** + :api_attr: Static Graph - This layer is the fusion of sequence_pool and continuous_value_model. + This OP is the fusion of sequence_pool and continuous_value_model op. - **Notes: The Op only receives List of LoDTensor as input, only support SUM pooling now. + **Note:** The Op only receives List of LoDTensor as input, only support SUM pooling now. Args: input(Variable|list of Variable): Input is List of LoDTensor. pool_type(str): pooling type, only support SUM pooling now. cvm(Variable): cvm Variable. - pad_value(float): padding value of sequence pool. - use_cvm(bool): use cvm or not. + pad_value(float, optional): padding value of sequence pool. Default: 0.0. + use_cvm(bool, optional): use cvm or not. Default: True. + cvm_offset(int, optional): cvm offset. Default: 2, which means cvm contains show, click. + Returns: Variable|list of Variable: The tensor variable storing sequence pool and cvm of input. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + paddle.enable_static() + + data = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=1) + data2 = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=1) + inputs = [data, data2] + embs = fluid.layers.nn._pull_box_sparse(input=inputs, size=11, is_distributed=True, is_sparse=True) + + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64", lod_level=1) + ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1) + show_clk = paddle.cast(paddle.concat([ones, label], axis=1), dtype='float32') + show_clk.stop_gradient = True + + cvms = fluid.contrib.layers.fused_seqpool_cvm(embs, 'sum', show_clk) + + """ helper = LayerHelper('fused_seqpool_cvm', **locals()) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index e1fabf9aeda10..747fe7d32cb65 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -110,6 +110,9 @@ def eager_trace_op(self, arg_list = [] for i in range(len(op_args)): + # initialized with None + arg_to_append = None + arg_name = op_args[i] arg_type = op_args_type[i] if arg_name in inputs.keys(): @@ -117,14 +120,19 @@ def eager_trace_op(self, elif arg_name in outputs.keys(): arg_to_append = outputs[arg_name] else: - if "Num" in arg_name: + if "Num" in arg_name[-3:]: # Remove "Num" suffix to get out_name out_name = arg_name[:-3] assert out_name in outputs.keys() num_outs = len(outputs[out_name]) arg_to_append = num_outs - else: - arg_to_append = None + # NOTE(dev): For MasterParam/MasterParamOut in optimizer op + elif "Var" in arg_name[-3:]: + out_name = arg_name[:-3] + if out_name in outputs.keys(): + arg_to_append = outputs[out_name] + elif out_name in inputs.keys(): + arg_to_append = inputs[out_name] if arg_to_append is None: arg_list.append(arg_to_append) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 60c144d550028..d67edf3eb1fdf 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -20,7 +20,7 @@ import paddle from .. import framework -from ..framework import convert_np_dtype_to_dtype_ +from ..framework import convert_np_dtype_to_dtype_, _in_legacy_dygraph from .. import core from ..
import unique_name from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, EagerParamBase @@ -798,7 +798,11 @@ def _set_grad_ivar(self, value): @framework.dygraph_only def clone(self): - return _C_ops.assign(self) + if _in_legacy_dygraph(): + output = core.VarBase() + else: + output = core.eager.Tensor() + return _C_ops.assign(self, output) @framework.dygraph_only def value(self): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6d32632f2b445..b8ed2716fc7d5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -171,6 +171,12 @@ def _test_eager_guard(place=None): if not _already_patch_eager_tensor: monkey_patch_varbase() monkey_patch_math_varbase() + + # Ugly setting + from paddle.tensor.manipulation import fill_, zero_ + setattr(core.eager.Tensor, 'fill_', fill_) + setattr(core.eager.Tensor, 'zero_', zero_) + _already_patch_eager_tensor = True try: yield diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 138e968a0b385..785a3e6eac132 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -974,6 +974,19 @@ def get_inputs_outputs_in_block(current_block, inner_inputs, inner_outputs, :return: inner_inputs, inner_outputs """ + def is_ignore_vars(op, var_name): + # NOTE(dev): There are some persistable var created in some non-standard API + # such as "contrib.layers.shuffle_batch". It create a "Seed" used both in + # Input and Output. This var shall not be considered as a loop_var in + # control_flow. + IGNORE_VAR_NAMES = {"shuffle_batch": ["shuffle_batch_seed"]} + if op.type in IGNORE_VAR_NAMES: + var_names = IGNORE_VAR_NAMES[op.type] + for name in var_names: + if name in var_name: + return True + return False + # Step1: update inner_inputs and inner_outputs # NOTE: Here assumes that all variables are input or output of Ops, # but some variables are created without appendding a real op. @@ -982,7 +995,8 @@ def get_inputs_outputs_in_block(current_block, inner_inputs, inner_outputs, assert isinstance(op, Operator) for iname in op.input_names: for in_var_name in op.input(iname): - if in_var_name not in inner_outputs: + if in_var_name not in inner_outputs and not is_ignore_vars( + op, in_var_name): inner_inputs.add(in_var_name) for oname in op.output_names: diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index bf7309e474a17..a99838cb27d4c 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -20,7 +20,7 @@ from six.moves import cStringIO from ..proto import framework_pb2 -from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_, _non_static_mode +from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from ..layer_helper import LayerHelper from ..data_feeder import check_variable_and_dtype from paddle import _C_ops @@ -257,6 +257,12 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): + final_state_op_type = "final_state_%s" % op_type + if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type): + op = getattr(_C_ops, final_state_op_type) + return op(x) + # TODO(dev): Because some ops' yaml has not been migrated. 
+ # Replace it with _in_legacy_dygraph while all yaml work is done. if _non_static_mode(): op = getattr(_C_ops, op_type) return op(x) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d1ef9d6d8b4ea..9567490551c28 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -40,6 +40,7 @@ import paddle from paddle.utils import deprecated from paddle import _C_ops +from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph __all__ = [ 'fc', @@ -204,7 +205,6 @@ def _elementwise_op_in_dygraph(x, op_name=None): op = getattr(_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) - return dygraph_utils._append_activation_in_dygraph( out, act, use_mkldnn=use_mkldnn) @@ -11426,6 +11426,10 @@ def strided_slice(input, axes, starts, ends, strides): sliced_2 = fluid.layers.strided_slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides_2) # sliced_2 is input[:, 0:3:1, 0:2:1, 2:4:2]. """ + if in_dygraph_mode(): + return _C_ops.final_state_strided_slice(input, axes, starts, ends, + strides) + helper = LayerHelper('strided_slice', **locals()) check_variable_and_dtype(input, 'input', @@ -11590,7 +11594,11 @@ def shape(input): res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) print(res) # [array([ 3, 100, 100], dtype=int32)] """ - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_shape(input) + out.stop_gradient = True + return out + if _in_legacy_dygraph(): out = _C_ops.shape(input) out.stop_gradient = True return out @@ -12529,6 +12537,9 @@ def logical_and(x, y, out=None, name=None): res = paddle.logical_and(x, y) print(res) # [True False True False] """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_and(x, y) + return _logical_op( op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True) @@ -12568,6 +12579,8 @@ def logical_or(x, y, out=None, name=None): res = paddle.logical_or(x, y) print(res) # [[ True True] [ True False]] """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_or(x, y) return _logical_op( op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True) @@ -12607,6 +12620,9 @@ def logical_xor(x, y, out=None, name=None): res = paddle.logical_xor(x, y) print(res) # [[False, True], [ True, False]] """ + if in_dygraph_mode(): + return _C_ops.final_state_logical_xor(x, y) + return _logical_op( op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True) @@ -12639,7 +12655,8 @@ def logical_not(x, out=None, name=None): res = paddle.logical_not(x) print(res) # [False True False True] """ - + if in_dygraph_mode(): + return _C_ops.final_state_logical_not(x) return _logical_op( op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index f9f65ffb57f90..252e4931b39a4 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -606,15 +606,24 @@ def assign(input, output=None): # isinstance(VarBase, Variable) == False. It will cause return None # after this api. 
if isinstance(input, (Variable, core.VarBase)): - check_dtype(input.dtype, 'input', [ - 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', - 'uint8', 'bool' - ], 'assign', '(When the type of input in assign is Variable.)') - if output is None: - output = helper.create_variable_for_type_inference( - dtype=input.dtype) - helper.append_op( - type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) + if _non_static_mode(): + if output is None: + if _in_legacy_dygraph(): + output = core.VarBase() + else: + output = core.eager.Tensor() + _C_ops.assign(input, output) + else: + check_dtype(input.dtype, 'input', [ + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', + 'uint8', 'bool' + ], 'assign', '(When the type of input in assign is Variable.)') + if output is None: + output = helper.create_variable_for_type_inference( + dtype=input.dtype) + helper.append_op( + type='assign', inputs={'X': [input]}, + outputs={'Out': [output]}) elif isinstance(input, numpy.ndarray): # Not support [var, var, ...] currently. if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input): @@ -663,9 +672,7 @@ def assign(input, output=None): }) if is_inplace and _non_static_mode(): - # TODO(jiabin): Remove this when we support inplace - if _in_legacy_dygraph(): - output._bump_inplace_version() + output._bump_inplace_version() return output diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index d6840ed62810e..8a9a1e19205fb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -631,4 +631,5 @@ def test_train(self): if __name__ == "__main__": - unittest.main() + with fluid.framework._test_eager_guard(): + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 88c6060abf7d9..19965821e8750 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -610,4 +610,5 @@ def predict_dygraph_jit(self, batch): if __name__ == "__main__": - unittest.main() + with fluid.framework._test_eager_guard(): + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py index 509accd8f8ab2..1d45e906cd378 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py @@ -116,4 +116,5 @@ def test_resnet(self): if __name__ == '__main__': - unittest.main() + with fluid.framework._test_eager_guard(): + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py index cf5c2b731141f..49d114730e4ed 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py @@ -125,4 +125,5 @@ def test_resnet(self): if __name__ == '__main__': - unittest.main() + with fluid.framework._test_eager_guard(): + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py index 57d7d70c66a5b..cf8b7b3516b37 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py @@ -41,7 +41,9 @@ def generate_input(): "data_layout": "NCHW", "interp_method": "nearest", "align_corners": False, + "align_mode": 1, "scale": [2., 2.], + "out_d": 0, "out_h": 0, "out_w": 0 } diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ae74fbd1c1e09..8d14516374038 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -781,10 +781,12 @@ def parse_attri_value(name, op_inputs, op_attrs): if arg_name in api_ignore_param_list: results.append(get_default(idx, api_defaults)) else: - assert idx_of_op_proto_arguments < len( - input_arguments), "Assert False." - tmp = input_arguments[idx_of_op_proto_arguments] - idx_of_op_proto_arguments += 1 + if (idx_of_op_proto_arguments < len(input_arguments)): + tmp = input_arguments[idx_of_op_proto_arguments] + idx_of_op_proto_arguments += 1 + else: + tmp = Empty() # use the default value + if isinstance(tmp, Empty): results.append(get_default(idx, api_defaults)) else: @@ -1356,6 +1358,9 @@ def __init__(self, op_test, expect_dict): self.op_test = op_test # stop the op_test object. self.op_type = op_test.op_type + def init(self): + pass + def convert_uint16_to_float(self, actual_np, expect_np): raise NotImplementedError("base class, not implement!") @@ -1387,7 +1392,7 @@ def _compare_numpy(self, name, actual_np, expect_np): rtol=self.rtol if hasattr(self, 'rtol') else 1e-5, equal_nan=equal_nan), "Output (" + name + ") has diff at " + str(place) + " in " + - self.checker_name + " checker") + self.checker_name) def _compare_list(self, name, actual, expect): """ if expect is a tuple, we need to compare list. 
@@ -1403,7 +1408,7 @@ def compare_single_output_with_expect(self, name, expect): # NOTE(zhiqiu): np.allclose([], [1.]) returns True # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_np.size == 0: - self.op_test.assertTrue(actual_np.size == 0) + self.op_test.assertTrue(actual_np.size == 0) # }}} self._compare_numpy(name, actual_np, expect_np) if isinstance(expect, tuple): self._compare_list(name, actual, expect) @@ -1431,10 +1436,14 @@ def check(self): the main enter point of Checker class """ + self.init() self.calculate_output() self.compare_outputs_with_expects() class StaticChecker(Checker): + def init(self): + self.checker_name = "static checker" + def calculate_output(self): outs, fetch_list = self.op_test._calc_output( place, no_check_set=no_check_set) @@ -1474,6 +1483,9 @@ def _compare_list(self, name, actual, expect): "Output (" + name + ") has different lod at " + str(place)) class DygraphChecker(Checker): + def init(self): + self.checker_name = "dygraph checker" + def calculate_output(self): self.outputs = self.op_test._calc_dygraph_output( place, no_check_set=no_check_set) @@ -1519,18 +1531,21 @@ def _compare_numpy(self, name, actual_np, expect_np): rtol=self.rtol if hasattr(self, 'rtol') else 1e-5, equal_nan=equal_nan), "Output (" + name + ") has diff at " + str(place) + - " in " + self.checker_name + " checker") + " in " + self.checker_name) class EagerChecker(DygraphChecker): + def init(self): + self.checker_name = "eager checker" + def calculate_output(self): # we only check end2end api when check_eager=True - self.is_python_api_test = True with _test_eager_guard(): + self.is_python_api_test = True eager_dygraph_outs = self.op_test._calc_python_api_output( place) if eager_dygraph_outs is None: - # missing KernelSignature, fall back to eager middle output. self.is_python_api_test = False + # missing KernelSignature, fall back to eager middle output. 
eager_dygraph_outs = self.op_test._calc_dygraph_output( place, no_check_set=no_check_set) self.outputs = eager_dygraph_outs diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index add49d11e53a1..471d0245aa83c 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -18,7 +18,7 @@ import numpy as np from scipy.special import expit, erf -from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.nn as nn import paddle.nn.functional as F @@ -958,6 +958,7 @@ def test_errors(self): class TestSqrt(TestActivation, TestParameter): def setUp(self): self.op_type = "sqrt" + self.python_api = paddle.sqrt self.init_dtype() np.random.seed(1023) @@ -970,7 +971,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) + + def test_check_output(self): + self.check_output(check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -978,6 +982,7 @@ def test_check_grad(self): class TestSqrtBF16(OpTest): def setUp(self): self.op_type = "sqrt" + self.python_api = paddle.sqrt self.init_dtype() np.random.seed(1023) @@ -994,11 +999,11 @@ def init_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_eager=True) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place(place, ['X'], 'Out', check_eager=True) class TestRsqrt(TestActivation): @@ -2048,6 +2053,7 @@ def test_errors(self): class TestReciprocal(TestActivation): def setUp(self): self.op_type = "reciprocal" + self.python_api = paddle.reciprocal self.init_dtype() np.random.seed(1024) @@ -2060,7 +2066,10 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', max_relative_error=0.01) + self.check_grad(['X'], 'Out', max_relative_error=0.01, check_eager=True) + + def test_check_output(self): + self.check_output(check_eager=True) class TestLog(TestActivation): @@ -2236,6 +2245,7 @@ def test_api(self): class TestSquare(TestActivation): def setUp(self): self.op_type = "square" + self.python_api = paddle.square self.init_dtype() np.random.seed(1024) @@ -2248,7 +2258,11 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', max_relative_error=0.007) + self.check_grad( + ['X'], 'Out', max_relative_error=0.007, check_eager=True) + + def test_check_output(self): + self.check_output(check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -2256,6 +2270,7 @@ def test_check_grad(self): class TestSquareBF16(OpTest): def setUp(self): self.op_type = "square" + self.python_api = paddle.square self.init_dtype() np.random.seed(1024) @@ -2272,11 +2287,12 @@ def init_dtype(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_eager=True) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.5) + self.check_grad_with_place( + place, ['X'], 'Out', numeric_grad_delta=0.5, check_eager=True) class 
TestPow(TestActivation): diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index c9abac8fb7946..dda10fdd84fff 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -19,7 +19,7 @@ from paddle.fluid.op import Operator import paddle.fluid as fluid from op_test import OpTest, _set_use_system_allocator -from paddle.fluid.framework import grad_var_name +from paddle.fluid.framework import grad_var_name, _test_eager_guard import paddle.fluid as fluid from paddle.fluid import Program, program_guard import paddle @@ -46,32 +46,32 @@ def test_error(self): def error1d_dataformat(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm1d = paddle.nn.BatchNorm1D(1, data_format='NCDHW') - batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + batch_norm1d(paddle.to_tensor(x_data_4)) def error2d_dataformat(): x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') batch_norm2d = paddle.nn.BatchNorm2D(1, data_format='NCDHW') - batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + batch_norm2d(paddle.to_tensor(x_data_3)) def error3d_dataformat(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm3d = paddle.nn.BatchNorm3D(1, data_format='NCL') - batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + batch_norm3d(paddle.to_tensor(x_data_4)) def error1d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm1d = paddle.nn.BatchNorm1D(1) - batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + batch_norm1d(paddle.to_tensor(x_data_4)) def error2d(): x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') batch_norm2d = paddle.nn.BatchNorm2D(1) - batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + batch_norm2d(paddle.to_tensor(x_data_3)) def error3d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm3d = paddle.nn.BatchNorm3D(1) - batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + batch_norm3d(paddle.to_tensor(x_data_4)) with fluid.dygraph.guard(p): self.assertRaises(ValueError, error1d) @@ -94,13 +94,18 @@ def compute_v1(x, is_test, trainable_statistics): shape[1], is_test=is_test, trainable_statistics=trainable_statistics) - y = bn(fluid.dygraph.to_variable(x)) + y = bn(paddle.to_tensor(x)) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): bn = paddle.nn.BatchNorm2D(shape[1]) - y = bn(fluid.dygraph.to_variable(x)) + y = bn(paddle.to_tensor(x)) + + with _test_eager_guard(): + bn = paddle.nn.BatchNorm2D(shape[1]) + eag_y = bn(paddle.to_tensor(x)) + assert np.allclose(eag_y.numpy(), y.numpy()) return y.numpy() def compute_v3(x, is_test, trainable_statistics): @@ -115,14 +120,14 @@ def compute_v3(x, is_test, trainable_statistics): initializer=fluid.initializer.Constant(0.0), trainable=False), trainable_statistics=trainable_statistics) - y = bn(fluid.dygraph.to_variable(x)) + y = bn(paddle.to_tensor(x)) return y.numpy() def compute_v4(x): with fluid.dygraph.guard(p): bn = paddle.nn.BatchNorm2D( shape[1], weight_attr=False, bias_attr=False) - y = bn(fluid.dygraph.to_variable(x)) + y = bn(paddle.to_tensor(x)) return y.numpy() x = np.random.randn(*shape).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index a86758a9cb92b..d50241e58dea3 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -32,6 +32,7 @@ def setUp(self): 'X': np.random.random((32,84)).astype("float32"), 'Y': np.random.random((32,84)).astype("float32") """ + self.inputs = { 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype), 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) @@ -39,7 +40,7 @@ def setUp(self): self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} def check_eager(self): - return (self.use_mkldnn == False and self.axis == -1) + return (not hasattr(self, "attrs") or (self.attrs["axis"] != -1)) def test_check_output(self): self.check_output(check_eager=False) @@ -65,6 +66,7 @@ def init_dtype(self): class TestElementwiseDivOpBF16(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.dtype = np.uint16 x = np.random.uniform(0.1, 1, [12, 13]).astype(np.float32) @@ -100,6 +102,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseDivOp_scalar(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [20, 3, 4]).astype(np.float64), 'Y': np.random.uniform(0.1, 1, [1]).astype(np.float64) @@ -110,6 +113,7 @@ def setUp(self): class TestElementwiseDivOp_Vector(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [100]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [100]).astype("float64") @@ -120,6 +124,7 @@ def setUp(self): class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [100, 3, 4]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [100]).astype("float64") @@ -135,6 +140,7 @@ def setUp(self): class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 100, 4]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [100]).astype("float64") @@ -150,6 +156,7 @@ def setUp(self): class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [100]).astype("float64") @@ -164,6 +171,7 @@ def setUp(self): class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 10, 12, 5]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [10, 12]).astype("float64") @@ -179,6 +187,7 @@ def setUp(self): class TestElementwiseDivOp_broadcast_4(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 50]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 1, 50]).astype("float64") @@ -189,6 +198,7 @@ def setUp(self): class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 4, 20]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 3, 1, 20]).astype("float64") @@ -199,6 +209,7 @@ def setUp(self): class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp): def setUp(self): 
self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [1, 1, 100]).astype("float64"), @@ -209,6 +220,7 @@ def setUp(self): class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [30, 3, 1, 5]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [30, 1, 4, 1]).astype("float64"), @@ -219,6 +231,7 @@ def setUp(self): class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.inputs = { 'X': np.random.uniform(0.1, 1, [10, 12]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 3, 10, 12]).astype("float64"), @@ -232,6 +245,7 @@ def setUp(self): class TestElementwiseDivOp_INT(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.dtype = np.int32 self.init_dtype() self.inputs = { @@ -304,6 +318,7 @@ def test_dygraph(self): class TestComplexElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.python_api = paddle.divide self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -334,7 +349,7 @@ def init_grad_input_output(self): self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y) def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def test_check_grad_normal(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py index 719ee5df6dbbf..21b0595b6dc86 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py @@ -20,11 +20,13 @@ import os import re import paddle.fluid.core as core +import paddle class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum # If x and y have the same value, the max() is not differentiable. # So we generate test data by the following method # to avoid them being too close to each other. @@ -35,10 +37,16 @@ def setUp(self): self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} def test_check_output(self): - self.check_output() + if hasattr(self, 'attrs'): + self.check_output(check_eager=False) + else: + self.check_output(check_eager=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + if hasattr(self, 'attrs'): + self.check_grad(['X', 'Y'], 'Out', check_eager=False) + else: + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -55,6 +63,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseBF16Op(OpTest): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum self.dtype = np.uint16 # If x and y have the same value, the max() is not differentiable. 
# So we generate test data by the following method @@ -69,10 +78,16 @@ def setUp(self): self.outputs = {'Out': convert_float_to_uint16(np.maximum(x, y))} def test_check_output(self): - self.check_output() + if hasattr(self, 'attrs'): + self.check_output(check_eager=False) + else: + self.check_output(check_eager=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + if hasattr(self, 'attrs'): + self.check_grad(['X', 'Y'], 'Out', check_eager=False) + else: + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ingore_x(self): self.check_grad(['Y'], 'Out', no_grad_set=set("X")) @@ -86,6 +101,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseMaxOp_scalar(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.random_integers(-5, 5, [2, 3, 20]).astype("float64") y = np.array([0.5]).astype("float64") self.inputs = {'X': x, 'Y': y} @@ -95,6 +111,7 @@ def setUp(self): class TestElementwiseMaxOp_Vector(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.random((100, )).astype("float64") sgn = np.random.choice([-1, 1], (100, )).astype("float64") y = x + sgn * np.random.uniform(0.1, 1, (100, )).astype("float64") @@ -105,6 +122,7 @@ def setUp(self): class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.uniform(0.5, 1, (100, 5, 2)).astype(np.float64) sgn = np.random.choice([-1, 1], (100, )).astype(np.float64) y = x[:, 0, 0] + sgn * \ @@ -121,6 +139,7 @@ def setUp(self): class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float64) sgn = np.random.choice([-1, 1], (100, )).astype(np.float64) y = x[0, :, 0] + sgn * \ @@ -137,6 +156,7 @@ def setUp(self): class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.uniform(0.5, 1, (1, 3, 100)).astype(np.float64) sgn = np.random.choice([-1, 1], (100, )).astype(np.float64) y = x[0, 0, :] + sgn * \ @@ -152,6 +172,7 @@ def setUp(self): class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.uniform(0.5, 1, (2, 50, 2, 1)).astype(np.float64) sgn = np.random.choice([-1, 1], (50, 2)).astype(np.float64) y = x[0, :, :, 0] + sgn * \ @@ -168,6 +189,7 @@ def setUp(self): class TestElementwiseMaxOp_broadcast_4(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_max" + self.python_api = paddle.maximum x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float64) sgn = np.random.choice([-1, 1], (2, 3, 1, 5)).astype(np.float64) y = x + sgn * \ diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index 0999acc75acff..f8dc9602c35a5 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -27,6 +27,7 @@ class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum # If x and y have the same value, the min() is not differentiable. 
# So we generate test data by the following method # to avoid them being too close to each other. @@ -37,10 +38,16 @@ def setUp(self): self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} def test_check_output(self): - self.check_output() + if hasattr(self, 'attrs'): + self.check_output(check_eager=False) + else: + self.check_output(check_eager=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + if hasattr(self, 'attrs'): + self.check_grad(['X', 'Y'], 'Out', check_eager=False) + else: + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ingore_x(self): self.check_grad( @@ -56,6 +63,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseMinOp_scalar(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.random_integers(-5, 5, [10, 3, 4]).astype("float64") y = np.array([0.5]).astype("float64") self.inputs = {'X': x, 'Y': y} @@ -65,6 +73,7 @@ def setUp(self): class TestElementwiseMinOp_Vector(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.random((100, )).astype("float64") sgn = np.random.choice([-1, 1], (100, )).astype("float64") y = x + sgn * np.random.uniform(0.1, 1, (100, )).astype("float64") @@ -75,6 +84,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(np.float64) sgn = np.random.choice([-1, 1], (100, )).astype(np.float64) y = x[:, 0, 0] + sgn * \ @@ -91,6 +101,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float64) sgn = np.random.choice([-1, 1], (100, )).astype(np.float64) y = x[0, :, 0] + sgn * \ @@ -107,6 +118,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(np.float64) sgn = np.random.choice([-1, 1], (100, )).astype(np.float64) y = x[0, 0, :] + sgn * \ @@ -122,6 +134,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.uniform(0.5, 1, (2, 25, 4, 1)).astype(np.float64) sgn = np.random.choice([-1, 1], (25, 4)).astype(np.float64) y = x[0, :, :, 0] + sgn * \ @@ -138,6 +151,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_4(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" + self.python_api = paddle.minimum x = np.random.uniform(0.5, 1, (2, 10, 2, 5)).astype(np.float64) sgn = np.random.choice([-1, 1], (2, 10, 1, 5)).astype(np.float64) y = x + sgn * \ diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py index 62cd465a176d5..416a60b8ba200 100755 --- a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py @@ -24,6 +24,7 @@ class TestExpandAsOpRank1(OpTest): def setUp(self): self.op_type = "expand_as_v2" + self.python_api = paddle.expand_as x = np.random.rand(100).astype("float64") target_tensor = np.random.rand(2, 100).astype("float64") self.inputs = {'X': x} diff --git 
a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py index 7da9dbd62e9f9..819029c5fcd9d 100644 --- a/python/paddle/fluid/tests/unittests/test_histogram_op.py +++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, program_guard from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard class TestHistogramOpAPI(unittest.TestCase): @@ -57,6 +58,15 @@ def test_dygraph(self): (actual.numpy() == expected).all(), msg='histogram output is wrong, out =' + str(actual.numpy())) + with _test_eager_guard(): + inputs_np = np.array([[2, 4, 2], [2, 5, 4]]).astype(np.int64) + inputs = paddle.to_tensor(inputs_np) + actual = paddle.histogram(inputs, bins=5, min=1, max=5) + self.assertTrue( + (actual.numpy() == expected).all(), + msg='histogram output is wrong, out =' + + str(actual.numpy())) + class TestHistogramOpError(unittest.TestCase): """Test histogram op error.""" @@ -118,6 +128,7 @@ def setUp(self): self.op_type = "histogram" self.init_test_case() np_input = np.random.uniform(low=0.0, high=20.0, size=self.in_shape) + self.python_api = paddle.histogram self.inputs = {"X": np_input} self.init_attrs() Out, _ = np.histogram( @@ -134,7 +145,7 @@ def init_attrs(self): self.attrs = {"bins": self.bins, "min": self.min, "max": self.max} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py index e2ccb153f4063..4da03c9643fa9 100644 --- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py @@ -40,10 +40,10 @@ def setUp(self): self.outputs = {'Out': out} def test_check_output(self): - self.check_output(check_eager=False) + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_eager=False) + self.check_grad(['X'], 'Out', check_eager=True) def config(self): """ diff --git a/python/paddle/fluid/tests/unittests/test_inplace.py b/python/paddle/fluid/tests/unittests/test_inplace.py index 617e9811d630f..b4f1dc22f4ee4 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace.py +++ b/python/paddle/fluid/tests/unittests/test_inplace.py @@ -31,11 +31,7 @@ def func_test_forward_version(self): var[0] = 1.1 self.assertEqual(var.inplace_version, 1) - # TODO1: assign don't support inplace in temporary - if in_dygraph_mode(): - var[0] = 2 - else: - paddle.assign(paddle.ones(shape=[3]), var) + paddle.assign(paddle.ones(shape=[3]), var) # NOTE(liym27): assign(input, output) is an inplace operation for output. # There is inplace-related processing for api assign, var.inplace_version should be 2 not 1. @@ -65,18 +61,11 @@ def func_test_backward_error(self): var_d = var_b**2 loss = paddle.nn.functional.relu(var_c + var_d) - if in_dygraph_mode(): - with self.assertRaisesRegexp( - RuntimeError, - "received current_inplace_version:{} != inplace_version_snapshot_:{}". - format(1, 0)): - loss.backward() - else: - with self.assertRaisesRegexp( - RuntimeError, - "received tensor_version:{} != wrapper_version_snapshot:{}". - format(1, 0)): - loss.backward() + with self.assertRaisesRegexp( + RuntimeError, + "received tensor_version:{} != wrapper_version_snapshot:{}". 
+ format(1, 0)): + loss.backward() def test_backward_error(self): with _test_eager_guard(): @@ -122,7 +111,7 @@ def func_test_backward_success_2(self): loss.backward() def test_backward_success_2(self): - # TODO2: need to process no_need_buffer in eager mode + # TODO: need to process no_need_buffer in eager mode # with _test_eager_guard(): # self.func_test_backward_success_2() self.func_test_backward_success_2() @@ -207,18 +196,11 @@ def func_test_backward_error(self): self.inplace_api_processing(var_b) loss = paddle.nn.functional.relu(var_c) - if in_dygraph_mode(): - with self.assertRaisesRegexp( - RuntimeError, - "received current_inplace_version:{} != inplace_version_snapshot_:{}". - format(1, 0)): - loss.backward() - else: - with self.assertRaisesRegexp( - RuntimeError, - "received tensor_version:{} != wrapper_version_snapshot:{}". - format(1, 0)): - loss.backward() + with self.assertRaisesRegexp( + RuntimeError, + "received tensor_version:{} != wrapper_version_snapshot:{}". + format(1, 0)): + loss.backward() def test_backward_error(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py index 0d4d3b58e862c..c861f912803f9 100644 --- a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py @@ -16,6 +16,7 @@ import paddle.fluid as fluid import unittest import numpy as np +from paddle.fluid.framework import _test_eager_guard def run_static(x_np, dtype, op_str, use_gpu=False): @@ -46,6 +47,18 @@ def run_dygraph(x_np, op_str, use_gpu=True): return dygraph_result +def run_eager(x_np, op_str, use_gpu=True): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + + x = paddle.to_tensor(x_np) + dygraph_result = getattr(paddle.tensor, op_str)(x) + return dygraph_result + + def np_data_generator(low, high, np_shape, type, sv_list, op_str, *args, **kwargs): x_np = np.random.uniform(low, high, np_shape).astype(getattr(np, type)) @@ -107,8 +120,10 @@ def test(test_case, op_str, use_gpu=False): x_np, result_np = np_data_generator(**meta_data) static_result = run_static(x_np, meta_data['type'], op_str, use_gpu) dygraph_result = run_dygraph(x_np, op_str, use_gpu) + eager_result = run_eager(x_np, op_str, use_gpu) test_case.assertTrue((static_result == result_np).all()) test_case.assertTrue((dygraph_result.numpy() == result_np).all()) + test_case.assertTrue((eager_result.numpy() == result_np).all()) class TestCPUNormal(unittest.TestCase): @@ -158,4 +173,5 @@ def test_isfinite_bad_x(): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lerp_op.py b/python/paddle/fluid/tests/unittests/test_lerp_op.py index 0f740444123cb..10ab2610a26e4 100644 --- a/python/paddle/fluid/tests/unittests/test_lerp_op.py +++ b/python/paddle/fluid/tests/unittests/test_lerp_op.py @@ -27,6 +27,7 @@ class TestLerp(OpTest): def setUp(self): self.op_type = "lerp" + self.python_api = paddle.lerp self.init_dtype() self.init_shape() x = np.arange(1., 101.).astype(self.dtype).reshape(self.shape) @@ -42,10 +43,10 @@ def init_shape(self): self.shape = [100] def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) class 
TestLerpWithDim2(TestLerp): diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py index 174f3bc665ea1..91d339940d114 100755 --- a/python/paddle/fluid/tests/unittests/test_logical_op.py +++ b/python/paddle/fluid/tests/unittests/test_logical_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid from paddle.static import Program, program_guard +from paddle.fluid.framework import _test_eager_guard SUPPORTED_DTYPES = [ bool, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 @@ -144,6 +145,22 @@ def run_dygraph(x_np, y_np, op_str, use_gpu=False, binary_op=True): return dygraph_result +def run_eager(x_np, y_np, op_str, use_gpu=False, binary_op=True): + place = paddle.CPUPlace() + if use_gpu and fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + with _test_eager_guard(): + op = getattr(paddle, op_str) + x = paddle.to_tensor(x_np, dtype=x_np.dtype) + if not binary_op: + dygraph_result = op(x) + else: + y = paddle.to_tensor(y_np, dtype=y_np.dtype) + dygraph_result = op(x, y) + return dygraph_result + + def np_data_generator(np_shape, dtype, *args, **kwargs): if dtype == bool: return np.random.choice(a=[True, False], size=np_shape).astype(bool) @@ -174,6 +191,7 @@ def test(unit_test, use_gpu=False, test_error=False): continue static_result = run_static(**meta_data) dygraph_result = run_dygraph(**meta_data) + eager_result = run_eager(**meta_data) if meta_data['binary_op']: np_result = np_op(meta_data['x_np'], meta_data['y_np']) else: @@ -181,6 +199,7 @@ def test(unit_test, use_gpu=False, test_error=False): unit_test.assertTrue((static_result == np_result).all()) unit_test.assertTrue((dygraph_result.numpy() == np_result).all( )) + unit_test.assertTrue((eager_result.numpy() == np_result).all()) def test_type_error(unit_test, use_gpu, type_str_map): @@ -259,4 +278,5 @@ def test_type_error(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_masked_select_op.py b/python/paddle/fluid/tests/unittests/test_masked_select_op.py index ed1a981d0306b..764f4806ba4ba 100644 --- a/python/paddle/fluid/tests/unittests/test_masked_select_op.py +++ b/python/paddle/fluid/tests/unittests/test_masked_select_op.py @@ -33,6 +33,7 @@ class TestMaskedSelectOp(OpTest): def setUp(self): self.init() self.op_type = "masked_select" + self.python_api = paddle.masked_select x = np.random.random(self.shape).astype("float64") mask = np.array(np.random.randint(2, size=self.shape, dtype=bool)) out = np_masked_select(x, mask) @@ -40,10 +41,10 @@ def setUp(self): self.outputs = {'Y': out} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(['X'], 'Y', check_eager=True) def init(self): self.shape = (50, 3) @@ -121,4 +122,5 @@ def test_mask_dtype(): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py index a87d9052bd6d3..0bc5e1cad9acd 100644 --- a/python/paddle/fluid/tests/unittests/test_nll_loss.py +++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py @@ -17,6 +17,7 @@ import numpy as np import unittest from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard def nll_loss_1d(logs, targets, weight=None, reduction='mean', @@ -97,14 +98,21 @@ def 
test_NLLLoss_1D_mean(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss() dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() + with fluid.dygraph.guard(): + with _test_eager_guard(): + nll_loss = paddle.nn.loss.NLLLoss() + eager_res = nll_loss( + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) + eager_result = eager_res.numpy() + expected = nll_loss_1d(input_np, label_np)[0] self.assertTrue(np.allclose(static_result, expected)) self.assertTrue(np.allclose(static_result, dy_result)) self.assertTrue(np.allclose(dy_result, expected)) + self.assertTrue(np.allclose(eager_result, expected)) def test_NLLLoss_1D_sum(self): np.random.seed(200) @@ -132,14 +140,24 @@ def test_NLLLoss_1D_sum(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss(reduction='sum') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() + with _test_eager_guard(): + nll_loss = paddle.nn.loss.NLLLoss(reduction='sum') + in_t = paddle.to_tensor(input_np) + label = paddle.to_tensor(label_np) + in_t.stop_gradient = False + eager_res = nll_loss(in_t, label) + eager_result = eager_res.numpy() + loss = eager_res.sum() + loss.backward() + expected = nll_loss_1d(input_np, label_np, reduction='sum')[0] self.assertTrue(np.allclose(static_result, expected)) self.assertTrue(np.allclose(static_result, dy_result)) self.assertTrue(np.allclose(dy_result, expected)) + self.assertTrue(np.allclose(eager_result, expected)) def test_NLLLoss_1D_with_weight_mean(self): np.random.seed(200) @@ -170,16 +188,26 @@ def test_NLLLoss_1D_with_weight_mean(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np)) + weight=paddle.to_tensor(weight_np)) dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() + + with _test_eager_guard(): + nll_loss = paddle.nn.loss.NLLLoss( + weight=paddle.to_tensor(weight_np)) + eager_res = nll_loss( + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) + loss = eager_res.sum() + loss.backward() + eager_result = eager_res.numpy() + expected = nll_loss_1d(input_np, label_np, weight=weight_np)[0] self.assertTrue(np.allclose(static_result, expected)) self.assertTrue(np.allclose(static_result, dy_result)) self.assertTrue(np.allclose(dy_result, expected)) + self.assertTrue(np.allclose(eager_result, expected)) def test_NLLLoss_1D_with_weight_sum(self): np.random.seed(200) @@ -210,10 +238,9 @@ def test_NLLLoss_1D_with_weight_sum(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='sum') + weight=paddle.to_tensor(weight_np), reduction='sum') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_1d( input_np, label_np, weight=weight_np, reduction='sum')[0] @@ -249,10 +276,9 @@ def test_NLLLoss_1D_with_weight_mean_cpu(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np)) + weight=paddle.to_tensor(weight_np)) dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - 
fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_1d(input_np, label_np, weight=weight_np)[0] @@ -287,10 +313,9 @@ def test_NLLLoss_1D_with_weight_no_reduce_cpu(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='none') + weight=paddle.to_tensor(weight_np), reduction='none') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_1d( input_np, label_np, weight=weight_np, reduction='none') @@ -326,8 +351,7 @@ def test_NLLLoss_2D_mean(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss() dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_2d(input_np, label_np)[0] @@ -363,8 +387,7 @@ def test_NLLLoss_2D_sum(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss(reduction='sum') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_2d(input_np, label_np, reduction='sum')[0] @@ -404,10 +427,9 @@ def test_NLLLoss_2D_with_weight_mean(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np)) + weight=paddle.to_tensor(weight_np)) dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_2d(input_np, label_np, weight=weight_np)[0] @@ -445,10 +467,9 @@ def test_NLLLoss_2D_with_weight_mean_cpu(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np)) + weight=paddle.to_tensor(weight_np)) dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_2d(input_np, label_np, weight=weight_np)[0] @@ -487,10 +508,9 @@ def test_NLLLoss_2D_with_weight_sum(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='sum') + weight=paddle.to_tensor(weight_np), reduction='sum') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() expected = nll_loss_2d( @@ -527,8 +547,7 @@ def test_NLLLoss_in_dims_not_2or4_mean(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss() dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() input_shape = input_np.shape @@ -572,10 +591,9 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_mean(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np)) + weight=paddle.to_tensor(weight_np)) dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() 
input_shape = input_np.shape @@ -620,10 +638,9 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_sum(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='sum') + weight=paddle.to_tensor(weight_np), reduction='sum') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() input_shape = input_np.shape @@ -671,10 +688,9 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='none') + weight=paddle.to_tensor(weight_np), reduction='none') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() input_shape = input_np.shape @@ -721,10 +737,9 @@ def test_NLLLoss_in_dims_not_2or4_with_weight_no_reduce_cpu(self): with fluid.dygraph.guard(): nll_loss = paddle.nn.loss.NLLLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='none') + weight=paddle.to_tensor(weight_np), reduction='none') dy_res = nll_loss( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) + paddle.to_tensor(input_np), paddle.to_tensor(label_np)) dy_result = dy_res.numpy() input_shape = input_np.shape @@ -749,6 +764,8 @@ def setUp(self): self.init_test_case() self.op_type = "nll_loss" self.with_weight = False + self.python_api = paddle.nn.functional.nll_loss + self.python_out_sig = ["Out"] np.random.seed(200) input_np = np.random.uniform(0.1, 0.8, self.input_shape).astype("float64") @@ -769,7 +786,7 @@ def setUp(self): self.attrs = {'reduction': 'mean', 'ignore_index': -100} def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def test_check_output_with_weight(self): self.with_weight = True @@ -778,7 +795,7 @@ def test_check_output_with_weight(self): def test_check_grad(self): self.with_weight = True place = fluid.CPUPlace() - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place(place, ['X'], 'Out', check_eager=False) if fluid.core.is_compiled_with_cuda(): place = fluid.CUDAPlace(0) self.check_grad_with_place(place, ['X'], 'Out') @@ -1014,4 +1031,5 @@ def test_nll_loss_function_reduction_imperative_not_sum_mean_none(): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py index 91e7b5d00e1a7..aadfb4d39442c 100644 --- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py +++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py @@ -487,7 +487,7 @@ def forward(self, data): z = layer(data) with self.assertRaisesRegexp( RuntimeError, - "received current_inplace_version:{} != inplace_version_snapshot_:{}". + "received tensor_version:{} != wrapper_version_snapshot:{}". format(1, 0)): z.backward() diff --git a/python/paddle/fluid/tests/unittests/test_quantile.py b/python/paddle/fluid/tests/unittests/test_quantile.py index 0fd3c1de9ca82..936d1d3be3a19 100644 --- a/python/paddle/fluid/tests/unittests/test_quantile.py +++ b/python/paddle/fluid/tests/unittests/test_quantile.py @@ -20,46 +20,59 @@ class TestQuantile(unittest.TestCase): + """ + This class is used for numerical precision testing. 
If there is + a corresponding numpy API, the precision comparison can be performed directly. + Otherwise, it needs to be verified by numpy implementated function. + """ + def setUp(self): np.random.seed(678) self.input_data = np.random.rand(6, 7, 8, 9, 10) + # Test correctness when q and axis are set. def test_quantile_single_q(self): x = paddle.to_tensor(self.input_data) paddle_res = paddle.quantile(x, q=0.5, axis=2) np_res = np.quantile(self.input_data, q=0.5, axis=2) self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + # Test correctness for default axis. def test_quantile_with_no_axis(self): x = paddle.to_tensor(self.input_data) paddle_res = paddle.quantile(x, q=0.35) np_res = np.quantile(self.input_data, q=0.35) self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + # Test correctness for multiple axis. def test_quantile_with_multi_axis(self): x = paddle.to_tensor(self.input_data) paddle_res = paddle.quantile(x, q=0.75, axis=[0, 2, 3]) np_res = np.quantile(self.input_data, q=0.75, axis=[0, 2, 3]) self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + # Test correctness when keepdim is set. def test_quantile_with_keepdim(self): x = paddle.to_tensor(self.input_data) paddle_res = paddle.quantile(x, q=0.35, axis=4, keepdim=True) np_res = np.quantile(self.input_data, q=0.35, axis=4, keepdims=True) self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + # Test correctness when all parameters are set. def test_quantile_with_keepdim_and_multiple_axis(self): x = paddle.to_tensor(self.input_data) paddle_res = paddle.quantile(x, q=0.1, axis=[1, 4], keepdim=True) np_res = np.quantile(self.input_data, q=0.1, axis=[1, 4], keepdims=True) self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + # Test correctness when q = 0. def test_quantile_with_boundary_q(self): x = paddle.to_tensor(self.input_data) paddle_res = paddle.quantile(x, q=0, axis=3) np_res = np.quantile(self.input_data, q=0, axis=3) self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + # Test correctness when input includes NaN. def test_quantile_include_NaN(self): input_data = np.random.randn(2, 3, 4) input_data[0, 1, 1] = np.nan @@ -69,6 +82,10 @@ def test_quantile_include_NaN(self): class TestQuantileMuitlpleQ(unittest.TestCase): + """ + This class is used to test multiple input of q. + """ + def setUp(self): np.random.seed(678) self.input_data = np.random.rand(10, 3, 4, 5, 4) @@ -95,56 +112,125 @@ def test_quantile_multiple_axis_keepdim(self): class TestQuantileError(unittest.TestCase): + """ + This class is used to test that exceptions are thrown correctly. + Validity of all parameter values and types should be considered. 
+ """ + def setUp(self): self.x = paddle.randn((2, 3, 4)) def test_errors(self): + # Test error when q > 1 def test_q_range_error_1(): paddle_res = paddle.quantile(self.x, q=1.5) self.assertRaises(ValueError, test_q_range_error_1) + # Test error when q < 0 def test_q_range_error_2(): paddle_res = paddle.quantile(self.x, q=[0.2, -0.3]) self.assertRaises(ValueError, test_q_range_error_2) + # Test error with no valid q def test_q_range_error_3(): paddle_res = paddle.quantile(self.x, q=[]) self.assertRaises(ValueError, test_q_range_error_3) + # Test error when x is not Tensor def test_x_type_error(): x = [1, 3, 4] paddle_res = paddle.quantile(x, q=0.9) self.assertRaises(TypeError, test_x_type_error) + # Test error when scalar axis is not int def test_axis_type_error_1(): paddle_res = paddle.quantile(self.x, q=0.4, axis=0.4) self.assertRaises(ValueError, test_axis_type_error_1) + # Test error when axis in List is not int def test_axis_type_error_2(): paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, 0.4]) self.assertRaises(ValueError, test_axis_type_error_2) + # Test error when axis not in [-D, D) def test_axis_value_error_1(): paddle_res = paddle.quantile(self.x, q=0.4, axis=10) self.assertRaises(ValueError, test_axis_value_error_1) + # Test error when axis not in [-D, D) def test_axis_value_error_2(): paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, -10]) self.assertRaises(ValueError, test_axis_value_error_2) + # Test error with no valid axis def test_axis_value_error_3(): paddle_res = paddle.quantile(self.x, q=0.4, axis=[]) self.assertRaises(ValueError, test_axis_value_error_3) +class TestQuantileRuntime(unittest.TestCase): + """ + This class is used to test the API could run correctly with + different devices, different data types, and dygraph/static mode. 
+ """ + + def setUp(self): + np.random.seed(678) + self.input_data = np.random.rand(6, 7, 8, 9, 10) + self.dtypes = ['float32', 'float64'] + self.devices = ['cpu'] + if paddle.device.is_compiled_with_cuda(): + self.devices.append('gpu') + + def test_dygraph(self): + paddle.disable_static() + for device in self.devices: + # Check different devices + paddle.set_device(device) + for dtype in self.dtypes: + # Check different dtypes + np_input_data = self.input_data.astype(dtype) + x = paddle.to_tensor(np_input_data, dtype=dtype) + paddle_res = paddle.quantile(x, q=0.5, axis=2) + np_res = np.quantile(np_input_data, q=0.5, axis=2) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_static(self): + paddle.enable_static() + for device in self.devices: + x = paddle.static.data( + name="x", shape=self.input_data.shape, dtype=paddle.float32) + x_fp64 = paddle.static.data( + name="x_fp64", + shape=self.input_data.shape, + dtype=paddle.float64) + + results = paddle.quantile(x, q=0.5, axis=2) + np_input_data = self.input_data.astype('float32') + results_fp64 = paddle.quantile(x_fp64, q=0.5, axis=2) + np_input_data_fp64 = self.input_data.astype('float64') + + exe = paddle.static.Executor(device) + paddle_res, paddle_res_fp64 = exe.run( + paddle.static.default_main_program(), + feed={"x": np_input_data, + "x_fp64": np_input_data_fp64}, + fetch_list=[results, results_fp64]) + np_res = np.quantile(np_input_data, q=0.5, axis=2) + np_res_fp64 = np.quantile(np_input_data_fp64, q=0.5, axis=2) + self.assertTrue( + np.allclose(paddle_res, np_res) and np.allclose(paddle_res_fp64, + np_res_fp64)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py index bada62e3239ea..3d961a7413ca0 100644 --- a/python/paddle/fluid/tests/unittests/test_shape_op.py +++ b/python/paddle/fluid/tests/unittests/test_shape_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle from paddle.fluid import core from paddle.fluid.op import Operator @@ -24,6 +25,7 @@ class TestShapeOp(OpTest): def setUp(self): self.op_type = "shape" + self.python_api = paddle.shape self.config() self.shape = [2, 3] input = np.zeros(self.shape) @@ -34,7 +36,7 @@ def config(self): self.shape = [2, 3] def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class case1(TestShapeOp): diff --git a/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py b/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py new file mode 100644 index 0000000000000..8dab034d643ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_copy_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
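+# This test module exercises copy_() between sparse tensors under the eager
+# guard: a dense tensor is converted with to_sparse_coo()/to_sparse_csr(),
+# copied into a second sparse tensor via copy_(), and the copied non-zero
+# elements are compared against the expected values.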
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle +from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard + + +class TestSparseCopy(unittest.TestCase): + def test_copy_sparse_coo(self): + with _test_eager_guard(): + np_x = [[0, 1.0, 0], [2.0, 0, 0], [0, 3.0, 0]] + np_values = [1.0, 2.0, 3.0] + dense_x = paddle.to_tensor(np_x, dtype='float32') + coo_x = dense_x.to_sparse_coo(2) + + np_x_2 = [[0, 3.0, 0], [2.0, 0, 0], [0, 3.0, 0]] + dense_x_2 = paddle.to_tensor(np_x_2, dtype='float32') + coo_x_2 = dense_x_2.to_sparse_coo(2) + coo_x_2.copy_(coo_x, True) + assert np.array_equal(np_values, + coo_x_2.non_zero_elements().numpy()) + + def test_copy_sparse_csr(self): + with _test_eager_guard(): + np_x = [[0, 1.0, 0], [2.0, 0, 0], [0, 3.0, 0]] + np_values = [1.0, 2.0, 3.0] + dense_x = paddle.to_tensor(np_x, dtype='float32') + csr_x = dense_x.to_sparse_csr() + + np_x_2 = [[0, 3.0, 0], [2.0, 0, 0], [0, 3.0, 0]] + dense_x_2 = paddle.to_tensor(np_x_2, dtype='float32') + csr_x_2 = dense_x_2.to_sparse_csr() + csr_x_2.copy_(csr_x, True) + assert np.array_equal(np_values, + csr_x_2.non_zero_elements().numpy()) diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index e9be6b338fb86..ae17cb9b1b57c 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -58,6 +58,7 @@ class TestStrideSliceOp(OpTest): def setUp(self): self.initTestCase() self.op_type = 'strided_slice' + self.python_api = paddle.strided_slice self.output = strided_slice_native_forward( self.input, self.axes, self.starts, self.ends, self.strides) @@ -72,10 +73,10 @@ def setUp(self): } def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(set(['Input']), 'Out') + self.check_grad(set(['Input']), 'Out', check_eager=True) def initTestCase(self): self.input = np.random.rand(100) @@ -704,7 +705,7 @@ def create_case(self, net): l2.sum().backward() grads_static = net.get_all_grads() net.clear_all_grad() - # compare result of dygraph and static + # compare result of dygraph and static self.is_grads_equal(grads_static, grads_dy) self.assertTrue( np.array_equal(s1, s2), diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_.py index 5891aee5bd32e..2f43f129978cd 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_.py @@ -17,13 +17,14 @@ import numpy as np import six import paddle +from paddle.fluid.framework import _test_eager_guard class TensorFill_Test(unittest.TestCase): def setUp(self): self.shape = [32, 32] - def test_tensor_fill_true(self): + def func_test_tensor_fill_true(self): typelist = ['float32', 'float64', 'int32', 'int64', 'float16'] places = [fluid.CPUPlace()] if fluid.core.is_compiled_with_cuda(): @@ -46,7 +47,12 @@ def test_tensor_fill_true(self): tensor.fill_(var) #var type is basic type in typelist self.assertEqual((tensor.numpy() == target).all(), True) - def test_tensor_fill_backward(self): + def test_tensor_fill_true(self): + with _test_eager_guard(): + self.func_test_tensor_fill_true() + self.func_test_tensor_fill_true() + + def func_test_tensor_fill_backward(self): typelist = ['float32'] places = [fluid.CPUPlace()] if 
fluid.core.is_compiled_with_cuda(): @@ -71,13 +77,23 @@ def test_tensor_fill_backward(self): self.assertEqual((y.grad.numpy() == 0).all().item(), True) - def test_errors(self): + def test_tensor_fill_backward(self): + with _test_eager_guard(): + self.func_test_tensor_fill_backward() + self.func_test_tensor_fill_backward() + + def func_test_errors(self): def test_list(): x = paddle.to_tensor([2, 3, 4]) x.fill_([1]) self.assertRaises(TypeError, test_list) + def test_errors(self): + with _test_eager_guard(): + self.func_test_errors() + self.func_test_errors() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_zero_.py b/python/paddle/fluid/tests/unittests/test_tensor_zero_.py index 65620038fc497..d47585f78bb7b 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_zero_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_zero_.py @@ -17,13 +17,14 @@ import numpy as np import six import paddle +from paddle.fluid.framework import _test_eager_guard class TensorFill_Test(unittest.TestCase): def setUp(self): self.shape = [32, 32] - def test_tensor_fill_true(self): + def func_test_tensor_fill_true(self): typelist = ['float32', 'float64', 'int32', 'int64', 'float16'] places = [fluid.CPUPlace()] if fluid.core.is_compiled_with_cuda(): @@ -41,6 +42,11 @@ def test_tensor_fill_true(self): tensor.zero_() self.assertEqual((tensor.numpy() == target).all().item(), True) + def test_tensor_fill_true(self): + with _test_eager_guard(): + self.func_test_tensor_fill_true() + self.func_test_tensor_fill_true() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index 52d1fda0ae299..83a940d064e76 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -18,6 +18,7 @@ import numpy as np from op_test import OpTest import paddle.fluid.core as core +import paddle class TestTopkOp(OpTest): @@ -61,4 +62,5 @@ def test_check_grad(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py index 4be53304733cb..f1c4ca18da72b 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py @@ -19,6 +19,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard def numpy_topk(x, k=1, axis=-1, largest=True): @@ -45,6 +46,7 @@ def init_args(self): def setUp(self): self.op_type = "top_k_v2" + self.python_api = paddle.topk self.dtype = np.float64 self.input_data = np.random.rand(10, 20) self.init_args() @@ -55,12 +57,10 @@ def setUp(self): self.outputs = {'Out': output, 'Indices': indices} def test_check_output(self): - paddle.enable_static() - self.check_output() + self.check_output(check_eager=False) def test_check_grad(self): - paddle.enable_static() - self.check_grad(set(['X']), 'Out') + self.check_grad(set(['X']), 'Out', check_eager=False) class TestTopkOp1(TestTopkOp): @@ -85,6 +85,7 @@ def init_args(self): def setUp(self): self.op_type = "top_k_v2" + self.python_api = paddle.topk self.dtype = np.float64 self.input_data = np.random.rand(16, 100) self.init_args() @@ -103,6 +104,7 @@ def init_args(self): def setUp(self): self.op_type = "top_k_v2" + self.python_api = paddle.topk self.dtype = np.float64 
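+        # python_api maps this legacy op to its public Python entry point
+        # (paddle.topk) so OpTest can route eager-mode checks through it
+        # once check_eager is turned on.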
self.input_data = np.random.rand(10, 10, 5) self.init_args() @@ -121,6 +123,7 @@ def init_args(self): def setUp(self): self.op_type = "top_k_v2" + self.python_api = paddle.topk self.dtype = np.float64 self.input_data = np.random.rand(10, 10, 5) self.init_args() @@ -139,6 +142,7 @@ def init_args(self): def setUp(self): self.op_type = "top_k_v2" + self.python_api = paddle.topk self.dtype = np.float64 self.input_data = np.random.rand(80, 16384) self.init_args() @@ -156,48 +160,64 @@ def setUp(self): self.large_input_data = np.random.rand(2, 1030) def run_dygraph(self, place): - paddle.disable_static(place) - input_tensor = paddle.to_tensor(self.input_data) - large_input_tensor = paddle.to_tensor(self.large_input_data) - # test case for basic test case 1 - paddle_result = paddle.topk(input_tensor, k=2) - numpy_result = numpy_topk(self.input_data, k=2) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) - # test case for basic test case 2 with axis - paddle_result = paddle.topk(input_tensor, k=2, axis=1) - numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) - # test case for basic test case 3 with tensor K - k_tensor = paddle.to_tensor(np.array([2])) - paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) - numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) - # test case for basic test case 4 with tensor largest - k_tensor = paddle.to_tensor(np.array([2])) - paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) - numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) - # test case for basic test case 5 with axis -1 - k_tensor = paddle.to_tensor(np.array([2])) - paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) - numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) - # test case for basic test case 6 for the partial sort - paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) - numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) - # test case for basic test case 7 for the unsorted - paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) - sort_paddle = numpy_topk( - np.array(paddle_result[0].numpy()), axis=1, k=2) - numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + with paddle.fluid.dygraph.guard(place): + input_tensor = paddle.to_tensor(self.input_data) + large_input_tensor = paddle.to_tensor(self.large_input_data) + # test case for basic test case 1 + paddle_result = paddle.topk(input_tensor, k=2) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue( + np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue( + np.allclose(paddle_result[1].numpy(), 
numpy_result[1])) + # test case for basic test case 2 with axis + paddle_result = paddle.topk(input_tensor, k=2, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue( + np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue( + np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 3 with tensor K + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue( + np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue( + np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 4 with tensor largest + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk( + input_tensor, k=2, axis=1, largest=False) + numpy_result = numpy_topk( + self.input_data, k=2, axis=1, largest=False) + self.assertTrue( + np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue( + np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 5 with axis -1 + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk( + input_tensor, k=2, axis=-1, largest=False) + numpy_result = numpy_topk( + self.input_data, k=2, axis=-1, largest=False) + self.assertTrue( + np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue( + np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 6 for the partial sort + paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue( + np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue( + np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 7 for the unsorted + paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + sort_paddle = numpy_topk( + np.array(paddle_result[0].numpy()), axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) def run_static(self, place): paddle.enable_static() @@ -264,14 +284,15 @@ def test_cases(self): self.run_static(place) def test_errors(self): - paddle.disable_static() - x = paddle.to_tensor([1, 2, 3]) - with self.assertRaises(BaseException): - paddle.topk(x, k=-1) + with paddle.fluid.dygraph.guard(): + x = paddle.to_tensor([1, 2, 3]) + with self.assertRaises(BaseException): + paddle.topk(x, k=-1) - with self.assertRaises(BaseException): - paddle.topk(x, k=0) + with self.assertRaises(BaseException): + paddle.topk(x, k=0) if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py b/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py index 92078a69b53a5..0d4e379660b75 100644 --- a/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py +++ b/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py @@ -91,18 +91,11 @@ def func_test_backward_error(self): view_var_b[0] = 2. # var_b is modified inplace loss = paddle.nn.functional.relu(var_c) - if in_dygraph_mode(): - with self.assertRaisesRegexp( - RuntimeError, - "received current_inplace_version:{} != inplace_version_snapshot_:{}". 
- format(1, 0)): - loss.backward() - else: - with self.assertRaisesRegexp( - RuntimeError, - "received tensor_version:{} != wrapper_version_snapshot:{}". - format(1, 0)): - loss.backward() + with self.assertRaisesRegexp( + RuntimeError, + "received tensor_version:{} != wrapper_version_snapshot:{}". + format(1, 0)): + loss.backward() def test_backward_error(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py b/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py index 6f64322e97545..163e246b71560 100644 --- a/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py @@ -74,6 +74,7 @@ def set_attr(self): def setUp(self): self.op_type = "viterbi_decode" + self.python_api = paddle.text.viterbi_decode self.set_attr() bz, length, ntags = self.bz, self.len, self.ntags self.input = np.random.randn(bz, length, ntags).astype(self.dtype) @@ -90,7 +91,7 @@ def setUp(self): self.outputs = {'Scores': scores, 'Path': path} def test_output(self): - self.check_output() + self.check_output(check_eager=True) class TestViterbiAPI(unittest.TestCase): @@ -132,3 +133,8 @@ def check_static_result(self, place): def test_static_net(self): for place in self.places: self.check_static_result(place) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index d6d52b7d604aa..8af9a39634fdb 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -137,5 +137,44 @@ def test_bad_x(): self.assertRaises(TypeError, test_bad_x) +class TestIgnoreVarNameInWhile(unittest.TestCase): + def test_ignore_var(self): + def cond(i, ten, temp, y): + return i < ten + + def body_func(i, ten, batch_info, origin_seq): + print(batch_info) + batch_info = fluid.contrib.layers.shuffle_batch(batch_info) + print(batch_info) + i = i + 1 + return [i, ten, batch_info, origin_seq] + + x = fluid.layers.data(name='x', shape=[-1, 1, 4]) + y = fluid.layers.data(name='y', shape=[-1, 1, 1]) + temp = layers.concat(input=[x, y], axis=-1) + i = layers.fill_constant(shape=[1], value=0, dtype='int32') + num = layers.fill_constant(shape=[1], value=5, dtype='int32') + + i, ten, shuffle_temp, y = layers.while_loop(cond, body_func, + [i, num, temp, y]) + + output = shuffle_temp + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + input_x = numpy.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]]) + input_x = input_x.reshape(3, 1, 4) + input_y = numpy.array([[10], [12], [33]]) + input_y = input_y.reshape(3, 1, 1) + + res, = exe.run(fluid.default_main_program(), + feed={'x': input_x, + 'y': input_y}, + fetch_list=[output]) + + self.assertListEqual(list(res.shape), [3, 1, 5]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 05a4dfe3c06b6..19dcb49cd957c 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -31,7 +31,7 @@ def YoloBox(x, img_size, attrs): an_num = int((len(anchors) // 2)) class_num = attrs['class_num'] conf_thresh = attrs['conf_thresh'] - downsample = attrs['downsample'] + downsample = attrs['downsample_ratio'] clip_bbox = attrs['clip_bbox'] scale_x_y = attrs['scale_x_y'] 
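+    # The reference implementation reads 'downsample_ratio' to stay consistent
+    # with the renamed attribute key used in the op attrs (previously 'downsample').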
iou_aware = attrs['iou_aware'] @@ -92,13 +92,14 @@ class TestYoloBoxOp(OpTest): def setUp(self): self.initTestCase() self.op_type = 'yolo_box' + self.python_api = paddle.vision.ops.yolo_box x = np.random.random(self.x_shape).astype('float32') img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32') self.attrs = { 'anchors': self.anchors, 'class_num': self.class_num, 'conf_thresh': self.conf_thresh, - 'downsample': self.downsample, + 'downsample_ratio': self.downsample, 'clip_bbox': self.clip_bbox, 'scale_x_y': self.scale_x_y, 'iou_aware': self.iou_aware, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py index a27d806319cb2..e0d208644e79e 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py @@ -1,9 +1,7 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# +# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software @@ -14,6 +12,8 @@ from __future__ import print_function +import sys +sys.path.append("..") import unittest import numpy as np import math @@ -22,152 +22,180 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import random -import sys -sys.path.append("..") +from op_test import OpTest from op_test_xpu import XPUOpTest sys.path.append("../rnn") from rnn_numpy import SimpleRNN, LSTM, GRU from convert import get_params_for_net +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper random.seed(2) np.set_printoptions(threshold=np.inf) paddle.enable_static() -class TestRNNOp(XPUOpTest): - def init_size(self): - self.seq_length = 1 - self.batch_size = 1 - self.input_size = 5 - self.hidden_size = 16 - - def get_weight_names(self): - weight_names = [] - for i in range(self.num_layers): - for j in range(0, 2 * self.direction_num): - weight_names.append("{}.weight_{}".format(i, j)) - for i in range(self.num_layers): - for j in range(0, 2 * self.direction_num): - weight_names.append("{}.bias_{}".format(i, j)) - return weight_names - - def setUp(self): - self.init_size() - self.op_type = "rnn" - self.dtype = np.float32 - self.sequence_length = np.ones( - (self.batch_size, ), dtype=np.int32) * self.seq_length - self.num_layers = 1 - self.is_bidirec = False - self.mode = "LSTM" - self.is_test = False - self.dropout = 0.0 - self.set_attrs() - - self.direction_num = 2 if self.is_bidirec else 1 - direction = "bidirectional" if self.is_bidirec else "forward" - - input = np.random.uniform( - low=-0.1, - high=0.1, - size=(self.seq_length, self.batch_size, - self.input_size)).astype(self.dtype) - - rnn1 = LSTM( - self.input_size, - self.hidden_size, - num_layers=self.num_layers, - time_major=True, - direction=direction, - dropout=self.dropout, - dtype="float32") - - flat_w = get_params_for_net(rnn1) - output, (last_hidden, last_cell) = rnn1( - input, sequence_length=self.sequence_length) - - init_h = np.zeros( - (self.num_layers * self.direction_num, self.batch_size, - self.hidden_size)).astype(self.dtype) - init_c = np.zeros( - 
(self.num_layers * self.direction_num, self.batch_size, - self.hidden_size)).astype(self.dtype) - state_out = np.ndarray((300)).astype("uint8") - - self.inputs = { - 'Input': input, - 'WeightList': flat_w, - 'PreState': [('init_h', init_h), ('init_c', init_c)], - 'SequenceLength': self.sequence_length - } - if self.sequence_length is None: +class XPUTestRNNOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'rnn' + self.use_dynamic_create_class = False + + class TestRNNOp(XPUOpTest): + def setUp(self): + self.init_size() + self.init_dtype() + self.op_type = "rnn" + self.place = paddle.XPUPlace(0) + self.sequence_length = np.ones( + (self.batch_size, ), dtype=np.int32) * self.seq_length + self.set_attrs() + self.mode = "LSTM" + self.is_test = False + self.dropout = 0.0 + + self.direction_num = 2 if self.is_bidirec else 1 + direction = "bidirectional" if self.is_bidirec else "forward" + + input = np.random.uniform( + low=-0.1, + high=0.1, + size=(self.seq_length, self.batch_size, + self.input_size)).astype(self.dtype) + + rnn1 = LSTM( + self.input_size, + self.hidden_size, + num_layers=self.num_layers, + time_major=True, + direction=direction, + dropout=self.dropout, + dtype=self.dtype) + + flat_w = get_params_for_net(rnn1) + output, (last_hidden, last_cell) = rnn1( + input, sequence_length=self.sequence_length) + + init_h = np.zeros( + (self.num_layers * self.direction_num, self.batch_size, + self.hidden_size)).astype(self.dtype) + init_c = np.zeros( + (self.num_layers * self.direction_num, self.batch_size, + self.hidden_size)).astype(self.dtype) + state_out = np.ndarray((300)).astype("uint8") + self.inputs = { 'Input': input, 'WeightList': flat_w, 'PreState': [('init_h', init_h), ('init_c', init_c)], + 'SequenceLength': self.sequence_length + } + if self.sequence_length is None: + self.inputs = { + 'Input': input, + 'WeightList': flat_w, + 'PreState': [('init_h', init_h), ('init_c', init_c)], + } + self.attrs = { + 'dropout_prob': self.dropout, + 'is_bidirec': self.is_bidirec, + 'input_size': self.input_size, + 'hidden_size': self.hidden_size, + 'num_layers': self.num_layers, + 'mode': self.mode, + 'is_test': self.is_test + } + self.outputs = { + 'Out': output, + "State": + [('last_hidden', last_hidden), ('last_cell', last_cell)], + 'Reserve': np.ndarray((400)).astype("uint8"), + 'DropoutState': state_out } - self.attrs = { - 'dropout_prob': self.dropout, - 'is_bidirec': self.is_bidirec, - 'input_size': self.input_size, - 'hidden_size': self.hidden_size, - 'num_layers': self.num_layers, - 'mode': self.mode, - 'is_test': self.is_test - } - self.outputs = { - 'Out': output, - "State": [('last_hidden', last_hidden), ('last_cell', last_cell)], - 'Reserve': np.ndarray((400)).astype("uint8"), - 'DropoutState': state_out - } - - def test_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place( - place, atol=0.01, no_check_set=['Reserve', 'DropoutState']) - - def set_attrs(self): - pass - - def test_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - if not self.is_test: - var_name_list = self.get_weight_names() - grad_check_list = ['Input', 'init_h', 'init_c'] - grad_check_list.extend(var_name_list) - self.check_grad_with_place( - place, - set(grad_check_list), ['Out', 'last_hidden', 'last_cell'], - max_relative_error=0.1) - - -class TestRNNOpCase0(TestRNNOp): - def init_size(self): - self.seq_length = 2 - self.batch_size = 4 - self.input_size = 10 - self.hidden_size = 32 - - -class 
TestRNNOpCase1(TestRNNOp): - def init_size(self): - self.seq_length = 5 - self.batch_size = 16 - self.input_size = 30 - self.hidden_size = 64 - - -class TestRNNOpCase2(TestRNNOp): - def init_size(self): - self.seq_length = 10 - self.batch_size = 64 - self.input_size = 50 - self.hidden_size = 64 + def init_dtype(self): + self.dtype = self.in_type + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + self.__class__.op_type = self.in_type + + def test_check_output(self): + self.check_output_with_place( + self.place, atol=0.01, + no_check_set=['Reserve', 'DropoutState']) + + def init_size(self): + self.seq_length = 1 + self.batch_size = 1 + self.input_size = 5 + self.hidden_size = 16 + + def get_weight_names(self): + weight_names = [] + for i in range(self.num_layers): + for j in range(0, 2 * self.direction_num): + weight_names.append("{}.weight_{}".format(i, j)) + for i in range(self.num_layers): + for j in range(0, 2 * self.direction_num): + weight_names.append("{}.bias_{}".format(i, j)) + return weight_names + + def set_attrs(self): + self.num_layers = 1 + self.is_bidirec = False + + class TestRNNOp1(TestRNNOp): + def init_size(self): + self.seq_length = 2 + self.batch_size = 4 + self.input_size = 10 + self.hidden_size = 32 + + def set_attrs(self): + self.num_layers = 1 + self.is_bidirec = False + + class TestRNNOp2(TestRNNOp): + def init_size(self): + self.seq_length = 5 + self.batch_size = 16 + self.input_size = 30 + self.hidden_size = 64 + + def set_attrs(self): + self.num_layers = 1 + self.is_bidirec = True + + class TestRNNOp3(TestRNNOp): + def init_size(self): + self.seq_length = 10 + self.batch_size = 64 + self.input_size = 50 + self.hidden_size = 64 + + def set_attrs(self): + self.num_layers = 2 + self.is_bidirec = False + + class TestRNNOp4(TestRNNOp): + def set_attrs(self): + self.num_layers = 3 + self.is_bidirec = False + + class TestRNNOp5(TestRNNOp): + def set_attrs(self): + self.num_layers = 2 + self.is_bidirec = True + + +support_types = get_xpu_op_support_types('rnn') +for stype in support_types: + create_test_class( + globals(), + XPUTestRNNOp, + stype, + ignore_deivce_version=[core.XPUVersion.XPU1]) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f4a4052ee5e15..2f8c23187e8d1 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -47,6 +47,7 @@ from ..fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 from ..fluid.dygraph.base import disable_dygraph as enable_static # noqa: F401 from ..fluid.framework import _non_static_mode as in_dynamic_mode # noqa: F401 +from ..fluid.framework import _non_static_mode # noqa: F401; temporary used for hackson from ..fluid.framework import _current_expected_place, _get_paddle_place # noqa: F401 from ..fluid.framework import dygraph_only # noqa: F401 from ..fluid.framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder # noqa: F401 diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index 9afcc2240aeb5..9147444f5a6bb 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -100,7 +100,7 @@ def func(x): return paddle.dot(x, x) x0 = paddle.to_tensor([1.3, 2.7]) - results = paddle.optimizer.functional.minimize_bfgs(func, x0) + results = paddle.incubate.optimizer.functional.minimize_bfgs(func, x0) 
print("is_converge: ", results[0]) print("the minimum of func is: ", results[2]) # is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True, diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index 90ae452653a5c..1fbae18a4c65a 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -89,7 +89,7 @@ def func(x): return paddle.dot(x, x) x0 = paddle.to_tensor([1.3, 2.7]) - results = paddle.optimizer.functional.minimize_lbfgs(func, x0) + results = paddle.incubate.optimizer.functional.minimize_lbfgs(func, x0) print("is_converge: ", results[0]) print("the minimum of func is: ", results[2]) # is_converge: is_converge: Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True, diff --git a/python/paddle/incubate/optimizer/functional/utils.py b/python/paddle/incubate/optimizer/functional/utils.py index c197f8a1acb5e..3000c82a71e87 100644 --- a/python/paddle/incubate/optimizer/functional/utils.py +++ b/python/paddle/incubate/optimizer/functional/utils.py @@ -13,7 +13,6 @@ # limitations under the License. import paddle -from paddle.autograd.functional import vjp, Jacobian from paddle.fluid.framework import Variable from paddle.fluid.data_feeder import check_type, check_dtype @@ -86,11 +85,14 @@ def _value_and_gradient(f, x, v=None): value: a tensor that holds the function value. gradient: a tensor that holds the function gradients. """ + # use detach to cut off relation between x and original graph + x = x.detach() + x.stop_gradient = False + value = f(x) if paddle.in_dynamic_mode(): - value, gradient = vjp(f, x, v=v) - gradient = gradient[0] + # only need to compute first order derivative, and some op dont support high order derivative. + gradient = paddle.grad([value], [x], create_graph=False)[0] else: - JJ = Jacobian(f, x) - gradient = JJ[:][0] - value = f(x) - return value, gradient + gradient = paddle.static.gradients([value], [x])[0] + # use detach to make results real number without grad to avoid assign error + return value.detach(), gradient.detach() diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index e757fbf53487e..d988d1653ca69 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -28,7 +28,7 @@ from ...tensor import sum from ...tensor import sqrt from ...fluid.data_feeder import check_variable_and_dtype, check_dtype -from ...fluid.framework import _varbase_creator +from ...fluid.framework import _varbase_creator, _in_legacy_dygraph, in_dygraph_mode from ...fluid import dygraph_utils from ...fluid import layers @@ -1616,7 +1616,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): if epsilon > 1. 
or epsilon < 0.: raise ValueError("The value of epsilon must be between 0 and 1.") - if in_dynamic_mode(): + if paddle.in_dynamic_mode(): return _C_ops.label_smooth(label, prior_dist, 'epsilon', float(epsilon)) check_variable_and_dtype(label, 'label', ['float32', 'float64'], diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e7763853bf7c2..660e6d3587108 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -37,7 +37,7 @@ from paddle import _C_ops from paddle import in_dynamic_mode from paddle.framework import core -from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode __all__ = [] @@ -784,11 +784,12 @@ def nll_loss(input, input_dims)) n = input_shape[0] c = input_shape[1] - if in_dynamic_mode(): + if _non_static_mode(): if input_dims != 2 and input_dims != 4: input, _ = _C_ops.reshape2(input, None, 'shape', [n, c, 1, -1]) label, _ = _C_ops.reshape2(label, None, 'shape', [n, 1, -1]) out_shape = [n] + input_shape[2:] + out, total_weight = _C_ops.nll_loss(input, label, weight, 'ignore_index', ignore_index, 'reduction', reduction) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index c039754af4d12..536c611d85f28 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -181,7 +181,7 @@ def batch_norm(x, trainable_statistics = not use_global_stats if in_dynamic_mode(): - # for dygraph need tuple + attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", not training, "data_layout", data_format, "use_mkldnn", False, "fuse_with_relu", False, "use_global_stats", use_global_stats, diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 7901379d9c793..4b8395e1c43c8 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1397,7 +1397,10 @@ def histogram(input, bins=100, min=0, max=0, name=None): result = paddle.histogram(inputs, bins=4, min=0, max=3) print(result) # [0, 2, 1, 0] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_histogram(input, bins, min, max) + + if _in_legacy_dygraph(): return _C_ops.histogram(input, "bins", bins, "min", min, "max", max) helper = LayerHelper('histogram', **locals()) diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 03b64e2b828df..3c02c11b933c1 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -536,6 +536,8 @@ def bitwise_and(x, y, out=None, name=None): res = paddle.bitwise_and(x, y) print(res) # [0, 2, 1] """ + if in_dygraph_mode() and out == None: + return _C_ops.final_state_bitwise_and(x, y) return _bitwise_op( op_name="bitwise_and", x=x, y=y, name=name, out=out, binary_op=True) @@ -562,6 +564,9 @@ def bitwise_or(x, y, out=None, name=None): res = paddle.bitwise_or(x, y) print(res) # [-1, -1, -3] """ + if in_dygraph_mode() and out == None: + return _C_ops.final_state_bitwise_or(x, y) + return _bitwise_op( op_name="bitwise_or", x=x, y=y, name=name, out=out, binary_op=True) @@ -588,6 +593,8 @@ def bitwise_xor(x, y, out=None, name=None): res = paddle.bitwise_xor(x, y) print(res) # [-1, -3, -4] """ + if in_dygraph_mode() and out == None: + return _C_ops.final_state_bitwise_xor(x, y) return _bitwise_op( op_name="bitwise_xor", x=x, y=y, name=name, out=out, binary_op=True) @@ -612,6 +619,8 @@ def bitwise_not(x, out=None, name=None): res = paddle.bitwise_not(x) 
print(res) # [4, 0, -2] """ + if in_dygraph_mode() and out == None: + return _C_ops.final_state_bitwise_not(x) return _bitwise_op( op_name="bitwise_not", x=x, y=None, name=name, out=out, binary_op=False) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 7921c7798be3a..68d6aca35ad65 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -17,7 +17,7 @@ from ..static import Variable, device_guard from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _in_eager_without_dygraph_check +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _in_eager_without_dygraph_check, _non_static_mode from ..fluid.layer_helper import LayerHelper from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype @@ -1845,7 +1845,7 @@ def expand_as(x, y, name=None): np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ - if paddle.in_dynamic_mode(): + if _non_static_mode(): return _C_ops.expand_as_v2(x, 'target_shape', y.shape) check_variable_and_dtype( diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 7ee684f5a2f07..124bd69921055 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -177,6 +177,12 @@ def pow(x, y, name=None): raise TypeError('y must be scalar or tensor type, but received: %s '% (type(y))) +OP_NAMEMAPPING = { + 'elementwise_max': 'final_state_maximum', + 'elementwise_min': 'final_state_minimum', + 'elementwise_pow': 'final_state_elementwise_pow', + 'elementwise_floordiv': 'final_state_floor_divide', +} @dygraph_only def _elementwise_op_in_dygraph(x, @@ -185,13 +191,20 @@ def _elementwise_op_in_dygraph(x, act=None, use_mkldnn=False, op_name=None): - op = getattr(_C_ops, op_name) - out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) + def is_inplace(op_name): + return op_name[-1] == "_" + + if in_dygraph_mode(): + op = getattr(_C_ops, OP_NAMEMAPPING[op_name] if not is_inplace(op_name) else op_name) + out = op(x, y) + + if _in_legacy_dygraph(): + op = getattr(_C_ops, op_name) + out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) return dygraph_utils._append_activation_in_dygraph( out, act, use_mkldnn=use_mkldnn) - def _elementwise_op(helper): op_type = helper.layer_type original_op_type = helper.kwargs.get('original_op_type', op_type) @@ -2681,7 +2694,9 @@ def isfinite(x, name=None): out = paddle.tensor.isfinite(x) print(out) # [False True True False True False False] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_isfinite( x ) + if _in_legacy_dygraph(): return _C_ops.isfinite_v2(x) helper = LayerHelper("isfinite_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isfinite') @@ -2709,7 +2724,9 @@ def isinf(x, name=None): out = paddle.tensor.isinf(x) print(out) # [ True False False True False False False] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_isinf( x ) + if _in_legacy_dygraph(): return _C_ops.isinf_v2(x) helper = LayerHelper("isinf_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf') @@ -2737,7 +2754,10 @@ def isnan(x, name=None): out = paddle.tensor.isnan(x) print(out) # [False False False False False True True] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return 
_C_ops.final_state_isnan( x ) + + if _in_legacy_dygraph(): return _C_ops.isnan_v2(x) helper = LayerHelper("isnan_v2", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan') @@ -3387,8 +3407,13 @@ def lerp(x, y, weight, name=None): # out: [5.5., 6., 6.5, 7.] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): check_type(weight, 'weight', (float, paddle.Tensor, Variable), 'lerp') + if isinstance(weight, float): + weight = paddle.to_tensor(weight, dtype=x.dtype) + + return _C_ops.final_state_lerp( x, y, weight) + if _in_legacy_dygraph(): if isinstance(weight, float): weight = paddle.to_tensor(weight, dtype=x.dtype) return _C_ops.lerp(x, y, weight) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index ef10135fb99c1..c41c76f1b379b 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -18,7 +18,7 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import layers from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from paddle.common_ops_import import convert_np_dtype_to_dtype_ from paddle.common_ops_import import Variable from paddle.common_ops_import import VarDesc @@ -774,7 +774,10 @@ def masked_select(x, mask, name=None): #[1.0 5.0 6.0 9.0] """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_masked_select(x, mask) + + if _in_legacy_dygraph(): return _C_ops.masked_select(x, mask) helper = LayerHelper("masked_select", **locals()) @@ -844,8 +847,8 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): # [[1 1 0 0]] """ - if paddle.in_dynamic_mode(): - k = k.numpy().item(0) if isinstance(k, Variable) else k + + if _non_static_mode(): if axis is None: out, indices = _C_ops.top_k_v2(x, 'k', int(k), 'largest', largest, 'sorted', diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index dd0da03e4fd28..5876b9180823e 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -387,7 +387,7 @@ def quantile(x, q, axis=None, keepdim=False): if not isinstance(x, Variable): raise TypeError("input x should be a Tensor.") dims = len(x.shape) - out_shape = x.shape + out_shape = list(x.shape) if axis is None: x = paddle.flatten(x) axis = 0 @@ -433,16 +433,15 @@ def quantile(x, q, axis=None, keepdim=False): indices.append(q_num * (x.shape[axis] - 1)) else: raise TypeError("Type of q should be int, float, list or tuple.") - indices = paddle.to_tensor(indices).astype(paddle.float32) sorted_tensor = paddle.sort(x, axis) - indices_below = paddle.floor(indices).astype(paddle.int32) - indices_upper = paddle.ceil(indices).astype(paddle.int32) + indices_tensor = paddle.assign(indices).astype(paddle.float32) + indices_below = paddle.floor(indices_tensor).astype(paddle.int32) + indices_upper = paddle.ceil(indices_tensor).astype(paddle.int32) outputs = [] def expand_dim(indices, sorted_tensor_shape, axis): assert axis < len(list(sorted_tensor_shape)) expanded_shape = [1] * len(list(sorted_tensor_shape)) - expanded_shape[axis] = len(indices) expanded_shape = tuple(expanded_shape) indices = indices.reshape(expanded_shape) return indices diff --git a/python/paddle/text/viterbi_decode.py b/python/paddle/text/viterbi_decode.py index dbf16bfbc6a97..ce5667b134a03 100644 --- a/python/paddle/text/viterbi_decode.py +++ b/python/paddle/text/viterbi_decode.py @@ 
-13,7 +13,7 @@ # limitations under the License. from ..nn import Layer -from ..fluid.framework import core, _non_static_mode +from ..fluid.framework import core, _non_static_mode, in_dygraph_mode from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type from paddle import _C_ops @@ -58,6 +58,10 @@ def viterbi_decode(potentials, transition = paddle.rand((num_tags, num_tags), dtype='float32') scores, path = paddle.text.viterbi_decode(emission, transition, length, False) # scores: [3.37089300, 1.56825531], path: [[1, 0, 0], [1, 1, 0]] """ + if in_dygraph_mode(): + return _C_ops.final_state_viterbi_decode(potentials, transition_params, + lengths, include_bos_eos_tag) + if _non_static_mode(): return _C_ops.viterbi_decode(potentials, transition_params, lengths, 'include_bos_eos_tag', include_bos_eos_tag) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 9a6059c53a7e5..5bbc64ec44afc 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1,273 +1,8 @@ -# - api : norm -# args : (Tensor x, int axis, float epsilon, bool is_test) -# output : Tensor(out), Tensor(norm) -# infer_meta : -# func : NormInferMeta -# kernel : -# func : norm -# intermediate : norm -# backward : norm_grad - -# # maxout -# - api : maxout -# args : (Tensor x, int groups, int axis) -# output : Tensor -# infer_meta : -# func : MaxoutInferMeta -# kernel : -# func : maxout -# backward : maxout_grad - -# # batch_norm -# - api : batch_norm -# args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -# output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) -# infer_meta : -# func : XXXXInferMeta -# kernel : -# func : batch_norm -# backward: batch_norm_grad - -# # bilinear_tensor_product ?? 
optional -# - api : bilinear_tensor_product -# args : (Tensor x, Tensor y, Tensor weight, Tensor bias) -# output : Tensor -# infer_meta : -# func : BilinearTensorProductInferMeta -# kernel : -# func : bilinear_tensor_product -# backward : bilinear_tensor_product_grad -# optional : bias - -# broadcast_tensors -# - api : broadcast_tensors -# args : (Tensor[] x) -# output : Tensor[] -# infer_meta : -# func : BroadcastTensorsInferMeta -# kernel : -# func : broadcast_tensors -# backward : broadcast_tensors_grad - -# # dropout -# - api : dropout -# args : (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -# output : Tensor(out), Tensor(mask) -# infer_meta : -# func : DropoutInferMeta -# kernel : -# func : dropout - -# # expand -# - api : expand -# args : (Tensor x, ScalarArray shape) -# output : Tensor -# infer_meta : -# func : ExpandInferMeta -# kernel : -# func : expand -# backward : expand_grad - -# eye -# - api : eye -# args : (int64_t num_rows, int64_t num_colums, DataType dtype = DataType::FLOAT32) -# output : Tensor -# infer_meta : -# func : EyeInferMeta -# kernel : -# func : eye - -# gaussian_random -# - api : gaussian_random -# args : (ScalarArray shape, float mean, float std, int seed, DataType dtype=DataType::FLOAT32) -# output : Tensor -# infer_meta : -# func : CreateInferMeta -# param : [shape, dtype] -# kernel : -# func : gaussian_random -# data_type : dtype - -# # graph_send_recv -# - api : graph_send_recv -# args : (Tensor x, Tensor src_index, Tensor dst_index, str pool_type) -# output : Tensor(out), Tensor(dst_count) -# infer_meta : -# func : GraphSendRecvInferMeta -# kernel : -# func : graph_send_recv -# backward : graph_send_recv_grad - -# # label_smooth -# - api : label_smooth -# args : (Tensor label, Tensor prior_dist, float epsilon) -# output : Tensor -# infer_meta : -# func : UnchangedInferMeta -# param : [label] -# kernel : -# func : label_smooth -# data_type : label -# optional : prior_dist -# backward : label_smooth_grad - -# linspace start stop number -# - api : linspace -# args : (Tensor start, Tensor stop, Tensor number, DataType dtype=DataType::FLOAT32) -# output : Tensor -# infer_meta : -# func : LinspaceInferMeta -# kernel : -# func : linspace - -# # multi_dot -# - api : multi_dot -# args : (Tensor[] x) -# output : Tensor -# infer_meta : -# func : MultiDotInferMeta -# kernel : -# func : multi_dot -# backward : multi_dot_grad - -# # nll_loss -# - api : nll_loss -# args : (Tensor x, Tensor label, Tensor weight, int64_t ignore_index, str reduction) -# output : Tensor(out), Tensor(total_weight) -# infer_meta : -# func : NllLossRawInferMeta -# kernel : -# func : nll_loss -# data_type : x -# optional : weight -# backward : nll_loss_grad - -# # psroi_pool -# - api : psroi_pool -# args : (Tensor x, Tensor rois, Tensor rois_num, int pooled_weight, int pooled_width, int output_channels, float spatial_scale ) -# output : Tensor -# infer_meta : -# func : PsroiPoolInferMeta -# kernel : -# func : psroi_pool -# backward : psroi_pool_grad -# optional : rois_num - -# # randint -# - api : randint -# args : (int low, int high, ScalarArray shape, DataType dtype) -# output : Tensor -# infer_meta : -# func : RandintInferMeta -# kernel : -# func : randint - -# # randperm -# - api : randperm -# args : (int n, DataType dtype) -# output : Tensor -# infer_meta : -# func : RandpermInferMeta -# kernel : -# func : randperm - -# # max -# - api : max -# args : (Tensor x, int64_t[] dims, bool keep_dim) -# output : Tensor -# infer_meta : -# func : 
MaxInferMeta -# kernel : -# func : max - -# # top_k -# - api : top_k -# args : (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) -# output : Tensor(out), Tensor(indices) -# infer_meta : -# func : TopKInferMeta -# kernel : -# func : top_k -# backward : top_k_grad - -# # phi_transfer_layout | not have python api - -# # truncated_gaussian_random -# - api : truncated_gaussian_random -# args : (int[] shape, float mean, float std, int seed, DataType dtype) -# output : Tensor -# infer_meta : -# func : TruncatedGaussianRandomInferMeta -# kernel : -# func : truncated_gaussian_random - -# # unbind -# - api : unbind -# args : (Tensor x, int axis) -# output : Tensor[] -# infer_meta : -# func : UnbindInferMeta -# kernel : -# func : unbind - -# # uniform_random_raw selected rows ?? - -# - api : pixel_shuffle -# args : (Tensor x, int upscale_factor, const std::string& data_format) -# output : Tensor -# infer_meta : -# func : PixelShuffleInferMeta -# kernel : -# func : pixel_shuffle - -# BilinearTensorProductInferMeta - -# BroadcastTensorsInferMeta - -# bincount -# - api : bincount -# args : (Tensor x, Tensor weight, int minlength) -# output : Tensor -# infer_meta : -# func : BincountInferMeta -# kernel : -# func : bincount -# optional : weight - -# expand_as -# - api : expand_as -# args : (Tensor x, Tensor y, int[] target_shape) -# output : Tensor -# infer_meta : -# func : ExpandAsInferMeta -# kernel : -# func : expand_as -# optional : y -# # backward : expand_as_grad -# # optional : y - -# - api : equal_all -# args : (Tensor x, Tensor y) -# output : Tensor -# infer_meta : -# func : CompareAllInferMeta -# kernel : -# func : equal_all - -# histogram -# - api : histogram -# args : (Tensor x, int64_t bins, int min, int max) -# output : Tensor -# infer_meta : -# func : HistogramInferMeta -# kernel : -# func : histogram - - api : abs args : (Tensor x) output : Tensor infer_meta : - func : UnchangedInferMeta + func : RealAndImagInferMeta kernel : func : abs backward : abs_grad @@ -655,7 +390,7 @@ backward : elu_grad - api : empty - args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) + args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor infer_meta : func : CreateInferMeta @@ -724,7 +459,7 @@ func : flip - api : full - args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) + args : (IntArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor infer_meta : func : CreateInferMeta @@ -812,6 +547,15 @@ func : hard_sigmoid backward : hard_sigmoid_grad +# histogram +- api : histogram + args : (Tensor x, int64_t bins, int min, int max) + output : Tensor + infer_meta : + func : HistogramInferMeta + kernel : + func : histogram + - api : huber_loss args : (Tensor input, Tensor label, float delta) output : Tensor(out), Tensor(residual) @@ -1000,6 +744,15 @@ func : matrix_power backward : matrix_power_grad +- api : maximum + args : (Tensor x, Tensor y) + output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta + kernel : + func : maximum + backward : maximum_grad + - api : mean args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) output : Tensor @@ -1008,6 +761,24 @@ kernel : func : mean +- api : minimum + args : (Tensor x, Tensor y) + output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta + kernel : + func : minimum + backward : minimum_grad + +- api : modulo + args : (Tensor x, Tensor y) + output : Tensor + 
infer_meta : + func : ElementwiseInferMeta + kernel : + func : modulo + backward : modulo_grad + # multinomial - api : multinomial args : (Tensor x, int num_samples, bool replacement) @@ -1105,6 +876,15 @@ data_type : x backward : put_along_axis_grad +- api : reciprocal + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : reciprocal + backward : reciprocal_grad + # reduce_prod - api : reduce_prod args : (Tensor x, int64_t[] dims, bool keep_dim, bool reduce_all) @@ -1125,7 +905,7 @@ backward : relu_grad - api : reshape - args : (Tensor x, ScalarArray shape) + args : (Tensor x, IntArray shape) output : Tensor(out), Tensor(xshape) infer_meta : func : ReshapeWithXShapeInferMeta @@ -1189,6 +969,14 @@ func : selu backward : selu_grad +- api : shape + args : (Tensor input) + output : Tensor + infer_meta : + func : ShapeInferMeta + kernel : + func : shape, shape_sr + # shard_index - api : shard_index args : (Tensor in, int index_num, int nshards, int shard_id, int ignore_value) @@ -1286,10 +1074,37 @@ backward : softmax_grad - api : split - args : (Tensor x, ScalarArray num_or_sections, Scalar(int) axis) + args : (Tensor x, IntArray num_or_sections, Scalar(int) axis) output : Tensor[] invoke : split_impl(x, num_or_sections, axis) +- api : sqrt + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : sqrt + backward : sqrt_grad + +- api : square + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : square + backward : square_grad + +- api : strided_slice + args : (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) + output : Tensor + infer_meta : + func : StridedSliceInferMeta + kernel : + func : strided_slice + backward : strided_slice_grad + - api : subtract args : (Tensor x, Tensor y) output : Tensor @@ -1364,7 +1179,7 @@ # tile - api : tile - args : (Tensor x, ScalarArray repeat_times) + args : (Tensor x, IntArray repeat_times) output : Tensor infer_meta : func : TileInferMeta diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 1c58334794da1..e281484f69744 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -88,7 +88,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'Tensor[]': 'const std::vector&' } attr_types_map = { - 'ScalarArray': 'const ScalarArray&', + 'IntArray': 'const IntArray&', 'Scalar': 'const Scalar&', 'Scalar(int)': 'const Scalar&', 'Scalar(int64_t)': 'const Scalar&', @@ -600,9 +600,9 @@ def get_kernel_args(self, code_indent): kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context - if 'ScalarArray' in self.attrs['attr_info'][param][0]: - kernel_args_type_list.append('const phi::ScalarArray&') - param = 'phi::ScalarArray(' + param + ')' + if 'IntArray' in self.attrs['attr_info'][param][0]: + kernel_args_type_list.append('const phi::IntArray&') + param = 'phi::IntArray(' + param + ')' elif 'Scalar' in self.attrs['attr_info'][param][0]: kernel_args_type_list.append('const phi::Scalar&') param = 'phi::Scalar(' + param + ')' @@ -665,9 +665,9 @@ def get_selected_rows_kernel_args(self, code_indent): kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context - if 'ScalarArray' in self.attrs['attr_info'][param][0]: - kernel_args_type_list.append('const phi::ScalarArray&') - param = 'phi::ScalarArray(' + param + ')' 
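For reference, the ScalarArray-to-IntArray rename threaded through the code generators (api_base.py, sparse_api_gen.py, strings_api_gen.py) reduces to one rule: an attribute declared as IntArray in the yaml becomes a const phi::IntArray& kernel argument, and the generated call site wraps the attribute in phi::IntArray(...). A minimal, self-contained sketch of that rule is below; the helper name and the trimmed dictionary are illustrative only and are not part of api_base.py.

    # Declaration-side map, mirroring attr_types_map in parse_input_and_attr().
    ATTR_TYPES_MAP = {
        'IntArray': 'const IntArray&',   # was 'ScalarArray' before this patch
        'Scalar': 'const Scalar&',
        'Scalar(int)': 'const Scalar&',
        'Scalar(int64_t)': 'const Scalar&',
    }

    # Call-site wrapping, mirroring the branch in get_kernel_args() /
    # get_selected_rows_kernel_args(); the function name is made up.
    def map_attr_to_kernel_arg(attr_name, attr_type):
        """Return (kernel argument type, call-site expression) for one attribute."""
        if 'IntArray' in attr_type:
            return 'const phi::IntArray&', "phi::IntArray(" + attr_name + ")"
        if 'Scalar' in attr_type:
            return 'const phi::Scalar&', "phi::Scalar(" + attr_name + ")"
        return attr_type, attr_name

    print(map_attr_to_kernel_arg('shape', 'IntArray'))
    # -> ('const phi::IntArray&', 'phi::IntArray(shape)')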
+ if 'IntArray' in self.attrs['attr_info'][param][0]: + kernel_args_type_list.append('const phi::IntArray&') + param = 'phi::IntArray(' + param + ')' elif 'Scalar' in self.attrs['attr_info'][param][0]: kernel_args_type_list.append('const phi::Scalar&') param = 'phi::Scalar(' + param + ')' diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index c8644a8812bd2..f95edf6c591ab 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -138,7 +138,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 5199fb0eb60cd..aa7fd88285f6f 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,192 +1,14 @@ -# - backward_api : norm_grad -# forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) -# args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [x] -# kernel : -# func : norm_grad - -# - backward_api : matmul_triple_grad -# forward : matmul_double_grad (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -> Tensor(d2x), Tensor(d2y), Tensor(dout_grad) -# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, Tensor d2x_grad, Tensor d2y_grad, Tensor dout_grad_grad, bool transpose_x, bool transpose_y) -# output : Tensor(d3x), Tensor(d3y), Tensor(d2out_grad), Tensor(ddx_grad), Tensor(ddy_grad) -# infer_meta : -# func : MatmulTripleGradInferMeta -# kernel : -# func : matmul_triple_grad - -# - backward_api : maxout_grad -# forward : maxout (Tensor x, int groups, int axis) -> Tensor(out) -# args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [x] -# kernel : -# func : maxout_grad - -# - backward_api : batch_norm_grad -# forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) -# args : (Tensor indices, Tensor x, Tensor out_grad, int axis, bool descending) -# output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) -# infer_meta : -# func : GeneralTernaryGradInferMeta -# param : [x, scale, bias] -# kernel : -# func : batch_norm_grad - -# - backward_api : bilinear_tensor_product_grad -# forward : bilinear_tensor_product (Tensor x, Tensor y, Tensor weight, Tensor bias) -> Tensor(out) -# args : (Tensor x, Tensor y, Tensor weight, Tensor out_grad) -# output : Tensor(x_grad), Tensor(y_grad), Tensor(weight_grad), Tensor(bias_grad) -# infer_meta : -# func : FourXXXXInferMeta -# param : [x, y, weight, bias] -# kernel : -# func : bilinear_tensor_product_grad -# optional : bias - -# - backward_api : broadcast_tensor_grad -# forward : broadcast_tensors (Tensor[] x) -> Tensor [] (out) -# args : (Tensor [] out_grad) -# output : Tensor [] (x_grad) -# infer_meta : -# func : XXXXInferMeta -# 
param : [out_grad] -# kernel : -# func : broadcast_tensor_grad - -# - backward_api : gumbel_softmax_grad -# forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) -# args : (Tensor out, Tensor out_grad, int axis) -# output : Tensor(x_grad) -# infer_meta : -# func : GumbelSoftmaxGradInferMeta -# param : [out, out_grad, axis] -# kernel : -# func : gumbel_softmax_grad - -# - backward_api : huber_loss_grad -# forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) -# args : (Tensor residual, Tensor out_grad, float delta) -# output : Tensor(input_grad), Tensor(label_grad) -# infer_meta : -# func : GeneralBinaryGradInferMeta -# param : [x, y] -# kernel : -# func : where_grad - -# - backward_api : triangular_solve_grad -# forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out) -# args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool tranpose, bool unitriangular) -# output : Tensor(x_grad), Tensor(y_grad) -# infer_meta : -# func : GeneralBinaryGradInferMeta -# param : [x, y] -# kernel : -# func : triangular_solve_grad - -# - backward_api : dropout_grad -# forward : dropout (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) -# args : (Tensor mask, Tensor out_grad, float p, bool is_test, str mode) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [out_grad] -# kernel : -# func : dropout_grad - -# - backward_api : expand_as_grad -# forward : expand_as (Tensor x, Tensor y, int[] target_shape) -> Tensor(out) -# args : (Tensor x, Tensor out_grad, int[] target_shape) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [x] -# kernel : -# func : expand_as_grad - -# - backward_api : expand_grad -# forward : expand (Tensor x, ScalarArray shape) -> Tensor(out) -# args : (Tensor x, Tensor out_grad, ScalarArray shape) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedGradInferMeta -# param : [x] -# kernel : -# func : expand_grad - -# - backward_api : graph_send_recv_grad -# forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str pool_type) -> Tensor(out), Tensor(dst_count) -# args : (Tensor out_grad, Tensor x, Tensor out, Tensor src_index, Tensor dst_index, Tensor dst_count, str pool_type) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [x] -# kernel : -# func : graph_send_recv_grad - -# - backward_api : multi_dot_grad -# forward : multi_dot (Tensor[] x) -> Tensor(out) -# args : (Tensor out_grad, Tensor[] x) -# output : Tensor[] (x_grad) -# infer_meta : -# func : XXXXInferMeta -# param : [x] -# kernel : -# func : multi_dot_grad - -# - backward_api : pad_grad -# forward : pad (Tensor x, int[] paddings, float pad_value) -> Tensor(out) -# args : (Tensor out_grad, int[] paddings, float pad_value) -# output : Tensor(x_grad) -# infer_meta : -# func : XXXXXInferMeta -# param : [x] -# kernel : -# func : pad_grad - -# - backward_api : pixel_shuffle_grad -# forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) -# args : (Tensor out_grad, int upscale_factor, str data_format) -# output : Tensor(x_grad) -# infer_meta : -# func : XXXXXInferMeta -# param : [x] -# kernel : -# func : pixel_shuffle_grad - -# - backward_api : poisson_grad -# forward : poisson (Tensor x) -> Tensor(out) -# args : () -# output : Tensor(x_grad) -# infer_meta : -# func : 
XXXXXInferMeta -# param : [x] -# kernel : -# func : poisson_grad - -# - backward_api : where_index_grad -# forward : where_index (Tensor condition) -> Tensor(out) -# args : (Tensor out_grad, Tensor x, int offset, int axis1, int axis2) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [x] -# kernel : -# func : where_index_grad - - backward_api : abs_grad forward : abs (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta - param : [out_grad] + param : [x] kernel : func : abs_grad + data_transform: + skip_transform : out_grad - backward_api : acos_grad forward : acos (Tensor x) -> Tensor(out) @@ -460,16 +282,6 @@ param : [x] kernel : func : gather_nd_grad -# # forward backward type not match -# - backward_api : top_k_grad -# forward : top_k (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) -> Tensor(out), Tensor(indices) -# args : (Tensor x, Tensor indices, Tensor out_grad, Scalar k = -1, int axis = -1, bool largest = true, bool sorted = true) -# output : Tensor(x_grad) -# infer_meta : -# func : UnchangedInferMeta -# param : [x] -# kernel : -# func : top_k_grad - backward_api : hard_shrink_grad forward : hard_shrink (Tensor x, float threshold) -> Tensor(out) @@ -596,6 +408,37 @@ kernel : func : matrix_power_grad +- backward_api : maximum_grad + forward : maximum(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis=-1) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param: [x, y] + kernel : + func : maximum_grad + +- backward_api : minimum_grad + forward : minimum(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis=-1) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param: [x, y] + kernel : + func : minimum_grad + +- backward_api : modulo_grad + forward : add (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : modulo_grad + no_need_buffer : x, y + - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -650,6 +493,16 @@ kernel : func : put_along_axis_grad +- backward_api : reciprocal_grad + forward : reciprocal (Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : reciprocal_grad + - backward_api : relu_double_grad forward : relu_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x_grad) @@ -672,7 +525,7 @@ backward: relu_double_grad - backward_api : reshape_grad - forward : reshape_with_xshape (Tensor x, ScalarArray shape) -> Tensor(out), Tensor(xshape) + forward : reshape_with_xshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) output : Tensor(x_grad) infer_meta : @@ -802,12 +655,42 @@ func : softmax_grad - backward_api : split_grad - forward : split (Tensor x, ScalarArray num_or_sections, Scalar axis) -> Tensor[](out) + forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) args : (Tensor[] out_grad, Scalar axis) output : Tensor(x_grad) invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of 
double grad and triple grad will be supported in the future. +- backward_api : sqrt_grad + forward : sqrt (Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : sqrt_grad + +- backward_api : square_grad + forward : square (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : square_grad + +- backward_api : strided_slice_grad + forward : strided_slice (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int[] axes, IntArray starts, IntArray ends, IntArray strides) + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : strided_slice_grad + - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -869,8 +752,8 @@ func : thresholded_relu_grad - backward_api : tile_grad - forward : tile (Tensor x, ScalarArray repeat_times) -> Tensor(out) - args : (Tensor x, Tensor out_grad, ScalarArray repeat_times) + forward : tile (Tensor x, IntArray repeat_times) -> Tensor(out) + args : (Tensor x, Tensor out_grad, IntArray repeat_times) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index bf3c775236284..e26f65387878c 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -156,7 +156,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/python/paddle/utils/code_gen/intermediate_api_gen.py index 9f6b1e16a2031..6e1df7b4ec336 100644 --- a/python/paddle/utils/code_gen/intermediate_api_gen.py +++ b/python/paddle/utils/code_gen/intermediate_api_gen.py @@ -27,7 +27,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index ba63217068fd8..c0316fc164294 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -113,8 +113,8 @@ def gen_sparse_kernel_context(self, kernel_output_names): continue if param in attr_names: # set attr for kernel_context - if 'ScalarArray' in self.attrs['attr_info'][param][0]: - param = 'phi::ScalarArray(' + param + ')' + if 'IntArray' in self.attrs['attr_info'][param][0]: + param = 'phi::IntArray(' + param + ')' elif 'Scalar' in self.attrs['attr_info'][param][0]: param = 'phi::Scalar(' + param + ')' elif isinstance(param, bool): @@ -167,7 +167,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py 
b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 9f74cf9ad58cc..4f209a7592161 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -97,7 +97,7 @@ def header_include(): return """ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/strings_api.yaml b/python/paddle/utils/code_gen/strings_api.yaml index 14bcafe2281ae..34dac9221a4a0 100644 --- a/python/paddle/utils/code_gen/strings_api.yaml +++ b/python/paddle/utils/code_gen/strings_api.yaml @@ -1,5 +1,5 @@ - api : empty - args : (ScalarArray shape, Place place=CPUPlace()) + args : (IntArray shape, Place place=CPUPlace()) output : Tensor(out@StringTensor) infer_meta : func : strings::CreateInferMeta diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/python/paddle/utils/code_gen/strings_api_gen.py index 42344ecb7d193..d7117e9d54060 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ b/python/paddle/utils/code_gen/strings_api_gen.py @@ -144,9 +144,9 @@ def get_kernel_args(self, code_indent): kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context - if 'ScalarArray' in self.attrs['attr_info'][param][0]: - kernel_args_type_list.append('const phi::ScalarArray&') - param = 'phi::ScalarArray(' + param + ')' + if 'IntArray' in self.attrs['attr_info'][param][0]: + kernel_args_type_list.append('const phi::IntArray&') + param = 'phi::IntArray(' + param + ')' elif 'Scalar' in self.attrs['attr_info'][param][0]: kernel_args_type_list.append('const phi::Scalar&') param = 'phi::Scalar(' + param + ')' @@ -277,7 +277,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" #include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py index 13c35813aab7d..39b950e15dc93 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py @@ -92,7 +92,7 @@ def header_include(): return """ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/common/int_array.h" """ diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 00bd6ed38a3ad..b510b7c8bdfe8 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -19,7 +19,7 @@ from ..fluid.layers import nn, utils from ..nn import Layer, Conv2D, Sequential, ReLU, BatchNorm2D from ..fluid.initializer import Normal -from ..fluid.framework import _non_static_mode +from ..fluid.framework import _non_static_mode, in_dygraph_mode from paddle.common_ops_import import * from paddle import _C_ops @@ -377,6 +377,12 @@ def yolo_box(x, clip_bbox=True, scale_x_y=1.) 
""" + if in_dygraph_mode(): + boxes, scores = _C_ops.final_state_yolo_box( + x, img_size, anchors, class_num, conf_thresh, downsample_ratio, + clip_bbox, scale_x_y, iou_aware, iou_aware_factor) + return boxes, scores + if _non_static_mode(): boxes, scores = _C_ops.yolo_box( x, img_size, 'anchors', anchors, 'class_num', class_num, diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index f075439e54fe7..5088ad3457fb9 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -1174,6 +1174,7 @@ ] LOWEST_PARALLEL_JOB_NEW = [ + 'heter_cloud_comm_cpu_test', 'heter_server_test', 'test_scatter_op', 'test_trt_convert_hard_sigmoid',